1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # motif.py - defines the motif object and its methods in cistematic
31 from string import upper, lower
32 from math import log, exp
33 from copy import deepcopy
34 from cistematic.core import complement
35 from cistematic.cisstat.score import pearsonCorrelation
36 import re, os, tempfile
38 if os.environ.get("CISTEMATIC_ROOT"):
39 cisRoot = os.environ.get("CISTEMATIC_ROOT")
41 cisRoot = "/proj/genome"
43 if os.environ.get("CISTEMATIC_TEMP"):
44 cisTemp = os.environ.get("CISTEMATIC_TEMP")
48 tempfile.tempdir = cisTemp
51 import cistematic.core._motif as _motif
52 hasMotifExtension = True
54 hasMotifExtension = False
62 symbolToMatrix = {"A": [1.0, 0.0, 0.0, 0.0],
63 "C": [0.0, 1.0, 0.0, 0.0],
64 "G": [0.0, 0.0, 1.0, 0.0],
65 "T": [0.0, 0.0, 0.0, 1.0],
66 "W": [0.5, 0.0, 0.0, 0.5],
67 "S": [0.0, 0.5, 0.5, 0.0],
68 "R": [0.5, 0.0, 0.5, 0.0],
69 "Y": [0.0, 0.5, 0.0, 0.5],
70 "M": [0.5, 0.5, 0.0, 0.0],
71 "K": [0.0, 0.0, 0.5, 0.5],
72 "H": [0.33, 0.33, 0.0, 0.34],
73 "B": [0.0, 0.33, 0.33, 0.34],
74 "V": [0.33, 0.33, 0.34, 0.0],
75 "D": [0.33, 0.0, 0.33, 0.34],
76 "N": [0.25, 0.25, 0.25, 0.25]
97 motifDict = {"A": ["W", "R", "M", "H", "V", "D"],
98 "T": ["W", "Y", "K", "H", "B", "D"],
99 "C": ["S", "Y", "M", "H", "B", "V"],
100 "G": ["S", "R", "K", "B", "V", "D"]
105 """ The Motif class is the heart of cistematic. It captures both the consensus,
106 PWM, and Markov(1) versions of the motif, as well as provides methods for scanning
107 sequences for matches.
122 def __init__(self, tagID="", motif="", PWM=[], seqs=[], thresh=0.0, info="", motifFile="", seqfile=""):
123 """ initialize a Motif object either with (A) a tagID identifier and either of
124 (i) a IUPAC consensus or (ii) a PWM or (iii) example sequences, or (B) from
125 a motif file in Cistematic motif format.
129 (fileTagID, motif, PWM, seqs, thresh, info) = self.readMotif(motifFile)
133 elif len(fileTagID) > 0:
134 self.setTagID(fileTagID)
136 self.setTagID("default")
141 # PWM overrules the motif
145 # a seqfile can be used to create a list of sequences
147 seqs = self.readSeqFile(seqfile)
149 # Sequences overrules the PWM and motif
151 self.setSequences(seqs)
152 self.setThreshold(len(seqs[0]))
154 self.setThreshold(float(thresh))
160 """ returns the length of the motif
162 return len(self.motifPWM)
165 def readMotif(self, motifFile):
166 """ read motif in cistmeatic motif format to initialize motif instance.
173 infile = open(motifFile, "r")
175 if len(line) < 4 or line[0] == "#":
178 fields = line.strip().split("\t")
179 fieldType = fields[0].lower()
180 if fieldType not in ["tagid", "motif", "acgt", "sequence", "threshold", "info"]:
181 print "could not process line %s" % line
187 if fieldType == "motif":
189 elif fieldType == "acgt":
190 PWM.append([float(fields[1]), float(fields[2]), float(fields[3]), float(fields[4])])
191 elif fieldType == "sequence":
192 seqs.append(fields[1])
193 elif fieldType == "threshold":
194 threshold = float(fields[1])
195 elif fieldType == "info":
196 info = fields[1].strip()
197 elif fieldType == "tagid":
202 return (tagID, motif, PWM, seqs, threshold, info)
205 def readSeqFile(self, seqFile):
206 """ read sequences from a sequence file.
209 infile = open(seqFile, "r")
211 seqs.append(line.strip().upper())
218 def saveMotif(self, motFile):
219 """ Save motif in cistematic motif format.
221 outfile = open(motFile, "w")
222 outfile.write("tagid\t%s\n" % self.tagID)
223 outfile.write("info\t%s\n" % self.info.strip())
224 outfile.write("threshold\t%s\n" % str(self.threshold))
225 outfile.write("motif\t%s\n" % self.buildConsensus())
226 for col in self.motifPWM:
227 outfile.write("acgt\t%f\t%f\t%f\t%f\n" % (col[0], col[1], col[2], col[3]))
229 for seq in self.sequences:
230 outfile.write("sequence\t%s\n" % seq)
235 def setTagID(self, tag):
236 """ set motif identifier.
241 def setInfo(self, info):
242 """ set motif info string.
247 def setThreshold(self, threshold):
248 """ set a pre-defined threshold or the highest-possible threshold, otherwise.
250 (sForward, sBackward) = self.scoreMotif(self.buildStrictConsensus())
251 if sForward > sBackward:
252 self.threshold = sForward
254 self.threshold = sBackward
256 if threshold < self.threshold and threshold > 0:
257 self.threshold = threshold
260 def setMotif(self, motif):
261 """ set the motif PWM using a IUPAC consensus.
263 self.motifSeq = motif
264 self.motifPWM = self.buildPWM(motif)
265 self.buildReversePWM()
266 self.strictConsensus = self.buildStrictConsensus()
269 def setSequences(self, seqs):
270 """ set the founder sequences for the motif and recalculate PWM with them.
272 self.sequences = seqs
273 self.calculatePWMfromSeqs()
274 self.calculateMarkov1()
277 def setPWM(self, PWM):
278 """ set the PWM for the motif and calculate consensus.
281 self.buildReversePWM()
282 self.motifSeq = self.buildConsensus()
283 self.strictConsensus = self.buildStrictConsensus()
286 def appendToPWM(self, col):
287 """ add a column to the PWM for the motif
289 self.motifPWM.append(col)
292 def buildPWM(self, motif):
293 """ returns the PWM for the provided consensus motif.
296 for letter in upper(motif):
297 PWM.append(symbolToMatrix[letter])
302 def buildReversePWM(self):
303 """ returns the reverse PWM for the motif
306 tempPWM = deepcopy(self.motifPWM)
309 theRevPWM.append([col[matrixRow["T"]], col[matrixRow["G"]], col[matrixRow["C"]], col[matrixRow["A"]]])
311 self.reversePWM = deepcopy(theRevPWM)
313 return self.reversePWM
317 """ returns the PWM for the motif
323 """ print the PWM and the consensus
329 cons = self.buildConsensus()
335 for col in self.motifPWM:
336 aRow = "%s%s\t" % (aRow, str(round(col[matrixRow["A"]],4)))
337 cRow = "%s%s\t" % (cRow, str(round(col[matrixRow["C"]],4)))
338 gRow = "%s%s\t" % (gRow, str(round(col[matrixRow["G"]],4)))
339 tRow = "%s%s\t" % (tRow, str(round(col[matrixRow["T"]],4)))
341 print "A:\t%s\nC:\t%s\nG:\t%s\nT:\t%s\n" % (aRow, cRow, gRow, tRow)
342 print "%s\n" % consLine
345 def saveLogo(self, outfilename, height=-1, width=-1):
346 """ saves a logo version of the motif as outfilename (assumes has .png)
347 if the motif is built from sequences.
348 will fail if weblogo 2.8.2 package is not installed in correct place.
350 logoPath = "%s/programs/weblogo/seqlogo" % cisRoot
351 if outfilename[-4:] in [".png", ".PNG"]:
352 outfilename = outfilename[:-4]
354 if len(self.sequences) < 1:
355 print "cannot run logo representation without founder sequences"
358 seqfilename = "%s.tmp" % tempfile.mktemp()
359 seqfile = open(seqfilename, "w")
360 for sequence in self.sequences:
361 seqfile.write("%s\n" % sequence)
366 dimensions += "-h %d " % height
368 dimensions += "-w %d " % width
370 dimensions += "-w %d " % len(self.motifPWM)
372 cmd = logoPath + " -f " + seqfilename + " -F PNG -c " + dimensions + "-a -Y -n -M -k 1 -o " + outfilename
373 contents = os.system(cmd)
375 os.remove(seqfilename)
377 print "failed to make logo: expecting weblogo 2.8.2 package in %s" % logoPath
378 print "also check if ghostscript package is correctly installed."
381 def getSymbol(self, col):
382 """ helper function for buildConsensus()
384 for NT in ["A", "C", "G", "T"]:
389 aColValue = col[matrixRow["A"]]
390 cColValue = col[matrixRow["C"]]
391 gColValue = col[matrixRow["G"]]
392 tColValue = col[matrixRow["T"]]
394 dualsList = [("R", aColValue + gColValue),
395 ("Y", tColValue + cColValue),
396 ("W", aColValue + tColValue),
397 ("S", cColValue + gColValue),
398 ("M", aColValue + cColValue),
399 ("K", tColValue + gColValue)
402 bestDual = self.getBestSymbol(dualsList)
403 if bestDual[1] > 0.9:
406 trioList = [("B", cColValue + gColValue + tColValue),
407 ("D", aColValue + gColValue + tColValue),
408 ("H", aColValue + cColValue + tColValue),
409 ("V", aColValue + cColValue + gColValue)
412 bestTrio = self.getBestSymbol(trioList)
413 if bestTrio[1] > 0.9:
419 def getBestSymbol(self, symbolProbabilityList):
420 bestSymbol = symbolProbabilityList[0]
421 for symbol in symbolProbabilityList[1:]:
422 if symbol[1] > bestSymbol[1]:
428 def buildConsensus(self):
429 """ returns the best consensus using the IUPAC alphabet.
432 for col in self.motifPWM:
433 consensus += self.getSymbol(col)
438 def buildStrictConsensus(self):
439 """ returns the best consensus using solely nucleotides.
442 for col in self.motifPWM:
444 for nt in ("A", "C", "G", "T"):
445 mRow.append((col[matrixRow[nt]], nt))
448 consensus += mRow[3][1]
453 def bestConsensusScore(self):
454 """ returns the best consensus score possible.
457 for col in self.motifPWM:
459 for nt in ["A", "C", "G", "T"]:
460 mRow.append((col[matrixRow[nt]], nt))
468 def expected(self, length, background=[0.25, 0.25, 0.25, 0.25], numMismatches=0):
469 """ returns the expected number of matches to the consensus in a sequence of a given length and background.
471 expectedNum = length * self.consensusProbability(background, numMismatches)
475 def consensusProbability(self, background=[0.25, 0.25, 0.25, 0.25], numMismatches=0):
476 """ returns the probability of the consensus given the background.
481 seqs = self.seqMismatches(self.buildConsensus().upper(), numMismatches)
482 motifs = seqs.split("|")
484 motifs.append(self.buildConsensus())
486 for theCons in motifs:
490 if NT in ("A", "W", "R", "M", "H", "V", "D", "N"):
491 currentProb += background[matrixRow["A"]]
493 if NT in ("C", "S", "Y", "M", "H", "B", "V", "N"):
494 currentProb += background[matrixRow["C"]]
496 if NT in ("G", "S", "R", "K", "B", "V", "D", "N"):
497 currentProb += background[matrixRow["G"]]
499 if NT in ("T", "W", "Y", "K", "H", "B", "D", "N"):
500 currentProb += background[matrixRow["T"]]
502 motProb = motProb + log(currentProb)
509 def pwmProbability(self, background):
510 """ returns probability of the PWM.
513 for row in self.motifPWM:
515 for NT in ["A", "C", "G", "T"]:
516 currentProb += row[matrixRow[NT]] * background[matrixRow[NT]]
518 prob = prob * currentProb
524 """ returns the reverse complement of the consensus of this motif.
526 return complement(self.buildConsensus(), len(self.motifPWM))
530 """ returns numbers of effective Ns in motif.
533 for col in self.motifPWM:
534 if self.getSymbol(col) == "N":
540 def buildMismatchSeq(self, rootSeq, tailSeq, numMismatches):
541 """ helper function called from seqMismatches().
544 tailLen = len(tailSeq)
545 if tailLen < 1 or numMismatches < 1:
546 return rootSeq + tailSeq
548 for pos in range(tailLen - numMismatches + 1):
550 newRootSeq += tailSeq[:pos]
552 finalSeq += self.buildMismatchSeq(newRootSeq, tailSeq[pos + 1:], numMismatches - 1)
558 def seqMismatches(self, seq, numMismatches):
559 """ Returns list of sequences that will be used by initializeMismatchRE().
561 return self.buildMismatchSeq("", seq, numMismatches)
564 def probMatchPWM(self, PWM, NT, colIndex):
565 """ returns the probability of seeing that particular nucleotide according to the PSFM.
568 if NT in ["A", "T", "C", "G"]:
570 return PWM[colIndex][row]
576 for motifNucleotide in ["A", "T", "C", "G"]:
577 if NT in motifDict[motifNucleotide]:
579 currentProb += PWM[colIndex][row]
584 def psfmOdds(self, PWM, NT, colIndex, background=[0.25, 0.25, 0.25, 0.25]):
585 """ calculates the odds of nucleotide NT coming from position colIndex in thePWM
586 as opposed to coming from the background.
589 currentProb = self.probMatchPWM(PWM, NT, colIndex)
590 backgroundProb = self.getBackgroundProbability(self, NT, background)
593 odds = currentProb / backgroundProb
594 except ZeroDivisionError:
600 def getBackgroundProbability(self, NT, background=[0.25, 0.25, 0.25, 0.25]):
602 if NT in ["A", "T", "C", "G"]:
604 return background[row]
610 for motifNucleotide in ["A", "T", "C", "G"]:
611 if NT in motifDict[motifNucleotide]:
613 backgroundProb += background[row]
615 return backgroundProb
618 def ntMatch(self, motifNT, seqNT):
619 """ returns True if seqNT matches motifNT.
624 if seqNT == "N" or motifNT == "N":
627 if motifNT in motifDict[seqNT]:
633 def scoreMotif(self, aSeq, diff=1000):
634 """ calculates the consensus score using the PSFM
636 motLength = len(self.motifPWM)
637 if len(aSeq) < motLength:
640 matchPWM = self.probMatchPWM
641 motPWM = self.motifPWM
642 revPWM = self.reversePWM
647 for index in range(motLength):
648 currentNT = theSeq[index]
649 forwardCons += matchPWM(motPWM, currentNT, index)
650 reverseCons += matchPWM(revPWM, currentNT, index)
651 bestCons += matchPWM(motPWM,self.strictConsensus[index], index)
653 if (forwardCons + diff) < bestCons and (reverseCons + diff) < bestCons:
656 return (forwardCons, reverseCons)
659 def scoreMotifLogOdds(self, aSeq, background=[0.25, 0.25, 0.25, 0.25]):
660 """ calculates the log-odds score using the PSFM given the markov(0) background.
662 motLength = len(self.motifPWM)
663 if len(aSeq) < motLength:
667 motPWM = self.motifPWM
668 revPWM = self.reversePWM
673 for index in range(motLength):
674 currentNT = theSeq[index]
676 forwardCons += log(odds(motPWM, currentNT, index, background), 2)
678 forwardCons += log(0.01, 2)
681 reverseCons += log(odds(revPWM, currentNT, index, background), 2)
683 reverseCons += log(0.01, 2)
685 bestCons += log(odds(motPWM,self.strictConsensus[index], index, background), 2)
687 return (forwardCons, reverseCons)
690 def bestLogOddsScore(self, background=[0.25, 0.25, 0.25, 0.25]):
691 """ calculates the best possible log-odds score using the PSFM given the markov(0) background.
693 motLength = len(self.motifPWM)
695 motPWM = self.motifPWM
697 for index in range(motLength):
698 bestLogOdds += log(odds(motPWM,self.strictConsensus[index], index, background), 2)
703 def locateConsensus(self, aSeq):
704 """ returns a list of positions on aSeq that match the consensus exactly.
706 cons = self.buildConsensus()
707 revComp = self.revComp()
708 motLength = len(cons)
710 if len(aSeq) < motLength:
716 seqLength = len(theSeq)
717 while pos <= (seqLength - motLength):
718 subSeq = theSeq[pos:pos + motLength].strip()
721 for index in range(motLength):
722 if not self.ntMatch(cons[index], subSeq[index]):
727 for index in range(motLength):
728 if not self.ntMatch(revComp[index], subSeq[index]):
732 print "chocked at pos %d" % pos
736 Position.append((pos, "F"))
738 elif revCompMot == 1:
739 Position.append((pos, "R"))
747 def compareConsensus(self, aSeq):
748 """ returns a sequence with nucleotide differences from consensus in lower case.
750 cons = self.buildConsensus()
751 revComp = self.revComp()
752 motLength = len(cons)
753 if len(aSeq) < motLength:
754 raise NameError, "Sequence too short"
763 for index in range(motLength):
764 if not self.ntMatch(cons[index], theSeq[index]):
766 forwardSeq += lower(theSeq[index])
768 forwardSeq += theSeq[index]
770 if not self.ntMatch(revComp[index], theSeq[index]):
771 backwardMismatch += 1
772 backwardSeq += lower(theSeq[index])
774 backwardSeq += theSeq[index]
776 if forwardMismatch <= backwardMismatch:
782 def scoreDiffPWM(self, compMotif):
783 """ returns a score scaled from 0 (no difference) to 2 (completely different) to
784 quantify the difference between the motif and another motif.
787 diffPWM = self.getDiffPWM(compMotif)
788 for pos in range(len(diffPWM)):
789 (adiff, cdiff, gdiff, tdiff) = diffPWM[pos]
790 score += abs(adiff) + abs(cdiff) + abs(gdiff) + abs(tdiff)
792 score /= float(len(diffPWM))
797 def getDiffPWM(self, compMotif):
798 """ subtracts the PWM of compMotif from existing PWM to compare differences.
799 Note that the comparison is only done on the length of the shorter motif.
802 compPWM = compMotif.getPWM()
803 numBasesToCompare = min(len(compMotif), len(self.motifPWM))
805 for pos in range(numBasesToCompare):
806 pwmCol = self.motifPWM[pos]
807 pwmColComp = compPWM[pos]
810 pwmEntry.append(pwmCol[NT] - pwmColComp[NT])
812 diffPWM.append(pwmEntry)
817 def initializeRE(self):
818 """ initializes Regular Expression motif engine.
822 cons = self.buildConsensus().upper()
823 revComp = self.revComp().upper()
831 reBackward += reMAP[NT]
833 reBackward = "ZZZZZZZ"
835 forwardRE = re.compile(reCons, re.IGNORECASE)
836 backwardRE = re.compile(reBackward, re.IGNORECASE)
839 def initializeMismatchRE(self, numMismatches):
840 """ initializes Regular Expression motif engine allowing for mismatches.
844 cons = self.seqMismatches(self.buildConsensus().upper(), numMismatches)
845 revComp = self.seqMismatches(self.revComp().upper(), numMismatches)
851 if self.revComp().upper() != self.buildConsensus().upper():
853 reBackward += reMAP[NT]
855 reBackward = "ZZZZZZZ"
857 forwardRE = re.compile(reCons, re.IGNORECASE)
858 backwardRE = re.compile(reBackward, re.IGNORECASE)
861 def locateConsensusRE(self, sequence):
862 """ Returns a list of positions on aSeq that match the consensus exactly.
863 Should be run after either initializeRE() or initializeMismatchRE(numMismatches)
865 motLength = len(self.motifPWM)
868 if len(sequence) < motLength:
871 forwardIter = forwardRE.finditer(sequence)
872 backwardIter = backwardRE.finditer(sequence)
873 for match in forwardIter:
874 position.append((match.start(), "F"))
876 for match in backwardIter:
877 position.append((match.start(), "R"))
879 positionLength = len(position)
880 if positionLength >= 1:
882 (prevPos, prevSense) = position[0]
883 results.append((prevPos, prevSense))
885 for index in range(1, positionLength):
886 (pos, sense) = position[index]
887 if pos >= prevPos + motLength:
888 results.append((pos, sense))
889 (pos, sense) = (prevPos, prevSense)
894 def locateStrictConsensus(self, aSeq, mismatches=0):
895 """ returns a list of positions on aSeq that match the strict
896 consensus within some mismatches.
897 Only available as a C-extension for greater speed-up for now.
899 forwardMer = self.buildStrictConsensus()
900 motLength = len(forwardMer)
901 revcompMer = complement(forwardMer, motLength)
902 if hasMotifExtension:
903 return _motif.locateMer(aSeq, forwardMer, revcompMer, mismatches)
905 print "only supported as part of the C extension for now"
909 def locateMotif(self, sequence, threshold=90.0, numberN=0):
910 """ returns a list of positions on aSeq that match the PWM within a Threshold,
911 given as a percentage of the optimal consensus score.
912 Will call C-extension for greater speed-up if compiled.
914 motifLength = len(self.motifPWM)
915 sequenceLength = len(sequence)
918 print "Threshold less than 50% - will abort locateMotif()"
921 maxScore = self.bestConsensusScore()
922 maxDiff = maxScore * (1 - threshold)
923 if sequenceLength < motifLength:
928 if hasMotifExtension:
929 return _motif.locateMotif(sequence, self.motifPWM, self.reversePWM, maxScore, maxDiff)
931 sequence = upper(sequence)
934 while position <= (sequenceLength - motifLength):
935 subSequence = sequence[position: position + motifLength]
936 if subSequence.count("N") > numberN:
940 (seqScore, revSeqScore) = self.scoreMotif(subSequence, maxDiff)
941 if seqScore >= revSeqScore and seqScore > 1.0:
942 positionList.append((position, "F"))
943 elif revSeqScore > 1.0:
944 positionList.append((position, "R"))
951 def locateMarkov1(self, sequence, maxFold=5.0):
952 """ returns a list of positions on sequence that match the Markov1 within maxFold.
954 motifLength = len(self.motifPWM)
955 sequenceLength = len(sequence)
957 print "maxFold less than 1.0 - will abort locateMarkov1()"
960 maxScore = self.bestMarkov1Score() * maxFold
961 if sequenceLength < motifLength:
966 if hasMotifExtension:
967 return _motif.locateMarkov1(sequence, self.motifMarkov1, self.revMarkov1, maxScore)
969 sequence = upper(sequence)
972 while position <= (sequenceLength - motifLength):
973 subSequence = sequence[position: position + motifLength]
974 if subSequence.count("N") > 0:
978 (seqScore, revSeqScore) = self.scoreMarkov1(subSequence, maxScore)
979 if seqScore <= revSeqScore and seqScore < maxScore:
980 positionList.append((position, "F"))
981 elif revSeqScore < maxScore:
982 positionList.append((position, "R"))
989 def calculatePWMfromSeqs(self):
990 """ calculate the PWM using a set of non-degenerate instances of the motif.
993 numSeqs = len(self.sequences)
998 # using length of first sequence as the length of the motif
999 length = len(self.sequences[0])
1000 for index in range(length):
1001 PWM.append([0.0, 0.0, 0.0, 0.0])
1003 for seq in self.sequences:
1005 theseq = seq.upper()
1007 PWM[index][matrixRow[NT]] += 1.0
1010 for index in range(length):
1011 for NT in ["A", "C", "G", "T"]:
1012 PWM[index][matrixRow[NT]] /= numSeqs
1015 self.buildReversePWM()
1016 self.motifSeq = self.buildConsensus()
1017 self.strictConsensus = self.buildStrictConsensus()
1020 def printMarkov1(self):
1021 """ print the Markov1 PSSM of the form previous NT -> current NT.
1024 for prior in range(4):
1025 row.append(["", "", "", ""])
1027 for pos in self.motifMarkov1:
1028 for prior in range(4):
1029 for current in range(4):
1030 row[prior][current] += str(round(pos[prior][current], 4)) + "\t"
1032 for prior in ["A", "C", "G", "T"]:
1033 for current in ["A", "C", "G", "T"]:
1035 print "%s -> %s\t%s\n" % (prior, current, row[matrixRow[prior]][matrixRow[current]])
1037 print "ERROR: %s %s" % (prior, current)
1041 def bestMarkov1Score(self):
1042 """ returns the best markov1 score possible.
1044 motLength = len(self.motifMarkov1)
1045 matchMarkov1 = self.probMatchMarkov1
1047 for index in range(motLength):
1048 col = self.motifMarkov1[index]
1050 for prior in ["A", "C", "G", "T"]:
1051 for current in ["A", "C", "G", "T"]:
1052 mRow.append((col[matrixRow[prior]][matrixRow[current]], prior, current))
1056 currentProb = matchMarkov1(self.motifMarkov1, "N", mRow[-1][2], index)
1058 currentProb = matchMarkov1(self.motifMarkov1, mRow[-1][1], mRow[-1][2], index)
1060 if currentProb < 0.0001:
1061 currentProb = 0.0001
1063 if currentProb > 0.0:
1064 score -= log(currentProb,2.0)
1069 def worstMarkov1Score(self):
1070 """ returns the worst markov1 score possible.
1072 motLength = len(self.motifMarkov1)
1073 currentProb = 0.0001
1074 score = -log(currentProb, 2.0) * (motLength - 1)
1079 def calculateMarkov1(self, pseudoCount=1.0):
1080 """ calculate the Markov1 PSSM using a set of non-degenerate instances of the motif.
1081 adds a pseudoCount for unseen combinations.
1083 self.motifMarkov1 = []
1084 numSeqs = len(self.sequences) + pseudoCount
1089 # using length of first sequence as the length of the motif
1090 length = len(self.sequences[0])
1092 for index in range(length):
1093 self.motifMarkov1.append([[pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1094 [pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1095 [pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1096 [pseudoCount, pseudoCount, pseudoCount, pseudoCount]])
1098 for seq in self.sequences:
1099 theseq = seq.upper()
1104 for priorNT in range(4):
1105 self.motifMarkov1[index][priorNT][matrixRow[pos]] += 0.25
1107 self.motifMarkov1[index][prior][matrixRow[pos]] += 1.0
1109 prior = matrixRow[pos]
1112 for index in range(length):
1113 for prior in range(4):
1114 for current in range(4):
1115 self.motifMarkov1[index][prior][current] /= numSeqs
1117 self.buildReverseMarkov1(pseudoCount)
1120 def buildReverseMarkov1(self, pseudoCount=1.0):
1121 """ calculate the Reverse Markov1 PSSM using a set of non-degenerate instances of the motif.
1123 self.revMarkov1 = []
1124 numSeqs = len(self.sequences) + pseudoCount
1129 # using length of first sequence as the length of the motif
1130 length = len(self.sequences[0])
1131 for index in range(length):
1132 self.revMarkov1.append([[pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1133 [pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1134 [pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1135 [pseudoCount, pseudoCount, pseudoCount, pseudoCount]])
1137 for aSeq in self.sequences:
1138 seq = complement(aSeq.upper(), length)
1143 for priorNT in range(4):
1144 self.revMarkov1[index][priorNT][matrixRow[pos]] += 0.25
1146 self.revMarkov1[index][prior][matrixRow[pos]] += 1.0
1148 prior = matrixRow[pos]
1151 for index in range(length):
1152 for prior in range(4):
1153 for current in range(4):
1154 self.revMarkov1[index][prior][current] /= numSeqs
1157 def scoreMarkov1(self, aSeq, maxScore=10000000.):
1158 """ calculate the matching score using the Markov1.
1159 limit search if given a low maxScore
1161 motLength = len(self.motifMarkov1)
1162 matchMarkov1 = self.probMatchMarkov1
1164 if len(aSeq) < motLength:
1167 theSeq = upper(aSeq)
1172 subSeq = theSeq[pos: pos + motLength]
1174 for index in range(motLength):
1175 currentNT = subSeq[index]
1176 currentProb = matchMarkov1(self.motifMarkov1, previousNT, currentNT, index)
1178 if currentProb < 0.0001:
1179 currentProb = 0.0001
1181 if currentProb > 0.0:
1182 seqProb -= log(currentProb,2.0)
1184 revCurrentProb = matchMarkov1(self.revMarkov1, previousNT, currentNT, index)
1185 if revCurrentProb < 0.002:
1186 revCurrentProb = 0.002
1188 if revCurrentProb > 0.0:
1189 revSeqProb -= log(revCurrentProb, 2.0)
1191 if seqProb > maxScore and revSeqProb > maxScore:
1192 return (seqProb, revSeqProb)
1194 previousNT = currentNT
1196 return (seqProb, revSeqProb)
1199 def probMatchMarkov1(self, theMarkov1, previousNT, NT, colIndex):
1200 """ returns the likelihood of seeing NT given previousNT at this position of the motif.
1203 if NT in ["A", "C", "G", "T"]:
1204 currentNT = matrixRow[NT]
1209 prevNT = matrixRow[previousNT]
1211 for index in range(4):
1212 currentProb += theMarkov1[colIndex][index][currentNT]
1216 if NT in ["A", "C", "G", "T"]:
1217 return theMarkov1[colIndex][prevNT][currentNT]
1222 for motifNucleotide in ["A", "T", "C", "G"]:
1223 if NT in motifDict[motifNucleotide]:
1225 currentProb += theMarkov1[colIndex][prevNT][row]
1230 def isSane(self, minLen=7, stretchLen=6):
1231 """ check for motif sanity, which includes: minimum length, less than half N's in consensus,
1232 motifs include more than two nucleotides, no nucleotide or dinucleotide is repeated more
1233 than stretchlen. The appropriate exceptions are made for 'GC' dinucleotides.
1235 stretchLen = int(stretchLen)
1236 minLen = int(minLen)
1237 stretchLen = min(stretchLen, minLen - 1)
1239 cons = self.buildConsensus()
1240 motifLen = float(len(cons))
1241 if motifLen < minLen:
1244 nCount = cons.count("N")
1245 if (nCount >= 0.5 * motifLen):
1248 aCount = cons.count("A")
1249 gCount = cons.count("G")
1250 cCount = cons.count("C")
1251 tCount = cons.count("T")
1253 atCount = aCount + tCount
1254 agCount = aCount + gCount
1255 acCount = aCount + cCount
1256 gtCount = gCount + tCount
1257 tcCount = tCount + cCount
1259 for pairedCount in [atCount, agCount, acCount, gtCount, tcCount]:
1260 if pairedCount == motifLen:
1263 cons = self.buildStrictConsensus()
1264 repeatSequences = []
1265 for nucleotide in ["A", "G", "C", "T"]:
1266 repeatSequences.append(nucleotide * stretchLen)
1268 if stretchLen % 2 != 0:
1271 repeatCount = stretchLen/2
1272 for dinucleotide in ["AG", "AC", "AT", "CT", "GT"]:
1273 repeatSequences.append(dinucleotide * repeatCount)
1275 for testSequence in repeatSequences:
1276 if cons.count(testSequence):
1282 def correlateMotifs(actualMotifA, actualMotifB, maxSlide=1):
1283 """ Compares two motifs using the "pearson correlation coefficient-like" MSS.
1284 Will slide a motif up to maxSlide bases compared to the other,
1285 and reports the best score.
1289 if len(actualMotifA) < len(actualMotifB):
1296 motApwm = motA.getPWM()
1297 motBpwm = motB.getPWM()
1298 motCpwm = motB.buildReversePWM()
1299 if hasMotifExtension:
1300 return _motif.correlateMotifs(motApwm, motBpwm, motCpwm, maxSlide)
1303 padLength = length - len(motB)
1304 Ncol = [symbolToMatrix["N"]]
1305 for slide in range(-1 * maxSlide, maxSlide + padLength + 1):
1306 pwmA = deepcopy(motApwm)
1307 pwmB = deepcopy(motBpwm)
1308 pwmC = deepcopy(motCpwm)
1310 pwmA = Ncol * abs(slide) + pwmA
1311 pwmB = pwmB + Ncol * (abs(slide) + padLength)
1312 pwmC = pwmC + Ncol * (abs(slide) + padLength)
1313 elif slide > 0 and slide <= maxSlide:
1315 if padLength >= slide:
1316 adjustedPadLength = padLength - slide
1319 adjustedPadLength = 0
1320 adjustedSlide = slide - padLength
1322 pwmA = pwmA + Ncol * adjustedSlide
1323 pwmB = Ncol * slide + pwmB + Ncol * adjustedPadLength
1324 pwmC = Ncol * slide + pwmC + Ncol * adjustedPadLength
1326 pwmA = pwmA + Ncol * slide
1327 pwmB = Ncol * slide + pwmB
1328 pwmC = Ncol * slide + pwmC
1329 elif slide > maxSlide:
1330 maxDiff = slide - maxSlide
1331 pwmA = pwmA + Ncol * maxSlide
1332 pwmB = Ncol * slide + pwmB + Ncol * (padLength - maxDiff)
1333 pwmC = Ncol * slide + pwmC + Ncol * (padLength - maxDiff)
1335 pwmB = pwmB + Ncol * padLength
1336 pwmC = pwmC + Ncol * padLength
1340 thisLength = len(pwmA)
1341 for index in range(thisLength):
1342 score1 += pearsonCorrelation(pwmA[index], pwmB[index])
1343 score2 += pearsonCorrelation(pwmA[index], pwmC[index])
1345 score1 = score1 / float(thisLength)
1346 score2 = score2 / float(thisLength)
1347 if score1 < score2 and score2 > bestScore:
1349 elif score1 > bestScore:
1355 def MSS(motifA, motifB, maxSlide=1):
1356 """ Compares two motifs using the motif similarity score (MSS).
1357 Will slide a motif up to maxSlide bases compared to the other,
1358 and reports the best score. Wrapper around correlateMotifs()
1360 return correlateMotifs(motifA, motifB, maxSlide)
1363 def printMSS(motifList, maxSlide=1):
1364 """ Prints a matrix of MSS comparisons between different motifs
1367 for mot1 in motifList:
1369 for mot2 in motifList:
1370 val = "\t%1.2f" % correlateMotifs(mot1, mot2, maxSlide)