erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / core / motif.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # motif.py - defines the motif object and its methods in cistematic
31 from string import upper, lower
32 from math import log, exp
33 from copy import deepcopy
34 from cistematic.core import complement
35 from cistematic.cisstat.score import pearsonCorrelation
36 import re, os, tempfile
37
38 if os.environ.get("CISTEMATIC_ROOT"):
39     cisRoot = os.environ.get("CISTEMATIC_ROOT") 
40 else:
41     cisRoot = "/proj/genome"
42
43 if os.environ.get("CISTEMATIC_TEMP"):
44     cisTemp = os.environ.get("CISTEMATIC_TEMP")
45 else:
46     cisTemp = "/tmp"
47
48 tempfile.tempdir = cisTemp
49
50 try:
51     import cistematic.core._motif as _motif
52     hasMotifExtension = True
53 except:
54     hasMotifExtension = False
55
56 matrixRow = {"A": 0,
57              "C": 1,
58              "G": 2,
59              "T": 3
60 }
61
62 symbolToMatrix = {"A": [1.0, 0.0, 0.0, 0.0],
63                   "C": [0.0, 1.0, 0.0, 0.0],
64                   "G": [0.0, 0.0, 1.0, 0.0],
65                   "T": [0.0, 0.0, 0.0, 1.0],
66                   "W": [0.5, 0.0, 0.0, 0.5],
67                   "S": [0.0, 0.5, 0.5, 0.0],
68                   "R": [0.5, 0.0, 0.5, 0.0],
69                   "Y": [0.0, 0.5, 0.0, 0.5],
70                   "M": [0.5, 0.5, 0.0, 0.0],
71                   "K": [0.0, 0.0, 0.5, 0.5],
72                   "H": [0.33, 0.33, 0.0, 0.34],
73                   "B": [0.0, 0.33, 0.33, 0.34],
74                   "V": [0.33, 0.33, 0.34, 0.0],
75                   "D": [0.33, 0.0, 0.33, 0.34],
76                   "N": [0.25, 0.25, 0.25, 0.25]
77 }
78
79 reMAP = {"A": "A",
80          "C": "C",
81          "G": "G",
82          "T": "T",
83          "N": "[ACGT]",
84          "W": "[AT]",
85          "S": "[GC]",
86          "R": "[AG]",
87          "Y": "[CT]",
88          "M": "[AC]",
89          "K": "[GT]",
90          "H": "[ACT]",
91          "B": "[CGT]",
92          "V": "[ACG]",
93          "D": "[AGT]",
94          "|": "I"
95 }
96
97 motifDict = {"A": ["W", "R", "M", "H", "V", "D"],
98              "T": ["W", "Y", "K", "H", "B", "D"],
99              "C": ["S", "Y", "M", "H", "B", "V"],
100              "G": ["S", "R", "K", "B", "V", "D"]
101 }
102
103
104 class Motif:
105     """ The Motif class is the heart of cistematic. It captures both the consensus, 
106         PWM, and Markov(1) versions of the motif, as well as provides methods for scanning 
107         sequences for matches.
108     """
109     motifSeq = ""
110     motifPWM = []
111     reversePWM = []
112     motifMarkov1 = []
113     revMarkov1 = []
114     tagID = ""
115     sequences = []
116     annotations = []
117     strictConsensus = ""
118     threshold = 1.0
119     info = ""
120
121
122     def __init__(self, tagID="", motif="", PWM=[], seqs=[], thresh=0.0, info="", motifFile="", seqfile=""):
123         """ initialize a Motif object either with (A) a tagID identifier and either of 
124             (i) a IUPAC consensus or (ii) a PWM or (iii) example sequences, or (B) from 
125             a motif file in Cistematic motif format.
126         """        
127         fileTagID = ""
128         if motifFile != "":
129             (fileTagID, motif, PWM, seqs, thresh, info) = self.readMotif(motifFile)
130
131         if len(tagID) > 0:
132             self.setTagID(tagID)
133         elif len(fileTagID) > 0:
134             self.setTagID(fileTagID)
135         else:
136             self.setTagID("default")
137
138         if motif <> "":
139             self.setMotif(motif)
140
141         # PWM overrules the motif
142         if len(PWM) > 1:
143             self.setPWM(PWM)
144
145         # a seqfile can be used to create a list of sequences
146         if len(seqfile) > 0:
147             seqs = self.readSeqFile(seqfile)
148
149         # Sequences overrules the PWM and motif
150         if len(seqs) > 1:
151             self.setSequences(seqs)
152             self.setThreshold(len(seqs[0]))
153
154         self.setThreshold(float(thresh))
155         if len(info) > 0:
156             self.setInfo(info)
157
158
159     def __len__(self):
160         """ returns the length of the motif 
161         """
162         return len(self.motifPWM)
163
164
165     def readMotif(self, motifFile):
166         """ read motif in cistmeatic motif format to initialize motif instance.
167         """
168         motif = ""
169         PWM = []
170         seqs = []
171         threshold = 0.0
172         info = ""
173         infile = open(motifFile, "r")
174         for line in infile:
175             if len(line) < 4 or line[0] == "#":
176                 continue
177
178             fields = line.strip().split("\t")
179             fieldType = fields[0].lower()
180             if fieldType not in ["tagid", "motif", "acgt", "sequence", "threshold", "info"]:
181                 print "could not process line %s" % line
182                 continue
183
184             if len(fields) < 2:
185                 continue
186
187             if fieldType == "motif":
188                 motif = fields[1]
189             elif fieldType == "acgt":
190                 PWM.append([float(fields[1]), float(fields[2]), float(fields[3]), float(fields[4])])
191             elif fieldType == "sequence":
192                 seqs.append(fields[1])
193             elif fieldType == "threshold":
194                 threshold = float(fields[1])
195             elif fieldType == "info":
196                 info = fields[1].strip()
197             elif fieldType == "tagid":
198                 tagID = fields[1]
199
200         infile.close()
201
202         return (tagID, motif, PWM, seqs, threshold, info)
203
204
205     def readSeqFile(self, seqFile):
206         """ read sequences from a sequence file.
207         """
208         seqs = []
209         infile = open(seqFile, "r")
210         for line in infile:
211             seqs.append(line.strip().upper())
212
213         infile.close()
214
215         return seqs
216
217
218     def saveMotif(self, motFile):
219         """ Save motif in cistematic motif format.
220         """
221         outfile = open(motFile, "w")
222         outfile.write("tagid\t%s\n" % self.tagID)
223         outfile.write("info\t%s\n" % self.info.strip())
224         outfile.write("threshold\t%s\n" % str(self.threshold))
225         outfile.write("motif\t%s\n" % self.buildConsensus())
226         for col in self.motifPWM:
227             outfile.write("acgt\t%f\t%f\t%f\t%f\n" % (col[0], col[1], col[2], col[3]))
228
229         for seq in self.sequences:
230             outfile.write("sequence\t%s\n" % seq)
231
232         outfile.close()
233
234
235     def setTagID(self, tag):
236         """ set motif identifier.
237         """
238         self.tagID = tag
239
240
241     def setInfo(self, info):
242         """ set motif info string.
243         """
244         self.info = info
245
246
247     def setThreshold(self, threshold):
248         """ set a pre-defined threshold or the highest-possible threshold, otherwise.
249         """
250         (sForward, sBackward) = self.scoreMotif(self.buildStrictConsensus())
251         if sForward > sBackward:
252             self.threshold = sForward
253         else:
254             self.threshold = sBackward
255
256         if threshold < self.threshold and threshold > 0:
257             self.threshold = threshold
258
259
260     def setMotif(self, motif):
261         """ set the motif PWM using a IUPAC consensus. 
262         """
263         self.motifSeq = motif
264         self.motifPWM = self.buildPWM(motif)
265         self.buildReversePWM()
266         self.strictConsensus = self.buildStrictConsensus()
267
268
269     def setSequences(self, seqs):
270         """ set the founder sequences for the motif and recalculate PWM with them. 
271         """
272         self.sequences = seqs
273         self.calculatePWMfromSeqs()
274         self.calculateMarkov1()
275
276
277     def setPWM(self, PWM):
278         """ set the PWM for the motif and calculate consensus.
279         """
280         self.motifPWM = PWM
281         self.buildReversePWM()
282         self.motifSeq = self.buildConsensus()
283         self.strictConsensus = self.buildStrictConsensus()
284
285
286     def appendToPWM(self, col):
287         """ add a column to the PWM for the motif 
288         """
289         self.motifPWM.append(col)
290
291
292     def buildPWM(self, motif):
293         """ returns the PWM for the provided consensus motif. 
294         """
295         PWM = []
296         for letter in upper(motif):
297             PWM.append(symbolToMatrix[letter])
298
299         return PWM
300
301
302     def buildReversePWM(self):
303         """ returns the reverse PWM for the motif 
304         """
305         theRevPWM = []
306         tempPWM = deepcopy(self.motifPWM)
307         tempPWM.reverse()
308         for col in tempPWM:
309             theRevPWM.append([col[matrixRow["T"]],  col[matrixRow["G"]], col[matrixRow["C"]], col[matrixRow["A"]]])
310
311         self.reversePWM = deepcopy(theRevPWM)
312
313         return self.reversePWM
314
315
316     def getPWM(self):
317         """ returns the PWM for the motif 
318         """
319         return self.motifPWM
320
321
322     def printPWM(self):
323         """ print the PWM and the consensus
324         """
325         aRow = ""
326         cRow = ""
327         gRow = ""
328         tRow = ""
329         cons = self.buildConsensus()
330         consLine = "Cons:"
331         for NT in cons:
332             consLine += "\t"
333             consLine += NT
334
335         for col in self.motifPWM:
336             aRow = "%s%s\t" % (aRow, str(round(col[matrixRow["A"]],4)))
337             cRow = "%s%s\t" % (cRow, str(round(col[matrixRow["C"]],4)))
338             gRow = "%s%s\t" % (gRow, str(round(col[matrixRow["G"]],4)))
339             tRow = "%s%s\t" % (tRow, str(round(col[matrixRow["T"]],4)))
340
341         print "A:\t%s\nC:\t%s\nG:\t%s\nT:\t%s\n" % (aRow, cRow, gRow, tRow)
342         print "%s\n" % consLine
343
344
345     def saveLogo(self, outfilename, height=-1, width=-1):
346         """ saves a logo version of the motif as outfilename (assumes has .png)
347             if the motif is built from sequences.
348             will fail if weblogo 2.8.2 package is not installed in correct place.
349         """
350         logoPath = "%s/programs/weblogo/seqlogo" % cisRoot
351         if outfilename[-4:] in [".png", ".PNG"]:
352             outfilename = outfilename[:-4]
353
354         if len(self.sequences) < 1:
355             print "cannot run logo representation without founder sequences"
356         else:
357             if True:
358                 seqfilename = "%s.tmp" % tempfile.mktemp()
359                 seqfile = open(seqfilename, "w")
360                 for sequence in self.sequences:
361                     seqfile.write("%s\n" % sequence)
362
363                 seqfile.flush()
364                 dimensions = ""
365                 if height > 0:
366                     dimensions += "-h %d " % height
367                 if width > 0:
368                     dimensions +=  "-w %d " % width
369                 else:
370                     dimensions += "-w %d " % len(self.motifPWM)
371
372                 cmd = logoPath + " -f " + seqfilename + " -F PNG -c " + dimensions + "-a -Y -n -M -k 1 -o " + outfilename
373                 contents = os.system(cmd)
374                 seqfile.close()
375                 os.remove(seqfilename)
376             else:
377                 print "failed to make logo: expecting weblogo 2.8.2 package in %s" % logoPath
378                 print "also check if ghostscript package is correctly installed."
379
380
381     def getSymbol(self, col):
382         """ helper function for buildConsensus()
383         """
384         for NT in ["A", "C", "G", "T"]:
385             row = matrixRow[NT]
386             if col[row] > 0.9:
387                 return NT
388
389         aColValue = col[matrixRow["A"]]
390         cColValue = col[matrixRow["C"]]
391         gColValue = col[matrixRow["G"]]
392         tColValue = col[matrixRow["T"]]
393
394         dualsList = [("R", aColValue + gColValue),
395                      ("Y", tColValue + cColValue),
396                      ("W", aColValue + tColValue),
397                      ("S", cColValue + gColValue),
398                      ("M", aColValue + cColValue),
399                      ("K", tColValue + gColValue)
400         ]
401
402         bestDual = self.getBestSymbol(dualsList)
403         if bestDual[1] > 0.9:
404             return bestDual[0]
405
406         trioList = [("B", cColValue + gColValue + tColValue),
407                     ("D", aColValue + gColValue + tColValue),
408                     ("H", aColValue + cColValue + tColValue),
409                     ("V", aColValue + cColValue + gColValue)
410         ]
411
412         bestTrio = self.getBestSymbol(trioList)
413         if bestTrio[1] > 0.9:
414             return bestTrio[0]
415
416         return "N"
417
418
419     def getBestSymbol(self, symbolProbabilityList):
420         bestSymbol = symbolProbabilityList[0]
421         for symbol in symbolProbabilityList[1:]:
422             if symbol[1] > bestSymbol[1]:
423                 bestSymbol = symbol
424
425         return bestSymbol
426
427
428     def buildConsensus(self):
429         """ returns the best consensus using the IUPAC alphabet.
430         """
431         consensus = ""
432         for col in self.motifPWM:
433             consensus += self.getSymbol(col)
434
435         return consensus
436
437
438     def buildStrictConsensus(self):
439         """ returns the best consensus using solely nucleotides.
440         """
441         consensus = ""
442         for col in self.motifPWM:
443             mRow = []
444             for nt in ("A", "C", "G", "T"):
445                 mRow.append((col[matrixRow[nt]], nt))
446
447             mRow.sort()
448             consensus += mRow[3][1]
449
450         return consensus
451
452
453     def bestConsensusScore(self):
454         """ returns the best consensus score possible.
455         """
456         score = 0.0
457         for col in self.motifPWM:
458             mRow = []
459             for nt in ["A", "C", "G", "T"]:
460                 mRow.append((col[matrixRow[nt]], nt))
461
462             mRow.sort()
463             score += mRow[3][0]
464
465         return score
466
467
468     def expected(self, length, background=[0.25, 0.25, 0.25, 0.25], numMismatches=0):
469         """ returns the expected number of matches to the consensus in a sequence of a given length and background. 
470         """
471         expectedNum = length * self.consensusProbability(background, numMismatches)
472         return expectedNum
473
474
475     def consensusProbability(self, background=[0.25, 0.25, 0.25, 0.25], numMismatches=0):
476         """ returns the probability of the consensus given the background.
477         """
478         prob = 0
479         motifs = []
480         if numMismatches> 0:
481             seqs = self.seqMismatches(self.buildConsensus().upper(), numMismatches)
482             motifs = seqs.split("|")
483         else:
484             motifs.append(self.buildConsensus())
485
486         for theCons in motifs:
487             motProb = 0
488             for NT in theCons:
489                 currentProb = 0.0
490                 if NT in ("A", "W", "R", "M", "H", "V", "D", "N"):
491                     currentProb += background[matrixRow["A"]]
492
493                 if NT in ("C", "S", "Y", "M", "H", "B", "V", "N"):
494                     currentProb += background[matrixRow["C"]]
495
496                 if NT in ("G", "S", "R", "K", "B", "V", "D", "N"):
497                     currentProb += background[matrixRow["G"]]
498
499                 if NT in ("T", "W", "Y", "K", "H", "B", "D", "N"):
500                     currentProb += background[matrixRow["T"]]
501
502                 motProb = motProb + log(currentProb)
503
504             prob += exp(motProb)
505
506         return prob
507
508
509     def pwmProbability(self, background):
510         """ returns probability of the PWM.
511         """
512         prob = 1.0
513         for row in self.motifPWM:
514             currentProb = 0.0
515             for NT in ["A", "C", "G", "T"]:
516                 currentProb += row[matrixRow[NT]] * background[matrixRow[NT]]
517
518             prob = prob * currentProb
519
520         return prob
521
522
523     def revComp(self):
524         """ returns the reverse complement of the consensus of this motif.
525         """
526         return complement(self.buildConsensus(), len(self.motifPWM))
527
528
529     def numberOfN(self):
530         """ returns numbers of effective Ns in motif.
531         """
532         index = 0
533         for col in self.motifPWM:
534             if self.getSymbol(col) == "N":
535                 index += 1
536
537         return index
538
539
540     def buildMismatchSeq(self, rootSeq, tailSeq, numMismatches):
541         """ helper function called from seqMismatches().
542         """
543         finalSeq = ""
544         tailLen = len(tailSeq)
545         if tailLen < 1 or numMismatches < 1:
546             return rootSeq + tailSeq
547
548         for pos in range(tailLen - numMismatches + 1):
549             newRootSeq = rootSeq
550             newRootSeq += tailSeq[:pos]
551             newRootSeq += "N"
552             finalSeq += self.buildMismatchSeq(newRootSeq, tailSeq[pos + 1:], numMismatches - 1)
553             finalSeq += "|"
554
555         return finalSeq[:-1]
556
557
558     def seqMismatches(self, seq, numMismatches):
559         """ Returns list of sequences that will be used by initializeMismatchRE().
560         """
561         return self.buildMismatchSeq("", seq, numMismatches)
562
563
564     def probMatchPWM(self, PWM, NT, colIndex):
565         """ returns the probability of seeing that particular nucleotide according to the PSFM.
566         """
567
568         if NT in ["A", "T", "C", "G"]:
569             row = matrixRow[NT]
570             return PWM[colIndex][row]
571
572         if NT == "N":
573             return 1.0
574
575         currentProb = 0.0
576         for motifNucleotide in ["A", "T", "C", "G"]:
577             if NT in motifDict[motifNucleotide]:
578                 row = matrixRow[NT]
579                 currentProb += PWM[colIndex][row]
580
581         return currentProb
582
583
584     def psfmOdds(self, PWM, NT, colIndex, background=[0.25, 0.25, 0.25, 0.25]):
585         """ calculates the odds of nucleotide NT coming from position colIndex in thePWM
586             as opposed to coming from the background.
587         """
588
589         currentProb = self.probMatchPWM(PWM, NT, colIndex)
590         backgroundProb = self.getBackgroundProbability(self, NT, background)
591
592         try:
593             odds = currentProb / backgroundProb
594         except ZeroDivisionError:
595             odds = 1.0
596
597         return odds
598
599
600     def getBackgroundProbability(self, NT, background=[0.25, 0.25, 0.25, 0.25]):
601
602         if NT in ["A", "T", "C", "G"]:
603             row = matrixRow[NT]
604             return background[row]
605
606         if NT == "N":
607             return 1.0
608
609         backgroundProb = 0.0
610         for motifNucleotide in ["A", "T", "C", "G"]:
611             if NT in motifDict[motifNucleotide]:
612                 row = matrixRow[NT]
613                 backgroundProb += background[row]
614
615         return backgroundProb
616
617
618     def ntMatch(self, motifNT, seqNT):
619         """ returns True if seqNT matches motifNT.
620         """
621         if motifNT == seqNT:
622             return True
623
624         if seqNT == "N" or motifNT == "N":
625             return True
626
627         if motifNT in motifDict[seqNT]:
628             return True
629
630         return False
631
632
633     def scoreMotif(self, aSeq, diff=1000):
634         """ calculates the consensus score using the PSFM
635         """
636         motLength = len(self.motifPWM)
637         if len(aSeq) < motLength:
638             return (0.0, 0.0)
639
640         matchPWM = self.probMatchPWM
641         motPWM = self.motifPWM
642         revPWM = self.reversePWM
643         theSeq = upper(aSeq)
644         forwardCons = 0.0
645         reverseCons = 0.0
646         bestCons = 0.0
647         for index in range(motLength):
648             currentNT = theSeq[index]
649             forwardCons += matchPWM(motPWM, currentNT, index)
650             reverseCons += matchPWM(revPWM, currentNT, index)
651             bestCons += matchPWM(motPWM,self.strictConsensus[index], index)
652
653         if (forwardCons + diff) < bestCons and (reverseCons + diff) < bestCons:
654             return (-1, -1)
655
656         return (forwardCons, reverseCons)
657
658
659     def scoreMotifLogOdds(self, aSeq, background=[0.25, 0.25, 0.25, 0.25]):
660         """ calculates the log-odds score using the PSFM given the markov(0) background.
661         """
662         motLength = len(self.motifPWM)
663         if len(aSeq) < motLength:
664             return (0.0, 0.0)
665
666         odds = self.psfmOdds
667         motPWM = self.motifPWM
668         revPWM = self.reversePWM
669         theSeq = upper(aSeq)
670         forwardCons = 0.0
671         reverseCons = 0.0
672         bestCons = 0.0
673         for index in range(motLength):
674             currentNT = theSeq[index]
675             try:
676                 forwardCons += log(odds(motPWM, currentNT, index, background), 2)
677             except:
678                 forwardCons += log(0.01, 2)
679
680             try:
681                 reverseCons += log(odds(revPWM, currentNT, index, background), 2)
682             except:
683                 reverseCons += log(0.01, 2)
684
685             bestCons += log(odds(motPWM,self.strictConsensus[index], index, background), 2)
686
687         return (forwardCons, reverseCons)
688
689
690     def bestLogOddsScore(self, background=[0.25, 0.25, 0.25, 0.25]):
691         """ calculates the best possible log-odds score using the PSFM given the markov(0) background.
692         """
693         motLength = len(self.motifPWM)
694         odds = self.psfmOdds
695         motPWM = self.motifPWM
696         bestLogOdds = 0.0
697         for index in range(motLength):
698             bestLogOdds += log(odds(motPWM,self.strictConsensus[index], index, background), 2)
699
700         return bestLogOdds
701
702
703     def locateConsensus(self, aSeq):
704         """ returns a list of positions on aSeq that match the consensus exactly.
705         """
706         cons = self.buildConsensus()
707         revComp = self.revComp()
708         motLength = len(cons)
709         Position = []
710         if len(aSeq) < motLength:
711             return []
712         else: 
713             theSeq = upper(aSeq)
714
715         pos = 0
716         seqLength = len(theSeq)
717         while pos <= (seqLength - motLength):
718             subSeq = theSeq[pos:pos + motLength].strip()
719             try:
720                 forwardMot = 1
721                 for index in range(motLength):
722                     if not self.ntMatch(cons[index], subSeq[index]):
723                         forwardMot = 0
724                         break
725
726                 revCompMot = 1
727                 for index in range(motLength):
728                     if not self.ntMatch(revComp[index], subSeq[index]):
729                         revCompMot = 0
730                         break
731             except:
732                 print "chocked at pos %d" % pos
733                 forwardMot = 0
734
735             if forwardMot == 1:
736                 Position.append((pos, "F"))
737                 pos += motLength
738             elif revCompMot == 1:
739                 Position.append((pos, "R"))
740                 pos += motLength
741             else:
742                 pos +=1
743
744         return Position
745
746
747     def compareConsensus(self, aSeq):
748         """ returns a sequence with nucleotide differences from consensus in lower case.
749         """
750         cons = self.buildConsensus()
751         revComp = self.revComp()
752         motLength = len(cons)        
753         if len(aSeq) < motLength:
754             raise NameError, "Sequence too short"
755         else: 
756             theSeq = upper(aSeq)
757
758         forwardMismatch = 0
759         backwardMismatch = 0
760         forwardSeq = ""
761         backwardSeq = ""
762
763         for index in range(motLength):
764             if not self.ntMatch(cons[index], theSeq[index]):
765                 forwardMismatch += 1
766                 forwardSeq += lower(theSeq[index])
767             else:
768                 forwardSeq += theSeq[index]
769
770             if not self.ntMatch(revComp[index], theSeq[index]):
771                 backwardMismatch += 1
772                 backwardSeq += lower(theSeq[index])
773             else:
774                 backwardSeq += theSeq[index]
775
776         if forwardMismatch <= backwardMismatch:
777             return forwardSeq
778         else:
779             return backwardSeq
780
781
782     def scoreDiffPWM(self, compMotif):
783         """ returns a score scaled from 0 (no difference) to 2 (completely different) to  
784             quantify the difference between the motif and another motif.
785         """
786         score = 0.0
787         diffPWM = self.getDiffPWM(compMotif)
788         for pos in range(len(diffPWM)):
789             (adiff, cdiff, gdiff, tdiff) = diffPWM[pos]
790             score += abs(adiff) + abs(cdiff) + abs(gdiff) + abs(tdiff)
791
792         score /= float(len(diffPWM))
793
794         return score
795
796
797     def getDiffPWM(self, compMotif):
798         """ subtracts the PWM of compMotif from existing PWM to compare differences. 
799             Note that the comparison is only done on the length of the shorter motif.
800         """
801         diffPWM = []
802         compPWM = compMotif.getPWM()
803         numBasesToCompare = min(len(compMotif), len(self.motifPWM))
804
805         for pos in range(numBasesToCompare):
806             pwmCol = self.motifPWM[pos]
807             pwmColComp = compPWM[pos]
808             pwmEntry = []
809             for NT in range(4):
810                 pwmEntry.append(pwmCol[NT] - pwmColComp[NT])
811
812             diffPWM.append(pwmEntry)
813
814         return diffPWM
815
816
817     def initializeRE(self):
818         """ initializes Regular Expression motif engine.
819         """
820         global forwardRE
821         global backwardRE
822         cons = self.buildConsensus().upper()
823         revComp = self.revComp().upper()
824         reCons = ""
825         reBackward = ""
826         for NT in cons:
827             reCons += reMAP[NT]
828
829         if revComp != cons:
830             for NT in revComp:
831                 reBackward += reMAP[NT]
832         else:
833             reBackward = "ZZZZZZZ"
834
835         forwardRE = re.compile(reCons, re.IGNORECASE)
836         backwardRE = re.compile(reBackward, re.IGNORECASE)
837
838
839     def initializeMismatchRE(self, numMismatches):
840         """ initializes Regular Expression motif engine allowing for mismatches.
841         """
842         global forwardRE
843         global backwardRE
844         cons = self.seqMismatches(self.buildConsensus().upper(), numMismatches)
845         revComp = self.seqMismatches(self.revComp().upper(), numMismatches)
846         reCons = ""
847         reBackward = ""
848         for NT in cons:
849             reCons += reMAP[NT]
850
851         if self.revComp().upper() != self.buildConsensus().upper():
852             for NT in revComp:
853                 reBackward += reMAP[NT]
854         else:
855             reBackward = "ZZZZZZZ"
856
857         forwardRE = re.compile(reCons, re.IGNORECASE)
858         backwardRE = re.compile(reBackward, re.IGNORECASE)
859
860
861     def locateConsensusRE(self, sequence):
862         """ Returns a list of positions on aSeq that match the consensus exactly. 
863             Should be run after either initializeRE() or initializeMismatchRE(numMismatches)
864         """
865         motLength = len(self.motifPWM)
866         position = []
867         results = []
868         if len(sequence) < motLength:
869             return []
870
871         forwardIter = forwardRE.finditer(sequence)
872         backwardIter = backwardRE.finditer(sequence)
873         for match in forwardIter:
874             position.append((match.start(), "F"))
875
876         for match in backwardIter:
877             position.append((match.start(), "R"))
878
879         positionLength = len(position)
880         if positionLength >= 1:
881             position.sort()
882             (prevPos, prevSense) = position[0]
883             results.append((prevPos, prevSense))
884
885             for index in range(1, positionLength):
886                 (pos, sense) = position[index]
887                 if pos >= prevPos + motLength:
888                     results.append((pos, sense))
889                     (pos, sense) = (prevPos, prevSense)
890
891         return results
892
893
894     def locateStrictConsensus(self, aSeq, mismatches=0):
895         """ returns a list of positions on aSeq that match the strict 
896             consensus within some mismatches.
897             Only available as a C-extension for greater speed-up for now.
898         """
899         forwardMer = self.buildStrictConsensus()
900         motLength = len(forwardMer)
901         revcompMer = complement(forwardMer, motLength)
902         if hasMotifExtension:
903             return  _motif.locateMer(aSeq, forwardMer, revcompMer, mismatches)
904         else:
905             print "only supported as part of the C extension for now"
906             return []
907
908
909     def locateMotif(self, sequence, threshold=90.0, numberN=0):
910         """ returns a list of positions on aSeq that match the PWM within a Threshold, 
911             given as a percentage of the optimal consensus score. 
912             Will call C-extension for greater speed-up if compiled.
913         """
914         motifLength = len(self.motifPWM)
915         sequenceLength = len(sequence)
916         threshold /=  100.0
917         if threshold < 0.5:
918             print "Threshold less than 50% - will abort locateMotif()"
919             return []
920
921         maxScore = self.bestConsensusScore()
922         maxDiff = maxScore * (1 - threshold)
923         if sequenceLength < motifLength:
924             return []
925         else:
926             sequence.strip()
927
928         if hasMotifExtension:
929             return  _motif.locateMotif(sequence, self.motifPWM, self.reversePWM, maxScore, maxDiff)
930
931         sequence = upper(sequence)
932         positionList = []
933         position = 0
934         while position <= (sequenceLength - motifLength):
935             subSequence = sequence[position: position + motifLength]
936             if subSequence.count("N") > numberN:
937                 position += 1
938                 continue
939
940             (seqScore, revSeqScore) = self.scoreMotif(subSequence, maxDiff)
941             if seqScore >= revSeqScore and seqScore > 1.0:
942                 positionList.append((position, "F"))
943             elif revSeqScore > 1.0:
944                 positionList.append((position, "R"))
945
946             position += 1
947
948         return positionList
949
950
951     def locateMarkov1(self, sequence, maxFold=5.0):
952         """ returns a list of positions on sequence that match the Markov1 within maxFold.
953         """
954         motifLength = len(self.motifPWM)
955         sequenceLength = len(sequence)
956         if maxFold < 1.0:
957             print "maxFold less than 1.0 - will abort locateMarkov1()"
958             return []
959
960         maxScore = self.bestMarkov1Score() * maxFold
961         if sequenceLength < motifLength:
962             return []
963         else:
964             sequence.strip()
965
966         if hasMotifExtension:
967             return  _motif.locateMarkov1(sequence, self.motifMarkov1, self.revMarkov1, maxScore)
968
969         sequence = upper(sequence)
970         positionList = []
971         position = 0
972         while position <= (sequenceLength - motifLength):
973             subSequence = sequence[position: position + motifLength]
974             if subSequence.count("N") > 0:
975                 position += 1
976                 continue
977
978             (seqScore, revSeqScore) = self.scoreMarkov1(subSequence, maxScore)    
979             if seqScore <= revSeqScore and seqScore < maxScore:
980                 positionList.append((position, "F"))
981             elif revSeqScore < maxScore:
982                 positionList.append((position, "R"))
983
984             position += 1
985
986         return positionList
987
988
989     def calculatePWMfromSeqs(self):
990         """ calculate the PWM using a set of non-degenerate instances of the motif.
991         """
992         PWM = []
993         numSeqs = len(self.sequences)
994
995         if numSeqs < 1:
996             return
997
998         # using length of first sequence as the length of the motif
999         length = len(self.sequences[0])
1000         for index in range(length):
1001             PWM.append([0.0, 0.0, 0.0, 0.0])
1002
1003         for seq in self.sequences:
1004             index = 0
1005             theseq = seq.upper()
1006             for NT in theseq:
1007                 PWM[index][matrixRow[NT]] += 1.0
1008                 index += 1
1009
1010         for index in range(length):
1011             for NT in ["A", "C", "G", "T"]:
1012                 PWM[index][matrixRow[NT]] /= numSeqs
1013
1014         self.motifPWM = PWM
1015         self.buildReversePWM()
1016         self.motifSeq = self.buildConsensus()
1017         self.strictConsensus = self.buildStrictConsensus()
1018
1019
1020     def printMarkov1(self):
1021         """ print the Markov1 PSSM of the form previous NT -> current NT.
1022         """
1023         row = []
1024         for prior in range(4):
1025             row.append(["", "", "", ""])
1026
1027         for pos in self.motifMarkov1:
1028             for prior in range(4):
1029                 for current in range(4):
1030                     row[prior][current] += str(round(pos[prior][current], 4)) + "\t"
1031
1032         for prior in ["A", "C", "G", "T"]:
1033             for current in ["A", "C", "G", "T"]:
1034                 try:
1035                     print "%s -> %s\t%s\n" % (prior, current, row[matrixRow[prior]][matrixRow[current]])
1036                 except:
1037                     print "ERROR: %s %s" % (prior, current)
1038         print "\n"
1039
1040
1041     def bestMarkov1Score(self):
1042         """ returns the best markov1 score possible.
1043         """
1044         motLength = len(self.motifMarkov1)
1045         matchMarkov1 = self.probMatchMarkov1
1046         score = 0.0
1047         for index in range(motLength):
1048             col = self.motifMarkov1[index]
1049             mRow = []
1050             for prior in ["A", "C", "G", "T"]:
1051                 for current in ["A", "C", "G", "T"]:
1052                     mRow.append((col[matrixRow[prior]][matrixRow[current]], prior, current))
1053             mRow.sort()
1054
1055             if index == 0:
1056                 currentProb = matchMarkov1(self.motifMarkov1, "N", mRow[-1][2], index)
1057             else:
1058                 currentProb = matchMarkov1(self.motifMarkov1, mRow[-1][1], mRow[-1][2], index)
1059
1060             if currentProb < 0.0001:
1061                 currentProb = 0.0001
1062
1063             if currentProb > 0.0:
1064                 score -= log(currentProb,2.0)            
1065
1066         return score
1067
1068
1069     def worstMarkov1Score(self):
1070         """ returns the worst markov1 score possible.
1071         """
1072         motLength = len(self.motifMarkov1)
1073         currentProb = 0.0001    
1074         score = -log(currentProb, 2.0) * (motLength - 1)            
1075
1076         return score
1077
1078
1079     def calculateMarkov1(self, pseudoCount=1.0):
1080         """ calculate the Markov1 PSSM using a set of non-degenerate instances of the motif.
1081         adds a pseudoCount for unseen combinations.
1082         """
1083         self.motifMarkov1 = []
1084         numSeqs = len(self.sequences) + pseudoCount
1085
1086         if numSeqs < 2:
1087             return []
1088
1089         # using length of first sequence as the length of the motif
1090         length = len(self.sequences[0])
1091
1092         for index in range(length):
1093             self.motifMarkov1.append([[pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1094                                       [pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1095                                       [pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1096                                       [pseudoCount, pseudoCount, pseudoCount, pseudoCount]])
1097
1098         for seq in self.sequences:
1099             theseq = seq.upper()
1100             index = 0
1101             prior = -1
1102             for pos in theseq:
1103                 if index == 0:
1104                     for priorNT in range(4):
1105                         self.motifMarkov1[index][priorNT][matrixRow[pos]] += 0.25
1106                 else:
1107                     self.motifMarkov1[index][prior][matrixRow[pos]] += 1.0
1108
1109                 prior = matrixRow[pos] 
1110                 index += 1
1111
1112         for index in range(length):
1113             for prior in range(4):
1114                 for current in range(4):
1115                     self.motifMarkov1[index][prior][current] /= numSeqs
1116
1117         self.buildReverseMarkov1(pseudoCount)
1118
1119
1120     def buildReverseMarkov1(self, pseudoCount=1.0):
1121         """ calculate the Reverse Markov1 PSSM using a set of non-degenerate instances of the motif.
1122         """
1123         self.revMarkov1 = []
1124         numSeqs = len(self.sequences) + pseudoCount
1125
1126         if numSeqs < 2:
1127             return []
1128
1129         # using length of first sequence as the length of the motif
1130         length = len(self.sequences[0])
1131         for index in range(length):
1132             self.revMarkov1.append([[pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1133                                     [pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1134                                     [pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1135                                     [pseudoCount, pseudoCount, pseudoCount, pseudoCount]])
1136
1137         for aSeq in self.sequences:
1138             seq = complement(aSeq.upper(), length)
1139             index = 0
1140             prior = -1
1141             for pos in seq:
1142                 if index == 0:
1143                     for priorNT in range(4):
1144                         self.revMarkov1[index][priorNT][matrixRow[pos]] += 0.25
1145                 else:
1146                     self.revMarkov1[index][prior][matrixRow[pos]] += 1.0
1147
1148                 prior = matrixRow[pos] 
1149                 index += 1
1150
1151         for index in range(length):
1152             for prior in range(4):
1153                 for current in range(4):
1154                     self.revMarkov1[index][prior][current] /= numSeqs
1155
1156
1157     def scoreMarkov1(self, aSeq, maxScore=10000000.):
1158         """ calculate the matching score using the Markov1.
1159             limit search if given a low maxScore
1160         """
1161         motLength = len(self.motifMarkov1)
1162         matchMarkov1 = self.probMatchMarkov1
1163         Score = []
1164         if len(aSeq) < motLength:
1165             return Score
1166         else:
1167             theSeq = upper(aSeq)
1168
1169         pos = 0    
1170         seqProb = 0.0
1171         revSeqProb = 0.0    
1172         subSeq = theSeq[pos: pos + motLength]    
1173         previousNT = "N"
1174         for index in range(motLength):
1175             currentNT = subSeq[index]
1176             currentProb = matchMarkov1(self.motifMarkov1, previousNT, currentNT, index)
1177
1178             if currentProb < 0.0001:
1179                 currentProb = 0.0001
1180
1181             if currentProb > 0.0:
1182                 seqProb -= log(currentProb,2.0)
1183
1184             revCurrentProb = matchMarkov1(self.revMarkov1, previousNT, currentNT, index)
1185             if revCurrentProb < 0.002:
1186                 revCurrentProb = 0.002
1187
1188             if revCurrentProb > 0.0:
1189                 revSeqProb -= log(revCurrentProb, 2.0) 
1190
1191             if seqProb > maxScore and revSeqProb > maxScore:
1192                 return (seqProb, revSeqProb)
1193
1194             previousNT = currentNT
1195
1196         return (seqProb, revSeqProb)
1197
1198
1199     def probMatchMarkov1(self, theMarkov1, previousNT, NT, colIndex):
1200         """ returns the likelihood of seeing NT given previousNT at this position of the motif.
1201         """
1202         currentProb = 0.0
1203         if NT in ["A", "C", "G", "T"]:
1204             currentNT = matrixRow[NT]
1205         else: 
1206             currentNT = 0
1207
1208         try:
1209             prevNT = matrixRow[previousNT]
1210         except KeyError:
1211             for index in range(4):
1212                 currentProb += theMarkov1[colIndex][index][currentNT]
1213
1214             return currentProb
1215
1216         if NT in ["A", "C", "G", "T"]:
1217             return theMarkov1[colIndex][prevNT][currentNT]
1218
1219         if NT == "N":
1220             return 1.0
1221
1222         for motifNucleotide in ["A", "T", "C", "G"]:
1223             if NT in motifDict[motifNucleotide]:
1224                 row = matrixRow[NT]
1225                 currentProb += theMarkov1[colIndex][prevNT][row]
1226
1227         return currentProb
1228
1229
1230     def isSane(self, minLen=7, stretchLen=6):
1231         """ check for motif sanity, which includes: minimum length, less than half N's in consensus, 
1232             motifs include more than two nucleotides, no nucleotide or dinucleotide is repeated more 
1233             than stretchlen. The appropriate exceptions are made for 'GC' dinucleotides.
1234         """
1235         stretchLen = int(stretchLen)
1236         minLen = int(minLen)
1237         stretchLen = min(stretchLen, minLen - 1)
1238
1239         cons = self.buildConsensus()
1240         motifLen = float(len(cons))
1241         if motifLen < minLen:
1242             return False
1243
1244         nCount = cons.count("N")
1245         if (nCount >= 0.5 * motifLen):
1246             return False
1247
1248         aCount = cons.count("A")
1249         gCount = cons.count("G")
1250         cCount = cons.count("C")
1251         tCount = cons.count("T")
1252
1253         atCount = aCount + tCount
1254         agCount = aCount + gCount
1255         acCount = aCount + cCount
1256         gtCount = gCount + tCount
1257         tcCount = tCount + cCount
1258
1259         for pairedCount in [atCount, agCount, acCount, gtCount, tcCount]:
1260             if pairedCount == motifLen:
1261                 return False
1262
1263         cons = self.buildStrictConsensus()
1264         repeatSequences = []
1265         for nucleotide in ["A", "G", "C", "T"]:
1266             repeatSequences.append(nucleotide * stretchLen)
1267
1268         if stretchLen % 2 != 0:
1269             stretchLen += 1
1270
1271         repeatCount = stretchLen/2
1272         for dinucleotide in ["AG", "AC", "AT", "CT", "GT"]:
1273             repeatSequences.append(dinucleotide * repeatCount)
1274
1275         for testSequence in repeatSequences:
1276             if cons.count(testSequence):
1277                 return False
1278
1279         return True
1280
1281
1282 def correlateMotifs(actualMotifA, actualMotifB, maxSlide=1):
1283     """ Compares two motifs using the "pearson correlation coefficient-like" MSS.
1284         Will slide a motif up to maxSlide bases compared to the other,
1285         and reports the best score.
1286     """
1287     bestScore = 0.0
1288
1289     if len(actualMotifA) < len(actualMotifB):
1290         motA = actualMotifB
1291         motB = actualMotifA
1292     else:
1293         motA = actualMotifA
1294         motB = actualMotifB
1295
1296     motApwm = motA.getPWM()
1297     motBpwm = motB.getPWM()
1298     motCpwm = motB.buildReversePWM()
1299     if hasMotifExtension:
1300         return  _motif.correlateMotifs(motApwm, motBpwm, motCpwm, maxSlide)
1301     else:
1302         length = len(motA)
1303         padLength = length - len(motB)
1304         Ncol = [symbolToMatrix["N"]]
1305         for slide in range(-1 * maxSlide, maxSlide + padLength + 1):
1306             pwmA = deepcopy(motApwm)
1307             pwmB = deepcopy(motBpwm)
1308             pwmC = deepcopy(motCpwm)
1309             if slide < 0:
1310                 pwmA = Ncol * abs(slide) + pwmA
1311                 pwmB = pwmB + Ncol * (abs(slide) + padLength) 
1312                 pwmC = pwmC + Ncol * (abs(slide) + padLength)
1313             elif slide > 0 and slide <= maxSlide:
1314                 if padLength > 0:
1315                     if padLength >= slide:
1316                         adjustedPadLength = padLength - slide
1317                         adjustedSlide = 0
1318                     else:
1319                         adjustedPadLength = 0
1320                         adjustedSlide = slide - padLength
1321
1322                     pwmA = pwmA + Ncol * adjustedSlide
1323                     pwmB = Ncol * slide + pwmB + Ncol * adjustedPadLength
1324                     pwmC = Ncol * slide + pwmC + Ncol * adjustedPadLength
1325                 else:
1326                     pwmA = pwmA + Ncol * slide
1327                     pwmB = Ncol * slide + pwmB 
1328                     pwmC = Ncol * slide + pwmC 
1329             elif slide > maxSlide:
1330                 maxDiff = slide - maxSlide
1331                 pwmA = pwmA + Ncol * maxSlide
1332                 pwmB = Ncol * slide  + pwmB + Ncol * (padLength - maxDiff)
1333                 pwmC = Ncol * slide  + pwmC + Ncol * (padLength - maxDiff)
1334             else:
1335                 pwmB = pwmB + Ncol * padLength
1336                 pwmC = pwmC + Ncol * padLength            
1337
1338             score1 = 0.0
1339             score2 = 0.0
1340             thisLength = len(pwmA)
1341             for index in range(thisLength):
1342                 score1 += pearsonCorrelation(pwmA[index], pwmB[index])
1343                 score2 += pearsonCorrelation(pwmA[index], pwmC[index])
1344
1345             score1 = score1 / float(thisLength)
1346             score2 = score2 / float(thisLength)
1347             if score1 < score2 and score2 > bestScore:
1348                 bestScore = score2
1349             elif score1 > bestScore:
1350                 bestScore = score1
1351
1352     return bestScore
1353
1354
1355 def MSS(motifA, motifB, maxSlide=1):
1356     """ Compares two motifs using the motif similarity score (MSS).
1357         Will slide a motif up to maxSlide bases compared to the other,
1358         and reports the best score. Wrapper around correlateMotifs()
1359     """
1360     return correlateMotifs(motifA, motifB, maxSlide)
1361
1362
1363 def printMSS(motifList, maxSlide=1):
1364     """ Prints a matrix of MSS comparisons between different motifs
1365         in motifList.
1366     """
1367     for mot1 in motifList:
1368         print mot1.tagID,
1369         for mot2 in motifList:
1370             val = "\t%1.2f" % correlateMotifs(mot1, mot2, maxSlide)
1371             print val,
1372
1373         print ""