cistematic/core/motif.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-10 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 # motif.py - defines the motif object and its methods in cistematic
  31 from string import upper, lower
  32 from math import log, exp
  33 from copy import deepcopy
  34 from cistematic.core import complement
  35 from cistematic.cisstat.score import pearsonCorrelation
  36 import re, os, tempfile
  37
  38 if os.environ.get("CISTEMATIC_ROOT"):
  39     cisRoot = os.environ.get("CISTEMATIC_ROOT")
  40 else:
  41     cisRoot = "/proj/genome"
  42
  43 if os.environ.get("CISTEMATIC_TEMP"):
  44     cisTemp = os.environ.get("CISTEMATIC_TEMP")
  45 else:
  46     cisTemp = "/tmp"
  47
  48 tempfile.tempdir = cisTemp
  49
  50 try:
  51     import cistematic.core._motif as _motif
  52     hasMotifExtension = True
  53 except:
  54     hasMotifExtension = False
  55
  56 matrixRow = {"A": 0,
  57              "C": 1,
  58              "G": 2,
  59              "T": 3
  60 }
  61
  62 symbolToMatrix = {"A": [1.0, 0.0, 0.0, 0.0],
  63                   "C": [0.0, 1.0, 0.0, 0.0],
  64                   "G": [0.0, 0.0, 1.0, 0.0],
  65                   "T": [0.0, 0.0, 0.0, 1.0],
  66                   "W": [0.5, 0.0, 0.0, 0.5],
  67                   "S": [0.0, 0.5, 0.5, 0.0],
  68                   "R": [0.5, 0.0, 0.5, 0.0],
  69                   "Y": [0.0, 0.5, 0.0, 0.5],
  70                   "M": [0.5, 0.5, 0.0, 0.0],
  71                   "K": [0.0, 0.0, 0.5, 0.5],
  72                   "H": [0.33, 0.33, 0.0, 0.34],
  73                   "B": [0.0, 0.33, 0.33, 0.34],
  74                   "V": [0.33, 0.33, 0.34, 0.0],
  75                   "D": [0.33, 0.0, 0.33, 0.34],
  76                   "N": [0.25, 0.25, 0.25, 0.25]
  77 }
  78
  79 reMAP = {"A": "A",
  80          "C": "C",
  81          "G": "G",
  82          "T": "T",
  83          "N": "[ACGT]",
  84          "W": "[AT]",
  85          "S": "[GC]",
  86          "R": "[AG]",
  87          "Y": "[CT]",
  88          "M": "[AC]",
  89          "K": "[GT]",
  90          "H": "[ACT]",
  91          "B": "[CGT]",
  92          "V": "[ACG]",
  93          "D": "[AGT]",
  94          "|": "I"
  95 }
  96
  97 motifDict = {"A": ["W", "R", "M", "H", "V", "D"],
  98              "T": ["W", "Y", "K", "H", "B", "D"],
  99              "C": ["S", "Y", "M", "H", "B", "V"],
 100              "G": ["S", "R", "K", "B", "V", "D"]
 101 }
 102
 103
 104 class Motif:
 105     """ The Motif class is the heart of cistematic. It captures both the consensus,
 106         PWM, and Markov(1) versions of the motif, as well as provides methods for scanning
 107         sequences for matches.
 108     """
 109     motifSeq = ""
 110     motifPWM = []
 111     reversePWM = []
 112     motifMarkov1 = []
 113     revMarkov1 = []
 114     tagID = ""
 115     sequences = []
 116     annotations = []
 117     strictConsensus = ""
 118     threshold = 1.0
 119     info = ""
 120
 121
 122     def __init__(self, tagID="", motif="", PWM=[], seqs=[], thresh=0.0, info="", motifFile="", seqfile=""):
 123         """ initialize a Motif object either with (A) a tagID identifier and either of
 124             (i) a IUPAC consensus or (ii) a PWM or (iii) example sequences, or (B) from
 125             a motif file in Cistematic motif format.
 126         """
 127         fileTagID = ""
 128         if motifFile != "":
 129             (fileTagID, motif, PWM, seqs, thresh, info) = self.readMotif(motifFile)
 130
 131         if len(tagID) > 0:
 132             self.setTagID(tagID)
 133         elif len(fileTagID) > 0:
 134             self.setTagID(fileTagID)
 135         else:
 136             self.setTagID("default")
 137
 138         if motif <> "":
 139             self.setMotif(motif)
 140
 141         # PWM overrules the motif
 142         if len(PWM) > 1:
 143             self.setPWM(PWM)
 144
 145         # a seqfile can be used to create a list of sequences
 146         if len(seqfile) > 0:
 147             seqs = self.readSeqFile(seqfile)
 148
 149         # Sequences overrules the PWM and motif
 150         if len(seqs) > 1:
 151             self.setSequences(seqs)
 152             self.setThreshold(len(seqs[0]))
 153
 154         self.setThreshold(float(thresh))
 155         if len(info) > 0:
 156             self.setInfo(info)
 157
 158
 159     def __len__(self):
 160         """ returns the length of the motif
 161         """
 162         return len(self.motifPWM)
 163
 164
 165     def readMotif(self, motifFile):
 166         """ read motif in cistmeatic motif format to initialize motif instance.
 167         """
 168         motif = ""
 169         PWM = []
 170         seqs = []
 171         threshold = 0.0
 172         info = ""
 173         infile = open(motifFile, "r")
 174         for line in infile:
 175             if len(line) < 4 or line[0] == "#":
 176                 continue
 177
 178             fields = line.strip().split("\t")
 179             fieldType = fields[0].lower()
 180             if fieldType not in ["tagid", "motif", "acgt", "sequence", "threshold", "info"]:
 181                 print "could not process line %s" % line
 182                 continue
 183
 184             if len(fields) < 2:
 185                 continue
 186
 187             if fieldType == "motif":
 188                 motif = fields[1]
 189             elif fieldType == "acgt":
 190                 PWM.append([float(fields[1]), float(fields[2]), float(fields[3]), float(fields[4])])
 191             elif fieldType == "sequence":
 192                 seqs.append(fields[1])
 193             elif fieldType == "threshold":
 194                 threshold = float(fields[1])
 195             elif fieldType == "info":
 196                 info = fields[1].strip()
 197             elif fieldType == "tagid":
 198                 tagID = fields[1]
 199
 200         infile.close()
 201
 202         return (tagID, motif, PWM, seqs, threshold, info)
 203
 204
 205     def readSeqFile(self, seqFile):
 206         """ read sequences from a sequence file.
 207         """
 208         seqs = []
 209         infile = open(seqFile, "r")
 210         for line in infile:
 211             seqs.append(line.strip().upper())
 212
 213         infile.close()
 214
 215         return seqs
 216
 217
 218     def saveMotif(self, motFile):
 219         """ Save motif in cistematic motif format.
 220         """
 221         outfile = open(motFile, "w")
 222         outfile.write("tagid\t%s\n" % self.tagID)
 223         outfile.write("info\t%s\n" % self.info.strip())
 224         outfile.write("threshold\t%s\n" % str(self.threshold))
 225         outfile.write("motif\t%s\n" % self.buildConsensus())
 226         for col in self.motifPWM:
 227             outfile.write("acgt\t%f\t%f\t%f\t%f\n" % (col[0], col[1], col[2], col[3]))
 228
 229         for seq in self.sequences:
 230             outfile.write("sequence\t%s\n" % seq)
 231
 232         outfile.close()
 233
 234
 235     def setTagID(self, tag):
 236         """ set motif identifier.
 237         """
 238         self.tagID = tag
 239
 240
 241     def setInfo(self, info):
 242         """ set motif info string.
 243         """
 244         self.info = info
 245
 246
 247     def setThreshold(self, threshold):
 248         """ set a pre-defined threshold or the highest-possible threshold, otherwise.
 249         """
 250         (sForward, sBackward) = self.scoreMotif(self.buildStrictConsensus())
 251         if sForward > sBackward:
 252             self.threshold = sForward
 253         else:
 254             self.threshold = sBackward
 255
 256         if threshold < self.threshold and threshold > 0:
 257             self.threshold = threshold
 258
 259
 260     def setMotif(self, motif):
 261         """ set the motif PWM using a IUPAC consensus.
 262         """
 263         self.motifSeq = motif
 264         self.motifPWM = self.buildPWM(motif)
 265         self.buildReversePWM()
 266         self.strictConsensus = self.buildStrictConsensus()
 267
 268
 269     def setSequences(self, seqs):
 270         """ set the founder sequences for the motif and recalculate PWM with them.
 271         """
 272         self.sequences = seqs
 273         self.calculatePWMfromSeqs()
 274         self.calculateMarkov1()
 275
 276
 277     def setPWM(self, PWM):
 278         """ set the PWM for the motif and calculate consensus.
 279         """
 280         self.motifPWM = PWM
 281         self.buildReversePWM()
 282         self.motifSeq = self.buildConsensus()
 283         self.strictConsensus = self.buildStrictConsensus()
 284
 285
 286     def appendToPWM(self, col):
 287         """ add a column to the PWM for the motif
 288         """
 289         self.motifPWM.append(col)
 290
 291
 292     def buildPWM(self, motif):
 293         """ returns the PWM for the provided consensus motif.
 294         """
 295         PWM = []
 296         for letter in upper(motif):
 297             PWM.append(symbolToMatrix[letter])
 298
 299         return PWM
 300
 301
 302     def buildReversePWM(self):
 303         """ returns the reverse PWM for the motif
 304         """
 305         theRevPWM = []
 306         tempPWM = deepcopy(self.motifPWM)
 307         tempPWM.reverse()
 308         for col in tempPWM:
 309             theRevPWM.append([col[matrixRow["T"]],  col[matrixRow["G"]], col[matrixRow["C"]], col[matrixRow["A"]]])
 310
 311         self.reversePWM = deepcopy(theRevPWM)
 312
 313         return self.reversePWM
 314
 315
 316     def getPWM(self):
 317         """ returns the PWM for the motif
 318         """
 319         return self.motifPWM
 320
 321
 322     def printPWM(self):
 323         """ print the PWM and the consensus
 324         """
 325         aRow = ""
 326         cRow = ""
 327         gRow = ""
 328         tRow = ""
 329         cons = self.buildConsensus()
 330         consLine = "Cons:"
 331         for NT in cons:
 332             consLine += "\t"
 333             consLine += NT
 334
 335         for col in self.motifPWM:
 336             aRow = "%s%s\t" % (aRow, str(round(col[matrixRow["A"]],4)))
 337             cRow = "%s%s\t" % (cRow, str(round(col[matrixRow["C"]],4)))
 338             gRow = "%s%s\t" % (gRow, str(round(col[matrixRow["G"]],4)))
 339             tRow = "%s%s\t" % (tRow, str(round(col[matrixRow["T"]],4)))
 340
 341         print "A:\t%s\nC:\t%s\nG:\t%s\nT:\t%s\n" % (aRow, cRow, gRow, tRow)
 342         print "%s\n" % consLine
 343
 344
 345     def saveLogo(self, outfilename, height=-1, width=-1):
 346         """ saves a logo version of the motif as outfilename (assumes has .png)
 347             if the motif is built from sequences.
 348             will fail if weblogo 2.8.2 package is not installed in correct place.
 349         """
 350         logoPath = "%s/programs/weblogo/seqlogo" % cisRoot
 351         if outfilename[-4:] in [".png", ".PNG"]:
 352             outfilename = outfilename[:-4]
 353
 354         if len(self.sequences) < 1:
 355             print "cannot run logo representation without founder sequences"
 356         else:
 357             if True:
 358                 seqfilename = "%s.tmp" % tempfile.mktemp()
 359                 seqfile = open(seqfilename, "w")
 360                 for sequence in self.sequences:
 361                     seqfile.write("%s\n" % sequence)
 362
 363                 seqfile.flush()
 364                 dimensions = ""
 365                 if height > 0:
 366                     dimensions += "-h %d " % height
 367                 if width > 0:
 368                     dimensions +=  "-w %d " % width
 369                 else:
 370                     dimensions += "-w %d " % len(self.motifPWM)
 371
 372                 cmd = logoPath + " -f " + seqfilename + " -F PNG -c " + dimensions + "-a -Y -n -M -k 1 -o " + outfilename
 373                 contents = os.system(cmd)
 374                 seqfile.close()
 375                 os.remove(seqfilename)
 376             else:
 377                 print "failed to make logo: expecting weblogo 2.8.2 package in %s" % logoPath
 378                 print "also check if ghostscript package is correctly installed."
 379
 380
 381     def getSymbol(self, col):
 382         """ helper function for buildConsensus()
 383         """
 384         for NT in ["A", "C", "G", "T"]:
 385             row = matrixRow[NT]
 386             if col[row] > 0.9:
 387                 return NT
 388
 389         aColValue = col[matrixRow["A"]]
 390         cColValue = col[matrixRow["C"]]
 391         gColValue = col[matrixRow["G"]]
 392         tColValue = col[matrixRow["T"]]
 393
 394         dualsList = [("R", aColValue + gColValue),
 395                      ("Y", tColValue + cColValue),
 396                      ("W", aColValue + tColValue),
 397                      ("S", cColValue + gColValue),
 398                      ("M", aColValue + cColValue),
 399                      ("K", tColValue + gColValue)
 400         ]
 401
 402         bestDual = self.getBestSymbol(dualsList)
 403         if bestDual[1] > 0.9:
 404             return bestDual[0]
 405
 406         trioList = [("B", cColValue + gColValue + tColValue),
 407                     ("D", aColValue + gColValue + tColValue),
 408                     ("H", aColValue + cColValue + tColValue),
 409                     ("V", aColValue + cColValue + gColValue)
 410         ]
 411
 412         bestTrio = self.getBestSymbol(trioList)
 413         if bestTrio[1] > 0.9:
 414             return bestTrio[0]
 415
 416         return "N"
 417
 418
 419     def getBestSymbol(self, symbolProbabilityList):
 420         bestSymbol = symbolProbabilityList[0]
 421         for symbol in symbolProbabilityList[1:]:
 422             if symbol[1] > bestSymbol[1]:
 423                 bestSymbol = symbol
 424
 425         return bestSymbol
 426
 427
 428     def buildConsensus(self):
 429         """ returns the best consensus using the IUPAC alphabet.
 430         """
 431         consensus = ""
 432         for col in self.motifPWM:
 433             consensus += self.getSymbol(col)
 434
 435         return consensus
 436
 437
 438     def buildStrictConsensus(self):
 439         """ returns the best consensus using solely nucleotides.
 440         """
 441         consensus = ""
 442         for col in self.motifPWM:
 443             mRow = []
 444             for nt in ("A", "C", "G", "T"):
 445                 mRow.append((col[matrixRow[nt]], nt))
 446
 447             mRow.sort()
 448             consensus += mRow[3][1]
 449
 450         return consensus
 451
 452
 453     def bestConsensusScore(self):
 454         """ returns the best consensus score possible.
 455         """
 456         score = 0.0
 457         for col in self.motifPWM:
 458             mRow = []
 459             for nt in ["A", "C", "G", "T"]:
 460                 mRow.append((col[matrixRow[nt]], nt))
 461
 462             mRow.sort()
 463             score += mRow[3][0]
 464
 465         return score
 466
 467
 468     def expected(self, length, background=[0.25, 0.25, 0.25, 0.25], numMismatches=0):
 469         """ returns the expected number of matches to the consensus in a sequence of a given length and background.
 470         """
 471         expectedNum = length * self.consensusProbability(background, numMismatches)
 472         return expectedNum
 473
 474
 475     def consensusProbability(self, background=[0.25, 0.25, 0.25, 0.25], numMismatches=0):
 476         """ returns the probability of the consensus given the background.
 477         """
 478         prob = 0
 479         motifs = []
 480         if numMismatches> 0:
 481             seqs = self.seqMismatches(self.buildConsensus().upper(), numMismatches)
 482             motifs = seqs.split("|")
 483         else:
 484             motifs.append(self.buildConsensus())
 485
 486         for theCons in motifs:
 487             motProb = 0
 488             for NT in theCons:
 489                 currentProb = 0.0
 490                 if NT in ("A", "W", "R", "M", "H", "V", "D", "N"):
 491                     currentProb += background[matrixRow["A"]]
 492
 493                 if NT in ("C", "S", "Y", "M", "H", "B", "V", "N"):
 494                     currentProb += background[matrixRow["C"]]
 495
 496                 if NT in ("G", "S", "R", "K", "B", "V", "D", "N"):
 497                     currentProb += background[matrixRow["G"]]
 498
 499                 if NT in ("T", "W", "Y", "K", "H", "B", "D", "N"):
 500                     currentProb += background[matrixRow["T"]]
 501
 502                 motProb = motProb + log(currentProb)
 503
 504             prob += exp(motProb)
 505
 506         return prob
 507
 508
 509     def pwmProbability(self, background):
 510         """ returns probability of the PWM.
 511         """
 512         prob = 1.0
 513         for row in self.motifPWM:
 514             currentProb = 0.0
 515             for NT in ["A", "C", "G", "T"]:
 516                 currentProb += row[matrixRow[NT]] * background[matrixRow[NT]]
 517
 518             prob = prob * currentProb
 519
 520         return prob
 521
 522
 523     def revComp(self):
 524         """ returns the reverse complement of the consensus of this motif.
 525         """
 526         return complement(self.buildConsensus(), len(self.motifPWM))
 527
 528
 529     def numberOfN(self):
 530         """ returns numbers of effective Ns in motif.
 531         """
 532         index = 0
 533         for col in self.motifPWM:
 534             if self.getSymbol(col) == "N":
 535                 index += 1
 536
 537         return index
 538
 539
 540     def buildMismatchSeq(self, rootSeq, tailSeq, numMismatches):
 541         """ helper function called from seqMismatches().
 542         """
 543         finalSeq = ""
 544         tailLen = len(tailSeq)
 545         if tailLen < 1 or numMismatches < 1:
 546             return rootSeq + tailSeq
 547
 548         for pos in range(tailLen - numMismatches + 1):
 549             newRootSeq = rootSeq
 550             newRootSeq += tailSeq[:pos]
 551             newRootSeq += "N"
 552             finalSeq += self.buildMismatchSeq(newRootSeq, tailSeq[pos + 1:], numMismatches - 1)
 553             finalSeq += "|"
 554
 555         return finalSeq[:-1]
 556
 557
 558     def seqMismatches(self, seq, numMismatches):
 559         """ Returns list of sequences that will be used by initializeMismatchRE().
 560         """
 561         return self.buildMismatchSeq("", seq, numMismatches)
 562
 563
 564     def probMatchPWM(self, PWM, NT, colIndex):
 565         """ returns the probability of seeing that particular nucleotide according to the PSFM.
 566         """
 567
 568         if NT in ["A", "T", "C", "G"]:
 569             row = matrixRow[NT]
 570             return PWM[colIndex][row]
 571
 572         if NT == "N":
 573             return 1.0
 574
 575         currentProb = 0.0
 576         for motifNucleotide in ["A", "T", "C", "G"]:
 577             if NT in motifDict[motifNucleotide]:
 578                 row = matrixRow[NT]
 579                 currentProb += PWM[colIndex][row]
 580
 581         return currentProb
 582
 583
 584     def psfmOdds(self, PWM, NT, colIndex, background=[0.25, 0.25, 0.25, 0.25]):
 585         """ calculates the odds of nucleotide NT coming from position colIndex in thePWM
 586             as opposed to coming from the background.
 587         """
 588
 589         currentProb = self.probMatchPWM(PWM, NT, colIndex)
 590         backgroundProb = self.getBackgroundProbability(self, NT, background)
 591
 592         try:
 593             odds = currentProb / backgroundProb
 594         except ZeroDivisionError:
 595             odds = 1.0
 596
 597         return odds
 598
 599
 600     def getBackgroundProbability(self, NT, background=[0.25, 0.25, 0.25, 0.25]):
 601
 602         if NT in ["A", "T", "C", "G"]:
 603             row = matrixRow[NT]
 604             return background[row]
 605
 606         if NT == "N":
 607             return 1.0
 608
 609         backgroundProb = 0.0
 610         for motifNucleotide in ["A", "T", "C", "G"]:
 611             if NT in motifDict[motifNucleotide]:
 612                 row = matrixRow[NT]
 613                 backgroundProb += background[row]
 614
 615         return backgroundProb
 616
 617
 618     def ntMatch(self, motifNT, seqNT):
 619         """ returns True if seqNT matches motifNT.
 620         """
 621         if motifNT == seqNT:
 622             return True
 623
 624         if seqNT == "N" or motifNT == "N":
 625             return True
 626
 627         if motifNT in motifDict[seqNT]:
 628             return True
 629
 630         return False
 631
 632
 633     def scoreMotif(self, aSeq, diff=1000):
 634         """ calculates the consensus score using the PSFM
 635         """
 636         motLength = len(self.motifPWM)
 637         if len(aSeq) < motLength:
 638             return (0.0, 0.0)
 639
 640         matchPWM = self.probMatchPWM
 641         motPWM = self.motifPWM
 642         revPWM = self.reversePWM
 643         theSeq = upper(aSeq)
 644         forwardCons = 0.0
 645         reverseCons = 0.0
 646         bestCons = 0.0
 647         for index in range(motLength):
 648             currentNT = theSeq[index]
 649             forwardCons += matchPWM(motPWM, currentNT, index)
 650             reverseCons += matchPWM(revPWM, currentNT, index)
 651             bestCons += matchPWM(motPWM,self.strictConsensus[index], index)
 652
 653         if (forwardCons + diff) < bestCons and (reverseCons + diff) < bestCons:
 654             return (-1, -1)
 655
 656         return (forwardCons, reverseCons)
 657
 658
 659     def scoreMotifLogOdds(self, aSeq, background=[0.25, 0.25, 0.25, 0.25]):
 660         """ calculates the log-odds score using the PSFM given the markov(0) background.
 661         """
 662         motLength = len(self.motifPWM)
 663         if len(aSeq) < motLength:
 664             return (0.0, 0.0)
 665
 666         odds = self.psfmOdds
 667         motPWM = self.motifPWM
 668         revPWM = self.reversePWM
 669         theSeq = upper(aSeq)
 670         forwardCons = 0.0
 671         reverseCons = 0.0
 672         bestCons = 0.0
 673         for index in range(motLength):
 674             currentNT = theSeq[index]
 675             try:
 676                 forwardCons += log(odds(motPWM, currentNT, index, background), 2)
 677             except:
 678                 forwardCons += log(0.01, 2)
 679
 680             try:
 681                 reverseCons += log(odds(revPWM, currentNT, index, background), 2)
 682             except:
 683                 reverseCons += log(0.01, 2)
 684
 685             bestCons += log(odds(motPWM,self.strictConsensus[index], index, background), 2)
 686
 687         return (forwardCons, reverseCons)
 688
 689
 690     def bestLogOddsScore(self, background=[0.25, 0.25, 0.25, 0.25]):
 691         """ calculates the best possible log-odds score using the PSFM given the markov(0) background.
 692         """
 693         motLength = len(self.motifPWM)
 694         odds = self.psfmOdds
 695         motPWM = self.motifPWM
 696         bestLogOdds = 0.0
 697         for index in range(motLength):
 698             bestLogOdds += log(odds(motPWM,self.strictConsensus[index], index, background), 2)
 699
 700         return bestLogOdds
 701
 702
 703     def locateConsensus(self, aSeq):
 704         """ returns a list of positions on aSeq that match the consensus exactly.
 705         """
 706         cons = self.buildConsensus()
 707         revComp = self.revComp()
 708         motLength = len(cons)
 709         Position = []
 710         if len(aSeq) < motLength:
 711             return []
 712         else:
 713             theSeq = upper(aSeq)
 714
 715         pos = 0
 716         seqLength = len(theSeq)
 717         while pos <= (seqLength - motLength):
 718             subSeq = theSeq[pos:pos + motLength].strip()
 719             try:
 720                 forwardMot = 1
 721                 for index in range(motLength):
 722                     if not self.ntMatch(cons[index], subSeq[index]):
 723                         forwardMot = 0
 724                         break
 725
 726                 revCompMot = 1
 727                 for index in range(motLength):
 728                     if not self.ntMatch(revComp[index], subSeq[index]):
 729                         revCompMot = 0
 730                         break
 731             except:
 732                 print "chocked at pos %d" % pos
 733                 forwardMot = 0
 734
 735             if forwardMot == 1:
 736                 Position.append((pos, "F"))
 737                 pos += motLength
 738             elif revCompMot == 1:
 739                 Position.append((pos, "R"))
 740                 pos += motLength
 741             else:
 742                 pos +=1
 743
 744         return Position
 745
 746
 747     def compareConsensus(self, aSeq):
 748         """ returns a sequence with nucleotide differences from consensus in lower case.
 749         """
 750         cons = self.buildConsensus()
 751         revComp = self.revComp()
 752         motLength = len(cons)
 753         if len(aSeq) < motLength:
 754             raise NameError, "Sequence too short"
 755         else:
 756             theSeq = upper(aSeq)
 757
 758         forwardMismatch = 0
 759         backwardMismatch = 0
 760         forwardSeq = ""
 761         backwardSeq = ""
 762
 763         for index in range(motLength):
 764             if not self.ntMatch(cons[index], theSeq[index]):
 765                 forwardMismatch += 1
 766                 forwardSeq += lower(theSeq[index])
 767             else:
 768                 forwardSeq += theSeq[index]
 769
 770             if not self.ntMatch(revComp[index], theSeq[index]):
 771                 backwardMismatch += 1
 772                 backwardSeq += lower(theSeq[index])
 773             else:
 774                 backwardSeq += theSeq[index]
 775
 776         if forwardMismatch <= backwardMismatch:
 777             return forwardSeq
 778         else:
 779             return backwardSeq
 780
 781
 782     def scoreDiffPWM(self, compMotif):
 783         """ returns a score scaled from 0 (no difference) to 2 (completely different) to
 784             quantify the difference between the motif and another motif.
 785         """
 786         score = 0.0
 787         diffPWM = self.getDiffPWM(compMotif)
 788         for pos in range(len(diffPWM)):
 789             (adiff, cdiff, gdiff, tdiff) = diffPWM[pos]
 790             score += abs(adiff) + abs(cdiff) + abs(gdiff) + abs(tdiff)
 791
 792         score /= float(len(diffPWM))
 793
 794         return score
 795
 796
 797     def getDiffPWM(self, compMotif):
 798         """ subtracts the PWM of compMotif from existing PWM to compare differences.
 799             Note that the comparison is only done on the length of the shorter motif.
 800         """
 801         diffPWM = []
 802         compPWM = compMotif.getPWM()
 803         numBasesToCompare = min(len(compMotif), len(self.motifPWM))
 804
 805         for pos in range(numBasesToCompare):
 806             pwmCol = self.motifPWM[pos]
 807             pwmColComp = compPWM[pos]
 808             pwmEntry = []
 809             for NT in range(4):
 810                 pwmEntry.append(pwmCol[NT] - pwmColComp[NT])
 811
 812             diffPWM.append(pwmEntry)
 813
 814         return diffPWM
 815
 816
 817     def initializeRE(self):
 818         """ initializes Regular Expression motif engine.
 819         """
 820         global forwardRE
 821         global backwardRE
 822         cons = self.buildConsensus().upper()
 823         revComp = self.revComp().upper()
 824         reCons = ""
 825         reBackward = ""
 826         for NT in cons:
 827             reCons += reMAP[NT]
 828
 829         if revComp != cons:
 830             for NT in revComp:
 831                 reBackward += reMAP[NT]
 832         else:
 833             reBackward = "ZZZZZZZ"
 834
 835         forwardRE = re.compile(reCons, re.IGNORECASE)
 836         backwardRE = re.compile(reBackward, re.IGNORECASE)
 837
 838
 839     def initializeMismatchRE(self, numMismatches):
 840         """ initializes Regular Expression motif engine allowing for mismatches.
 841         """
 842         global forwardRE
 843         global backwardRE
 844         cons = self.seqMismatches(self.buildConsensus().upper(), numMismatches)
 845         revComp = self.seqMismatches(self.revComp().upper(), numMismatches)
 846         reCons = ""
 847         reBackward = ""
 848         for NT in cons:
 849             reCons += reMAP[NT]
 850
 851         if self.revComp().upper() != self.buildConsensus().upper():
 852             for NT in revComp:
 853                 reBackward += reMAP[NT]
 854         else:
 855             reBackward = "ZZZZZZZ"
 856
 857         forwardRE = re.compile(reCons, re.IGNORECASE)
 858         backwardRE = re.compile(reBackward, re.IGNORECASE)
 859
 860
 861     def locateConsensusRE(self, sequence):
 862         """ Returns a list of positions on aSeq that match the consensus exactly.
 863             Should be run after either initializeRE() or initializeMismatchRE(numMismatches)
 864         """
 865         motLength = len(self.motifPWM)
 866         position = []
 867         results = []
 868         if len(sequence) < motLength:
 869             return []
 870
 871         forwardIter = forwardRE.finditer(sequence)
 872         backwardIter = backwardRE.finditer(sequence)
 873         for match in forwardIter:
 874             position.append((match.start(), "F"))
 875
 876         for match in backwardIter:
 877             position.append((match.start(), "R"))
 878
 879         positionLength = len(position)
 880         if positionLength >= 1:
 881             position.sort()
 882             (prevPos, prevSense) = position[0]
 883             results.append((prevPos, prevSense))
 884
 885             for index in range(1, positionLength):
 886                 (pos, sense) = position[index]
 887                 if pos >= prevPos + motLength:
 888                     results.append((pos, sense))
 889                     (pos, sense) = (prevPos, prevSense)
 890
 891         return results
 892
 893
 894     def locateStrictConsensus(self, aSeq, mismatches=0):
 895         """ returns a list of positions on aSeq that match the strict
 896             consensus within some mismatches.
 897             Only available as a C-extension for greater speed-up for now.
 898         """
 899         forwardMer = self.buildStrictConsensus()
 900         motLength = len(forwardMer)
 901         revcompMer = complement(forwardMer, motLength)
 902         if hasMotifExtension:
 903             return  _motif.locateMer(aSeq, forwardMer, revcompMer, mismatches)
 904         else:
 905             print "only supported as part of the C extension for now"
 906             return []
 907
 908
 909     def locateMotif(self, sequence, threshold=90.0, numberN=0):
 910         """ returns a list of positions on aSeq that match the PWM within a Threshold,
 911             given as a percentage of the optimal consensus score.
 912             Will call C-extension for greater speed-up if compiled.
 913         """
 914         motifLength = len(self.motifPWM)
 915         sequenceLength = len(sequence)
 916         threshold /=  100.0
 917         if threshold < 0.5:
 918             print "Threshold less than 50% - will abort locateMotif()"
 919             return []
 920
 921         maxScore = self.bestConsensusScore()
 922         maxDiff = maxScore * (1 - threshold)
 923         if sequenceLength < motifLength:
 924             return []
 925         else:
 926             sequence.strip()
 927
 928         if hasMotifExtension:
 929             return  _motif.locateMotif(sequence, self.motifPWM, self.reversePWM, maxScore, maxDiff)
 930
 931         sequence = upper(sequence)
 932         positionList = []
 933         position = 0
 934         while position <= (sequenceLength - motifLength):
 935             subSequence = sequence[position: position + motifLength]
 936             if subSequence.count("N") > numberN:
 937                 position += 1
 938                 continue
 939
 940             (seqScore, revSeqScore) = self.scoreMotif(subSequence, maxDiff)
 941             if seqScore >= revSeqScore and seqScore > 1.0:
 942                 positionList.append((position, "F"))
 943             elif revSeqScore > 1.0:
 944                 positionList.append((position, "R"))
 945
 946             position += 1
 947
 948         return positionList
 949
 950
 951     def locateMarkov1(self, sequence, maxFold=5.0):
 952         """ returns a list of positions on sequence that match the Markov1 within maxFold.
 953         """
 954         motifLength = len(self.motifPWM)
 955         sequenceLength = len(sequence)
 956         if maxFold < 1.0:
 957             print "maxFold less than 1.0 - will abort locateMarkov1()"
 958             return []
 959
 960         maxScore = self.bestMarkov1Score() * maxFold
 961         if sequenceLength < motifLength:
 962             return []
 963         else:
 964             sequence.strip()
 965
 966         if hasMotifExtension:
 967             return  _motif.locateMarkov1(sequence, self.motifMarkov1, self.revMarkov1, maxScore)
 968
 969         sequence = upper(sequence)
 970         positionList = []
 971         position = 0
 972         while position <= (sequenceLength - motifLength):
 973             subSequence = sequence[position: position + motifLength]
 974             if subSequence.count("N") > 0:
 975                 position += 1
 976                 continue
 977
 978             (seqScore, revSeqScore) = self.scoreMarkov1(subSequence, maxScore)
 979             if seqScore <= revSeqScore and seqScore < maxScore:
 980                 positionList.append((position, "F"))
 981             elif revSeqScore < maxScore:
 982                 positionList.append((position, "R"))
 983
 984             position += 1
 985
 986         return positionList
 987
 988
 989     def calculatePWMfromSeqs(self):
 990         """ calculate the PWM using a set of non-degenerate instances of the motif.
 991         """
 992         PWM = []
 993         numSeqs = len(self.sequences)
 994
 995         if numSeqs < 1:
 996             return
 997
 998         # using length of first sequence as the length of the motif
 999         length = len(self.sequences[0])
1000         for index in range(length):
1001             PWM.append([0.0, 0.0, 0.0, 0.0])
1002
1003         for seq in self.sequences:
1004             index = 0
1005             theseq = seq.upper()
1006             for NT in theseq:
1007                 PWM[index][matrixRow[NT]] += 1.0
1008                 index += 1
1009
1010         for index in range(length):
1011             for NT in ["A", "C", "G", "T"]:
1012                 PWM[index][matrixRow[NT]] /= numSeqs
1013
1014         self.motifPWM = PWM
1015         self.buildReversePWM()
1016         self.motifSeq = self.buildConsensus()
1017         self.strictConsensus = self.buildStrictConsensus()
1018
1019
1020     def printMarkov1(self):
1021         """ print the Markov1 PSSM of the form previous NT -> current NT.
1022         """
1023         row = []
1024         for prior in range(4):
1025             row.append(["", "", "", ""])
1026
1027         for pos in self.motifMarkov1:
1028             for prior in range(4):
1029                 for current in range(4):
1030                     row[prior][current] += str(round(pos[prior][current], 4)) + "\t"
1031
1032         for prior in ["A", "C", "G", "T"]:
1033             for current in ["A", "C", "G", "T"]:
1034                 try:
1035                     print "%s -> %s\t%s\n" % (prior, current, row[matrixRow[prior]][matrixRow[current]])
1036                 except:
1037                     print "ERROR: %s %s" % (prior, current)
1038         print "\n"
1039
1040
1041     def bestMarkov1Score(self):
1042         """ returns the best markov1 score possible.
1043         """
1044         motLength = len(self.motifMarkov1)
1045         matchMarkov1 = self.probMatchMarkov1
1046         score = 0.0
1047         for index in range(motLength):
1048             col = self.motifMarkov1[index]
1049             mRow = []
1050             for prior in ["A", "C", "G", "T"]:
1051                 for current in ["A", "C", "G", "T"]:
1052                     mRow.append((col[matrixRow[prior]][matrixRow[current]], prior, current))
1053             mRow.sort()
1054
1055             if index == 0:
1056                 currentProb = matchMarkov1(self.motifMarkov1, "N", mRow[-1][2], index)
1057             else:
1058                 currentProb = matchMarkov1(self.motifMarkov1, mRow[-1][1], mRow[-1][2], index)
1059
1060             if currentProb < 0.0001:
1061                 currentProb = 0.0001
1062
1063             if currentProb > 0.0:
1064                 score -= log(currentProb,2.0)
1065
1066         return score
1067
1068
1069     def worstMarkov1Score(self):
1070         """ returns the worst markov1 score possible.
1071         """
1072         motLength = len(self.motifMarkov1)
1073         currentProb = 0.0001
1074         score = -log(currentProb, 2.0) * (motLength - 1)
1075
1076         return score
1077
1078
1079     def calculateMarkov1(self, pseudoCount=1.0):
1080         """ calculate the Markov1 PSSM using a set of non-degenerate instances of the motif.
1081         adds a pseudoCount for unseen combinations.
1082         """
1083         self.motifMarkov1 = []
1084         numSeqs = len(self.sequences) + pseudoCount
1085
1086         if numSeqs < 2:
1087             return []
1088
1089         # using length of first sequence as the length of the motif
1090         length = len(self.sequences[0])
1091
1092         for index in range(length):
1093             self.motifMarkov1.append([[pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1094                                       [pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1095                                       [pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1096                                       [pseudoCount, pseudoCount, pseudoCount, pseudoCount]])
1097
1098         for seq in self.sequences:
1099             theseq = seq.upper()
1100             index = 0
1101             prior = -1
1102             for pos in theseq:
1103                 if index == 0:
1104                     for priorNT in range(4):
1105                         self.motifMarkov1[index][priorNT][matrixRow[pos]] += 0.25
1106                 else:
1107                     self.motifMarkov1[index][prior][matrixRow[pos]] += 1.0
1108
1109                 prior = matrixRow[pos]
1110                 index += 1
1111
1112         for index in range(length):
1113             for prior in range(4):
1114                 for current in range(4):
1115                     self.motifMarkov1[index][prior][current] /= numSeqs
1116
1117         self.buildReverseMarkov1(pseudoCount)
1118
1119
1120     def buildReverseMarkov1(self, pseudoCount=1.0):
1121         """ calculate the Reverse Markov1 PSSM using a set of non-degenerate instances of the motif.
1122         """
1123         self.revMarkov1 = []
1124         numSeqs = len(self.sequences) + pseudoCount
1125
1126         if numSeqs < 2:
1127             return []
1128
1129         # using length of first sequence as the length of the motif
1130         length = len(self.sequences[0])
1131         for index in range(length):
1132             self.revMarkov1.append([[pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1133                                     [pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1134                                     [pseudoCount, pseudoCount, pseudoCount, pseudoCount],
1135                                     [pseudoCount, pseudoCount, pseudoCount, pseudoCount]])
1136
1137         for aSeq in self.sequences:
1138             seq = complement(aSeq.upper(), length)
1139             index = 0
1140             prior = -1
1141             for pos in seq:
1142                 if index == 0:
1143                     for priorNT in range(4):
1144                         self.revMarkov1[index][priorNT][matrixRow[pos]] += 0.25
1145                 else:
1146                     self.revMarkov1[index][prior][matrixRow[pos]] += 1.0
1147
1148                 prior = matrixRow[pos]
1149                 index += 1
1150
1151         for index in range(length):
1152             for prior in range(4):
1153                 for current in range(4):
1154                     self.revMarkov1[index][prior][current] /= numSeqs
1155
1156
1157     def scoreMarkov1(self, aSeq, maxScore=10000000.):
1158         """ calculate the matching score using the Markov1.
1159             limit search if given a low maxScore
1160         """
1161         motLength = len(self.motifMarkov1)
1162         matchMarkov1 = self.probMatchMarkov1
1163         Score = []
1164         if len(aSeq) < motLength:
1165             return Score
1166         else:
1167             theSeq = upper(aSeq)
1168
1169         pos = 0
1170         seqProb = 0.0
1171         revSeqProb = 0.0
1172         subSeq = theSeq[pos: pos + motLength]
1173         previousNT = "N"
1174         for index in range(motLength):
1175             currentNT = subSeq[index]
1176             currentProb = matchMarkov1(self.motifMarkov1, previousNT, currentNT, index)
1177
1178             if currentProb < 0.0001:
1179                 currentProb = 0.0001
1180
1181             if currentProb > 0.0:
1182                 seqProb -= log(currentProb,2.0)
1183
1184             revCurrentProb = matchMarkov1(self.revMarkov1, previousNT, currentNT, index)
1185             if revCurrentProb < 0.002:
1186                 revCurrentProb = 0.002
1187
1188             if revCurrentProb > 0.0:
1189                 revSeqProb -= log(revCurrentProb, 2.0)
1190
1191             if seqProb > maxScore and revSeqProb > maxScore:
1192                 return (seqProb, revSeqProb)
1193
1194             previousNT = currentNT
1195
1196         return (seqProb, revSeqProb)
1197
1198
1199     def probMatchMarkov1(self, theMarkov1, previousNT, NT, colIndex):
1200         """ returns the likelihood of seeing NT given previousNT at this position of the motif.
1201         """
1202         currentProb = 0.0
1203         if NT in ["A", "C", "G", "T"]:
1204             currentNT = matrixRow[NT]
1205         else:
1206             currentNT = 0
1207
1208         try:
1209             prevNT = matrixRow[previousNT]
1210         except KeyError:
1211             for index in range(4):
1212                 currentProb += theMarkov1[colIndex][index][currentNT]
1213
1214             return currentProb
1215
1216         if NT in ["A", "C", "G", "T"]:
1217             return theMarkov1[colIndex][prevNT][currentNT]
1218
1219         if NT == "N":
1220             return 1.0
1221
1222         for motifNucleotide in ["A", "T", "C", "G"]:
1223             if NT in motifDict[motifNucleotide]:
1224                 row = matrixRow[NT]
1225                 currentProb += theMarkov1[colIndex][prevNT][row]
1226
1227         return currentProb
1228
1229
1230     def isSane(self, minLen=7, stretchLen=6):
1231         """ check for motif sanity, which includes: minimum length, less than half N's in consensus,
1232             motifs include more than two nucleotides, no nucleotide or dinucleotide is repeated more
1233             than stretchlen. The appropriate exceptions are made for 'GC' dinucleotides.
1234         """
1235         stretchLen = int(stretchLen)
1236         minLen = int(minLen)
1237         stretchLen = min(stretchLen, minLen - 1)
1238
1239         cons = self.buildConsensus()
1240         motifLen = float(len(cons))
1241         if motifLen < minLen:
1242             return False
1243
1244         nCount = cons.count("N")
1245         if (nCount >= 0.5 * motifLen):
1246             return False
1247
1248         aCount = cons.count("A")
1249         gCount = cons.count("G")
1250         cCount = cons.count("C")
1251         tCount = cons.count("T")
1252
1253         atCount = aCount + tCount
1254         agCount = aCount + gCount
1255         acCount = aCount + cCount
1256         gtCount = gCount + tCount
1257         tcCount = tCount + cCount
1258
1259         for pairedCount in [atCount, agCount, acCount, gtCount, tcCount]:
1260             if pairedCount == motifLen:
1261                 return False
1262
1263         cons = self.buildStrictConsensus()
1264         repeatSequences = []
1265         for nucleotide in ["A", "G", "C", "T"]:
1266             repeatSequences.append(nucleotide * stretchLen)
1267
1268         if stretchLen % 2 != 0:
1269             stretchLen += 1
1270
1271         repeatCount = stretchLen/2
1272         for dinucleotide in ["AG", "AC", "AT", "CT", "GT"]:
1273             repeatSequences.append(dinucleotide * repeatCount)
1274
1275         for testSequence in repeatSequences:
1276             if cons.count(testSequence):
1277                 return False
1278
1279         return True
1280
1281
1282 def correlateMotifs(actualMotifA, actualMotifB, maxSlide=1):
1283     """ Compares two motifs using the "pearson correlation coefficient-like" MSS.
1284         Will slide a motif up to maxSlide bases compared to the other,
1285         and reports the best score.
1286     """
1287     bestScore = 0.0
1288
1289     if len(actualMotifA) < len(actualMotifB):
1290         motA = actualMotifB
1291         motB = actualMotifA
1292     else:
1293         motA = actualMotifA
1294         motB = actualMotifB
1295
1296     motApwm = motA.getPWM()
1297     motBpwm = motB.getPWM()
1298     motCpwm = motB.buildReversePWM()
1299     if hasMotifExtension:
1300         return  _motif.correlateMotifs(motApwm, motBpwm, motCpwm, maxSlide)
1301     else:
1302         length = len(motA)
1303         padLength = length - len(motB)
1304         Ncol = [symbolToMatrix["N"]]
1305         for slide in range(-1 * maxSlide, maxSlide + padLength + 1):
1306             pwmA = deepcopy(motApwm)
1307             pwmB = deepcopy(motBpwm)
1308             pwmC = deepcopy(motCpwm)
1309             if slide < 0:
1310                 pwmA = Ncol * abs(slide) + pwmA
1311                 pwmB = pwmB + Ncol * (abs(slide) + padLength)
1312                 pwmC = pwmC + Ncol * (abs(slide) + padLength)
1313             elif slide > 0 and slide <= maxSlide:
1314                 if padLength > 0:
1315                     if padLength >= slide:
1316                         adjustedPadLength = padLength - slide
1317                         adjustedSlide = 0
1318                     else:
1319                         adjustedPadLength = 0
1320                         adjustedSlide = slide - padLength
1321
1322                     pwmA = pwmA + Ncol * adjustedSlide
1323                     pwmB = Ncol * slide + pwmB + Ncol * adjustedPadLength
1324                     pwmC = Ncol * slide + pwmC + Ncol * adjustedPadLength
1325                 else:
1326                     pwmA = pwmA + Ncol * slide
1327                     pwmB = Ncol * slide + pwmB
1328                     pwmC = Ncol * slide + pwmC
1329             elif slide > maxSlide:
1330                 maxDiff = slide - maxSlide
1331                 pwmA = pwmA + Ncol * maxSlide
1332                 pwmB = Ncol * slide  + pwmB + Ncol * (padLength - maxDiff)
1333                 pwmC = Ncol * slide  + pwmC + Ncol * (padLength - maxDiff)
1334             else:
1335                 pwmB = pwmB + Ncol * padLength
1336                 pwmC = pwmC + Ncol * padLength
1337
1338             score1 = 0.0
1339             score2 = 0.0
1340             thisLength = len(pwmA)
1341             for index in range(thisLength):
1342                 score1 += pearsonCorrelation(pwmA[index], pwmB[index])
1343                 score2 += pearsonCorrelation(pwmA[index], pwmC[index])
1344
1345             score1 = score1 / float(thisLength)
1346             score2 = score2 / float(thisLength)
1347             if score1 < score2 and score2 > bestScore:
1348                 bestScore = score2
1349             elif score1 > bestScore:
1350                 bestScore = score1
1351
1352     return bestScore
1353
1354
1355 def MSS(motifA, motifB, maxSlide=1):
1356     """ Compares two motifs using the motif similarity score (MSS).
1357         Will slide a motif up to maxSlide bases compared to the other,
1358         and reports the best score. Wrapper around correlateMotifs()
1359     """
1360     return correlateMotifs(motifA, motifB, maxSlide)
1361
1362
1363 def printMSS(motifList, maxSlide=1):
1364     """ Prints a matrix of MSS comparisons between different motifs
1365         in motifList.
1366     """
1367     for mot1 in motifList:
1368         print mot1.tagID,
1369         for mot2 in motifList:
1370             val = "\t%1.2f" % correlateMotifs(mot1, mot2, maxSlide)
1371             print val,
1372
1373         print ""