cistematic/programs/Consensus.py

   1 ##Sarah Aerni
   2 ##Created:  July 12, 2005
   3 ##Modified: June 22, 2006
   4 ##Modified: Oct  22, 2008 by Ali (Minor modifications to outputs)
   5 ##This is a greedy motif finding program.
   6 ##Using background models it determines which motifs are overrepresented
   7 ###############################################################################
   8 ##include a way to go back and
   9 ##uses the new way of doing background and the new scoring rather than simple
  10 ##consensus it uses the log likelihood
  11 ###############################################################################
  12
  13 import hotshot
  14 import random
  15 import math
  16 import sys
  17 import copy
  18 import time
  19
  20 from math import log
  21 from math import e
  22 from math import ceil
  23 from random import shuffle
  24 from random import randint
  25
  26 ConsensusScore= {True:2, False:1}
  27
  28 NTDIndices = {'A':0, 'C':1, 'G':2, 'T':3}
  29 IndexNTDs  = {0:'A', 1:'C', 2:'G', 3:'T'}
  30 INSERTION_N = "N"
  31
  32 NTDA=0
  33 NTDC=1
  34 NTDG=2
  35 NTDT=3
  36
  37 INSERTION_MARKER="S"
  38 INSERTION_FILLER="N"
  39
  40 #markov size - 1 = markov model
  41 #example:
  42 #if you want a markov 1 you want to check 2 ntds total
  43 #markov_size = 2
  44 #declare global variables
  45 global MARKOV_WINDOW
  46 global MW1
  47 global percID
  48 global Founders
  49 global UseRC
  50 global Frequency
  51 global NumOfMismatch
  52 Frequency={}
  53 global sizeOfMotif
  54 global sequences
  55 global founderPercID
  56 #MotifFinder with 3 arguments
  57 #input:     sequences       sequences in which we are to find a motif
  58 #           sizeOfMotif     size of the motif to be identified in the sequences
  59 #           numberOfMotifs  the number of motifs of the indicated size to be
  60 #                           identified in the sequences
  61 #           Iterations      The number of iterations that will be perfdormed in
  62 #                           order to determine an optimal motif
  63 #           Frequency       Markov Model for background
  64 #           UseRC           a boolean indicating whether the reverse complement
  65 #                           should be used i the finding of a motif
  66 #                           (True = yes)
  67 #           Founders        a boolean indicating whether the first 2 sequences
  68 #                           should always be the founder sequences
  69 #           percID          percent of the aligned sequences that must be
  70 #                           identical in order to be considered for a motif to
  71 #                           be considered (between the founder sequences then
  72 #                           between founder sequence consensus and each other
  73 #                           subsequence being considered)
  74 #           founderPercID   The minimum percent identity two aligned sequences
  75 #                           must be to qualify as founder sequences
  76 #           MARKOV_SIZE     Size of the markov model (corresponds to length of
  77 #                           the words in the dictionary Frequency)
  78 #output:    (List1, List2)  List1 contains PSFMs of the indicated length and up
  79 #                           to the indicated number (there may be fewer) in the
  80 #                           standard cistematic format, while List2 holds the
  81 #                           corresponding sequences.
  82 #
  83 #This function will call the overloaded MotifFinder function, passing in a
  84 #default amount for the number of iterations (currently set at 100)
  85 def MotifFinder(sequences, minSize, maxSize, numberOfMotifs, Iterations,
  86                 Frequency, UseRC, Founders, percID, founderPercID,MARKOV_SIZE,
  87                 model,masking):
  88
  89     #declare global variables
  90     global MARKOV_WINDOW
  91     global MW1
  92     global NumOfMismatch
  93     global sizeOfMotif
  94     numOfSequences=len(sequences)
  95
  96     #the minimum window size permitted by this program is a motif of length 6
  97     if minSize < 6:
  98         minSize = 6
  99
 100     #sanity check for reasonable motif sizes
 101     if minSize > maxSize:
 102         print "Please input a minimum size that is smaller than a maximum size"
 103         print "User input:\nMinimum Size:\t%i\nMaximumSize:\t%i" % (minSize, maxSize)
 104         return None
 105
 106     #sanity check for reasonable percentID
 107     if percID> founderPercID:
 108         print "Your input %i for percent identity for founder sequences" % founderPercID,
 109         print "is less than your input for general percent identity at",percID
 110         print "Please adjust your input"
 111         return None
 112
 113     #a user may input a number of iterations which exceeds the possible
 114     #for a given set of input values, in which case the number of iterations
 115     #should be decreased to another size
 116     maxIterations = (maxSize - minSize+1)*Choose(len(sequences),2)*Factorial(len(sequences)-2)
 117     if maxIterations < Iterations:
 118         print "The number of Iterations you entered", Iterations,
 119         print "is being reduced to",maxIterations,
 120         Iterations = maxIterations
 121
 122     #adjusting the window as remarked above
 123     MARKOV_WINDOW = MARKOV_SIZE + 1
 124     MW1=MARKOV_SIZE
 125     print len(sequences), "sequences search for",numberOfMotifs,
 126     print " motifs of size",minSize,"to",maxSize
 127     print "The percent Identity input is",percID,"%"
 128
 129     #this list will contain motifs found (up to the NumberOfMotifs passed in
 130     sequencesToCompare = [i.upper() for i in sequences]
 131     #as a preprocessing steps, a string of Ns will be replaced by one N
 132     for i in xrange(len(sequences)):
 133         splitByN=sequencesToCompare[i].split('N')
 134         j = 0;
 135         finalSequence=""
 136         max_j=len(splitByN)
 137         while j  < max_j :
 138             if len(splitByN[j])==0:
 139                 finalSequence="".join([finalSequence,'N'])
 140                 while len(splitByN[j])==0:
 141                     j+=1
 142                     if j==max_j:
 143                         break
 144             else:
 145                 finalSequence="".join([finalSequence,splitByN[j]])
 146                 j+=1
 147         sequencesToCompare[i]=finalSequence
 148     originalSeqs = []
 149     aveCalcSeqs = []
 150
 151     if UseRC:
 152         for seqIndex in xrange(len(sequencesToCompare)):
 153             sequence = revComp(sequencesToCompare[seqIndex])
 154             aveCalcSeqs.append(''.join([sequencesToCompare[seqIndex], sequence]))
 155             originalSeqs.append(''.join([sequencesToCompare[seqIndex],INSERTION_N,sequence]))
 156             sequencesToCompare[seqIndex] = ''.join([sequencesToCompare[seqIndex],INSERTION_N*(MARKOV_WINDOW),sequence[MW1:]])
 157             sequence = ''.join([INSERTION_N*(MW1),sequencesToCompare[seqIndex][MW1:]])
 158             sequencesToCompare[seqIndex] = sequence
 159
 160     AllPSFMS = []
 161     AllPSFMseqs = []
 162     for motif_i in xrange(numberOfMotifs):
 163         empty=min([len(sequencesToCompare[i]) for i in xrange(len(sequencesToCompare))])
 164         if empty < maxSize:
 165             print "A sequence has been depleted of all options"
 166             return (AllPSFMS, AllPSFMseqs)
 167
 168         OverallBest = "Infinity"
 169         print "MOTIF NUMBER %i\n" % motif_i
 170         #multiple iterations are attempted to avoid local maxima
 171         for step in xrange(Iterations):
 172             sys.stdout.flush()
 173             numIncluded = 2 #used in order to create PSFMs from PWMs
 174             #CompareList will be used to randomly generate the sequences to be
 175             #compared
 176             CompareList = range(len(sequencesToCompare))
 177             BestScore = "Infinity"
 178             #find the best shared motif between the first two sequences
 179             #the motif must be completely contained within the sequence
 180             #or its reverse complement
 181             NumberOfTries = 0
 182             while (BestScore == "Infinity"):
 183                 #PWM created from all sequences included in the motif
 184                 PWM = []
 185                 #pick motif size randomly within the range provided by the user
 186                 sizeOfMotif = random.randint(minSize,maxSize)
 187                 NumOfMismatch = sizeOfMotif-ceil((percID)/100.0*sizeOfMotif)
 188                 #if the motif average is not yet established create the average
 189                 #and store tis value in the dictionary
 190
 191                 #best motif startpoints will be stored in this list of start
 192                 #locations
 193                 BestMotif = [-1] * len(sequencesToCompare)
 194                 #these are start locations are used in order to test new best
 195                 #start indices
 196                 startLocs = BestMotif[:]
 197                 #if it appears there are no more motifs possible quit
 198                 if (NumberOfTries > Choose(len(sequencesToCompare), 2)):
 199                     print "No more motifs to be found after %i tries" % NumberOfTries
 200                     return (AllPSFMS, AllPSFMseqs)
 201                 NumberOfTries +=1
 202                 #randomize sequence order
 203                 #if the sequence founders are input then you only want to
 204                 #randomize the remaining sequences
 205                 if Founders:
 206                     CompareList = [0,1]
 207                     remaining = range(2,len(sequencesToCompare))
 208                     shuffle(remaining)
 209                     CompareList.extend(remaining)
 210                 #if no founders are specified, randomize all
 211                 else:
 212                     shuffle(CompareList)
 213
 214                 SeqIndex0 = CompareList[0]
 215                 SeqIndex1 = CompareList[1]
 216                 #founder sequences - first 2 sequences - are selected
 217                 #create random start sites to avoid favoring the 5' region
 218                 startLocs[SeqIndex0] = -1
 219                 startLocs[SeqIndex1] = -1
 220                 BestMotif[SeqIndex0] = -1
 221                 BestMotif[SeqIndex1] = -1
 222                 BestScore = "Infinity"
 223                 #define the best motif between the first two sequences the
 224                 #inital set of comparisons are performed at the same indices in
 225                 #both "founder" sequences
 226
 227                 #to ensure that the startpoint is random we will rearrange the
 228                 # sequences
 229                 start0 = randint(0,len(sequencesToCompare[SeqIndex0])-sizeOfMotif-1)
 230                 accessSeq=sequencesToCompare[SeqIndex0]
 231                 seq0 = ''.join([accessSeq[start0+1:],'N',accessSeq[:start0+sizeOfMotif]])
 232                 start1 = randint(0,len(sequencesToCompare[SeqIndex1])-sizeOfMotif-1)
 233                 accessSeq=sequencesToCompare[SeqIndex1]
 234                 seq1 = ''.join([accessSeq[start1+1:],'N',accessSeq[:start1+sizeOfMotif]])
 235
 236                 #create shifted score, then check to see if it meets all
 237                 #criteria to become the best.
 238
 239                 #Perform a time efficient alignment of the founder sequences
 240                 (BestScore) = TimeEfficientAlignment(seq0,seq1,SeqIndex0,SeqIndex1,startLocs, founderPercID)
 241
 242                 if BestScore=="Infinity":
 243                     continue
 244                 NLoc0=len(sequencesToCompare[SeqIndex0])-start0-1
 245                 NLoc1=len(sequencesToCompare[SeqIndex1])-start1-1
 246
 247                 #remap to the correct location
 248                 if startLocs[SeqIndex0] > NLoc0:
 249                     startLocs[SeqIndex0]=(startLocs[SeqIndex0]+start0)%len(sequencesToCompare[SeqIndex0])
 250                 else:
 251                     startLocs[SeqIndex0]=(startLocs[SeqIndex0]+start0+1)
 252
 253                 if startLocs[SeqIndex1] > NLoc1:
 254                     startLocs[SeqIndex1]=(startLocs[SeqIndex1]+start1)%len(sequencesToCompare[SeqIndex1])
 255                 else:
 256                     startLocs[SeqIndex1]=(startLocs[SeqIndex1]+start1+1)
 257
 258                 #if we found a good score get all the info necessary to
 259                 #continue, including a PWM and updating the startLocs
 260                 if BestScore != "Infinity":
 261                     index0 = startLocs[SeqIndex0]
 262                     index1 = startLocs[SeqIndex1]
 263                     seq0 = sequencesToCompare[SeqIndex0]
 264                     seq1 = sequencesToCompare[SeqIndex1]
 265                     motif0 = seq0[index0:index0+sizeOfMotif]
 266                     motif1 = seq1[index1:index1+sizeOfMotif]
 267                     BestMotifSeqs=[motif0,motif1]
 268                     PWM = convert2PWM([motif0, motif1],sizeOfMotif)
 269
 270                     #this PWM will be used in order to establish a criterion
 271                     #for future sequences, whether they meet a threshold
 272                     #similiarty to the founder sequences
 273
 274             #if it exists, find the best scoring motifs in the remaining
 275             #sequences.
 276
 277             for seqCompIndex in xrange(2, len(sequencesToCompare)):
 278                 maxScores=[max(PWM[P_I]) for P_I in range(len(PWM))]
 279                 BestScore = "Infinity"
 280                 seq=CompareList[seqCompIndex]
 281                 sequenceToAdd = sequencesToCompare[seq]
 282                 max_i=len(sequenceToAdd)-sizeOfMotif
 283                 bestMismatch = NumOfMismatch
 284                 #to ensure that the startpoint is random we will rearrange the
 285                 #sequences
 286                 start_i = randint(0,max_i-1)
 287                 seq_i = ''.join([sequenceToAdd[start_i+1:],'N',sequenceToAdd[:start_i+sizeOfMotif]])
 288                 i=0
 289                 while (i<=max_i):
 290                     motif=seq_i[i:i+sizeOfMotif]
 291                     #skip this area if there are any masked regions found
 292                     nLoc=motif.rfind('N')
 293                     if nLoc >=0:
 294                         i+=nLoc+1
 295                         continue
 296
 297                     mmNum = 0
 298                     j = 0
 299                     while (j < sizeOfMotif):
 300                         mmNum+=int(PWM[j][NTDIndices[motif[j]]]!=maxScores[j])
 301                         if mmNum > bestMismatch:
 302                             break
 303                         j+=1
 304
 305                     #set a new best if we found it (always keep originals)
 306                     if mmNum < bestMismatch:
 307                         bestMismatch = mmNum
 308                         startLocs[seq]=i
 309
 310                     #we cannot improve upon a perfect match so we break end our
 311                     #search if this occurs
 312                     if mmNum == 0:
 313                         startLocs[seq]=i
 314                         break
 315                     i+=1
 316
 317                 #if somethigng was found we add it to our PWM
 318                 if startLocs[seq]!= -1:
 319                     NLoc_i=len(sequenceToAdd)-start_i-1
 320                     if startLocs[seq] > NLoc_i:
 321                         startLocs[seq]=(startLocs[seq]+start_i)%len(sequenceToAdd)
 322                     else:
 323                         startLocs[seq]=(startLocs[seq]+start_i+1)
 324
 325                     starti=startLocs[seq]
 326                     sAdd=sequencesToCompare[seq][starti:starti+sizeOfMotif]
 327                     checkPWM=add2PWMReturn(sAdd,PWM)
 328                     add=True
 329                     checkMaxScores=[max(checkPWM[P_I]) for P_I in range(len(checkPWM))]
 330                     for seq in BestMotifSeqs:
 331                         mismatches=0
 332                         for i in range(len(seq)):
 333                             mismatches+=int(checkPWM[i][NTDIndices[seq[i]]]!=checkMaxScores[i])
 334                             if mismatches > NumOfMismatch:
 335                                 add=False
 336                                 break
 337                     if add:
 338                         BestMotifSeqs.append(sAdd)
 339                         PWM=copy.deepcopy(checkPWM)
 340                         numIncluded+=1
 341                     elif model=="oops":
 342                         break
 343                 #if we require one occurence per sequence then if we do not
 344                 #have a motif in this sequence we stop our search
 345                 elif model=="oops":
 346                     break
 347
 348             GroupScore =0
 349             #if we have an oops model and not all sequences are being included,
 350             #we have to continue without recording what we have so far
 351             if model=="oops" and numIncluded<numOfSequences:
 352                 continue
 353
 354             #obtain motif scores
 355             GroupScore=0
 356             for i in range(len(PWM)):
 357                 GroupScore += max(PWM[i])
 358
 359             #store this as the best location if it is the best of this set of
 360             #motifs part of that if statement below
 361             if ((OverallBest == "Infinity" and GroupScore != "Infinity" )or \
 362                 GroupScore > OverallBest):
 363                 nextToCompare = sequencesToCompare[:]
 364                 MotifSeqs = []
 365                 OrderOfBest = CompareList[:]
 366                 BestIndices = startLocs[:]
 367                 bestMotifSeqYet = BestMotifSeqs
 368                 OverallBest = GroupScore
 369                 PSFM = convert2PSFM(PWM, numIncluded)
 370                 OverallPWM=PWM[:]
 371                 for i in xrange(len(sequencesToCompare)):
 372                     SeqIndexi = CompareList[i]
 373                     #some sequences may not contain the motifs, if so you do no
 374                     #want to include them
 375                     if BestIndices[SeqIndexi] != -1:
 376                         sAdd=sequencesToCompare[SeqIndexi]
 377                         sAddI=BestIndices[SeqIndexi]
 378                         Motifi = sAdd[sAddI:sAddI+sizeOfMotif]
 379                         MotifSeqs.append(Motifi)
 380
 381                         #mask out original location
 382                         nextToCompare[SeqIndexi]=''.join([sAdd[:sAddI],INSERTION_N*(sizeOfMotif),sAdd[sAddI+sizeOfMotif:]])
 383                         sAdd=nextToCompare[SeqIndexi]
 384                         sAddLen=len(sAdd)
 385                         nextToCompare[SeqIndexi]=''.join([sAdd[:sAddLen-sAddI-sizeOfMotif],INSERTION_N*(sizeOfMotif),sAdd[sAddLen-sAddI:]])
 386             TopMotif = convert2motif(MotifSeqs, sizeOfMotif)
 387
 388         #if not more motifs are found (best score is -1) then it is likely that
 389         #we have found as many as we can
 390         if OverallBest == "Infinity":
 391             print "No more motifs were found!"
 392             return (AllPSFMS, AllPSFMseqs)
 393
 394         #if a motif was found report it here and add it to the total motifs
 395         #Print out your results if any were reached!
 396         for i in xrange(len(sequencesToCompare)):
 397             SeqIndexi = OrderOfBest[i]
 398             if BestIndices[SeqIndexi] != -1:
 399                 Motifi = sequencesToCompare[SeqIndexi][BestIndices[SeqIndexi]:BestIndices[SeqIndexi]+sizeOfMotif]
 400                 MotifSeqs.append(Motifi)
 401
 402         #add the current best motif to the total of all motifs for returning
 403         #to the user
 404         AllPSFMS.append(PSFM)
 405         AllPSFMseqs.append(bestMotifSeqYet)
 406         print OverallPWM
 407
 408         NTDSTUFF = {0:'A',1:'C',2:'G',3:'T'}
 409         for Pntd in [0,1,2,3]:
 410             print ""
 411             print NTDSTUFF[Pntd],
 412             for Pj in xrange(len(PSFM)):
 413                 print "\t %.003f"%(PSFM[Pj][Pntd]),
 414
 415         print "\n**********\n"
 416         print TopMotif
 417         print "\n**********\n"
 418         print "Score:",OverallBest
 419         maxScores=[max(OverallPWM[P_I]) for P_I in range(len(OverallPWM))]
 420         for maski in xrange(numOfSequences):
 421             maskThis=nextToCompare[maski]
 422             i = 0
 423             max_i = len(maskThis)-sizeOfMotif
 424             while (i<=max_i):
 425                 motif=maskThis[i:i+sizeOfMotif]
 426                 #skip this area if there are any masked regions found
 427                 nLoc=motif.rfind('N')
 428                 if nLoc >=0:
 429                     i+=nLoc+1
 430                     continue
 431                 mmNum = 0
 432                 j = 0
 433                 while (j < sizeOfMotif):
 434                     if PWM[j][NTDIndices[motif[j]]]!=maxScores[j]:
 435                         mmNum+=1
 436                         if mmNum > NumOfMismatch:
 437                             break
 438                     j+=1
 439                 #if the location is below threshold we mask it out also
 440                 if mmNum>NumOfMismatch:
 441                     i+=1
 442                     continue
 443
 444                 i+=1
 445
 446         #replace old sequences with the new one
 447         sequencesToCompare = nextToCompare[:]
 448         #reduce masked regions to single N
 449         for i in xrange(len(sequences)):
 450             splitByN=sequencesToCompare[i].split('N')
 451             j = 0;
 452             finalSequence=""
 453             max_j=len(splitByN)
 454             while j  < max_j :
 455                 if len(splitByN[j])==0:
 456                     finalSequence="".join([finalSequence,'N'])
 457                     while len(splitByN[j])==0:
 458                         j+=1
 459                         if j==max_j:
 460                             break
 461                 else:
 462                     finalSequence="".join([finalSequence,splitByN[j]])
 463                     j+=1
 464             sequencesToCompare[i]=finalSequence
 465
 466     return (AllPSFMS, AllPSFMseqs)
 467
 468
 469 #MaskLocation
 470 #input:     sequence    sequence to be masked
 471 #           location    location at which to mask the sequence
 472 #           length      length of area to mask
 473 #output     string      masked sequence
 474 def MaskLocation(sequence,start,length):
 475     finalsequence=''.join([sequence[:start],INSERTION_FILLER*(length),sequence[start+length:]])
 476     newLen=len(finalsequence)
 477     finalsequence=''.join([finalsequence[:newLen-start-length],INSERTION_FILLER*(length),finalsequence[newLen-start:]])
 478
 479     return finalsequence
 480
 481
 482 #TimeEfficientAlignment
 483 #input:     seq0        string sequence in which to calculate a best location
 484 #           seq1        string second sequence
 485 #           start0      integer start location for alignment (gives a skew)
 486 #           start1      integer start for second sequence
 487 #           BestScore   float which is the current bestscore
 488 #           percID   percent of the aligned sequences that must be identical
 489 #                       in order to be considered for a motif
 490 #output     float       Best scoring alignment's score
 491 #
 492 #will be calculated as an ungapped consensus between two sequences as the
 493 #algorithm steps through each window
 494 def TimeEfficientAlignment(seq0, seq1, SeqIndex0, SeqIndex1, startLocs, percID):
 495     global sizeOfMotif
 496     NumOfMismatch = sizeOfMotif-ceil((percID)/100.0*sizeOfMotif)
 497     bestMismatch = NumOfMismatch
 498     MW1=MARKOV_WINDOW-1
 499
 500     #create shifted score, then check to see if it meets all
 501     #criteria to become the best.
 502
 503     i = 0
 504     maxi=len(seq0)-sizeOfMotif
 505     maxj=len(seq1)-sizeOfMotif
 506
 507     Score="Infinity"
 508
 509     while (i<=maxi):
 510         motif=seq0[i:sizeOfMotif+i]
 511         Nloc=motif.rfind('N')
 512         if Nloc>=0:
 513             i+=Nloc+1
 514             continue
 515         j=0
 516         while (j<=maxj):
 517             Nloc=seq1[j:j+sizeOfMotif].rfind('N')
 518             if Nloc>=0:
 519                 j+=Nloc+1
 520                 continue
 521             mismatchedNTDs=0
 522             for k in xrange(sizeOfMotif):
 523                 if motif[k]!=seq1[k+j]:
 524                     mismatchedNTDs+=1
 525                     if mismatchedNTDs > bestMismatch:
 526                         break
 527             if mismatchedNTDs == 0:
 528                 Score = 2*sizeOfMotif-bestMismatch
 529                 startLocs[SeqIndex0]=i
 530                 startLocs[SeqIndex1]=j
 531                 return (Score)
 532             if mismatchedNTDs < bestMismatch:
 533                 bestMismatch = mismatchedNTDs
 534                 Score = 2*sizeOfMotif-bestMismatch
 535                 startLocs[SeqIndex0]=i
 536                 startLocs[SeqIndex1]=j
 537             j+=1
 538         i+=1
 539     return (Score)
 540
 541
 542 #AlignmentScore
 543 #input:     sequenceToCompare   List of sequences whose substrings will be
 544 #           sizeOfMotif         aligned integer size of the motif being found
 545 #                               (length of the subseqeunce that will be aligned
 546 #                                from each sequence in above list (window size)
 547 #           startLocs           start locations of the motifs to be aligned to
 548 #                               each other
 549 #           CompareList         the indices of sequencesToCompare to be aligned
 550 #           numSeqs             the number of sequences -1 from
 551 #                               sequencesToCompare to be aligned. ie, the indices
 552 #                               of sequenceToCompare stored in the first numSeqs
 553 #                               indices of in CompareList.
 554 #           Frequency           markov model being used to calculate background
 555 #           originalSeqs        contain the unmasked sequences used for checking
 556 #                               the markov score
 557 #           NumOfMismatch       number of mismatches allowable
 558 #output:    integer             Score indicating the consensus score of these
 559 #                               sequences
 560 #           2D-List             contains the PSFM
 561 #           2D-List             contains the log markov scores
 562 #
 563 #will be calculated as an ungapped consensus between those elements in the
 564 #CompareList Consensus score is calculated by choosing the largest number of the
 565 #same elements in each column, and adding all these numbers up across all
 566 #columns
 567 def AlignmentScore(sequencesToCompare, sizeOfMotif, startLocs, CompareList, numSeqs, Frequency, originalSeqs, MARKOV_WINDOW,NumOfMismatch):
 568     TotalScore = 0;
 569     Scores = []
 570     MW1=MARKOV_WINDOW-1
 571     PWM = []
 572     Log = []
 573     ConsensusScore = 0
 574     len(sequencesToCompare)
 575     #traverse each column individually
 576     for i in xrange (sizeOfMotif):
 577         divisor = 0.0
 578         PWMi = [0.0, 0.0, 0.0, 0.0]
 579         Logi = [0.0, 0.0, 0.0, 0.0]
 580         #traverse this index in each sequence
 581         for j in xrange(numSeqs+1):
 582             SequenceIndex = CompareList[j]
 583             CurrSeq = sequencesToCompare[SequenceIndex]
 584             CurrOr = originalSeqs[SequenceIndex]
 585             #some sequences may not contain the motifs, if so you do not want
 586             #to include them in the consensus. These have uninitialized start
 587             #locations (ie startLocs would be -1
 588             if startLocs[SequenceIndex] != -1:
 589                 divisor += 1.0
 590                 if sequencesToCompare[SequenceIndex][startLocs[SequenceIndex]+i] == 'N':
 591                     print sequencesToCompare
 592                     print "\nBAD HERE!"
 593                     print CurrSeq
 594                     print startLocs
 595                     print startLocs[SequenceIndex]
 596                     print CompareList
 597                     print j
 598                     print SequenceIndex
 599                     print numSeqs
 600                     print sizeOfMotif
 601                     print CurrSeq[startLocs[SequenceIndex]:startLocs[SequenceIndex]+sizeOfMotif]
 602
 603                 PWMi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += 1.0
 604                 Logi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += Frequency[CurrOr[startLocs[SequenceIndex]-MARKOV_WINDOW+i+1:startLocs[SequenceIndex]+i+1]]
 605
 606         Scores.append(0)
 607         Top = -1
 608         for NTD in ['A', 'C', 'G', 'T']:
 609             #avoid ln(0) errors
 610             if PWMi[NTDIndices[NTD]] > 0:
 611                 Scores[i] += PWMi[NTDIndices[NTD]]/divisor *log(PWMi[NTDIndices[NTD]]/(divisor*Logi[NTDIndices[NTD]]/PWMi[NTDIndices[NTD]]), e)
 612             if PWMi[NTDIndices[NTD]] > Top:
 613                 Top = PWMi[NTDIndices[NTD]]
 614         TotalScore += Scores[i]
 615         ConsensusScore += Top
 616         PWM.append(PWMi)
 617         Log.append(Logi)
 618
 619     return (TotalScore, Scores, ConsensusScore,PWM, Log)
 620
 621
 622 #MarkovFreq
 623 #input:     prefix      string of length MARKV_SIZE - 1 prefix used for model
 624 #           actualNTD   character NTD at the en dof the prefix being calculated
 625 #           Frequency   Markov model for calculations
 626 #output:    float that gives the markov score for this specific sequence
 627 #
 628 #The helper function will run through and find all possible words with the
 629 #prefix and determine the markov score based on this
 630 def MarkovFreq (prefix, actualNTD, Frequency):
 631
 632     denominator = 0.0
 633     numerator = 0.0
 634     for NTD in ['A', 'C', 'G', 'T']:
 635         value = M_Score(prefix+NTD, Frequency, False,MARKOV_WINDOW)
 636         if NTD == actualNTD :
 637             numerator = value
 638         denominator += value
 639     retVal = numerator/denominator
 640     return retVal
 641
 642
 643 #revComp
 644 #input:     sequence    DNA sequence to be converted to reverse complement
 645 #output:    string      reverse complement of input sequence
 646 #
 647 #obtains the reverse complement of an input sequence
 648 def revComp (sequence):
 649     #base pairs
 650     RevDict={'A':'T','T':'A', 'C':'G', 'G':'C', 'N':'N'}
 651     reverse = ""
 652     #reverse the sequene
 653     for i in xrange(len(sequence)):
 654         reverse = RevDict[sequence[i].upper()]+reverse
 655     return reverse
 656
 657
 658 #Markov3
 659 #input:     sequences   list that are being used to create the background
 660 #output:    dictionary of all 6mers (reverse complement also) and their -log2
 661 #           proportion seen
 662 #
 663 #background will build a markov model of the background in order to be able
 664 #to differentiate the motifs from the pure background of a certain size
 665 #they will be stored as -log(fraction)
 666 def Markov(sequences, IncludeRC,MARKOV):
 667     MARKOV_WINDOW = MARKOV + 1
 668     WordDict = {}
 669     totalWindows = 0
 670
 671     #take each sequence and use it in order to separately determine background
 672     for seq in sequences:
 673         #all sequences for the background must be full-length
 674         for index in xrange(len(seq)-MARKOV):
 675             subseq = seq[index:index+MARKOV_WINDOW].upper()
 676             if "N" in subseq:
 677                 continue
 678
 679             totalWindows += 1
 680             if subseq not in WordDict:
 681                 WordDict[subseq] = 0.0
 682             WordDict[subseq] += 1.0
 683             if IncludeRC:
 684                 totalWindows += 1
 685                 RC = revComp(subseq)
 686                 if RC not in WordDict:
 687                     WordDict[RC] = 0.0
 688                 WordDict[RC] += 1.0
 689
 690     #convert to logs
 691     for key in WordDict:
 692         WordDict[key] = 1.0*WordDict[key]/totalWindows
 693     return WordDict
 694
 695
 696 #Average_M
 697 #input:     sequences   List of sequences on which to find the average
 698 #                       markov score
 699 #           Model       Dictionary containing pvalues for seeing 3mers
 700 #           l           integer designating the word sizes from which to
 701 #                       determine average pvalue
 702 #output:    average probability of all input lmers in the sequences in the Model
 703 #
 704 #finds the probability of seeing all subsequence in the total strings
 705 #using the markov model created using the background. Markov3 is used
 706 #(window size of 3) and from this determine the average. This functio will
 707 #also screen the background model
 708 def Average_M (sequence, Model, l,MARKOV_WINDOW):
 709     totalSum = 0.0;
 710     totalWords = 0.0;
 711     MW1=MARKOV_WINDOW-1
 712     for seq in sequence:
 713         for i in xrange(MW1,len(seq)-l+1):
 714             sequenceCheck=seq[i-MW1:i+1]
 715             Nindex=sequenceCheck.rfind('N')
 716             if Nindex >= 0:
 717                 continue
 718             totalWords += 1.0     #increase number of words
 719             PVal = M_Score(sequenceCheck, Model, True,MARKOV_WINDOW)
 720             #add current word to the running total of scores
 721             totalSum += PVal
 722     retVal = totalSum/totalWords
 723     print totalWords
 724
 725     return retVal
 726
 727
 728 #M_Score
 729 #input:     sequence    string for which the Pvalue is to be determined
 730 #           Model       Dictionary containing log2 pvalues for seeing 6mers
 731 #           check       Boolean which determines whether to also check for
 732 #                       completeness of markov model
 733 #output:    log2 probability of seeing the input seqeunce in the Model
 734 #
 735 #gives the probability of seeing the given subsequence in the total strings
 736 #using the markov model created using the background. Markov6 is used
 737 #(window size of 3)
 738 def M_Score (sequence, Model, check,MARKOV_WINDOW):
 739     PVal = 0.0
 740     MW1=MARKOV_WINDOW-1
 741     for j in xrange(len(sequence)-MARKOV_WINDOW+1):
 742         #if the subsequences is not in the background value it
 743         #the program returns an exit code and asks the user to
 744         #revise background model input
 745         if sequence[j:j+MARKOV_WINDOW] not in Model:
 746             if check:
 747                 print "The Markov Model is inadequate for your input",
 748                 print "sequences\n %s is"%sequence[j:j+MARKOV_WINDOW],
 749                 print "not contained in model provided\n",
 750                 print "Please revise your provided model or consider",
 751                 print "using Background Modelling provided"
 752                 sys.exit(0)
 753             continue
 754         #calculates score
 755         PVal += -log(Model[sequence[j:j+MARKOV_WINDOW]],e)
 756
 757     return PVal
 758
 759 #LogOdds
 760 #input:     sequences   sequences for which to find the log odds score
 761 #           startLocs   start locations at which to start computing score
 762 #           sizeOfMotif size of the motif being created
 763 #           Freqeuncy   Markov3 Model
 764 #           Index       Current Indices
 765 #           CompareList Order of Comparison
 766 #output     returns the log odds score for the consensus
 767 #the equation used is as follow:
 768 #S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path))))
 769 def LogOddsoPT(sequences, startLocs, sizeOfMotif, Frequency, Index,CompareList,MARKOV_WINDOW):
 770     MW1=MARKOV_WINDOW-1
 771     for i in range(sizeOfMotif):
 772         for j in range(Index+1):
 773             seqIndex = CompareList[j]
 774             lnValue = [0.0, 0.0, 0.0, 0.0]
 775             totalNum = [0.0, 0.0, 0.0, 0.0]
 776             #add the frequency of seeing the given nucleotides in each position
 777             #a runnning total of each one. Then add them to the equation
 778             if startLocs[seqIndex] != -1:
 779                 index = startLocs[seqIndex]+i
 780                 previous = sequences[seqIndex][index-(MW1):index+1]
 781                 denominator = 0.0
 782                 numerator = 0.0
 783                 #determine probabilities for each path
 784                 for NTD in ['A', 'C', 'G', 'T']:
 785                     full = previous+NTD
 786                     if full in Frequency:
 787                         value = Frequency[full]
 788                         if NTD == sequences[seqIndex][index+1]:
 789                             numerator = value
 790                         denominator += value
 791                 #increase the summation of each location
 792                 lnValue[NTDIndices[sequences[seqIndex][index+1]]] +=numerator/denominator
 793                 #increase number of given nucleotides at the index
 794                 totalNum[NTDIndices[sequences[seqIndex][index+1]]] += 1.0
 795
 796
 797 #LogOdds
 798 #input:     sequence    relevant part of the sequence being added to the PWM
 799 #           PWM         information on sequences already in the motif
 800 #           LogsM       frequency information on sequences already in motif
 801 #           Frequency   markov model for background
 802 #           sizeOfMotif size f the motif being found
 803 #output     returns the log odds score for the consensus
 804 #the equation used is as follow:
 805 #S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path))))
 806 def LogOdds(PWM, LogsM, sequence, Frequency,MARKOV_WINDOW):
 807     Score = 0
 808     PWMout = copy.deepcopy(PWM)
 809     LogsMout = copy.deepcopy(LogsM)
 810     MW1=MARKOV_WINDOW-1
 811     #since each column of the PWM must add up to the total umber of sequences
 812     #in that PWM, in addition one must be added for the current sequence
 813     totalSeqs = PWM[0][0]+PWM[0][1]+PWM[0][2]+PWM[0][3] + 1
 814     for j in range(len(PWMout)):
 815         for i in ['A', 'C', 'G', 'T']:
 816             if i == sequence[j+MW1]:
 817                 PWMout[j][NTDIndices[i]] += 1.0
 818                 #prefix to check for prob of specific ntd given prefix
 819                 word = sequence[j:j+MARKOV_WINDOW]
 820                 #determine the frequency of the words individually
 821                 LogsMout[j][NTDIndices[i]] += Frequency[word]
 822             if PWMout[j][NTDIndices[i]]> 0:
 823                 Score +=PWMout[j][NTDIndices[i]]/totalSeqs*log(PWMout[j][NTDIndices[i]]/(totalSeqs*LogsMout[j][NTDIndices[i]]/PWMout[j][NTDIndices[i]]),e)
 824
 825     return Score, PWMout, LogsMout
 826
 827
 828 #convert2motif
 829 #input:     sequences   list containing the sequences in A,C,G,T alphabet
 830 #                       comprising the motif to be converted into symbols
 831 #           size        size of the motif
 832 # maybe add threshold?!?!
 833 #output:    string      motif converted into descriptive symbols
 834 #
 835 #takes in a list of motifs that were found at each point and converts them to
 836 #an actual motif
 837 def convert2motif(sequences, size):
 838     #column composition is replaced by symbols
 839     SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M',
 840                   'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W',
 841                   'CT':'Y','A':'A','C':'C','G':'G','T':'T'}
 842     Motif = ""
 843     #determine the composition of each column
 844     numSeqs = len(sequences)
 845     for i in xrange(size):
 846         A = 0.
 847         C = 0.
 848         G = 0.
 849         T = 0.
 850         for seq in sequences:
 851             if seq[i].upper() == "A":
 852                 A += 1
 853             elif seq[i].upper() == "C":
 854                 C += 1
 855             elif seq[i].upper() == "G":
 856                 G += 1
 857             else:
 858                 T += 1
 859
 860         characterCode = ""
 861         if (A/numSeqs > 0.1):
 862             characterCode += "A"
 863         if (C/numSeqs > 0.1):
 864             characterCode += "C"
 865         if (G/numSeqs > 0.1):
 866             characterCode += "G"
 867         if (T/numSeqs > 0.1):
 868             characterCode += "T"
 869         Motif += SymbolDict[characterCode]
 870
 871     return Motif
 872
 873
 874 #PMW2motif
 875 #input:     array       PWM
 876 #output:    string      motif converted into descriptive symbols
 877 #
 878 #takes in a PWM that was created and converts it to
 879 #an actual motif
 880 def PWM2Motif(PWM):
 881     #column composition is replaced by symbols
 882     SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M',
 883                   'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W',
 884                   'CT':'Y','A':'A','C':'C','G':'G','T':'T'}
 885     Motif = ""
 886     #determine the composition of each column
 887     for i in xrange(len(PWM)):
 888         characterCode = ""
 889         if (PWM[i][NTDA] > 0.1):
 890             characterCode += "A"
 891         if (PWM[i][NTDC] > 0.1):
 892             characterCode += "C"
 893         if (PWM[i][NTDG] > 0.1):
 894             characterCode += "G"
 895         if (PWM[i][NTDT] > 0.1):
 896             characterCode += "T"
 897         Motif += SymbolDict[characterCode]
 898
 899     return Motif
 900
 901
 902
 903 #convert2PSFM
 904 #input:     sequences   list containing the sequences in A,C,G,T alphabet
 905 #                       comprising the motif to be converted into symbols
 906 #           size        size of the motif
 907 #output:    2Darray     will contain the PSFM where indices 0-3 of each list
 908 #                       will be A,C,G,T respectively
 909 #
 910 #takes in a list of motifs that were found at each point and converts them to
 911 #a PSFM
 912 def convert2PSFM (PWM, NumOfSeqs):
 913     #Position specific frequency matrix to be returned
 914     PSFM = []
 915     #determine the composition of each column
 916     for i in xrange(len(PWM)):
 917         index = []
 918         #get frequencies for each NTD
 919         for j in [0,1,2,3]:
 920             index.append(PWM[i][j]/NumOfSeqs)
 921         #add all nucleotide frequencies to the columns
 922         PSFM.append(index)
 923
 924     return PSFM
 925
 926
 927 #convert2PWM
 928 #input:     sequences   list containing the sequences in A,C,G,T alphabet
 929 #                       comprising the motif to be converted into symbols
 930 #           size        size of the motif
 931 #output:    2Darray     will contain the PSFM where indices 0-3 of each list
 932 #                       will be A,C,G,T respectively
 933 #
 934 #takes in a list of motifs that were found at each point and converts them to
 935 #a PWM
 936 def convert2PWM (sequences, size):
 937     #Position specific frequency matrix to be returned
 938     PWM = []
 939     #determine the composition of each column
 940     for i in xrange(size):
 941         indices = [0.0, 0.0, 0.0, 0.0]
 942         for seq in sequences:
 943             indices[NTDIndices[seq[i].upper()]] += 1.0
 944         #add all nucleotide frequencies to the columns
 945         PWM.append(indices)
 946
 947     return PWM
 948
 949
 950 #add2PWM
 951 #input:     sequence    sequence to be added to PWM
 952 #           PWM         PWM being modiifed
 953 #
 954 #mutator method takes in a sequence and adds it to the PWM
 955 def add2PWM(sequence, PWM):
 956     #determine the composition to add
 957     for i in xrange(len(PWM)):
 958         PWM[i][NTDIndices[sequence[i].upper()]] += 1.0
 959
 960
 961 #add2PWMReturn
 962 #input:     sequence    sequence to be added to PWM
 963 #           PWM         PWM being modiifed
 964 #
 965 #mutator method takes in a sequence and adds it to the PWM
 966 def add2PWMReturn(sequence, PWM):
 967     retPWM=copy.deepcopy(PWM)
 968     #determine the composition to add
 969     for i in xrange(len(retPWM)):
 970         retPWM[i][NTDIndices[sequence[i].upper()]] += 1.0
 971
 972     return retPWM
 973
 974
 975 #Align2PWM
 976 #input:     Motifi      Sequence being aligned to PWM
 977 #           PWM         PWM to which the sequence is being aligned
 978 #output:    float       alignment score to the matrix
 979 #
 980 #takes in a PWM and aligns the sequence to this PWM and returns
 981 #the consensus scoreo
 982 def Align2PSFM(Motifi,PWM):
 983     Best = 0.0
 984     for i in xrange(len(PWM)):
 985         CurrentIndex = PWM[i][:]
 986         CurrentIndex[NTDIndices[Motifi[i].upper()]] += 1.0
 987         Top = CurrentIndex[0]
 988         for j in [1,2,3]:
 989             if Top < CurrentIndex[j]:
 990                 Top = CurrentIndex[j]
 991         Best += Top
 992
 993     return Best
 994
 995
 996 #Factorial
 997 #input:     n       Number for which we will calculate the factorial
 998 #output:    float   factorial of input number
 999 #
1000 #calculates the factorial of input number n
1001 def Factorial(n):
1002     #by definition factorial should be 1
1003     Fact = 1
1004     for i in xrange(2,n+1):
1005         Fact *= i
1006
1007     return Fact
1008
1009
1010 #Choose
1011 #input:     n       integer for total number of elements in a set
1012 #           k       integer for total number of elements to be chose from set
1013 #output:    float   the binomial coefficient of the above number, ie nCk
1014 #
1015 #calculates the number of ways to choose k elements from a set of n
1016 def Choose(n, k):
1017     return Factorial(n)/Factorial(n-k)/Factorial(k)
1018
1019
1020 #GetMotif
1021 #input: sequencesToCompare  sequences where the motif is being found
1022 #       sequencesLast       original version of the sequences
1023 #       CompareList         Order in which sequences are examined
1024 #       Best Motif          The location where the motifs score highest
1025 #       startLocs           The current locations for the motifs being examined
1026 #       sizeOfMotif         Size of the motif to be found in the sequences
1027 #       Index               Index up to which the motif has been optimized so
1028 #                           far
1029 #       BestScore           The best score of aligned motifs
1030 #       Frequency           Markov3 Model
1031 #       PWM                 PWM which includes alll the sequences which have
1032 #                           been included so far
1033 #       PWMFounder          This is the "founder" PWM, includes only the
1034 #                           founder sequences' information
1035 #       ConScoreTop         This value stores the Consensus score of the
1036 #                           founder sequences only
1037 #       normalization       normalization value for the current words
1038 #       LogsM               frequency of markov words
1039 #       percID              percent of the aligned sequences that must be
1040 #                           identical in order to be considered for a motif to
1041 #                           be considered (between the founder sequences then
1042 #                           between founder sequence consensus and each other
1043 #                           subsequence being considered)
1044 #       originalSeqs        contain unmasked sequence for markov scores
1045 #output:integer             Score of the top scoring motif
1046 #
1047 #this function will align the Indexth item in the sequencesToCompare to all the
1048 #sequences previously aligned from their best indices in a way to optimize the
1049 #alignment score (consensus score)
1050
1051 def GetMotif (sequencesToCompare, sequencesLast, CompareList, BestMotif,
1052               startLocs, sizeOfMotif, Index, BestScore, Frequency,
1053               PWM, PWMFounder, ConScoreTop, normalization, LogsM,percID,
1054               originalSeqs,origLast,MARKOV_WINDOW):
1055
1056     MW1=MARKOV_WINDOW-1
1057     #variable to indicate whether sequencesLast needs to be updated
1058     NewBestDetermined = False
1059     SeqIndexi = CompareList[Index]
1060
1061     #search through all positions for this index
1062     starti=MW1
1063     endi=len(sequencesToCompare[SeqIndexi])-sizeOfMotif
1064     while (starti<=endi):
1065         Motifi = sequencesToCompare[SeqIndexi][starti:starti + sizeOfMotif]
1066         MMotifi = originalSeqs[SeqIndexi][starti-MW1:starti + sizeOfMotif]
1067
1068         #if this area has already been earmarked as a motif it cannot be
1069         #used again
1070         Nlocation=Motifi.rfind('N')
1071         if Nlocation >= 0:
1072             starti+=1+Nlocation+MARKOV_WINDOW
1073
1074         #check to see if the current sequence has a high enough consensus
1075         #with the founder sequences in order to be considered further
1076         ScoreTop = Align2PSFM(Motifi, PWMFounder)
1077         Poss = float(ScoreTop - ConScoreTop)
1078         GOF = Poss/sizeOfMotif
1079         if GOF < percID/100.0:
1080             starti+=1
1081             continue
1082
1083         #if the word is not above noise
1084         if M_Score(MMotifi,Frequency, False,MARKOV_WINDOW) - normalization < 0:
1085             starti+=1
1086             continue
1087
1088         startLocs[SeqIndexi] = starti
1089         #new best scores are assigned by aligning the current sequence to
1090         #the PWM
1091         (ScoreAll, PWM2, LogsM2) = LogOdds(PWM, LogsM, originalSeqs[SeqIndexi] [starti-MW1:starti+sizeOfMotif],Frequency,MARKOV_WINDOW)
1092         Score = ScoreAll
1093
1094         #new best scores are evaluted
1095         if (BestScore == "Infinity" and Score != "Infinity") or Score > BestScore :
1096             #record best position
1097             BestMotif[SeqIndexi] = startLocs[SeqIndexi]
1098             BestScore = Score
1099             LogsMhold = LogsM2[:]
1100             PWMhold = PWM2[:]
1101             NewBestDetermined = True
1102
1103         starti+=1
1104
1105     startLocs[SeqIndexi] = BestMotif[SeqIndexi]
1106     if NewBestDetermined:
1107         PWM = PWMhold[:]
1108         LogsM = LogsMhold[:]
1109
1110     return (BestScore, NewBestDetermined, PWM, LogsM)