2 ##Created: July 12, 2005
3 ##Modified: June 22, 2006
4 ##Modified: Oct 22, 2008 by Ali (Minor modifications to outputs)
5 ##This is a greedy motif finding program.
6 ##Using background models it determines which motifs are overrepresented
7 ###############################################################################
8 ##include a way to go back and
9 ##uses the new way of doing background and the new scoring rather than simple
10 ##consensus it uses the log likelihood
11 ###############################################################################
23 from random import shuffle
24 from random import randint
26 ConsensusScore= {True:2, False:1}
28 NTDIndices = {'A':0, 'C':1, 'G':2, 'T':3}
29 IndexNTDs = {0:'A', 1:'C', 2:'G', 3:'T'}
40 #markov size - 1 = markov model
42 #if you want a markov 1 you want to check 2 ntds total
44 #declare global variables
56 #MotifFinder with 3 arguments
57 #input: sequences sequences in which we are to find a motif
58 # sizeOfMotif size of the motif to be identified in the sequences
59 # numberOfMotifs the number of motifs of the indicated size to be
60 # identified in the sequences
61 # Iterations The number of iterations that will be perfdormed in
62 # order to determine an optimal motif
63 # Frequency Markov Model for background
64 # UseRC a boolean indicating whether the reverse complement
65 # should be used i the finding of a motif
67 # Founders a boolean indicating whether the first 2 sequences
68 # should always be the founder sequences
69 # percID percent of the aligned sequences that must be
70 # identical in order to be considered for a motif to
71 # be considered (between the founder sequences then
72 # between founder sequence consensus and each other
73 # subsequence being considered)
74 # founderPercID The minimum percent identity two aligned sequences
75 # must be to qualify as founder sequences
76 # MARKOV_SIZE Size of the markov model (corresponds to length of
77 # the words in the dictionary Frequency)
78 #output: (List1, List2) List1 contains PSFMs of the indicated length and up
79 # to the indicated number (there may be fewer) in the
80 # standard cistematic format, while List2 holds the
81 # corresponding sequences.
83 #This function will call the overloaded MotifFinder function, passing in a
84 #default amount for the number of iterations (currently set at 100)
85 def MotifFinder(sequences, minSize, maxSize, numberOfMotifs, Iterations,
86 Frequency, UseRC, Founders, percID, founderPercID,MARKOV_SIZE,
89 #declare global variables
94 numOfSequences=len(sequences)
96 #the minimum window size permitted by this program is a motif of length 6
100 #sanity check for reasonable motif sizes
101 if minSize > maxSize:
102 print "Please input a minimum size that is smaller than a maximum size"
103 print "User input:\nMinimum Size:\t%i\nMaximumSize:\t%i" % (minSize, maxSize)
106 #sanity check for reasonable percentID
107 if percID> founderPercID:
108 print "Your input %i for percent identity for founder sequences" % founderPercID,
109 print "is less than your input for general percent identity at",percID
110 print "Please adjust your input"
113 #a user may input a number of iterations which exceeds the possible
114 #for a given set of input values, in which case the number of iterations
115 #should be decreased to another size
116 maxIterations = (maxSize - minSize+1)*Choose(len(sequences),2)*Factorial(len(sequences)-2)
117 if maxIterations < Iterations:
118 print "The number of Iterations you entered", Iterations,
119 print "is being reduced to",maxIterations,
120 Iterations = maxIterations
122 #adjusting the window as remarked above
123 MARKOV_WINDOW = MARKOV_SIZE + 1
125 print len(sequences), "sequences search for",numberOfMotifs,
126 print " motifs of size",minSize,"to",maxSize
127 print "The percent Identity input is",percID,"%"
129 #this list will contain motifs found (up to the NumberOfMotifs passed in
130 sequencesToCompare = [i.upper() for i in sequences]
131 #as a preprocessing steps, a string of Ns will be replaced by one N
132 for i in xrange(len(sequences)):
133 splitByN=sequencesToCompare[i].split('N')
138 if len(splitByN[j])==0:
139 finalSequence="".join([finalSequence,'N'])
140 while len(splitByN[j])==0:
145 finalSequence="".join([finalSequence,splitByN[j]])
147 sequencesToCompare[i]=finalSequence
152 for seqIndex in xrange(len(sequencesToCompare)):
153 sequence = revComp(sequencesToCompare[seqIndex])
154 aveCalcSeqs.append(''.join([sequencesToCompare[seqIndex], sequence]))
155 originalSeqs.append(''.join([sequencesToCompare[seqIndex],INSERTION_N,sequence]))
156 sequencesToCompare[seqIndex] = ''.join([sequencesToCompare[seqIndex],INSERTION_N*(MARKOV_WINDOW),sequence[MW1:]])
157 sequence = ''.join([INSERTION_N*(MW1),sequencesToCompare[seqIndex][MW1:]])
158 sequencesToCompare[seqIndex] = sequence
162 for motif_i in xrange(numberOfMotifs):
163 empty=min([len(sequencesToCompare[i]) for i in xrange(len(sequencesToCompare))])
165 print "A sequence has been depleted of all options"
166 return (AllPSFMS, AllPSFMseqs)
168 OverallBest = "Infinity"
169 print "MOTIF NUMBER %i\n" % motif_i
170 #multiple iterations are attempted to avoid local maxima
171 for step in xrange(Iterations):
173 numIncluded = 2 #used in order to create PSFMs from PWMs
174 #CompareList will be used to randomly generate the sequences to be
176 CompareList = range(len(sequencesToCompare))
177 BestScore = "Infinity"
178 #find the best shared motif between the first two sequences
179 #the motif must be completely contained within the sequence
180 #or its reverse complement
182 while (BestScore == "Infinity"):
183 #PWM created from all sequences included in the motif
185 #pick motif size randomly within the range provided by the user
186 sizeOfMotif = random.randint(minSize,maxSize)
187 NumOfMismatch = sizeOfMotif-ceil((percID)/100.0*sizeOfMotif)
188 #if the motif average is not yet established create the average
189 #and store tis value in the dictionary
191 #best motif startpoints will be stored in this list of start
193 BestMotif = [-1] * len(sequencesToCompare)
194 #these are start locations are used in order to test new best
196 startLocs = BestMotif[:]
197 #if it appears there are no more motifs possible quit
198 if (NumberOfTries > Choose(len(sequencesToCompare), 2)):
199 print "No more motifs to be found after %i tries" % NumberOfTries
200 return (AllPSFMS, AllPSFMseqs)
202 #randomize sequence order
203 #if the sequence founders are input then you only want to
204 #randomize the remaining sequences
207 remaining = range(2,len(sequencesToCompare))
209 CompareList.extend(remaining)
210 #if no founders are specified, randomize all
214 SeqIndex0 = CompareList[0]
215 SeqIndex1 = CompareList[1]
216 #founder sequences - first 2 sequences - are selected
217 #create random start sites to avoid favoring the 5' region
218 startLocs[SeqIndex0] = -1
219 startLocs[SeqIndex1] = -1
220 BestMotif[SeqIndex0] = -1
221 BestMotif[SeqIndex1] = -1
222 BestScore = "Infinity"
223 #define the best motif between the first two sequences the
224 #inital set of comparisons are performed at the same indices in
225 #both "founder" sequences
227 #to ensure that the startpoint is random we will rearrange the
229 start0 = randint(0,len(sequencesToCompare[SeqIndex0])-sizeOfMotif-1)
230 accessSeq=sequencesToCompare[SeqIndex0]
231 seq0 = ''.join([accessSeq[start0+1:],'N',accessSeq[:start0+sizeOfMotif]])
232 start1 = randint(0,len(sequencesToCompare[SeqIndex1])-sizeOfMotif-1)
233 accessSeq=sequencesToCompare[SeqIndex1]
234 seq1 = ''.join([accessSeq[start1+1:],'N',accessSeq[:start1+sizeOfMotif]])
236 #create shifted score, then check to see if it meets all
237 #criteria to become the best.
239 #Perform a time efficient alignment of the founder sequences
240 (BestScore) = TimeEfficientAlignment(seq0,seq1,SeqIndex0,SeqIndex1,startLocs, founderPercID)
242 if BestScore=="Infinity":
244 NLoc0=len(sequencesToCompare[SeqIndex0])-start0-1
245 NLoc1=len(sequencesToCompare[SeqIndex1])-start1-1
247 #remap to the correct location
248 if startLocs[SeqIndex0] > NLoc0:
249 startLocs[SeqIndex0]=(startLocs[SeqIndex0]+start0)%len(sequencesToCompare[SeqIndex0])
251 startLocs[SeqIndex0]=(startLocs[SeqIndex0]+start0+1)
253 if startLocs[SeqIndex1] > NLoc1:
254 startLocs[SeqIndex1]=(startLocs[SeqIndex1]+start1)%len(sequencesToCompare[SeqIndex1])
256 startLocs[SeqIndex1]=(startLocs[SeqIndex1]+start1+1)
258 #if we found a good score get all the info necessary to
259 #continue, including a PWM and updating the startLocs
260 if BestScore != "Infinity":
261 index0 = startLocs[SeqIndex0]
262 index1 = startLocs[SeqIndex1]
263 seq0 = sequencesToCompare[SeqIndex0]
264 seq1 = sequencesToCompare[SeqIndex1]
265 motif0 = seq0[index0:index0+sizeOfMotif]
266 motif1 = seq1[index1:index1+sizeOfMotif]
267 BestMotifSeqs=[motif0,motif1]
268 PWM = convert2PWM([motif0, motif1],sizeOfMotif)
270 #this PWM will be used in order to establish a criterion
271 #for future sequences, whether they meet a threshold
272 #similiarty to the founder sequences
274 #if it exists, find the best scoring motifs in the remaining
277 for seqCompIndex in xrange(2, len(sequencesToCompare)):
278 maxScores=[max(PWM[P_I]) for P_I in range(len(PWM))]
279 BestScore = "Infinity"
280 seq=CompareList[seqCompIndex]
281 sequenceToAdd = sequencesToCompare[seq]
282 max_i=len(sequenceToAdd)-sizeOfMotif
283 bestMismatch = NumOfMismatch
284 #to ensure that the startpoint is random we will rearrange the
286 start_i = randint(0,max_i-1)
287 seq_i = ''.join([sequenceToAdd[start_i+1:],'N',sequenceToAdd[:start_i+sizeOfMotif]])
290 motif=seq_i[i:i+sizeOfMotif]
291 #skip this area if there are any masked regions found
292 nLoc=motif.rfind('N')
299 while (j < sizeOfMotif):
300 mmNum+=int(PWM[j][NTDIndices[motif[j]]]!=maxScores[j])
301 if mmNum > bestMismatch:
305 #set a new best if we found it (always keep originals)
306 if mmNum < bestMismatch:
310 #we cannot improve upon a perfect match so we break end our
311 #search if this occurs
317 #if somethigng was found we add it to our PWM
318 if startLocs[seq]!= -1:
319 NLoc_i=len(sequenceToAdd)-start_i-1
320 if startLocs[seq] > NLoc_i:
321 startLocs[seq]=(startLocs[seq]+start_i)%len(sequenceToAdd)
323 startLocs[seq]=(startLocs[seq]+start_i+1)
325 starti=startLocs[seq]
326 sAdd=sequencesToCompare[seq][starti:starti+sizeOfMotif]
327 checkPWM=add2PWMReturn(sAdd,PWM)
329 checkMaxScores=[max(checkPWM[P_I]) for P_I in range(len(checkPWM))]
330 for seq in BestMotifSeqs:
332 for i in range(len(seq)):
333 mismatches+=int(checkPWM[i][NTDIndices[seq[i]]]!=checkMaxScores[i])
334 if mismatches > NumOfMismatch:
338 BestMotifSeqs.append(sAdd)
339 PWM=copy.deepcopy(checkPWM)
343 #if we require one occurence per sequence then if we do not
344 #have a motif in this sequence we stop our search
349 #if we have an oops model and not all sequences are being included,
350 #we have to continue without recording what we have so far
351 if model=="oops" and numIncluded<numOfSequences:
356 for i in range(len(PWM)):
357 GroupScore += max(PWM[i])
359 #store this as the best location if it is the best of this set of
360 #motifs part of that if statement below
361 if ((OverallBest == "Infinity" and GroupScore != "Infinity" )or \
362 GroupScore > OverallBest):
363 nextToCompare = sequencesToCompare[:]
365 OrderOfBest = CompareList[:]
366 BestIndices = startLocs[:]
367 bestMotifSeqYet = BestMotifSeqs
368 OverallBest = GroupScore
369 PSFM = convert2PSFM(PWM, numIncluded)
371 for i in xrange(len(sequencesToCompare)):
372 SeqIndexi = CompareList[i]
373 #some sequences may not contain the motifs, if so you do no
374 #want to include them
375 if BestIndices[SeqIndexi] != -1:
376 sAdd=sequencesToCompare[SeqIndexi]
377 sAddI=BestIndices[SeqIndexi]
378 Motifi = sAdd[sAddI:sAddI+sizeOfMotif]
379 MotifSeqs.append(Motifi)
381 #mask out original location
382 nextToCompare[SeqIndexi]=''.join([sAdd[:sAddI],INSERTION_N*(sizeOfMotif),sAdd[sAddI+sizeOfMotif:]])
383 sAdd=nextToCompare[SeqIndexi]
385 nextToCompare[SeqIndexi]=''.join([sAdd[:sAddLen-sAddI-sizeOfMotif],INSERTION_N*(sizeOfMotif),sAdd[sAddLen-sAddI:]])
386 TopMotif = convert2motif(MotifSeqs, sizeOfMotif)
388 #if not more motifs are found (best score is -1) then it is likely that
389 #we have found as many as we can
390 if OverallBest == "Infinity":
391 print "No more motifs were found!"
392 return (AllPSFMS, AllPSFMseqs)
394 #if a motif was found report it here and add it to the total motifs
395 #Print out your results if any were reached!
396 for i in xrange(len(sequencesToCompare)):
397 SeqIndexi = OrderOfBest[i]
398 if BestIndices[SeqIndexi] != -1:
399 Motifi = sequencesToCompare[SeqIndexi][BestIndices[SeqIndexi]:BestIndices[SeqIndexi]+sizeOfMotif]
400 MotifSeqs.append(Motifi)
402 #add the current best motif to the total of all motifs for returning
404 AllPSFMS.append(PSFM)
405 AllPSFMseqs.append(bestMotifSeqYet)
408 NTDSTUFF = {0:'A',1:'C',2:'G',3:'T'}
409 for Pntd in [0,1,2,3]:
411 print NTDSTUFF[Pntd],
412 for Pj in xrange(len(PSFM)):
413 print "\t %.003f"%(PSFM[Pj][Pntd]),
415 print "\n**********\n"
417 print "\n**********\n"
418 print "Score:",OverallBest
419 maxScores=[max(OverallPWM[P_I]) for P_I in range(len(OverallPWM))]
420 for maski in xrange(numOfSequences):
421 maskThis=nextToCompare[maski]
423 max_i = len(maskThis)-sizeOfMotif
425 motif=maskThis[i:i+sizeOfMotif]
426 #skip this area if there are any masked regions found
427 nLoc=motif.rfind('N')
433 while (j < sizeOfMotif):
434 if PWM[j][NTDIndices[motif[j]]]!=maxScores[j]:
436 if mmNum > NumOfMismatch:
439 #if the location is below threshold we mask it out also
440 if mmNum>NumOfMismatch:
446 #replace old sequences with the new one
447 sequencesToCompare = nextToCompare[:]
448 #reduce masked regions to single N
449 for i in xrange(len(sequences)):
450 splitByN=sequencesToCompare[i].split('N')
455 if len(splitByN[j])==0:
456 finalSequence="".join([finalSequence,'N'])
457 while len(splitByN[j])==0:
462 finalSequence="".join([finalSequence,splitByN[j]])
464 sequencesToCompare[i]=finalSequence
466 return (AllPSFMS, AllPSFMseqs)
470 #input: sequence sequence to be masked
471 # location location at which to mask the sequence
472 # length length of area to mask
473 #output string masked sequence
474 def MaskLocation(sequence,start,length):
475 finalsequence=''.join([sequence[:start],INSERTION_FILLER*(length),sequence[start+length:]])
476 newLen=len(finalsequence)
477 finalsequence=''.join([finalsequence[:newLen-start-length],INSERTION_FILLER*(length),finalsequence[newLen-start:]])
482 #TimeEfficientAlignment
483 #input: seq0 string sequence in which to calculate a best location
484 # seq1 string second sequence
485 # start0 integer start location for alignment (gives a skew)
486 # start1 integer start for second sequence
487 # BestScore float which is the current bestscore
488 # percID percent of the aligned sequences that must be identical
489 # in order to be considered for a motif
490 #output float Best scoring alignment's score
492 #will be calculated as an ungapped consensus between two sequences as the
493 #algorithm steps through each window
494 def TimeEfficientAlignment(seq0, seq1, SeqIndex0, SeqIndex1, startLocs, percID):
496 NumOfMismatch = sizeOfMotif-ceil((percID)/100.0*sizeOfMotif)
497 bestMismatch = NumOfMismatch
500 #create shifted score, then check to see if it meets all
501 #criteria to become the best.
504 maxi=len(seq0)-sizeOfMotif
505 maxj=len(seq1)-sizeOfMotif
510 motif=seq0[i:sizeOfMotif+i]
511 Nloc=motif.rfind('N')
517 Nloc=seq1[j:j+sizeOfMotif].rfind('N')
522 for k in xrange(sizeOfMotif):
523 if motif[k]!=seq1[k+j]:
525 if mismatchedNTDs > bestMismatch:
527 if mismatchedNTDs == 0:
528 Score = 2*sizeOfMotif-bestMismatch
529 startLocs[SeqIndex0]=i
530 startLocs[SeqIndex1]=j
532 if mismatchedNTDs < bestMismatch:
533 bestMismatch = mismatchedNTDs
534 Score = 2*sizeOfMotif-bestMismatch
535 startLocs[SeqIndex0]=i
536 startLocs[SeqIndex1]=j
543 #input: sequenceToCompare List of sequences whose substrings will be
544 # sizeOfMotif aligned integer size of the motif being found
545 # (length of the subseqeunce that will be aligned
546 # from each sequence in above list (window size)
547 # startLocs start locations of the motifs to be aligned to
549 # CompareList the indices of sequencesToCompare to be aligned
550 # numSeqs the number of sequences -1 from
551 # sequencesToCompare to be aligned. ie, the indices
552 # of sequenceToCompare stored in the first numSeqs
553 # indices of in CompareList.
554 # Frequency markov model being used to calculate background
555 # originalSeqs contain the unmasked sequences used for checking
557 # NumOfMismatch number of mismatches allowable
558 #output: integer Score indicating the consensus score of these
560 # 2D-List contains the PSFM
561 # 2D-List contains the log markov scores
563 #will be calculated as an ungapped consensus between those elements in the
564 #CompareList Consensus score is calculated by choosing the largest number of the
565 #same elements in each column, and adding all these numbers up across all
567 def AlignmentScore(sequencesToCompare, sizeOfMotif, startLocs, CompareList, numSeqs, Frequency, originalSeqs, MARKOV_WINDOW,NumOfMismatch):
574 len(sequencesToCompare)
575 #traverse each column individually
576 for i in xrange (sizeOfMotif):
578 PWMi = [0.0, 0.0, 0.0, 0.0]
579 Logi = [0.0, 0.0, 0.0, 0.0]
580 #traverse this index in each sequence
581 for j in xrange(numSeqs+1):
582 SequenceIndex = CompareList[j]
583 CurrSeq = sequencesToCompare[SequenceIndex]
584 CurrOr = originalSeqs[SequenceIndex]
585 #some sequences may not contain the motifs, if so you do not want
586 #to include them in the consensus. These have uninitialized start
587 #locations (ie startLocs would be -1
588 if startLocs[SequenceIndex] != -1:
590 if sequencesToCompare[SequenceIndex][startLocs[SequenceIndex]+i] == 'N':
591 print sequencesToCompare
595 print startLocs[SequenceIndex]
601 print CurrSeq[startLocs[SequenceIndex]:startLocs[SequenceIndex]+sizeOfMotif]
603 PWMi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += 1.0
604 Logi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += Frequency[CurrOr[startLocs[SequenceIndex]-MARKOV_WINDOW+i+1:startLocs[SequenceIndex]+i+1]]
608 for NTD in ['A', 'C', 'G', 'T']:
610 if PWMi[NTDIndices[NTD]] > 0:
611 Scores[i] += PWMi[NTDIndices[NTD]]/divisor *log(PWMi[NTDIndices[NTD]]/(divisor*Logi[NTDIndices[NTD]]/PWMi[NTDIndices[NTD]]), e)
612 if PWMi[NTDIndices[NTD]] > Top:
613 Top = PWMi[NTDIndices[NTD]]
614 TotalScore += Scores[i]
615 ConsensusScore += Top
619 return (TotalScore, Scores, ConsensusScore,PWM, Log)
623 #input: prefix string of length MARKV_SIZE - 1 prefix used for model
624 # actualNTD character NTD at the en dof the prefix being calculated
625 # Frequency Markov model for calculations
626 #output: float that gives the markov score for this specific sequence
628 #The helper function will run through and find all possible words with the
629 #prefix and determine the markov score based on this
630 def MarkovFreq (prefix, actualNTD, Frequency):
634 for NTD in ['A', 'C', 'G', 'T']:
635 value = M_Score(prefix+NTD, Frequency, False,MARKOV_WINDOW)
636 if NTD == actualNTD :
639 retVal = numerator/denominator
644 #input: sequence DNA sequence to be converted to reverse complement
645 #output: string reverse complement of input sequence
647 #obtains the reverse complement of an input sequence
648 def revComp (sequence):
650 RevDict={'A':'T','T':'A', 'C':'G', 'G':'C', 'N':'N'}
653 for i in xrange(len(sequence)):
654 reverse = RevDict[sequence[i].upper()]+reverse
659 #input: sequences list that are being used to create the background
660 #output: dictionary of all 6mers (reverse complement also) and their -log2
663 #background will build a markov model of the background in order to be able
664 #to differentiate the motifs from the pure background of a certain size
665 #they will be stored as -log(fraction)
666 def Markov(sequences, IncludeRC,MARKOV):
667 MARKOV_WINDOW = MARKOV + 1
671 #take each sequence and use it in order to separately determine background
672 for seq in sequences:
673 #all sequences for the background must be full-length
674 for index in xrange(len(seq)-MARKOV):
675 subseq = seq[index:index+MARKOV_WINDOW].upper()
680 if subseq not in WordDict:
681 WordDict[subseq] = 0.0
682 WordDict[subseq] += 1.0
686 if RC not in WordDict:
692 WordDict[key] = 1.0*WordDict[key]/totalWindows
697 #input: sequences List of sequences on which to find the average
699 # Model Dictionary containing pvalues for seeing 3mers
700 # l integer designating the word sizes from which to
701 # determine average pvalue
702 #output: average probability of all input lmers in the sequences in the Model
704 #finds the probability of seeing all subsequence in the total strings
705 #using the markov model created using the background. Markov3 is used
706 #(window size of 3) and from this determine the average. This functio will
707 #also screen the background model
708 def Average_M (sequence, Model, l,MARKOV_WINDOW):
713 for i in xrange(MW1,len(seq)-l+1):
714 sequenceCheck=seq[i-MW1:i+1]
715 Nindex=sequenceCheck.rfind('N')
718 totalWords += 1.0 #increase number of words
719 PVal = M_Score(sequenceCheck, Model, True,MARKOV_WINDOW)
720 #add current word to the running total of scores
722 retVal = totalSum/totalWords
729 #input: sequence string for which the Pvalue is to be determined
730 # Model Dictionary containing log2 pvalues for seeing 6mers
731 # check Boolean which determines whether to also check for
732 # completeness of markov model
733 #output: log2 probability of seeing the input seqeunce in the Model
735 #gives the probability of seeing the given subsequence in the total strings
736 #using the markov model created using the background. Markov6 is used
738 def M_Score (sequence, Model, check,MARKOV_WINDOW):
741 for j in xrange(len(sequence)-MARKOV_WINDOW+1):
742 #if the subsequences is not in the background value it
743 #the program returns an exit code and asks the user to
744 #revise background model input
745 if sequence[j:j+MARKOV_WINDOW] not in Model:
747 print "The Markov Model is inadequate for your input",
748 print "sequences\n %s is"%sequence[j:j+MARKOV_WINDOW],
749 print "not contained in model provided\n",
750 print "Please revise your provided model or consider",
751 print "using Background Modelling provided"
755 PVal += -log(Model[sequence[j:j+MARKOV_WINDOW]],e)
760 #input: sequences sequences for which to find the log odds score
761 # startLocs start locations at which to start computing score
762 # sizeOfMotif size of the motif being created
763 # Freqeuncy Markov3 Model
764 # Index Current Indices
765 # CompareList Order of Comparison
766 #output returns the log odds score for the consensus
767 #the equation used is as follow:
768 #S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path))))
769 def LogOddsoPT(sequences, startLocs, sizeOfMotif, Frequency, Index,CompareList,MARKOV_WINDOW):
771 for i in range(sizeOfMotif):
772 for j in range(Index+1):
773 seqIndex = CompareList[j]
774 lnValue = [0.0, 0.0, 0.0, 0.0]
775 totalNum = [0.0, 0.0, 0.0, 0.0]
776 #add the frequency of seeing the given nucleotides in each position
777 #a runnning total of each one. Then add them to the equation
778 if startLocs[seqIndex] != -1:
779 index = startLocs[seqIndex]+i
780 previous = sequences[seqIndex][index-(MW1):index+1]
783 #determine probabilities for each path
784 for NTD in ['A', 'C', 'G', 'T']:
786 if full in Frequency:
787 value = Frequency[full]
788 if NTD == sequences[seqIndex][index+1]:
791 #increase the summation of each location
792 lnValue[NTDIndices[sequences[seqIndex][index+1]]] +=numerator/denominator
793 #increase number of given nucleotides at the index
794 totalNum[NTDIndices[sequences[seqIndex][index+1]]] += 1.0
798 #input: sequence relevant part of the sequence being added to the PWM
799 # PWM information on sequences already in the motif
800 # LogsM frequency information on sequences already in motif
801 # Frequency markov model for background
802 # sizeOfMotif size f the motif being found
803 #output returns the log odds score for the consensus
804 #the equation used is as follow:
805 #S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path))))
806 def LogOdds(PWM, LogsM, sequence, Frequency,MARKOV_WINDOW):
808 PWMout = copy.deepcopy(PWM)
809 LogsMout = copy.deepcopy(LogsM)
811 #since each column of the PWM must add up to the total umber of sequences
812 #in that PWM, in addition one must be added for the current sequence
813 totalSeqs = PWM[0][0]+PWM[0][1]+PWM[0][2]+PWM[0][3] + 1
814 for j in range(len(PWMout)):
815 for i in ['A', 'C', 'G', 'T']:
816 if i == sequence[j+MW1]:
817 PWMout[j][NTDIndices[i]] += 1.0
818 #prefix to check for prob of specific ntd given prefix
819 word = sequence[j:j+MARKOV_WINDOW]
820 #determine the frequency of the words individually
821 LogsMout[j][NTDIndices[i]] += Frequency[word]
822 if PWMout[j][NTDIndices[i]]> 0:
823 Score +=PWMout[j][NTDIndices[i]]/totalSeqs*log(PWMout[j][NTDIndices[i]]/(totalSeqs*LogsMout[j][NTDIndices[i]]/PWMout[j][NTDIndices[i]]),e)
825 return Score, PWMout, LogsMout
829 #input: sequences list containing the sequences in A,C,G,T alphabet
830 # comprising the motif to be converted into symbols
831 # size size of the motif
832 # maybe add threshold?!?!
833 #output: string motif converted into descriptive symbols
835 #takes in a list of motifs that were found at each point and converts them to
837 def convert2motif(sequences, size):
838 #column composition is replaced by symbols
839 SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M',
840 'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W',
841 'CT':'Y','A':'A','C':'C','G':'G','T':'T'}
843 #determine the composition of each column
844 numSeqs = len(sequences)
845 for i in xrange(size):
850 for seq in sequences:
851 if seq[i].upper() == "A":
853 elif seq[i].upper() == "C":
855 elif seq[i].upper() == "G":
861 if (A/numSeqs > 0.1):
863 if (C/numSeqs > 0.1):
865 if (G/numSeqs > 0.1):
867 if (T/numSeqs > 0.1):
869 Motif += SymbolDict[characterCode]
876 #output: string motif converted into descriptive symbols
878 #takes in a PWM that was created and converts it to
881 #column composition is replaced by symbols
882 SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M',
883 'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W',
884 'CT':'Y','A':'A','C':'C','G':'G','T':'T'}
886 #determine the composition of each column
887 for i in xrange(len(PWM)):
889 if (PWM[i][NTDA] > 0.1):
891 if (PWM[i][NTDC] > 0.1):
893 if (PWM[i][NTDG] > 0.1):
895 if (PWM[i][NTDT] > 0.1):
897 Motif += SymbolDict[characterCode]
904 #input: sequences list containing the sequences in A,C,G,T alphabet
905 # comprising the motif to be converted into symbols
906 # size size of the motif
907 #output: 2Darray will contain the PSFM where indices 0-3 of each list
908 # will be A,C,G,T respectively
910 #takes in a list of motifs that were found at each point and converts them to
912 def convert2PSFM (PWM, NumOfSeqs):
913 #Position specific frequency matrix to be returned
915 #determine the composition of each column
916 for i in xrange(len(PWM)):
918 #get frequencies for each NTD
920 index.append(PWM[i][j]/NumOfSeqs)
921 #add all nucleotide frequencies to the columns
928 #input: sequences list containing the sequences in A,C,G,T alphabet
929 # comprising the motif to be converted into symbols
930 # size size of the motif
931 #output: 2Darray will contain the PSFM where indices 0-3 of each list
932 # will be A,C,G,T respectively
934 #takes in a list of motifs that were found at each point and converts them to
936 def convert2PWM (sequences, size):
937 #Position specific frequency matrix to be returned
939 #determine the composition of each column
940 for i in xrange(size):
941 indices = [0.0, 0.0, 0.0, 0.0]
942 for seq in sequences:
943 indices[NTDIndices[seq[i].upper()]] += 1.0
944 #add all nucleotide frequencies to the columns
951 #input: sequence sequence to be added to PWM
952 # PWM PWM being modiifed
954 #mutator method takes in a sequence and adds it to the PWM
955 def add2PWM(sequence, PWM):
956 #determine the composition to add
957 for i in xrange(len(PWM)):
958 PWM[i][NTDIndices[sequence[i].upper()]] += 1.0
962 #input: sequence sequence to be added to PWM
963 # PWM PWM being modiifed
965 #mutator method takes in a sequence and adds it to the PWM
966 def add2PWMReturn(sequence, PWM):
967 retPWM=copy.deepcopy(PWM)
968 #determine the composition to add
969 for i in xrange(len(retPWM)):
970 retPWM[i][NTDIndices[sequence[i].upper()]] += 1.0
976 #input: Motifi Sequence being aligned to PWM
977 # PWM PWM to which the sequence is being aligned
978 #output: float alignment score to the matrix
980 #takes in a PWM and aligns the sequence to this PWM and returns
981 #the consensus scoreo
982 def Align2PSFM(Motifi,PWM):
984 for i in xrange(len(PWM)):
985 CurrentIndex = PWM[i][:]
986 CurrentIndex[NTDIndices[Motifi[i].upper()]] += 1.0
987 Top = CurrentIndex[0]
989 if Top < CurrentIndex[j]:
990 Top = CurrentIndex[j]
997 #input: n Number for which we will calculate the factorial
998 #output: float factorial of input number
1000 #calculates the factorial of input number n
1002 #by definition factorial should be 1
1004 for i in xrange(2,n+1):
1011 #input: n integer for total number of elements in a set
1012 # k integer for total number of elements to be chose from set
1013 #output: float the binomial coefficient of the above number, ie nCk
1015 #calculates the number of ways to choose k elements from a set of n
1017 return Factorial(n)/Factorial(n-k)/Factorial(k)
1021 #input: sequencesToCompare sequences where the motif is being found
1022 # sequencesLast original version of the sequences
1023 # CompareList Order in which sequences are examined
1024 # Best Motif The location where the motifs score highest
1025 # startLocs The current locations for the motifs being examined
1026 # sizeOfMotif Size of the motif to be found in the sequences
1027 # Index Index up to which the motif has been optimized so
1029 # BestScore The best score of aligned motifs
1030 # Frequency Markov3 Model
1031 # PWM PWM which includes alll the sequences which have
1032 # been included so far
1033 # PWMFounder This is the "founder" PWM, includes only the
1034 # founder sequences' information
1035 # ConScoreTop This value stores the Consensus score of the
1036 # founder sequences only
1037 # normalization normalization value for the current words
1038 # LogsM frequency of markov words
1039 # percID percent of the aligned sequences that must be
1040 # identical in order to be considered for a motif to
1041 # be considered (between the founder sequences then
1042 # between founder sequence consensus and each other
1043 # subsequence being considered)
1044 # originalSeqs contain unmasked sequence for markov scores
1045 #output:integer Score of the top scoring motif
1047 #this function will align the Indexth item in the sequencesToCompare to all the
1048 #sequences previously aligned from their best indices in a way to optimize the
1049 #alignment score (consensus score)
1051 def GetMotif (sequencesToCompare, sequencesLast, CompareList, BestMotif,
1052 startLocs, sizeOfMotif, Index, BestScore, Frequency,
1053 PWM, PWMFounder, ConScoreTop, normalization, LogsM,percID,
1054 originalSeqs,origLast,MARKOV_WINDOW):
1057 #variable to indicate whether sequencesLast needs to be updated
1058 NewBestDetermined = False
1059 SeqIndexi = CompareList[Index]
1061 #search through all positions for this index
1063 endi=len(sequencesToCompare[SeqIndexi])-sizeOfMotif
1064 while (starti<=endi):
1065 Motifi = sequencesToCompare[SeqIndexi][starti:starti + sizeOfMotif]
1066 MMotifi = originalSeqs[SeqIndexi][starti-MW1:starti + sizeOfMotif]
1068 #if this area has already been earmarked as a motif it cannot be
1070 Nlocation=Motifi.rfind('N')
1072 starti+=1+Nlocation+MARKOV_WINDOW
1074 #check to see if the current sequence has a high enough consensus
1075 #with the founder sequences in order to be considered further
1076 ScoreTop = Align2PSFM(Motifi, PWMFounder)
1077 Poss = float(ScoreTop - ConScoreTop)
1078 GOF = Poss/sizeOfMotif
1079 if GOF < percID/100.0:
1083 #if the word is not above noise
1084 if M_Score(MMotifi,Frequency, False,MARKOV_WINDOW) - normalization < 0:
1088 startLocs[SeqIndexi] = starti
1089 #new best scores are assigned by aligning the current sequence to
1091 (ScoreAll, PWM2, LogsM2) = LogOdds(PWM, LogsM, originalSeqs[SeqIndexi] [starti-MW1:starti+sizeOfMotif],Frequency,MARKOV_WINDOW)
1094 #new best scores are evaluted
1095 if (BestScore == "Infinity" and Score != "Infinity") or Score > BestScore :
1096 #record best position
1097 BestMotif[SeqIndexi] = startLocs[SeqIndexi]
1099 LogsMhold = LogsM2[:]
1101 NewBestDetermined = True
1105 startLocs[SeqIndexi] = BestMotif[SeqIndexi]
1106 if NewBestDetermined:
1108 LogsM = LogsMhold[:]
1110 return (BestScore, NewBestDetermined, PWM, LogsM)