2 ##Created: June 25, 2005
3 ##Modified: July 11, 2005
4 ##Motif Finder using Gibbs sampling
5 ##convergence is measured when the difference in the sequences is negligible
16 ConsensusScore= {True:2, False:1}
18 NTDIndices = {"A": 0, "C": 1, "G": 2, "T": 3}
19 IndexNTDs = {0: "A", 1: "C", 2: "G", 3: "T"}
23 global thresholdPercent
26 global numOfMismatches
28 global maxLoopsWithoutImprovement
31 markov size - 1 = markov model
33 if you want a markov 1 you want to check 2 ntds total
38 input: sequenceToCompare List of sequences whose substrings will be
39 sizeOfMotif aligned integer size of the motif being found
40 (length of the subseqeunce that will be aligned
41 from each sequence in above list (window size)
42 startLocs start locations of the motifs to be aligned to
44 CompareList the indices of sequencesToCompare to be aligned
45 numSeqs the number of sequences -1 from
46 sequencesToCompare to be aligned, the indices
47 of sequenceToCompare stored in the first
48 numSeqs indices of in CompareList.
49 Frequency markov model being used to calculate background
50 originalSeqs contain unmasked sequences used for checking
52 output: integer Score indicating the consensus score of these
54 2D-List contains the PSFM
55 2D-List contains the log markov scores
57 will be calculated as an ungapped consensus between those elements in the
58 CompareList Consensus score is calculated by choosing the largest number of
59 the same elements in each column, and adding all these numbers up across all
64 def AlignmentScore(sequencesToCompare, sizeOfMotif, startLocs, CompareList, numSeqs):
68 len(sequencesToCompare)
69 for i in range (sizeOfMotif):
70 PWMi = [0.0, 0.0, 0.0, 0.0]
71 for j in range(numSeqs+1):
72 SequenceIndex = CompareList[j]
73 CurrSeq = sequencesToCompare[SequenceIndex]
75 #some sequences may not contain the motifs, if so you do not want
76 #to include them in the consensus. These have uninitialized start
77 #locations (ie startLocs would be -1
78 if startLocs[SequenceIndex] != -1:
79 if sequencesToCompare[SequenceIndex]\
80 [startLocs[SequenceIndex]+i] == "N":
81 print sequencesToCompare
85 print startLocs[SequenceIndex]
91 print CurrSeq[startLocs[SequenceIndex]:startLocs[SequenceIndex]+sizeOfMotif]
92 PWMi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += 1.0
95 maxScores.append(maxHere)
98 return (TotalScore, PWM,maxScores)
101 def MarkovFreq (prefix, actualNTD, Frequency,MARKOV_WINDOW):
103 input: prefix string of length MARKV_SIZE - 1 prefix used for model
104 actualNTD character NTD at the en dof the prefix being calculated
105 Frequency Markov model for calculations
106 output: float that gives the markov score for this specific sequence
108 The helper function will run through and find all possible words with the
109 prefix and determine the markov score based on this
113 for NTD in ["A", "C", "G", "T"]:
114 value = M_Score(prefix+NTD, Frequency, False, MARKOV_WINDOW)
115 if NTD == actualNTD :
118 retVal = numerator/denominator
123 def revComp (sequence):
125 input: sequence DNA sequence to be converted to reverse complement
126 output: string reverse complement of input sequence
128 obtains the reverse complement of an input sequence
137 for i in range(len(sequence)):
138 reverse = RevDict[sequence[i].upper()]+reverse
143 def Markov(sequences, IncludeRC, MARKOV):
145 input: sequences list that are being used to create the background
146 output: dictionary of all 6mers (reverse complement also) and their -log2
149 background will build a markov model of the background in order to be able
150 to differentiate the motifs from the pure background of a certain size
151 they will be stored as -log(fraction)
153 MARKOV_WINDOW = MARKOV + 1
157 totalWindows += (len(i)-MARKOV_WINDOW+1)*2
159 for seq in sequences:
160 for index in range(len(seq)-MARKOV_WINDOW+1):
161 subseq = seq[index:index+MARKOV_WINDOW].upper()
162 if subseq not in WordDict:
163 WordDict[subseq] = 0.0
165 WordDict[subseq] += 1.0
168 if RC not in WordDict:
174 WordDict[key] = 1.0*WordDict[key]/totalWindows
179 def Average_M (sequence, Model, l, MARKOV_WINDOW):
181 input: sequences List of sequences on which to find the average
183 Model Dictionary containing pvalues for seeing 3mers
184 l integer designating the word sizes from which to
185 determine average pvalue
186 output: average probability of all input lmers in sequences in the Model
188 finds the probability of seeing all subsequence in the total strings
189 using the markov model created using the background. Markov3 is used
190 (window size of 3) and from this determine the average. This function will
191 also screen the background model
196 for i in range(MARKOV_WINDOW-1,len(seq)-l+1):
198 PVal = M_Score(seq[i-MARKOV_WINDOW+1:i+l], Model, True, MARKOV_WINDOW)
201 retVal = totalSum/totalWords
206 def M_Score (sequence, Model, check, MARKOV_WINDOW):
208 input: sequence string for which the Pvalue is to be determined
209 Model Dictionary containing log2 pvalues for seeing 6mers
210 check Boolean which determines whether to also check for
211 completeness of markov model
212 output: log2 probability of seeing the input seqeunce in the Model
214 gives the probability of seeing the given subsequence in the total strings
215 using the markov model created using the background. Markov6 is used
219 for j in range(len(sequence)-MARKOV_WINDOW+1):
220 if sequence[j:j+MARKOV_WINDOW] not in Model:
222 print "The Markov Model is inadequate for your input",
223 print "sequences\n %s is"%sequence[j:j+MARKOV_WINDOW],
224 print "not contained in model provided\n",
225 print "Please revise your provided model or consider",
226 print "using Background Modelling provided"
231 PVal += -math.log(Model[sequence[j:j+MARKOV_WINDOW]],math.e)
236 def LogOdds(PWM, LogsM, sequence, Frequency, MARKOV_WINDOW):
238 input: sequence relevant part of the sequence being added to the PWM
239 PWM information on sequences already in the motif
240 LogsM frequency information on sequences already in motif
241 Frequency markov model for background
242 sizeOfMotif size f the motif being found
243 output returns the log odds score for the consensus
244 the equation used is as follow:
245 S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path))))
248 PWMout = copy.deepcopy(PWM)
249 LogsMout = copy.deepcopy(LogsM)
250 #since each column of the PWM must add up to the total umber of sequences
251 #in that PWM, in addition one must be added for the current sequence
252 totalSeqs = PWM[0][0]+PWM[0][1]+PWM[0][2]+PWM[0][3] + 1
253 for j in range(len(PWMout)):
254 for i in ['A', 'C', 'G', 'T']:
255 if i == sequence[j+MARKOV_WINDOW-1]:
256 PWMout[j][NTDIndices[i]] += 1.0
257 word = sequence[j:j+MARKOV_WINDOW]
258 LogsMout[j][NTDIndices[i]] += Frequency[word]
260 if PWMout[j][NTDIndices[i]]> 0:
261 Score += PWMout[j][NTDIndices[i]]/totalSeqs*math.log(PWMout[j][NTDIndices[i]]/(totalSeqs*LogsMout[j][NTDIndices[i]]/PWMout[j][NTDIndices[i]]),math.e)
263 return Score, PWMout, LogsMout
266 def convert2motif(sequences, size):
268 input: sequences list containing the sequences in A,C,G,T alphabet
269 comprising the motif to be converted into symbols
270 size size of the motif
271 maybe add threshold?!?!
272 output: string motif converted into descriptive symbols
274 takes in a list of motifs that were found at each point and converts them to
277 #column composition is replaced by symbols
278 SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M', 'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W', 'CT':'Y','A':'A','C':'C','G':'G','T':'T'}
280 for i in range(size):
285 for seq in sequences:
286 if seq[i].upper() == "A":
288 elif seq[i].upper() == "C":
290 elif seq[i].upper() == "G":
297 #translate column composition into symbols
298 ###########should we use percentages?! ie. A >certain percent##################
311 Motif += SymbolDict[characterCode]
316 def convert2PSFM (sequences, NumOfSeqs):
318 input: sequences list containing the sequences in A,C,G,T alphabet
319 comprising the motif to be converted into symbols
320 size size of the motif
321 output: 2Darray will contain the PSFM where indices 0-3 of each list
322 will be A,C,G,T respectively
324 takes in a list of motifs that were found at each point and converts them to
328 PWM = convert2PWM(sequences, len(sequences[0]))
329 for i in xrange(len(PWM)):
332 index.append(PWM[i][j]/NumOfSeqs)
339 def convert2PWM (sequences, size):
341 input: sequences list containing the sequences in A,C,G,T alphabet
342 comprising the motif to be converted into symbols
343 size size of the motif
344 output: 2Darray will contain the PSFM where indices 0-3 of each list
345 will be A,C,G,T respectively
347 takes in a list of motifs that were found at each point and converts them to
351 for i in range(size):
352 indices = [0.0, 0.0, 0.0, 0.0]
353 for seq in sequences:
354 indices[NTDIndices[seq[i].upper()]] += 1.0
361 def add2PWM(sequence, PWM):
363 input: sequence sequence to be added to PWM
364 PWM PWM being modiifed
366 takes in a sequence and adds it to the PWM
368 #determine the composition to add
369 for i in range(len(PWM)):
370 PWM[i][NTDIndices[sequence[i].upper()]] += 1.0
373 def Align2PWM(Motifi,PWM):
375 input: Motifi Sequence being aligned to PWM
376 PWM PWM to which the sequence is being aligned
377 output: float alignment score to the matrix
379 takes in a PWM and aligns the sequence to this PWM and returns
383 for i in range(len(PWM)):
384 Score += PWM[i][NTDIndices[Motifi[i]]]
390 input: n Number for which we will calculate the factorial
391 output: float factorial of input number
393 calculates the factorial of input number n
395 #by definition factorial should be 1
397 for i in range(2,n+1):
404 input: n integer for total number of elements in a set
405 k integer for total number of elements to be chose from set
406 output: float the binomial coefficient of the above number, ie nCk
408 calculates the number of ways to choose k elements from a set of n
410 return Factorial(n)/Factorial(n-k)/Factorial(k)
413 getNTDVals = {0:'A',1:'C',2:'G', 3:'T'}
415 getNTDIndex = {'A':0,'C':1,'G':2, 'T':3}
418 def MotifFinder(InputSequences, minS, maxS, numberOfMotifs, Markov_Size,
419 UseRC, Frequency,excludeBelowAve, percID,
420 maxIter, maxLoopsW_outImprovement):
425 global thresholdPercent
426 thresholdPercent=percID
429 global numOfMismatches
431 maxIterations=maxIter
433 global maxLoopsWithoutImprovement
434 maxLoopsWithoutImprovement=maxLoopsW_outImprovement
436 print "Running Sampler with %i sequences"%(len(InputSequences))
437 print "Finding %i motifs of size %i to %i using markov size %i" % (numberOfMotifs,minSize,maxSize,Markov_Size)
439 sequences = [InputSequences[i].upper() for i in range(len(InputSequences))]
442 for seqIndex in xrange(len(sequences)):
443 RCSeq = revComp(sequences[seqIndex])
444 sequences[seqIndex] += INSERTION_N+ RCSeq
446 #this will track the movement of each sequence
447 #if a movement exceeds a certain threshold we are not finished
448 for motifNum in range(numberOfMotifs):
450 #to improve speed shrink sequences by replacing strings of Ns by
452 for i in xrange(len(sequences)):
453 splitByN=sequences[i].split('N')
458 if len(splitByN[j])==0:
459 finalSequence="".join([finalSequence,'N'])
460 while len(splitByN[j])==0:
465 finalSequence="".join([finalSequence,splitByN[j]])
467 sequences[i]=finalSequence
469 print "MOTIF NUMBER %i" %motifNum
470 empty=min([len(sequences[i]) for i in xrange(len(sequences))])
471 #pick motif size randomly within the range provided by the user
472 sizeOfMotif = random.randint(minSize,maxSize)
476 numOfMismatches=sizeOfMotif-ceil(thresholdPercent/100.0*sizeOfMotif)
477 (PWM,PWMScores,startLocs)=GibbsRunner(100)
478 MaxVals=[0 for i in xrange(len(PWM))]
479 for ConsI in xrange(len(PWM)):
480 MaxVals[ConsI] = max(PWM[ConsI])
481 PWMScores = [0 for i in range(len(sequences))]
482 for SIndex in range(len(sequences)):
483 subseq = sequences[SIndex][startLocs[SIndex]:startLocs[SIndex]+sizeOfMotif]
484 PWMScores[SIndex] = 0
485 #######################start here##########
486 for subIndex in range(len(subseq)):
487 PWMScores[SIndex] += PWM[subIndex][NTDIndices[subseq[subIndex]]]
489 maxScore = max(PWMScores)
490 #get rid of all the sequences that do not achieve a certain consensus
491 #score defined by the top one
492 thresh = thresholdPercent/100.0 * maxScore
494 for SIndex in range(len(PWMScores)):
495 if PWMScores[SIndex] > thresh:
496 FinalPWMSeqs.append(sequences[SIndex][startLocs[SIndex]:startLocs[SIndex]+sizeOfMotif])
498 startLocs[SIndex] = -1
500 FinalPSFM= convert2PSFM (FinalPWMSeqs, len(FinalPWMSeqs))
501 PSFMs.append(FinalPSFM)
502 for i in xrange(len(sequences)):
503 if startLocs[i] != -1:
504 sequences[i] = sequences[i][:startLocs[i]]+INSERTION_N*sizeOfMotif+sequences[i][startLocs[i]+sizeOfMotif:]
505 sequences[i] = sequences[i][:len(sequences[i])-startLocs[i]-sizeOfMotif]+INSERTION_N*sizeOfMotif+sequences[i][len(sequences[i])-startLocs[i]:]
509 def GibbsRunner(iterIn):
516 global thresholdPercent
519 global numOfMismatches
521 global maxLoopsWithoutImprovement
522 maxScore=sizeOfMotif*len(sequences)
525 while iterAll < iterIn:
527 print "%.03f\t"%(en-st),
530 startLocs = [-1 ] * len(sequences)
532 for i in range(len(sequences)):
533 startLocs[i] = random.randint(0,len(sequences[i])-sizeOfMotif)
534 while "N" in sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif]:
535 startLocs[i] = random.randint(0,len(sequences[i])-sizeOfMotif)
537 (TotalScore, PWM,dummy)= AlignmentScore(sequences, sizeOfMotif, startLocs, [i for i in range(len(sequences))], len(sequences)-1)
538 PWMScore=[Align2PWM(sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif],PWM) for i in range(len(sequences))]
539 print "PWM is right now"
541 print "scores for each"
545 PreviousBestScore = 0
546 PreviousBestTime = -1
548 while iterations < maxIterations and (ConsensusScore > PreviousBestScore or PreviousBestTime <= maxLoopsWithoutImprovement):
550 SOi = random.randint(0,len(sequences)-1)
553 for i in range(len(sequences)):
557 SeqMotifs.append(sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif])
559 (TotalScore, PWM,maxScores)= AlignmentScore(sequences, sizeOfMotif, locs, [i for i in range(len(sequences))], len(sequences)-1)
562 SOSeq = sequences[SOi]
565 endloc=len(SOSeq)-sizeOfMotif
566 while(start<=endloc):
567 Motif = SOSeq[start:start+sizeOfMotif]
568 locOfN=Motif.rfind("N")
575 while (j<sizeOfMotif):
576 letterScore=PWM[j][NTDIndices[Motif[j]]]
577 probAtPosn+=letterScore
578 mmNum+=int(letterScore!=maxScores[j])
579 if mmNum>numOfMismatches:
587 startLocsI.append(start)
588 startLocsProb.append(probAtPosn)
592 if len(startLocsProb) == 0:
595 choice = random.random()
596 choiceLoc = choice*total
598 for PrefI in range(len(startLocsProb)):
599 if totalToHere+startLocsProb[PrefI] == 0:
601 if choiceLoc < totalToHere+startLocsProb[PrefI]:
603 totalToHere += startLocsProb[PrefI]
605 startLocs[SOi] = startLocsI[PrefI]
606 PWMScore[SOi] = startLocsProb[PrefI]
607 newMotif=SOSeq[startLocs[SOi]:startLocs[SOi]+sizeOfMotif]
608 add2PWM (newMotif, PWM)
612 for i in range(len(sequences)):
614 Motif_i=sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif]
615 for j in xrange(sizeOfMotif):
616 NewScore+=PWM[j][NTDIndices[Motif_i[j]]]
618 NewScores.append(NewScore)
619 PercentChange.append(math.fabs(NewScore - PWMScore[i])/PWMScore[i])
621 TotConsensusScore = sum(NewScores)
622 AveConsensusScore = TotConsensusScore/(len(sequences))
623 if AveConsensusScore > PreviousBestScore:
624 PreviousBestScore = AveConsensusScore
627 PreviousBestTime += 1
629 PWMScore=NewScores[:]
631 Consensus=sum([max(PWM[i]) for i in xrange(len(PWM))])
632 if Consensus> BestScore:
635 BestLocs=startLocs[:]
636 if BestScore==maxScore:
639 print "iterated %i times to find"%iterAll
642 return(BestPWM,BestScore,BestLocs)
645 def probFromPSFM(sequence, PSFM):
647 for i in range(len (sequence)):
648 probability *= PSFM[i][getNTDIndex[sequence[i]]]