X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fprograms%2FGibbs.py;fp=cistematic%2Fprograms%2FGibbs.py;h=c4aaff823797b66ea157220e3e7c7f0204907c05;hp=0000000000000000000000000000000000000000;hb=bc30aca13e5ec397c92e67002fbf7a103130b828;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/cistematic/programs/Gibbs.py b/cistematic/programs/Gibbs.py new file mode 100644 index 0000000..c4aaff8 --- /dev/null +++ b/cistematic/programs/Gibbs.py @@ -0,0 +1,650 @@ +##Sarah Aerni +##Created: June 25, 2005 +##Modified: July 11, 2005 +##Motif Finder using Gibbs sampling +##convergence is measured when the difference in the sequences is negligible + +import random +import math +import sys +import copy +from time import time + +from math import ceil + +Frequency = {} +ConsensusScore= {True:2, False:1} + +NTDIndices = {"A": 0, "C": 1, "G": 2, "T": 3} +IndexNTDs = {0: "A", 1: "C", 2: "G", 3: "T"} +INSERTION_N = "N" +global minSize +global maxSize +global thresholdPercent +global sequences +global sizeOfMotif +global numOfMismatches +global maxIterations +global maxLoopsWithoutImprovement + +""" +markov size - 1 = markov model +example: +if you want a markov 1 you want to check 2 ntds total +markov_size = 2 +MARKOV_SIZE = 2 + +AlignmentScore +input: sequenceToCompare List of sequences whose substrings will be + sizeOfMotif aligned integer size of the motif being found + (length of the subseqeunce that will be aligned + from each sequence in above list (window size) + startLocs start locations of the motifs to be aligned to + each other + CompareList the indices of sequencesToCompare to be aligned + numSeqs the number of sequences -1 from + sequencesToCompare to be aligned, the indices + of sequenceToCompare stored in the first + numSeqs indices of in CompareList. + Frequency markov model being used to calculate background + originalSeqs contain unmasked sequences used for checking + the markov score +output: integer Score indicating the consensus score of these + sequences + 2D-List contains the PSFM + 2D-List contains the log markov scores + +will be calculated as an ungapped consensus between those elements in the +CompareList Consensus score is calculated by choosing the largest number of +the same elements in each column, and adding all these numbers up across all +columns +""" + + +def AlignmentScore(sequencesToCompare, sizeOfMotif, startLocs, CompareList, numSeqs): + TotalScore = 0; + PWM = [] + maxScores=[] + len(sequencesToCompare) + for i in range (sizeOfMotif): + PWMi = [0.0, 0.0, 0.0, 0.0] + for j in range(numSeqs+1): + SequenceIndex = CompareList[j] + CurrSeq = sequencesToCompare[SequenceIndex] + + #some sequences may not contain the motifs, if so you do not want + #to include them in the consensus. These have uninitialized start + #locations (ie startLocs would be -1 + if startLocs[SequenceIndex] != -1: + if sequencesToCompare[SequenceIndex]\ + [startLocs[SequenceIndex]+i] == "N": + print sequencesToCompare + print "\nBAD HERE!" + print CurrSeq + print startLocs + print startLocs[SequenceIndex] + print CompareList + print j + print SequenceIndex + print numSeqs + print sizeOfMotif + print CurrSeq[startLocs[SequenceIndex]:startLocs[SequenceIndex]+sizeOfMotif] + PWMi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += 1.0 + maxHere=max(PWMi) + TotalScore += maxHere + maxScores.append(maxHere) + PWM.append(PWMi) + + return (TotalScore, PWM,maxScores) + + +def MarkovFreq (prefix, actualNTD, Frequency,MARKOV_WINDOW): + """ MarkovFreq + input: prefix string of length MARKV_SIZE - 1 prefix used for model + actualNTD character NTD at the en dof the prefix being calculated + Frequency Markov model for calculations + output: float that gives the markov score for this specific sequence + + The helper function will run through and find all possible words with the + prefix and determine the markov score based on this + """ + denominator = 0.0 + numerator = 0.0 + for NTD in ["A", "C", "G", "T"]: + value = M_Score(prefix+NTD, Frequency, False, MARKOV_WINDOW) + if NTD == actualNTD : + numerator = value + denominator += value + retVal = numerator/denominator + + return retVal + + +def revComp (sequence): + """ revComp + input: sequence DNA sequence to be converted to reverse complement + output: string reverse complement of input sequence + + obtains the reverse complement of an input sequence + """ + RevDict={"A": "T", + "T": "A", + "C": "G", + "G": "C", + "N": "N" + } + reverse = "" + for i in range(len(sequence)): + reverse = RevDict[sequence[i].upper()]+reverse + + return reverse + + +def Markov(sequences, IncludeRC, MARKOV): + """ Markov3 + input: sequences list that are being used to create the background + output: dictionary of all 6mers (reverse complement also) and their -log2 + proportion seen + + background will build a markov model of the background in order to be able + to differentiate the motifs from the pure background of a certain size + they will be stored as -log(fraction) + """ + MARKOV_WINDOW = MARKOV + 1 + WordDict = {} + totalWindows = 0 + for i in sequences: + totalWindows += (len(i)-MARKOV_WINDOW+1)*2 + + for seq in sequences: + for index in range(len(seq)-MARKOV_WINDOW+1): + subseq = seq[index:index+MARKOV_WINDOW].upper() + if subseq not in WordDict: + WordDict[subseq] = 0.0 + + WordDict[subseq] += 1.0 + if IncludeRC: + RC = revComp(subseq) + if RC not in WordDict: + WordDict[RC] = 0.0 + + WordDict[RC] += 1.0 + + for key in WordDict: + WordDict[key] = 1.0*WordDict[key]/totalWindows + + return WordDict + + +def Average_M (sequence, Model, l, MARKOV_WINDOW): + """ Average_M + input: sequences List of sequences on which to find the average + markov score + Model Dictionary containing pvalues for seeing 3mers + l integer designating the word sizes from which to + determine average pvalue + output: average probability of all input lmers in sequences in the Model + + finds the probability of seeing all subsequence in the total strings + using the markov model created using the background. Markov3 is used + (window size of 3) and from this determine the average. This function will + also screen the background model + """ + totalSum = 0.0; + totalWords = 0.0; + for seq in sequence: + for i in range(MARKOV_WINDOW-1,len(seq)-l+1): + totalWords += 1.0 + PVal = M_Score(seq[i-MARKOV_WINDOW+1:i+l], Model, True, MARKOV_WINDOW) + totalSum += PVal + + retVal = totalSum/totalWords + print totalWords + return retVal + + +def M_Score (sequence, Model, check, MARKOV_WINDOW): + """ M_Score + input: sequence string for which the Pvalue is to be determined + Model Dictionary containing log2 pvalues for seeing 6mers + check Boolean which determines whether to also check for + completeness of markov model + output: log2 probability of seeing the input seqeunce in the Model + + gives the probability of seeing the given subsequence in the total strings + using the markov model created using the background. Markov6 is used + (window size of 3) + """ + PVal = 0.0 + for j in range(len(sequence)-MARKOV_WINDOW+1): + if sequence[j:j+MARKOV_WINDOW] not in Model: + if check: + print "The Markov Model is inadequate for your input", + print "sequences\n %s is"%sequence[j:j+MARKOV_WINDOW], + print "not contained in model provided\n", + print "Please revise your provided model or consider", + print "using Background Modelling provided" + sys.exit(0) + + continue + + PVal += -math.log(Model[sequence[j:j+MARKOV_WINDOW]],math.e) + + return PVal + + +def LogOdds(PWM, LogsM, sequence, Frequency, MARKOV_WINDOW): + """ LogOdds + input: sequence relevant part of the sequence being added to the PWM + PWM information on sequences already in the motif + LogsM frequency information on sequences already in motif + Frequency markov model for background + sizeOfMotif size f the motif being found + output returns the log odds score for the consensus + the equation used is as follow: + S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path)))) + """ + Score = 0 + PWMout = copy.deepcopy(PWM) + LogsMout = copy.deepcopy(LogsM) + #since each column of the PWM must add up to the total umber of sequences + #in that PWM, in addition one must be added for the current sequence + totalSeqs = PWM[0][0]+PWM[0][1]+PWM[0][2]+PWM[0][3] + 1 + for j in range(len(PWMout)): + for i in ['A', 'C', 'G', 'T']: + if i == sequence[j+MARKOV_WINDOW-1]: + PWMout[j][NTDIndices[i]] += 1.0 + word = sequence[j:j+MARKOV_WINDOW] + LogsMout[j][NTDIndices[i]] += Frequency[word] + + if PWMout[j][NTDIndices[i]]> 0: + Score += PWMout[j][NTDIndices[i]]/totalSeqs*math.log(PWMout[j][NTDIndices[i]]/(totalSeqs*LogsMout[j][NTDIndices[i]]/PWMout[j][NTDIndices[i]]),math.e) + + return Score, PWMout, LogsMout + + +def convert2motif(sequences, size): + """ convert2motif + input: sequences list containing the sequences in A,C,G,T alphabet + comprising the motif to be converted into symbols + size size of the motif + maybe add threshold?!?! + output: string motif converted into descriptive symbols + + takes in a list of motifs that were found at each point and converts them to + an actual motif + """ + #column composition is replaced by symbols + SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M', 'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W', 'CT':'Y','A':'A','C':'C','G':'G','T':'T'} + Motif = "" + for i in range(size): + A = 0 + C = 0 + G = 0 + T = 0 + for seq in sequences: + if seq[i].upper() == "A": + A += 1 + elif seq[i].upper() == "C": + C += 1 + elif seq[i].upper() == "G": + G += 1 + else: + T += 1 + + characterCode = "" + + #translate column composition into symbols + ###########should we use percentages?! ie. A >certain percent################## + if (A > 0): + characterCode += "A" + + if (C > 0): + characterCode += "C" + + if (G > 0): + characterCode += "G" + + if (T > 0): + characterCode += "T" + + Motif += SymbolDict[characterCode] + + return Motif + + +def convert2PSFM (sequences, NumOfSeqs): + """ convert2PSFM + input: sequences list containing the sequences in A,C,G,T alphabet + comprising the motif to be converted into symbols + size size of the motif + output: 2Darray will contain the PSFM where indices 0-3 of each list + will be A,C,G,T respectively + + takes in a list of motifs that were found at each point and converts them to + a PSFM + """ + PSFM = [] + PWM = convert2PWM(sequences, len(sequences[0])) + for i in xrange(len(PWM)): + index = [] + for j in [0,1,2,3]: + index.append(PWM[i][j]/NumOfSeqs) + + PSFM.append(index) + + return PSFM + + +def convert2PWM (sequences, size): + """ convert2PWM + input: sequences list containing the sequences in A,C,G,T alphabet + comprising the motif to be converted into symbols + size size of the motif + output: 2Darray will contain the PSFM where indices 0-3 of each list + will be A,C,G,T respectively + + takes in a list of motifs that were found at each point and converts them to + a PWM + """ + PWM = [] + for i in range(size): + indices = [0.0, 0.0, 0.0, 0.0] + for seq in sequences: + indices[NTDIndices[seq[i].upper()]] += 1.0 + + PWM.append(indices) + + return PWM + + +def add2PWM(sequence, PWM): + """ add2PWM + input: sequence sequence to be added to PWM + PWM PWM being modiifed + + takes in a sequence and adds it to the PWM + """ + #determine the composition to add + for i in range(len(PWM)): + PWM[i][NTDIndices[sequence[i].upper()]] += 1.0 + + +def Align2PWM(Motifi,PWM): + """ Align2PWM + input: Motifi Sequence being aligned to PWM + PWM PWM to which the sequence is being aligned + output: float alignment score to the matrix + + takes in a PWM and aligns the sequence to this PWM and returns + the consensus scoreo + """ + Score = 0.0 + for i in range(len(PWM)): + Score += PWM[i][NTDIndices[Motifi[i]]] + return Score + + +def Factorial(n): + """ Factorial + input: n Number for which we will calculate the factorial + output: float factorial of input number + + calculates the factorial of input number n + """ + #by definition factorial should be 1 + Fact = 1 + for i in range(2,n+1): + Fact *= i + return Fact + + +def Choose(n, k): + """ Choose + input: n integer for total number of elements in a set + k integer for total number of elements to be chose from set + output: float the binomial coefficient of the above number, ie nCk + + calculates the number of ways to choose k elements from a set of n + """ + return Factorial(n)/Factorial(n-k)/Factorial(k) + + +getNTDVals = {0:'A',1:'C',2:'G', 3:'T'} + +getNTDIndex = {'A':0,'C':1,'G':2, 'T':3} + + +def MotifFinder(InputSequences, minS, maxS, numberOfMotifs, Markov_Size, + UseRC, Frequency,excludeBelowAve, percID, + maxIter, maxLoopsW_outImprovement): + global minSize + minSize=minS + global maxSize + maxSize=maxS + global thresholdPercent + thresholdPercent=percID + global sequences + global sizeOfMotif + global numOfMismatches + global maxIterations + maxIterations=maxIter + + global maxLoopsWithoutImprovement + maxLoopsWithoutImprovement=maxLoopsW_outImprovement + + print "Running Sampler with %i sequences"%(len(InputSequences)) + print "Finding %i motifs of size %i to %i using markov size %i" % (numberOfMotifs,minSize,maxSize,Markov_Size) + + sequences = [InputSequences[i].upper() for i in range(len(InputSequences))] + PSFMs = [] + if UseRC: + for seqIndex in xrange(len(sequences)): + RCSeq = revComp(sequences[seqIndex]) + sequences[seqIndex] += INSERTION_N+ RCSeq + + #this will track the movement of each sequence + #if a movement exceeds a certain threshold we are not finished + for motifNum in range(numberOfMotifs): + + #to improve speed shrink sequences by replacing strings of Ns by + #a single N + for i in xrange(len(sequences)): + splitByN=sequences[i].split('N') + j = 0; + finalSequence="" + max_j=len(splitByN) + while j < max_j : + if len(splitByN[j])==0: + finalSequence="".join([finalSequence,'N']) + while len(splitByN[j])==0: + j+=1 + if j==max_j: + break + else: + finalSequence="".join([finalSequence,splitByN[j]]) + j+=1 + sequences[i]=finalSequence + + print "MOTIF NUMBER %i" %motifNum + empty=min([len(sequences[i]) for i in xrange(len(sequences))]) + #pick motif size randomly within the range provided by the user + sizeOfMotif = random.randint(minSize,maxSize) + if empty < maxSize: + return ALLPSFMS + + numOfMismatches=sizeOfMotif-ceil(thresholdPercent/100.0*sizeOfMotif) + (PWM,PWMScores,startLocs)=GibbsRunner(100) + MaxVals=[0 for i in xrange(len(PWM))] + for ConsI in xrange(len(PWM)): + MaxVals[ConsI] = max(PWM[ConsI]) + PWMScores = [0 for i in range(len(sequences))] + for SIndex in range(len(sequences)): + subseq = sequences[SIndex][startLocs[SIndex]:startLocs[SIndex]+sizeOfMotif] + PWMScores[SIndex] = 0 + #######################start here########## + for subIndex in range(len(subseq)): + PWMScores[SIndex] += PWM[subIndex][NTDIndices[subseq[subIndex]]] + + maxScore = max(PWMScores) + #get rid of all the sequences that do not achieve a certain consensus + #score defined by the top one + thresh = thresholdPercent/100.0 * maxScore + FinalPWMSeqs = [] + for SIndex in range(len(PWMScores)): + if PWMScores[SIndex] > thresh: + FinalPWMSeqs.append(sequences[SIndex][startLocs[SIndex]:startLocs[SIndex]+sizeOfMotif]) + else: + startLocs[SIndex] = -1 + + FinalPSFM= convert2PSFM (FinalPWMSeqs, len(FinalPWMSeqs)) + PSFMs.append(FinalPSFM) + for i in xrange(len(sequences)): + if startLocs[i] != -1: + sequences[i] = sequences[i][:startLocs[i]]+INSERTION_N*sizeOfMotif+sequences[i][startLocs[i]+sizeOfMotif:] + sequences[i] = sequences[i][:len(sequences[i])-startLocs[i]-sizeOfMotif]+INSERTION_N*sizeOfMotif+sequences[i][len(sequences[i])-startLocs[i]:] + return PSFMs + + +def GibbsRunner(iterIn): + iterAll = 0 + BestPWM=[] + BestScore=0 + BestLocs=[] + global minSize + global maxSize + global thresholdPercent + global sequences + global sizeOfMotif + global numOfMismatches + global maxIterations + global maxLoopsWithoutImprovement + maxScore=sizeOfMotif*len(sequences) + st=time() + + while iterAll < iterIn: + en=time() + print "%.03f\t"%(en-st), + st=time() + iterAll+=1 + startLocs = [-1 ] * len(sequences) + + for i in range(len(sequences)): + startLocs[i] = random.randint(0,len(sequences[i])-sizeOfMotif) + while "N" in sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif]: + startLocs[i] = random.randint(0,len(sequences[i])-sizeOfMotif) + + (TotalScore, PWM,dummy)= AlignmentScore(sequences, sizeOfMotif, startLocs, [i for i in range(len(sequences))], len(sequences)-1) + PWMScore=[Align2PWM(sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif],PWM) for i in range(len(sequences))] + print "PWM is right now" + print PWM + print "scores for each" + print PWMScore + SOi = -1 + ConsensusScore = 0 + PreviousBestScore = 0 + PreviousBestTime = -1 + iterations = 0 + while iterations < maxIterations and (ConsensusScore > PreviousBestScore or PreviousBestTime <= maxLoopsWithoutImprovement): + iterations += 1 + SOi = random.randint(0,len(sequences)-1) + SeqMotifs = [] + locs=startLocs[:] + for i in range(len(sequences)): + if(SOi == i): + locs[i]=-1 + continue + SeqMotifs.append(sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif]) + + (TotalScore, PWM,maxScores)= AlignmentScore(sequences, sizeOfMotif, locs, [i for i in range(len(sequences))], len(sequences)-1) + startLocsProb = [] + startLocsI= [] + SOSeq = sequences[SOi] + total = 0 + start = 0 + endloc=len(SOSeq)-sizeOfMotif + while(start<=endloc): + Motif = SOSeq[start:start+sizeOfMotif] + locOfN=Motif.rfind("N") + if locOfN>=0: + start+=locOfN+1 + continue + probAtPosn=0 + j=0 + mmNum=0 + while (jnumOfMismatches: + probAtPosn=0 + break + j+=1 + + if probAtPosn == 0: + start+=1 + continue + startLocsI.append(start) + startLocsProb.append(probAtPosn) + total += probAtPosn + start+=1 + + if len(startLocsProb) == 0: + continue + + choice = random.random() + choiceLoc = choice*total + totalToHere = 0 + for PrefI in range(len(startLocsProb)): + if totalToHere+startLocsProb[PrefI] == 0: + continue + if choiceLoc < totalToHere+startLocsProb[PrefI]: + break + totalToHere += startLocsProb[PrefI] + + startLocs[SOi] = startLocsI[PrefI] + PWMScore[SOi] = startLocsProb[PrefI] + newMotif=SOSeq[startLocs[SOi]:startLocs[SOi]+sizeOfMotif] + add2PWM (newMotif, PWM) + + NewScores=[] + PercentChange = [] + for i in range(len(sequences)): + NewScore = 0 + Motif_i=sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif] + for j in xrange(sizeOfMotif): + NewScore+=PWM[j][NTDIndices[Motif_i[j]]] + + NewScores.append(NewScore) + PercentChange.append(math.fabs(NewScore - PWMScore[i])/PWMScore[i]) + + TotConsensusScore = sum(NewScores) + AveConsensusScore = TotConsensusScore/(len(sequences)) + if AveConsensusScore > PreviousBestScore: + PreviousBestScore = AveConsensusScore + PreviousBestTime = 0 + else: + PreviousBestTime += 1 + + PWMScore=NewScores[:] + + Consensus=sum([max(PWM[i]) for i in xrange(len(PWM))]) + if Consensus> BestScore: + BestScore=Consensus + BestPWM=PWM[:] + BestLocs=startLocs[:] + if BestScore==maxScore: + break + + print "iterated %i times to find"%iterAll + print BestPWM + + return(BestPWM,BestScore,BestLocs) + + +def probFromPSFM(sequence, PSFM): + probability = 1 + for i in range(len (sequence)): + probability *= PSFM[i][getNTDIndex[sequence[i]]] + + return probability \ No newline at end of file