X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fprograms%2FConsensus.py;fp=cistematic%2Fprograms%2FConsensus.py;h=3bb39a8e7acd16ef312c58c4666b744316ceab3e;hp=0000000000000000000000000000000000000000;hb=bc30aca13e5ec397c92e67002fbf7a103130b828;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/cistematic/programs/Consensus.py b/cistematic/programs/Consensus.py new file mode 100755 index 0000000..3bb39a8 --- /dev/null +++ b/cistematic/programs/Consensus.py @@ -0,0 +1,1110 @@ +##Sarah Aerni +##Created: July 12, 2005 +##Modified: June 22, 2006 +##Modified: Oct 22, 2008 by Ali (Minor modifications to outputs) +##This is a greedy motif finding program. +##Using background models it determines which motifs are overrepresented +############################################################################### +##include a way to go back and +##uses the new way of doing background and the new scoring rather than simple +##consensus it uses the log likelihood +############################################################################### + +import hotshot +import random +import math +import sys +import copy +import time + +from math import log +from math import e +from math import ceil +from random import shuffle +from random import randint + +ConsensusScore= {True:2, False:1} + +NTDIndices = {'A':0, 'C':1, 'G':2, 'T':3} +IndexNTDs = {0:'A', 1:'C', 2:'G', 3:'T'} +INSERTION_N = "N" + +NTDA=0 +NTDC=1 +NTDG=2 +NTDT=3 + +INSERTION_MARKER="S" +INSERTION_FILLER="N" + +#markov size - 1 = markov model +#example: +#if you want a markov 1 you want to check 2 ntds total +#markov_size = 2 +#declare global variables +global MARKOV_WINDOW +global MW1 +global percID +global Founders +global UseRC +global Frequency +global NumOfMismatch +Frequency={} +global sizeOfMotif +global sequences +global founderPercID +#MotifFinder with 3 arguments +#input: sequences sequences in which we are to find a motif +# sizeOfMotif size of the motif to be identified in the sequences +# numberOfMotifs the number of motifs of the indicated size to be +# identified in the sequences +# Iterations The number of iterations that will be perfdormed in +# order to determine an optimal motif +# Frequency Markov Model for background +# UseRC a boolean indicating whether the reverse complement +# should be used i the finding of a motif +# (True = yes) +# Founders a boolean indicating whether the first 2 sequences +# should always be the founder sequences +# percID percent of the aligned sequences that must be +# identical in order to be considered for a motif to +# be considered (between the founder sequences then +# between founder sequence consensus and each other +# subsequence being considered) +# founderPercID The minimum percent identity two aligned sequences +# must be to qualify as founder sequences +# MARKOV_SIZE Size of the markov model (corresponds to length of +# the words in the dictionary Frequency) +#output: (List1, List2) List1 contains PSFMs of the indicated length and up +# to the indicated number (there may be fewer) in the +# standard cistematic format, while List2 holds the +# corresponding sequences. +# +#This function will call the overloaded MotifFinder function, passing in a +#default amount for the number of iterations (currently set at 100) +def MotifFinder(sequences, minSize, maxSize, numberOfMotifs, Iterations, + Frequency, UseRC, Founders, percID, founderPercID,MARKOV_SIZE, + model,masking): + + #declare global variables + global MARKOV_WINDOW + global MW1 + global NumOfMismatch + global sizeOfMotif + numOfSequences=len(sequences) + + #the minimum window size permitted by this program is a motif of length 6 + if minSize < 6: + minSize = 6 + + #sanity check for reasonable motif sizes + if minSize > maxSize: + print "Please input a minimum size that is smaller than a maximum size" + print "User input:\nMinimum Size:\t%i\nMaximumSize:\t%i" % (minSize, maxSize) + return None + + #sanity check for reasonable percentID + if percID> founderPercID: + print "Your input %i for percent identity for founder sequences" % founderPercID, + print "is less than your input for general percent identity at",percID + print "Please adjust your input" + return None + + #a user may input a number of iterations which exceeds the possible + #for a given set of input values, in which case the number of iterations + #should be decreased to another size + maxIterations = (maxSize - minSize+1)*Choose(len(sequences),2)*Factorial(len(sequences)-2) + if maxIterations < Iterations: + print "The number of Iterations you entered", Iterations, + print "is being reduced to",maxIterations, + Iterations = maxIterations + + #adjusting the window as remarked above + MARKOV_WINDOW = MARKOV_SIZE + 1 + MW1=MARKOV_SIZE + print len(sequences), "sequences search for",numberOfMotifs, + print " motifs of size",minSize,"to",maxSize + print "The percent Identity input is",percID,"%" + + #this list will contain motifs found (up to the NumberOfMotifs passed in + sequencesToCompare = [i.upper() for i in sequences] + #as a preprocessing steps, a string of Ns will be replaced by one N + for i in xrange(len(sequences)): + splitByN=sequencesToCompare[i].split('N') + j = 0; + finalSequence="" + max_j=len(splitByN) + while j < max_j : + if len(splitByN[j])==0: + finalSequence="".join([finalSequence,'N']) + while len(splitByN[j])==0: + j+=1 + if j==max_j: + break + else: + finalSequence="".join([finalSequence,splitByN[j]]) + j+=1 + sequencesToCompare[i]=finalSequence + originalSeqs = [] + aveCalcSeqs = [] + + if UseRC: + for seqIndex in xrange(len(sequencesToCompare)): + sequence = revComp(sequencesToCompare[seqIndex]) + aveCalcSeqs.append(''.join([sequencesToCompare[seqIndex], sequence])) + originalSeqs.append(''.join([sequencesToCompare[seqIndex],INSERTION_N,sequence])) + sequencesToCompare[seqIndex] = ''.join([sequencesToCompare[seqIndex],INSERTION_N*(MARKOV_WINDOW),sequence[MW1:]]) + sequence = ''.join([INSERTION_N*(MW1),sequencesToCompare[seqIndex][MW1:]]) + sequencesToCompare[seqIndex] = sequence + + AllPSFMS = [] + AllPSFMseqs = [] + for motif_i in xrange(numberOfMotifs): + empty=min([len(sequencesToCompare[i]) for i in xrange(len(sequencesToCompare))]) + if empty < maxSize: + print "A sequence has been depleted of all options" + return (AllPSFMS, AllPSFMseqs) + + OverallBest = "Infinity" + print "MOTIF NUMBER %i\n" % motif_i + #multiple iterations are attempted to avoid local maxima + for step in xrange(Iterations): + sys.stdout.flush() + numIncluded = 2 #used in order to create PSFMs from PWMs + #CompareList will be used to randomly generate the sequences to be + #compared + CompareList = range(len(sequencesToCompare)) + BestScore = "Infinity" + #find the best shared motif between the first two sequences + #the motif must be completely contained within the sequence + #or its reverse complement + NumberOfTries = 0 + while (BestScore == "Infinity"): + #PWM created from all sequences included in the motif + PWM = [] + #pick motif size randomly within the range provided by the user + sizeOfMotif = random.randint(minSize,maxSize) + NumOfMismatch = sizeOfMotif-ceil((percID)/100.0*sizeOfMotif) + #if the motif average is not yet established create the average + #and store tis value in the dictionary + + #best motif startpoints will be stored in this list of start + #locations + BestMotif = [-1] * len(sequencesToCompare) + #these are start locations are used in order to test new best + #start indices + startLocs = BestMotif[:] + #if it appears there are no more motifs possible quit + if (NumberOfTries > Choose(len(sequencesToCompare), 2)): + print "No more motifs to be found after %i tries" % NumberOfTries + return (AllPSFMS, AllPSFMseqs) + NumberOfTries +=1 + #randomize sequence order + #if the sequence founders are input then you only want to + #randomize the remaining sequences + if Founders: + CompareList = [0,1] + remaining = range(2,len(sequencesToCompare)) + shuffle(remaining) + CompareList.extend(remaining) + #if no founders are specified, randomize all + else: + shuffle(CompareList) + + SeqIndex0 = CompareList[0] + SeqIndex1 = CompareList[1] + #founder sequences - first 2 sequences - are selected + #create random start sites to avoid favoring the 5' region + startLocs[SeqIndex0] = -1 + startLocs[SeqIndex1] = -1 + BestMotif[SeqIndex0] = -1 + BestMotif[SeqIndex1] = -1 + BestScore = "Infinity" + #define the best motif between the first two sequences the + #inital set of comparisons are performed at the same indices in + #both "founder" sequences + + #to ensure that the startpoint is random we will rearrange the + # sequences + start0 = randint(0,len(sequencesToCompare[SeqIndex0])-sizeOfMotif-1) + accessSeq=sequencesToCompare[SeqIndex0] + seq0 = ''.join([accessSeq[start0+1:],'N',accessSeq[:start0+sizeOfMotif]]) + start1 = randint(0,len(sequencesToCompare[SeqIndex1])-sizeOfMotif-1) + accessSeq=sequencesToCompare[SeqIndex1] + seq1 = ''.join([accessSeq[start1+1:],'N',accessSeq[:start1+sizeOfMotif]]) + + #create shifted score, then check to see if it meets all + #criteria to become the best. + + #Perform a time efficient alignment of the founder sequences + (BestScore) = TimeEfficientAlignment(seq0,seq1,SeqIndex0,SeqIndex1,startLocs, founderPercID) + + if BestScore=="Infinity": + continue + NLoc0=len(sequencesToCompare[SeqIndex0])-start0-1 + NLoc1=len(sequencesToCompare[SeqIndex1])-start1-1 + + #remap to the correct location + if startLocs[SeqIndex0] > NLoc0: + startLocs[SeqIndex0]=(startLocs[SeqIndex0]+start0)%len(sequencesToCompare[SeqIndex0]) + else: + startLocs[SeqIndex0]=(startLocs[SeqIndex0]+start0+1) + + if startLocs[SeqIndex1] > NLoc1: + startLocs[SeqIndex1]=(startLocs[SeqIndex1]+start1)%len(sequencesToCompare[SeqIndex1]) + else: + startLocs[SeqIndex1]=(startLocs[SeqIndex1]+start1+1) + + #if we found a good score get all the info necessary to + #continue, including a PWM and updating the startLocs + if BestScore != "Infinity": + index0 = startLocs[SeqIndex0] + index1 = startLocs[SeqIndex1] + seq0 = sequencesToCompare[SeqIndex0] + seq1 = sequencesToCompare[SeqIndex1] + motif0 = seq0[index0:index0+sizeOfMotif] + motif1 = seq1[index1:index1+sizeOfMotif] + BestMotifSeqs=[motif0,motif1] + PWM = convert2PWM([motif0, motif1],sizeOfMotif) + + #this PWM will be used in order to establish a criterion + #for future sequences, whether they meet a threshold + #similiarty to the founder sequences + + #if it exists, find the best scoring motifs in the remaining + #sequences. + + for seqCompIndex in xrange(2, len(sequencesToCompare)): + maxScores=[max(PWM[P_I]) for P_I in range(len(PWM))] + BestScore = "Infinity" + seq=CompareList[seqCompIndex] + sequenceToAdd = sequencesToCompare[seq] + max_i=len(sequenceToAdd)-sizeOfMotif + bestMismatch = NumOfMismatch + #to ensure that the startpoint is random we will rearrange the + #sequences + start_i = randint(0,max_i-1) + seq_i = ''.join([sequenceToAdd[start_i+1:],'N',sequenceToAdd[:start_i+sizeOfMotif]]) + i=0 + while (i<=max_i): + motif=seq_i[i:i+sizeOfMotif] + #skip this area if there are any masked regions found + nLoc=motif.rfind('N') + if nLoc >=0: + i+=nLoc+1 + continue + + mmNum = 0 + j = 0 + while (j < sizeOfMotif): + mmNum+=int(PWM[j][NTDIndices[motif[j]]]!=maxScores[j]) + if mmNum > bestMismatch: + break + j+=1 + + #set a new best if we found it (always keep originals) + if mmNum < bestMismatch: + bestMismatch = mmNum + startLocs[seq]=i + + #we cannot improve upon a perfect match so we break end our + #search if this occurs + if mmNum == 0: + startLocs[seq]=i + break + i+=1 + + #if somethigng was found we add it to our PWM + if startLocs[seq]!= -1: + NLoc_i=len(sequenceToAdd)-start_i-1 + if startLocs[seq] > NLoc_i: + startLocs[seq]=(startLocs[seq]+start_i)%len(sequenceToAdd) + else: + startLocs[seq]=(startLocs[seq]+start_i+1) + + starti=startLocs[seq] + sAdd=sequencesToCompare[seq][starti:starti+sizeOfMotif] + checkPWM=add2PWMReturn(sAdd,PWM) + add=True + checkMaxScores=[max(checkPWM[P_I]) for P_I in range(len(checkPWM))] + for seq in BestMotifSeqs: + mismatches=0 + for i in range(len(seq)): + mismatches+=int(checkPWM[i][NTDIndices[seq[i]]]!=checkMaxScores[i]) + if mismatches > NumOfMismatch: + add=False + break + if add: + BestMotifSeqs.append(sAdd) + PWM=copy.deepcopy(checkPWM) + numIncluded+=1 + elif model=="oops": + break + #if we require one occurence per sequence then if we do not + #have a motif in this sequence we stop our search + elif model=="oops": + break + + GroupScore =0 + #if we have an oops model and not all sequences are being included, + #we have to continue without recording what we have so far + if model=="oops" and numIncluded OverallBest): + nextToCompare = sequencesToCompare[:] + MotifSeqs = [] + OrderOfBest = CompareList[:] + BestIndices = startLocs[:] + bestMotifSeqYet = BestMotifSeqs + OverallBest = GroupScore + PSFM = convert2PSFM(PWM, numIncluded) + OverallPWM=PWM[:] + for i in xrange(len(sequencesToCompare)): + SeqIndexi = CompareList[i] + #some sequences may not contain the motifs, if so you do no + #want to include them + if BestIndices[SeqIndexi] != -1: + sAdd=sequencesToCompare[SeqIndexi] + sAddI=BestIndices[SeqIndexi] + Motifi = sAdd[sAddI:sAddI+sizeOfMotif] + MotifSeqs.append(Motifi) + + #mask out original location + nextToCompare[SeqIndexi]=''.join([sAdd[:sAddI],INSERTION_N*(sizeOfMotif),sAdd[sAddI+sizeOfMotif:]]) + sAdd=nextToCompare[SeqIndexi] + sAddLen=len(sAdd) + nextToCompare[SeqIndexi]=''.join([sAdd[:sAddLen-sAddI-sizeOfMotif],INSERTION_N*(sizeOfMotif),sAdd[sAddLen-sAddI:]]) + TopMotif = convert2motif(MotifSeqs, sizeOfMotif) + + #if not more motifs are found (best score is -1) then it is likely that + #we have found as many as we can + if OverallBest == "Infinity": + print "No more motifs were found!" + return (AllPSFMS, AllPSFMseqs) + + #if a motif was found report it here and add it to the total motifs + #Print out your results if any were reached! + for i in xrange(len(sequencesToCompare)): + SeqIndexi = OrderOfBest[i] + if BestIndices[SeqIndexi] != -1: + Motifi = sequencesToCompare[SeqIndexi][BestIndices[SeqIndexi]:BestIndices[SeqIndexi]+sizeOfMotif] + MotifSeqs.append(Motifi) + + #add the current best motif to the total of all motifs for returning + #to the user + AllPSFMS.append(PSFM) + AllPSFMseqs.append(bestMotifSeqYet) + print OverallPWM + + NTDSTUFF = {0:'A',1:'C',2:'G',3:'T'} + for Pntd in [0,1,2,3]: + print "" + print NTDSTUFF[Pntd], + for Pj in xrange(len(PSFM)): + print "\t %.003f"%(PSFM[Pj][Pntd]), + + print "\n**********\n" + print TopMotif + print "\n**********\n" + print "Score:",OverallBest + maxScores=[max(OverallPWM[P_I]) for P_I in range(len(OverallPWM))] + for maski in xrange(numOfSequences): + maskThis=nextToCompare[maski] + i = 0 + max_i = len(maskThis)-sizeOfMotif + while (i<=max_i): + motif=maskThis[i:i+sizeOfMotif] + #skip this area if there are any masked regions found + nLoc=motif.rfind('N') + if nLoc >=0: + i+=nLoc+1 + continue + mmNum = 0 + j = 0 + while (j < sizeOfMotif): + if PWM[j][NTDIndices[motif[j]]]!=maxScores[j]: + mmNum+=1 + if mmNum > NumOfMismatch: + break + j+=1 + #if the location is below threshold we mask it out also + if mmNum>NumOfMismatch: + i+=1 + continue + + i+=1 + + #replace old sequences with the new one + sequencesToCompare = nextToCompare[:] + #reduce masked regions to single N + for i in xrange(len(sequences)): + splitByN=sequencesToCompare[i].split('N') + j = 0; + finalSequence="" + max_j=len(splitByN) + while j < max_j : + if len(splitByN[j])==0: + finalSequence="".join([finalSequence,'N']) + while len(splitByN[j])==0: + j+=1 + if j==max_j: + break + else: + finalSequence="".join([finalSequence,splitByN[j]]) + j+=1 + sequencesToCompare[i]=finalSequence + + return (AllPSFMS, AllPSFMseqs) + + +#MaskLocation +#input: sequence sequence to be masked +# location location at which to mask the sequence +# length length of area to mask +#output string masked sequence +def MaskLocation(sequence,start,length): + finalsequence=''.join([sequence[:start],INSERTION_FILLER*(length),sequence[start+length:]]) + newLen=len(finalsequence) + finalsequence=''.join([finalsequence[:newLen-start-length],INSERTION_FILLER*(length),finalsequence[newLen-start:]]) + + return finalsequence + + +#TimeEfficientAlignment +#input: seq0 string sequence in which to calculate a best location +# seq1 string second sequence +# start0 integer start location for alignment (gives a skew) +# start1 integer start for second sequence +# BestScore float which is the current bestscore +# percID percent of the aligned sequences that must be identical +# in order to be considered for a motif +#output float Best scoring alignment's score +# +#will be calculated as an ungapped consensus between two sequences as the +#algorithm steps through each window +def TimeEfficientAlignment(seq0, seq1, SeqIndex0, SeqIndex1, startLocs, percID): + global sizeOfMotif + NumOfMismatch = sizeOfMotif-ceil((percID)/100.0*sizeOfMotif) + bestMismatch = NumOfMismatch + MW1=MARKOV_WINDOW-1 + + #create shifted score, then check to see if it meets all + #criteria to become the best. + + i = 0 + maxi=len(seq0)-sizeOfMotif + maxj=len(seq1)-sizeOfMotif + + Score="Infinity" + + while (i<=maxi): + motif=seq0[i:sizeOfMotif+i] + Nloc=motif.rfind('N') + if Nloc>=0: + i+=Nloc+1 + continue + j=0 + while (j<=maxj): + Nloc=seq1[j:j+sizeOfMotif].rfind('N') + if Nloc>=0: + j+=Nloc+1 + continue + mismatchedNTDs=0 + for k in xrange(sizeOfMotif): + if motif[k]!=seq1[k+j]: + mismatchedNTDs+=1 + if mismatchedNTDs > bestMismatch: + break + if mismatchedNTDs == 0: + Score = 2*sizeOfMotif-bestMismatch + startLocs[SeqIndex0]=i + startLocs[SeqIndex1]=j + return (Score) + if mismatchedNTDs < bestMismatch: + bestMismatch = mismatchedNTDs + Score = 2*sizeOfMotif-bestMismatch + startLocs[SeqIndex0]=i + startLocs[SeqIndex1]=j + j+=1 + i+=1 + return (Score) + + +#AlignmentScore +#input: sequenceToCompare List of sequences whose substrings will be +# sizeOfMotif aligned integer size of the motif being found +# (length of the subseqeunce that will be aligned +# from each sequence in above list (window size) +# startLocs start locations of the motifs to be aligned to +# each other +# CompareList the indices of sequencesToCompare to be aligned +# numSeqs the number of sequences -1 from +# sequencesToCompare to be aligned. ie, the indices +# of sequenceToCompare stored in the first numSeqs +# indices of in CompareList. +# Frequency markov model being used to calculate background +# originalSeqs contain the unmasked sequences used for checking +# the markov score +# NumOfMismatch number of mismatches allowable +#output: integer Score indicating the consensus score of these +# sequences +# 2D-List contains the PSFM +# 2D-List contains the log markov scores +# +#will be calculated as an ungapped consensus between those elements in the +#CompareList Consensus score is calculated by choosing the largest number of the +#same elements in each column, and adding all these numbers up across all +#columns +def AlignmentScore(sequencesToCompare, sizeOfMotif, startLocs, CompareList, numSeqs, Frequency, originalSeqs, MARKOV_WINDOW,NumOfMismatch): + TotalScore = 0; + Scores = [] + MW1=MARKOV_WINDOW-1 + PWM = [] + Log = [] + ConsensusScore = 0 + len(sequencesToCompare) + #traverse each column individually + for i in xrange (sizeOfMotif): + divisor = 0.0 + PWMi = [0.0, 0.0, 0.0, 0.0] + Logi = [0.0, 0.0, 0.0, 0.0] + #traverse this index in each sequence + for j in xrange(numSeqs+1): + SequenceIndex = CompareList[j] + CurrSeq = sequencesToCompare[SequenceIndex] + CurrOr = originalSeqs[SequenceIndex] + #some sequences may not contain the motifs, if so you do not want + #to include them in the consensus. These have uninitialized start + #locations (ie startLocs would be -1 + if startLocs[SequenceIndex] != -1: + divisor += 1.0 + if sequencesToCompare[SequenceIndex][startLocs[SequenceIndex]+i] == 'N': + print sequencesToCompare + print "\nBAD HERE!" + print CurrSeq + print startLocs + print startLocs[SequenceIndex] + print CompareList + print j + print SequenceIndex + print numSeqs + print sizeOfMotif + print CurrSeq[startLocs[SequenceIndex]:startLocs[SequenceIndex]+sizeOfMotif] + + PWMi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += 1.0 + Logi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += Frequency[CurrOr[startLocs[SequenceIndex]-MARKOV_WINDOW+i+1:startLocs[SequenceIndex]+i+1]] + + Scores.append(0) + Top = -1 + for NTD in ['A', 'C', 'G', 'T']: + #avoid ln(0) errors + if PWMi[NTDIndices[NTD]] > 0: + Scores[i] += PWMi[NTDIndices[NTD]]/divisor *log(PWMi[NTDIndices[NTD]]/(divisor*Logi[NTDIndices[NTD]]/PWMi[NTDIndices[NTD]]), e) + if PWMi[NTDIndices[NTD]] > Top: + Top = PWMi[NTDIndices[NTD]] + TotalScore += Scores[i] + ConsensusScore += Top + PWM.append(PWMi) + Log.append(Logi) + + return (TotalScore, Scores, ConsensusScore,PWM, Log) + + +#MarkovFreq +#input: prefix string of length MARKV_SIZE - 1 prefix used for model +# actualNTD character NTD at the en dof the prefix being calculated +# Frequency Markov model for calculations +#output: float that gives the markov score for this specific sequence +# +#The helper function will run through and find all possible words with the +#prefix and determine the markov score based on this +def MarkovFreq (prefix, actualNTD, Frequency): + + denominator = 0.0 + numerator = 0.0 + for NTD in ['A', 'C', 'G', 'T']: + value = M_Score(prefix+NTD, Frequency, False,MARKOV_WINDOW) + if NTD == actualNTD : + numerator = value + denominator += value + retVal = numerator/denominator + return retVal + + +#revComp +#input: sequence DNA sequence to be converted to reverse complement +#output: string reverse complement of input sequence +# +#obtains the reverse complement of an input sequence +def revComp (sequence): + #base pairs + RevDict={'A':'T','T':'A', 'C':'G', 'G':'C', 'N':'N'} + reverse = "" + #reverse the sequene + for i in xrange(len(sequence)): + reverse = RevDict[sequence[i].upper()]+reverse + return reverse + + +#Markov3 +#input: sequences list that are being used to create the background +#output: dictionary of all 6mers (reverse complement also) and their -log2 +# proportion seen +# +#background will build a markov model of the background in order to be able +#to differentiate the motifs from the pure background of a certain size +#they will be stored as -log(fraction) +def Markov(sequences, IncludeRC,MARKOV): + MARKOV_WINDOW = MARKOV + 1 + WordDict = {} + totalWindows = 0 + + #take each sequence and use it in order to separately determine background + for seq in sequences: + #all sequences for the background must be full-length + for index in xrange(len(seq)-MARKOV): + subseq = seq[index:index+MARKOV_WINDOW].upper() + if "N" in subseq: + continue + + totalWindows += 1 + if subseq not in WordDict: + WordDict[subseq] = 0.0 + WordDict[subseq] += 1.0 + if IncludeRC: + totalWindows += 1 + RC = revComp(subseq) + if RC not in WordDict: + WordDict[RC] = 0.0 + WordDict[RC] += 1.0 + + #convert to logs + for key in WordDict: + WordDict[key] = 1.0*WordDict[key]/totalWindows + return WordDict + + +#Average_M +#input: sequences List of sequences on which to find the average +# markov score +# Model Dictionary containing pvalues for seeing 3mers +# l integer designating the word sizes from which to +# determine average pvalue +#output: average probability of all input lmers in the sequences in the Model +# +#finds the probability of seeing all subsequence in the total strings +#using the markov model created using the background. Markov3 is used +#(window size of 3) and from this determine the average. This functio will +#also screen the background model +def Average_M (sequence, Model, l,MARKOV_WINDOW): + totalSum = 0.0; + totalWords = 0.0; + MW1=MARKOV_WINDOW-1 + for seq in sequence: + for i in xrange(MW1,len(seq)-l+1): + sequenceCheck=seq[i-MW1:i+1] + Nindex=sequenceCheck.rfind('N') + if Nindex >= 0: + continue + totalWords += 1.0 #increase number of words + PVal = M_Score(sequenceCheck, Model, True,MARKOV_WINDOW) + #add current word to the running total of scores + totalSum += PVal + retVal = totalSum/totalWords + print totalWords + + return retVal + + +#M_Score +#input: sequence string for which the Pvalue is to be determined +# Model Dictionary containing log2 pvalues for seeing 6mers +# check Boolean which determines whether to also check for +# completeness of markov model +#output: log2 probability of seeing the input seqeunce in the Model +# +#gives the probability of seeing the given subsequence in the total strings +#using the markov model created using the background. Markov6 is used +#(window size of 3) +def M_Score (sequence, Model, check,MARKOV_WINDOW): + PVal = 0.0 + MW1=MARKOV_WINDOW-1 + for j in xrange(len(sequence)-MARKOV_WINDOW+1): + #if the subsequences is not in the background value it + #the program returns an exit code and asks the user to + #revise background model input + if sequence[j:j+MARKOV_WINDOW] not in Model: + if check: + print "The Markov Model is inadequate for your input", + print "sequences\n %s is"%sequence[j:j+MARKOV_WINDOW], + print "not contained in model provided\n", + print "Please revise your provided model or consider", + print "using Background Modelling provided" + sys.exit(0) + continue + #calculates score + PVal += -log(Model[sequence[j:j+MARKOV_WINDOW]],e) + + return PVal + +#LogOdds +#input: sequences sequences for which to find the log odds score +# startLocs start locations at which to start computing score +# sizeOfMotif size of the motif being created +# Freqeuncy Markov3 Model +# Index Current Indices +# CompareList Order of Comparison +#output returns the log odds score for the consensus +#the equation used is as follow: +#S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path)))) +def LogOddsoPT(sequences, startLocs, sizeOfMotif, Frequency, Index,CompareList,MARKOV_WINDOW): + MW1=MARKOV_WINDOW-1 + for i in range(sizeOfMotif): + for j in range(Index+1): + seqIndex = CompareList[j] + lnValue = [0.0, 0.0, 0.0, 0.0] + totalNum = [0.0, 0.0, 0.0, 0.0] + #add the frequency of seeing the given nucleotides in each position + #a runnning total of each one. Then add them to the equation + if startLocs[seqIndex] != -1: + index = startLocs[seqIndex]+i + previous = sequences[seqIndex][index-(MW1):index+1] + denominator = 0.0 + numerator = 0.0 + #determine probabilities for each path + for NTD in ['A', 'C', 'G', 'T']: + full = previous+NTD + if full in Frequency: + value = Frequency[full] + if NTD == sequences[seqIndex][index+1]: + numerator = value + denominator += value + #increase the summation of each location + lnValue[NTDIndices[sequences[seqIndex][index+1]]] +=numerator/denominator + #increase number of given nucleotides at the index + totalNum[NTDIndices[sequences[seqIndex][index+1]]] += 1.0 + + +#LogOdds +#input: sequence relevant part of the sequence being added to the PWM +# PWM information on sequences already in the motif +# LogsM frequency information on sequences already in motif +# Frequency markov model for background +# sizeOfMotif size f the motif being found +#output returns the log odds score for the consensus +#the equation used is as follow: +#S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path)))) +def LogOdds(PWM, LogsM, sequence, Frequency,MARKOV_WINDOW): + Score = 0 + PWMout = copy.deepcopy(PWM) + LogsMout = copy.deepcopy(LogsM) + MW1=MARKOV_WINDOW-1 + #since each column of the PWM must add up to the total umber of sequences + #in that PWM, in addition one must be added for the current sequence + totalSeqs = PWM[0][0]+PWM[0][1]+PWM[0][2]+PWM[0][3] + 1 + for j in range(len(PWMout)): + for i in ['A', 'C', 'G', 'T']: + if i == sequence[j+MW1]: + PWMout[j][NTDIndices[i]] += 1.0 + #prefix to check for prob of specific ntd given prefix + word = sequence[j:j+MARKOV_WINDOW] + #determine the frequency of the words individually + LogsMout[j][NTDIndices[i]] += Frequency[word] + if PWMout[j][NTDIndices[i]]> 0: + Score +=PWMout[j][NTDIndices[i]]/totalSeqs*log(PWMout[j][NTDIndices[i]]/(totalSeqs*LogsMout[j][NTDIndices[i]]/PWMout[j][NTDIndices[i]]),e) + + return Score, PWMout, LogsMout + + +#convert2motif +#input: sequences list containing the sequences in A,C,G,T alphabet +# comprising the motif to be converted into symbols +# size size of the motif +# maybe add threshold?!?! +#output: string motif converted into descriptive symbols +# +#takes in a list of motifs that were found at each point and converts them to +#an actual motif +def convert2motif(sequences, size): + #column composition is replaced by symbols + SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M', + 'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W', + 'CT':'Y','A':'A','C':'C','G':'G','T':'T'} + Motif = "" + #determine the composition of each column + numSeqs = len(sequences) + for i in xrange(size): + A = 0. + C = 0. + G = 0. + T = 0. + for seq in sequences: + if seq[i].upper() == "A": + A += 1 + elif seq[i].upper() == "C": + C += 1 + elif seq[i].upper() == "G": + G += 1 + else: + T += 1 + + characterCode = "" + if (A/numSeqs > 0.1): + characterCode += "A" + if (C/numSeqs > 0.1): + characterCode += "C" + if (G/numSeqs > 0.1): + characterCode += "G" + if (T/numSeqs > 0.1): + characterCode += "T" + Motif += SymbolDict[characterCode] + + return Motif + + +#PMW2motif +#input: array PWM +#output: string motif converted into descriptive symbols +# +#takes in a PWM that was created and converts it to +#an actual motif +def PWM2Motif(PWM): + #column composition is replaced by symbols + SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M', + 'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W', + 'CT':'Y','A':'A','C':'C','G':'G','T':'T'} + Motif = "" + #determine the composition of each column + for i in xrange(len(PWM)): + characterCode = "" + if (PWM[i][NTDA] > 0.1): + characterCode += "A" + if (PWM[i][NTDC] > 0.1): + characterCode += "C" + if (PWM[i][NTDG] > 0.1): + characterCode += "G" + if (PWM[i][NTDT] > 0.1): + characterCode += "T" + Motif += SymbolDict[characterCode] + + return Motif + + + +#convert2PSFM +#input: sequences list containing the sequences in A,C,G,T alphabet +# comprising the motif to be converted into symbols +# size size of the motif +#output: 2Darray will contain the PSFM where indices 0-3 of each list +# will be A,C,G,T respectively +# +#takes in a list of motifs that were found at each point and converts them to +#a PSFM +def convert2PSFM (PWM, NumOfSeqs): + #Position specific frequency matrix to be returned + PSFM = [] + #determine the composition of each column + for i in xrange(len(PWM)): + index = [] + #get frequencies for each NTD + for j in [0,1,2,3]: + index.append(PWM[i][j]/NumOfSeqs) + #add all nucleotide frequencies to the columns + PSFM.append(index) + + return PSFM + + +#convert2PWM +#input: sequences list containing the sequences in A,C,G,T alphabet +# comprising the motif to be converted into symbols +# size size of the motif +#output: 2Darray will contain the PSFM where indices 0-3 of each list +# will be A,C,G,T respectively +# +#takes in a list of motifs that were found at each point and converts them to +#a PWM +def convert2PWM (sequences, size): + #Position specific frequency matrix to be returned + PWM = [] + #determine the composition of each column + for i in xrange(size): + indices = [0.0, 0.0, 0.0, 0.0] + for seq in sequences: + indices[NTDIndices[seq[i].upper()]] += 1.0 + #add all nucleotide frequencies to the columns + PWM.append(indices) + + return PWM + + +#add2PWM +#input: sequence sequence to be added to PWM +# PWM PWM being modiifed +# +#mutator method takes in a sequence and adds it to the PWM +def add2PWM(sequence, PWM): + #determine the composition to add + for i in xrange(len(PWM)): + PWM[i][NTDIndices[sequence[i].upper()]] += 1.0 + + +#add2PWMReturn +#input: sequence sequence to be added to PWM +# PWM PWM being modiifed +# +#mutator method takes in a sequence and adds it to the PWM +def add2PWMReturn(sequence, PWM): + retPWM=copy.deepcopy(PWM) + #determine the composition to add + for i in xrange(len(retPWM)): + retPWM[i][NTDIndices[sequence[i].upper()]] += 1.0 + + return retPWM + + +#Align2PWM +#input: Motifi Sequence being aligned to PWM +# PWM PWM to which the sequence is being aligned +#output: float alignment score to the matrix +# +#takes in a PWM and aligns the sequence to this PWM and returns +#the consensus scoreo +def Align2PSFM(Motifi,PWM): + Best = 0.0 + for i in xrange(len(PWM)): + CurrentIndex = PWM[i][:] + CurrentIndex[NTDIndices[Motifi[i].upper()]] += 1.0 + Top = CurrentIndex[0] + for j in [1,2,3]: + if Top < CurrentIndex[j]: + Top = CurrentIndex[j] + Best += Top + + return Best + + +#Factorial +#input: n Number for which we will calculate the factorial +#output: float factorial of input number +# +#calculates the factorial of input number n +def Factorial(n): + #by definition factorial should be 1 + Fact = 1 + for i in xrange(2,n+1): + Fact *= i + + return Fact + + +#Choose +#input: n integer for total number of elements in a set +# k integer for total number of elements to be chose from set +#output: float the binomial coefficient of the above number, ie nCk +# +#calculates the number of ways to choose k elements from a set of n +def Choose(n, k): + return Factorial(n)/Factorial(n-k)/Factorial(k) + + +#GetMotif +#input: sequencesToCompare sequences where the motif is being found +# sequencesLast original version of the sequences +# CompareList Order in which sequences are examined +# Best Motif The location where the motifs score highest +# startLocs The current locations for the motifs being examined +# sizeOfMotif Size of the motif to be found in the sequences +# Index Index up to which the motif has been optimized so +# far +# BestScore The best score of aligned motifs +# Frequency Markov3 Model +# PWM PWM which includes alll the sequences which have +# been included so far +# PWMFounder This is the "founder" PWM, includes only the +# founder sequences' information +# ConScoreTop This value stores the Consensus score of the +# founder sequences only +# normalization normalization value for the current words +# LogsM frequency of markov words +# percID percent of the aligned sequences that must be +# identical in order to be considered for a motif to +# be considered (between the founder sequences then +# between founder sequence consensus and each other +# subsequence being considered) +# originalSeqs contain unmasked sequence for markov scores +#output:integer Score of the top scoring motif +# +#this function will align the Indexth item in the sequencesToCompare to all the +#sequences previously aligned from their best indices in a way to optimize the +#alignment score (consensus score) + +def GetMotif (sequencesToCompare, sequencesLast, CompareList, BestMotif, + startLocs, sizeOfMotif, Index, BestScore, Frequency, + PWM, PWMFounder, ConScoreTop, normalization, LogsM,percID, + originalSeqs,origLast,MARKOV_WINDOW): + + MW1=MARKOV_WINDOW-1 + #variable to indicate whether sequencesLast needs to be updated + NewBestDetermined = False + SeqIndexi = CompareList[Index] + + #search through all positions for this index + starti=MW1 + endi=len(sequencesToCompare[SeqIndexi])-sizeOfMotif + while (starti<=endi): + Motifi = sequencesToCompare[SeqIndexi][starti:starti + sizeOfMotif] + MMotifi = originalSeqs[SeqIndexi][starti-MW1:starti + sizeOfMotif] + + #if this area has already been earmarked as a motif it cannot be + #used again + Nlocation=Motifi.rfind('N') + if Nlocation >= 0: + starti+=1+Nlocation+MARKOV_WINDOW + + #check to see if the current sequence has a high enough consensus + #with the founder sequences in order to be considered further + ScoreTop = Align2PSFM(Motifi, PWMFounder) + Poss = float(ScoreTop - ConScoreTop) + GOF = Poss/sizeOfMotif + if GOF < percID/100.0: + starti+=1 + continue + + #if the word is not above noise + if M_Score(MMotifi,Frequency, False,MARKOV_WINDOW) - normalization < 0: + starti+=1 + continue + + startLocs[SeqIndexi] = starti + #new best scores are assigned by aligning the current sequence to + #the PWM + (ScoreAll, PWM2, LogsM2) = LogOdds(PWM, LogsM, originalSeqs[SeqIndexi] [starti-MW1:starti+sizeOfMotif],Frequency,MARKOV_WINDOW) + Score = ScoreAll + + #new best scores are evaluted + if (BestScore == "Infinity" and Score != "Infinity") or Score > BestScore : + #record best position + BestMotif[SeqIndexi] = startLocs[SeqIndexi] + BestScore = Score + LogsMhold = LogsM2[:] + PWMhold = PWM2[:] + NewBestDetermined = True + + starti+=1 + + startLocs[SeqIndexi] = BestMotif[SeqIndexi] + if NewBestDetermined: + PWM = PWMhold[:] + LogsM = LogsMhold[:] + + return (BestScore, NewBestDetermined, PWM, LogsM) \ No newline at end of file