--- /dev/null
+##Sarah Aerni
+##Created: July 12, 2005
+##Modified: June 22, 2006
+##Modified: Oct 22, 2008 by Ali (Minor modifications to outputs)
+##This is a greedy motif finding program.
+##Using background models it determines which motifs are overrepresented
+###############################################################################
+##include a way to go back and
+##uses the new way of doing background and the new scoring rather than simple
+##consensus it uses the log likelihood
+###############################################################################
+
+import hotshot
+import random
+import math
+import sys
+import copy
+import time
+
+from math import log
+from math import e
+from math import ceil
+from random import shuffle
+from random import randint
+
+ConsensusScore= {True:2, False:1}
+
+NTDIndices = {'A':0, 'C':1, 'G':2, 'T':3}
+IndexNTDs = {0:'A', 1:'C', 2:'G', 3:'T'}
+INSERTION_N = "N"
+
+NTDA=0
+NTDC=1
+NTDG=2
+NTDT=3
+
+INSERTION_MARKER="S"
+INSERTION_FILLER="N"
+
+#markov size - 1 = markov model
+#example:
+#if you want a markov 1 you want to check 2 ntds total
+#markov_size = 2
+#declare global variables
+global MARKOV_WINDOW
+global MW1
+global percID
+global Founders
+global UseRC
+global Frequency
+global NumOfMismatch
+Frequency={}
+global sizeOfMotif
+global sequences
+global founderPercID
+#MotifFinder with 3 arguments
+#input: sequences sequences in which we are to find a motif
+# sizeOfMotif size of the motif to be identified in the sequences
+# numberOfMotifs the number of motifs of the indicated size to be
+# identified in the sequences
+# Iterations The number of iterations that will be perfdormed in
+# order to determine an optimal motif
+# Frequency Markov Model for background
+# UseRC a boolean indicating whether the reverse complement
+# should be used i the finding of a motif
+# (True = yes)
+# Founders a boolean indicating whether the first 2 sequences
+# should always be the founder sequences
+# percID percent of the aligned sequences that must be
+# identical in order to be considered for a motif to
+# be considered (between the founder sequences then
+# between founder sequence consensus and each other
+# subsequence being considered)
+# founderPercID The minimum percent identity two aligned sequences
+# must be to qualify as founder sequences
+# MARKOV_SIZE Size of the markov model (corresponds to length of
+# the words in the dictionary Frequency)
+#output: (List1, List2) List1 contains PSFMs of the indicated length and up
+# to the indicated number (there may be fewer) in the
+# standard cistematic format, while List2 holds the
+# corresponding sequences.
+#
+#This function will call the overloaded MotifFinder function, passing in a
+#default amount for the number of iterations (currently set at 100)
+def MotifFinder(sequences, minSize, maxSize, numberOfMotifs, Iterations,
+ Frequency, UseRC, Founders, percID, founderPercID,MARKOV_SIZE,
+ model,masking):
+
+ #declare global variables
+ global MARKOV_WINDOW
+ global MW1
+ global NumOfMismatch
+ global sizeOfMotif
+ numOfSequences=len(sequences)
+
+ #the minimum window size permitted by this program is a motif of length 6
+ if minSize < 6:
+ minSize = 6
+
+ #sanity check for reasonable motif sizes
+ if minSize > maxSize:
+ print "Please input a minimum size that is smaller than a maximum size"
+ print "User input:\nMinimum Size:\t%i\nMaximumSize:\t%i" % (minSize, maxSize)
+ return None
+
+ #sanity check for reasonable percentID
+ if percID> founderPercID:
+ print "Your input %i for percent identity for founder sequences" % founderPercID,
+ print "is less than your input for general percent identity at",percID
+ print "Please adjust your input"
+ return None
+
+ #a user may input a number of iterations which exceeds the possible
+ #for a given set of input values, in which case the number of iterations
+ #should be decreased to another size
+ maxIterations = (maxSize - minSize+1)*Choose(len(sequences),2)*Factorial(len(sequences)-2)
+ if maxIterations < Iterations:
+ print "The number of Iterations you entered", Iterations,
+ print "is being reduced to",maxIterations,
+ Iterations = maxIterations
+
+ #adjusting the window as remarked above
+ MARKOV_WINDOW = MARKOV_SIZE + 1
+ MW1=MARKOV_SIZE
+ print len(sequences), "sequences search for",numberOfMotifs,
+ print " motifs of size",minSize,"to",maxSize
+ print "The percent Identity input is",percID,"%"
+
+ #this list will contain motifs found (up to the NumberOfMotifs passed in
+ sequencesToCompare = [i.upper() for i in sequences]
+ #as a preprocessing steps, a string of Ns will be replaced by one N
+ for i in xrange(len(sequences)):
+ splitByN=sequencesToCompare[i].split('N')
+ j = 0;
+ finalSequence=""
+ max_j=len(splitByN)
+ while j < max_j :
+ if len(splitByN[j])==0:
+ finalSequence="".join([finalSequence,'N'])
+ while len(splitByN[j])==0:
+ j+=1
+ if j==max_j:
+ break
+ else:
+ finalSequence="".join([finalSequence,splitByN[j]])
+ j+=1
+ sequencesToCompare[i]=finalSequence
+ originalSeqs = []
+ aveCalcSeqs = []
+
+ if UseRC:
+ for seqIndex in xrange(len(sequencesToCompare)):
+ sequence = revComp(sequencesToCompare[seqIndex])
+ aveCalcSeqs.append(''.join([sequencesToCompare[seqIndex], sequence]))
+ originalSeqs.append(''.join([sequencesToCompare[seqIndex],INSERTION_N,sequence]))
+ sequencesToCompare[seqIndex] = ''.join([sequencesToCompare[seqIndex],INSERTION_N*(MARKOV_WINDOW),sequence[MW1:]])
+ sequence = ''.join([INSERTION_N*(MW1),sequencesToCompare[seqIndex][MW1:]])
+ sequencesToCompare[seqIndex] = sequence
+
+ AllPSFMS = []
+ AllPSFMseqs = []
+ for motif_i in xrange(numberOfMotifs):
+ empty=min([len(sequencesToCompare[i]) for i in xrange(len(sequencesToCompare))])
+ if empty < maxSize:
+ print "A sequence has been depleted of all options"
+ return (AllPSFMS, AllPSFMseqs)
+
+ OverallBest = "Infinity"
+ print "MOTIF NUMBER %i\n" % motif_i
+ #multiple iterations are attempted to avoid local maxima
+ for step in xrange(Iterations):
+ sys.stdout.flush()
+ numIncluded = 2 #used in order to create PSFMs from PWMs
+ #CompareList will be used to randomly generate the sequences to be
+ #compared
+ CompareList = range(len(sequencesToCompare))
+ BestScore = "Infinity"
+ #find the best shared motif between the first two sequences
+ #the motif must be completely contained within the sequence
+ #or its reverse complement
+ NumberOfTries = 0
+ while (BestScore == "Infinity"):
+ #PWM created from all sequences included in the motif
+ PWM = []
+ #pick motif size randomly within the range provided by the user
+ sizeOfMotif = random.randint(minSize,maxSize)
+ NumOfMismatch = sizeOfMotif-ceil((percID)/100.0*sizeOfMotif)
+ #if the motif average is not yet established create the average
+ #and store tis value in the dictionary
+
+ #best motif startpoints will be stored in this list of start
+ #locations
+ BestMotif = [-1] * len(sequencesToCompare)
+ #these are start locations are used in order to test new best
+ #start indices
+ startLocs = BestMotif[:]
+ #if it appears there are no more motifs possible quit
+ if (NumberOfTries > Choose(len(sequencesToCompare), 2)):
+ print "No more motifs to be found after %i tries" % NumberOfTries
+ return (AllPSFMS, AllPSFMseqs)
+ NumberOfTries +=1
+ #randomize sequence order
+ #if the sequence founders are input then you only want to
+ #randomize the remaining sequences
+ if Founders:
+ CompareList = [0,1]
+ remaining = range(2,len(sequencesToCompare))
+ shuffle(remaining)
+ CompareList.extend(remaining)
+ #if no founders are specified, randomize all
+ else:
+ shuffle(CompareList)
+
+ SeqIndex0 = CompareList[0]
+ SeqIndex1 = CompareList[1]
+ #founder sequences - first 2 sequences - are selected
+ #create random start sites to avoid favoring the 5' region
+ startLocs[SeqIndex0] = -1
+ startLocs[SeqIndex1] = -1
+ BestMotif[SeqIndex0] = -1
+ BestMotif[SeqIndex1] = -1
+ BestScore = "Infinity"
+ #define the best motif between the first two sequences the
+ #inital set of comparisons are performed at the same indices in
+ #both "founder" sequences
+
+ #to ensure that the startpoint is random we will rearrange the
+ # sequences
+ start0 = randint(0,len(sequencesToCompare[SeqIndex0])-sizeOfMotif-1)
+ accessSeq=sequencesToCompare[SeqIndex0]
+ seq0 = ''.join([accessSeq[start0+1:],'N',accessSeq[:start0+sizeOfMotif]])
+ start1 = randint(0,len(sequencesToCompare[SeqIndex1])-sizeOfMotif-1)
+ accessSeq=sequencesToCompare[SeqIndex1]
+ seq1 = ''.join([accessSeq[start1+1:],'N',accessSeq[:start1+sizeOfMotif]])
+
+ #create shifted score, then check to see if it meets all
+ #criteria to become the best.
+
+ #Perform a time efficient alignment of the founder sequences
+ (BestScore) = TimeEfficientAlignment(seq0,seq1,SeqIndex0,SeqIndex1,startLocs, founderPercID)
+
+ if BestScore=="Infinity":
+ continue
+ NLoc0=len(sequencesToCompare[SeqIndex0])-start0-1
+ NLoc1=len(sequencesToCompare[SeqIndex1])-start1-1
+
+ #remap to the correct location
+ if startLocs[SeqIndex0] > NLoc0:
+ startLocs[SeqIndex0]=(startLocs[SeqIndex0]+start0)%len(sequencesToCompare[SeqIndex0])
+ else:
+ startLocs[SeqIndex0]=(startLocs[SeqIndex0]+start0+1)
+
+ if startLocs[SeqIndex1] > NLoc1:
+ startLocs[SeqIndex1]=(startLocs[SeqIndex1]+start1)%len(sequencesToCompare[SeqIndex1])
+ else:
+ startLocs[SeqIndex1]=(startLocs[SeqIndex1]+start1+1)
+
+ #if we found a good score get all the info necessary to
+ #continue, including a PWM and updating the startLocs
+ if BestScore != "Infinity":
+ index0 = startLocs[SeqIndex0]
+ index1 = startLocs[SeqIndex1]
+ seq0 = sequencesToCompare[SeqIndex0]
+ seq1 = sequencesToCompare[SeqIndex1]
+ motif0 = seq0[index0:index0+sizeOfMotif]
+ motif1 = seq1[index1:index1+sizeOfMotif]
+ BestMotifSeqs=[motif0,motif1]
+ PWM = convert2PWM([motif0, motif1],sizeOfMotif)
+
+ #this PWM will be used in order to establish a criterion
+ #for future sequences, whether they meet a threshold
+ #similiarty to the founder sequences
+
+ #if it exists, find the best scoring motifs in the remaining
+ #sequences.
+
+ for seqCompIndex in xrange(2, len(sequencesToCompare)):
+ maxScores=[max(PWM[P_I]) for P_I in range(len(PWM))]
+ BestScore = "Infinity"
+ seq=CompareList[seqCompIndex]
+ sequenceToAdd = sequencesToCompare[seq]
+ max_i=len(sequenceToAdd)-sizeOfMotif
+ bestMismatch = NumOfMismatch
+ #to ensure that the startpoint is random we will rearrange the
+ #sequences
+ start_i = randint(0,max_i-1)
+ seq_i = ''.join([sequenceToAdd[start_i+1:],'N',sequenceToAdd[:start_i+sizeOfMotif]])
+ i=0
+ while (i<=max_i):
+ motif=seq_i[i:i+sizeOfMotif]
+ #skip this area if there are any masked regions found
+ nLoc=motif.rfind('N')
+ if nLoc >=0:
+ i+=nLoc+1
+ continue
+
+ mmNum = 0
+ j = 0
+ while (j < sizeOfMotif):
+ mmNum+=int(PWM[j][NTDIndices[motif[j]]]!=maxScores[j])
+ if mmNum > bestMismatch:
+ break
+ j+=1
+
+ #set a new best if we found it (always keep originals)
+ if mmNum < bestMismatch:
+ bestMismatch = mmNum
+ startLocs[seq]=i
+
+ #we cannot improve upon a perfect match so we break end our
+ #search if this occurs
+ if mmNum == 0:
+ startLocs[seq]=i
+ break
+ i+=1
+
+ #if somethigng was found we add it to our PWM
+ if startLocs[seq]!= -1:
+ NLoc_i=len(sequenceToAdd)-start_i-1
+ if startLocs[seq] > NLoc_i:
+ startLocs[seq]=(startLocs[seq]+start_i)%len(sequenceToAdd)
+ else:
+ startLocs[seq]=(startLocs[seq]+start_i+1)
+
+ starti=startLocs[seq]
+ sAdd=sequencesToCompare[seq][starti:starti+sizeOfMotif]
+ checkPWM=add2PWMReturn(sAdd,PWM)
+ add=True
+ checkMaxScores=[max(checkPWM[P_I]) for P_I in range(len(checkPWM))]
+ for seq in BestMotifSeqs:
+ mismatches=0
+ for i in range(len(seq)):
+ mismatches+=int(checkPWM[i][NTDIndices[seq[i]]]!=checkMaxScores[i])
+ if mismatches > NumOfMismatch:
+ add=False
+ break
+ if add:
+ BestMotifSeqs.append(sAdd)
+ PWM=copy.deepcopy(checkPWM)
+ numIncluded+=1
+ elif model=="oops":
+ break
+ #if we require one occurence per sequence then if we do not
+ #have a motif in this sequence we stop our search
+ elif model=="oops":
+ break
+
+ GroupScore =0
+ #if we have an oops model and not all sequences are being included,
+ #we have to continue without recording what we have so far
+ if model=="oops" and numIncluded<numOfSequences:
+ continue
+
+ #obtain motif scores
+ GroupScore=0
+ for i in range(len(PWM)):
+ GroupScore += max(PWM[i])
+
+ #store this as the best location if it is the best of this set of
+ #motifs part of that if statement below
+ if ((OverallBest == "Infinity" and GroupScore != "Infinity" )or \
+ GroupScore > OverallBest):
+ nextToCompare = sequencesToCompare[:]
+ MotifSeqs = []
+ OrderOfBest = CompareList[:]
+ BestIndices = startLocs[:]
+ bestMotifSeqYet = BestMotifSeqs
+ OverallBest = GroupScore
+ PSFM = convert2PSFM(PWM, numIncluded)
+ OverallPWM=PWM[:]
+ for i in xrange(len(sequencesToCompare)):
+ SeqIndexi = CompareList[i]
+ #some sequences may not contain the motifs, if so you do no
+ #want to include them
+ if BestIndices[SeqIndexi] != -1:
+ sAdd=sequencesToCompare[SeqIndexi]
+ sAddI=BestIndices[SeqIndexi]
+ Motifi = sAdd[sAddI:sAddI+sizeOfMotif]
+ MotifSeqs.append(Motifi)
+
+ #mask out original location
+ nextToCompare[SeqIndexi]=''.join([sAdd[:sAddI],INSERTION_N*(sizeOfMotif),sAdd[sAddI+sizeOfMotif:]])
+ sAdd=nextToCompare[SeqIndexi]
+ sAddLen=len(sAdd)
+ nextToCompare[SeqIndexi]=''.join([sAdd[:sAddLen-sAddI-sizeOfMotif],INSERTION_N*(sizeOfMotif),sAdd[sAddLen-sAddI:]])
+ TopMotif = convert2motif(MotifSeqs, sizeOfMotif)
+
+ #if not more motifs are found (best score is -1) then it is likely that
+ #we have found as many as we can
+ if OverallBest == "Infinity":
+ print "No more motifs were found!"
+ return (AllPSFMS, AllPSFMseqs)
+
+ #if a motif was found report it here and add it to the total motifs
+ #Print out your results if any were reached!
+ for i in xrange(len(sequencesToCompare)):
+ SeqIndexi = OrderOfBest[i]
+ if BestIndices[SeqIndexi] != -1:
+ Motifi = sequencesToCompare[SeqIndexi][BestIndices[SeqIndexi]:BestIndices[SeqIndexi]+sizeOfMotif]
+ MotifSeqs.append(Motifi)
+
+ #add the current best motif to the total of all motifs for returning
+ #to the user
+ AllPSFMS.append(PSFM)
+ AllPSFMseqs.append(bestMotifSeqYet)
+ print OverallPWM
+
+ NTDSTUFF = {0:'A',1:'C',2:'G',3:'T'}
+ for Pntd in [0,1,2,3]:
+ print ""
+ print NTDSTUFF[Pntd],
+ for Pj in xrange(len(PSFM)):
+ print "\t %.003f"%(PSFM[Pj][Pntd]),
+
+ print "\n**********\n"
+ print TopMotif
+ print "\n**********\n"
+ print "Score:",OverallBest
+ maxScores=[max(OverallPWM[P_I]) for P_I in range(len(OverallPWM))]
+ for maski in xrange(numOfSequences):
+ maskThis=nextToCompare[maski]
+ i = 0
+ max_i = len(maskThis)-sizeOfMotif
+ while (i<=max_i):
+ motif=maskThis[i:i+sizeOfMotif]
+ #skip this area if there are any masked regions found
+ nLoc=motif.rfind('N')
+ if nLoc >=0:
+ i+=nLoc+1
+ continue
+ mmNum = 0
+ j = 0
+ while (j < sizeOfMotif):
+ if PWM[j][NTDIndices[motif[j]]]!=maxScores[j]:
+ mmNum+=1
+ if mmNum > NumOfMismatch:
+ break
+ j+=1
+ #if the location is below threshold we mask it out also
+ if mmNum>NumOfMismatch:
+ i+=1
+ continue
+
+ i+=1
+
+ #replace old sequences with the new one
+ sequencesToCompare = nextToCompare[:]
+ #reduce masked regions to single N
+ for i in xrange(len(sequences)):
+ splitByN=sequencesToCompare[i].split('N')
+ j = 0;
+ finalSequence=""
+ max_j=len(splitByN)
+ while j < max_j :
+ if len(splitByN[j])==0:
+ finalSequence="".join([finalSequence,'N'])
+ while len(splitByN[j])==0:
+ j+=1
+ if j==max_j:
+ break
+ else:
+ finalSequence="".join([finalSequence,splitByN[j]])
+ j+=1
+ sequencesToCompare[i]=finalSequence
+
+ return (AllPSFMS, AllPSFMseqs)
+
+
+#MaskLocation
+#input: sequence sequence to be masked
+# location location at which to mask the sequence
+# length length of area to mask
+#output string masked sequence
+def MaskLocation(sequence,start,length):
+ finalsequence=''.join([sequence[:start],INSERTION_FILLER*(length),sequence[start+length:]])
+ newLen=len(finalsequence)
+ finalsequence=''.join([finalsequence[:newLen-start-length],INSERTION_FILLER*(length),finalsequence[newLen-start:]])
+
+ return finalsequence
+
+
+#TimeEfficientAlignment
+#input: seq0 string sequence in which to calculate a best location
+# seq1 string second sequence
+# start0 integer start location for alignment (gives a skew)
+# start1 integer start for second sequence
+# BestScore float which is the current bestscore
+# percID percent of the aligned sequences that must be identical
+# in order to be considered for a motif
+#output float Best scoring alignment's score
+#
+#will be calculated as an ungapped consensus between two sequences as the
+#algorithm steps through each window
+def TimeEfficientAlignment(seq0, seq1, SeqIndex0, SeqIndex1, startLocs, percID):
+ global sizeOfMotif
+ NumOfMismatch = sizeOfMotif-ceil((percID)/100.0*sizeOfMotif)
+ bestMismatch = NumOfMismatch
+ MW1=MARKOV_WINDOW-1
+
+ #create shifted score, then check to see if it meets all
+ #criteria to become the best.
+
+ i = 0
+ maxi=len(seq0)-sizeOfMotif
+ maxj=len(seq1)-sizeOfMotif
+
+ Score="Infinity"
+
+ while (i<=maxi):
+ motif=seq0[i:sizeOfMotif+i]
+ Nloc=motif.rfind('N')
+ if Nloc>=0:
+ i+=Nloc+1
+ continue
+ j=0
+ while (j<=maxj):
+ Nloc=seq1[j:j+sizeOfMotif].rfind('N')
+ if Nloc>=0:
+ j+=Nloc+1
+ continue
+ mismatchedNTDs=0
+ for k in xrange(sizeOfMotif):
+ if motif[k]!=seq1[k+j]:
+ mismatchedNTDs+=1
+ if mismatchedNTDs > bestMismatch:
+ break
+ if mismatchedNTDs == 0:
+ Score = 2*sizeOfMotif-bestMismatch
+ startLocs[SeqIndex0]=i
+ startLocs[SeqIndex1]=j
+ return (Score)
+ if mismatchedNTDs < bestMismatch:
+ bestMismatch = mismatchedNTDs
+ Score = 2*sizeOfMotif-bestMismatch
+ startLocs[SeqIndex0]=i
+ startLocs[SeqIndex1]=j
+ j+=1
+ i+=1
+ return (Score)
+
+
+#AlignmentScore
+#input: sequenceToCompare List of sequences whose substrings will be
+# sizeOfMotif aligned integer size of the motif being found
+# (length of the subseqeunce that will be aligned
+# from each sequence in above list (window size)
+# startLocs start locations of the motifs to be aligned to
+# each other
+# CompareList the indices of sequencesToCompare to be aligned
+# numSeqs the number of sequences -1 from
+# sequencesToCompare to be aligned. ie, the indices
+# of sequenceToCompare stored in the first numSeqs
+# indices of in CompareList.
+# Frequency markov model being used to calculate background
+# originalSeqs contain the unmasked sequences used for checking
+# the markov score
+# NumOfMismatch number of mismatches allowable
+#output: integer Score indicating the consensus score of these
+# sequences
+# 2D-List contains the PSFM
+# 2D-List contains the log markov scores
+#
+#will be calculated as an ungapped consensus between those elements in the
+#CompareList Consensus score is calculated by choosing the largest number of the
+#same elements in each column, and adding all these numbers up across all
+#columns
+def AlignmentScore(sequencesToCompare, sizeOfMotif, startLocs, CompareList, numSeqs, Frequency, originalSeqs, MARKOV_WINDOW,NumOfMismatch):
+ TotalScore = 0;
+ Scores = []
+ MW1=MARKOV_WINDOW-1
+ PWM = []
+ Log = []
+ ConsensusScore = 0
+ len(sequencesToCompare)
+ #traverse each column individually
+ for i in xrange (sizeOfMotif):
+ divisor = 0.0
+ PWMi = [0.0, 0.0, 0.0, 0.0]
+ Logi = [0.0, 0.0, 0.0, 0.0]
+ #traverse this index in each sequence
+ for j in xrange(numSeqs+1):
+ SequenceIndex = CompareList[j]
+ CurrSeq = sequencesToCompare[SequenceIndex]
+ CurrOr = originalSeqs[SequenceIndex]
+ #some sequences may not contain the motifs, if so you do not want
+ #to include them in the consensus. These have uninitialized start
+ #locations (ie startLocs would be -1
+ if startLocs[SequenceIndex] != -1:
+ divisor += 1.0
+ if sequencesToCompare[SequenceIndex][startLocs[SequenceIndex]+i] == 'N':
+ print sequencesToCompare
+ print "\nBAD HERE!"
+ print CurrSeq
+ print startLocs
+ print startLocs[SequenceIndex]
+ print CompareList
+ print j
+ print SequenceIndex
+ print numSeqs
+ print sizeOfMotif
+ print CurrSeq[startLocs[SequenceIndex]:startLocs[SequenceIndex]+sizeOfMotif]
+
+ PWMi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += 1.0
+ Logi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += Frequency[CurrOr[startLocs[SequenceIndex]-MARKOV_WINDOW+i+1:startLocs[SequenceIndex]+i+1]]
+
+ Scores.append(0)
+ Top = -1
+ for NTD in ['A', 'C', 'G', 'T']:
+ #avoid ln(0) errors
+ if PWMi[NTDIndices[NTD]] > 0:
+ Scores[i] += PWMi[NTDIndices[NTD]]/divisor *log(PWMi[NTDIndices[NTD]]/(divisor*Logi[NTDIndices[NTD]]/PWMi[NTDIndices[NTD]]), e)
+ if PWMi[NTDIndices[NTD]] > Top:
+ Top = PWMi[NTDIndices[NTD]]
+ TotalScore += Scores[i]
+ ConsensusScore += Top
+ PWM.append(PWMi)
+ Log.append(Logi)
+
+ return (TotalScore, Scores, ConsensusScore,PWM, Log)
+
+
+#MarkovFreq
+#input: prefix string of length MARKV_SIZE - 1 prefix used for model
+# actualNTD character NTD at the en dof the prefix being calculated
+# Frequency Markov model for calculations
+#output: float that gives the markov score for this specific sequence
+#
+#The helper function will run through and find all possible words with the
+#prefix and determine the markov score based on this
+def MarkovFreq (prefix, actualNTD, Frequency):
+
+ denominator = 0.0
+ numerator = 0.0
+ for NTD in ['A', 'C', 'G', 'T']:
+ value = M_Score(prefix+NTD, Frequency, False,MARKOV_WINDOW)
+ if NTD == actualNTD :
+ numerator = value
+ denominator += value
+ retVal = numerator/denominator
+ return retVal
+
+
+#revComp
+#input: sequence DNA sequence to be converted to reverse complement
+#output: string reverse complement of input sequence
+#
+#obtains the reverse complement of an input sequence
+def revComp (sequence):
+ #base pairs
+ RevDict={'A':'T','T':'A', 'C':'G', 'G':'C', 'N':'N'}
+ reverse = ""
+ #reverse the sequene
+ for i in xrange(len(sequence)):
+ reverse = RevDict[sequence[i].upper()]+reverse
+ return reverse
+
+
+#Markov3
+#input: sequences list that are being used to create the background
+#output: dictionary of all 6mers (reverse complement also) and their -log2
+# proportion seen
+#
+#background will build a markov model of the background in order to be able
+#to differentiate the motifs from the pure background of a certain size
+#they will be stored as -log(fraction)
+def Markov(sequences, IncludeRC,MARKOV):
+ MARKOV_WINDOW = MARKOV + 1
+ WordDict = {}
+ totalWindows = 0
+
+ #take each sequence and use it in order to separately determine background
+ for seq in sequences:
+ #all sequences for the background must be full-length
+ for index in xrange(len(seq)-MARKOV):
+ subseq = seq[index:index+MARKOV_WINDOW].upper()
+ if "N" in subseq:
+ continue
+
+ totalWindows += 1
+ if subseq not in WordDict:
+ WordDict[subseq] = 0.0
+ WordDict[subseq] += 1.0
+ if IncludeRC:
+ totalWindows += 1
+ RC = revComp(subseq)
+ if RC not in WordDict:
+ WordDict[RC] = 0.0
+ WordDict[RC] += 1.0
+
+ #convert to logs
+ for key in WordDict:
+ WordDict[key] = 1.0*WordDict[key]/totalWindows
+ return WordDict
+
+
+#Average_M
+#input: sequences List of sequences on which to find the average
+# markov score
+# Model Dictionary containing pvalues for seeing 3mers
+# l integer designating the word sizes from which to
+# determine average pvalue
+#output: average probability of all input lmers in the sequences in the Model
+#
+#finds the probability of seeing all subsequence in the total strings
+#using the markov model created using the background. Markov3 is used
+#(window size of 3) and from this determine the average. This functio will
+#also screen the background model
+def Average_M (sequence, Model, l,MARKOV_WINDOW):
+ totalSum = 0.0;
+ totalWords = 0.0;
+ MW1=MARKOV_WINDOW-1
+ for seq in sequence:
+ for i in xrange(MW1,len(seq)-l+1):
+ sequenceCheck=seq[i-MW1:i+1]
+ Nindex=sequenceCheck.rfind('N')
+ if Nindex >= 0:
+ continue
+ totalWords += 1.0 #increase number of words
+ PVal = M_Score(sequenceCheck, Model, True,MARKOV_WINDOW)
+ #add current word to the running total of scores
+ totalSum += PVal
+ retVal = totalSum/totalWords
+ print totalWords
+
+ return retVal
+
+
+#M_Score
+#input: sequence string for which the Pvalue is to be determined
+# Model Dictionary containing log2 pvalues for seeing 6mers
+# check Boolean which determines whether to also check for
+# completeness of markov model
+#output: log2 probability of seeing the input seqeunce in the Model
+#
+#gives the probability of seeing the given subsequence in the total strings
+#using the markov model created using the background. Markov6 is used
+#(window size of 3)
+def M_Score (sequence, Model, check,MARKOV_WINDOW):
+ PVal = 0.0
+ MW1=MARKOV_WINDOW-1
+ for j in xrange(len(sequence)-MARKOV_WINDOW+1):
+ #if the subsequences is not in the background value it
+ #the program returns an exit code and asks the user to
+ #revise background model input
+ if sequence[j:j+MARKOV_WINDOW] not in Model:
+ if check:
+ print "The Markov Model is inadequate for your input",
+ print "sequences\n %s is"%sequence[j:j+MARKOV_WINDOW],
+ print "not contained in model provided\n",
+ print "Please revise your provided model or consider",
+ print "using Background Modelling provided"
+ sys.exit(0)
+ continue
+ #calculates score
+ PVal += -log(Model[sequence[j:j+MARKOV_WINDOW]],e)
+
+ return PVal
+
+#LogOdds
+#input: sequences sequences for which to find the log odds score
+# startLocs start locations at which to start computing score
+# sizeOfMotif size of the motif being created
+# Freqeuncy Markov3 Model
+# Index Current Indices
+# CompareList Order of Comparison
+#output returns the log odds score for the consensus
+#the equation used is as follow:
+#S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path))))
+def LogOddsoPT(sequences, startLocs, sizeOfMotif, Frequency, Index,CompareList,MARKOV_WINDOW):
+ MW1=MARKOV_WINDOW-1
+ for i in range(sizeOfMotif):
+ for j in range(Index+1):
+ seqIndex = CompareList[j]
+ lnValue = [0.0, 0.0, 0.0, 0.0]
+ totalNum = [0.0, 0.0, 0.0, 0.0]
+ #add the frequency of seeing the given nucleotides in each position
+ #a runnning total of each one. Then add them to the equation
+ if startLocs[seqIndex] != -1:
+ index = startLocs[seqIndex]+i
+ previous = sequences[seqIndex][index-(MW1):index+1]
+ denominator = 0.0
+ numerator = 0.0
+ #determine probabilities for each path
+ for NTD in ['A', 'C', 'G', 'T']:
+ full = previous+NTD
+ if full in Frequency:
+ value = Frequency[full]
+ if NTD == sequences[seqIndex][index+1]:
+ numerator = value
+ denominator += value
+ #increase the summation of each location
+ lnValue[NTDIndices[sequences[seqIndex][index+1]]] +=numerator/denominator
+ #increase number of given nucleotides at the index
+ totalNum[NTDIndices[sequences[seqIndex][index+1]]] += 1.0
+
+
+#LogOdds
+#input: sequence relevant part of the sequence being added to the PWM
+# PWM information on sequences already in the motif
+# LogsM frequency information on sequences already in motif
+# Frequency markov model for background
+# sizeOfMotif size f the motif being found
+#output returns the log odds score for the consensus
+#the equation used is as follow:
+#S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path))))
+def LogOdds(PWM, LogsM, sequence, Frequency,MARKOV_WINDOW):
+ Score = 0
+ PWMout = copy.deepcopy(PWM)
+ LogsMout = copy.deepcopy(LogsM)
+ MW1=MARKOV_WINDOW-1
+ #since each column of the PWM must add up to the total umber of sequences
+ #in that PWM, in addition one must be added for the current sequence
+ totalSeqs = PWM[0][0]+PWM[0][1]+PWM[0][2]+PWM[0][3] + 1
+ for j in range(len(PWMout)):
+ for i in ['A', 'C', 'G', 'T']:
+ if i == sequence[j+MW1]:
+ PWMout[j][NTDIndices[i]] += 1.0
+ #prefix to check for prob of specific ntd given prefix
+ word = sequence[j:j+MARKOV_WINDOW]
+ #determine the frequency of the words individually
+ LogsMout[j][NTDIndices[i]] += Frequency[word]
+ if PWMout[j][NTDIndices[i]]> 0:
+ Score +=PWMout[j][NTDIndices[i]]/totalSeqs*log(PWMout[j][NTDIndices[i]]/(totalSeqs*LogsMout[j][NTDIndices[i]]/PWMout[j][NTDIndices[i]]),e)
+
+ return Score, PWMout, LogsMout
+
+
+#convert2motif
+#input: sequences list containing the sequences in A,C,G,T alphabet
+# comprising the motif to be converted into symbols
+# size size of the motif
+# maybe add threshold?!?!
+#output: string motif converted into descriptive symbols
+#
+#takes in a list of motifs that were found at each point and converts them to
+#an actual motif
+def convert2motif(sequences, size):
+ #column composition is replaced by symbols
+ SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M',
+ 'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W',
+ 'CT':'Y','A':'A','C':'C','G':'G','T':'T'}
+ Motif = ""
+ #determine the composition of each column
+ numSeqs = len(sequences)
+ for i in xrange(size):
+ A = 0.
+ C = 0.
+ G = 0.
+ T = 0.
+ for seq in sequences:
+ if seq[i].upper() == "A":
+ A += 1
+ elif seq[i].upper() == "C":
+ C += 1
+ elif seq[i].upper() == "G":
+ G += 1
+ else:
+ T += 1
+
+ characterCode = ""
+ if (A/numSeqs > 0.1):
+ characterCode += "A"
+ if (C/numSeqs > 0.1):
+ characterCode += "C"
+ if (G/numSeqs > 0.1):
+ characterCode += "G"
+ if (T/numSeqs > 0.1):
+ characterCode += "T"
+ Motif += SymbolDict[characterCode]
+
+ return Motif
+
+
+#PMW2motif
+#input: array PWM
+#output: string motif converted into descriptive symbols
+#
+#takes in a PWM that was created and converts it to
+#an actual motif
+def PWM2Motif(PWM):
+ #column composition is replaced by symbols
+ SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M',
+ 'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W',
+ 'CT':'Y','A':'A','C':'C','G':'G','T':'T'}
+ Motif = ""
+ #determine the composition of each column
+ for i in xrange(len(PWM)):
+ characterCode = ""
+ if (PWM[i][NTDA] > 0.1):
+ characterCode += "A"
+ if (PWM[i][NTDC] > 0.1):
+ characterCode += "C"
+ if (PWM[i][NTDG] > 0.1):
+ characterCode += "G"
+ if (PWM[i][NTDT] > 0.1):
+ characterCode += "T"
+ Motif += SymbolDict[characterCode]
+
+ return Motif
+
+
+
+#convert2PSFM
+#input: sequences list containing the sequences in A,C,G,T alphabet
+# comprising the motif to be converted into symbols
+# size size of the motif
+#output: 2Darray will contain the PSFM where indices 0-3 of each list
+# will be A,C,G,T respectively
+#
+#takes in a list of motifs that were found at each point and converts them to
+#a PSFM
+def convert2PSFM (PWM, NumOfSeqs):
+ #Position specific frequency matrix to be returned
+ PSFM = []
+ #determine the composition of each column
+ for i in xrange(len(PWM)):
+ index = []
+ #get frequencies for each NTD
+ for j in [0,1,2,3]:
+ index.append(PWM[i][j]/NumOfSeqs)
+ #add all nucleotide frequencies to the columns
+ PSFM.append(index)
+
+ return PSFM
+
+
+#convert2PWM
+#input: sequences list containing the sequences in A,C,G,T alphabet
+# comprising the motif to be converted into symbols
+# size size of the motif
+#output: 2Darray will contain the PSFM where indices 0-3 of each list
+# will be A,C,G,T respectively
+#
+#takes in a list of motifs that were found at each point and converts them to
+#a PWM
+def convert2PWM (sequences, size):
+ #Position specific frequency matrix to be returned
+ PWM = []
+ #determine the composition of each column
+ for i in xrange(size):
+ indices = [0.0, 0.0, 0.0, 0.0]
+ for seq in sequences:
+ indices[NTDIndices[seq[i].upper()]] += 1.0
+ #add all nucleotide frequencies to the columns
+ PWM.append(indices)
+
+ return PWM
+
+
+#add2PWM
+#input: sequence sequence to be added to PWM
+# PWM PWM being modiifed
+#
+#mutator method takes in a sequence and adds it to the PWM
+def add2PWM(sequence, PWM):
+ #determine the composition to add
+ for i in xrange(len(PWM)):
+ PWM[i][NTDIndices[sequence[i].upper()]] += 1.0
+
+
+#add2PWMReturn
+#input: sequence sequence to be added to PWM
+# PWM PWM being modiifed
+#
+#mutator method takes in a sequence and adds it to the PWM
+def add2PWMReturn(sequence, PWM):
+ retPWM=copy.deepcopy(PWM)
+ #determine the composition to add
+ for i in xrange(len(retPWM)):
+ retPWM[i][NTDIndices[sequence[i].upper()]] += 1.0
+
+ return retPWM
+
+
+#Align2PWM
+#input: Motifi Sequence being aligned to PWM
+# PWM PWM to which the sequence is being aligned
+#output: float alignment score to the matrix
+#
+#takes in a PWM and aligns the sequence to this PWM and returns
+#the consensus scoreo
+def Align2PSFM(Motifi,PWM):
+ Best = 0.0
+ for i in xrange(len(PWM)):
+ CurrentIndex = PWM[i][:]
+ CurrentIndex[NTDIndices[Motifi[i].upper()]] += 1.0
+ Top = CurrentIndex[0]
+ for j in [1,2,3]:
+ if Top < CurrentIndex[j]:
+ Top = CurrentIndex[j]
+ Best += Top
+
+ return Best
+
+
+#Factorial
+#input: n Number for which we will calculate the factorial
+#output: float factorial of input number
+#
+#calculates the factorial of input number n
+def Factorial(n):
+ #by definition factorial should be 1
+ Fact = 1
+ for i in xrange(2,n+1):
+ Fact *= i
+
+ return Fact
+
+
+#Choose
+#input: n integer for total number of elements in a set
+# k integer for total number of elements to be chose from set
+#output: float the binomial coefficient of the above number, ie nCk
+#
+#calculates the number of ways to choose k elements from a set of n
+def Choose(n, k):
+ return Factorial(n)/Factorial(n-k)/Factorial(k)
+
+
+#GetMotif
+#input: sequencesToCompare sequences where the motif is being found
+# sequencesLast original version of the sequences
+# CompareList Order in which sequences are examined
+# Best Motif The location where the motifs score highest
+# startLocs The current locations for the motifs being examined
+# sizeOfMotif Size of the motif to be found in the sequences
+# Index Index up to which the motif has been optimized so
+# far
+# BestScore The best score of aligned motifs
+# Frequency Markov3 Model
+# PWM PWM which includes alll the sequences which have
+# been included so far
+# PWMFounder This is the "founder" PWM, includes only the
+# founder sequences' information
+# ConScoreTop This value stores the Consensus score of the
+# founder sequences only
+# normalization normalization value for the current words
+# LogsM frequency of markov words
+# percID percent of the aligned sequences that must be
+# identical in order to be considered for a motif to
+# be considered (between the founder sequences then
+# between founder sequence consensus and each other
+# subsequence being considered)
+# originalSeqs contain unmasked sequence for markov scores
+#output:integer Score of the top scoring motif
+#
+#this function will align the Indexth item in the sequencesToCompare to all the
+#sequences previously aligned from their best indices in a way to optimize the
+#alignment score (consensus score)
+
+def GetMotif (sequencesToCompare, sequencesLast, CompareList, BestMotif,
+ startLocs, sizeOfMotif, Index, BestScore, Frequency,
+ PWM, PWMFounder, ConScoreTop, normalization, LogsM,percID,
+ originalSeqs,origLast,MARKOV_WINDOW):
+
+ MW1=MARKOV_WINDOW-1
+ #variable to indicate whether sequencesLast needs to be updated
+ NewBestDetermined = False
+ SeqIndexi = CompareList[Index]
+
+ #search through all positions for this index
+ starti=MW1
+ endi=len(sequencesToCompare[SeqIndexi])-sizeOfMotif
+ while (starti<=endi):
+ Motifi = sequencesToCompare[SeqIndexi][starti:starti + sizeOfMotif]
+ MMotifi = originalSeqs[SeqIndexi][starti-MW1:starti + sizeOfMotif]
+
+ #if this area has already been earmarked as a motif it cannot be
+ #used again
+ Nlocation=Motifi.rfind('N')
+ if Nlocation >= 0:
+ starti+=1+Nlocation+MARKOV_WINDOW
+
+ #check to see if the current sequence has a high enough consensus
+ #with the founder sequences in order to be considered further
+ ScoreTop = Align2PSFM(Motifi, PWMFounder)
+ Poss = float(ScoreTop - ConScoreTop)
+ GOF = Poss/sizeOfMotif
+ if GOF < percID/100.0:
+ starti+=1
+ continue
+
+ #if the word is not above noise
+ if M_Score(MMotifi,Frequency, False,MARKOV_WINDOW) - normalization < 0:
+ starti+=1
+ continue
+
+ startLocs[SeqIndexi] = starti
+ #new best scores are assigned by aligning the current sequence to
+ #the PWM
+ (ScoreAll, PWM2, LogsM2) = LogOdds(PWM, LogsM, originalSeqs[SeqIndexi] [starti-MW1:starti+sizeOfMotif],Frequency,MARKOV_WINDOW)
+ Score = ScoreAll
+
+ #new best scores are evaluted
+ if (BestScore == "Infinity" and Score != "Infinity") or Score > BestScore :
+ #record best position
+ BestMotif[SeqIndexi] = startLocs[SeqIndexi]
+ BestScore = Score
+ LogsMhold = LogsM2[:]
+ PWMhold = PWM2[:]
+ NewBestDetermined = True
+
+ starti+=1
+
+ startLocs[SeqIndexi] = BestMotif[SeqIndexi]
+ if NewBestDetermined:
+ PWM = PWMhold[:]
+ LogsM = LogsMhold[:]
+
+ return (BestScore, NewBestDetermined, PWM, LogsM)
\ No newline at end of file