erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / programs / Consensus.py
diff --git a/cistematic/programs/Consensus.py b/cistematic/programs/Consensus.py
new file mode 100755 (executable)
index 0000000..3bb39a8
--- /dev/null
@@ -0,0 +1,1110 @@
+##Sarah Aerni
+##Created:  July 12, 2005
+##Modified: June 22, 2006
+##Modified: Oct  22, 2008 by Ali (Minor modifications to outputs)
+##This is a greedy motif finding program.
+##Using background models it determines which motifs are overrepresented
+###############################################################################
+##include a way to go back and
+##uses the new way of doing background and the new scoring rather than simple
+##consensus it uses the log likelihood
+###############################################################################
+
+import hotshot
+import random
+import math
+import sys
+import copy
+import time
+
+from math import log
+from math import e
+from math import ceil
+from random import shuffle
+from random import randint
+
+ConsensusScore= {True:2, False:1}
+
+NTDIndices = {'A':0, 'C':1, 'G':2, 'T':3}
+IndexNTDs  = {0:'A', 1:'C', 2:'G', 3:'T'}
+INSERTION_N = "N"
+
+NTDA=0
+NTDC=1
+NTDG=2
+NTDT=3
+
+INSERTION_MARKER="S"
+INSERTION_FILLER="N"
+
+#markov size - 1 = markov model
+#example:
+#if you want a markov 1 you want to check 2 ntds total
+#markov_size = 2
+#declare global variables
+global MARKOV_WINDOW
+global MW1
+global percID
+global Founders
+global UseRC
+global Frequency
+global NumOfMismatch
+Frequency={}
+global sizeOfMotif
+global sequences
+global founderPercID
+#MotifFinder with 3 arguments
+#input:     sequences       sequences in which we are to find a motif
+#           sizeOfMotif     size of the motif to be identified in the sequences
+#           numberOfMotifs  the number of motifs of the indicated size to be
+#                           identified in the sequences
+#           Iterations      The number of iterations that will be perfdormed in
+#                           order to determine an optimal motif
+#           Frequency       Markov Model for background
+#           UseRC           a boolean indicating whether the reverse complement 
+#                           should be used i the finding of a motif
+#                           (True = yes)
+#           Founders        a boolean indicating whether the first 2 sequences
+#                           should always be the founder sequences
+#           percID          percent of the aligned sequences that must be 
+#                           identical in order to be considered for a motif to
+#                           be considered (between the founder sequences then
+#                           between founder sequence consensus and each other
+#                           subsequence being considered)
+#           founderPercID   The minimum percent identity two aligned sequences
+#                           must be to qualify as founder sequences
+#           MARKOV_SIZE     Size of the markov model (corresponds to length of
+#                           the words in the dictionary Frequency)
+#output:    (List1, List2)  List1 contains PSFMs of the indicated length and up
+#                           to the indicated number (there may be fewer) in the
+#                           standard cistematic format, while List2 holds the 
+#                           corresponding sequences.
+#
+#This function will call the overloaded MotifFinder function, passing in a 
+#default amount for the number of iterations (currently set at 100)
+def MotifFinder(sequences, minSize, maxSize, numberOfMotifs, Iterations,
+                Frequency, UseRC, Founders, percID, founderPercID,MARKOV_SIZE,
+                model,masking):
+
+    #declare global variables
+    global MARKOV_WINDOW
+    global MW1
+    global NumOfMismatch
+    global sizeOfMotif
+    numOfSequences=len(sequences)
+
+    #the minimum window size permitted by this program is a motif of length 6
+    if minSize < 6:
+        minSize = 6
+
+    #sanity check for reasonable motif sizes
+    if minSize > maxSize:
+        print "Please input a minimum size that is smaller than a maximum size"
+        print "User input:\nMinimum Size:\t%i\nMaximumSize:\t%i" % (minSize, maxSize)
+        return None
+
+    #sanity check for reasonable percentID
+    if percID> founderPercID:
+        print "Your input %i for percent identity for founder sequences" % founderPercID,
+        print "is less than your input for general percent identity at",percID
+        print "Please adjust your input"
+        return None
+
+    #a user may input a number of iterations which exceeds the possible
+    #for a given set of input values, in which case the number of iterations
+    #should be decreased to another size
+    maxIterations = (maxSize - minSize+1)*Choose(len(sequences),2)*Factorial(len(sequences)-2)
+    if maxIterations < Iterations:
+        print "The number of Iterations you entered", Iterations,
+        print "is being reduced to",maxIterations,
+        Iterations = maxIterations
+
+    #adjusting the window as remarked above
+    MARKOV_WINDOW = MARKOV_SIZE + 1
+    MW1=MARKOV_SIZE
+    print len(sequences), "sequences search for",numberOfMotifs,
+    print " motifs of size",minSize,"to",maxSize
+    print "The percent Identity input is",percID,"%"
+
+    #this list will contain motifs found (up to the NumberOfMotifs passed in
+    sequencesToCompare = [i.upper() for i in sequences]
+    #as a preprocessing steps, a string of Ns will be replaced by one N
+    for i in xrange(len(sequences)):
+        splitByN=sequencesToCompare[i].split('N')
+        j = 0;
+        finalSequence=""
+        max_j=len(splitByN)
+        while j  < max_j :
+            if len(splitByN[j])==0:
+                finalSequence="".join([finalSequence,'N'])
+                while len(splitByN[j])==0:
+                    j+=1
+                    if j==max_j:
+                        break
+            else:
+                finalSequence="".join([finalSequence,splitByN[j]])
+                j+=1
+        sequencesToCompare[i]=finalSequence
+    originalSeqs = []
+    aveCalcSeqs = []
+
+    if UseRC:
+        for seqIndex in xrange(len(sequencesToCompare)):
+            sequence = revComp(sequencesToCompare[seqIndex])
+            aveCalcSeqs.append(''.join([sequencesToCompare[seqIndex], sequence]))
+            originalSeqs.append(''.join([sequencesToCompare[seqIndex],INSERTION_N,sequence]))
+            sequencesToCompare[seqIndex] = ''.join([sequencesToCompare[seqIndex],INSERTION_N*(MARKOV_WINDOW),sequence[MW1:]])
+            sequence = ''.join([INSERTION_N*(MW1),sequencesToCompare[seqIndex][MW1:]])
+            sequencesToCompare[seqIndex] = sequence
+
+    AllPSFMS = []
+    AllPSFMseqs = [] 
+    for motif_i in xrange(numberOfMotifs):
+        empty=min([len(sequencesToCompare[i]) for i in xrange(len(sequencesToCompare))])
+        if empty < maxSize:
+            print "A sequence has been depleted of all options"           
+            return (AllPSFMS, AllPSFMseqs)
+
+        OverallBest = "Infinity"
+        print "MOTIF NUMBER %i\n" % motif_i
+        #multiple iterations are attempted to avoid local maxima
+        for step in xrange(Iterations):
+            sys.stdout.flush() 
+            numIncluded = 2 #used in order to create PSFMs from PWMs
+            #CompareList will be used to randomly generate the sequences to be
+            #compared
+            CompareList = range(len(sequencesToCompare))
+            BestScore = "Infinity"
+            #find the best shared motif between the first two sequences
+            #the motif must be completely contained within the sequence
+            #or its reverse complement
+            NumberOfTries = 0
+            while (BestScore == "Infinity"):
+                #PWM created from all sequences included in the motif
+                PWM = []
+                #pick motif size randomly within the range provided by the user
+                sizeOfMotif = random.randint(minSize,maxSize)
+                NumOfMismatch = sizeOfMotif-ceil((percID)/100.0*sizeOfMotif)
+                #if the motif average is not yet established create the average
+                #and store tis value in the dictionary
+
+                #best motif startpoints will be stored in this list of start
+                #locations
+                BestMotif = [-1] * len(sequencesToCompare)
+                #these are start locations are used in order to test new best
+                #start indices
+                startLocs = BestMotif[:]
+                #if it appears there are no more motifs possible quit       
+                if (NumberOfTries > Choose(len(sequencesToCompare), 2)):
+                    print "No more motifs to be found after %i tries" % NumberOfTries
+                    return (AllPSFMS, AllPSFMseqs)
+                NumberOfTries +=1
+                #randomize sequence order
+                #if the sequence founders are input then you only want to
+                #randomize the remaining sequences
+                if Founders:
+                    CompareList = [0,1]
+                    remaining = range(2,len(sequencesToCompare))
+                    shuffle(remaining)
+                    CompareList.extend(remaining)
+                #if no founders are specified, randomize all
+                else:
+                    shuffle(CompareList)
+
+                SeqIndex0 = CompareList[0]
+                SeqIndex1 = CompareList[1]
+                #founder sequences - first 2 sequences - are selected 
+                #create random start sites to avoid favoring the 5' region
+                startLocs[SeqIndex0] = -1
+                startLocs[SeqIndex1] = -1
+                BestMotif[SeqIndex0] = -1
+                BestMotif[SeqIndex1] = -1
+                BestScore = "Infinity"            
+                #define the best motif between the first two sequences the 
+                #inital set of comparisons are performed at the same indices in
+                #both "founder" sequences
+
+                #to ensure that the startpoint is random we will rearrange the
+                # sequences
+                start0 = randint(0,len(sequencesToCompare[SeqIndex0])-sizeOfMotif-1)
+                accessSeq=sequencesToCompare[SeqIndex0]
+                seq0 = ''.join([accessSeq[start0+1:],'N',accessSeq[:start0+sizeOfMotif]])
+                start1 = randint(0,len(sequencesToCompare[SeqIndex1])-sizeOfMotif-1)
+                accessSeq=sequencesToCompare[SeqIndex1]
+                seq1 = ''.join([accessSeq[start1+1:],'N',accessSeq[:start1+sizeOfMotif]])
+
+                #create shifted score, then check to see if it meets all
+                #criteria to become the best.
+
+                #Perform a time efficient alignment of the founder sequences
+                (BestScore) = TimeEfficientAlignment(seq0,seq1,SeqIndex0,SeqIndex1,startLocs, founderPercID)
+
+                if BestScore=="Infinity":
+                    continue
+                NLoc0=len(sequencesToCompare[SeqIndex0])-start0-1
+                NLoc1=len(sequencesToCompare[SeqIndex1])-start1-1
+
+                #remap to the correct location
+                if startLocs[SeqIndex0] > NLoc0:
+                    startLocs[SeqIndex0]=(startLocs[SeqIndex0]+start0)%len(sequencesToCompare[SeqIndex0])
+                else:
+                    startLocs[SeqIndex0]=(startLocs[SeqIndex0]+start0+1)
+
+                if startLocs[SeqIndex1] > NLoc1:
+                    startLocs[SeqIndex1]=(startLocs[SeqIndex1]+start1)%len(sequencesToCompare[SeqIndex1])
+                else:
+                    startLocs[SeqIndex1]=(startLocs[SeqIndex1]+start1+1)
+
+                #if we found a good score get all the info necessary to
+                #continue, including a PWM and updating the startLocs
+                if BestScore != "Infinity":
+                    index0 = startLocs[SeqIndex0]
+                    index1 = startLocs[SeqIndex1]
+                    seq0 = sequencesToCompare[SeqIndex0]
+                    seq1 = sequencesToCompare[SeqIndex1]
+                    motif0 = seq0[index0:index0+sizeOfMotif]
+                    motif1 = seq1[index1:index1+sizeOfMotif]
+                    BestMotifSeqs=[motif0,motif1]
+                    PWM = convert2PWM([motif0, motif1],sizeOfMotif)
+
+                    #this PWM will be used in order to establish a criterion
+                    #for future sequences, whether they meet a threshold
+                    #similiarty to the founder sequences
+
+            #if it exists, find the best scoring motifs in the remaining
+            #sequences.
+
+            for seqCompIndex in xrange(2, len(sequencesToCompare)):
+                maxScores=[max(PWM[P_I]) for P_I in range(len(PWM))]
+                BestScore = "Infinity"
+                seq=CompareList[seqCompIndex]
+                sequenceToAdd = sequencesToCompare[seq]
+                max_i=len(sequenceToAdd)-sizeOfMotif
+                bestMismatch = NumOfMismatch
+                #to ensure that the startpoint is random we will rearrange the 
+                #sequences
+                start_i = randint(0,max_i-1)
+                seq_i = ''.join([sequenceToAdd[start_i+1:],'N',sequenceToAdd[:start_i+sizeOfMotif]])
+                i=0
+                while (i<=max_i):
+                    motif=seq_i[i:i+sizeOfMotif]
+                    #skip this area if there are any masked regions found
+                    nLoc=motif.rfind('N')
+                    if nLoc >=0:
+                        i+=nLoc+1
+                        continue
+
+                    mmNum = 0
+                    j = 0
+                    while (j < sizeOfMotif):
+                        mmNum+=int(PWM[j][NTDIndices[motif[j]]]!=maxScores[j])
+                        if mmNum > bestMismatch:
+                            break
+                        j+=1
+
+                    #set a new best if we found it (always keep originals)
+                    if mmNum < bestMismatch:
+                        bestMismatch = mmNum
+                        startLocs[seq]=i
+
+                    #we cannot improve upon a perfect match so we break end our
+                    #search if this occurs
+                    if mmNum == 0:
+                        startLocs[seq]=i
+                        break
+                    i+=1
+
+                #if somethigng was found we add it to our PWM
+                if startLocs[seq]!= -1:
+                    NLoc_i=len(sequenceToAdd)-start_i-1
+                    if startLocs[seq] > NLoc_i:
+                        startLocs[seq]=(startLocs[seq]+start_i)%len(sequenceToAdd)
+                    else:
+                        startLocs[seq]=(startLocs[seq]+start_i+1)
+
+                    starti=startLocs[seq]
+                    sAdd=sequencesToCompare[seq][starti:starti+sizeOfMotif]
+                    checkPWM=add2PWMReturn(sAdd,PWM)
+                    add=True
+                    checkMaxScores=[max(checkPWM[P_I]) for P_I in range(len(checkPWM))]
+                    for seq in BestMotifSeqs:
+                        mismatches=0
+                        for i in range(len(seq)):
+                            mismatches+=int(checkPWM[i][NTDIndices[seq[i]]]!=checkMaxScores[i])
+                            if mismatches > NumOfMismatch:
+                                add=False
+                                break
+                    if add:
+                        BestMotifSeqs.append(sAdd)
+                        PWM=copy.deepcopy(checkPWM)
+                        numIncluded+=1 
+                    elif model=="oops":
+                        break
+                #if we require one occurence per sequence then if we do not
+                #have a motif in this sequence we stop our search
+                elif model=="oops":
+                    break
+
+            GroupScore =0
+            #if we have an oops model and not all sequences are being included,
+            #we have to continue without recording what we have so far
+            if model=="oops" and numIncluded<numOfSequences:
+                continue
+
+            #obtain motif scores        
+            GroupScore=0
+            for i in range(len(PWM)):
+                GroupScore += max(PWM[i])
+
+            #store this as the best location if it is the best of this set of 
+            #motifs part of that if statement below
+            if ((OverallBest == "Infinity" and GroupScore != "Infinity" )or \
+                GroupScore > OverallBest):
+                nextToCompare = sequencesToCompare[:]
+                MotifSeqs = []
+                OrderOfBest = CompareList[:]
+                BestIndices = startLocs[:]
+                bestMotifSeqYet = BestMotifSeqs
+                OverallBest = GroupScore
+                PSFM = convert2PSFM(PWM, numIncluded)
+                OverallPWM=PWM[:]
+                for i in xrange(len(sequencesToCompare)):
+                    SeqIndexi = CompareList[i]
+                    #some sequences may not contain the motifs, if so you do no
+                    #want to include them
+                    if BestIndices[SeqIndexi] != -1:
+                        sAdd=sequencesToCompare[SeqIndexi]
+                        sAddI=BestIndices[SeqIndexi]
+                        Motifi = sAdd[sAddI:sAddI+sizeOfMotif]
+                        MotifSeqs.append(Motifi)
+
+                        #mask out original location
+                        nextToCompare[SeqIndexi]=''.join([sAdd[:sAddI],INSERTION_N*(sizeOfMotif),sAdd[sAddI+sizeOfMotif:]])
+                        sAdd=nextToCompare[SeqIndexi]
+                        sAddLen=len(sAdd)
+                        nextToCompare[SeqIndexi]=''.join([sAdd[:sAddLen-sAddI-sizeOfMotif],INSERTION_N*(sizeOfMotif),sAdd[sAddLen-sAddI:]])
+            TopMotif = convert2motif(MotifSeqs, sizeOfMotif)
+
+        #if not more motifs are found (best score is -1) then it is likely that
+        #we have found as many as we can
+        if OverallBest == "Infinity":
+            print "No more motifs were found!"
+            return (AllPSFMS, AllPSFMseqs)
+
+        #if a motif was found report it here and add it to the total motifs
+        #Print out your results if any were reached!
+        for i in xrange(len(sequencesToCompare)):
+            SeqIndexi = OrderOfBest[i]            
+            if BestIndices[SeqIndexi] != -1:
+                Motifi = sequencesToCompare[SeqIndexi][BestIndices[SeqIndexi]:BestIndices[SeqIndexi]+sizeOfMotif]
+                MotifSeqs.append(Motifi)
+
+        #add the current best motif to the total of all motifs for returning
+        #to the user
+        AllPSFMS.append(PSFM)
+        AllPSFMseqs.append(bestMotifSeqYet)
+        print OverallPWM
+
+        NTDSTUFF = {0:'A',1:'C',2:'G',3:'T'}
+        for Pntd in [0,1,2,3]:
+            print ""
+            print NTDSTUFF[Pntd],
+            for Pj in xrange(len(PSFM)):
+                print "\t %.003f"%(PSFM[Pj][Pntd]),
+
+        print "\n**********\n"
+        print TopMotif
+        print "\n**********\n"
+        print "Score:",OverallBest 
+        maxScores=[max(OverallPWM[P_I]) for P_I in range(len(OverallPWM))]
+        for maski in xrange(numOfSequences):
+            maskThis=nextToCompare[maski]
+            i = 0
+            max_i = len(maskThis)-sizeOfMotif
+            while (i<=max_i):
+                motif=maskThis[i:i+sizeOfMotif]
+                #skip this area if there are any masked regions found
+                nLoc=motif.rfind('N')
+                if nLoc >=0:
+                    i+=nLoc+1
+                    continue
+                mmNum = 0
+                j = 0
+                while (j < sizeOfMotif):
+                    if PWM[j][NTDIndices[motif[j]]]!=maxScores[j]:
+                        mmNum+=1
+                        if mmNum > NumOfMismatch:
+                            break
+                    j+=1
+                #if the location is below threshold we mask it out also
+                if mmNum>NumOfMismatch:
+                    i+=1
+                    continue
+
+                i+=1
+
+        #replace old sequences with the new one
+        sequencesToCompare = nextToCompare[:]
+        #reduce masked regions to single N
+        for i in xrange(len(sequences)):
+            splitByN=sequencesToCompare[i].split('N')
+            j = 0;
+            finalSequence=""
+            max_j=len(splitByN)
+            while j  < max_j :
+                if len(splitByN[j])==0:
+                    finalSequence="".join([finalSequence,'N'])
+                    while len(splitByN[j])==0:
+                        j+=1
+                        if j==max_j:
+                            break
+                else:
+                    finalSequence="".join([finalSequence,splitByN[j]])
+                    j+=1
+            sequencesToCompare[i]=finalSequence
+
+    return (AllPSFMS, AllPSFMseqs)
+
+
+#MaskLocation
+#input:     sequence    sequence to be masked
+#          location    location at which to mask the sequence
+#           length      length of area to mask
+#output     string      masked sequence
+def MaskLocation(sequence,start,length):
+    finalsequence=''.join([sequence[:start],INSERTION_FILLER*(length),sequence[start+length:]])
+    newLen=len(finalsequence)
+    finalsequence=''.join([finalsequence[:newLen-start-length],INSERTION_FILLER*(length),finalsequence[newLen-start:]])
+
+    return finalsequence
+
+
+#TimeEfficientAlignment
+#input:     seq0        string sequence in which to calculate a best location
+#           seq1        string second sequence
+#           start0      integer start location for alignment (gives a skew)
+#           start1      integer start for second sequence
+#           BestScore   float which is the current bestscore
+#           percID   percent of the aligned sequences that must be identical
+#                       in order to be considered for a motif
+#output     float       Best scoring alignment's score
+#
+#will be calculated as an ungapped consensus between two sequences as the
+#algorithm steps through each window
+def TimeEfficientAlignment(seq0, seq1, SeqIndex0, SeqIndex1, startLocs, percID):
+    global sizeOfMotif
+    NumOfMismatch = sizeOfMotif-ceil((percID)/100.0*sizeOfMotif)
+    bestMismatch = NumOfMismatch
+    MW1=MARKOV_WINDOW-1
+
+    #create shifted score, then check to see if it meets all
+    #criteria to become the best.
+
+    i = 0
+    maxi=len(seq0)-sizeOfMotif
+    maxj=len(seq1)-sizeOfMotif
+
+    Score="Infinity"
+
+    while (i<=maxi):
+        motif=seq0[i:sizeOfMotif+i]
+        Nloc=motif.rfind('N')
+        if Nloc>=0:
+            i+=Nloc+1
+            continue
+        j=0
+        while (j<=maxj):
+            Nloc=seq1[j:j+sizeOfMotif].rfind('N')
+            if Nloc>=0:
+                j+=Nloc+1
+                continue
+            mismatchedNTDs=0
+            for k in xrange(sizeOfMotif):
+                if motif[k]!=seq1[k+j]:
+                    mismatchedNTDs+=1
+                    if mismatchedNTDs > bestMismatch:
+                        break
+            if mismatchedNTDs == 0:
+                Score = 2*sizeOfMotif-bestMismatch
+                startLocs[SeqIndex0]=i
+                startLocs[SeqIndex1]=j
+                return (Score)
+            if mismatchedNTDs < bestMismatch:
+                bestMismatch = mismatchedNTDs
+                Score = 2*sizeOfMotif-bestMismatch
+                startLocs[SeqIndex0]=i
+                startLocs[SeqIndex1]=j
+            j+=1
+        i+=1
+    return (Score)
+
+
+#AlignmentScore
+#input:     sequenceToCompare   List of sequences whose substrings will be 
+#           sizeOfMotif         aligned integer size of the motif being found 
+#                               (length of the subseqeunce that will be aligned
+#                                from each sequence in above list (window size)
+#           startLocs           start locations of the motifs to be aligned to
+#                               each other
+#           CompareList         the indices of sequencesToCompare to be aligned
+#           numSeqs             the number of sequences -1 from 
+#                               sequencesToCompare to be aligned. ie, the indices
+#                               of sequenceToCompare stored in the first numSeqs
+#                               indices of in CompareList.
+#           Frequency           markov model being used to calculate background
+#           originalSeqs        contain the unmasked sequences used for checking
+#                               the markov score
+#           NumOfMismatch      number of mismatches allowable
+#output:    integer             Score indicating the consensus score of these
+#                               sequences
+#           2D-List             contains the PSFM
+#           2D-List             contains the log markov scores
+#
+#will be calculated as an ungapped consensus between those elements in the 
+#CompareList Consensus score is calculated by choosing the largest number of the 
+#same elements in each column, and adding all these numbers up across all
+#columns
+def AlignmentScore(sequencesToCompare, sizeOfMotif, startLocs, CompareList, numSeqs, Frequency, originalSeqs, MARKOV_WINDOW,NumOfMismatch):
+    TotalScore = 0;
+    Scores = []
+    MW1=MARKOV_WINDOW-1
+    PWM = []
+    Log = []
+    ConsensusScore = 0
+    len(sequencesToCompare)
+    #traverse each column individually
+    for i in xrange (sizeOfMotif):
+        divisor = 0.0
+        PWMi = [0.0, 0.0, 0.0, 0.0]
+        Logi = [0.0, 0.0, 0.0, 0.0]
+        #traverse this index in each sequence        
+        for j in xrange(numSeqs+1):
+            SequenceIndex = CompareList[j]
+            CurrSeq = sequencesToCompare[SequenceIndex]
+            CurrOr = originalSeqs[SequenceIndex]
+            #some sequences may not contain the motifs, if so you do not want 
+            #to include them in the consensus. These have uninitialized start
+            #locations (ie startLocs would be -1
+            if startLocs[SequenceIndex] != -1:
+                divisor += 1.0
+                if sequencesToCompare[SequenceIndex][startLocs[SequenceIndex]+i] == 'N':
+                    print sequencesToCompare
+                    print "\nBAD HERE!"
+                    print CurrSeq
+                    print startLocs
+                    print startLocs[SequenceIndex]
+                    print CompareList
+                    print j
+                    print SequenceIndex
+                    print numSeqs
+                    print sizeOfMotif
+                    print CurrSeq[startLocs[SequenceIndex]:startLocs[SequenceIndex]+sizeOfMotif]
+
+                PWMi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += 1.0
+                Logi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += Frequency[CurrOr[startLocs[SequenceIndex]-MARKOV_WINDOW+i+1:startLocs[SequenceIndex]+i+1]]
+
+        Scores.append(0)
+        Top = -1
+        for NTD in ['A', 'C', 'G', 'T']:
+            #avoid ln(0) errors
+            if PWMi[NTDIndices[NTD]] > 0:
+                Scores[i] += PWMi[NTDIndices[NTD]]/divisor *log(PWMi[NTDIndices[NTD]]/(divisor*Logi[NTDIndices[NTD]]/PWMi[NTDIndices[NTD]]), e)
+            if PWMi[NTDIndices[NTD]] > Top:
+                Top = PWMi[NTDIndices[NTD]]
+        TotalScore += Scores[i]
+        ConsensusScore += Top
+        PWM.append(PWMi)
+        Log.append(Logi)
+
+    return (TotalScore, Scores, ConsensusScore,PWM, Log)
+
+
+#MarkovFreq
+#input:     prefix      string of length MARKV_SIZE - 1 prefix used for model
+#           actualNTD   character NTD at the en dof the prefix being calculated
+#           Frequency   Markov model for calculations
+#output:    float that gives the markov score for this specific sequence
+#
+#The helper function will run through and find all possible words with the
+#prefix and determine the markov score based on this
+def MarkovFreq (prefix, actualNTD, Frequency):
+
+    denominator = 0.0
+    numerator = 0.0
+    for NTD in ['A', 'C', 'G', 'T']:
+        value = M_Score(prefix+NTD, Frequency, False,MARKOV_WINDOW)
+        if NTD == actualNTD :
+            numerator = value
+        denominator += value
+    retVal = numerator/denominator
+    return retVal
+
+
+#revComp
+#input:     sequence    DNA sequence to be converted to reverse complement
+#output:    string      reverse complement of input sequence
+#
+#obtains the reverse complement of an input sequence
+def revComp (sequence):
+    #base pairs
+    RevDict={'A':'T','T':'A', 'C':'G', 'G':'C', 'N':'N'}
+    reverse = ""
+    #reverse the sequene
+    for i in xrange(len(sequence)):
+        reverse = RevDict[sequence[i].upper()]+reverse
+    return reverse
+
+
+#Markov3
+#input:     sequences   list that are being used to create the background
+#output:    dictionary of all 6mers (reverse complement also) and their -log2
+#           proportion seen
+#
+#background will build a markov model of the background in order to be able     
+#to differentiate the motifs from the pure background of a certain size
+#they will be stored as -log(fraction)
+def Markov(sequences, IncludeRC,MARKOV):
+    MARKOV_WINDOW = MARKOV + 1
+    WordDict = {}
+    totalWindows = 0
+
+    #take each sequence and use it in order to separately determine background
+    for seq in sequences:
+        #all sequences for the background must be full-length            
+        for index in xrange(len(seq)-MARKOV):
+            subseq = seq[index:index+MARKOV_WINDOW].upper()
+            if "N" in subseq:
+                continue
+
+            totalWindows += 1
+            if subseq not in WordDict:
+                WordDict[subseq] = 0.0
+            WordDict[subseq] += 1.0
+            if IncludeRC:
+                totalWindows += 1
+                RC = revComp(subseq)
+                if RC not in WordDict:
+                    WordDict[RC] = 0.0
+                WordDict[RC] += 1.0
+
+    #convert to logs                
+    for key in WordDict:
+        WordDict[key] = 1.0*WordDict[key]/totalWindows
+    return WordDict    
+
+
+#Average_M
+#input:     sequences   List of sequences on which to find the average
+#                       markov score
+#           Model       Dictionary containing pvalues for seeing 3mers
+#           l           integer designating the word sizes from which to
+#                       determine average pvalue
+#output:    average probability of all input lmers in the sequences in the Model
+#
+#finds the probability of seeing all subsequence in the total strings
+#using the markov model created using the background. Markov3 is used
+#(window size of 3) and from this determine the average. This functio will
+#also screen the background model
+def Average_M (sequence, Model, l,MARKOV_WINDOW):
+    totalSum = 0.0;
+    totalWords = 0.0;
+    MW1=MARKOV_WINDOW-1
+    for seq in sequence:
+        for i in xrange(MW1,len(seq)-l+1):
+            sequenceCheck=seq[i-MW1:i+1]
+            Nindex=sequenceCheck.rfind('N')
+            if Nindex >= 0:
+                continue
+            totalWords += 1.0     #increase number of words
+            PVal = M_Score(sequenceCheck, Model, True,MARKOV_WINDOW)
+            #add current word to the running total of scores
+            totalSum += PVal
+    retVal = totalSum/totalWords
+    print totalWords
+
+    return retVal
+
+
+#M_Score
+#input:     sequence    string for which the Pvalue is to be determined
+#           Model       Dictionary containing log2 pvalues for seeing 6mers
+#           check       Boolean which determines whether to also check for
+#                       completeness of markov model
+#output:    log2 probability of seeing the input seqeunce in the Model
+#
+#gives the probability of seeing the given subsequence in the total strings
+#using the markov model created using the background. Markov6 is used
+#(window size of 3)
+def M_Score (sequence, Model, check,MARKOV_WINDOW):
+    PVal = 0.0
+    MW1=MARKOV_WINDOW-1
+    for j in xrange(len(sequence)-MARKOV_WINDOW+1):
+        #if the subsequences is not in the background value it
+        #the program returns an exit code and asks the user to
+        #revise background model input
+        if sequence[j:j+MARKOV_WINDOW] not in Model:
+            if check:
+                print "The Markov Model is inadequate for your input",
+                print "sequences\n %s is"%sequence[j:j+MARKOV_WINDOW],
+                print "not contained in model provided\n",
+                print "Please revise your provided model or consider",
+                print "using Background Modelling provided"
+                sys.exit(0)
+            continue
+        #calculates score
+        PVal += -log(Model[sequence[j:j+MARKOV_WINDOW]],e)
+
+    return PVal
+
+#LogOdds
+#input:     sequences   sequences for which to find the log odds score
+#           startLocs   start locations at which to start computing score
+#           sizeOfMotif size of the motif being created
+#           Freqeuncy   Markov3 Model
+#           Index       Current Indices
+#           CompareList Order of Comparison
+#output     returns the log odds score for the consensus
+#the equation used is as follow:
+#S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path))))
+def LogOddsoPT(sequences, startLocs, sizeOfMotif, Frequency, Index,CompareList,MARKOV_WINDOW):
+    MW1=MARKOV_WINDOW-1
+    for i in range(sizeOfMotif):
+        for j in range(Index+1):
+            seqIndex = CompareList[j]
+            lnValue = [0.0, 0.0, 0.0, 0.0]
+            totalNum = [0.0, 0.0, 0.0, 0.0]
+            #add the frequency of seeing the given nucleotides in each position
+            #a runnning total of each one. Then add them to the equation
+            if startLocs[seqIndex] != -1:
+                index = startLocs[seqIndex]+i
+                previous = sequences[seqIndex][index-(MW1):index+1]
+                denominator = 0.0
+                numerator = 0.0
+                #determine probabilities for each path
+                for NTD in ['A', 'C', 'G', 'T']:
+                    full = previous+NTD
+                    if full in Frequency:
+                        value = Frequency[full]
+                        if NTD == sequences[seqIndex][index+1]:
+                            numerator = value
+                        denominator += value
+                #increase the summation of each location
+                lnValue[NTDIndices[sequences[seqIndex][index+1]]] +=numerator/denominator
+                #increase number of given nucleotides at the index
+                totalNum[NTDIndices[sequences[seqIndex][index+1]]] += 1.0
+
+#LogOdds
+#input:     sequence    relevant part of the sequence being added to the PWM
+#           PWM         information on sequences already in the motif
+#           LogsM       frequency information on sequences already in motif
+#           Frequency   markov model for background
+#           sizeOfMotif size f the motif being found
+#output     returns the log odds score for the consensus
+#the equation used is as follow:
+#S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path))))
+def LogOdds(PWM, LogsM, sequence, Frequency,MARKOV_WINDOW):
+    Score = 0
+    PWMout = copy.deepcopy(PWM)
+    LogsMout = copy.deepcopy(LogsM)
+    MW1=MARKOV_WINDOW-1
+    #since each column of the PWM must add up to the total umber of sequences
+    #in that PWM, in addition one must be added for the current sequence
+    totalSeqs = PWM[0][0]+PWM[0][1]+PWM[0][2]+PWM[0][3] + 1
+    for j in range(len(PWMout)):
+        for i in ['A', 'C', 'G', 'T']:
+            if i == sequence[j+MW1]:
+                PWMout[j][NTDIndices[i]] += 1.0
+                #prefix to check for prob of specific ntd given prefix
+                word = sequence[j:j+MARKOV_WINDOW]
+                #determine the frequency of the words individually                     
+                LogsMout[j][NTDIndices[i]] += Frequency[word]   
+            if PWMout[j][NTDIndices[i]]> 0:
+                Score +=PWMout[j][NTDIndices[i]]/totalSeqs*log(PWMout[j][NTDIndices[i]]/(totalSeqs*LogsMout[j][NTDIndices[i]]/PWMout[j][NTDIndices[i]]),e)
+
+    return Score, PWMout, LogsMout
+
+
+#convert2motif
+#input:     sequences   list containing the sequences in A,C,G,T alphabet 
+#                       comprising the motif to be converted into symbols
+#           size        size of the motif
+# maybe add threshold?!?!
+#output:    string      motif converted into descriptive symbols
+#
+#takes in a list of motifs that were found at each point and converts them to
+#an actual motif
+def convert2motif(sequences, size):
+    #column composition is replaced by symbols
+    SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M',
+                  'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W',
+                  'CT':'Y','A':'A','C':'C','G':'G','T':'T'}
+    Motif = ""
+    #determine the composition of each column
+    numSeqs = len(sequences)
+    for i in xrange(size):
+        A = 0.
+        C = 0.
+        G = 0.
+        T = 0.       
+        for seq in sequences:
+            if seq[i].upper() == "A":
+                A += 1
+            elif seq[i].upper() == "C":
+                C += 1
+            elif seq[i].upper() == "G":
+                G += 1
+            else:
+                T += 1
+
+        characterCode = ""
+        if (A/numSeqs > 0.1):
+            characterCode += "A"
+        if (C/numSeqs > 0.1):
+            characterCode += "C"
+        if (G/numSeqs > 0.1):
+            characterCode += "G"
+        if (T/numSeqs > 0.1):
+            characterCode += "T"
+        Motif += SymbolDict[characterCode]
+
+    return Motif
+
+
+#PMW2motif
+#input:     array       PWM
+#output:    string      motif converted into descriptive symbols
+#
+#takes in a PWM that was created and converts it to 
+#an actual motif
+def PWM2Motif(PWM):
+    #column composition is replaced by symbols
+    SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M',
+                  'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W',
+                  'CT':'Y','A':'A','C':'C','G':'G','T':'T'}
+    Motif = ""
+    #determine the composition of each column
+    for i in xrange(len(PWM)):
+        characterCode = ""
+        if (PWM[i][NTDA] > 0.1):
+            characterCode += "A"
+        if (PWM[i][NTDC] > 0.1):
+            characterCode += "C"
+        if (PWM[i][NTDG] > 0.1):
+            characterCode += "G"
+        if (PWM[i][NTDT] > 0.1):
+            characterCode += "T"
+        Motif += SymbolDict[characterCode]
+
+    return Motif
+
+
+
+#convert2PSFM
+#input:     sequences   list containing the sequences in A,C,G,T alphabet 
+#                       comprising the motif to be converted into symbols
+#           size        size of the motif
+#output:    2Darray     will contain the PSFM where indices 0-3 of each list 
+#                       will be A,C,G,T respectively
+#
+#takes in a list of motifs that were found at each point and converts them to
+#a PSFM
+def convert2PSFM (PWM, NumOfSeqs):
+    #Position specific frequency matrix to be returned
+    PSFM = []
+    #determine the composition of each column
+    for i in xrange(len(PWM)):
+        index = []
+        #get frequencies for each NTD
+        for j in [0,1,2,3]:
+            index.append(PWM[i][j]/NumOfSeqs)
+        #add all nucleotide frequencies to the columns
+        PSFM.append(index)
+
+    return PSFM
+
+
+#convert2PWM
+#input:     sequences   list containing the sequences in A,C,G,T alphabet 
+#                       comprising the motif to be converted into symbols
+#           size        size of the motif
+#output:    2Darray     will contain the PSFM where indices 0-3 of each list 
+#                       will be A,C,G,T respectively
+#
+#takes in a list of motifs that were found at each point and converts them to
+#a PWM
+def convert2PWM (sequences, size):
+    #Position specific frequency matrix to be returned
+    PWM = []
+    #determine the composition of each column
+    for i in xrange(size):
+        indices = [0.0, 0.0, 0.0, 0.0]
+        for seq in sequences:
+            indices[NTDIndices[seq[i].upper()]] += 1.0
+        #add all nucleotide frequencies to the columns
+        PWM.append(indices)
+
+    return PWM
+
+
+#add2PWM
+#input:     sequence    sequence to be added to PWM
+#           PWM         PWM being modiifed         
+#
+#mutator method takes in a sequence and adds it to the PWM
+def add2PWM(sequence, PWM):
+    #determine the composition to add
+    for i in xrange(len(PWM)):
+        PWM[i][NTDIndices[sequence[i].upper()]] += 1.0
+
+
+#add2PWMReturn
+#input:     sequence    sequence to be added to PWM
+#           PWM         PWM being modiifed         
+#
+#mutator method takes in a sequence and adds it to the PWM
+def add2PWMReturn(sequence, PWM):
+    retPWM=copy.deepcopy(PWM)
+    #determine the composition to add
+    for i in xrange(len(retPWM)):
+        retPWM[i][NTDIndices[sequence[i].upper()]] += 1.0
+
+    return retPWM
+
+
+#Align2PWM
+#input:     Motifi      Sequence being aligned to PWM
+#           PWM         PWM to which the sequence is being aligned
+#output:    float       alignment score to the matrix
+#
+#takes in a PWM and aligns the sequence to this PWM and returns
+#the consensus scoreo
+def Align2PSFM(Motifi,PWM):
+    Best = 0.0
+    for i in xrange(len(PWM)):
+        CurrentIndex = PWM[i][:]
+        CurrentIndex[NTDIndices[Motifi[i].upper()]] += 1.0
+        Top = CurrentIndex[0]
+        for j in [1,2,3]:
+            if Top < CurrentIndex[j]:
+                Top = CurrentIndex[j]
+        Best += Top
+
+    return Best
+
+
+#Factorial
+#input:     n       Number for which we will calculate the factorial
+#output:    float   factorial of input number
+#
+#calculates the factorial of input number n
+def Factorial(n):
+    #by definition factorial should be 1
+    Fact = 1
+    for i in xrange(2,n+1):
+        Fact *= i
+
+    return Fact
+
+
+#Choose
+#input:     n       integer for total number of elements in a set
+#           k       integer for total number of elements to be chose from set
+#output:    float   the binomial coefficient of the above number, ie nCk
+#
+#calculates the number of ways to choose k elements from a set of n
+def Choose(n, k):
+    return Factorial(n)/Factorial(n-k)/Factorial(k)
+
+
+#GetMotif
+#input: sequencesToCompare  sequences where the motif is being found
+#       sequencesLast       original version of the sequences
+#       CompareList         Order in which sequences are examined
+#       Best Motif          The location where the motifs score highest
+#       startLocs           The current locations for the motifs being examined
+#       sizeOfMotif         Size of the motif to be found in the sequences
+#       Index               Index up to which the motif has been optimized so
+#                           far
+#       BestScore           The best score of aligned motifs
+#       Frequency           Markov3 Model
+#       PWM                 PWM which includes alll the sequences which have
+#                           been included so far
+#       PWMFounder          This is the "founder" PWM, includes only the
+#                           founder sequences' information
+#       ConScoreTop         This value stores the Consensus score of the
+#                           founder sequences only
+#       normalization       normalization value for the current words
+#       LogsM               frequency of markov words
+#       percID              percent of the aligned sequences that must be 
+#                           identical in order to be considered for a motif to
+#                           be considered (between the founder sequences then
+#                           between founder sequence consensus and each other
+#                           subsequence being considered)
+#       originalSeqs        contain unmasked sequence for markov scores
+#output:integer             Score of the top scoring motif
+#
+#this function will align the Indexth item in the sequencesToCompare to all the
+#sequences previously aligned from their best indices in a way to optimize the
+#alignment score (consensus score)
+
+def GetMotif (sequencesToCompare, sequencesLast, CompareList, BestMotif,
+              startLocs, sizeOfMotif, Index, BestScore, Frequency,
+              PWM, PWMFounder, ConScoreTop, normalization, LogsM,percID,
+              originalSeqs,origLast,MARKOV_WINDOW):
+
+    MW1=MARKOV_WINDOW-1
+    #variable to indicate whether sequencesLast needs to be updated
+    NewBestDetermined = False
+    SeqIndexi = CompareList[Index]
+
+    #search through all positions for this index
+    starti=MW1
+    endi=len(sequencesToCompare[SeqIndexi])-sizeOfMotif
+    while (starti<=endi):
+        Motifi = sequencesToCompare[SeqIndexi][starti:starti + sizeOfMotif]
+        MMotifi = originalSeqs[SeqIndexi][starti-MW1:starti + sizeOfMotif]
+
+        #if this area has already been earmarked as a motif it cannot be
+        #used again
+        Nlocation=Motifi.rfind('N')
+        if Nlocation >= 0:
+            starti+=1+Nlocation+MARKOV_WINDOW
+
+        #check to see if the current sequence has a high enough consensus
+        #with the founder sequences in order to be considered further
+        ScoreTop = Align2PSFM(Motifi, PWMFounder)
+        Poss = float(ScoreTop - ConScoreTop)
+        GOF = Poss/sizeOfMotif
+        if GOF < percID/100.0:
+            starti+=1
+            continue
+
+        #if the word is not above noise
+        if M_Score(MMotifi,Frequency, False,MARKOV_WINDOW) - normalization < 0:
+            starti+=1
+            continue
+
+        startLocs[SeqIndexi] = starti
+        #new best scores are assigned by aligning the current sequence to
+        #the PWM
+        (ScoreAll, PWM2, LogsM2) = LogOdds(PWM, LogsM, originalSeqs[SeqIndexi] [starti-MW1:starti+sizeOfMotif],Frequency,MARKOV_WINDOW)
+        Score = ScoreAll
+
+        #new best scores are evaluted
+        if (BestScore == "Infinity" and Score != "Infinity") or Score > BestScore :
+            #record best position
+            BestMotif[SeqIndexi] = startLocs[SeqIndexi]
+            BestScore = Score
+            LogsMhold = LogsM2[:]
+            PWMhold = PWM2[:]
+            NewBestDetermined = True
+    
+        starti+=1
+
+    startLocs[SeqIndexi] = BestMotif[SeqIndexi]
+    if NewBestDetermined:
+        PWM = PWMhold[:]
+        LogsM = LogsMhold[:]
+
+    return (BestScore, NewBestDetermined, PWM, LogsM)
\ No newline at end of file