erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / programs / Gibbs.py
diff --git a/cistematic/programs/Gibbs.py b/cistematic/programs/Gibbs.py
new file mode 100644 (file)
index 0000000..c4aaff8
--- /dev/null
@@ -0,0 +1,650 @@
+##Sarah Aerni
+##Created:  June 25, 2005
+##Modified: July 11, 2005
+##Motif Finder using Gibbs sampling
+##convergence is measured when the difference in the sequences is negligible
+
+import random
+import math
+import sys
+import copy
+from time import time
+
+from math import ceil
+
+Frequency = {}
+ConsensusScore= {True:2, False:1}
+
+NTDIndices = {"A": 0, "C": 1, "G": 2, "T": 3}
+IndexNTDs  = {0: "A", 1: "C", 2: "G", 3: "T"}
+INSERTION_N = "N"
+global minSize
+global maxSize
+global thresholdPercent
+global sequences
+global sizeOfMotif
+global numOfMismatches
+global maxIterations
+global maxLoopsWithoutImprovement
+
+"""
+markov size - 1 = markov model
+example:
+if you want a markov 1 you want to check 2 ntds total
+markov_size = 2
+MARKOV_SIZE = 2
+
+AlignmentScore
+input:     sequenceToCompare   List of sequences whose substrings will be 
+           sizeOfMotif         aligned integer size of the motif being found 
+                               (length of the subseqeunce that will be aligned
+                                from each sequence in above list (window size)
+           startLocs           start locations of the motifs to be aligned to
+                               each other
+           CompareList         the indices of sequencesToCompare to be aligned
+           numSeqs             the number of sequences -1 from 
+                               sequencesToCompare to be aligned, the indices
+                               of sequenceToCompare stored in the first 
+                               numSeqs indices of in CompareList.
+           Frequency           markov model being used to calculate background
+           originalSeqs        contain unmasked sequences used for checking
+                               the markov score
+output:    integer             Score indicating the consensus score of these
+                               sequences
+           2D-List             contains the PSFM
+           2D-List             contains the log markov scores
+
+will be calculated as an ungapped consensus between those elements in the 
+CompareList Consensus score is calculated by choosing the largest number of
+the same elements in each column, and adding all these numbers up across all
+columns
+"""
+
+
+def AlignmentScore(sequencesToCompare, sizeOfMotif, startLocs, CompareList, numSeqs):
+    TotalScore = 0;
+    PWM = []
+    maxScores=[]
+    len(sequencesToCompare)
+    for i in range (sizeOfMotif):
+        PWMi = [0.0, 0.0, 0.0, 0.0]      
+        for j in range(numSeqs+1):
+            SequenceIndex = CompareList[j]
+            CurrSeq = sequencesToCompare[SequenceIndex]
+
+            #some sequences may not contain the motifs, if so you do not want 
+            #to include them in the consensus. These have uninitialized start
+            #locations (ie startLocs would be -1
+            if startLocs[SequenceIndex] != -1:
+                if sequencesToCompare[SequenceIndex]\
+                    [startLocs[SequenceIndex]+i] == "N":
+                    print sequencesToCompare
+                    print "\nBAD HERE!"
+                    print CurrSeq
+                    print startLocs
+                    print startLocs[SequenceIndex]
+                    print CompareList
+                    print j
+                    print SequenceIndex
+                    print numSeqs
+                    print sizeOfMotif
+                    print CurrSeq[startLocs[SequenceIndex]:startLocs[SequenceIndex]+sizeOfMotif]
+                PWMi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += 1.0
+        maxHere=max(PWMi)
+        TotalScore += maxHere
+        maxScores.append(maxHere)
+        PWM.append(PWMi)
+
+    return (TotalScore, PWM,maxScores)
+
+
+def MarkovFreq (prefix, actualNTD, Frequency,MARKOV_WINDOW):
+    """ MarkovFreq
+        input:     prefix      string of length MARKV_SIZE - 1 prefix used for model
+                   actualNTD   character NTD at the en dof the prefix being calculated
+                   Frequency   Markov model for calculations
+        output:    float that gives the markov score for this specific sequence
+
+        The helper function will run through and find all possible words with the
+        prefix and determine the markov score based on this
+    """
+    denominator = 0.0
+    numerator = 0.0
+    for NTD in ["A", "C", "G", "T"]:
+        value = M_Score(prefix+NTD, Frequency, False, MARKOV_WINDOW)
+        if NTD == actualNTD :
+            numerator = value
+        denominator += value
+    retVal = numerator/denominator
+
+    return retVal
+
+
+def revComp (sequence):
+    """ revComp
+        input:     sequence    DNA sequence to be converted to reverse complement
+        output:    string      reverse complement of input sequence
+
+        obtains the reverse complement of an input sequence
+    """
+    RevDict={"A": "T",
+             "T": "A",
+             "C": "G",
+             "G": "C",
+             "N": "N"
+    }
+    reverse = ""
+    for i in range(len(sequence)):
+        reverse = RevDict[sequence[i].upper()]+reverse
+
+    return reverse
+
+
+def Markov(sequences, IncludeRC, MARKOV):
+    """ Markov3
+        input:     sequences   list that are being used to create the background
+        output:    dictionary of all 6mers (reverse complement also) and their -log2
+                   proportion seen
+
+        background will build a markov model of the background in order to be able   
+        to differentiate the motifs from the pure background of a certain size
+        they will be stored as -log(fraction)
+    """
+    MARKOV_WINDOW = MARKOV + 1
+    WordDict = {}
+    totalWindows = 0
+    for i in sequences:
+        totalWindows += (len(i)-MARKOV_WINDOW+1)*2
+
+    for seq in sequences:            
+        for index in range(len(seq)-MARKOV_WINDOW+1):
+            subseq = seq[index:index+MARKOV_WINDOW].upper()
+            if subseq not in WordDict:
+                WordDict[subseq] = 0.0
+
+            WordDict[subseq] += 1.0
+            if IncludeRC:
+                RC = revComp(subseq)
+                if RC not in WordDict:
+                    WordDict[RC] = 0.0
+
+                WordDict[RC] += 1.0
+               
+    for key in WordDict:
+        WordDict[key] = 1.0*WordDict[key]/totalWindows
+
+    return WordDict    
+
+
+def Average_M (sequence, Model, l, MARKOV_WINDOW):
+    """ Average_M
+        input:     sequences   List of sequences on which to find the average
+                               markov score
+                   Model       Dictionary containing pvalues for seeing 3mers
+                   l           integer designating the word sizes from which to
+                               determine average pvalue
+        output:    average probability of all input lmers in sequences in the Model
+
+        finds the probability of seeing all subsequence in the total strings
+        using the markov model created using the background. Markov3 is used
+        (window size of 3) and from this determine the average. This function will
+        also screen the background model
+    """
+    totalSum = 0.0;
+    totalWords = 0.0;
+    for seq in sequence:
+        for i in range(MARKOV_WINDOW-1,len(seq)-l+1):
+            totalWords += 1.0
+            PVal = M_Score(seq[i-MARKOV_WINDOW+1:i+l], Model, True, MARKOV_WINDOW)
+            totalSum += PVal
+
+    retVal = totalSum/totalWords
+    print totalWords
+    return retVal
+
+
+def M_Score (sequence, Model, check, MARKOV_WINDOW):
+    """ M_Score
+        input:     sequence    string for which the Pvalue is to be determined
+                   Model       Dictionary containing log2 pvalues for seeing 6mers
+                   check       Boolean which determines whether to also check for
+                               completeness of markov model
+        output:    log2 probability of seeing the input seqeunce in the Model
+
+        gives the probability of seeing the given subsequence in the total strings
+        using the markov model created using the background. Markov6 is used
+        (window size of 3)
+    """
+    PVal = 0.0
+    for j in range(len(sequence)-MARKOV_WINDOW+1):
+        if sequence[j:j+MARKOV_WINDOW] not in Model:
+            if check:
+                print "The Markov Model is inadequate for your input",
+                print "sequences\n %s is"%sequence[j:j+MARKOV_WINDOW],
+                print "not contained in model provided\n",
+                print "Please revise your provided model or consider",
+                print "using Background Modelling provided"
+                sys.exit(0)
+
+            continue
+
+        PVal += -math.log(Model[sequence[j:j+MARKOV_WINDOW]],math.e)
+
+    return PVal
+
+
+def LogOdds(PWM, LogsM, sequence, Frequency, MARKOV_WINDOW):
+    """ LogOdds
+        input:     sequence    relevant part of the sequence being added to the PWM
+                   PWM         information on sequences already in the motif
+                   LogsM       frequency information on sequences already in motif
+                   Frequency   markov model for background
+                   sizeOfMotif size f the motif being found
+        output     returns the log odds score for the consensus
+        the equation used is as follow:
+        S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path))))
+    """
+    Score = 0
+    PWMout = copy.deepcopy(PWM)
+    LogsMout = copy.deepcopy(LogsM)
+    #since each column of the PWM must add up to the total umber of sequences
+    #in that PWM, in addition one must be added for the current sequence
+    totalSeqs = PWM[0][0]+PWM[0][1]+PWM[0][2]+PWM[0][3] + 1
+    for j in range(len(PWMout)):
+        for i in ['A', 'C', 'G', 'T']:
+            if i == sequence[j+MARKOV_WINDOW-1]:
+                PWMout[j][NTDIndices[i]] += 1.0
+                word = sequence[j:j+MARKOV_WINDOW]
+                LogsMout[j][NTDIndices[i]] += Frequency[word]
+            if PWMout[j][NTDIndices[i]]> 0:
+                Score += PWMout[j][NTDIndices[i]]/totalSeqs*math.log(PWMout[j][NTDIndices[i]]/(totalSeqs*LogsMout[j][NTDIndices[i]]/PWMout[j][NTDIndices[i]]),math.e)
+
+    return Score, PWMout, LogsMout
+
+
+def convert2motif(sequences, size):
+    """ convert2motif
+        input:     sequences   list containing the sequences in A,C,G,T alphabet 
+                               comprising the motif to be converted into symbols
+                   size        size of the motif
+        maybe add threshold?!?!
+        output:    string      motif converted into descriptive symbols
+
+        takes in a list of motifs that were found at each point and converts them to
+        an actual motif
+    """
+    #column composition is replaced by symbols
+    SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M', 'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W', 'CT':'Y','A':'A','C':'C','G':'G','T':'T'}
+    Motif = ""
+    for i in range(size):
+        A = 0
+        C = 0
+        G = 0
+        T = 0        
+        for seq in sequences:
+            if seq[i].upper() == "A":
+                A += 1
+            elif seq[i].upper() == "C":
+                C += 1
+            elif seq[i].upper() == "G":
+                G += 1
+            else:
+                T += 1
+
+        characterCode = ""
+
+        #translate column composition into symbols
+        ###########should we use percentages?! ie. A >certain percent##################
+        if (A > 0):
+            characterCode += "A"
+
+        if (C > 0):
+            characterCode += "C"
+
+        if (G > 0):
+            characterCode += "G"
+
+        if (T > 0):
+            characterCode += "T"
+
+        Motif += SymbolDict[characterCode]
+
+    return Motif
+
+
+def convert2PSFM (sequences, NumOfSeqs):
+    """ convert2PSFM
+        input:     sequences   list containing the sequences in A,C,G,T alphabet 
+                               comprising the motif to be converted into symbols
+                   size        size of the motif
+        output:    2Darray     will contain the PSFM where indices 0-3 of each list 
+                               will be A,C,G,T respectively
+
+        takes in a list of motifs that were found at each point and converts them to
+        a PSFM
+    """
+    PSFM = []
+    PWM = convert2PWM(sequences, len(sequences[0]))
+    for i in xrange(len(PWM)):
+        index = []
+        for j in [0,1,2,3]:
+            index.append(PWM[i][j]/NumOfSeqs)
+
+        PSFM.append(index)
+
+    return PSFM
+
+
+def convert2PWM (sequences, size):
+    """ convert2PWM
+        input:     sequences   list containing the sequences in A,C,G,T alphabet 
+                               comprising the motif to be converted into symbols
+                   size        size of the motif
+        output:    2Darray     will contain the PSFM where indices 0-3 of each list 
+                               will be A,C,G,T respectively
+
+        takes in a list of motifs that were found at each point and converts them to
+        a PWM
+    """
+    PWM = []
+    for i in range(size):
+        indices = [0.0, 0.0, 0.0, 0.0]
+        for seq in sequences:
+            indices[NTDIndices[seq[i].upper()]] += 1.0
+
+        PWM.append(indices)
+
+    return PWM
+
+
+def add2PWM(sequence, PWM):
+    """ add2PWM
+        input:     sequence    sequence to be added to PWM
+                   PWM         PWM being modiifed         
+        takes in a sequence and adds it to the PWM
+    """
+    #determine the composition to add
+    for i in range(len(PWM)):
+        PWM[i][NTDIndices[sequence[i].upper()]] += 1.0
+
+
+def Align2PWM(Motifi,PWM):
+    """ Align2PWM
+        input:     Motifi      Sequence being aligned to PWM
+                   PWM         PWM to which the sequence is being aligned
+        output:    float       alignment score to the matrix
+
+        takes in a PWM and aligns the sequence to this PWM and returns
+        the consensus scoreo
+    """
+    Score = 0.0
+    for i in range(len(PWM)):
+        Score += PWM[i][NTDIndices[Motifi[i]]]
+    return Score
+
+
+def Factorial(n):
+    """ Factorial
+        input:     n       Number for which we will calculate the factorial
+        output:    float   factorial of input number
+
+        calculates the factorial of input number n
+    """
+    #by definition factorial should be 1
+    Fact = 1
+    for i in range(2,n+1):
+        Fact *= i
+    return Fact
+
+
+def Choose(n, k):
+    """ Choose
+        input:     n       integer for total number of elements in a set
+                   k       integer for total number of elements to be chose from set
+        output:    float   the binomial coefficient of the above number, ie nCk
+
+        calculates the number of ways to choose k elements from a set of n
+    """
+    return Factorial(n)/Factorial(n-k)/Factorial(k)
+
+      
+getNTDVals = {0:'A',1:'C',2:'G', 3:'T'}
+
+getNTDIndex = {'A':0,'C':1,'G':2, 'T':3}
+
+
+def MotifFinder(InputSequences, minS, maxS, numberOfMotifs, Markov_Size,
+                       UseRC, Frequency,excludeBelowAve, percID,
+                       maxIter, maxLoopsW_outImprovement):
+    global minSize
+    minSize=minS
+    global maxSize
+    maxSize=maxS
+    global thresholdPercent
+    thresholdPercent=percID
+    global sequences
+    global sizeOfMotif
+    global numOfMismatches
+    global maxIterations
+    maxIterations=maxIter
+
+    global maxLoopsWithoutImprovement
+    maxLoopsWithoutImprovement=maxLoopsW_outImprovement
+
+    print "Running Sampler with %i sequences"%(len(InputSequences))
+    print "Finding %i motifs of size %i to %i using markov size %i" % (numberOfMotifs,minSize,maxSize,Markov_Size)
+
+    sequences = [InputSequences[i].upper() for i in range(len(InputSequences))]
+    PSFMs = []
+    if UseRC:
+        for seqIndex in xrange(len(sequences)):
+            RCSeq = revComp(sequences[seqIndex])
+            sequences[seqIndex] += INSERTION_N+        RCSeq
+
+    #this will track the movement of each sequence
+    #if a movement exceeds a certain threshold we are not finished
+    for motifNum in range(numberOfMotifs):
+
+        #to improve speed shrink sequences by replacing strings of Ns by
+        #a single N
+        for i in xrange(len(sequences)):
+            splitByN=sequences[i].split('N')
+            j = 0;
+            finalSequence=""
+            max_j=len(splitByN)
+            while j  < max_j :
+                if len(splitByN[j])==0:
+                    finalSequence="".join([finalSequence,'N'])
+                    while len(splitByN[j])==0:
+                        j+=1
+                        if j==max_j:
+                            break
+                else:
+                    finalSequence="".join([finalSequence,splitByN[j]])
+                    j+=1
+            sequences[i]=finalSequence
+
+        print "MOTIF NUMBER %i" %motifNum
+        empty=min([len(sequences[i]) for i in xrange(len(sequences))])
+        #pick motif size randomly within the range provided by the user
+        sizeOfMotif = random.randint(minSize,maxSize)
+        if empty < maxSize:
+            return ALLPSFMS
+
+        numOfMismatches=sizeOfMotif-ceil(thresholdPercent/100.0*sizeOfMotif)
+        (PWM,PWMScores,startLocs)=GibbsRunner(100)
+        MaxVals=[0 for i in xrange(len(PWM))]
+        for ConsI in xrange(len(PWM)):
+            MaxVals[ConsI] = max(PWM[ConsI])
+        PWMScores = [0 for i in range(len(sequences))]
+        for SIndex in range(len(sequences)):
+            subseq = sequences[SIndex][startLocs[SIndex]:startLocs[SIndex]+sizeOfMotif]
+            PWMScores[SIndex] = 0
+            #######################start here##########
+            for subIndex in range(len(subseq)):
+                PWMScores[SIndex] += PWM[subIndex][NTDIndices[subseq[subIndex]]]
+
+        maxScore = max(PWMScores)
+        #get rid of all the sequences that do not achieve a certain consensus
+        #score defined by the top one
+        thresh = thresholdPercent/100.0 * maxScore
+        FinalPWMSeqs = []
+        for SIndex in range(len(PWMScores)):
+            if PWMScores[SIndex] > thresh:
+                FinalPWMSeqs.append(sequences[SIndex][startLocs[SIndex]:startLocs[SIndex]+sizeOfMotif])
+            else:
+                startLocs[SIndex] = -1
+
+        FinalPSFM= convert2PSFM (FinalPWMSeqs, len(FinalPWMSeqs))
+        PSFMs.append(FinalPSFM)
+        for i in xrange(len(sequences)):
+            if startLocs[i] != -1:
+                sequences[i] = sequences[i][:startLocs[i]]+INSERTION_N*sizeOfMotif+sequences[i][startLocs[i]+sizeOfMotif:]
+                sequences[i] = sequences[i][:len(sequences[i])-startLocs[i]-sizeOfMotif]+INSERTION_N*sizeOfMotif+sequences[i][len(sequences[i])-startLocs[i]:]
+    return PSFMs
+
+
+def GibbsRunner(iterIn):
+    iterAll = 0
+    BestPWM=[]
+    BestScore=0
+    BestLocs=[]
+    global minSize
+    global maxSize
+    global thresholdPercent
+    global sequences
+    global sizeOfMotif
+    global numOfMismatches
+    global maxIterations
+    global maxLoopsWithoutImprovement
+    maxScore=sizeOfMotif*len(sequences)
+    st=time()
+
+    while iterAll < iterIn: 
+        en=time()      
+        print "%.03f\t"%(en-st),
+        st=time()
+        iterAll+=1
+        startLocs = [-1 ] * len(sequences)    
+
+        for i in range(len(sequences)):
+            startLocs[i] = random.randint(0,len(sequences[i])-sizeOfMotif)
+            while "N" in sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif]:
+                startLocs[i] = random.randint(0,len(sequences[i])-sizeOfMotif)
+
+        (TotalScore, PWM,dummy)= AlignmentScore(sequences, sizeOfMotif, startLocs, [i for i in range(len(sequences))], len(sequences)-1)
+        PWMScore=[Align2PWM(sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif],PWM) for i in range(len(sequences))]
+        print "PWM is right now"
+        print PWM
+        print "scores for each"
+        print PWMScore
+        SOi = -1
+        ConsensusScore = 0
+        PreviousBestScore = 0
+        PreviousBestTime = -1
+        iterations = 0
+        while iterations < maxIterations and (ConsensusScore > PreviousBestScore or PreviousBestTime <= maxLoopsWithoutImprovement):
+            iterations += 1
+            SOi = random.randint(0,len(sequences)-1)
+            SeqMotifs = []
+            locs=startLocs[:]
+            for i in range(len(sequences)):
+                if(SOi == i):
+                    locs[i]=-1
+                    continue
+                SeqMotifs.append(sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif])
+
+            (TotalScore, PWM,maxScores)= AlignmentScore(sequences, sizeOfMotif, locs, [i for i in range(len(sequences))], len(sequences)-1)
+            startLocsProb = []
+            startLocsI= []
+            SOSeq = sequences[SOi]    
+            total = 0
+            start = 0
+            endloc=len(SOSeq)-sizeOfMotif
+            while(start<=endloc):
+                Motif = SOSeq[start:start+sizeOfMotif]
+                locOfN=Motif.rfind("N")
+                if locOfN>=0:
+                    start+=locOfN+1
+                    continue
+                probAtPosn=0
+                j=0
+                mmNum=0
+                while (j<sizeOfMotif):
+                    letterScore=PWM[j][NTDIndices[Motif[j]]]
+                    probAtPosn+=letterScore
+                    mmNum+=int(letterScore!=maxScores[j])
+                    if mmNum>numOfMismatches:
+                        probAtPosn=0
+                        break
+                    j+=1
+
+                if probAtPosn == 0:
+                    start+=1
+                    continue
+                startLocsI.append(start)
+                startLocsProb.append(probAtPosn)
+                total += probAtPosn
+                start+=1
+
+            if len(startLocsProb) == 0:
+                continue
+
+            choice = random.random()
+            choiceLoc = choice*total
+            totalToHere = 0        
+            for PrefI in range(len(startLocsProb)):
+                if totalToHere+startLocsProb[PrefI] == 0:
+                    continue
+                if choiceLoc < totalToHere+startLocsProb[PrefI]:
+                    break
+                totalToHere += startLocsProb[PrefI]
+
+            startLocs[SOi] = startLocsI[PrefI]
+            PWMScore[SOi] = startLocsProb[PrefI]
+            newMotif=SOSeq[startLocs[SOi]:startLocs[SOi]+sizeOfMotif]
+            add2PWM (newMotif, PWM)
+
+            NewScores=[]
+            PercentChange = []
+            for i in range(len(sequences)):
+                NewScore = 0
+                Motif_i=sequences[i][startLocs[i]:startLocs[i]+sizeOfMotif]
+                for j in xrange(sizeOfMotif):
+                    NewScore+=PWM[j][NTDIndices[Motif_i[j]]]
+
+                NewScores.append(NewScore)
+                PercentChange.append(math.fabs(NewScore - PWMScore[i])/PWMScore[i])
+
+            TotConsensusScore = sum(NewScores)
+            AveConsensusScore = TotConsensusScore/(len(sequences))
+            if AveConsensusScore > PreviousBestScore:
+                PreviousBestScore = AveConsensusScore
+                PreviousBestTime = 0
+            else:
+                PreviousBestTime += 1
+
+            PWMScore=NewScores[:]
+
+        Consensus=sum([max(PWM[i]) for i in xrange(len(PWM))])
+        if Consensus> BestScore:
+            BestScore=Consensus
+            BestPWM=PWM[:]
+            BestLocs=startLocs[:]
+            if BestScore==maxScore:
+                break
+
+    print "iterated %i times to find"%iterAll
+    print BestPWM
+
+    return(BestPWM,BestScore,BestLocs)
+
+
+def probFromPSFM(sequence, PSFM):    
+    probability = 1
+    for i in range(len (sequence)):
+        probability *= PSFM[i][getNTDIndex[sequence[i]]]
+
+    return probability
\ No newline at end of file