erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / core / protein.py
diff --git a/cistematic/core/protein.py b/cistematic/core/protein.py
new file mode 100644 (file)
index 0000000..e1dc95c
--- /dev/null
@@ -0,0 +1,440 @@
+###########################################################################
+#                                                                         #
+# C O P Y R I G H T   N O T I C E                                         #
+#  Copyright (c) 2003-10 by:                                              #
+#    * California Institute of Technology                                 #
+#                                                                         #
+#    All Rights Reserved.                                                 #
+#                                                                         #
+# Permission is hereby granted, free of charge, to any person             #
+# obtaining a copy of this software and associated documentation files    #
+# (the "Software"), to deal in the Software without restriction,          #
+# including without limitation the rights to use, copy, modify, merge,    #
+# publish, distribute, sublicense, and/or sell copies of the Software,    #
+# and to permit persons to whom the Software is furnished to do so,       #
+# subject to the following conditions:                                    #
+#                                                                         #
+# The above copyright notice and this permission notice shall be          #
+# included in all copies or substantial portions of the Software.         #
+#                                                                         #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
+# SOFTWARE.                                                               #
+###########################################################################
+#
+import string
+import copy
+from math import log
+
+AA = {"TTT": "F",
+      "TTC": "F",
+      "TTA": "L",
+      "TTG": "L",
+      "TCT": "S",
+      "TCC": "S",
+      "TCA": "S",
+      "TCG": "S",
+      "TAT": "Y",
+      "TAC": "Y",
+      "TAA": "*",
+      "TAG": "*",
+      "TGT": "C",
+      "TGC": "C",
+      "TGA": "*",
+      "TGG": "W",
+      "CTT": "L",
+      "CTC": "L",
+      "CTA": "L",
+      "CTG": "L",
+      "CCT": "P",
+      "CCC": "P",
+      "CCA": "P",
+      "CCG": "P",
+      "CAT": "H",
+      "CAC": "H",
+      "CAA": "Q",
+      "CAG": "Q",
+      "CGT": "R",
+      "CGC": "R",
+      "CGA": "R",
+      "CGG": "R",
+      "ATT": "I",
+      "ATC": "I",
+      "ATA": "I",
+      "ATG": "M",
+      "ACT": "T",
+      "ACC": "T",
+      "ACA": "T",
+      "ACG": "T",
+      "AAT": "N",
+      "AAC": "N",
+      "AAA": "K",
+      "AAG": "K",
+      "AGT": "S",
+      "AGC": "S",
+      "AGA": "R",
+      "AGG": "R",
+      "GTT": "V",
+      "GTC": "V",
+      "GTA": "V",
+      "GTG": "V",
+      "GCT": "A",
+      "GCC": "A",
+      "GCA": "A",
+      "GCG": "A",
+      "GAT": "D",
+      "GAC": "D",
+      "GAA": "E",
+      "GAG": "E",
+      "GGT": "G",
+      "GGC": "G",
+      "GGA": "G",
+      "GGG": "G",
+      "---": "-",
+      "NNN": "X"
+}
+
+
+def getAA(codon):
+    """ returns one-letter AA symbol corresponding to codon, if existing.
+        returns X for unknown codon, '-' for a complete gap, and '*' for a 
+        stop codon.
+    """
+    codon = codon.upper()
+    codon = string.replace(codon, "U", "T")
+    try:
+        aa = AA[codon]
+    except KeyError:
+        aa = "X"
+
+    return aa
+
+
+def translate(mRNA, frame=1):
+    """ translate a sequence into protein based on frame (1, 2, 3 only)
+    """
+    prot = ""
+    if frame < 1:
+        frame = 1
+    elif frame > 3:
+        frame = 3
+
+    for nucPos in range(frame - 1, len(mRNA) - 2, 3):
+        theCodon = mRNA[nucPos:nucPos+3]
+        print theCodon
+        theAA = getAA(theCodon)
+        prot += theAA
+
+    return prot
+
+
+def iCodon(codon):
+    """ returns the number of possible synonymous changes at
+        each site of the codon, e.g (1, 0, 3) for CTG (Leucine)
+    """
+    isynonyms = [0, 0, 0]
+    origAA = getAA(codon)
+    if origAA == "X" or origAA == "-":
+        return [-1, -1, -1]
+
+    for pos in range(len(codon)):
+        isyn = 0
+        bases = ["A", "C", "G", "T"]
+        site = codon[pos]
+        bases.remove(site)
+        for nt in bases:
+            newcodon = []
+            for nuc in codon:
+                newcodon.append(nuc)
+
+            newcodon[pos] = nt
+            newcodon = string.join(newcodon, "")
+            newAA = getAA(newcodon)
+            if newAA == origAA:
+                isyn += 1
+
+        isynonyms[pos] = isyn
+
+    return isynonyms
+
+
+def calcMutationTypes(codonList1, codonList2, verbose=False):
+    """ returns the number of (synonymous, nonsynonymous) 
+        mutations in informative codons.
+    """
+    synonymous = 0.0
+    nonsynonymous = 0.0
+    for pos in range(len(codonList1)):
+        diffList = []
+        codon1 = codonList1[pos]
+        codon2 = codonList2[pos]
+        for isite in range(len(codon1)):
+            isite1 = codon1[isite]
+            isite2 = codon2[isite]
+            if isite1 != isite2:
+                diffList.append(isite)
+
+        aa1 = getAA(codon1)
+        aa2 = getAA(codon2)
+        if aa1 == aa2:
+            if aa1 == "S":
+                if codon1[0] != codon2[0]:
+                    if verbose:
+                        print "nonsynonymous Serines"
+
+                    nonsynonymous += 1
+
+                synonymous += 1
+            else:
+                synonymous += len(diffList)
+
+        else:
+            if len(diffList) == 1:
+                nonsynonymous += 1
+            else:
+                if verbose:
+                    print "parsimonious mutation path estimator - assuming 1 parameter"
+                    print "diffList = %s\t%s (%s) \t%s (%s)" % (diffList, codon1, aa1, codon2, aa2)
+
+                if 1 in diffList:
+                    if verbose:
+                        print "middle site is nonsynonymous"
+
+                    nonsynonymous += 1
+                    diffList.remove(1)
+
+                if 2 in diffList:
+                    codon3 = codon1[:-1] + codon1[2]
+                    aa3 = getAA(codon3)
+                    if aa3 == aa1:
+                        if verbose:
+                            print "last site is synonymous"
+
+                        synonymous += 1
+                        diffList.remove(2)
+
+                if 0 in diffList:
+                    codon3 = codon2[0] + codon1[1:]
+                    aa3 = getAA(codon3)
+                    if aa3 == aa1:
+                        if verbose:
+                            print "first site is synonymous"
+
+                        synonymous += 1
+                        diffList.remove(0)
+
+                if len(diffList) > 0:
+                    if verbose:
+                        print "%s must be non-synonymous" % str(diffList)
+
+                    nonsynonymous += len(diffList)
+
+    return (synonymous, nonsynonymous)
+
+
+def calcSubstitutionSites(codonList):
+    """ returns the number of (synonymous, nonsynonymous) sites
+        in a list of codons, which should be filtered for gaps and X's.
+    """
+    synonymous = 0.0
+    nonsynonymous = 0.0
+    for codon in codonList:
+        (site0, site1, site2) = iCodon(codon)
+        if site0 < 0:
+            continue
+
+        synonymous += (site0 + site1 + site2) / 3.0
+        nonsynonymous += (9.0 - site0 - site1 - site2) / 3.0
+
+    return (synonymous, nonsynonymous)
+
+
+def calcSubstitutionsPerSite(codonList1, codonList2):
+    """ returns the number of substitutions per site as 
+        a triplet in comparable or informative sites.
+    """
+    site0 = 0.0
+    site1 = 0.0
+    site2 = 0.0
+    for pos in range(len(codonList1)):
+        codon1 = codonList1[pos]
+        codon2 = codonList2[pos]
+        if codon1[0] != codon2[0]:
+            site0 +=1
+
+        if codon1[1] != codon2[1]:
+            site1 += 1
+
+        if codon1[2] != codon2[2]:
+            site2 += 1
+
+    return (site0, site1, site2)
+
+
+def calcKs(Ms, Ns):
+    """ returns a Ks calculated using Ms and Ns and adjusted using
+        Jukes and Cantor's formula.
+    """
+    Ks = -0.75 * log(1 - ((4.0/3.0) * Ms / Ns))
+    return Ks
+
+
+def calcKa(Ma, Na):
+    """ returns a Ka calculated using Ma and Na and adjusted using
+        Jukes and Cantor's formula.
+    """
+    Ka = -0.75 * log(1 - ((4.0/3.0) * Ma / Na))
+    return Ka
+
+
+def printCDSdict(cdsDict, printAA=True):
+    """ Prints every locus in a cdsDict. Optionally truns off AA translation.
+    """
+    for locus in cdsDict:
+        printCDSlocus(cdsDict, locus, printAA)
+
+
+def printCDSlocus(cdsDict, locus, printAA=True):
+    """ Prints a locus in a given cdsDict. Optionally turns off AA translation.
+    """
+    cdsOutLines = []
+    aaOutLines = []
+    cdsOutLine = locus + "\t"
+    aaOutLine = " " * len(locus) + "\t"
+    for pos in range(len(cdsDict[locus])):
+        if len(cdsOutLine) > 69:
+            cdsOutLines.append(cdsOutLine + cdsDict[locus][pos] + "\n")
+            cdsOutLine = locus + "\t"
+            aaOutLines.append(aaOutLine + " " + getAA(cdsDict[locus][pos]) + " \n")
+            aaOutLine = " " * len(locus) + "\t"
+        else:
+            cdsOutLine += cdsDict[locus][pos] + " " 
+            aaOutLine += " " + getAA(cdsDict[locus][pos]) + "  "
+    cdsOutLines.append(cdsOutLine + "\n")
+    aaOutLines.append(aaOutLine + " \n")
+    for index in range(len(cdsOutLines)):
+        print cdsOutLines[index]
+        if printAA:
+            print aaOutLines[index]
+
+
+def getComparableSites(cdsDict, loci=[]):
+    """ given a cdsDict and a list of loci, returns a new cdsDict with positions with 
+        gaps or X codons in any one sequence deleted in all sequences.
+    """
+    newDict = {}
+    seqArray = []
+    locArray = []
+    deleteList = []
+    index = 0
+    for locus in loci:
+        locArray.append(locus)
+        seqArray.append(copy.deepcopy(cdsDict[locus]))
+
+    for index in range(len(locArray)):
+        for pos in range(len(seqArray[index])):
+            aa = getAA(seqArray[index][pos])
+            if aa == "X" or aa == "-":
+                if pos not in deleteList:
+                    deleteList.append(pos)
+
+    deleteList.sort()
+    deleteList.reverse()
+    for pos in deleteList:
+        for index in range(len(locArray)):
+            del seqArray[index][pos]
+
+    for index in range(len(locArray)):
+        newDict[locArray[index]] = seqArray[index]
+
+    return newDict
+
+
+def getInformativeSites(cdsDict, loci=[]):
+    """ given a cdsDict of comparable sites and a list of loci, returns a new 
+        cdsDict with positions with only codons that differ in one or more 
+        sequences and which are therefore (possibly) informative.
+    """
+    newDict = {}
+    seqArray = []
+    locArray = []
+    deleteList = []
+    index = 0
+    for locus in loci:
+        locArray.append(locus)
+        seqArray.append(copy.deepcopy(cdsDict[locus]))
+
+    for pos in range(len(seqArray[0])):
+        deleteCodon = True
+        refcodon = seqArray[0][pos]
+        for index in range(1, len(locArray)):
+            seqcodon = seqArray[index][pos]
+            if seqcodon != refcodon:
+                deleteCodon = False
+
+        if deleteCodon:
+            deleteList.append(pos)
+
+    deleteList.sort()
+    deleteList.reverse()
+    for pos in deleteList:
+        for index in range(len(locArray)):
+            del seqArray[index][pos]
+
+    for index in range(len(locArray)):
+        newDict[locArray[index]] = seqArray[index]
+
+    return newDict
+
+
+def buildCDSdict(cdsFileName):
+    """ imports a set of *ALIGNED* sequences in a fasta-format file and splits 
+        each sequence into its individual codons. Returns a Dictionary of the 
+        sequences with the fasta ID as the key and the codons in a list.
+    """
+    cdsfile = open(cdsFileName, "r")
+    cdslines = cdsfile.readlines()
+    cdsfile.close()    
+    cdsDict = {}
+    locus = ""
+    for line in cdslines:
+        partialCodon = ""
+        inFrame = True
+        line = line[:-1]
+        if line[0] == ">":
+            fields = line.split(" ")
+            if len(fields[0]) > 1 and fields[0][0] == ">":
+                locus = fields[0][1:]
+            else:
+                locus = fields[1]
+
+            cdsDict[locus] = []
+        else:
+            for pos in range(0, len(line), 3):
+                codon = line[pos:pos+3]
+                if codon == "---":
+                    cdsDict[locus].append("---")
+                elif "-" in codon and inFrame:
+                    inFrame = False
+                    partialCodon = string.replace(codon, "-", "")
+                    cdsDict[locus].append("---")
+                elif not inFrame:
+                    partialCodon += string.replace(codon, "-", "")
+                    if len(partialCodon) == 3:
+                        cdsDict[locus].append(partialCodon)
+                        inFrame = True
+                        partialCodon = ""
+
+                    if len(partialCodon) > 3:
+                        cdsDict[locus].append(partialCodon[:3])
+                        partialCodon = partialCodon[3:]
+                else:
+                    cdsDict[locus].append(codon)
+
+    return cdsDict
\ No newline at end of file