cistematic/core/protein.py

   1 ###########################################################################
   2 #                                                                         #
   3 # C O P Y R I G H T   N O T I C E                                         #
   4 #  Copyright (c) 2003-10 by:                                              #
   5 #    * California Institute of Technology                                 #
   6 #                                                                         #
   7 #    All Rights Reserved.                                                 #
   8 #                                                                         #
   9 # Permission is hereby granted, free of charge, to any person             #
  10 # obtaining a copy of this software and associated documentation files    #
  11 # (the "Software"), to deal in the Software without restriction,          #
  12 # including without limitation the rights to use, copy, modify, merge,    #
  13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
  14 # and to permit persons to whom the Software is furnished to do so,       #
  15 # subject to the following conditions:                                    #
  16 #                                                                         #
  17 # The above copyright notice and this permission notice shall be          #
  18 # included in all copies or substantial portions of the Software.         #
  19 #                                                                         #
  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
  21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
  22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
  23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
  24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
  25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
  26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
  27 # SOFTWARE.                                                               #
  28 ###########################################################################
  29 #
  30 import string
  31 import copy
  32 from math import log
  33
  34 AA = {"TTT": "F",
  35       "TTC": "F",
  36       "TTA": "L",
  37       "TTG": "L",
  38       "TCT": "S",
  39       "TCC": "S",
  40       "TCA": "S",
  41       "TCG": "S",
  42       "TAT": "Y",
  43       "TAC": "Y",
  44       "TAA": "*",
  45       "TAG": "*",
  46       "TGT": "C",
  47       "TGC": "C",
  48       "TGA": "*",
  49       "TGG": "W",
  50       "CTT": "L",
  51       "CTC": "L",
  52       "CTA": "L",
  53       "CTG": "L",
  54       "CCT": "P",
  55       "CCC": "P",
  56       "CCA": "P",
  57       "CCG": "P",
  58       "CAT": "H",
  59       "CAC": "H",
  60       "CAA": "Q",
  61       "CAG": "Q",
  62       "CGT": "R",
  63       "CGC": "R",
  64       "CGA": "R",
  65       "CGG": "R",
  66       "ATT": "I",
  67       "ATC": "I",
  68       "ATA": "I",
  69       "ATG": "M",
  70       "ACT": "T",
  71       "ACC": "T",
  72       "ACA": "T",
  73       "ACG": "T",
  74       "AAT": "N",
  75       "AAC": "N",
  76       "AAA": "K",
  77       "AAG": "K",
  78       "AGT": "S",
  79       "AGC": "S",
  80       "AGA": "R",
  81       "AGG": "R",
  82       "GTT": "V",
  83       "GTC": "V",
  84       "GTA": "V",
  85       "GTG": "V",
  86       "GCT": "A",
  87       "GCC": "A",
  88       "GCA": "A",
  89       "GCG": "A",
  90       "GAT": "D",
  91       "GAC": "D",
  92       "GAA": "E",
  93       "GAG": "E",
  94       "GGT": "G",
  95       "GGC": "G",
  96       "GGA": "G",
  97       "GGG": "G",
  98       "---": "-",
  99       "NNN": "X"
 100 }
 101
 102
 103 def getAA(codon):
 104     """ returns one-letter AA symbol corresponding to codon, if existing.
 105         returns X for unknown codon, '-' for a complete gap, and '*' for a
 106         stop codon.
 107     """
 108     codon = codon.upper()
 109     codon = string.replace(codon, "U", "T")
 110     try:
 111         aa = AA[codon]
 112     except KeyError:
 113         aa = "X"
 114
 115     return aa
 116
 117
 118 def translate(mRNA, frame=1):
 119     """ translate a sequence into protein based on frame (1, 2, 3 only)
 120     """
 121     prot = ""
 122     if frame < 1:
 123         frame = 1
 124     elif frame > 3:
 125         frame = 3
 126
 127     for nucPos in range(frame - 1, len(mRNA) - 2, 3):
 128         theCodon = mRNA[nucPos:nucPos+3]
 129         print theCodon
 130         theAA = getAA(theCodon)
 131         prot += theAA
 132
 133     return prot
 134
 135
 136 def iCodon(codon):
 137     """ returns the number of possible synonymous changes at
 138         each site of the codon, e.g (1, 0, 3) for CTG (Leucine)
 139     """
 140     isynonyms = [0, 0, 0]
 141     origAA = getAA(codon)
 142     if origAA == "X" or origAA == "-":
 143         return [-1, -1, -1]
 144
 145     for pos in range(len(codon)):
 146         isyn = 0
 147         bases = ["A", "C", "G", "T"]
 148         site = codon[pos]
 149         bases.remove(site)
 150         for nt in bases:
 151             newcodon = []
 152             for nuc in codon:
 153                 newcodon.append(nuc)
 154
 155             newcodon[pos] = nt
 156             newcodon = string.join(newcodon, "")
 157             newAA = getAA(newcodon)
 158             if newAA == origAA:
 159                 isyn += 1
 160
 161         isynonyms[pos] = isyn
 162
 163     return isynonyms
 164
 165
 166 def calcMutationTypes(codonList1, codonList2, verbose=False):
 167     """ returns the number of (synonymous, nonsynonymous)
 168         mutations in informative codons.
 169     """
 170     synonymous = 0.0
 171     nonsynonymous = 0.0
 172     for pos in range(len(codonList1)):
 173         diffList = []
 174         codon1 = codonList1[pos]
 175         codon2 = codonList2[pos]
 176         for isite in range(len(codon1)):
 177             isite1 = codon1[isite]
 178             isite2 = codon2[isite]
 179             if isite1 != isite2:
 180                 diffList.append(isite)
 181
 182         aa1 = getAA(codon1)
 183         aa2 = getAA(codon2)
 184         if aa1 == aa2:
 185             if aa1 == "S":
 186                 if codon1[0] != codon2[0]:
 187                     if verbose:
 188                         print "nonsynonymous Serines"
 189
 190                     nonsynonymous += 1
 191
 192                 synonymous += 1
 193             else:
 194                 synonymous += len(diffList)
 195
 196         else:
 197             if len(diffList) == 1:
 198                 nonsynonymous += 1
 199             else:
 200                 if verbose:
 201                     print "parsimonious mutation path estimator - assuming 1 parameter"
 202                     print "diffList = %s\t%s (%s) \t%s (%s)" % (diffList, codon1, aa1, codon2, aa2)
 203
 204                 if 1 in diffList:
 205                     if verbose:
 206                         print "middle site is nonsynonymous"
 207
 208                     nonsynonymous += 1
 209                     diffList.remove(1)
 210
 211                 if 2 in diffList:
 212                     codon3 = codon1[:-1] + codon1[2]
 213                     aa3 = getAA(codon3)
 214                     if aa3 == aa1:
 215                         if verbose:
 216                             print "last site is synonymous"
 217
 218                         synonymous += 1
 219                         diffList.remove(2)
 220
 221                 if 0 in diffList:
 222                     codon3 = codon2[0] + codon1[1:]
 223                     aa3 = getAA(codon3)
 224                     if aa3 == aa1:
 225                         if verbose:
 226                             print "first site is synonymous"
 227
 228                         synonymous += 1
 229                         diffList.remove(0)
 230
 231                 if len(diffList) > 0:
 232                     if verbose:
 233                         print "%s must be non-synonymous" % str(diffList)
 234
 235                     nonsynonymous += len(diffList)
 236
 237     return (synonymous, nonsynonymous)
 238
 239
 240 def calcSubstitutionSites(codonList):
 241     """ returns the number of (synonymous, nonsynonymous) sites
 242         in a list of codons, which should be filtered for gaps and X's.
 243     """
 244     synonymous = 0.0
 245     nonsynonymous = 0.0
 246     for codon in codonList:
 247         (site0, site1, site2) = iCodon(codon)
 248         if site0 < 0:
 249             continue
 250
 251         synonymous += (site0 + site1 + site2) / 3.0
 252         nonsynonymous += (9.0 - site0 - site1 - site2) / 3.0
 253
 254     return (synonymous, nonsynonymous)
 255
 256
 257 def calcSubstitutionsPerSite(codonList1, codonList2):
 258     """ returns the number of substitutions per site as
 259         a triplet in comparable or informative sites.
 260     """
 261     site0 = 0.0
 262     site1 = 0.0
 263     site2 = 0.0
 264     for pos in range(len(codonList1)):
 265         codon1 = codonList1[pos]
 266         codon2 = codonList2[pos]
 267         if codon1[0] != codon2[0]:
 268             site0 +=1
 269
 270         if codon1[1] != codon2[1]:
 271             site1 += 1
 272
 273         if codon1[2] != codon2[2]:
 274             site2 += 1
 275
 276     return (site0, site1, site2)
 277
 278
 279 def calcKs(Ms, Ns):
 280     """ returns a Ks calculated using Ms and Ns and adjusted using
 281         Jukes and Cantor's formula.
 282     """
 283     Ks = -0.75 * log(1 - ((4.0/3.0) * Ms / Ns))
 284     return Ks
 285
 286
 287 def calcKa(Ma, Na):
 288     """ returns a Ka calculated using Ma and Na and adjusted using
 289         Jukes and Cantor's formula.
 290     """
 291     Ka = -0.75 * log(1 - ((4.0/3.0) * Ma / Na))
 292     return Ka
 293
 294
 295 def printCDSdict(cdsDict, printAA=True):
 296     """ Prints every locus in a cdsDict. Optionally truns off AA translation.
 297     """
 298     for locus in cdsDict:
 299         printCDSlocus(cdsDict, locus, printAA)
 300
 301
 302 def printCDSlocus(cdsDict, locus, printAA=True):
 303     """ Prints a locus in a given cdsDict. Optionally turns off AA translation.
 304     """
 305     cdsOutLines = []
 306     aaOutLines = []
 307     cdsOutLine = locus + "\t"
 308     aaOutLine = " " * len(locus) + "\t"
 309     for pos in range(len(cdsDict[locus])):
 310         if len(cdsOutLine) > 69:
 311             cdsOutLines.append(cdsOutLine + cdsDict[locus][pos] + "\n")
 312             cdsOutLine = locus + "\t"
 313             aaOutLines.append(aaOutLine + " " + getAA(cdsDict[locus][pos]) + " \n")
 314             aaOutLine = " " * len(locus) + "\t"
 315         else:
 316             cdsOutLine += cdsDict[locus][pos] + " "
 317             aaOutLine += " " + getAA(cdsDict[locus][pos]) + "  "
 318
 319     cdsOutLines.append(cdsOutLine + "\n")
 320     aaOutLines.append(aaOutLine + " \n")
 321     for index in range(len(cdsOutLines)):
 322         print cdsOutLines[index]
 323         if printAA:
 324             print aaOutLines[index]
 325
 326
 327 def getComparableSites(cdsDict, loci=[]):
 328     """ given a cdsDict and a list of loci, returns a new cdsDict with positions with
 329         gaps or X codons in any one sequence deleted in all sequences.
 330     """
 331     newDict = {}
 332     seqArray = []
 333     locArray = []
 334     deleteList = []
 335     index = 0
 336     for locus in loci:
 337         locArray.append(locus)
 338         seqArray.append(copy.deepcopy(cdsDict[locus]))
 339
 340     for index in range(len(locArray)):
 341         for pos in range(len(seqArray[index])):
 342             aa = getAA(seqArray[index][pos])
 343             if aa == "X" or aa == "-":
 344                 if pos not in deleteList:
 345                     deleteList.append(pos)
 346
 347     deleteList.sort()
 348     deleteList.reverse()
 349     for pos in deleteList:
 350         for index in range(len(locArray)):
 351             del seqArray[index][pos]
 352
 353     for index in range(len(locArray)):
 354         newDict[locArray[index]] = seqArray[index]
 355
 356     return newDict
 357
 358
 359 def getInformativeSites(cdsDict, loci=[]):
 360     """ given a cdsDict of comparable sites and a list of loci, returns a new
 361         cdsDict with positions with only codons that differ in one or more
 362         sequences and which are therefore (possibly) informative.
 363     """
 364     newDict = {}
 365     seqArray = []
 366     locArray = []
 367     deleteList = []
 368     index = 0
 369     for locus in loci:
 370         locArray.append(locus)
 371         seqArray.append(copy.deepcopy(cdsDict[locus]))
 372
 373     for pos in range(len(seqArray[0])):
 374         deleteCodon = True
 375         refcodon = seqArray[0][pos]
 376         for index in range(1, len(locArray)):
 377             seqcodon = seqArray[index][pos]
 378             if seqcodon != refcodon:
 379                 deleteCodon = False
 380
 381         if deleteCodon:
 382             deleteList.append(pos)
 383
 384     deleteList.sort()
 385     deleteList.reverse()
 386     for pos in deleteList:
 387         for index in range(len(locArray)):
 388             del seqArray[index][pos]
 389
 390     for index in range(len(locArray)):
 391         newDict[locArray[index]] = seqArray[index]
 392
 393     return newDict
 394
 395
 396 def buildCDSdict(cdsFileName):
 397     """ imports a set of *ALIGNED* sequences in a fasta-format file and splits
 398         each sequence into its individual codons. Returns a Dictionary of the
 399         sequences with the fasta ID as the key and the codons in a list.
 400     """
 401     cdsfile = open(cdsFileName, "r")
 402     cdslines = cdsfile.readlines()
 403     cdsfile.close()
 404     cdsDict = {}
 405     locus = ""
 406     for line in cdslines:
 407         partialCodon = ""
 408         inFrame = True
 409         line = line[:-1]
 410         if line[0] == ">":
 411             fields = line.split(" ")
 412             if len(fields[0]) > 1 and fields[0][0] == ">":
 413                 locus = fields[0][1:]
 414             else:
 415                 locus = fields[1]
 416
 417             cdsDict[locus] = []
 418         else:
 419             for pos in range(0, len(line), 3):
 420                 codon = line[pos:pos+3]
 421                 if codon == "---":
 422                     cdsDict[locus].append("---")
 423                 elif "-" in codon and inFrame:
 424                     inFrame = False
 425                     partialCodon = string.replace(codon, "-", "")
 426                     cdsDict[locus].append("---")
 427                 elif not inFrame:
 428                     partialCodon += string.replace(codon, "-", "")
 429                     if len(partialCodon) == 3:
 430                         cdsDict[locus].append(partialCodon)
 431                         inFrame = True
 432                         partialCodon = ""
 433
 434                     if len(partialCodon) > 3:
 435                         cdsDict[locus].append(partialCodon[:3])
 436                         partialCodon = partialCodon[3:]
 437                 else:
 438                     cdsDict[locus].append(codon)
 439
 440     return cdsDict