1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
104 """ returns one-letter AA symbol corresponding to codon, if existing.
105 returns X for unknown codon, '-' for a complete gap, and '*' for a
108 codon = codon.upper()
109 codon = string.replace(codon, "U", "T")
118 def translate(mRNA, frame=1):
119 """ translate a sequence into protein based on frame (1, 2, 3 only)
127 for nucPos in range(frame - 1, len(mRNA) - 2, 3):
128 theCodon = mRNA[nucPos:nucPos+3]
130 theAA = getAA(theCodon)
137 """ returns the number of possible synonymous changes at
138 each site of the codon, e.g (1, 0, 3) for CTG (Leucine)
140 isynonyms = [0, 0, 0]
141 origAA = getAA(codon)
142 if origAA == "X" or origAA == "-":
145 for pos in range(len(codon)):
147 bases = ["A", "C", "G", "T"]
156 newcodon = string.join(newcodon, "")
157 newAA = getAA(newcodon)
161 isynonyms[pos] = isyn
166 def calcMutationTypes(codonList1, codonList2, verbose=False):
167 """ returns the number of (synonymous, nonsynonymous)
168 mutations in informative codons.
172 for pos in range(len(codonList1)):
174 codon1 = codonList1[pos]
175 codon2 = codonList2[pos]
176 for isite in range(len(codon1)):
177 isite1 = codon1[isite]
178 isite2 = codon2[isite]
180 diffList.append(isite)
186 if codon1[0] != codon2[0]:
188 print "nonsynonymous Serines"
194 synonymous += len(diffList)
197 if len(diffList) == 1:
201 print "parsimonious mutation path estimator - assuming 1 parameter"
202 print "diffList = %s\t%s (%s) \t%s (%s)" % (diffList, codon1, aa1, codon2, aa2)
206 print "middle site is nonsynonymous"
212 codon3 = codon1[:-1] + codon1[2]
216 print "last site is synonymous"
222 codon3 = codon2[0] + codon1[1:]
226 print "first site is synonymous"
231 if len(diffList) > 0:
233 print "%s must be non-synonymous" % str(diffList)
235 nonsynonymous += len(diffList)
237 return (synonymous, nonsynonymous)
240 def calcSubstitutionSites(codonList):
241 """ returns the number of (synonymous, nonsynonymous) sites
242 in a list of codons, which should be filtered for gaps and X's.
246 for codon in codonList:
247 (site0, site1, site2) = iCodon(codon)
251 synonymous += (site0 + site1 + site2) / 3.0
252 nonsynonymous += (9.0 - site0 - site1 - site2) / 3.0
254 return (synonymous, nonsynonymous)
257 def calcSubstitutionsPerSite(codonList1, codonList2):
258 """ returns the number of substitutions per site as
259 a triplet in comparable or informative sites.
264 for pos in range(len(codonList1)):
265 codon1 = codonList1[pos]
266 codon2 = codonList2[pos]
267 if codon1[0] != codon2[0]:
270 if codon1[1] != codon2[1]:
273 if codon1[2] != codon2[2]:
276 return (site0, site1, site2)
280 """ returns a Ks calculated using Ms and Ns and adjusted using
281 Jukes and Cantor's formula.
283 Ks = -0.75 * log(1 - ((4.0/3.0) * Ms / Ns))
288 """ returns a Ka calculated using Ma and Na and adjusted using
289 Jukes and Cantor's formula.
291 Ka = -0.75 * log(1 - ((4.0/3.0) * Ma / Na))
295 def printCDSdict(cdsDict, printAA=True):
296 """ Prints every locus in a cdsDict. Optionally truns off AA translation.
298 for locus in cdsDict:
299 printCDSlocus(cdsDict, locus, printAA)
302 def printCDSlocus(cdsDict, locus, printAA=True):
303 """ Prints a locus in a given cdsDict. Optionally turns off AA translation.
307 cdsOutLine = locus + "\t"
308 aaOutLine = " " * len(locus) + "\t"
309 for pos in range(len(cdsDict[locus])):
310 if len(cdsOutLine) > 69:
311 cdsOutLines.append(cdsOutLine + cdsDict[locus][pos] + "\n")
312 cdsOutLine = locus + "\t"
313 aaOutLines.append(aaOutLine + " " + getAA(cdsDict[locus][pos]) + " \n")
314 aaOutLine = " " * len(locus) + "\t"
316 cdsOutLine += cdsDict[locus][pos] + " "
317 aaOutLine += " " + getAA(cdsDict[locus][pos]) + " "
319 cdsOutLines.append(cdsOutLine + "\n")
320 aaOutLines.append(aaOutLine + " \n")
321 for index in range(len(cdsOutLines)):
322 print cdsOutLines[index]
324 print aaOutLines[index]
327 def getComparableSites(cdsDict, loci=[]):
328 """ given a cdsDict and a list of loci, returns a new cdsDict with positions with
329 gaps or X codons in any one sequence deleted in all sequences.
337 locArray.append(locus)
338 seqArray.append(copy.deepcopy(cdsDict[locus]))
340 for index in range(len(locArray)):
341 for pos in range(len(seqArray[index])):
342 aa = getAA(seqArray[index][pos])
343 if aa == "X" or aa == "-":
344 if pos not in deleteList:
345 deleteList.append(pos)
349 for pos in deleteList:
350 for index in range(len(locArray)):
351 del seqArray[index][pos]
353 for index in range(len(locArray)):
354 newDict[locArray[index]] = seqArray[index]
359 def getInformativeSites(cdsDict, loci=[]):
360 """ given a cdsDict of comparable sites and a list of loci, returns a new
361 cdsDict with positions with only codons that differ in one or more
362 sequences and which are therefore (possibly) informative.
370 locArray.append(locus)
371 seqArray.append(copy.deepcopy(cdsDict[locus]))
373 for pos in range(len(seqArray[0])):
375 refcodon = seqArray[0][pos]
376 for index in range(1, len(locArray)):
377 seqcodon = seqArray[index][pos]
378 if seqcodon != refcodon:
382 deleteList.append(pos)
386 for pos in deleteList:
387 for index in range(len(locArray)):
388 del seqArray[index][pos]
390 for index in range(len(locArray)):
391 newDict[locArray[index]] = seqArray[index]
396 def buildCDSdict(cdsFileName):
397 """ imports a set of *ALIGNED* sequences in a fasta-format file and splits
398 each sequence into its individual codons. Returns a Dictionary of the
399 sequences with the fasta ID as the key and the codons in a list.
401 cdsfile = open(cdsFileName, "r")
402 cdslines = cdsfile.readlines()
406 for line in cdslines:
411 fields = line.split(" ")
412 if len(fields[0]) > 1 and fields[0][0] == ">":
413 locus = fields[0][1:]
419 for pos in range(0, len(line), 3):
420 codon = line[pos:pos+3]
422 cdsDict[locus].append("---")
423 elif "-" in codon and inFrame:
425 partialCodon = string.replace(codon, "-", "")
426 cdsDict[locus].append("---")
428 partialCodon += string.replace(codon, "-", "")
429 if len(partialCodon) == 3:
430 cdsDict[locus].append(partialCodon)
434 if len(partialCodon) > 3:
435 cdsDict[locus].append(partialCodon[:3])
436 partialCodon = partialCodon[3:]
438 cdsDict[locus].append(codon)