1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 __all__ = ["motif", "homology", "geneinfo", "protein"]
33 from cistematic.genomes import Genome, geneDB
34 import shutil, tempfile, os
35 from os import environ
37 if environ.get("CISTEMATIC_TEMP"):
38 cisTemp = environ.get("CISTEMATIC_TEMP")
42 tempfile.tempdir = cisTemp
50 def cacheGeneDB(genome):
51 """ save a copy of a genome's gene database to a local cache.
53 if genome not in cache:
55 tempgen = "%s.db" % tempfile.mktemp()
56 shutil.copyfile(geneDB[genome], tempgen)
57 cache[genome] = tempgen
59 print "could not cache genome %s" % genome
61 tempgen = cache[genome]
66 def uncacheGeneDB(genome=""):
67 """ remove the local copy of a genome's gene database.
72 os.remove(cache[genome])
74 print "could not delete %s" % cache[genome]
82 print "could not delete %s" % cache[gen]
88 """ return lists of genomes with a gene database in the local cache.
93 def chooseDB(genome, dbfile=""):
94 """ helper function to use genome's gene database from the local cache if present.
97 if dbfile == "" and genome in cache:
98 dbfile = cache[genome]
103 def readChromosome(genome, chrom, db=""):
104 """ return sequence for entire chromosome
106 aGenome = Genome(genome, chrom, dbFile=chooseDB(genome, db))
107 return aGenome.getChromosomeSequence()
110 def getGenomeEntries(genome, db=""):
111 """ return the entries for a given genome.
114 if db == "" and genome in cache:
117 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
118 return (genome, aGenome.allGIDs())
121 def getGenomeGeneIDs(genome, db=""):
122 """ return the entries for a given genome.
125 if db == "" and genome in cache:
128 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
129 return aGenome.allGIDs()
132 def getChromoGeneEntries(chromosome, lowerbound=-1, upperbound=-1, db=""):
133 """ return the entries for a given chromosome.
135 (genome, chromID) = chromosome
136 aGenome = Genome(genome, chromID, dbFile=chooseDB(genome, db))
137 return aGenome.chromGeneEntries(chromID, lowerbound, upperbound)
140 def getChromosomeNames(genome, db="", partition=1, slice=0):
141 """ return the chromosomes for a given genome.
143 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
144 return aGenome.allChromNames(partition, slice)
147 def geneEntry(geneID, db="", version="1"):
148 """ returns (chrom, start, stop, length, sense) for a given geneID
151 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
152 return aGenome.geneInfo(geneID, version)
156 """ returns the complementary basepair to base nt
158 compDict = {"A": "T", "T": "A",
173 return compDict.get(nt, "N")
176 def complement(sequence, length=-1):
177 """ returns the complement of the sequence.
180 seqLength = len(sequence)
181 if length == seqLength or length < 0:
182 seqList = list(sequence)
184 return "".join(map(compNT, seqList))
186 for index in range(seqLength - 1,seqLength - length - 1, -1):
188 newSeq += compNT(sequence[index])
195 def upstreamToNextGene(geneID, radius, version="1", db=""):
196 """ return distance to gene immediately upstream.
200 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
202 if aGenome.checkGene(geneID):
203 (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID, version)
205 upstream = aGenome.leftGeneDistance(geneID, upstream, version)
207 upstream = aGenome.rightGeneDistance(geneID, upstream, version)
214 def downstreamToNextGene(geneID, radius, version="1", db=""):
215 """ return distance to gene immediately downstream.
219 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
222 if aGenome.checkGene(geneID):
223 (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID, version)
225 downstream = aGenome.rightGeneDistance(geneID, downstream, version)
227 downstream = aGenome.leftGeneDistance(geneID, downstream, version)
234 def retrieveFeatures(match, radius, featureType="", db=""):
235 """ return the features around a given match.
237 (chromosome, hit) = match
238 (genome, chromID) = chromosome
239 lowerboundHit = int(hit[0]) - int(radius)
240 if lowerboundHit < 0:
243 aGenome = Genome(genome, chromID, dbFile=chooseDB(genome, db))
244 results = aGenome.getFeaturesIntersecting(chromID, lowerboundHit, 2 * int(radius), featureType)
249 def retrieveSeqFeatures(geneID, upstream, cds, downstream, boundToNextGene = False, geneDB=""):
250 """ retrieve CDS features upstream, all or none of the cds, and downstream of a geneID.
251 Feature positions are normalized and truncated to local sequence coordinates.
254 (genome, gID) = geneID
255 aGenome = Genome(genome, dbFile=chooseDB(genome, geneDB))
259 if aGenome.checkGene(geneID):
260 (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID)
267 # figure out normalized seqstart and seqstop
270 upstream = aGenome.leftGeneDistance(geneID, upstream)
272 seqstart = start - upstream
287 downstream = aGenome.rightGeneDistance(geneID, downstream)
295 allresults = aGenome.getFeaturesIntersecting(chrom, seqstart, seqlen, "CDS")
296 for entry in allresults:
297 (fname, fversion, fchromosome, fstart, fstop, forientation, ftype) = entry
302 forstart = fstart - seqstart # normalize
303 if forstart < 0: # truncate
306 forstop = fstop - seqstart # normalize
307 if forstop > seqlen: # truncate
310 if (ftype, forstart, forstop, forientation) not in results:
311 results.append((ftype, forstart, forstop, forientation))
313 # figure out normalized seqstart and seqstop
316 upstream = aGenome.rightGeneDistance(geneID, upstream)
318 seqstart = stop + upstream
329 downstream = aGenome.leftGeneDistance(geneID, downstream)
337 allresults = aGenome.getFeaturesIntersecting(chrom, seqstart - seqlen, seqlen, "CDS")
338 for entry in allresults:
339 (fname, fversion, fchromosome, fstart, fstop, forientation, ftype) = entry
344 revstart = seqstart - fstop
348 revstop = seqstart - fstart
352 if (ftype, revstart, revstop, forientation) not in results:
353 results.append((ftype, revstart, revstop, forientation))
360 def getFeaturesIntersecting(genome, chrom, start, length, db="", ftype="CDS"):
361 """ return features of type ftype that fall within the given region.
363 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
364 return aGenome.getFeaturesIntersecting(chrom, start, length, ftype)
367 def retrieveSequence(genome, chrom, start, stop, sense="F", db=""):
368 """ retrieve a sequence given a genome, chromosome, start, stop, and sense.
371 length = abs(stop - start) + 1
373 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
380 sequence = aGenome.sequence(chrom, seqStart, length)
384 entrySeq= aGenome.sequence(chrom, seqStart, length)
387 print "Couldn't retrieve sequence %s %s %s %s %s" % (genome, chrom, start, stop, sense)
392 def retrieveCDS(geneID, maskCDS=False, maskLower=False, db="", version="1"):
393 """ retrieveCDS() - retrieve a sequence given a gene identifier
397 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
399 if aGenome.checkGene(geneID):
400 entrySeq = aGenome.geneSeq(geneID, maskCDS, maskLower, version)
402 print "Could not find %s " % str(geneID)
407 def retrieveUpstream(geneID, upstream, maskCDS=False, maskLower=False, boundToNextGene=False, db="", version="1"):
408 """ retrieve sequence 5' of cds of length upstream for a given a gene identifier
412 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
414 if aGenome.checkGene(geneID):
415 (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID, version)
418 upstream = aGenome.leftGeneDistance(geneID, upstream, version)
420 if (start - upstream) > 1:
421 seqStart = start - upstream - 1
428 upstream = aGenome.rightGeneDistance(geneID, upstream, version)
433 sequence = aGenome.sequence(chrom, seqStart, seqLength, maskCDS, maskLower)
434 # do CDS masking here....
438 entrySeq = complement(sequence, upstream)
441 print "Couldn't find ", geneID
446 def retrieveDownstream(geneID, downstream, maskCDS=False, maskLower=False, boundToNextGene=False, db="", version="1"):
447 """ retrieve sequence 3' of CDS of length downstream for a given a gene identifier
451 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
453 if aGenome.checkGene(geneID):
454 (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID, version)
457 downstream = aGenome.rightGeneDistance(geneID, downstream, version)
460 seqLength = downstream + 1
463 downstream = aGenome.leftGeneDistance(geneID, downstream, version)
465 if (start - downstream) > 1:
466 seqStart = start - downstream
467 seqLength = downstream
472 sequence = aGenome.sequence(chrom, seqStart, seqLength, maskCDS, maskLower)
473 # do CDS masking here
477 entrySeq = complement(sequence, downstream)
482 def retrieveSeq(geneID, upstream, cds, downstream, geneDB="", maskLower = False, boundToNextGene = False, version="1"):
483 """ retrieve upstream, all or none of the cds, and downstream of a geneID
492 geneSeq += retrieveUpstream(geneID, upstream, maskCDS, maskLower, boundToNextGene, geneDB, version)
495 geneSeq += retrieveCDS(geneID, maskCDS, maskLower, geneDB, version)
498 geneSeq += retrieveDownstream(geneID, downstream, maskCDS, maskLower, boundToNextGene, geneDB, version)
500 if len(geneSeq) == 0:
501 print "retrieveSeq Warning: retrieved null sequence for %s: %s (splice form %s) from geneDB %s" % (geneID[0], geneID[1], version, geneDB)
506 def retrieveAll(genome, genes, upstream, downstream, outputFilePath):
507 """ retrieve set of upstream and downstrean sequences for a list of genes in a genome and save them to a file.
509 outFile = open(outputFilePath, "w")
511 print "Processing " , gene
512 outFile.write("> %s \n" % (gene))
513 geneID = (genome, gene)
514 outFile.write("%s\n" % retrieveSeq(geneID, upstream, 0, downstream))
519 def fasta(geneID, seq):
520 """ write a fasta formated seq with geneID in the header.
522 fastaString = "> %s-%s\n%s\n" % (geneID[0],geneID[1], seq)
527 def loadGOInfo(genome, db=""):
528 """ load GO for a given genome
530 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
531 if genome not in goDict.keys():
532 goDict[genome] = aGenome.allGOInfo()
535 def getGOInfo(geneID, db=""):
536 """ retrieve GO info for geneID
538 (genome, locus) = geneID
539 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
541 return aGenome.goInfo(geneID)
546 def getGOIDCount(genome, GOID, db=""):
547 """ retrieve count of genes with a particular GOID.
549 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
551 return aGenome.getGOIDCount(GOID)
556 def allGOTerms(genome, db=""):
557 """ return all GO Terms.
559 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
561 return aGenome.allGOterms()
566 def getAllGOInfo(genome, db=""):
567 """ return all GO Info.
569 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
571 return aGenome.allGoInfo()
576 def loadAnnotInfo(genome, db=""):
577 """ load Annotations for a given genome
579 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
580 if genome not in annotDict.keys():
581 annotDict[genome] = aGenome.allAnnotInfo()
584 def getAnnotInfo(geneID, db=""):
585 """ retrieve Annotations for a given geneID
587 (genome, locus) = geneID
588 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
590 return aGenome.annotInfo(geneID)
595 def getAllAnnotInfo(genome, db=""):
596 """ return all Annotation Info.
598 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
600 return aGenome.allAnnotInfo()
605 def sanitize(inSeq, windowSize=16):
606 """ make sure that very simple repeats are out of the sequence.
607 Soft-mask any window that has windowSize - 2 of mononucleotides
608 and (windowSize / 2) - 1 non-GC dinucleotides.
611 outSeq = list(inSeq.upper())
612 winmin2 = windowSize - 2
613 winhalf = windowSize/2 - 1
614 for pos in range(seqlen - windowSize):
615 window = inSeq[pos:pos + windowSize].upper()
616 if window.count("A") > winmin2 or window.count("C") > winmin2 or window.count("G") > winmin2 or window.count("T") > winmin2:
617 for index in range(windowSize):
618 outSeq[pos + index] = outSeq[pos + index].lower()
620 if window.count("AC") >= winhalf or window.count("AG") >= winhalf or window.count("AT") >= winhalf or window.count("CT") >= winhalf or window.count("GT") >= winhalf or window.count("TA") >= winhalf or window.count("TC") >= winhalf or window.count("TG") >= winhalf or window.count("GA") > winhalf or window.count("CA") > winhalf:
621 for index in range(windowSize):
622 outSeq[pos + index] = outSeq[pos + index].lower()
624 return "".join(outSeq)
627 def featuresIntersecting(genome, posList, radius, ftype, name="", chrom="", version="", db="", extendGen="", replaceMod=False):
628 """ returns a dictionary of matching features to positions of the double form (chromosome, position).
629 Only positions with features within radius are returned.
633 aGenome = Genome(genome, dbFile=chooseDB(genome, db), inRAM=True)
634 aGenome.extendFeatures(extendGen, replace = replaceMod)
636 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
638 features = aGenome.getFeatures(ftype, name, chrom, version)
639 if len(posList) < 1 or len(features) < 1:
642 chromList = features.keys()
643 for (chrom, pos) in posList:
645 if chrom not in chromList:
648 for (name, version, chromosome, start, stop, orientation, atype) in features[chrom]:
649 if (pos + radius) < start or (pos - radius) > stop:
652 tempList.append((name, version, chromosome, start, stop, orientation, atype))
654 if len(tempList) > 0:
655 resultDict[(chrom, pos)] = tempList
660 def genesIntersecting(genome, posList, name="", chrom="", version="", db="", flank=0, extendGen="", replaceMod=False):
661 """ returns a dictionary of matching genes to positions of the double form (chromosome, position).
662 Only positions with features within radius are returned.
666 aGenome = Genome(genome, dbFile=chooseDB(genome, db), inRAM=True)
667 aGenome.extendFeatures(extendGen, replace = replaceMod)
669 aGenome = Genome(genome, dbFile=chooseDB(genome, db))
671 genes = aGenome.getallGeneInfo(name, chrom, version)
672 if len(posList) < 1 or len(genes) < 1:
675 chromList = genes.keys()
676 for (chrom, pos) in posList:
678 if chrom not in chromList:
681 for (name, chromosome, start, stop, orientation) in genes[chrom]:
682 if start-flank <= pos <= stop+flank:
683 tempList.append((name, "noversion", chromosome, start, stop, orientation))
685 if len(tempList) > 0:
686 resultDict[(chrom, pos)] = tempList