erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / core / __init__.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 __all__ = ["motif", "homology", "geneinfo", "protein"]
31
32 import cistematic
33 from cistematic.genomes import Genome, geneDB
34 import shutil, tempfile, os
35 from os import environ
36
37 if environ.get("CISTEMATIC_TEMP"):
38     cisTemp = environ.get("CISTEMATIC_TEMP")
39 else:
40     cisTemp = "/tmp"
41
42 tempfile.tempdir = cisTemp
43
44 goDict = {}
45 annotDict = {}
46 global cache 
47 cache = {}
48
49
50 def cacheGeneDB(genome):
51     """ save a copy of a genome's gene database to a local cache.
52     """
53     if genome not in cache:
54         try:
55             tempgen = "%s.db" % tempfile.mktemp()
56             shutil.copyfile(geneDB[genome], tempgen)
57             cache[genome] = tempgen
58         except:
59             print "could not cache genome %s" % genome
60     else:
61         tempgen = cache[genome]
62
63     return tempgen
64
65
66 def uncacheGeneDB(genome=""):
67     """ remove the local copy of a genome's gene database.
68     """
69     global cache
70     if genome in cache:
71         try:
72             os.remove(cache[genome])
73         except:
74             print "could not delete %s" % cache[genome]
75
76         del cache[genome]
77     else:
78         for gen in cache:
79             try:
80                 os.remove(cache[gen])
81             except:
82                 print "could not delete %s" % cache[gen]
83
84         cache = {}
85
86
87 def cachedGenomes():
88     """ return lists of genomes with a gene database in the local cache.
89     """
90     return cache.keys()
91
92
93 def chooseDB(genome, dbfile=""):
94     """ helper function to use genome's gene database from the local cache if present.
95     """
96     global cache
97     if dbfile == "" and genome in cache:
98         dbfile = cache[genome]
99
100     return dbfile
101
102
103 def readChromosome(genome, chrom, db=""):
104     """ return sequence for entire chromosome
105     """
106     aGenome = Genome(genome, chrom, dbFile=chooseDB(genome, db))
107     return aGenome.getChromosomeSequence()
108
109
110 def getGenomeEntries(genome, db=""):
111     """ return the entries for a given genome. 
112     """
113     global cache
114     if db == "" and genome in cache:
115         db = cache[genome]
116
117     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
118     return (genome, aGenome.allGIDs())
119
120
121 def getGenomeGeneIDs(genome, db=""):
122     """ return the entries for a given genome. 
123     """
124     global cache
125     if db == "" and genome in cache:
126         db = cache[genome]
127
128     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
129     return aGenome.allGIDs()
130
131
132 def getChromoGeneEntries(chromosome, lowerbound=-1, upperbound=-1, db=""):
133     """ return the entries for a given chromosome.
134     """
135     (genome, chromID) = chromosome
136     aGenome = Genome(genome, chromID, dbFile=chooseDB(genome, db))
137     return aGenome.chromGeneEntries(chromID, lowerbound, upperbound)
138
139
140 def getChromosomeNames(genome, db="", partition=1, slice=0):
141     """ return the chromosomes for a given genome.
142     """
143     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
144     return aGenome.allChromNames(partition, slice)
145
146
147 def geneEntry(geneID, db="", version="1"):
148     """ returns (chrom, start, stop, length, sense) for a given geneID
149     """
150     genome = geneID[0]
151     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
152     return aGenome.geneInfo(geneID, version)
153
154
155 def compNT(nt):
156     """ returns the complementary basepair to base nt
157     """
158     compDict = {"A": "T", "T": "A",
159                 "G": "C", "C": "G",
160                 "S": "S",
161                 "W": "W",
162                 "R": "Y", "Y": "R",
163                 "M": "K", "K": "M",
164                 "H": "D", "D": "H",
165                 "B": "V", "V": "B",
166                 "N": "N",
167                 "a": "t", "t": "a",
168                 "g": "c", "c": "g",
169                 "n": "n",
170                 "z": "z"
171     }
172
173     return compDict.get(nt, "N")
174
175
176 def complement(sequence, length=-1):
177     """ returns the complement of the sequence.
178     """
179     newSeq = ""
180     seqLength = len(sequence)
181     if length == seqLength or length < 0:
182         seqList = list(sequence)
183         seqList.reverse()
184         return "".join(map(compNT, seqList))
185
186     for index in range(seqLength - 1,seqLength - length - 1, -1):
187         try:
188             newSeq += compNT(sequence[index])
189         except:
190             newSeq += "N"
191
192     return newSeq
193
194
195 def upstreamToNextGene(geneID, radius, version="1", db=""):
196     """ return distance to gene immediately upstream.
197     """
198     upstream = radius
199     genome = geneID[0]
200     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
201     try:
202         if aGenome.checkGene(geneID):
203             (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID, version)
204             if sense == "F":
205                 upstream = aGenome.leftGeneDistance(geneID, upstream, version)
206             else:
207                 upstream = aGenome.rightGeneDistance(geneID, upstream, version)
208     except:
209         pass
210
211     return upstream
212
213
214 def downstreamToNextGene(geneID, radius, version="1", db=""):
215     """ return distance to gene immediately downstream.
216     """
217     downstream = radius
218     genome = geneID[0]
219     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
220
221     try:
222         if aGenome.checkGene(geneID):
223             (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID, version)
224             if sense == "F":
225                 downstream = aGenome.rightGeneDistance(geneID, downstream, version)
226             else:
227                 downstream = aGenome.leftGeneDistance(geneID, downstream, version)
228     except:
229         pass
230
231     return downstream
232
233
234 def retrieveFeatures(match, radius, featureType="", db=""):
235     """ return the features around a given match.
236     """
237     (chromosome, hit) = match
238     (genome, chromID) = chromosome
239     lowerboundHit = int(hit[0]) - int(radius)
240     if lowerboundHit < 0:
241         lowerboundHit = 0
242
243     aGenome = Genome(genome, chromID, dbFile=chooseDB(genome, db))
244     results = aGenome.getFeaturesIntersecting(chromID, lowerboundHit, 2 * int(radius), featureType)
245
246     return results
247
248
249 def retrieveSeqFeatures(geneID, upstream, cds, downstream, boundToNextGene = False, geneDB=""):
250     """ retrieve CDS features upstream, all or none of the cds, and downstream of a geneID.
251         Feature positions are normalized and truncated to local sequence coordinates.
252     """
253     results = []
254     (genome, gID) = geneID
255     aGenome = Genome(genome, dbFile=chooseDB(genome, geneDB))
256     if True:
257         seqstart = 0
258         seqlen = 0
259         if aGenome.checkGene(geneID):
260             (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID)
261             if stop < start:
262                 pos = stop
263                 stop = start
264                 start = pos
265
266             if sense == "F":
267                 # figure out normalized seqstart and seqstop
268                 if upstream > 0:
269                     if boundToNextGene:
270                         upstream = aGenome.leftGeneDistance(geneID, upstream)
271
272                     seqstart = start - upstream
273                     if seqstart < 0:
274                         seqstart = 0
275                         upstream = start
276
277                     seqlen = upstream
278
279                 if cds > 0:
280                     if seqlen == 0:
281                         seqstart = start
282
283                     seqlen += length
284
285                 if downstream > 0:
286                     if boundToNextGene:
287                         downstream = aGenome.rightGeneDistance(geneID, downstream)
288
289                     if seqlen == 0:
290                         seqstart = stop
291
292                     seqlen += downstream
293
294                 # process features
295                 allresults = aGenome.getFeaturesIntersecting(chrom, seqstart, seqlen, "CDS")
296                 for entry in allresults:
297                     (fname, fversion, fchromosome, fstart, fstop, forientation, ftype) = entry
298                     if fstop < fstart:
299                         fstop = fstart
300                         fstart = fstop
301
302                     forstart = fstart - seqstart       # normalize
303                     if forstart < 0:                # truncate
304                         forstart = 0
305
306                     forstop = fstop - seqstart # normalize
307                     if forstop > seqlen:  # truncate
308                         forstop = seqlen
309
310                     if (ftype, forstart, forstop, forientation) not in results:
311                         results.append((ftype, forstart, forstop, forientation))
312             else:
313                 # figure out normalized seqstart and seqstop
314                 if upstream > 0:
315                     if boundToNextGene:
316                         upstream = aGenome.rightGeneDistance(geneID, upstream)
317
318                     seqstart = stop + upstream
319                     seqlen = upstream
320
321                 if cds > 0:
322                     if seqlen == 0:
323                         seqstart = stop
324
325                     seqlen += length
326
327                 if downstream > 0:
328                     if boundToNextGene:
329                         downstream = aGenome.leftGeneDistance(geneID, downstream)
330
331                     if seqlen == 0:
332                         seqstart = start
333
334                     seqlen += downstream
335
336                 # process features
337                 allresults = aGenome.getFeaturesIntersecting(chrom, seqstart - seqlen, seqlen, "CDS")
338                 for entry in allresults:
339                     (fname, fversion, fchromosome, fstart, fstop, forientation, ftype) = entry
340                     if fstop < fstart:
341                         fstop = fstart
342                         fstart = fstop
343
344                     revstart = seqstart - fstop
345                     if revstart < 0:
346                         revstart = 0
347
348                     revstop = seqstart - fstart
349                     if revstop > seqlen:
350                         fstop = seqlen
351
352                     if (ftype, revstart, revstop, forientation) not in results:
353                         results.append((ftype, revstart, revstop, forientation))
354     else:
355         pass
356
357     return results
358
359
360 def getFeaturesIntersecting(genome, chrom, start, length, db="", ftype="CDS"):
361     """ return features of type ftype that fall within the given region.
362     """
363     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
364     return aGenome.getFeaturesIntersecting(chrom, start, length, ftype)
365
366
367 def retrieveSequence(genome, chrom, start, stop, sense="F", db=""):
368     """ retrieve a sequence given a genome, chromosome, start, stop, and sense.
369     """
370     entrySeq = ""
371     length = abs(stop - start) + 1
372     try:
373         aGenome = Genome(genome, dbFile=chooseDB(genome, db))
374         if sense == "F":
375             if start  < 1:
376                 seqStart  = 0
377             else:
378                 seqStart  = start - 1
379
380             sequence = aGenome.sequence(chrom, seqStart, length)
381             entrySeq = sequence
382         else:
383             seqStart  = stop - 1
384             entrySeq= aGenome.sequence(chrom, seqStart, length)
385
386     except IOError:
387         print "Couldn't retrieve sequence %s %s %s %s %s" % (genome, chrom, start, stop, sense) 
388
389     return entrySeq
390
391
392 def retrieveCDS(geneID, maskCDS=False, maskLower=False, db="", version="1"):
393     """ retrieveCDS() - retrieve a sequence given a gene identifier
394     """
395     entrySeq = ""
396     genome = geneID[0]
397     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
398     try:
399         if aGenome.checkGene(geneID):
400             entrySeq = aGenome.geneSeq(geneID, maskCDS, maskLower, version)  
401     except IOError:
402         print "Could not find %s " % str(geneID)
403
404     return entrySeq
405
406
407 def retrieveUpstream(geneID, upstream, maskCDS=False, maskLower=False, boundToNextGene=False, db="", version="1"):
408     """ retrieve sequence 5' of cds of length upstream for a given a gene identifier
409     """
410     entrySeq = ""
411     genome = geneID[0]
412     aGenome = Genome(genome, dbFile=chooseDB(genome, db))    
413     try:
414         if aGenome.checkGene(geneID):
415             (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID, version)
416             if sense == "F":
417                 if boundToNextGene:
418                     upstream = aGenome.leftGeneDistance(geneID, upstream, version)
419
420                 if (start - upstream) > 1:
421                     seqStart  = start - upstream - 1
422                     seqLength = upstream
423                 else:
424                     seqStart  = 0
425                     seqLength = upstream
426             else:
427                 if boundToNextGene:
428                     upstream = aGenome.rightGeneDistance(geneID, upstream, version)
429
430                 seqStart  = stop
431                 seqLength = upstream
432
433             sequence = aGenome.sequence(chrom, seqStart, seqLength, maskCDS, maskLower)
434             # do CDS masking here....
435             if sense == "F":
436                 entrySeq = sequence
437             else:
438                 entrySeq = complement(sequence, upstream)
439
440     except IOError:
441         print "Couldn't find ", geneID
442
443     return entrySeq
444
445
446 def retrieveDownstream(geneID, downstream, maskCDS=False, maskLower=False, boundToNextGene=False, db="", version="1"):
447     """ retrieve sequence 3' of CDS of length downstream for a given a gene identifier
448     """
449     entrySeq = ""
450     genome = geneID[0]
451     aGenome = Genome(genome, dbFile=chooseDB(genome, db))    
452     if True:
453         if aGenome.checkGene(geneID):
454             (chrom, start, stop, length, sense) = aGenome.geneInfo(geneID, version)
455             if sense == "F":
456                 if boundToNextGene:
457                     downstream = aGenome.rightGeneDistance(geneID, downstream, version)
458
459                 seqStart  = stop - 1
460                 seqLength = downstream + 1
461             else:
462                 if boundToNextGene:
463                     downstream = aGenome.leftGeneDistance(geneID, downstream, version)
464
465                 if (start - downstream) > 1:
466                     seqStart  = start - downstream
467                     seqLength = downstream
468                 else:
469                     seqStart  = 0
470                     seqLength = stop
471
472             sequence = aGenome.sequence(chrom, seqStart, seqLength, maskCDS, maskLower)
473             # do CDS masking here
474             if sense == "F":
475                 entrySeq = sequence
476             else:
477                 entrySeq = complement(sequence, downstream)
478
479     return entrySeq
480
481
482 def retrieveSeq(geneID, upstream, cds, downstream, geneDB="", maskLower = False, boundToNextGene = False, version="1"):
483     """ retrieve upstream, all or none of the cds, and downstream of a geneID
484     """
485     geneSeq = ""
486     if int(cds) == 2:
487         maskCDS = True
488     else:
489         maskCDS = False
490
491     if upstream > 0:
492         geneSeq += retrieveUpstream(geneID, upstream, maskCDS, maskLower, boundToNextGene, geneDB, version)
493
494     if cds > 0:
495         geneSeq += retrieveCDS(geneID, maskCDS, maskLower, geneDB, version)
496
497     if downstream > 0:
498         geneSeq += retrieveDownstream(geneID, downstream, maskCDS, maskLower, boundToNextGene, geneDB, version)
499
500     if len(geneSeq) == 0:
501         print "retrieveSeq Warning: retrieved null sequence for %s: %s (splice form %s) from geneDB %s" % (geneID[0], geneID[1], version, geneDB)
502
503     return geneSeq
504
505
506 def retrieveAll(genome, genes, upstream, downstream, outputFilePath):
507     """ retrieve set of upstream and downstrean sequences for a list of genes in a genome and save them to a file.
508     """
509     outFile = open(outputFilePath, "w")                 
510     for gene in genes:
511         print "Processing " , gene
512         outFile.write("> %s \n" % (gene))
513         geneID = (genome, gene)
514         outFile.write("%s\n" % retrieveSeq(geneID, upstream, 0, downstream))
515
516     outFile.close()
517
518
519 def fasta(geneID, seq):
520     """ write a fasta formated seq with geneID in the header.
521     """
522     fastaString = "> %s-%s\n%s\n" % (geneID[0],geneID[1], seq)
523
524     return fastaString
525
526
527 def loadGOInfo(genome, db=""):
528     """ load GO for a given genome
529     """
530     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
531     if genome not in goDict.keys():
532         goDict[genome] = aGenome.allGOInfo()
533
534
535 def getGOInfo(geneID, db=""):
536     """ retrieve GO info for geneID
537     """
538     (genome, locus) = geneID
539     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
540     try:
541         return aGenome.goInfo(geneID)
542     except:
543         return []
544
545
546 def getGOIDCount(genome, GOID, db=""):
547     """ retrieve count of genes with a particular GOID.
548     """
549     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
550     try:
551         return aGenome.getGOIDCount(GOID)
552     except:
553         return []
554
555
556 def allGOTerms(genome, db=""):
557     """ return all GO Terms.
558     """
559     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
560     try:
561         return aGenome.allGOterms()
562     except:
563         return []
564
565
566 def getAllGOInfo(genome, db=""):
567     """ return all GO Info.
568     """
569     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
570     try:
571         return aGenome.allGoInfo()
572     except:
573         return []
574
575
576 def loadAnnotInfo(genome, db=""):
577     """ load Annotations for a given genome
578     """
579     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
580     if genome not in annotDict.keys():
581         annotDict[genome] = aGenome.allAnnotInfo()
582
583
584 def getAnnotInfo(geneID, db=""):
585     """ retrieve Annotations for a given geneID
586     """
587     (genome, locus) = geneID
588     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
589     try:
590         return aGenome.annotInfo(geneID)
591     except:
592         return []
593
594
595 def getAllAnnotInfo(genome, db=""):
596     """ return all Annotation Info.
597     """
598     aGenome = Genome(genome, dbFile=chooseDB(genome, db))
599     try:
600         return aGenome.allAnnotInfo()
601     except:
602         return []
603
604
605 def sanitize(inSeq, windowSize=16):
606     """ make sure that very simple repeats are out of the sequence. 
607         Soft-mask any window that has windowSize - 2 of mononucleotides 
608         and (windowSize / 2) - 1 non-GC dinucleotides.     
609     """
610     seqlen = len(inSeq)
611     outSeq = list(inSeq.upper())
612     winmin2 = windowSize - 2
613     winhalf = windowSize/2 - 1
614     for pos in range(seqlen - windowSize):
615         window = inSeq[pos:pos + windowSize].upper()
616         if window.count("A") > winmin2 or window.count("C") > winmin2 or window.count("G") > winmin2 or window.count("T") > winmin2:
617             for index in range(windowSize):
618                 outSeq[pos + index] = outSeq[pos + index].lower()
619
620         if window.count("AC") >= winhalf or window.count("AG") >= winhalf or window.count("AT") >= winhalf or window.count("CT") >= winhalf or window.count("GT") >= winhalf or window.count("TA") >= winhalf or window.count("TC") >= winhalf or window.count("TG") >= winhalf or window.count("GA") > winhalf or window.count("CA") > winhalf:
621             for index in range(windowSize):
622                 outSeq[pos + index] = outSeq[pos + index].lower()
623
624     return "".join(outSeq)
625
626
627 def featuresIntersecting(genome, posList, radius, ftype, name="", chrom="", version="", db="", extendGen="", replaceMod=False):
628     """ returns a dictionary of matching features to positions of the double form (chromosome, position).
629         Only positions with features within radius are returned.
630     """
631     resultDict = {}
632     if extendGen != "":
633         aGenome = Genome(genome, dbFile=chooseDB(genome, db), inRAM=True)
634         aGenome.extendFeatures(extendGen, replace = replaceMod)
635     else:
636         aGenome = Genome(genome, dbFile=chooseDB(genome, db))
637
638     features = aGenome.getFeatures(ftype, name, chrom, version)
639     if len(posList) < 1 or len(features) < 1:
640         return resultDict
641
642     chromList = features.keys()
643     for (chrom, pos) in posList:
644         tempList = []
645         if chrom not in chromList:
646             continue
647
648         for (name, version, chromosome, start, stop, orientation, atype) in features[chrom]:
649             if (pos + radius) < start or (pos - radius) > stop:
650                 continue
651
652             tempList.append((name, version, chromosome, start, stop, orientation, atype))
653
654         if len(tempList) > 0:
655             resultDict[(chrom, pos)] = tempList
656
657     return resultDict
658
659
660 def genesIntersecting(genome, posList, name="", chrom="", version="", db="", flank=0, extendGen="", replaceMod=False):
661     """ returns a dictionary of matching genes to positions of the double form (chromosome, position).
662         Only positions with features within radius are returned.
663     """
664     resultDict = {}
665     if extendGen != "":
666         aGenome = Genome(genome, dbFile=chooseDB(genome, db), inRAM=True)
667         aGenome.extendFeatures(extendGen, replace = replaceMod)
668     else:
669         aGenome = Genome(genome, dbFile=chooseDB(genome, db))
670
671     genes = aGenome.getallGeneInfo(name, chrom, version)
672     if len(posList) < 1 or len(genes) < 1:
673         return resultDict
674
675     chromList = genes.keys()
676     for (chrom, pos) in posList:
677         tempList = []
678         if chrom not in chromList:
679             continue
680
681         for (name, chromosome, start, stop, orientation) in genes[chrom]:
682             if start-flank <= pos <= stop+flank:
683                 tempList.append((name, "noversion", chromosome, start, stop, orientation))
684
685         if len(tempList) > 0:
686             resultDict[(chrom, pos)] = tempList
687
688     return resultDict