snapshot of 4.0a development. initial git repo commit
[erange.git] / geneMrnaCounts.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     print "psyco not running"
6
7 import sys
8 import optparse
9 from commoncode import readDataset, getFeaturesByChromDict
10 from cistematic.genomes import Genome
11 from cistematic.core.geneinfo import geneinfoDB
12
13 print "%s: version 5.1" % sys.argv[0]
14
15
16 def main(argv=None):
17     if not argv:
18         argv = sys.argv
19
20     usage = "usage: python %prog genome rdsfile outfilename [options]"
21
22     parser = optparse.OptionParser(usage=usage)
23     parser.add_option("--stranded", action="store_true", dest="trackStrand")
24     parser.add_option("--splices", action="store_true", dest="doSplices")
25     parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
26     parser.add_option("--multi", action="store_true", dest="doMulti")
27     parser.add_option("--models", dest="extendGenome")
28     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
29     parser.add_option("--searchGID", action="store_true", dest="searchGID")
30     parser.add_option("--countfeatures", action="store_true", dest="countFeats")
31     parser.add_option("--cache", type="int", dest="cachePages")
32     parser.add_option("--markGID", action="store_true", dest="markGID")
33     parser.set_defaults(trackStrand=False, doSplices=False, doUniqs=True, doMulti=False,
34                         extendGenome="", replaceModels=False, searchGID=False,
35                         countFeats=False, cachePages=None, markGID=False)
36
37     (options, args) = parser.parse_args(argv[1:])
38
39     if len(args) < 3:
40         print usage
41         sys.exit(1)
42
43     genomeName = args[0]
44     hitfile =  args[1]
45     outfilename = args[2]
46
47     geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
48                    options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
49                    options.searchGID, options.countFeats, options.cachePages, options.markGID)
50
51
52 def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
53                    doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
54                    searchGID=False, countFeats=False, cachePages=None, markGID=False):
55
56     if trackStrand:
57         print "will track strandedness"
58         doStranded = "track"
59     else:
60         doStranded = "both"
61
62     if extendGenome:
63         if replaceModels:
64             print "will replace gene models with %s" % extendGenome
65         else:
66             print "will extend gene models with %s" % extendGenome
67     else:
68         replaceModels = False
69
70     if cachePages is not None:
71         doCache = True
72     else:
73         cachePages = 100000
74         doCache = False
75
76     hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
77     if cachePages > hitRDS.getDefaultCacheSize():
78         hitRDS.setDBcache(cachePages)
79
80     genome = Genome(genomeName, inRAM=True)
81     if extendGenome != "":
82         genome.extendFeatures(extendGenome, replace=replaceModels)
83
84     print "getting gene features...."
85     featuresByChromDict = getFeaturesByChromDict(genome)
86
87     seenFeaturesByChromDict = {}
88     print "getting geneIDs...."
89     gidList = genome.allGIDs()
90     gidList.sort()
91     gidCount = {}
92     for gid in gidList:
93         gidCount[gid] = 0
94
95     chromList = hitRDS.getChromosomes(fullChrom=False)
96     if len(chromList) == 0 and doSplices:
97         chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
98
99     if markGID:
100         print "Flagging all reads as NM"
101         hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
102
103     for chrom in chromList:
104         if chrom not in featuresByChromDict:
105             continue
106
107         if countFeats:
108             seenFeaturesByChromDict[chrom] = []
109
110         print "\nchr%s" % chrom
111         fullchrom = "chr%s" % chrom
112         regionList = []        
113         print "counting GIDs"
114         for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
115             try:
116                 if doStranded == "track":
117                     checkSense = "+"
118                     if featureSense == "R":
119                         checkSense = "-"
120
121                     regionList.append((gid, fullchrom, start, stop, checkSense))
122                     count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
123                 else:
124                     regionList.append((gid, fullchrom, start, stop))
125                     count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
126                     if count != 0:
127                         print count
128
129                 gidCount[gid] += count
130                 if countFeats:
131                     if (start, stop, gid, featureSense) not in seenFeaturesByChromDict[chrom]:
132                         seenFeaturesByChromDict[chrom].append((start, stop, gid, featureSense))
133             except:
134                 print "problem with %s - skipping" % gid
135
136         if markGID:
137             print "marking GIDs"
138             hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
139             print "finished marking"
140
141     print " "
142     if countFeats:
143         numFeatures = countFeatures(seenFeaturesByChromDict)
144         print "saw %d features" % numFeatures
145
146     writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
147     if markGID and doCache:
148         hitRDS.saveCacheDB(hitfile)
149
150
151 def countFeatures(seenFeaturesByChromDict):
152     count = 0
153     for chrom in seenFeaturesByChromDict.keys():
154         try:
155             count += len(seenFeaturesByChromDict[chrom])
156         except TypeError:
157             pass
158
159     return count
160
161
162 def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
163     geneAnnotDict = genome.allAnnotInfo()
164     genomeName = genome.genome
165     outfile = open(outfilename, "w")
166     idb = geneinfoDB(cache=True)
167     geneInfoDict = idb.getallGeneInfo(genomeName)
168     for gid in gidList:
169         symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
170         if gid in gidCount:
171             outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
172         else:
173             outfile.write("%s\t%s\t0\n" % (gid, symbol))
174
175     outfile.close()
176
177
178 def getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict):
179     lookupGID = gid
180     if searchGID and gid not in geneInfoDict:
181         actualGeneID = idb.getGeneID(genomeName, gid)
182         if len(actualGeneID) > 0:
183             lookupGID = actualGeneID[1]
184
185     try:
186         geneinfo = geneInfoDict[lookupGID]
187         symbol = geneinfo[0][0]
188     except (KeyError, IndexError):
189         try:
190             symbol = geneAnnotDict[(genomeName, gid)][0]
191         except (KeyError, IndexError):
192             symbol = "LOC%s" % gid
193
194     return symbol
195
196
197 if __name__ == "__main__":
198     main(sys.argv)