erange version 4.0a dev release
[erange.git] / geneMrnaCounts.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     print "psyco not running"
6
7 import sys
8 import optparse
9 from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
10 import ReadDataset
11 from cistematic.genomes import Genome
12 from cistematic.core.geneinfo import geneinfoDB
13
14 print "geneMrnaCounts: version 5.2"
15
16
17 def main(argv=None):
18     if not argv:
19         argv = sys.argv
20
21     usage = "usage: python %prog genome rdsfile outfilename [options]"
22
23     parser = getParser(usage)
24     (options, args) = parser.parse_args(argv[1:])
25
26     if len(args) < 3:
27         print usage
28         sys.exit(1)
29
30     genomeName = args[0]
31     hitfile =  args[1]
32     outfilename = args[2]
33
34     geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
35                    options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
36                    options.searchGID, options.countFeats, options.cachePages, options.markGID)
37
38
39 def getParser(usage):
40     parser = optparse.OptionParser(usage=usage)
41     parser.add_option("--stranded", action="store_true", dest="trackStrand")
42     parser.add_option("--splices", action="store_true", dest="doSplices")
43     parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
44     parser.add_option("--multi", action="store_true", dest="doMulti")
45     parser.add_option("--models", dest="extendGenome")
46     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
47     parser.add_option("--searchGID", action="store_true", dest="searchGID")
48     parser.add_option("--countfeatures", action="store_true", dest="countFeats")
49     parser.add_option("--cache", type="int", dest="cachePages")
50     parser.add_option("--markGID", action="store_true", dest="markGID")
51
52     configParser = getConfigParser()
53     section = "geneMrnaCounts"
54     trackStrand = getConfigBoolOption(configParser, section, "trackStrand", False)
55     doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
56     doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
57     doMulti = getConfigBoolOption(configParser, section, "doMulti", False)
58     extendGenome = getConfigOption(configParser, section, "extendGenome", "")
59     replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
60     searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
61     countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
62     cachePages = getConfigOption(configParser, section, "cachePages", None)
63     markGID = getConfigBoolOption(configParser, section, "markGID", False)
64
65     parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs, doMulti=doMulti,
66                         extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
67                         countFeats=countFeats, cachePages=cachePages, markGID=markGID)
68
69     return parser
70
71 def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
72                    doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
73                    searchGID=False, countFeats=False, cachePages=None, markGID=False):
74
75     if trackStrand:
76         print "will track strandedness"
77         doStranded = "track"
78     else:
79         doStranded = "both"
80
81     if extendGenome:
82         if replaceModels:
83             print "will replace gene models with %s" % extendGenome
84         else:
85             print "will extend gene models with %s" % extendGenome
86     else:
87         replaceModels = False
88
89     if cachePages is not None:
90         doCache = True
91     else:
92         cachePages = 100000
93         doCache = False
94
95     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
96     if cachePages > hitRDS.getDefaultCacheSize():
97         hitRDS.setDBcache(cachePages)
98
99     genome = Genome(genomeName, inRAM=True)
100     if extendGenome != "":
101         genome.extendFeatures(extendGenome, replace=replaceModels)
102
103     print "getting gene features...."
104     featuresByChromDict = getFeaturesByChromDict(genome)
105
106     seenFeaturesByChromDict = {}
107     print "getting geneIDs...."
108     gidList = genome.allGIDs()
109     gidList.sort()
110     gidCount = {}
111     for gid in gidList:
112         gidCount[gid] = 0
113
114     chromList = hitRDS.getChromosomes(fullChrom=False)
115     if len(chromList) == 0 and doSplices:
116         chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
117
118     if markGID:
119         print "Flagging all reads as NM"
120         hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
121
122     for chrom in chromList:
123         if chrom not in featuresByChromDict:
124             continue
125
126         if countFeats:
127             seenFeaturesByChromDict[chrom] = []
128
129         print "\nchr%s" % chrom
130         fullchrom = "chr%s" % chrom
131         regionList = []        
132         print "counting GIDs"
133         for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
134             try:
135                 if doStranded == "track":
136                     checkSense = "+"
137                     if featureSense == "R":
138                         checkSense = "-"
139
140                     regionList.append((gid, fullchrom, start, stop, checkSense))
141                     count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
142                 else:
143                     regionList.append((gid, fullchrom, start, stop))
144                     count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
145                     if count != 0:
146                         print count
147
148                 gidCount[gid] += count
149                 if countFeats:
150                     if (start, stop, gid, featureSense) not in seenFeaturesByChromDict[chrom]:
151                         seenFeaturesByChromDict[chrom].append((start, stop, gid, featureSense))
152             except:
153                 print "problem with %s - skipping" % gid
154
155         if markGID:
156             print "marking GIDs"
157             hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
158             print "finished marking"
159
160     print " "
161     if countFeats:
162         numFeatures = countFeatures(seenFeaturesByChromDict)
163         print "saw %d features" % numFeatures
164
165     writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
166     if markGID and doCache:
167         hitRDS.saveCacheDB(hitfile)
168
169
170 def countFeatures(seenFeaturesByChromDict):
171     count = 0
172     for chrom in seenFeaturesByChromDict.keys():
173         try:
174             count += len(seenFeaturesByChromDict[chrom])
175         except TypeError:
176             pass
177
178     return count
179
180
181 def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
182     geneAnnotDict = genome.allAnnotInfo()
183     genomeName = genome.genome
184     outfile = open(outfilename, "w")
185     idb = geneinfoDB(cache=True)
186     geneInfoDict = idb.getallGeneInfo(genomeName)
187     for gid in gidList:
188         symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
189         if gid in gidCount:
190             outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
191         else:
192             outfile.write("%s\t%s\t0\n" % (gid, symbol))
193
194     outfile.close()
195
196
197 def getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict):
198     lookupGID = gid
199     if searchGID and gid not in geneInfoDict:
200         actualGeneID = idb.getGeneID(genomeName, gid)
201         if len(actualGeneID) > 0:
202             lookupGID = actualGeneID[1]
203
204     try:
205         geneinfo = geneInfoDict[lookupGID]
206         symbol = geneinfo[0][0]
207     except (KeyError, IndexError):
208         try:
209             symbol = geneAnnotDict[(genomeName, gid)][0]
210         except (KeyError, IndexError):
211             symbol = "LOC%s" % gid
212
213     return symbol
214
215
216 if __name__ == "__main__":
217     main(sys.argv)