first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / geneMrnaCounts.py
1 try:
2     import psyco
3     psyco.full()
4 except:
5     print "psyco not running"
6
7 import sys
8 import optparse
9 from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, countThisRead
10 from cistematic.genomes import Genome
11 from cistematic.core.geneinfo import geneinfoDB
12
13 print "geneMrnaCounts: version 6.0"
14
15
16 def main(argv=None):
17     if not argv:
18         argv = sys.argv
19
20     usage = "usage: python %prog genome rdsfile outfilename [options]"
21
22     parser = getParser(usage)
23     (options, args) = parser.parse_args(argv[1:])
24
25     if len(args) < 3:
26         print usage
27         sys.exit(1)
28
29     genomeName = args[0]
30     hitfile =  args[1]
31     outfilename = args[2]
32
33     geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
34                    options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
35                    options.searchGID, options.countFeats, options.cachePages)
36
37
38 def getParser(usage):
39     parser = optparse.OptionParser(usage=usage)
40     parser.add_option("--stranded", action="store_true", dest="trackStrand")
41     parser.add_option("--splices", action="store_true", dest="doSplices")
42     parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
43     parser.add_option("--multi", action="store_true", dest="doMulti")
44     parser.add_option("--models", dest="extendGenome")
45     parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
46     parser.add_option("--searchGID", action="store_true", dest="searchGID")
47     parser.add_option("--countfeatures", action="store_true", dest="countFeats")
48     parser.add_option("--cache", type="int", dest="cachePages")
49
50     configParser = getConfigParser()
51     section = "geneMrnaCounts"
52     trackStrand = getConfigBoolOption(configParser, section, "trackStrand", False)
53     doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
54     doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
55     doMulti = getConfigBoolOption(configParser, section, "doMulti", False)
56     extendGenome = getConfigOption(configParser, section, "extendGenome", "")
57     replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
58     searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
59     countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
60     cachePages = getConfigOption(configParser, section, "cachePages", None)
61
62     parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs, doMulti=doMulti,
63                         extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
64                         countFeats=countFeats, cachePages=cachePages)
65
66     return parser
67
68 def geneMrnaCounts(genomeName, bamfile, outfilename, trackStrand=False, doSplices=False,
69                    doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
70                    searchGID=False, countFeats=False):
71
72     if trackStrand:
73         print "will track strandedness"
74         doStranded = "track"
75     else:
76         doStranded = "both"
77
78     if extendGenome:
79         if replaceModels:
80             print "will replace gene models with %s" % extendGenome
81         else:
82             print "will extend gene models with %s" % extendGenome
83     else:
84         replaceModels = False
85
86     genome = Genome(genomeName, inRAM=True)
87     if extendGenome != "":
88         genome.extendFeatures(extendGenome, replace=replaceModels)
89
90     print "getting gene features...."
91     featuresByChromDict = getFeaturesByChromDict(genome)
92
93     seenFeaturesByChromDict = {}
94     print "getting geneIDs...."
95     gidList = genome.allGIDs()
96     gidList.sort()
97     gidCount = {}
98     for gid in gidList:
99         gidCount[gid] = 0
100
101     chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
102     for chrom in chromosomeList:
103         if chrom not in featuresByChromDict:
104             continue
105
106         if countFeats:
107             seenFeaturesByChromDict[chrom] = set([])
108
109         print "\nchr%s" % chrom      
110         print "counting GIDs"
111         for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
112             try:
113                 if doStranded == "track":
114                     checkSense = "+"
115                     if featureSense == "R":
116                         checkSense = "-"
117
118                     count = getCounts(chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
119                 else:
120                     count = getCounts(chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
121
122                 gidCount[gid] += count
123                 if countFeats:
124                     seenFeaturesByChromDict[chrom].add((start, stop, gid, featureSense))
125             except:
126                 print "problem with %s - skipping" % gid
127
128     print " "
129     if countFeats:
130         numFeatures = countFeatures(seenFeaturesByChromDict)
131         print "saw %d features" % numFeatures
132
133     writeOutputFile(outfilename, genome, gidCount, searchGID)
134
135
136 def getCounts(bamfile, chrom, start, stop, uniqs=True, multi=False, splices=False, sense=''):
137     count = 0.0
138     for alignedread in bamfile.fetch(chrom, start, stop):
139         if countThisRead(alignedread, uniqs, multi, splices, sense):
140             count += 1.0/alignedread.opt('NH')
141
142     return count
143
144
145 def countFeatures(seenFeaturesByChromDict):
146     count = 0
147     for chrom in seenFeaturesByChromDict.keys():
148         try:
149             count += len(seenFeaturesByChromDict[chrom])
150         except TypeError:
151             pass
152
153     return count
154
155
156 def writeOutputFile(outfilename, genome, gidCount, searchGID):
157     geneAnnotDict = genome.allAnnotInfo()
158     genomeName = genome.genome
159     outfile = open(outfilename, "w")
160     idb = geneinfoDB(cache=True)
161     geneInfoDict = idb.getallGeneInfo(genomeName)
162     for gid in gidCount:
163         symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
164         outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
165
166     outfile.close()
167
168
169 def getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict):
170     lookupGID = gid
171     if searchGID and gid not in geneInfoDict:
172         actualGeneID = idb.getGeneID(genomeName, gid)
173         if len(actualGeneID) > 0:
174             lookupGID = actualGeneID[1]
175
176     try:
177         geneinfo = geneInfoDict[lookupGID]
178         symbol = geneinfo[0][0]
179     except (KeyError, IndexError):
180         try:
181             symbol = geneAnnotDict[(genomeName, gid)][0]
182         except (KeyError, IndexError):
183             symbol = "LOC%s" % gid
184
185     return symbol
186
187
188 if __name__ == "__main__":
189     main(sys.argv)