5 print "psyco not running"
9 from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
11 from cistematic.genomes import Genome
12 from cistematic.core.geneinfo import geneinfoDB
14 print "geneMrnaCounts: version 5.2"
21 usage = "usage: python %prog genome rdsfile outfilename [options]"
23 parser = getParser(usage)
24 (options, args) = parser.parse_args(argv[1:])
34 geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
35 options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
36 options.searchGID, options.countFeats, options.cachePages, options.markGID)
40 parser = optparse.OptionParser(usage=usage)
41 parser.add_option("--stranded", action="store_true", dest="trackStrand")
42 parser.add_option("--splices", action="store_true", dest="doSplices")
43 parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
44 parser.add_option("--multi", action="store_true", dest="doMulti")
45 parser.add_option("--models", dest="extendGenome")
46 parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
47 parser.add_option("--searchGID", action="store_true", dest="searchGID")
48 parser.add_option("--countfeatures", action="store_true", dest="countFeats")
49 parser.add_option("--cache", type="int", dest="cachePages")
50 parser.add_option("--markGID", action="store_true", dest="markGID")
52 configParser = getConfigParser()
53 section = "geneMrnaCounts"
54 trackStrand = getConfigBoolOption(configParser, section, "trackStrand", False)
55 doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
56 doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
57 doMulti = getConfigBoolOption(configParser, section, "doMulti", False)
58 extendGenome = getConfigOption(configParser, section, "extendGenome", "")
59 replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
60 searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
61 countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
62 cachePages = getConfigOption(configParser, section, "cachePages", None)
63 markGID = getConfigBoolOption(configParser, section, "markGID", False)
65 parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs, doMulti=doMulti,
66 extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
67 countFeats=countFeats, cachePages=cachePages, markGID=markGID)
71 def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
72 doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
73 searchGID=False, countFeats=False, cachePages=None, markGID=False):
76 print "will track strandedness"
83 print "will replace gene models with %s" % extendGenome
85 print "will extend gene models with %s" % extendGenome
89 if cachePages is not None:
95 hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
96 if cachePages > hitRDS.getDefaultCacheSize():
97 hitRDS.setDBcache(cachePages)
99 genome = Genome(genomeName, inRAM=True)
100 if extendGenome != "":
101 genome.extendFeatures(extendGenome, replace=replaceModels)
103 print "getting gene features...."
104 featuresByChromDict = getFeaturesByChromDict(genome)
106 seenFeaturesByChromDict = {}
107 print "getting geneIDs...."
108 gidList = genome.allGIDs()
114 chromList = hitRDS.getChromosomes(fullChrom=False)
115 if len(chromList) == 0 and doSplices:
116 chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
119 print "Flagging all reads as NM"
120 hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
122 for chrom in chromList:
123 if chrom not in featuresByChromDict:
127 seenFeaturesByChromDict[chrom] = []
129 print "\nchr%s" % chrom
130 fullchrom = "chr%s" % chrom
132 print "counting GIDs"
133 for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
135 if doStranded == "track":
137 if featureSense == "R":
140 regionList.append((gid, fullchrom, start, stop, checkSense))
141 count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
143 regionList.append((gid, fullchrom, start, stop))
144 count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
148 gidCount[gid] += count
150 if (start, stop, gid, featureSense) not in seenFeaturesByChromDict[chrom]:
151 seenFeaturesByChromDict[chrom].append((start, stop, gid, featureSense))
153 print "problem with %s - skipping" % gid
157 hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
158 print "finished marking"
162 numFeatures = countFeatures(seenFeaturesByChromDict)
163 print "saw %d features" % numFeatures
165 writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
166 if markGID and doCache:
167 hitRDS.saveCacheDB(hitfile)
170 def countFeatures(seenFeaturesByChromDict):
172 for chrom in seenFeaturesByChromDict.keys():
174 count += len(seenFeaturesByChromDict[chrom])
181 def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
182 geneAnnotDict = genome.allAnnotInfo()
183 genomeName = genome.genome
184 outfile = open(outfilename, "w")
185 idb = geneinfoDB(cache=True)
186 geneInfoDict = idb.getallGeneInfo(genomeName)
188 symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
190 outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
192 outfile.write("%s\t%s\t0\n" % (gid, symbol))
197 def getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict):
199 if searchGID and gid not in geneInfoDict:
200 actualGeneID = idb.getGeneID(genomeName, gid)
201 if len(actualGeneID) > 0:
202 lookupGID = actualGeneID[1]
205 geneinfo = geneInfoDict[lookupGID]
206 symbol = geneinfo[0][0]
207 except (KeyError, IndexError):
209 symbol = geneAnnotDict[(genomeName, gid)][0]
210 except (KeyError, IndexError):
211 symbol = "LOC%s" % gid
216 if __name__ == "__main__":