5 print "psyco not running"
9 from commoncode import readDataset, getFeaturesByChromDict
10 from cistematic.genomes import Genome
11 from cistematic.core.geneinfo import geneinfoDB
13 print "%s: version 5.1" % sys.argv[0]
20 usage = "usage: python %prog genome rdsfile outfilename [options]"
22 parser = optparse.OptionParser(usage=usage)
23 parser.add_option("--stranded", action="store_true", dest="trackStrand")
24 parser.add_option("--splices", action="store_true", dest="doSplices")
25 parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
26 parser.add_option("--multi", action="store_true", dest="doMulti")
27 parser.add_option("--models", dest="extendGenome")
28 parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
29 parser.add_option("--searchGID", action="store_true", dest="searchGID")
30 parser.add_option("--countfeatures", action="store_true", dest="countFeats")
31 parser.add_option("--cache", type="int", dest="cachePages")
32 parser.add_option("--markGID", action="store_true", dest="markGID")
33 parser.set_defaults(trackStrand=False, doSplices=False, doUniqs=True, doMulti=False,
34 extendGenome="", replaceModels=False, searchGID=False,
35 countFeats=False, cachePages=None, markGID=False)
37 (options, args) = parser.parse_args(argv[1:])
47 geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
48 options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
49 options.searchGID, options.countFeats, options.cachePages, options.markGID)
52 def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
53 doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
54 searchGID=False, countFeats=False, cachePages=None, markGID=False):
57 print "will track strandedness"
64 print "will replace gene models with %s" % extendGenome
66 print "will extend gene models with %s" % extendGenome
70 if cachePages is not None:
76 hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
77 if cachePages > hitRDS.getDefaultCacheSize():
78 hitRDS.setDBcache(cachePages)
80 genome = Genome(genomeName, inRAM=True)
81 if extendGenome != "":
82 genome.extendFeatures(extendGenome, replace=replaceModels)
84 print "getting gene features...."
85 featuresByChromDict = getFeaturesByChromDict(genome)
87 seenFeaturesByChromDict = {}
88 print "getting geneIDs...."
89 gidList = genome.allGIDs()
95 chromList = hitRDS.getChromosomes(fullChrom=False)
96 if len(chromList) == 0 and doSplices:
97 chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
100 print "Flagging all reads as NM"
101 hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
103 for chrom in chromList:
104 if chrom not in featuresByChromDict:
108 seenFeaturesByChromDict[chrom] = []
110 print "\nchr%s" % chrom
111 fullchrom = "chr%s" % chrom
113 print "counting GIDs"
114 for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
116 if doStranded == "track":
118 if featureSense == "R":
121 regionList.append((gid, fullchrom, start, stop, checkSense))
122 count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
124 regionList.append((gid, fullchrom, start, stop))
125 count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
129 gidCount[gid] += count
131 if (start, stop, gid, featureSense) not in seenFeaturesByChromDict[chrom]:
132 seenFeaturesByChromDict[chrom].append((start, stop, gid, featureSense))
134 print "problem with %s - skipping" % gid
138 hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
139 print "finished marking"
143 numFeatures = countFeatures(seenFeaturesByChromDict)
144 print "saw %d features" % numFeatures
146 writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
147 if markGID and doCache:
148 hitRDS.saveCacheDB(hitfile)
151 def countFeatures(seenFeaturesByChromDict):
153 for chrom in seenFeaturesByChromDict.keys():
155 count += len(seenFeaturesByChromDict[chrom])
162 def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
163 geneAnnotDict = genome.allAnnotInfo()
164 genomeName = genome.genome
165 outfile = open(outfilename, "w")
166 idb = geneinfoDB(cache=True)
167 geneInfoDict = idb.getallGeneInfo(genomeName)
169 symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
171 outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
173 outfile.write("%s\t%s\t0\n" % (gid, symbol))
178 def getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict):
180 if searchGID and gid not in geneInfoDict:
181 actualGeneID = idb.getGeneID(genomeName, gid)
182 if len(actualGeneID) > 0:
183 lookupGID = actualGeneID[1]
186 geneinfo = geneInfoDict[lookupGID]
187 symbol = geneinfo[0][0]
188 except (KeyError, IndexError):
190 symbol = geneAnnotDict[(genomeName, gid)][0]
191 except (KeyError, IndexError):
192 symbol = "LOC%s" % gid
197 if __name__ == "__main__":