import sys
import optparse
-from commoncode import readDataset, getFeaturesByChromDict
+from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, countThisRead
from cistematic.genomes import Genome
from cistematic.core.geneinfo import geneinfoDB
-print "%s: version 5.1" % sys.argv[0]
+print "geneMrnaCounts: version 6.0"
def main(argv=None):
usage = "usage: python %prog genome rdsfile outfilename [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--stranded", action="store_true", dest="trackStrand")
- parser.add_option("--splices", action="store_true", dest="doSplices")
- parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
- parser.add_option("--multi", action="store_true", dest="doMulti")
- parser.add_option("--models", dest="extendGenome")
- parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
- parser.add_option("--searchGID", action="store_true", dest="searchGID")
- parser.add_option("--countfeatures", action="store_true", dest="countFeats")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.add_option("--markGID", action="store_true", dest="markGID")
- parser.set_defaults(trackStrand=False, doSplices=False, doUniqs=True, doMulti=False,
- extendGenome="", replaceModels=False, searchGID=False,
- countFeats=False, cachePages=None, markGID=False)
-
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
- options.searchGID, options.countFeats, options.cachePages, options.markGID)
+ options.searchGID, options.countFeats, options.cachePages)
-def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
+def getParser(usage):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--stranded", action="store_true", dest="trackStrand")
+ parser.add_option("--splices", action="store_true", dest="doSplices")
+ parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
+ parser.add_option("--multi", action="store_true", dest="doMulti")
+ parser.add_option("--models", dest="extendGenome")
+ parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+ parser.add_option("--searchGID", action="store_true", dest="searchGID")
+ parser.add_option("--countfeatures", action="store_true", dest="countFeats")
+ parser.add_option("--cache", type="int", dest="cachePages")
+
+ configParser = getConfigParser()
+ section = "geneMrnaCounts"
+ trackStrand = getConfigBoolOption(configParser, section, "trackStrand", False)
+ doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+ doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
+ doMulti = getConfigBoolOption(configParser, section, "doMulti", False)
+ extendGenome = getConfigOption(configParser, section, "extendGenome", "")
+ replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
+ searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
+ countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
+ cachePages = getConfigOption(configParser, section, "cachePages", None)
+
+ parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs, doMulti=doMulti,
+ extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
+ countFeats=countFeats, cachePages=cachePages)
+
+ return parser
+
+def geneMrnaCounts(genomeName, bamfile, outfilename, trackStrand=False, doSplices=False,
doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
- searchGID=False, countFeats=False, cachePages=None, markGID=False):
+ searchGID=False, countFeats=False):
if trackStrand:
print "will track strandedness"
else:
replaceModels = False
- if cachePages is not None:
- doCache = True
- else:
- cachePages = 100000
- doCache = False
-
- hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
- if cachePages > hitRDS.getDefaultCacheSize():
- hitRDS.setDBcache(cachePages)
-
genome = Genome(genomeName, inRAM=True)
if extendGenome != "":
genome.extendFeatures(extendGenome, replace=replaceModels)
for gid in gidList:
gidCount[gid] = 0
- chromList = hitRDS.getChromosomes(fullChrom=False)
- if len(chromList) == 0 and doSplices:
- chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
-
- if markGID:
- print "Flagging all reads as NM"
- hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
- for chrom in chromList:
+ chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+ for chrom in chromosomeList:
if chrom not in featuresByChromDict:
continue
if countFeats:
- seenFeaturesByChromDict[chrom] = []
+ seenFeaturesByChromDict[chrom] = set([])
- print "\nchr%s" % chrom
- fullchrom = "chr%s" % chrom
- regionList = []
+ print "\nchr%s" % chrom
print "counting GIDs"
for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
try:
if featureSense == "R":
checkSense = "-"
- regionList.append((gid, fullchrom, start, stop, checkSense))
- count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
+ count = getCounts(chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
else:
- regionList.append((gid, fullchrom, start, stop))
- count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
- if count != 0:
- print count
+ count = getCounts(chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
gidCount[gid] += count
if countFeats:
- if (start, stop, gid, featureSense) not in seenFeaturesByChromDict[chrom]:
- seenFeaturesByChromDict[chrom].append((start, stop, gid, featureSense))
+ seenFeaturesByChromDict[chrom].add((start, stop, gid, featureSense))
except:
print "problem with %s - skipping" % gid
- if markGID:
- print "marking GIDs"
- hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
- print "finished marking"
-
print " "
if countFeats:
numFeatures = countFeatures(seenFeaturesByChromDict)
print "saw %d features" % numFeatures
- writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
- if markGID and doCache:
- hitRDS.saveCacheDB(hitfile)
+ writeOutputFile(outfilename, genome, gidCount, searchGID)
+
+
+def getCounts(bamfile, chrom, start, stop, uniqs=True, multi=False, splices=False, sense=''):
+ count = 0.0
+ for alignedread in bamfile.fetch(chrom, start, stop):
+ if countThisRead(alignedread, uniqs, multi, splices, sense):
+ count += 1.0/alignedread.opt('NH')
+
+ return count
def countFeatures(seenFeaturesByChromDict):
return count
-def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
+def writeOutputFile(outfilename, genome, gidCount, searchGID):
geneAnnotDict = genome.allAnnotInfo()
genomeName = genome.genome
outfile = open(outfilename, "w")
idb = geneinfoDB(cache=True)
geneInfoDict = idb.getallGeneInfo(genomeName)
- for gid in gidList:
+ for gid in gidCount:
symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
- if gid in gidCount:
- outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
- else:
- outfile.write("%s\t%s\t0\n" % (gid, symbol))
+ outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
outfile.close()