import sys
import optparse
-from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, countThisRead
from cistematic.genomes import Genome
from cistematic.core.geneinfo import geneinfoDB
-print "geneMrnaCounts: version 5.2"
+print "geneMrnaCounts: version 6.0"
def main(argv=None):
geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
- options.searchGID, options.countFeats, options.cachePages, options.markGID)
+ options.searchGID, options.countFeats, options.cachePages)
def getParser(usage):
parser.add_option("--searchGID", action="store_true", dest="searchGID")
parser.add_option("--countfeatures", action="store_true", dest="countFeats")
parser.add_option("--cache", type="int", dest="cachePages")
- parser.add_option("--markGID", action="store_true", dest="markGID")
configParser = getConfigParser()
section = "geneMrnaCounts"
searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
cachePages = getConfigOption(configParser, section, "cachePages", None)
- markGID = getConfigBoolOption(configParser, section, "markGID", False)
parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs, doMulti=doMulti,
extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
- countFeats=countFeats, cachePages=cachePages, markGID=markGID)
+ countFeats=countFeats, cachePages=cachePages)
return parser
-def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
+def geneMrnaCounts(genomeName, bamfile, outfilename, trackStrand=False, doSplices=False,
doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
- searchGID=False, countFeats=False, cachePages=None, markGID=False):
+ searchGID=False, countFeats=False):
if trackStrand:
print "will track strandedness"
else:
replaceModels = False
- if cachePages is not None:
- doCache = True
- else:
- cachePages = 100000
- doCache = False
-
- hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
- if cachePages > hitRDS.getDefaultCacheSize():
- hitRDS.setDBcache(cachePages)
-
genome = Genome(genomeName, inRAM=True)
if extendGenome != "":
genome.extendFeatures(extendGenome, replace=replaceModels)
for gid in gidList:
gidCount[gid] = 0
- chromList = hitRDS.getChromosomes(fullChrom=False)
- if len(chromList) == 0 and doSplices:
- chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
-
- if markGID:
- print "Flagging all reads as NM"
- hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
- for chrom in chromList:
+ chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+ for chrom in chromosomeList:
if chrom not in featuresByChromDict:
continue
if countFeats:
seenFeaturesByChromDict[chrom] = set([])
- print "\nchr%s" % chrom
- fullchrom = "chr%s" % chrom
- regionList = []
+ print "\nchr%s" % chrom
print "counting GIDs"
for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
try:
if featureSense == "R":
checkSense = "-"
- regionData = (gid, fullchrom, start, stop, checkSense)
- count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
+ count = getCounts(chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
else:
- regionData = (gid, fullchrom, start, stop)
- count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+ count = getCounts(chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
gidCount[gid] += count
- if markGID:
- regionList.append(regionData)
-
if countFeats:
seenFeaturesByChromDict[chrom].add((start, stop, gid, featureSense))
except:
print "problem with %s - skipping" % gid
- if markGID:
- print "marking GIDs"
- hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
- print "finished marking"
-
print " "
if countFeats:
numFeatures = countFeatures(seenFeaturesByChromDict)
print "saw %d features" % numFeatures
- writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
- if markGID and doCache:
- hitRDS.saveCacheDB(hitfile)
+ writeOutputFile(outfilename, genome, gidCount, searchGID)
+
+
+def getCounts(bamfile, chrom, start, stop, uniqs=True, multi=False, splices=False, sense=''):
+ count = 0.0
+ for alignedread in bamfile.fetch(chrom, start, stop):
+ if countThisRead(alignedread, uniqs, multi, splices, sense):
+ count += 1.0/alignedread.opt('NH')
+
+ return count
def countFeatures(seenFeaturesByChromDict):
return count
-def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
+def writeOutputFile(outfilename, genome, gidCount, searchGID):
geneAnnotDict = genome.allAnnotInfo()
genomeName = genome.genome
outfile = open(outfilename, "w")
idb = geneinfoDB(cache=True)
geneInfoDict = idb.getallGeneInfo(genomeName)
- for gid in gidList:
+ for gid in gidCount:
symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
- if gid in gidCount:
- outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
- else:
- outfile.write("%s\t%s\t0\n" % (gid, symbol))
+ outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
outfile.close()