import sys
import optparse
-from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
from cistematic.genomes import Genome
from commoncode import getGeneInfoDict
from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
-print "geneMrnaCountsWeighted: version 4.3"
+print "geneMrnaCountsWeighted: version 5.0"
def main(argv=None):
hitfile = args[1]
countfile = args[2]
outfilename = args[3]
+ bamfile = pysam.Samfile(hitfile, "rb")
- geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+ geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
options.withUniqs, options.withMulti,
options.acceptfile, options.cachePages, options.doVerbose,
options.extendGenome, options.replaceModels)
return parser
-#TODO: Reported user performance issue. Long run times in conditions:
-# small number of reads ~40-50M
-# all features on single chromosome
-#
-# User states has been a long time problem.
-
-def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
withUniqs=False, withMulti=False, acceptfile=None,
cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
doCache = True
else:
doCache = False
- cachePages = 0
hg = Genome(genome, inRAM=True)
if extendGenome:
hg.extendFeatures(extendGenome, replace=replaceModels)
- hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
- if cachePages > hitRDS.getDefaultCacheSize():
- hitRDS.setDBcache(cachePages)
-
allGIDs = set(hg.allGIDs())
if acceptfile is not None:
regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
gidReadDict[gid] = []
index = 0
- if withMulti and not withUniqs:
- chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
- else:
- chromList = hitRDS.getChromosomes(fullChrom=False)
-
- readlen = hitRDS.getReadSize()
- for chromosome in chromList:
+ chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+ for chromosome in chromosomeList:
if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
continue
print "\n%s " % chromosome,
- fullchrom = "chr%s" % chromosome
- hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
featureList = featuresByChromDict[chromosome]
- readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+ readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
index = totalProcessedReads
for (tagReadID, gid) in readGidList:
try:
return chromosome not in chromosomeList
-def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
startFeature = 0
readGidList = []
- ignoreSense = True
- for read in hitDict[fullchrom]:
- tagStart = read["start"]
- tagReadID = read["readID"]
- if read.has_key("sense"):
- tagSense = read["sense"]
- ignoreSense = False
+ readlen = getHeaderComment(bamfile.header, "ReadLength")
+ for alignedread in bamfile.fetch(chromosome):
+ if doNotProcessRead(alignedread, doUniqs, doMulti):
+ continue
+
+ tagStart = alignedread.pos
+ tagReadID = addPairIDtoReadID(alignedread)
+ tagSense = ""
+ if not ignoreSense:
+ if alignedread.is_reverse:
+ tagSense = "-"
+ else:
+ tagSense = "+"
index += 1
if index % 100000 == 0:
return readGidList, index
+def doNotProcessRead(read, doUniqs, doMulti):
+ doNotProcess = False
+ if isSpliceEntry(read.cigar):
+ doNotProcess = True
+ elif doUniqs and read.opt('NH') > 1:
+ doNotProcess = True
+ elif doMulti and read.opt('NH') == 1:
+ doNotProcess = True
+
+ return doNotProcess
+
+
def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
uniqueCountDict = {}
try:
tagValue = uniqueCountDict[gid]
except KeyError:
- tagValue = 1
+ pass
tagDenom = 0.
for relatedGID in read2GidDict[readID]: