X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneMrnaCountsWeighted.py;fp=geneMrnaCountsWeighted.py;h=3d02c679d28cf03c8712a3031d5c207ac65bd45d;hp=31213c3f7a4703f9d9c8c1e87f17f4c0854e4574;hb=4ad5495359e4322da39868020a7398676261679e;hpb=cfc5602b26323ad2365295145e3f6c622d912eb4 diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py index 31213c3..3d02c67 100755 --- a/geneMrnaCountsWeighted.py +++ b/geneMrnaCountsWeighted.py @@ -6,13 +6,13 @@ except: import sys import optparse -from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption -import ReadDataset +import pysam +from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry from cistematic.genomes import Genome from commoncode import getGeneInfoDict from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB -print "geneMrnaCountsWeighted: version 4.3" +print "geneMrnaCountsWeighted: version 5.0" def main(argv=None): @@ -32,8 +32,9 @@ def main(argv=None): hitfile = args[1] countfile = args[2] outfilename = args[3] + bamfile = pysam.Samfile(hitfile, "rb") - geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense, + geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense, options.withUniqs, options.withMulti, options.acceptfile, options.cachePages, options.doVerbose, options.extendGenome, options.replaceModels) @@ -68,13 +69,7 @@ def makeParser(usage=""): return parser -#TODO: Reported user performance issue. Long run times in conditions: -# small number of reads ~40-50M -# all features on single chromosome -# -# User states has been a long time problem. - -def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True, +def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True, withUniqs=False, withMulti=False, acceptfile=None, cachePages=None, doVerbose=False, extendGenome="", replaceModels=False): @@ -89,7 +84,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense= doCache = True else: doCache = False - cachePages = 0 hg = Genome(genome, inRAM=True) if extendGenome: @@ -100,10 +94,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense= hg.extendFeatures(extendGenome, replace=replaceModels) - hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache) - if cachePages > hitRDS.getDefaultCacheSize(): - hitRDS.setDBcache(cachePages) - allGIDs = set(hg.allGIDs()) if acceptfile is not None: regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose) @@ -121,22 +111,15 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense= gidReadDict[gid] = [] index = 0 - if withMulti and not withUniqs: - chromList = hitRDS.getChromosomes(table="multi", fullChrom=False) - else: - chromList = hitRDS.getChromosomes(fullChrom=False) - - readlen = hitRDS.getReadSize() - for chromosome in chromList: + chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"] + for chromosome in chromosomeList: if doNotProcessChromosome(chromosome, featuresByChromDict.keys()): continue print "\n%s " % chromosome, - fullchrom = "chr%s" % chromosome - hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti) featureList = featuresByChromDict[chromosome] - readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index) + readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense) index = totalProcessedReads for (tagReadID, gid) in readGidList: try: @@ -157,17 +140,23 @@ def doNotProcessChromosome(chromosome, chromosomeList): return chromosome not in chromosomeList -def getReadGIDs(hitDict, fullchrom, featList, readlen, index): +def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True): startFeature = 0 readGidList = [] - ignoreSense = True - for read in hitDict[fullchrom]: - tagStart = read["start"] - tagReadID = read["readID"] - if "sense" in read: - tagSense = read["sense"] - ignoreSense = False + readlen = getHeaderComment(bamfile.header, "ReadLength") + for alignedread in bamfile.fetch(chromosome): + if doNotProcessRead(alignedread, doUniqs, doMulti): + continue + + tagStart = alignedread.pos + tagReadID = addPairIDtoReadID(alignedread) + tagSense = "" + if not ignoreSense: + if alignedread.is_reverse: + tagSense = "-" + else: + tagSense = "+" index += 1 if index % 100000 == 0: @@ -199,6 +188,18 @@ def getReadGIDs(hitDict, fullchrom, featList, readlen, index): return readGidList, index +def doNotProcessRead(read, doUniqs, doMulti): + doNotProcess = False + if isSpliceEntry(read.cigar): + doNotProcess = True + elif doUniqs and read.opt('NH') > 1: + doNotProcess = True + elif doMulti and read.opt('NH') == 1: + doNotProcess = True + + return doNotProcess + + def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False): uniqueCountDict = {}