X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneMrnaCountsWeighted.py;h=3d02c679d28cf03c8712a3031d5c207ac65bd45d;hp=5299d27d26c23f8729063eb31f0da390730b5ce5;hb=HEAD;hpb=77dccd7c98d8cdb60caaf178b1123df71ea662c9 diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py index 5299d27..3d02c67 100755 --- a/geneMrnaCountsWeighted.py +++ b/geneMrnaCountsWeighted.py @@ -6,13 +6,13 @@ except: import sys import optparse -from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption -import ReadDataset +import pysam +from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry from cistematic.genomes import Genome from commoncode import getGeneInfoDict from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB -print "geneMrnaCountsWeighted: version 4.3" +print "geneMrnaCountsWeighted: version 5.0" def main(argv=None): @@ -32,8 +32,9 @@ def main(argv=None): hitfile = args[1] countfile = args[2] outfilename = args[3] + bamfile = pysam.Samfile(hitfile, "rb") - geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense, + geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense, options.withUniqs, options.withMulti, options.acceptfile, options.cachePages, options.doVerbose, options.extendGenome, options.replaceModels) @@ -68,7 +69,7 @@ def makeParser(usage=""): return parser -def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True, +def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True, withUniqs=False, withMulti=False, acceptfile=None, cachePages=None, doVerbose=False, extendGenome="", replaceModels=False): @@ -83,7 +84,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense= doCache = True else: doCache = False - cachePages = 0 hg = Genome(genome, inRAM=True) if extendGenome: @@ -94,10 +94,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense= hg.extendFeatures(extendGenome, replace=replaceModels) - hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache) - if cachePages > hitRDS.getDefaultCacheSize(): - hitRDS.setDBcache(cachePages) - allGIDs = set(hg.allGIDs()) if acceptfile is not None: regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose) @@ -115,22 +111,15 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense= gidReadDict[gid] = [] index = 0 - if withMulti and not withUniqs: - chromList = hitRDS.getChromosomes(table="multi", fullChrom=False) - else: - chromList = hitRDS.getChromosomes(fullChrom=False) - - readlen = hitRDS.getReadSize() - for chromosome in chromList: + chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"] + for chromosome in chromosomeList: if doNotProcessChromosome(chromosome, featuresByChromDict.keys()): continue print "\n%s " % chromosome, - fullchrom = "chr%s" % chromosome - hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti) featureList = featuresByChromDict[chromosome] - readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index) + readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense) index = totalProcessedReads for (tagReadID, gid) in readGidList: try: @@ -151,17 +140,23 @@ def doNotProcessChromosome(chromosome, chromosomeList): return chromosome not in chromosomeList -def getReadGIDs(hitDict, fullchrom, featList, readlen, index): +def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True): startFeature = 0 readGidList = [] - ignoreSense = True - for read in hitDict[fullchrom]: - tagStart = read["start"] - tagReadID = read["readID"] - if read.has_key("sense"): - tagSense = read["sense"] - ignoreSense = False + readlen = getHeaderComment(bamfile.header, "ReadLength") + for alignedread in bamfile.fetch(chromosome): + if doNotProcessRead(alignedread, doUniqs, doMulti): + continue + + tagStart = alignedread.pos + tagReadID = addPairIDtoReadID(alignedread) + tagSense = "" + if not ignoreSense: + if alignedread.is_reverse: + tagSense = "-" + else: + tagSense = "+" index += 1 if index % 100000 == 0: @@ -193,6 +188,18 @@ def getReadGIDs(hitDict, fullchrom, featList, readlen, index): return readGidList, index +def doNotProcessRead(read, doUniqs, doMulti): + doNotProcess = False + if isSpliceEntry(read.cigar): + doNotProcess = True + elif doUniqs and read.opt('NH') > 1: + doNotProcess = True + elif doMulti and read.opt('NH') == 1: + doNotProcess = True + + return doNotProcess + + def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False): uniqueCountDict = {} @@ -200,6 +207,7 @@ def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, for line in uniquecounts: fields = line.strip().split() # add a pseudo-count here to ease calculations below + #TODO: figure out why this was done in prior implementation... uniqueCountDict[fields[0]] = float(fields[-1]) + 1 uniquecounts.close() @@ -246,7 +254,7 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict): try: tagValue = uniqueCountDict[gid] except KeyError: - tagValue = 1 + pass tagDenom = 0. for relatedGID in read2GidDict[readID]: @@ -264,4 +272,4 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict): if __name__ == "__main__": - main(sys.argv) + main(sys.argv) \ No newline at end of file