convert standard analysis pipelines to use bam format natively
[erange.git] / geneMrnaCountsWeighted.py
index 31213c3f7a4703f9d9c8c1e87f17f4c0854e4574..3d02c679d28cf03c8712a3031d5c207ac65bd45d 100755 (executable)
@@ -6,13 +6,13 @@ except:
 
 import sys
 import optparse
-from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
 from cistematic.genomes import Genome
 from commoncode import getGeneInfoDict
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 
-print "geneMrnaCountsWeighted: version 4.3"
+print "geneMrnaCountsWeighted: version 5.0"
 
 
 def main(argv=None):
@@ -32,8 +32,9 @@ def main(argv=None):
     hitfile =  args[1]
     countfile = args[2]
     outfilename = args[3]
+    bamfile = pysam.Samfile(hitfile, "rb")
 
-    geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+    geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
                            options.withUniqs, options.withMulti,
                            options.acceptfile, options.cachePages, options.doVerbose,
                            options.extendGenome, options.replaceModels)
@@ -68,13 +69,7 @@ def makeParser(usage=""):
     return parser
 
 
-#TODO: Reported user performance issue. Long run times in conditions:
-#    small number of reads ~40-50M
-#    all features on single chromosome
-#
-#    User states has been a long time problem.
-
-def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
 
@@ -89,7 +84,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
         doCache = True
     else:
         doCache = False
-        cachePages = 0
         hg = Genome(genome, inRAM=True)
 
     if extendGenome:
@@ -100,10 +94,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
 
         hg.extendFeatures(extendGenome, replace=replaceModels)
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
     allGIDs = set(hg.allGIDs())
     if acceptfile is not None:
         regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
@@ -121,22 +111,15 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
         gidReadDict[gid] = []
 
     index = 0
-    if withMulti and not withUniqs:
-        chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
-    else:
-        chromList = hitRDS.getChromosomes(fullChrom=False)
-
-    readlen = hitRDS.getReadSize()
-    for chromosome in chromList:
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chromosome in chromosomeList:
         if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
             continue
 
         print "\n%s " % chromosome,
-        fullchrom = "chr%s" % chromosome
-        hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
         featureList = featuresByChromDict[chromosome]
 
-        readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+        readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
         index = totalProcessedReads
         for (tagReadID, gid) in readGidList:
             try:
@@ -157,17 +140,23 @@ def doNotProcessChromosome(chromosome, chromosomeList):
     return chromosome not in chromosomeList
 
 
-def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
 
     startFeature = 0
     readGidList = []
-    ignoreSense = True
-    for read in hitDict[fullchrom]:
-        tagStart = read["start"]
-        tagReadID = read["readID"]
-        if "sense" in read:
-            tagSense = read["sense"]
-            ignoreSense = False
+    readlen = getHeaderComment(bamfile.header, "ReadLength")
+    for alignedread in bamfile.fetch(chromosome):
+        if doNotProcessRead(alignedread, doUniqs, doMulti):
+            continue
+
+        tagStart = alignedread.pos
+        tagReadID = addPairIDtoReadID(alignedread)
+        tagSense = ""
+        if not ignoreSense:
+            if alignedread.is_reverse:
+                tagSense = "-"
+            else:
+                tagSense = "+"
 
         index += 1
         if index % 100000 == 0:
@@ -199,6 +188,18 @@ def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
     return readGidList, index
 
 
+def doNotProcessRead(read, doUniqs, doMulti):
+    doNotProcess = False
+    if isSpliceEntry(read.cigar):
+        doNotProcess = True
+    elif doUniqs and read.opt('NH') > 1:
+        doNotProcess = True
+    elif doMulti and read.opt('NH') == 1:
+        doNotProcess = True
+
+    return doNotProcess
+
+
 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
 
     uniqueCountDict = {}