convert standard analysis pipelines to use bam format natively
[erange.git] / geneMrnaCountsWeighted.py
index 31213c3f7a4703f9d9c8c1e87f17f4c0854e4574..3d02c679d28cf03c8712a3031d5c207ac65bd45d 100755 (executable)
@@ -6,13 +6,13 @@ except:
 
 import sys
 import optparse
 
 import sys
 import optparse
-from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
 from cistematic.genomes import Genome
 from commoncode import getGeneInfoDict
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 
 from cistematic.genomes import Genome
 from commoncode import getGeneInfoDict
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 
-print "geneMrnaCountsWeighted: version 4.3"
+print "geneMrnaCountsWeighted: version 5.0"
 
 
 def main(argv=None):
 
 
 def main(argv=None):
@@ -32,8 +32,9 @@ def main(argv=None):
     hitfile =  args[1]
     countfile = args[2]
     outfilename = args[3]
     hitfile =  args[1]
     countfile = args[2]
     outfilename = args[3]
+    bamfile = pysam.Samfile(hitfile, "rb")
 
 
-    geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+    geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
                            options.withUniqs, options.withMulti,
                            options.acceptfile, options.cachePages, options.doVerbose,
                            options.extendGenome, options.replaceModels)
                            options.withUniqs, options.withMulti,
                            options.acceptfile, options.cachePages, options.doVerbose,
                            options.extendGenome, options.replaceModels)
@@ -68,13 +69,7 @@ def makeParser(usage=""):
     return parser
 
 
     return parser
 
 
-#TODO: Reported user performance issue. Long run times in conditions:
-#    small number of reads ~40-50M
-#    all features on single chromosome
-#
-#    User states has been a long time problem.
-
-def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
 
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
 
@@ -89,7 +84,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
         doCache = True
     else:
         doCache = False
         doCache = True
     else:
         doCache = False
-        cachePages = 0
         hg = Genome(genome, inRAM=True)
 
     if extendGenome:
         hg = Genome(genome, inRAM=True)
 
     if extendGenome:
@@ -100,10 +94,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
 
         hg.extendFeatures(extendGenome, replace=replaceModels)
 
 
         hg.extendFeatures(extendGenome, replace=replaceModels)
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
     allGIDs = set(hg.allGIDs())
     if acceptfile is not None:
         regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
     allGIDs = set(hg.allGIDs())
     if acceptfile is not None:
         regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
@@ -121,22 +111,15 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
         gidReadDict[gid] = []
 
     index = 0
         gidReadDict[gid] = []
 
     index = 0
-    if withMulti and not withUniqs:
-        chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
-    else:
-        chromList = hitRDS.getChromosomes(fullChrom=False)
-
-    readlen = hitRDS.getReadSize()
-    for chromosome in chromList:
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chromosome in chromosomeList:
         if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
             continue
 
         print "\n%s " % chromosome,
         if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
             continue
 
         print "\n%s " % chromosome,
-        fullchrom = "chr%s" % chromosome
-        hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
         featureList = featuresByChromDict[chromosome]
 
         featureList = featuresByChromDict[chromosome]
 
-        readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+        readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
         index = totalProcessedReads
         for (tagReadID, gid) in readGidList:
             try:
         index = totalProcessedReads
         for (tagReadID, gid) in readGidList:
             try:
@@ -157,17 +140,23 @@ def doNotProcessChromosome(chromosome, chromosomeList):
     return chromosome not in chromosomeList
 
 
     return chromosome not in chromosomeList
 
 
-def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
 
     startFeature = 0
     readGidList = []
 
     startFeature = 0
     readGidList = []
-    ignoreSense = True
-    for read in hitDict[fullchrom]:
-        tagStart = read["start"]
-        tagReadID = read["readID"]
-        if "sense" in read:
-            tagSense = read["sense"]
-            ignoreSense = False
+    readlen = getHeaderComment(bamfile.header, "ReadLength")
+    for alignedread in bamfile.fetch(chromosome):
+        if doNotProcessRead(alignedread, doUniqs, doMulti):
+            continue
+
+        tagStart = alignedread.pos
+        tagReadID = addPairIDtoReadID(alignedread)
+        tagSense = ""
+        if not ignoreSense:
+            if alignedread.is_reverse:
+                tagSense = "-"
+            else:
+                tagSense = "+"
 
         index += 1
         if index % 100000 == 0:
 
         index += 1
         if index % 100000 == 0:
@@ -199,6 +188,18 @@ def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
     return readGidList, index
 
 
     return readGidList, index
 
 
+def doNotProcessRead(read, doUniqs, doMulti):
+    doNotProcess = False
+    if isSpliceEntry(read.cigar):
+        doNotProcess = True
+    elif doUniqs and read.opt('NH') > 1:
+        doNotProcess = True
+    elif doMulti and read.opt('NH') == 1:
+        doNotProcess = True
+
+    return doNotProcess
+
+
 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
 
     uniqueCountDict = {}
 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
 
     uniqueCountDict = {}