convert standard analysis pipelines to use bam format natively

[erange.git] / geneMrnaCountsWeighted.py
diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py

index 31213c3f7a4703f9d9c8c1e87f17f4c0854e4574..3d02c679d28cf03c8712a3031d5c207ac65bd45d 100755 (executable)
--- a/geneMrnaCountsWeighted.py
+++ b/geneMrnaCountsWeighted.py
@@ -6,13 +6,13 @@ except:
  
  import sys
  import optparse
  
  import sys
  import optparse
-from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
  from cistematic.genomes import Genome
  from commoncode import getGeneInfoDict
  from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
  
  from cistematic.genomes import Genome
  from commoncode import getGeneInfoDict
  from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
  
-print "geneMrnaCountsWeighted: version 4.3"
+print "geneMrnaCountsWeighted: version 5.0"
  
  
  def main(argv=None):
  
  
  def main(argv=None):
@@ -32,8 +32,9 @@ def main(argv=None):
      hitfile =  args[1]
      countfile = args[2]
      outfilename = args[3]
      hitfile =  args[1]
      countfile = args[2]
      outfilename = args[3]
+    bamfile = pysam.Samfile(hitfile, "rb")
  
  
-    geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+    geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
                             options.withUniqs, options.withMulti,
                             options.acceptfile, options.cachePages, options.doVerbose,
                             options.extendGenome, options.replaceModels)
                             options.withUniqs, options.withMulti,
                             options.acceptfile, options.cachePages, options.doVerbose,
                             options.extendGenome, options.replaceModels)
@@ -68,13 +69,7 @@ def makeParser(usage=""):
      return parser
  
  
      return parser
  
  
-#TODO: Reported user performance issue. Long run times in conditions:
-#    small number of reads ~40-50M
-#    all features on single chromosome
-#
-#    User states has been a long time problem.
-
-def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
                             withUniqs=False, withMulti=False, acceptfile=None,
                             cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
  
                             withUniqs=False, withMulti=False, acceptfile=None,
                             cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
  
@@ -89,7 +84,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
          doCache = True
      else:
          doCache = False
          doCache = True
      else:
          doCache = False
-        cachePages = 0
          hg = Genome(genome, inRAM=True)
  
      if extendGenome:
          hg = Genome(genome, inRAM=True)
  
      if extendGenome:
@@ -100,10 +94,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
  
          hg.extendFeatures(extendGenome, replace=replaceModels)
  
  
          hg.extendFeatures(extendGenome, replace=replaceModels)
  
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
      allGIDs = set(hg.allGIDs())
      if acceptfile is not None:
          regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
      allGIDs = set(hg.allGIDs())
      if acceptfile is not None:
          regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
@@ -121,22 +111,15 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
          gidReadDict[gid] = []
  
      index = 0
          gidReadDict[gid] = []
  
      index = 0
-    if withMulti and not withUniqs:
-        chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
-    else:
-        chromList = hitRDS.getChromosomes(fullChrom=False)
-
-    readlen = hitRDS.getReadSize()
-    for chromosome in chromList:
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chromosome in chromosomeList:
          if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
              continue
  
          print "\n%s " % chromosome,
          if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
              continue
  
          print "\n%s " % chromosome,
-        fullchrom = "chr%s" % chromosome
-        hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
          featureList = featuresByChromDict[chromosome]
  
          featureList = featuresByChromDict[chromosome]
  
-        readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+        readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
          index = totalProcessedReads
          for (tagReadID, gid) in readGidList:
              try:
          index = totalProcessedReads
          for (tagReadID, gid) in readGidList:
              try:
@@ -157,17 +140,23 @@ def doNotProcessChromosome(chromosome, chromosomeList):
      return chromosome not in chromosomeList
  
  
      return chromosome not in chromosomeList
  
  
-def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
  
      startFeature = 0
      readGidList = []
  
      startFeature = 0
      readGidList = []
-    ignoreSense = True
-    for read in hitDict[fullchrom]:
-        tagStart = read["start"]
-        tagReadID = read["readID"]
-        if "sense" in read:
-            tagSense = read["sense"]
-            ignoreSense = False
+    readlen = getHeaderComment(bamfile.header, "ReadLength")
+    for alignedread in bamfile.fetch(chromosome):
+        if doNotProcessRead(alignedread, doUniqs, doMulti):
+            continue
+
+        tagStart = alignedread.pos
+        tagReadID = addPairIDtoReadID(alignedread)
+        tagSense = ""
+        if not ignoreSense:
+            if alignedread.is_reverse:
+                tagSense = "-"
+            else:
+                tagSense = "+"
  
          index += 1
          if index % 100000 == 0:
  
          index += 1
          if index % 100000 == 0:
@@ -199,6 +188,18 @@ def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
      return readGidList, index
  
  
      return readGidList, index
  
  
+def doNotProcessRead(read, doUniqs, doMulti):
+    doNotProcess = False
+    if isSpliceEntry(read.cigar):
+        doNotProcess = True
+    elif doUniqs and read.opt('NH') > 1:
+        doNotProcess = True
+    elif doMulti and read.opt('NH') == 1:
+        doNotProcess = True
+
+    return doNotProcess
+
+
  def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
  
      uniqueCountDict = {}
  def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
  
      uniqueCountDict = {}