convert standard analysis pipelines to use bam format natively

[erange.git] / geneMrnaCountsWeighted.py
diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py

index 31213c3f7a4703f9d9c8c1e87f17f4c0854e4574..3d02c679d28cf03c8712a3031d5c207ac65bd45d 100755 (executable)
--- a/geneMrnaCountsWeighted.py
+++ b/geneMrnaCountsWeighted.py
@@ -6,13 +6,13 @@ except:
  
  import sys
  import optparse
-from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
  from cistematic.genomes import Genome
  from commoncode import getGeneInfoDict
  from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
  
-print "geneMrnaCountsWeighted: version 4.3"
+print "geneMrnaCountsWeighted: version 5.0"
  
  
  def main(argv=None):
@@ -32,8 +32,9 @@ def main(argv=None):
      hitfile =  args[1]
      countfile = args[2]
      outfilename = args[3]
+    bamfile = pysam.Samfile(hitfile, "rb")
  
-    geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+    geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
                             options.withUniqs, options.withMulti,
                             options.acceptfile, options.cachePages, options.doVerbose,
                             options.extendGenome, options.replaceModels)
@@ -68,13 +69,7 @@ def makeParser(usage=""):
      return parser
  
  
-#TODO: Reported user performance issue. Long run times in conditions:
-#    small number of reads ~40-50M
-#    all features on single chromosome
-#
-#    User states has been a long time problem.
-
-def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
                             withUniqs=False, withMulti=False, acceptfile=None,
                             cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
  
@@ -89,7 +84,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
          doCache = True
      else:
          doCache = False
-        cachePages = 0
          hg = Genome(genome, inRAM=True)
  
      if extendGenome:
@@ -100,10 +94,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
  
          hg.extendFeatures(extendGenome, replace=replaceModels)
  
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
      allGIDs = set(hg.allGIDs())
      if acceptfile is not None:
          regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
@@ -121,22 +111,15 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
          gidReadDict[gid] = []
  
      index = 0
-    if withMulti and not withUniqs:
-        chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
-    else:
-        chromList = hitRDS.getChromosomes(fullChrom=False)
-
-    readlen = hitRDS.getReadSize()
-    for chromosome in chromList:
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chromosome in chromosomeList:
          if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
              continue
  
          print "\n%s " % chromosome,
-        fullchrom = "chr%s" % chromosome
-        hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
          featureList = featuresByChromDict[chromosome]
  
-        readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+        readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
          index = totalProcessedReads
          for (tagReadID, gid) in readGidList:
              try:
@@ -157,17 +140,23 @@ def doNotProcessChromosome(chromosome, chromosomeList):
      return chromosome not in chromosomeList
  
  
-def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
  
      startFeature = 0
      readGidList = []
-    ignoreSense = True
-    for read in hitDict[fullchrom]:
-        tagStart = read["start"]
-        tagReadID = read["readID"]
-        if "sense" in read:
-            tagSense = read["sense"]
-            ignoreSense = False
+    readlen = getHeaderComment(bamfile.header, "ReadLength")
+    for alignedread in bamfile.fetch(chromosome):
+        if doNotProcessRead(alignedread, doUniqs, doMulti):
+            continue
+
+        tagStart = alignedread.pos
+        tagReadID = addPairIDtoReadID(alignedread)
+        tagSense = ""
+        if not ignoreSense:
+            if alignedread.is_reverse:
+                tagSense = "-"
+            else:
+                tagSense = "+"
  
          index += 1
          if index % 100000 == 0:
@@ -199,6 +188,18 @@ def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
      return readGidList, index
  
  
+def doNotProcessRead(read, doUniqs, doMulti):
+    doNotProcess = False
+    if isSpliceEntry(read.cigar):
+        doNotProcess = True
+    elif doUniqs and read.opt('NH') > 1:
+        doNotProcess = True
+    elif doMulti and read.opt('NH') == 1:
+        doNotProcess = True
+
+    return doNotProcess
+
+
  def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
  
      uniqueCountDict = {}