first pass cleanup of cistematic/genomes; change bamPreprocessing

[erange.git] / geneMrnaCountsWeighted.py
diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py

index 5299d27d26c23f8729063eb31f0da390730b5ce5..3d02c679d28cf03c8712a3031d5c207ac65bd45d 100755 (executable)
--- a/geneMrnaCountsWeighted.py
+++ b/geneMrnaCountsWeighted.py
@@ -6,13 +6,13 @@ except:
  
  import sys
  import optparse
  
  import sys
  import optparse
-from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
  from cistematic.genomes import Genome
  from commoncode import getGeneInfoDict
  from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
  
  from cistematic.genomes import Genome
  from commoncode import getGeneInfoDict
  from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
  
-print "geneMrnaCountsWeighted: version 4.3"
+print "geneMrnaCountsWeighted: version 5.0"
  
  
  def main(argv=None):
  
  
  def main(argv=None):
@@ -32,8 +32,9 @@ def main(argv=None):
      hitfile =  args[1]
      countfile = args[2]
      outfilename = args[3]
      hitfile =  args[1]
      countfile = args[2]
      outfilename = args[3]
+    bamfile = pysam.Samfile(hitfile, "rb")
  
  
-    geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+    geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
                             options.withUniqs, options.withMulti,
                             options.acceptfile, options.cachePages, options.doVerbose,
                             options.extendGenome, options.replaceModels)
                             options.withUniqs, options.withMulti,
                             options.acceptfile, options.cachePages, options.doVerbose,
                             options.extendGenome, options.replaceModels)
@@ -68,7 +69,7 @@ def makeParser(usage=""):
      return parser
  
  
      return parser
  
  
-def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
                             withUniqs=False, withMulti=False, acceptfile=None,
                             cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
  
                             withUniqs=False, withMulti=False, acceptfile=None,
                             cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
  
@@ -83,7 +84,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
          doCache = True
      else:
          doCache = False
          doCache = True
      else:
          doCache = False
-        cachePages = 0
          hg = Genome(genome, inRAM=True)
  
      if extendGenome:
          hg = Genome(genome, inRAM=True)
  
      if extendGenome:
@@ -94,10 +94,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
  
          hg.extendFeatures(extendGenome, replace=replaceModels)
  
  
          hg.extendFeatures(extendGenome, replace=replaceModels)
  
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
      allGIDs = set(hg.allGIDs())
      if acceptfile is not None:
          regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
      allGIDs = set(hg.allGIDs())
      if acceptfile is not None:
          regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
@@ -115,22 +111,15 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
          gidReadDict[gid] = []
  
      index = 0
          gidReadDict[gid] = []
  
      index = 0
-    if withMulti and not withUniqs:
-        chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
-    else:
-        chromList = hitRDS.getChromosomes(fullChrom=False)
-
-    readlen = hitRDS.getReadSize()
-    for chromosome in chromList:
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chromosome in chromosomeList:
          if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
              continue
  
          print "\n%s " % chromosome,
          if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
              continue
  
          print "\n%s " % chromosome,
-        fullchrom = "chr%s" % chromosome
-        hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
          featureList = featuresByChromDict[chromosome]
  
          featureList = featuresByChromDict[chromosome]
  
-        readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+        readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
          index = totalProcessedReads
          for (tagReadID, gid) in readGidList:
              try:
          index = totalProcessedReads
          for (tagReadID, gid) in readGidList:
              try:
@@ -151,17 +140,23 @@ def doNotProcessChromosome(chromosome, chromosomeList):
      return chromosome not in chromosomeList
  
  
      return chromosome not in chromosomeList
  
  
-def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
  
      startFeature = 0
      readGidList = []
  
      startFeature = 0
      readGidList = []
-    ignoreSense = True
-    for read in hitDict[fullchrom]:
-        tagStart = read["start"]
-        tagReadID = read["readID"]
-        if read.has_key("sense"):
-            tagSense = read["sense"]
-            ignoreSense = False
+    readlen = getHeaderComment(bamfile.header, "ReadLength")
+    for alignedread in bamfile.fetch(chromosome):
+        if doNotProcessRead(alignedread, doUniqs, doMulti):
+            continue
+
+        tagStart = alignedread.pos
+        tagReadID = addPairIDtoReadID(alignedread)
+        tagSense = ""
+        if not ignoreSense:
+            if alignedread.is_reverse:
+                tagSense = "-"
+            else:
+                tagSense = "+"
  
          index += 1
          if index % 100000 == 0:
  
          index += 1
          if index % 100000 == 0:
@@ -193,6 +188,18 @@ def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
      return readGidList, index
  
  
      return readGidList, index
  
  
+def doNotProcessRead(read, doUniqs, doMulti):
+    doNotProcess = False
+    if isSpliceEntry(read.cigar):
+        doNotProcess = True
+    elif doUniqs and read.opt('NH') > 1:
+        doNotProcess = True
+    elif doMulti and read.opt('NH') == 1:
+        doNotProcess = True
+
+    return doNotProcess
+
+
  def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
  
      uniqueCountDict = {}
  def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
  
      uniqueCountDict = {}
@@ -200,6 +207,7 @@ def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict,
      for line in uniquecounts:
          fields = line.strip().split()
          # add a pseudo-count here to ease calculations below
      for line in uniquecounts:
          fields = line.strip().split()
          # add a pseudo-count here to ease calculations below
+        #TODO: figure out why this was done in prior implementation...
          uniqueCountDict[fields[0]] = float(fields[-1]) + 1
  
      uniquecounts.close()
          uniqueCountDict[fields[0]] = float(fields[-1]) + 1
  
      uniquecounts.close()
@@ -246,7 +254,7 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
          try:
              tagValue = uniqueCountDict[gid]
          except KeyError:
          try:
              tagValue = uniqueCountDict[gid]
          except KeyError:
-            tagValue = 1
+            pass
  
          tagDenom = 0.
          for relatedGID in read2GidDict[readID]:
  
          tagDenom = 0.
          for relatedGID in read2GidDict[readID]:
@@ -264,4 +272,4 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
  
  
  if __name__ == "__main__":
  
  
  if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
+\ No newline at end of file