first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / geneMrnaCountsWeighted.py
index 5299d27d26c23f8729063eb31f0da390730b5ce5..3d02c679d28cf03c8712a3031d5c207ac65bd45d 100755 (executable)
@@ -6,13 +6,13 @@ except:
 
 import sys
 import optparse
 
 import sys
 import optparse
-from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
 from cistematic.genomes import Genome
 from commoncode import getGeneInfoDict
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 
 from cistematic.genomes import Genome
 from commoncode import getGeneInfoDict
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 
-print "geneMrnaCountsWeighted: version 4.3"
+print "geneMrnaCountsWeighted: version 5.0"
 
 
 def main(argv=None):
 
 
 def main(argv=None):
@@ -32,8 +32,9 @@ def main(argv=None):
     hitfile =  args[1]
     countfile = args[2]
     outfilename = args[3]
     hitfile =  args[1]
     countfile = args[2]
     outfilename = args[3]
+    bamfile = pysam.Samfile(hitfile, "rb")
 
 
-    geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+    geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
                            options.withUniqs, options.withMulti,
                            options.acceptfile, options.cachePages, options.doVerbose,
                            options.extendGenome, options.replaceModels)
                            options.withUniqs, options.withMulti,
                            options.acceptfile, options.cachePages, options.doVerbose,
                            options.extendGenome, options.replaceModels)
@@ -68,7 +69,7 @@ def makeParser(usage=""):
     return parser
 
 
     return parser
 
 
-def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
 
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
 
@@ -83,7 +84,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
         doCache = True
     else:
         doCache = False
         doCache = True
     else:
         doCache = False
-        cachePages = 0
         hg = Genome(genome, inRAM=True)
 
     if extendGenome:
         hg = Genome(genome, inRAM=True)
 
     if extendGenome:
@@ -94,10 +94,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
 
         hg.extendFeatures(extendGenome, replace=replaceModels)
 
 
         hg.extendFeatures(extendGenome, replace=replaceModels)
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
     allGIDs = set(hg.allGIDs())
     if acceptfile is not None:
         regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
     allGIDs = set(hg.allGIDs())
     if acceptfile is not None:
         regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
@@ -115,22 +111,15 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
         gidReadDict[gid] = []
 
     index = 0
         gidReadDict[gid] = []
 
     index = 0
-    if withMulti and not withUniqs:
-        chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
-    else:
-        chromList = hitRDS.getChromosomes(fullChrom=False)
-
-    readlen = hitRDS.getReadSize()
-    for chromosome in chromList:
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chromosome in chromosomeList:
         if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
             continue
 
         print "\n%s " % chromosome,
         if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
             continue
 
         print "\n%s " % chromosome,
-        fullchrom = "chr%s" % chromosome
-        hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
         featureList = featuresByChromDict[chromosome]
 
         featureList = featuresByChromDict[chromosome]
 
-        readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+        readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
         index = totalProcessedReads
         for (tagReadID, gid) in readGidList:
             try:
         index = totalProcessedReads
         for (tagReadID, gid) in readGidList:
             try:
@@ -151,17 +140,23 @@ def doNotProcessChromosome(chromosome, chromosomeList):
     return chromosome not in chromosomeList
 
 
     return chromosome not in chromosomeList
 
 
-def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
 
     startFeature = 0
     readGidList = []
 
     startFeature = 0
     readGidList = []
-    ignoreSense = True
-    for read in hitDict[fullchrom]:
-        tagStart = read["start"]
-        tagReadID = read["readID"]
-        if read.has_key("sense"):
-            tagSense = read["sense"]
-            ignoreSense = False
+    readlen = getHeaderComment(bamfile.header, "ReadLength")
+    for alignedread in bamfile.fetch(chromosome):
+        if doNotProcessRead(alignedread, doUniqs, doMulti):
+            continue
+
+        tagStart = alignedread.pos
+        tagReadID = addPairIDtoReadID(alignedread)
+        tagSense = ""
+        if not ignoreSense:
+            if alignedread.is_reverse:
+                tagSense = "-"
+            else:
+                tagSense = "+"
 
         index += 1
         if index % 100000 == 0:
 
         index += 1
         if index % 100000 == 0:
@@ -193,6 +188,18 @@ def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
     return readGidList, index
 
 
     return readGidList, index
 
 
+def doNotProcessRead(read, doUniqs, doMulti):
+    doNotProcess = False
+    if isSpliceEntry(read.cigar):
+        doNotProcess = True
+    elif doUniqs and read.opt('NH') > 1:
+        doNotProcess = True
+    elif doMulti and read.opt('NH') == 1:
+        doNotProcess = True
+
+    return doNotProcess
+
+
 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
 
     uniqueCountDict = {}
 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
 
     uniqueCountDict = {}
@@ -200,6 +207,7 @@ def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict,
     for line in uniquecounts:
         fields = line.strip().split()
         # add a pseudo-count here to ease calculations below
     for line in uniquecounts:
         fields = line.strip().split()
         # add a pseudo-count here to ease calculations below
+        #TODO: figure out why this was done in prior implementation...
         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
 
     uniquecounts.close()
         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
 
     uniquecounts.close()
@@ -246,7 +254,7 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
         try:
             tagValue = uniqueCountDict[gid]
         except KeyError:
         try:
             tagValue = uniqueCountDict[gid]
         except KeyError:
-            tagValue = 1
+            pass
 
         tagDenom = 0.
         for relatedGID in read2GidDict[readID]:
 
         tagDenom = 0.
         for relatedGID in read2GidDict[readID]:
@@ -264,4 +272,4 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
 
 
 if __name__ == "__main__":
 
 
 if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
\ No newline at end of file