X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneMrnaCountsWeighted.py;h=3d02c679d28cf03c8712a3031d5c207ac65bd45d;hp=5299d27d26c23f8729063eb31f0da390730b5ce5;hb=HEAD;hpb=77dccd7c98d8cdb60caaf178b1123df71ea662c9

diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py
index 5299d27..3d02c67 100755
--- a/geneMrnaCountsWeighted.py
+++ b/geneMrnaCountsWeighted.py
@@ -6,13 +6,13 @@ except:
 
 import sys
 import optparse
-from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
 from cistematic.genomes import Genome
 from commoncode import getGeneInfoDict
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 
-print "geneMrnaCountsWeighted: version 4.3"
+print "geneMrnaCountsWeighted: version 5.0"
 
 
 def main(argv=None):
@@ -32,8 +32,9 @@ def main(argv=None):
     hitfile =  args[1]
     countfile = args[2]
     outfilename = args[3]
+    bamfile = pysam.Samfile(hitfile, "rb")
 
-    geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
+    geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
                            options.withUniqs, options.withMulti,
                            options.acceptfile, options.cachePages, options.doVerbose,
                            options.extendGenome, options.replaceModels)
@@ -68,7 +69,7 @@ def makeParser(usage=""):
     return parser
 
 
-def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
+def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
 
@@ -83,7 +84,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
         doCache = True
     else:
         doCache = False
-        cachePages = 0
         hg = Genome(genome, inRAM=True)
 
     if extendGenome:
@@ -94,10 +94,6 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
 
         hg.extendFeatures(extendGenome, replace=replaceModels)
 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=doVerbose, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
     allGIDs = set(hg.allGIDs())
     if acceptfile is not None:
         regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
@@ -115,22 +111,15 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
         gidReadDict[gid] = []
 
     index = 0
-    if withMulti and not withUniqs:
-        chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
-    else:
-        chromList = hitRDS.getChromosomes(fullChrom=False)
-
-    readlen = hitRDS.getReadSize()
-    for chromosome in chromList:
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chromosome in chromosomeList:
         if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
             continue
 
         print "\n%s " % chromosome,
-        fullchrom = "chr%s" % chromosome
-        hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
         featureList = featuresByChromDict[chromosome]
 
-        readGidList, totalProcessedReads = getReadGIDs(hitDict, fullchrom, featureList, readlen, index)
+        readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
         index = totalProcessedReads
         for (tagReadID, gid) in readGidList:
             try:
@@ -151,17 +140,23 @@ def doNotProcessChromosome(chromosome, chromosomeList):
     return chromosome not in chromosomeList
 
 
-def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
+def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
 
     startFeature = 0
     readGidList = []
-    ignoreSense = True
-    for read in hitDict[fullchrom]:
-        tagStart = read["start"]
-        tagReadID = read["readID"]
-        if read.has_key("sense"):
-            tagSense = read["sense"]
-            ignoreSense = False
+    readlen = getHeaderComment(bamfile.header, "ReadLength")
+    for alignedread in bamfile.fetch(chromosome):
+        if doNotProcessRead(alignedread, doUniqs, doMulti):
+            continue
+
+        tagStart = alignedread.pos
+        tagReadID = addPairIDtoReadID(alignedread)
+        tagSense = ""
+        if not ignoreSense:
+            if alignedread.is_reverse:
+                tagSense = "-"
+            else:
+                tagSense = "+"
 
         index += 1
         if index % 100000 == 0:
@@ -193,6 +188,18 @@ def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
     return readGidList, index
 
 
+def doNotProcessRead(read, doUniqs, doMulti):
+    doNotProcess = False
+    if isSpliceEntry(read.cigar):
+        doNotProcess = True
+    elif doUniqs and read.opt('NH') > 1:
+        doNotProcess = True
+    elif doMulti and read.opt('NH') == 1:
+        doNotProcess = True
+
+    return doNotProcess
+
+
 def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
 
     uniqueCountDict = {}
@@ -200,6 +207,7 @@ def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict,
     for line in uniquecounts:
         fields = line.strip().split()
         # add a pseudo-count here to ease calculations below
+        #TODO: figure out why this was done in prior implementation...
         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
 
     uniquecounts.close()
@@ -246,7 +254,7 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
         try:
             tagValue = uniqueCountDict[gid]
         except KeyError:
-            tagValue = 1
+            pass
 
         tagDenom = 0.
         for relatedGID in read2GidDict[readID]:
@@ -264,4 +272,4 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
 
 
 if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
\ No newline at end of file