X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneMrnaCountsWeighted.py;h=3d02c679d28cf03c8712a3031d5c207ac65bd45d;hp=7acf0b92fd1d93da0c29d4bdbba08af6c7d2a937;hb=HEAD;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4

diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py
index 7acf0b9..3d02c67 100755
--- a/geneMrnaCountsWeighted.py
+++ b/geneMrnaCountsWeighted.py
@@ -2,15 +2,17 @@ try:
     import psyco
     psyco.full()
 except:
-    print 'psyco not running'
+    print "psyco not running"
 
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, getFeaturesByChromDict
+import sys
+import optparse
+import pysam
+from commoncode import getMergedRegions, getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getHeaderComment, addPairIDtoReadID, isSpliceEntry
 from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict
 from cistematic.core import chooseDB, cacheGeneDB, uncacheGeneDB
 
-print '%s: version 4.1' % sys.argv[0]
+print "geneMrnaCountsWeighted: version 5.0"
 
 
 def main(argv=None):
@@ -19,21 +21,7 @@ def main(argv=None):
 
     usage = "usage: python %s genome rdsfile uniqcountfile outfile [options]"
 
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--stranded", action="store_false", dest="ignoreSense")
-    parser.add_option("--uniq", action="store_true", dest="withUniqs")
-    parser.add_option("--multi", action="store_true", dest="withMulti")
-    parser.add_option("--record", action="store_true", dest="recording",
-                      help="ignored with uniq reads")
-    parser.add_option("--accept", dest="acceptfile")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--verbose", action="store_true", dest="doVerbose")
-    parser.add_option("--models", dest="extendGenome")
-    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
-    parser.set_defaults(ignoreSense=True, withUniqs=False, withMulti=False, recording=False,
-                        acceptfile=None, cachePages=None, doVerbose=False, extendGenome="",
-                        replaceModels=False)
-
+    parser = makeParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
     if len(args) < 4:
@@ -44,15 +32,45 @@ def main(argv=None):
     hitfile =  args[1]
     countfile = args[2]
     outfilename = args[3]
+    bamfile = pysam.Samfile(hitfile, "rb")
 
-    geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, options.ignoreSense,
-                           options.withUniqs, options.withMulti, options.recording,
+    geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, options.ignoreSense,
+                           options.withUniqs, options.withMulti,
                            options.acceptfile, options.cachePages, options.doVerbose,
                            options.extendGenome, options.replaceModels)
 
 
-def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
-                           withUniqs=False, withMulti=False, recording=False, acceptfile=None,
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--stranded", action="store_false", dest="ignoreSense")
+    parser.add_option("--uniq", action="store_true", dest="withUniqs")
+    parser.add_option("--multi", action="store_true", dest="withMulti")
+    parser.add_option("--accept", dest="acceptfile")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+
+    configParser = getConfigParser()
+    section = "geneMrnaCountsWeighted"
+    ignoreSense = getConfigBoolOption(configParser, section, "ignoreSense", True)
+    withUniqs = getConfigBoolOption(configParser, section, "withUniqs", False)
+    withMulti = getConfigBoolOption(configParser, section, "withMulti", False)
+    acceptfile = getConfigOption(configParser, section, "acceptfile", None)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+    doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
+    extendGenome = getConfigOption(configParser, section, "extendGenome", "")
+    replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
+
+    parser.set_defaults(ignoreSense=ignoreSense, withUniqs=withUniqs, withMulti=withMulti,
+                        acceptfile=acceptfile, cachePages=cachePages, doVerbose=doVerbose, extendGenome=extendGenome,
+                        replaceModels=replaceModels)
+
+    return parser
+
+
+def geneMrnaCountsWeighted(genome, bamfile, countfile, outfilename, ignoreSense=True,
+                           withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
 
     if (not withUniqs and not withMulti) or (withUniqs and withMulti):
@@ -62,204 +80,195 @@ def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=
     if cachePages is not None:
         cacheGeneDB(genome)
         hg = Genome(genome, dbFile=chooseDB(genome), inRAM=True)
-        idb = geneinfoDB(cache=True)
         print "%s cached" % genome
         doCache = True
     else:
         doCache = False
-        cachePages = 0
         hg = Genome(genome, inRAM=True)
-        idb = geneinfoDB()
-
-    if acceptfile is not None:
-        acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
-    else:
-        acceptDict = {}
-
-    if recording and withUniqs:
-        recording = False
 
     if extendGenome:
         if replaceModels:
             print "will replace gene models with %s" % extendGenome
         else:
             print "will extend gene models with %s" % extendGenome
-    else:
-        replaceModels = False
 
-    if extendGenome != "":
-        hg.extendFeatures(extendGenome, replace = replaceModels)
-    
-    hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
+        hg.extendFeatures(extendGenome, replace=replaceModels)
 
-    readlen = hitRDS.getReadSize()
+    allGIDs = set(hg.allGIDs())
+    if acceptfile is not None:
+        regionDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=doVerbose)
+        for chrom in regionDict:
+            for region in regionDict[chrom]:
+                allGIDs.add(region.label)
+    else:
+        regionDict = {}
+
+    featuresByChromDict = getFeaturesByChromDict(hg, regionDict)
 
-    geneinfoDict = idb.getallGeneInfo(genome)
-    geneannotDict = hg.allAnnotInfo()
-    gidCount = {}
     gidReadDict = {}
+    read2GidDict = {}
+    for gid in allGIDs:
+        gidReadDict[gid] = []
 
-    featuresByChromDict = getFeaturesByChromDict(hg, acceptDict)
-    gidList = hg.allGIDs()
+    index = 0
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chromosome in chromosomeList:
+        if doNotProcessChromosome(chromosome, featuresByChromDict.keys()):
+            continue
 
-    gidList.sort()
-    for chrom in acceptDict:
-        for (label, start, stop, length) in acceptDict[chrom]:
-            if label not in gidList:
-                gidList.append(label)
+        print "\n%s " % chromosome,
+        featureList = featuresByChromDict[chromosome]
 
-    for gid in gidList:
-        gidCount[gid] = 0
-        gidReadDict[gid] = []
+        readGidList, totalProcessedReads = getReadGIDs(bamfile, chromosome, featureList, index, withUniqs, withMulti, ignoreSense=ignoreSense)
+        index = totalProcessedReads
+        for (tagReadID, gid) in readGidList:
+            try:
+                gidReadDict[gid].append(tagReadID)
+                if tagReadID in read2GidDict:
+                    read2GidDict[tagReadID].add(gid)
+                else:
+                    read2GidDict[tagReadID] = set([gid])
+            except KeyError:
+                print "gid %s not in gidReadDict" % gid
 
-    uniqueCountDict = {}
-    read2GidDict = {}
+    writeCountsToFile(outfilename, countfile, allGIDs, hg, gidReadDict, read2GidDict, doVerbose, doCache)
+    if doCache:
+        uncacheGeneDB(genome)
 
-    uniquecounts = open(countfile)
+
+def doNotProcessChromosome(chromosome, chromosomeList):
+    return chromosome not in chromosomeList
+
+
+def getReadGIDs(bamfile, chromosome, featList, index, doUniqs, doMulti, ignoreSense=True):
+
+    startFeature = 0
+    readGidList = []
+    readlen = getHeaderComment(bamfile.header, "ReadLength")
+    for alignedread in bamfile.fetch(chromosome):
+        if doNotProcessRead(alignedread, doUniqs, doMulti):
+            continue
+
+        tagStart = alignedread.pos
+        tagReadID = addPairIDtoReadID(alignedread)
+        tagSense = ""
+        if not ignoreSense:
+            if alignedread.is_reverse:
+                tagSense = "-"
+            else:
+                tagSense = "+"
+
+        index += 1
+        if index % 100000 == 0:
+            print "read %d" % index,
+
+        stopPoint = tagStart + readlen
+        if startFeature < 0:
+            startFeature = 0
+
+        for (start, stop, gid, sense, ftype) in featList[startFeature:]:
+            if tagStart > stop:
+                startFeature += 1
+                continue
+
+            if start > stopPoint:
+                startFeature -= 100
+                break
+
+            if not ignoreSense:
+                if sense == "R":
+                    sense = "-"
+                else:
+                    sense = "+"
+
+            if start <= tagStart <= stop and (ignoreSense or tagSense == sense):
+                readGidList.append((tagReadID, gid))
+                stopPoint = stop
+
+    return readGidList, index
+
+
+def doNotProcessRead(read, doUniqs, doMulti):
+    doNotProcess = False
+    if isSpliceEntry(read.cigar):
+        doNotProcess = True
+    elif doUniqs and read.opt('NH') > 1:
+        doNotProcess = True
+    elif doMulti and read.opt('NH') == 1:
+        doNotProcess = True
+
+    return doNotProcess
+
+
+def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, read2GidDict, doVerbose=False, doCache=False):
+
+    uniqueCountDict = {}
+    uniquecounts = open(countFilename)
     for line in uniquecounts:
         fields = line.strip().split()
         # add a pseudo-count here to ease calculations below
+        #TODO: figure out why this was done in prior implementation...
         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
 
     uniquecounts.close()
 
-    outfile = open(outfilename, "w")
+    genomeName = genome.genome
+    geneinfoDict = getGeneInfoDict(genomeName, cache=doCache)
+    geneannotDict = genome.allAnnotInfo()
+    outfile = open(outFilename, "w")
+    for gid in allGIDs:
+        symbol = getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict)
+        tagCount = getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict)
+        if doVerbose:
+            print "%s %s %f" % (gid, symbol, tagCount)
 
-    index = 0
-    if withMulti and not withUniqs:
-        chromList = hitRDS.getChromosomes(table="multi", fullChrom=False)
-    else:
-        chromList = hitRDS.getChromosomes(fullChrom=False)
+        outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount))
 
-    for achrom in chromList:
-        if achrom not in featuresByChromDict:
-            continue
+    outfile.close()
 
-        print "\n" + achrom + " ",
-        startFeature = 0
-        fullchrom = "chr" + achrom
-        hitDict = hitRDS.getReadsDict(noSense=ignoreSense, fullChrom=True, chrom=fullchrom, withID=True, doUniqs=withUniqs, doMulti=withMulti)
-        featList = featuresByChromDict[achrom]
-        if ignoreSense:
-            for (tagStart, tagReadID) in hitDict[fullchrom]:
-                index += 1
-                if index % 100000 == 0:
-                    print "read %d" % index,
-
-                stopPoint = tagStart + readlen
-                if startFeature < 0:
-                    startFeature = 0
-
-                for (start, stop, gid, sense, ftype) in featList[startFeature:]:
-                    if tagStart > stop:
-                        startFeature += 1
-                        continue
-
-                    if start > stopPoint:
-                        startFeature -= 100
-                        break
-
-                    if start <= tagStart <= stop:
-                        try:
-                            gidReadDict[gid].append(tagReadID)
-                            if tagReadID in read2GidDict:
-                                if gid not in read2GidDict[tagReadID]:
-                                    read2GidDict[tagReadID].append(gid)
-                            else:
-                                read2GidDict[tagReadID] = [gid]
-
-                            gidCount[gid] += 1
-                        except:
-                            print "gid %s not in gidReadDict" % gid
-
-                        stopPoint = stop
-        else:
-            for (tagStart, tSense, tagReadID) in hitDict[fullchrom]:
-                index += 1
-                if index % 100000 == 0:
-                    print "read %d" % index,
-
-                stopPoint = tagStart + readlen
-                if startFeature < 0:
-                    startFeature = 0
-
-                for (start, stop, gid, sense, ftype) in featList[startFeature:]:
-                    if tagStart > stop:
-                        startFeature += 1
-                        continue
-
-                    if start > stopPoint:
-                        startFeature -= 100
-                        break
-
-                    if sense == "R":
-                        sense = "-"
-                    else:
-                        sense = "+"
-
-                    if start <= tagStart <= stop and sense == tSense:
-                        try:
-                            gidReadDict[gid].append(tagReadID)
-                            if tagReadID in read2GidDict:
-                                if gid not in read2GidDict[tagReadID]:
-                                    read2GidDict[tagReadID].append(gid)
-                            else:
-                                read2GidDict[tagReadID] = [gid]
-
-                            gidCount[gid] += 1
-                        except:
-                            print "gid %s not in gidReadDict" % gid
-
-                        stopPoint = stop
-
-    for gid in gidList:
-        if "FAR" not in gid:
-            symbol = "LOC" + gid
-            geneinfo = ""
-            try:
-                geneinfo = geneinfoDict[gid]
-                if genome == "celegans":
-                    symbol = geneinfo[0][1]
-                else:
-                    symbol = geneinfo[0][0]
-            except:
-                try:
-                    symbol = geneannotDict[(genome, gid)][0]
-                except:
-                    symbol = "LOC" + gid
-        else:
-            symbol = gid
 
-        tagCount = 0.
-        for readID in gidReadDict[gid]:
+def getGeneSymbol(gid, genomeName, geneinfoDict, geneannotDict):
+    if "FAR" not in gid:
+        symbol = "LOC%s" % gid
+        geneinfo = ""
+        try:
+            geneinfo = geneinfoDict[gid]
+            if genomeName == "celegans":
+                symbol = geneinfo[0][1]
+            else:
+                symbol = geneinfo[0][0]
+        except (KeyError, IndexError):
             try:
-                tagValue = uniqueCountDict[gid]
-            except:
-                tagValue = 1
+                symbol = geneannotDict[(genomeName, gid)][0]
+            except (KeyError, IndexError):
+                symbol = "LOC%s" % gid
+    else:
+        symbol = gid
+
+    return symbol
 
-            tagDenom = 0.
-            for aGid in read2GidDict[readID]:
-                try:
-                    tagDenom += uniqueCountDict[aGid]
-                except:
-                    tagDenom += 1
 
+def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
+    tagCount = 0.
+    for readID in gidReadDict[gid]:
         try:
-            tagCount += tagValue / tagDenom
-        except ZeroDivisionError:
-            tagCount = 0
-    
-        if doVerbose:
-            print "%s %s %f" % (gid, symbol, tagCount)
+            tagValue = uniqueCountDict[gid]
+        except KeyError:
+            pass
 
-        outfile.write("%s\t%s\t%d\n" % (gid, symbol, tagCount))
+        tagDenom = 0.
+        for relatedGID in read2GidDict[readID]:
+            try:
+                tagDenom += uniqueCountDict[relatedGID]
+            except KeyError:
+                tagDenom += 1
 
-    outfile.close()
+        try:
+            tagCount += tagValue / tagDenom
+        except ZeroDivisionError:
+            pass
 
-    if doCache:
-        uncacheGeneDB(genome)
+    return tagCount
 
 
 if __name__ == "__main__":