first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / geneMrnaCounts.py
index b905cf0f1317e2fbdb6af64008511870dd81c107..10ccc837dca3e6b36f3c7a14d94d3ebd4938ccd4 100755 (executable)
@@ -6,11 +6,11 @@ except:
 
 import sys
 import optparse
-from commoncode import readDataset, getFeaturesByChromDict
+from commoncode import getFeaturesByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, countThisRead
 from cistematic.genomes import Genome
 from cistematic.core.geneinfo import geneinfoDB
 
-print "%s: version 5.1" % sys.argv[0]
+print "geneMrnaCounts: version 6.0"
 
 
 def main(argv=None):
@@ -19,21 +19,7 @@ def main(argv=None):
 
     usage = "usage: python %prog genome rdsfile outfilename [options]"
 
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--stranded", action="store_true", dest="trackStrand")
-    parser.add_option("--splices", action="store_true", dest="doSplices")
-    parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
-    parser.add_option("--multi", action="store_true", dest="doMulti")
-    parser.add_option("--models", dest="extendGenome")
-    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
-    parser.add_option("--searchGID", action="store_true", dest="searchGID")
-    parser.add_option("--countfeatures", action="store_true", dest="countFeats")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--markGID", action="store_true", dest="markGID")
-    parser.set_defaults(trackStrand=False, doSplices=False, doUniqs=True, doMulti=False,
-                        extendGenome="", replaceModels=False, searchGID=False,
-                        countFeats=False, cachePages=None, markGID=False)
-
+    parser = getParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
     if len(args) < 3:
@@ -46,12 +32,42 @@ def main(argv=None):
 
     geneMrnaCounts(genomeName, hitfile, outfilename, options.trackStrand, options.doSplices,
                    options.doUniqs, options.doMulti, options.extendGenome, options.replaceModels,
-                   options.searchGID, options.countFeats, options.cachePages, options.markGID)
+                   options.searchGID, options.countFeats, options.cachePages)
 
 
-def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplices=False,
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--stranded", action="store_true", dest="trackStrand")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--noUniqs", action="store_false", dest="doUniqs")
+    parser.add_option("--multi", action="store_true", dest="doMulti")
+    parser.add_option("--models", dest="extendGenome")
+    parser.add_option("--replacemodels", action="store_true", dest="replaceModels")
+    parser.add_option("--searchGID", action="store_true", dest="searchGID")
+    parser.add_option("--countfeatures", action="store_true", dest="countFeats")
+    parser.add_option("--cache", type="int", dest="cachePages")
+
+    configParser = getConfigParser()
+    section = "geneMrnaCounts"
+    trackStrand = getConfigBoolOption(configParser, section, "trackStrand", False)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
+    doMulti = getConfigBoolOption(configParser, section, "doMulti", False)
+    extendGenome = getConfigOption(configParser, section, "extendGenome", "")
+    replaceModels = getConfigBoolOption(configParser, section, "replaceModels", False)
+    searchGID = getConfigBoolOption(configParser, section, "searchGID", False)
+    countFeats = getConfigBoolOption(configParser, section, "countFeats", False)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+
+    parser.set_defaults(trackStrand=trackStrand, doSplices=doSplices, doUniqs=doUniqs, doMulti=doMulti,
+                        extendGenome=extendGenome, replaceModels=replaceModels, searchGID=searchGID,
+                        countFeats=countFeats, cachePages=cachePages)
+
+    return parser
+
+def geneMrnaCounts(genomeName, bamfile, outfilename, trackStrand=False, doSplices=False,
                    doUniqs=True, doMulti=False, extendGenome="", replaceModels=False,
-                   searchGID=False, countFeats=False, cachePages=None, markGID=False):
+                   searchGID=False, countFeats=False):
 
     if trackStrand:
         print "will track strandedness"
@@ -67,16 +83,6 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
     else:
         replaceModels = False
 
-    if cachePages is not None:
-        doCache = True
-    else:
-        cachePages = 100000
-        doCache = False
-
-    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
-
     genome = Genome(genomeName, inRAM=True)
     if extendGenome != "":
         genome.extendFeatures(extendGenome, replace=replaceModels)
@@ -92,24 +98,15 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
     for gid in gidList:
         gidCount[gid] = 0
 
-    chromList = hitRDS.getChromosomes(fullChrom=False)
-    if len(chromList) == 0 and doSplices:
-        chromList = hitRDS.getChromosomes(table="splices", fullChrom=False)
-
-    if markGID:
-        print "Flagging all reads as NM"
-        hitRDS.setFlags("NM", uniqs=doUniqs, multi=doMulti, splices=doSplices)
-
-    for chrom in chromList:
+    chromosomeList = [chrom for chrom in bamfile.references if chrom != "chrM"]
+    for chrom in chromosomeList:
         if chrom not in featuresByChromDict:
             continue
 
         if countFeats:
-            seenFeaturesByChromDict[chrom] = []
+            seenFeaturesByChromDict[chrom] = set([])
 
-        print "\nchr%s" % chrom
-        fullchrom = "chr%s" % chrom
-        regionList = []        
+        print "\nchr%s" % chrom      
         print "counting GIDs"
         for (start, stop, gid, featureSense, featureType) in featuresByChromDict[chrom]:
             try:
@@ -118,34 +115,31 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
                     if featureSense == "R":
                         checkSense = "-"
 
-                    regionList.append((gid, fullchrom, start, stop, checkSense))
-                    count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
+                    count = getCounts(chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
                 else:
-                    regionList.append((gid, fullchrom, start, stop))
-                    count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
-                    if count != 0:
-                        print count
+                    count = getCounts(chrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
 
                 gidCount[gid] += count
                 if countFeats:
-                    if (start, stop, gid, featureSense) not in seenFeaturesByChromDict[chrom]:
-                        seenFeaturesByChromDict[chrom].append((start, stop, gid, featureSense))
+                    seenFeaturesByChromDict[chrom].add((start, stop, gid, featureSense))
             except:
                 print "problem with %s - skipping" % gid
 
-        if markGID:
-            print "marking GIDs"
-            hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=doStranded)
-            print "finished marking"
-
     print " "
     if countFeats:
         numFeatures = countFeatures(seenFeaturesByChromDict)
         print "saw %d features" % numFeatures
 
-    writeOutputFile(outfilename, genome, gidList, gidCount, searchGID)
-    if markGID and doCache:
-        hitRDS.saveCacheDB(hitfile)
+    writeOutputFile(outfilename, genome, gidCount, searchGID)
+
+
+def getCounts(bamfile, chrom, start, stop, uniqs=True, multi=False, splices=False, sense=''):
+    count = 0.0
+    for alignedread in bamfile.fetch(chrom, start, stop):
+        if countThisRead(alignedread, uniqs, multi, splices, sense):
+            count += 1.0/alignedread.opt('NH')
+
+    return count
 
 
 def countFeatures(seenFeaturesByChromDict):
@@ -159,18 +153,15 @@ def countFeatures(seenFeaturesByChromDict):
     return count
 
 
-def writeOutputFile(outfilename, genome, gidList, gidCount, searchGID):
+def writeOutputFile(outfilename, genome, gidCount, searchGID):
     geneAnnotDict = genome.allAnnotInfo()
     genomeName = genome.genome
     outfile = open(outfilename, "w")
     idb = geneinfoDB(cache=True)
     geneInfoDict = idb.getallGeneInfo(genomeName)
-    for gid in gidList:
+    for gid in gidCount:
         symbol = getGeneSymbol(gid, searchGID, geneInfoDict, idb, genomeName, geneAnnotDict)
-        if gid in gidCount:
-            outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
-        else:
-            outfile.write("%s\t%s\t0\n" % (gid, symbol))
+        outfile.write("%s\t%s\t%d\n" % (gid, symbol, gidCount[gid]))
 
     outfile.close()