erange version 4.0a dev release
[erange.git] / geneLocusCounts.py
index 0e8792b740c3c83b524a03c0084027ed0bc7766c..218017e10a0d9a04d0b1cd8adde8449ecb60dcb8 100755 (executable)
@@ -17,12 +17,14 @@ try:
 except:
     pass
 
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, getLocusByChromDict
+import sys
+import optparse
+from commoncode import getMergedRegions, getLocusByChromDict, getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
+import ReadDataset
 from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict
 
-print '%s: version 3.0' % sys.argv[0]
+print "geneLocusCounts: version 3.1"
 
 def main(argv=None):
     if not argv:
@@ -30,19 +32,7 @@ def main(argv=None):
 
     usage = "usage: python %prog genome readDB outfilename [options]"
 
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--noUniqs", action="store_false", dest="doUniqs",
-                      help="do not count unique reads")
-    parser.add_option("--multi", action="store_true", dest="doUniqs",
-                      help="count multi reads")
-    parser.add_option("--splices", action="store_true", dest="doUniqs",
-                      help="count splice reads")
-    parser.add_option("--spanTSS", action="store_true", dest="spanTSS")
-    parser.add_option("--regions", dest="acceptfile")
-    parser.add_option("--noCDS", action="store_false", dest="useCDS")
-    parser.add_option("--locusLength", type="int", dest="bplength",
-                      help="number of bases to report")
-    parser.set_defaults(doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile="")
+    parser = getParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
     if len(args) < 3:
@@ -68,71 +58,107 @@ def main(argv=None):
     except ValueError:
         pass
 
-    geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs, options.doMulti, options.doSplices, options.useCDS, options.spanTSS, options.bplength, options.acceptfile)
+    geneLocusCounts(genome, hitfile, outfilename, upstream, downstream, options.doUniqs,
+                    options.doMulti, options.doSplices, options.useCDS, options.spanTSS,
+                    options.bplength, options.acceptfile)
 
 
-def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0, doUniqs=True, doMulti=False, doSplices=False, useCDS=True, spanTSS=False, bplength=0, acceptfile=""):
-    print 'returning only up to %d bp from gene locus' % bplength
-    print 'upstream = %d downstream = %d useCDS = %s spanTSS = %s' % (upstream, downstream, useCDS, spanTSS)
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--noUniqs", action="store_false", dest="doUniqs",
+                      help="do not count unique reads")
+    parser.add_option("--multi", action="store_true", dest="doUniqs",
+                      help="count multi reads")
+    parser.add_option("--splices", action="store_true", dest="doUniqs",
+                      help="count splice reads")
+    parser.add_option("--spanTSS", action="store_true", dest="spanTSS")
+    parser.add_option("--regions", dest="acceptfile")
+    parser.add_option("--noCDS", action="store_false", dest="useCDS")
+    parser.add_option("--locusLength", type="int", dest="bplength",
+                      help="number of bases to report")
+
+    configParser = getConfigParser()
+    section = "geneLocusCounts"
+    doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
+    doMulti = getConfigBoolOption(configParser, section, "doMulti", False)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    useCDS = getConfigBoolOption(configParser, section, "useCDS", True)
+    spanTSS = getConfigBoolOption(configParser, section, "spanTSS", False)
+    bplength = getConfigIntOption(configParser, section, "bplength", 0)
+    acceptfile = getConfigOption(configParser, section, "acceptfile", "")
+
+    parser.set_defaults(doUniqs=doUniqs, doMulti=doMulti, doSplices=doSplices,
+                        useCDS=useCDS, spanTSS=spanTSS, bplength=bplength,
+                        acceptfile=acceptfile)
+
+    return parser
+
+
+def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0,
+                    doUniqs=True, doMulti=False, doSplices=False, useCDS=True,
+                    spanTSS=False, bplength=0, acceptfile=""):
+
+    print "returning only up to %d bp from gene locus" % bplength
+    print "upstream = %d downstream = %d useCDS = %s spanTSS = %s" % (upstream, downstream, useCDS, spanTSS)
 
     if acceptfile:
         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
 
-    hitRDS = readDataset(hitfile, verbose = True)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True)
 
     totalCount = hitRDS.getCounts(uniqs=doUniqs, multi=doMulti, splices=doSplices)
 
     hg = Genome(genome)
-    idb = geneinfoDB(cache=True)
-
-    gidCount = {}
-    gidList = []
-    gidLen = {}
-    geneinfoDict = idb.getallGeneInfo(genome)
-    locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS = spanTSS, lengthCDS = bplength)
-
+    gidDict = {}
+    locusByChromDict = getLocusByChromDict(hg, upstream, downstream, useCDS, acceptDict, upstreamSpanTSS=spanTSS, lengthCDS=bplength)
     locusChroms = locusByChromDict.keys()
     chromList = hitRDS.getChromosomes(fullChrom=False)
     chromList.sort()
     for chrom in chromList:
-        if chrom == 'M' or chrom not in locusChroms:
+        if doNotProcessChromosome(chrom, locusChroms):
             continue
 
-        print 'chr' + chrom
-        fullchrom = 'chr' + chrom
+        fullchrom = "chr%s" % chrom
+        print fullchrom
         hitRDS.memSync(fullchrom, index=True)
         for (start, stop, gid, length) in locusByChromDict[chrom]:
-            if gid not in gidList:
-                gidList.append(gid)
-                gidCount[gid] = 0
-                gidLen[gid] = length
+            if not gidDict.has_key(gid):
+                gidDict[gid] = {"count": 0, "length": length}
 
-            gidCount[gid] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+            gidDict[gid]["count"] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
 
-    outfile = open(outfilename,'w')
+    outfile = open(outfilename, "w")
 
     totalCount /= 1000000.
 
-    outfile.write('#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n')
+    outfile.write("#gid\tsymbol\tgidCount\tgidLen\trpm\trpkm\n")
+    gidList = gidDict.keys()
     gidList.sort()
+    geneinfoDict = getGeneInfoDict(genome, cache=True)
     for gid in gidList:
-        if 'FAR' not in gid:
-            symbol = 'LOC' + gid
-            geneinfo = ''
+        if "FAR" not in gid:
+            symbol = "LOC%s" % gid
+            geneinfo = ""
             try:
                 geneinfo = geneinfoDict[gid]
                 symbol = geneinfo[0][0]
-            except:
+            except (KeyError, IndexError):
                 pass
         else:
             symbol = gid
 
-        if gid in gidCount and gid in gidLen:
-            rpm  = gidCount[gid] / totalCount
-            rpkm = 1000. * rpm / gidLen[gid]
-            outfile.write('%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n' % (gid, symbol, gidCount[gid], gidLen[gid], rpm, rpkm))
+        gidCount = gidDict[gid]["count"]
+        gidLength = gidDict[gid]["length"]
+        rpm  = gidCount / totalCount
+        rpkm = 1000. * rpm / gidLength
+        outfile.write("%s\t%s\t%d\t%d\t%2.2f\t%2.2f\n" % (gid, symbol, gidCount, gidLength, rpm, rpkm))
 
     outfile.close()
 
+
+def doNotProcessChromosome(chrom, locusChroms):
+    return chrom == "M" or chrom not in locusChroms
+
+
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file