first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / geneLocusBins.py
index e6b403f2b33ef9beb6b0a8b2d1d8acb778cc1547..14cb2b5ee2b5617fba45f1a69ef94386784c0b11 100755 (executable)
@@ -10,12 +10,15 @@ try:
 except:
     pass
 
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, getLocusByChromDict, computeRegionBins
+import sys
+import optparse
+import string
+from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
+import ReadDataset
 from cistematic.genomes import Genome
-from cistematic.core.geneinfo import geneinfoDB
+from commoncode import getGeneInfoDict
 
-print '%s: version 2.1' % sys.argv[0]
+print "geneLocusBins: version 2.2"
 
 def main(argv=None):
     if not argv:
@@ -23,25 +26,7 @@ def main(argv=None):
 
     usage = "usage: python %prog genome rdsfile outfilename [--bins numbins] [--flank bp] [--upstream bp] [--downstream bp] [--nocds] [--regions acceptfile] [--cache] [--raw] [--force]"
 
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--bins", type="int", dest="bins",
-                      help="number of bins to use [default: 10]")
-    parser.add_option("--flank", type="int", dest="flankBP",
-                      help="number of flanking BP on both upstream and downstream [default: 0]")
-    parser.add_option("--upstream", type="int", dest="upstreamBP",
-                      help="number of upstream flanking BP [default: 0]")
-    parser.add_option("--downstream", type="int", dest="downstreamBP",
-                      help="number of downstream flanking BP [default: 0]")
-    parser.add_option("--nocds", action="store_false", dest="doCDS",
-                      help="do not CDS")
-    parser.add_option("--raw", action="store_false", dest="normalizeBins",
-                      help="do not normalize results")
-    parser.add_option("--force", action="store_false", dest="limitNeighbor",
-                      help="limit neighbor region")
-    parser.add_option("--regions", dest="acceptfile")
-    parser.add_option("--cache", action="store_true", dest="doCache",
-                      help="use cache")
-    parser.set_defaults(normalizeBins=True, doCache=False, bins=10, flankBP=None, upstreamBP=None, downstreamBP=None, doCDS=True, limitNeighbor=True)
+    parser = getParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
     if len(args) < 3:
@@ -71,64 +56,114 @@ def main(argv=None):
     geneLocusBins(genome, hitfile, outfilename, upstreamBp, downstreamBp, doFlank, options.normalizeBins, options.doCache, options.bins, options.doCDS, options.limitNeighbor, options.acceptfile)
 
 
-def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, acceptfile=None):
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--bins", type="int", dest="bins",
+                      help="number of bins to use [default: 10]")
+    parser.add_option("--flank", type="int", dest="flankBP",
+                      help="number of flanking BP on both upstream and downstream [default: 0]")
+    parser.add_option("--upstream", type="int", dest="upstreamBP",
+                      help="number of upstream flanking BP [default: 0]")
+    parser.add_option("--downstream", type="int", dest="downstreamBP",
+                      help="number of downstream flanking BP [default: 0]")
+    parser.add_option("--nocds", action="store_false", dest="doCDS",
+                      help="do not CDS")
+    parser.add_option("--raw", action="store_false", dest="normalizeBins",
+                      help="do not normalize results")
+    parser.add_option("--force", action="store_false", dest="limitNeighbor",
+                      help="limit neighbor region")
+    parser.add_option("--regions", dest="acceptfile")
+    parser.add_option("--cache", action="store_true", dest="doCache",
+                      help="use cache")
+
+    configParser = getConfigParser()
+    section = "geneLocusBins"
+    normalizeBins = getConfigBoolOption(configParser, section, "normalizeBins", True)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    bins = getConfigIntOption(configParser, section, "bins", 10)
+    flankBP = getConfigOption(configParser, section, "flankBP", None)
+    upstreamBP = getConfigOption(configParser, section, "upstreamBP", None)
+    downstreamBP = getConfigOption(configParser, section, "downstreamBP", None)
+    doCDS = getConfigBoolOption(configParser, section, "doCDS", True)
+    limitNeighbor = getConfigBoolOption(configParser, section, "limitNeighbor", True)
+
+    parser.set_defaults(normalizeBins=normalizeBins, doCache=doCache, bins=bins, flankBP=flankBP,
+                        upstreamBP=upstreamBP, downstreamBP=downstreamBP, doCDS=doCDS,
+                        limitNeighbor=limitNeighbor)
+
+    return parser
+
+
+def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
+                  normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
+                  acceptfile=None):
+
+
+    hg = Genome(genome)
+    gidList = hg.allGIDs()
+    gidList.sort()
     if acceptfile is None:
         acceptDict = {}
     else:
         acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
-    hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
-    readlen = hitRDS.getReadSize()
-    normalizationFactor = 1.0
-    if normalizeBins:
-        totalCount = len(hitRDS)
-        normalizationFactor = totalCount / 1000000.
-
-    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
-
-    hg = Genome(genome)
-    idb = geneinfoDB(cache=doCache)
+        for chrom in acceptDict:
+            for region in acceptDict[chrom]:
+                if region.label not in gidList:
+                    gidList.append(region.label)
 
-    geneinfoDict = idb.getallGeneInfo(genome)
     if doFlank:
-        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
+        locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
     else:
         locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
 
-    gidList = hg.allGIDs()
-    gidList.sort()
-    for chrom in acceptDict:
-        for (label, start, stop, length) in acceptDict[chrom]:
-            if label not in gidList:
-                gidList.append(label)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
+    hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
+    readlen = hitRDS.getReadSize()
+    normalizationFactor = 1.0
+    if normalizeBins:
+        totalCount = len(hitRDS)
+        normalizationFactor = totalCount / 1000000.
 
     (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
+    geneinfoDict = getGeneInfoDict(genome, cache=doCache)
+    writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins)
 
-    outfile = open(outfilename,'w')
 
+def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True):
+    outfile = open(outfilename, "w")
     for gid in gidList:
-        if 'FAR' not in gid:
-            symbol = 'LOC' + gid
-            geneinfo = ''
+        if "FAR" not in gid:
+            symbol = "LOC%s" % gid
+            geneinfo = ""
             try:
                 geneinfo = geneinfoDict[gid]
                 symbol = geneinfo[0][0]
-            except:
+            except (KeyError, IndexError):
                 pass
         else:
             symbol = gid
+
         if gid in gidBins and gid in gidLen:
             tagCount = 0.
             for binAmount in gidBins[gid]:
                 tagCount += binAmount
-        outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
+
+        outputList = [gid, symbol, tagCount, gidLen[gid]]
         for binAmount in gidBins[gid]:
             if normalizeBins:
-                if tagCount == 0:
-                    tagCount = 1
-                outfile.write('\t%.1f' % (100. * binAmount / tagCount))
-            else:
-                outfile.write('\t%.1f' % binAmount)
-        outfile.write('\n')
+                try:
+                    normalizedValue = 100. * binAmount / tagCount
+                except ZeroDivisionError:
+                    #TODO: this is the right way to refactor the original code, but I don't think this is the right answer
+                    normalizedValue = 100. * binAmount
+
+                binAmount = normalizedValue
+
+            outputList.append("%.1f" % binAmount)
+
+        outLine = string.join(outputList, "\t")
+        outfile.write("%s\n" % outLine)
+
     outfile.close()