erange version 4.0a dev release
[erange.git] / regionCounts.py
index 0104cc2f4bf97bb64797a1b6ce418a8b7e1b4528..ae005cb3146b686fd3934cd9a152c331900ebde4 100755 (executable)
@@ -9,10 +9,13 @@ try:
 except:
     print 'psyco not running'
 
-import sys, string, optparse
-from commoncode import readDataset, getMergedRegions, findPeak, writeLog
+import sys
+import string
+import optparse
+from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
+import ReadDataset
 
-versionString = "%prog: version 3.9"
+versionString = "regionCounts: version 3.10"
 print versionString
 
 def main(argv=None):
@@ -21,6 +24,25 @@ def main(argv=None):
 
     usage = "usage: python %prog regionfile rdsfile outfilename [options]"
 
+    parser = getParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    regionfilename = args[0]
+    hitfile =  args[1]
+    outfilename = args[2]
+
+    regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField,
+                 options.useFullchrom, options.normalize, options.padregion,
+                 options.mergeregion, options.merging, options.doUniqs, options.doMulti,
+                 options.doSplices, options.usePeak, options.cachePages, options.logfilename,
+                 options.doRPKM, options.doLength, options.forceRegion)
+
+
+def getParser(usage):
     parser = optparse.OptionParser(usage=usage)
     parser.add_option("--markRDS", action="store_true", dest="flagRDS")
     parser.add_option("--chromField", type="int", dest="cField")
@@ -38,27 +60,33 @@ def main(argv=None):
     parser.add_option("--rpkm", action="store_true", dest="doRPKM")
     parser.add_option("--length", action="store_true", dest="doLength")
     parser.add_option("--force", action="store_true", dest="forceRegion")
-    parser.set_defaults(flagRDS=False, cField=1, useFullchrom=False, normalize=True,
-                        padregion=0, mergeregion=0, merging=True, doUniqs=True,
-                        doMulti=True, doSplices=False, usePeak=False, cachePages=-1,
-                        logfilename="regionCounts.log", doRPKM=False, doLength=False,
-                        forceRegion=False)
-
-    (options, args) = parser.parse_args(argv[1:])
 
-    if len(args) < 3:
-        print usage
-        sys.exit(1)
-
-    regionfilename = args[0]
-    hitfile =  args[1]
-    outfilename = args[2]
-
-    regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField,
-                 options.useFullchrom, options.normalize, options.padregion,
-                 options.mergeregion, options.merging, options.doUniqs, options.doMulti,
-                 options.doSplices, options.usePeak, options.cachePages, options.logfilename,
-                 options.doRPKM, options.doLength, options.forceRegion)
+    configParser = getConfigParser()
+    section = "regionCounts"
+    flagRDS = getConfigBoolOption(configParser, section, "flagRDS", False)
+    cField = getConfigIntOption(configParser, section, "cField", 1)
+    useFullchrom = getConfigBoolOption(configParser, section, "useFullchrom", False)
+    normalize = getConfigBoolOption(configParser, section, "normalize", True)
+    padregion = getConfigIntOption(configParser, section, "padregion", 0)
+    mergeregion = getConfigIntOption(configParser, section, "mergeregion", 0)
+    merging = getConfigBoolOption(configParser, section, "merging", True)
+    doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True)
+    doMulti = getConfigBoolOption(configParser, section, "doMulti", True)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    usePeak = getConfigBoolOption(configParser, section, "usePeak", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", -1)
+    logfilename = getConfigOption(configParser, section, "logfilename", "regionCounts.log")
+    doRPKM = getConfigBoolOption(configParser, section, "doRPKM", False)
+    doLength = getConfigBoolOption(configParser, section, "doLength", False)
+    forceRegion = getConfigBoolOption(configParser, section, "forceRegion", False)
+
+    parser.set_defaults(flagRDS=flagRDS, cField=cField, useFullchrom=useFullchrom, normalize=normalize,
+                        padregion=padregion, mergeregion=mergeregion, merging=merging, doUniqs=doUniqs,
+                        doMulti=doMulti, doSplices=doSplices, usePeak=usePeak, cachePages=cachePages,
+                        logfilename=logfilename, doRPKM=doRPKM, doLength=doLength,
+                        forceRegion=forceRegion)
+
+    return parser
 
 
 def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
@@ -91,7 +119,7 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
     labeltoRegionDict = {}
     regionCount = {}
 
-    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
     readlen = hitRDS.getReadSize()
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
@@ -112,10 +140,10 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
     for rchrom in regionDict:
         if forceRegion and rchrom not in chromList:
             print rchrom
-            for (label, start, stop, length) in regionDict[rchrom]:
-                regionCount[label] = 0
-                labelList.append(label)
-                labeltoRegionDict[label] = (rchrom, start, stop)
+            for region in regionDict[rchrom]:
+                regionCount[region.label] = 0
+                labelList.append(region.label)
+                labeltoRegionDict[region.label] = (rchrom, region.start, region.stop)
 
     for rchrom in chromList:
         regionList = []
@@ -133,25 +161,21 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
             rindex = 0
             dictLen = len(readDict[fullchrom])
 
-        for (label, start, stop, length) in regionDict[rchrom]:
+        for region in regionDict[rchrom]:
+            label = region.label
+            start = region.start
+            stop = region.stop
             regionCount[label] = 0
             labelList.append(label)
             labeltoRegionDict[label] = (rchrom, start, stop)
-
-        if useFullchrom:
-            fullchrom = rchrom
-        else:
-            fullchrom = "chr%s" % rchrom
-
-        for (label, rstart, rstop, length) in regionDict[rchrom]:
-            regionList.append((label, fullchrom, rstart, rstop))
+            regionList.append((label, fullchrom, start, stop))
             if usePeak:
                 readList = []
                 for localIndex in xrange(rindex, dictLen):
                     read = readDict[fullchrom][localIndex]
-                    if read[0] < rstart:
+                    if read["start"] < start:
                         rindex += 1
-                    elif rstart <= read[0] <= rstop:
+                    elif start <= read["start"] <= stop:
                         readList.append(read)
                     else:
                         break
@@ -160,16 +184,16 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1,
                     continue
 
                 readList.sort()
-                (topPos, numHits, smoothArray, numPlus) = findPeak(readList, rstart, rstop - rstart, readlen, doWeight=True)
+                peak = findPeak(readList, start, stop - start, readlen, doWeight=True)
                 try:
-                    topValue = smoothArray[topPos[0]]
+                    topValue = peak.smoothArray[peak.topPos[0]]
                 except:
-                    print "problem with %s %s" % (str(topPos), str(smoothArray))
+                    print "problem with %s %s" % (str(peak.topPos), str(peak.smoothArray))
                     continue
 
                 regionCount[label] += topValue
             else:
-                regionCount[label] += hitRDS.getCounts(fullchrom, rstart, rstop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
+                regionCount[label] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
 
         if flagRDS:
             hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices)