erange version 4.0a dev release
[erange.git] / getfasta.py
index 0b2faf9d4177b4ea20971e4b433fb17074399fc0..ea347e4aa44f1a21a0b7c54d11b407352ce91b6f 100755 (executable)
@@ -9,11 +9,13 @@ try:
 except:
     pass
 
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, findPeak
+import sys
+import optparse
+from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
+import ReadDataset
 from cistematic.genomes import Genome
 
-print "%s: version 3.4" % sys.argv[0]
+print "getfasta: version 3.5"
 
 
 def main(argv=None):
@@ -22,18 +24,7 @@ def main(argv=None):
 
     usage = "usage: python %s genome regionfile outfilename [options]"
 
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--seqradius", type="int", dest="seqsize")
-    parser.add_option("--minreads", type="int", dest="minHitThresh")
-    parser.add_option("--returnTop", type="int", dest="topRegions")
-    parser.add_option("--maxsize", type="int", dest="maxsize")
-    parser.add_option("--usepeak", action="store_true", dest="usePeaks")
-    parser.add_option("--dataset", dest="hitfile")
-    parser.add_option("--cache", action="store_true", dest="doCache")
-    parser.add_option("--compact", action="store_true", dest="doCompact")
-    parser.set_defaults(seqsize=50, minHitThresh=-1, topRegions=0, maxsize=300000000,
-                        usePeaks=False, hitfile=None, doCache=False, doCompact=False)
-
+    parser = getParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
     if len(args) < 3:
@@ -49,6 +40,33 @@ def main(argv=None):
              options.doCache, options.doCompact)
 
 
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--seqradius", type="int", dest="seqsize")
+    parser.add_option("--minreads", type="int", dest="minHitThresh")
+    parser.add_option("--returnTop", type="int", dest="topRegions")
+    parser.add_option("--maxsize", type="int", dest="maxsize")
+    parser.add_option("--usepeak", action="store_true", dest="usePeaks")
+    parser.add_option("--dataset", dest="hitfile")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--compact", action="store_true", dest="doCompact")
+
+    configParser = getConfigParser()
+    section = "getfasta"
+    seqsize = getConfigIntOption(configParser, section, "seqsize", 50)
+    minHitThresh = getConfigIntOption(configParser, section, "minHitThresh", -1)
+    topRegions = getConfigIntOption(configParser, section, "topRegions", 0)
+    maxsize = getConfigIntOption(configParser, section, "maxsize", 300000000)
+    usePeaks = getConfigBoolOption(configParser, section, "usePeaks", False)
+    hitfile = getConfigOption(configParser, section, "hitFile", None)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    doCompact = getConfigBoolOption(configParser, section, "doCompact", False)
+
+    parser.set_defaults(seqsize=seqsize, minHitThresh=minHitThresh, topRegions=topRegions, maxsize=maxsize,
+                        usePeaks=usePeaks, hitfile=hitfile, doCache=doCache, doCompact=doCompact)
+
+    return parser
+
 def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRegions=0,
              maxsize=300000000, usePeaks=False, hitfile=None, doCache=False, doCompact=False):
     doDataset = False
@@ -69,7 +87,7 @@ def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRe
     if usePeaks:
         ncregions = getRegionUsingPeaks(mergedRegions, minHitThresh, maxsize)
     elif doDataset:
-        hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+        hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
         ncregions = getRegionUsingRDS(mergedRegions, hitRDS, minHitThresh, maxsize)
     else:
         ncregions = getDefaultRegion(mergedRegions, maxsize)
@@ -100,23 +118,24 @@ def writeFastaFile(ncregions, genome, outfilename, seqsize=50):
 
 def getDefaultRegion(regionDict, maxsize):
     ncregions = {}
-    for chrom in regionDict:
-        ncregions[chrom] = []
+    for chromosome in regionDict:
+        ncregions[chromosome] = []
 
-    for achrom in regionDict:
-        print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
-        for region in regionDict[achrom]:
-            (rstart, rstop, rlen) = region
+    for chromosome in regionDict:
+        print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
+        for region in regionDict[chromosome]:
+            start = region.start
+            length = region.length
 
-            if rlen > maxsize:
-                print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+            if length > maxsize:
+                print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize)
                 continue
 
-            resultDict = {"start": rstart,
-                          "length": rlen,
+            resultDict = {"start": start,
+                          "length": length,
                           "topPos": [-1]
             }
-            ncregions[achrom].append(resultDict)
+            ncregions[chromosome].append(resultDict)
 
     return ncregions
 
@@ -124,25 +143,26 @@ def getDefaultRegion(regionDict, maxsize):
 def getRegionUsingPeaks(regionDict, minHitThresh=-1, maxsize=300000000):
 
     ncregions = {}
-    for chrom in regionDict:
-        ncregions[chrom] = []
+    for chromosome in regionDict:
+        ncregions[chromosome] = []
 
-    for achrom in regionDict:
-        print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
-        for region in regionDict[achrom]:
-            (rstart, rstop, rlen, peakPos, peakHeight) = region
+    for chromosome in regionDict:
+        print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
+        for region in regionDict[chromosome]:
+            start = region.start
+            length = region.length
 
-            if rlen > maxsize:
-                print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+            if length > maxsize:
+                print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize)
                 continue
 
-            topPos = peakPos - rstart
-            if peakHeight > minHitThresh:
-                resultDict = {"start": rstart,
-                              "length": rlen,
+            topPos = region.peakPos - start
+            if region.peakHeight > minHitThresh:
+                resultDict = {"start": start,
+                              "length": length,
                               "topPos": [topPos]
                 }
-                ncregions[achrom].append(resultDict)
+                ncregions[chromosome].append(resultDict)
 
     return ncregions
 
@@ -152,29 +172,31 @@ def getRegionUsingRDS(regionDict, hitRDS, minHitThresh=-1, maxsize=300000000):
     readlen = hitRDS.getReadSize()
 
     ncregions = {}
-    for chrom in regionDict:
-        ncregions[chrom] = []
-
-    for achrom in regionDict:
-        print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
-        for region in regionDict[achrom]:
-            (rstart, rstop, rlen) = region
-
-            if rlen > maxsize:
-                print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+    for chromosome in regionDict:
+        ncregions[chromosome] = []
+
+    for chromosome in regionDict:
+        print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
+        for region in regionDict[chromosome]:
+            start = region.start
+            stop = region.stop
+            length = region.length
+
+            if length > maxsize:
+                print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, stop, length, maxsize)
                 continue
 
-            thechrom = "chr%s" % achrom
+            thechrom = "chr%s" % chromosome
             print "."
-            hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=rstart, stop=rstop)
+            hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=start, stop=stop)
             print "hitDict length: %d", len(hitDict[thechrom])
-            (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[thechrom], rstart, rlen, readlen)
-            if numHits > minHitThresh:
-                resultDict = {"start": rstart,
-                              "length": rlen,
-                              "topPos": topPos
+            peak = findPeak(hitDict[thechrom], start, length, readlen)
+            if peak.numHits > minHitThresh:
+                resultDict = {"start": start,
+                              "length": length,
+                              "topPos": peak.topPos
                 }
-                ncregions[achrom].append(resultDict)
+                ncregions[chromosome].append(resultDict)
 
     return ncregions