except:
pass
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, findPeak
+import sys
+import optparse
+from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
+import ReadDataset
from cistematic.genomes import Genome
-print "%s: version 3.4" % sys.argv[0]
+print "getfasta: version 3.5"
def main(argv=None):
usage = "usage: python %s genome regionfile outfilename [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--seqradius", type="int", dest="seqsize")
- parser.add_option("--minreads", type="int", dest="minHitThresh")
- parser.add_option("--returnTop", type="int", dest="topRegions")
- parser.add_option("--maxsize", type="int", dest="maxsize")
- parser.add_option("--usepeak", action="store_true", dest="usePeaks")
- parser.add_option("--dataset", dest="hitfile")
- parser.add_option("--cache", action="store_true", dest="doCache")
- parser.add_option("--compact", action="store_true", dest="doCompact")
- parser.set_defaults(seqsize=50, minHitThresh=-1, topRegions=0, maxsize=300000000,
- usePeaks=False, hitfile=None, doCache=False, doCompact=False)
-
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 3:
options.doCache, options.doCompact)
+def getParser(usage):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--seqradius", type="int", dest="seqsize")
+ parser.add_option("--minreads", type="int", dest="minHitThresh")
+ parser.add_option("--returnTop", type="int", dest="topRegions")
+ parser.add_option("--maxsize", type="int", dest="maxsize")
+ parser.add_option("--usepeak", action="store_true", dest="usePeaks")
+ parser.add_option("--dataset", dest="hitfile")
+ parser.add_option("--cache", action="store_true", dest="doCache")
+ parser.add_option("--compact", action="store_true", dest="doCompact")
+
+ configParser = getConfigParser()
+ section = "getfasta"
+ seqsize = getConfigIntOption(configParser, section, "seqsize", 50)
+ minHitThresh = getConfigIntOption(configParser, section, "minHitThresh", -1)
+ topRegions = getConfigIntOption(configParser, section, "topRegions", 0)
+ maxsize = getConfigIntOption(configParser, section, "maxsize", 300000000)
+ usePeaks = getConfigBoolOption(configParser, section, "usePeaks", False)
+ hitfile = getConfigOption(configParser, section, "hitFile", None)
+ doCache = getConfigBoolOption(configParser, section, "doCache", False)
+ doCompact = getConfigBoolOption(configParser, section, "doCompact", False)
+
+ parser.set_defaults(seqsize=seqsize, minHitThresh=minHitThresh, topRegions=topRegions, maxsize=maxsize,
+ usePeaks=usePeaks, hitfile=hitfile, doCache=doCache, doCompact=doCompact)
+
+ return parser
+
def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRegions=0,
maxsize=300000000, usePeaks=False, hitfile=None, doCache=False, doCompact=False):
doDataset = False
if usePeaks:
ncregions = getRegionUsingPeaks(mergedRegions, minHitThresh, maxsize)
elif doDataset:
- hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+ hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
ncregions = getRegionUsingRDS(mergedRegions, hitRDS, minHitThresh, maxsize)
else:
ncregions = getDefaultRegion(mergedRegions, maxsize)
def getDefaultRegion(regionDict, maxsize):
ncregions = {}
- for chrom in regionDict:
- ncregions[chrom] = []
+ for chromosome in regionDict:
+ ncregions[chromosome] = []
- for achrom in regionDict:
- print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
- for region in regionDict[achrom]:
- (rstart, rstop, rlen) = region
+ for chromosome in regionDict:
+ print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
+ for region in regionDict[chromosome]:
+ start = region.start
+ length = region.length
- if rlen > maxsize:
- print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+ if length > maxsize:
+ print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize)
continue
- resultDict = {"start": rstart,
- "length": rlen,
+ resultDict = {"start": start,
+ "length": length,
"topPos": [-1]
}
- ncregions[achrom].append(resultDict)
+ ncregions[chromosome].append(resultDict)
return ncregions
def getRegionUsingPeaks(regionDict, minHitThresh=-1, maxsize=300000000):
ncregions = {}
- for chrom in regionDict:
- ncregions[chrom] = []
+ for chromosome in regionDict:
+ ncregions[chromosome] = []
- for achrom in regionDict:
- print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
- for region in regionDict[achrom]:
- (rstart, rstop, rlen, peakPos, peakHeight) = region
+ for chromosome in regionDict:
+ print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
+ for region in regionDict[chromosome]:
+ start = region.start
+ length = region.length
- if rlen > maxsize:
- print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+ if length > maxsize:
+ print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize)
continue
- topPos = peakPos - rstart
- if peakHeight > minHitThresh:
- resultDict = {"start": rstart,
- "length": rlen,
+ topPos = region.peakPos - start
+ if region.peakHeight > minHitThresh:
+ resultDict = {"start": start,
+ "length": length,
"topPos": [topPos]
}
- ncregions[achrom].append(resultDict)
+ ncregions[chromosome].append(resultDict)
return ncregions
readlen = hitRDS.getReadSize()
ncregions = {}
- for chrom in regionDict:
- ncregions[chrom] = []
-
- for achrom in regionDict:
- print "%s: processing %d regions" % (achrom, len(regionDict[achrom]))
- for region in regionDict[achrom]:
- (rstart, rstop, rlen) = region
-
- if rlen > maxsize:
- print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize)
+ for chromosome in regionDict:
+ ncregions[chromosome] = []
+
+ for chromosome in regionDict:
+ print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome]))
+ for region in regionDict[chromosome]:
+ start = region.start
+ stop = region.stop
+ length = region.length
+
+ if length > maxsize:
+ print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, stop, length, maxsize)
continue
- thechrom = "chr%s" % achrom
+ thechrom = "chr%s" % chromosome
print "."
- hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=rstart, stop=rstop)
+ hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=start, stop=stop)
print "hitDict length: %d", len(hitDict[thechrom])
- (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[thechrom], rstart, rlen, readlen)
- if numHits > minHitThresh:
- resultDict = {"start": rstart,
- "length": rlen,
- "topPos": topPos
+ peak = findPeak(hitDict[thechrom], start, length, readlen)
+ if peak.numHits > minHitThresh:
+ resultDict = {"start": start,
+ "length": length,
+ "topPos": peak.topPos
}
- ncregions[achrom].append(resultDict)
+ ncregions[chromosome].append(resultDict)
return ncregions