X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=getfasta.py;h=acae47d68c896d14cf7b894e63adac2552483291;hp=0b2faf9d4177b4ea20971e4b433fb17074399fc0;hb=HEAD;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/getfasta.py b/getfasta.py index 0b2faf9..acae47d 100755 --- a/getfasta.py +++ b/getfasta.py @@ -9,11 +9,13 @@ try: except: pass -import sys, optparse -from commoncode import readDataset, getMergedRegions, findPeak +import sys +import optparse +from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption +import ReadDataset from cistematic.genomes import Genome -print "%s: version 3.4" % sys.argv[0] +print "getfasta: version 3.5" def main(argv=None): @@ -22,18 +24,7 @@ def main(argv=None): usage = "usage: python %s genome regionfile outfilename [options]" - parser = optparse.OptionParser(usage=usage) - parser.add_option("--seqradius", type="int", dest="seqsize") - parser.add_option("--minreads", type="int", dest="minHitThresh") - parser.add_option("--returnTop", type="int", dest="topRegions") - parser.add_option("--maxsize", type="int", dest="maxsize") - parser.add_option("--usepeak", action="store_true", dest="usePeaks") - parser.add_option("--dataset", dest="hitfile") - parser.add_option("--cache", action="store_true", dest="doCache") - parser.add_option("--compact", action="store_true", dest="doCompact") - parser.set_defaults(seqsize=50, minHitThresh=-1, topRegions=0, maxsize=300000000, - usePeaks=False, hitfile=None, doCache=False, doCompact=False) - + parser = getParser(usage) (options, args) = parser.parse_args(argv[1:]) if len(args) < 3: @@ -45,10 +36,37 @@ def main(argv=None): outfilename = args[2] getfasta(genome, regionfile, outfilename, options.seqsize, options.minHitThresh, - options.topRegions, options.maxsize, options.usePeaks, options.hitFile, + options.topRegions, options.maxsize, options.usePeaks, options.hitfile, options.doCache, options.doCompact) +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--seqradius", type="int", dest="seqsize") + parser.add_option("--minreads", type="int", dest="minHitThresh") + parser.add_option("--returnTop", type="int", dest="topRegions") + parser.add_option("--maxsize", type="int", dest="maxsize") + parser.add_option("--usepeak", action="store_true", dest="usePeaks") + parser.add_option("--dataset", dest="hitfile") + parser.add_option("--cache", action="store_true", dest="doCache") + parser.add_option("--compact", action="store_true", dest="doCompact") + + configParser = getConfigParser() + section = "getfasta" + seqsize = getConfigIntOption(configParser, section, "seqsize", 50) + minHitThresh = getConfigIntOption(configParser, section, "minHitThresh", -1) + topRegions = getConfigIntOption(configParser, section, "topRegions", 0) + maxsize = getConfigIntOption(configParser, section, "maxsize", 300000000) + usePeaks = getConfigBoolOption(configParser, section, "usePeaks", False) + hitfile = getConfigOption(configParser, section, "hitFile", None) + doCache = getConfigBoolOption(configParser, section, "doCache", False) + doCompact = getConfigBoolOption(configParser, section, "doCompact", False) + + parser.set_defaults(seqsize=seqsize, minHitThresh=minHitThresh, topRegions=topRegions, maxsize=maxsize, + usePeaks=usePeaks, hitfile=hitfile, doCache=doCache, doCompact=doCompact) + + return parser + def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRegions=0, maxsize=300000000, usePeaks=False, hitfile=None, doCache=False, doCompact=False): doDataset = False @@ -69,7 +87,7 @@ def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRe if usePeaks: ncregions = getRegionUsingPeaks(mergedRegions, minHitThresh, maxsize) elif doDataset: - hitRDS = readDataset(hitfile, verbose=True, cache=doCache) + hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) ncregions = getRegionUsingRDS(mergedRegions, hitRDS, minHitThresh, maxsize) else: ncregions = getDefaultRegion(mergedRegions, maxsize) @@ -100,23 +118,24 @@ def writeFastaFile(ncregions, genome, outfilename, seqsize=50): def getDefaultRegion(regionDict, maxsize): ncregions = {} - for chrom in regionDict: - ncregions[chrom] = [] + for chromosome in regionDict: + ncregions[chromosome] = [] - for achrom in regionDict: - print "%s: processing %d regions" % (achrom, len(regionDict[achrom])) - for region in regionDict[achrom]: - (rstart, rstop, rlen) = region + for chromosome in regionDict: + print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome])) + for region in regionDict[chromosome]: + start = region.start + length = region.length - if rlen > maxsize: - print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize) + if length > maxsize: + print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize) continue - resultDict = {"start": rstart, - "length": rlen, + resultDict = {"start": start, + "length": length, "topPos": [-1] } - ncregions[achrom].append(resultDict) + ncregions[chromosome].append(resultDict) return ncregions @@ -124,25 +143,26 @@ def getDefaultRegion(regionDict, maxsize): def getRegionUsingPeaks(regionDict, minHitThresh=-1, maxsize=300000000): ncregions = {} - for chrom in regionDict: - ncregions[chrom] = [] + for chromosome in regionDict: + ncregions[chromosome] = [] - for achrom in regionDict: - print "%s: processing %d regions" % (achrom, len(regionDict[achrom])) - for region in regionDict[achrom]: - (rstart, rstop, rlen, peakPos, peakHeight) = region + for chromosome in regionDict: + print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome])) + for region in regionDict[chromosome]: + start = region.start + length = region.length - if rlen > maxsize: - print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize) + if length > maxsize: + print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize) continue - topPos = peakPos - rstart - if peakHeight > minHitThresh: - resultDict = {"start": rstart, - "length": rlen, + topPos = region.peakPos - start + if region.peakHeight > minHitThresh: + resultDict = {"start": start, + "length": length, "topPos": [topPos] } - ncregions[achrom].append(resultDict) + ncregions[chromosome].append(resultDict) return ncregions @@ -152,29 +172,31 @@ def getRegionUsingRDS(regionDict, hitRDS, minHitThresh=-1, maxsize=300000000): readlen = hitRDS.getReadSize() ncregions = {} - for chrom in regionDict: - ncregions[chrom] = [] - - for achrom in regionDict: - print "%s: processing %d regions" % (achrom, len(regionDict[achrom])) - for region in regionDict[achrom]: - (rstart, rstop, rlen) = region - - if rlen > maxsize: - print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize) + for chromosome in regionDict: + ncregions[chromosome] = [] + + for chromosome in regionDict: + print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome])) + for region in regionDict[chromosome]: + start = region.start + stop = region.stop + length = region.length + + if length > maxsize: + print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, stop, length, maxsize) continue - thechrom = "chr%s" % achrom + thechrom = "chr%s" % chromosome print "." - hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=rstart, stop=rstop) + hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=start, stop=stop) print "hitDict length: %d", len(hitDict[thechrom]) - (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[thechrom], rstart, rlen, readlen) - if numHits > minHitThresh: - resultDict = {"start": rstart, - "length": rlen, - "topPos": topPos + peak = findPeak(hitDict[thechrom], start, length, readlen) + if peak.numHits > minHitThresh: + resultDict = {"start": start, + "length": length, + "topPos": peak.topPos } - ncregions[achrom].append(resultDict) + ncregions[chromosome].append(resultDict) return ncregions