X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=getfasta.py;h=ea347e4aa44f1a21a0b7c54d11b407352ce91b6f;hp=0b2faf9d4177b4ea20971e4b433fb17074399fc0;hb=0d3e3112fd04c2e6b44a25cacef1d591658ad181;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/getfasta.py b/getfasta.py index 0b2faf9..ea347e4 100755 --- a/getfasta.py +++ b/getfasta.py @@ -9,11 +9,13 @@ try: except: pass -import sys, optparse -from commoncode import readDataset, getMergedRegions, findPeak +import sys +import optparse +from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption +import ReadDataset from cistematic.genomes import Genome -print "%s: version 3.4" % sys.argv[0] +print "getfasta: version 3.5" def main(argv=None): @@ -22,18 +24,7 @@ def main(argv=None): usage = "usage: python %s genome regionfile outfilename [options]" - parser = optparse.OptionParser(usage=usage) - parser.add_option("--seqradius", type="int", dest="seqsize") - parser.add_option("--minreads", type="int", dest="minHitThresh") - parser.add_option("--returnTop", type="int", dest="topRegions") - parser.add_option("--maxsize", type="int", dest="maxsize") - parser.add_option("--usepeak", action="store_true", dest="usePeaks") - parser.add_option("--dataset", dest="hitfile") - parser.add_option("--cache", action="store_true", dest="doCache") - parser.add_option("--compact", action="store_true", dest="doCompact") - parser.set_defaults(seqsize=50, minHitThresh=-1, topRegions=0, maxsize=300000000, - usePeaks=False, hitfile=None, doCache=False, doCompact=False) - + parser = getParser(usage) (options, args) = parser.parse_args(argv[1:]) if len(args) < 3: @@ -49,6 +40,33 @@ def main(argv=None): options.doCache, options.doCompact) +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--seqradius", type="int", dest="seqsize") + parser.add_option("--minreads", type="int", dest="minHitThresh") + parser.add_option("--returnTop", type="int", dest="topRegions") + parser.add_option("--maxsize", type="int", dest="maxsize") + parser.add_option("--usepeak", action="store_true", dest="usePeaks") + parser.add_option("--dataset", dest="hitfile") + parser.add_option("--cache", action="store_true", dest="doCache") + parser.add_option("--compact", action="store_true", dest="doCompact") + + configParser = getConfigParser() + section = "getfasta" + seqsize = getConfigIntOption(configParser, section, "seqsize", 50) + minHitThresh = getConfigIntOption(configParser, section, "minHitThresh", -1) + topRegions = getConfigIntOption(configParser, section, "topRegions", 0) + maxsize = getConfigIntOption(configParser, section, "maxsize", 300000000) + usePeaks = getConfigBoolOption(configParser, section, "usePeaks", False) + hitfile = getConfigOption(configParser, section, "hitFile", None) + doCache = getConfigBoolOption(configParser, section, "doCache", False) + doCompact = getConfigBoolOption(configParser, section, "doCompact", False) + + parser.set_defaults(seqsize=seqsize, minHitThresh=minHitThresh, topRegions=topRegions, maxsize=maxsize, + usePeaks=usePeaks, hitfile=hitfile, doCache=doCache, doCompact=doCompact) + + return parser + def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRegions=0, maxsize=300000000, usePeaks=False, hitfile=None, doCache=False, doCompact=False): doDataset = False @@ -69,7 +87,7 @@ def getfasta(genome, regionfile, outfilename, seqsize=50, minHitThresh=-1, topRe if usePeaks: ncregions = getRegionUsingPeaks(mergedRegions, minHitThresh, maxsize) elif doDataset: - hitRDS = readDataset(hitfile, verbose=True, cache=doCache) + hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) ncregions = getRegionUsingRDS(mergedRegions, hitRDS, minHitThresh, maxsize) else: ncregions = getDefaultRegion(mergedRegions, maxsize) @@ -100,23 +118,24 @@ def writeFastaFile(ncregions, genome, outfilename, seqsize=50): def getDefaultRegion(regionDict, maxsize): ncregions = {} - for chrom in regionDict: - ncregions[chrom] = [] + for chromosome in regionDict: + ncregions[chromosome] = [] - for achrom in regionDict: - print "%s: processing %d regions" % (achrom, len(regionDict[achrom])) - for region in regionDict[achrom]: - (rstart, rstop, rlen) = region + for chromosome in regionDict: + print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome])) + for region in regionDict[chromosome]: + start = region.start + length = region.length - if rlen > maxsize: - print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize) + if length > maxsize: + print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize) continue - resultDict = {"start": rstart, - "length": rlen, + resultDict = {"start": start, + "length": length, "topPos": [-1] } - ncregions[achrom].append(resultDict) + ncregions[chromosome].append(resultDict) return ncregions @@ -124,25 +143,26 @@ def getDefaultRegion(regionDict, maxsize): def getRegionUsingPeaks(regionDict, minHitThresh=-1, maxsize=300000000): ncregions = {} - for chrom in regionDict: - ncregions[chrom] = [] + for chromosome in regionDict: + ncregions[chromosome] = [] - for achrom in regionDict: - print "%s: processing %d regions" % (achrom, len(regionDict[achrom])) - for region in regionDict[achrom]: - (rstart, rstop, rlen, peakPos, peakHeight) = region + for chromosome in regionDict: + print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome])) + for region in regionDict[chromosome]: + start = region.start + length = region.length - if rlen > maxsize: - print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize) + if length > maxsize: + print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, region.stop, length, maxsize) continue - topPos = peakPos - rstart - if peakHeight > minHitThresh: - resultDict = {"start": rstart, - "length": rlen, + topPos = region.peakPos - start + if region.peakHeight > minHitThresh: + resultDict = {"start": start, + "length": length, "topPos": [topPos] } - ncregions[achrom].append(resultDict) + ncregions[chromosome].append(resultDict) return ncregions @@ -152,29 +172,31 @@ def getRegionUsingRDS(regionDict, hitRDS, minHitThresh=-1, maxsize=300000000): readlen = hitRDS.getReadSize() ncregions = {} - for chrom in regionDict: - ncregions[chrom] = [] - - for achrom in regionDict: - print "%s: processing %d regions" % (achrom, len(regionDict[achrom])) - for region in regionDict[achrom]: - (rstart, rstop, rlen) = region - - if rlen > maxsize: - print "%s:%d-%d length %d > %d max region size - skipping" % (achrom, rstart, rstop, rlen, maxsize) + for chromosome in regionDict: + ncregions[chromosome] = [] + + for chromosome in regionDict: + print "%s: processing %d regions" % (chromosome, len(regionDict[chromosome])) + for region in regionDict[chromosome]: + start = region.start + stop = region.stop + length = region.length + + if length > maxsize: + print "%s:%d-%d length %d > %d max region size - skipping" % (chromosome, start, stop, length, maxsize) continue - thechrom = "chr%s" % achrom + thechrom = "chr%s" % chromosome print "." - hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=rstart, stop=rstop) + hitDict = hitRDS.getReadsDict(chrom=thechrom, withWeight=True, doMulti=True, findallOptimize=True, start=start, stop=stop) print "hitDict length: %d", len(hitDict[thechrom]) - (topPos, numHits, smoothArray, numPlus) = findPeak(hitDict[thechrom], rstart, rlen, readlen) - if numHits > minHitThresh: - resultDict = {"start": rstart, - "length": rlen, - "topPos": topPos + peak = findPeak(hitDict[thechrom], start, length, readlen) + if peak.numHits > minHitThresh: + resultDict = {"start": start, + "length": length, + "topPos": peak.topPos } - ncregions[achrom].append(resultDict) + ncregions[chromosome].append(resultDict) return ncregions