X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=regionCounts.py;h=ae005cb3146b686fd3934cd9a152c331900ebde4;hp=0104cc2f4bf97bb64797a1b6ce418a8b7e1b4528;hb=0d3e3112fd04c2e6b44a25cacef1d591658ad181;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/regionCounts.py b/regionCounts.py index 0104cc2..ae005cb 100755 --- a/regionCounts.py +++ b/regionCounts.py @@ -9,10 +9,13 @@ try: except: print 'psyco not running' -import sys, string, optparse -from commoncode import readDataset, getMergedRegions, findPeak, writeLog +import sys +import string +import optparse +from commoncode import getMergedRegions, findPeak, writeLog, getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption +import ReadDataset -versionString = "%prog: version 3.9" +versionString = "regionCounts: version 3.10" print versionString def main(argv=None): @@ -21,6 +24,25 @@ def main(argv=None): usage = "usage: python %prog regionfile rdsfile outfilename [options]" + parser = getParser(usage) + (options, args) = parser.parse_args(argv[1:]) + + if len(args) < 3: + print usage + sys.exit(1) + + regionfilename = args[0] + hitfile = args[1] + outfilename = args[2] + + regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField, + options.useFullchrom, options.normalize, options.padregion, + options.mergeregion, options.merging, options.doUniqs, options.doMulti, + options.doSplices, options.usePeak, options.cachePages, options.logfilename, + options.doRPKM, options.doLength, options.forceRegion) + + +def getParser(usage): parser = optparse.OptionParser(usage=usage) parser.add_option("--markRDS", action="store_true", dest="flagRDS") parser.add_option("--chromField", type="int", dest="cField") @@ -38,27 +60,33 @@ def main(argv=None): parser.add_option("--rpkm", action="store_true", dest="doRPKM") parser.add_option("--length", action="store_true", dest="doLength") parser.add_option("--force", action="store_true", dest="forceRegion") - parser.set_defaults(flagRDS=False, cField=1, useFullchrom=False, normalize=True, - padregion=0, mergeregion=0, merging=True, doUniqs=True, - doMulti=True, doSplices=False, usePeak=False, cachePages=-1, - logfilename="regionCounts.log", doRPKM=False, doLength=False, - forceRegion=False) - - (options, args) = parser.parse_args(argv[1:]) - if len(args) < 3: - print usage - sys.exit(1) - - regionfilename = args[0] - hitfile = args[1] - outfilename = args[2] - - regionCounts(regionfilename, hitfile, outfilename, options.flagRDS, options.cField, - options.useFullchrom, options.normalize, options.padregion, - options.mergeregion, options.merging, options.doUniqs, options.doMulti, - options.doSplices, options.usePeak, options.cachePages, options.logfilename, - options.doRPKM, options.doLength, options.forceRegion) + configParser = getConfigParser() + section = "regionCounts" + flagRDS = getConfigBoolOption(configParser, section, "flagRDS", False) + cField = getConfigIntOption(configParser, section, "cField", 1) + useFullchrom = getConfigBoolOption(configParser, section, "useFullchrom", False) + normalize = getConfigBoolOption(configParser, section, "normalize", True) + padregion = getConfigIntOption(configParser, section, "padregion", 0) + mergeregion = getConfigIntOption(configParser, section, "mergeregion", 0) + merging = getConfigBoolOption(configParser, section, "merging", True) + doUniqs = getConfigBoolOption(configParser, section, "doUniqs", True) + doMulti = getConfigBoolOption(configParser, section, "doMulti", True) + doSplices = getConfigBoolOption(configParser, section, "doSplices", False) + usePeak = getConfigBoolOption(configParser, section, "usePeak", False) + cachePages = getConfigIntOption(configParser, section, "cachePages", -1) + logfilename = getConfigOption(configParser, section, "logfilename", "regionCounts.log") + doRPKM = getConfigBoolOption(configParser, section, "doRPKM", False) + doLength = getConfigBoolOption(configParser, section, "doLength", False) + forceRegion = getConfigBoolOption(configParser, section, "forceRegion", False) + + parser.set_defaults(flagRDS=flagRDS, cField=cField, useFullchrom=useFullchrom, normalize=normalize, + padregion=padregion, mergeregion=mergeregion, merging=merging, doUniqs=doUniqs, + doMulti=doMulti, doSplices=doSplices, usePeak=usePeak, cachePages=cachePages, + logfilename=logfilename, doRPKM=doRPKM, doLength=doLength, + forceRegion=forceRegion) + + return parser def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1, @@ -91,7 +119,7 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1, labeltoRegionDict = {} regionCount = {} - hitRDS = readDataset(hitfile, verbose=True, cache=doCache) + hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) readlen = hitRDS.getReadSize() if cachePages > hitRDS.getDefaultCacheSize(): hitRDS.setDBcache(cachePages) @@ -112,10 +140,10 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1, for rchrom in regionDict: if forceRegion and rchrom not in chromList: print rchrom - for (label, start, stop, length) in regionDict[rchrom]: - regionCount[label] = 0 - labelList.append(label) - labeltoRegionDict[label] = (rchrom, start, stop) + for region in regionDict[rchrom]: + regionCount[region.label] = 0 + labelList.append(region.label) + labeltoRegionDict[region.label] = (rchrom, region.start, region.stop) for rchrom in chromList: regionList = [] @@ -133,25 +161,21 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1, rindex = 0 dictLen = len(readDict[fullchrom]) - for (label, start, stop, length) in regionDict[rchrom]: + for region in regionDict[rchrom]: + label = region.label + start = region.start + stop = region.stop regionCount[label] = 0 labelList.append(label) labeltoRegionDict[label] = (rchrom, start, stop) - - if useFullchrom: - fullchrom = rchrom - else: - fullchrom = "chr%s" % rchrom - - for (label, rstart, rstop, length) in regionDict[rchrom]: - regionList.append((label, fullchrom, rstart, rstop)) + regionList.append((label, fullchrom, start, stop)) if usePeak: readList = [] for localIndex in xrange(rindex, dictLen): read = readDict[fullchrom][localIndex] - if read[0] < rstart: + if read["start"] < start: rindex += 1 - elif rstart <= read[0] <= rstop: + elif start <= read["start"] <= stop: readList.append(read) else: break @@ -160,16 +184,16 @@ def regionCounts(regionfilename, hitfile, outfilename, flagRDS=False, cField=1, continue readList.sort() - (topPos, numHits, smoothArray, numPlus) = findPeak(readList, rstart, rstop - rstart, readlen, doWeight=True) + peak = findPeak(readList, start, stop - start, readlen, doWeight=True) try: - topValue = smoothArray[topPos[0]] + topValue = peak.smoothArray[peak.topPos[0]] except: - print "problem with %s %s" % (str(topPos), str(smoothArray)) + print "problem with %s %s" % (str(peak.topPos), str(peak.smoothArray)) continue regionCount[label] += topValue else: - regionCount[label] += hitRDS.getCounts(fullchrom, rstart, rstop, uniqs=doUniqs, multi=doMulti, splices=doSplices) + regionCount[label] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices) if flagRDS: hitRDS.flagReads(regionList, uniqs=doUniqs, multi=doMulti, splices=doSplices)