X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneLocusBins.py;fp=geneLocusBins.py;h=6a1efc48e270d92a8453e8fcb6907db00185ed92;hp=e6b403f2b33ef9beb6b0a8b2d1d8acb778cc1547;hb=0d3e3112fd04c2e6b44a25cacef1d591658ad181;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/geneLocusBins.py b/geneLocusBins.py index e6b403f..6a1efc4 100755 --- a/geneLocusBins.py +++ b/geneLocusBins.py @@ -10,12 +10,14 @@ try: except: pass -import sys, optparse -from commoncode import readDataset, getMergedRegions, getLocusByChromDict, computeRegionBins +import sys +import optparse +from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption +import ReadDataset from cistematic.genomes import Genome -from cistematic.core.geneinfo import geneinfoDB +from commoncode import getGeneInfoDict -print '%s: version 2.1' % sys.argv[0] +print "geneLocusBins: version 2.2" def main(argv=None): if not argv: @@ -23,25 +25,7 @@ def main(argv=None): usage = "usage: python %prog genome rdsfile outfilename [--bins numbins] [--flank bp] [--upstream bp] [--downstream bp] [--nocds] [--regions acceptfile] [--cache] [--raw] [--force]" - parser = optparse.OptionParser(usage=usage) - parser.add_option("--bins", type="int", dest="bins", - help="number of bins to use [default: 10]") - parser.add_option("--flank", type="int", dest="flankBP", - help="number of flanking BP on both upstream and downstream [default: 0]") - parser.add_option("--upstream", type="int", dest="upstreamBP", - help="number of upstream flanking BP [default: 0]") - parser.add_option("--downstream", type="int", dest="downstreamBP", - help="number of downstream flanking BP [default: 0]") - parser.add_option("--nocds", action="store_false", dest="doCDS", - help="do not CDS") - parser.add_option("--raw", action="store_false", dest="normalizeBins", - help="do not normalize results") - parser.add_option("--force", action="store_false", dest="limitNeighbor", - help="limit neighbor region") - parser.add_option("--regions", dest="acceptfile") - parser.add_option("--cache", action="store_true", dest="doCache", - help="use cache") - parser.set_defaults(normalizeBins=True, doCache=False, bins=10, flankBP=None, upstreamBP=None, downstreamBP=None, doCDS=True, limitNeighbor=True) + parser = getParser(usage) (options, args) = parser.parse_args(argv[1:]) if len(args) < 3: @@ -71,12 +55,53 @@ def main(argv=None): geneLocusBins(genome, hitfile, outfilename, upstreamBp, downstreamBp, doFlank, options.normalizeBins, options.doCache, options.bins, options.doCDS, options.limitNeighbor, options.acceptfile) -def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, acceptfile=None): +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--bins", type="int", dest="bins", + help="number of bins to use [default: 10]") + parser.add_option("--flank", type="int", dest="flankBP", + help="number of flanking BP on both upstream and downstream [default: 0]") + parser.add_option("--upstream", type="int", dest="upstreamBP", + help="number of upstream flanking BP [default: 0]") + parser.add_option("--downstream", type="int", dest="downstreamBP", + help="number of downstream flanking BP [default: 0]") + parser.add_option("--nocds", action="store_false", dest="doCDS", + help="do not CDS") + parser.add_option("--raw", action="store_false", dest="normalizeBins", + help="do not normalize results") + parser.add_option("--force", action="store_false", dest="limitNeighbor", + help="limit neighbor region") + parser.add_option("--regions", dest="acceptfile") + parser.add_option("--cache", action="store_true", dest="doCache", + help="use cache") + + configParser = getConfigParser() + section = "geneLocusBins" + normalizeBins = getConfigBoolOption(configParser, section, "normalizeBins", True) + doCache = getConfigBoolOption(configParser, section, "doCache", False) + bins = getConfigIntOption(configParser, section, "bins", 10) + flankBP = getConfigOption(configParser, section, "flankBP", None) + upstreamBP = getConfigOption(configParser, section, "upstreamBP", None) + downstreamBP = getConfigOption(configParser, section, "downstreamBP", None) + doCDS = getConfigBoolOption(configParser, section, "doCDS", True) + limitNeighbor = getConfigBoolOption(configParser, section, "limitNeighbor", True) + + parser.set_defaults(normalizeBins=normalizeBins, doCache=doCache, bins=bins, flankBP=flankBP, + upstreamBP=upstreamBP, downstreamBP=downstreamBP, doCDS=doCDS, + limitNeighbor=limitNeighbor) + + return parser + +def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, + normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, + acceptfile=None): + if acceptfile is None: acceptDict = {} else: acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True) - hitRDS = readDataset(hitfile, verbose = True, cache=doCache) + + hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache) readlen = hitRDS.getReadSize() normalizationFactor = 1.0 if normalizeBins: @@ -86,9 +111,7 @@ def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, do hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) hg = Genome(genome) - idb = geneinfoDB(cache=doCache) - - geneinfoDict = idb.getallGeneInfo(genome) + geneinfoDict = getGeneInfoDict(genome, cache=doCache) if doFlank: locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor) else: @@ -97,9 +120,9 @@ def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, do gidList = hg.allGIDs() gidList.sort() for chrom in acceptDict: - for (label, start, stop, length) in acceptDict[chrom]: - if label not in gidList: - gidList.append(label) + for region in acceptDict[chrom]: + if region.label not in gidList: + gidList.append(region.label) (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)