X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneStallingBins.py;h=4b869f2ff4b64f4d301eac701cf1e8f612ee61fd;hp=f08abe6292ee9ba8cc7ea26039073f8ae2afb91d;hb=0d3e3112fd04c2e6b44a25cacef1d591658ad181;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/geneStallingBins.py b/geneStallingBins.py index f08abe6..4b869f2 100755 --- a/geneStallingBins.py +++ b/geneStallingBins.py @@ -11,12 +11,14 @@ try: except: pass -import sys, optparse -from commoncode import readDataset, getMergedRegions, computeRegionBins, getLocusByChromDict +import sys +import optparse +from commoncode import getMergedRegions, computeRegionBins, getLocusByChromDict, getConfigParser, getConfigBoolOption, getConfigIntOption, getConfigOption +import ReadDataset from cistematic.genomes import Genome -from cistematic.core.geneinfo import geneinfoDB +from commoncode import getGeneInfoDict -print "%prog: version 1.3" +print "geneStallingBins: version 1.4" def main(argv=None): @@ -25,16 +27,7 @@ def main(argv=None): usage = "usage: python %s genome rdsfile controlrdsfile outfilename [--upstream bp] [--downstream bp] [--regions acceptfile] [--cache] [--normalize] [--tagCount]" - parser = optparse.OptionParser(usage=usage) - parser.add_option("--upstream", type="int", dest="upstreamBp") - parser.add_option("--downstream", type="int", dest="downstreamBp") - parser.add_option("--regions", dest="acceptfile") - parser.add_option("--cache", action="store_true", dest="doCache") - parser.add_option("--normalize", action="store_true", dest="normalize") - parser.add_option("--tagCount", action="store_true", dest="doTagCount") - parser.add_option("--bins", type="int", dest="bins") - parser.set_defaults(upstreamBp=300, downstreamBp=0, acceptfile="", - doCache=False, normalize=False, doTagCount=False, bins=4) + parser = getParser(usage) (options, args) = parser.parse_args(argv[1:]) if len(args) < 4: @@ -51,6 +44,32 @@ def main(argv=None): options.normalize, options.doTagCount, options.bins) +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--upstream", type="int", dest="upstreamBp") + parser.add_option("--downstream", type="int", dest="downstreamBp") + parser.add_option("--regions", dest="acceptfile") + parser.add_option("--cache", action="store_true", dest="doCache") + parser.add_option("--normalize", action="store_true", dest="normalize") + parser.add_option("--tagCount", action="store_true", dest="doTagCount") + parser.add_option("--bins", type="int", dest="bins") + + configParser = getConfigParser() + section = "geneStallingBins" + upstreamBp = getConfigIntOption(configParser, section, "upstreamBp", 300) + downstreamBp = getConfigIntOption(configParser, section, "downstreamBp", 0) + acceptfile = getConfigOption(configParser, section, "acceptfile", "") + doCache = getConfigBoolOption(configParser, section, "doCache", False) + normalize = getConfigBoolOption(configParser, section, "normalize", False) + doTagCount = getConfigBoolOption(configParser, section, "doTagCount", False) + bins = getConfigIntOption(configParser, section, "bins", 4) + + parser.set_defaults(upstreamBp=upstreamBp, downstreamBp=downstreamBp, acceptfile=acceptfile, + doCache=doCache, normalize=normalize, doTagCount=doTagCount, bins=bins) + + return parser + + def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300, downstreamBp=0, acceptfile="", doCache=False, normalize=False, doTagCount=False, bins=4): @@ -62,14 +81,14 @@ def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300, doCDS = True limitNeighbor = False - hitRDS = readDataset(hitfile, verbose=True, cache=doCache) + hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) readlen = hitRDS.getReadSize() hitNormalizationFactor = 1.0 if normalize: hitDictSize = len(hitRDS) hitNormalizationFactor = hitDictSize / 1000000. - controlRDS = readDataset(hitfile, verbose=True, cache=doCache) + controlRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) controlNormalizationFactor = 1.0 if normalize: controlDictSize = len(hitRDS) @@ -79,17 +98,15 @@ def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300, controlDict = controlRDS.getReadsDict(doMulti=True, findallOptimize=True) hg = Genome(genome) - idb = geneinfoDB(cache=doCache) - - geneinfoDict = idb.getallGeneInfo(genome) + geneinfoDict = getGeneInfoDict(genome, cache=doCache) locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor) gidList = hg.allGIDs() gidList.sort() for chrom in acceptDict: - for (label, start, stop, length) in acceptDict[chrom]: - if label not in gidList: - gidList.append(label) + for region in acceptDict[chrom]: + if region.label not in gidList: + gidList.append(region.label) (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, hitNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp) (controlBins, gidLen) = computeRegionBins(locusByChromDict, controlDict, bins, readlen, gidList, controlNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)