X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneLocusBins.py;h=14cb2b5ee2b5617fba45f1a69ef94386784c0b11;hp=e6b403f2b33ef9beb6b0a8b2d1d8acb778cc1547;hb=HEAD;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/geneLocusBins.py b/geneLocusBins.py index e6b403f..14cb2b5 100755 --- a/geneLocusBins.py +++ b/geneLocusBins.py @@ -10,12 +10,15 @@ try: except: pass -import sys, optparse -from commoncode import readDataset, getMergedRegions, getLocusByChromDict, computeRegionBins +import sys +import optparse +import string +from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption +import ReadDataset from cistematic.genomes import Genome -from cistematic.core.geneinfo import geneinfoDB +from commoncode import getGeneInfoDict -print '%s: version 2.1' % sys.argv[0] +print "geneLocusBins: version 2.2" def main(argv=None): if not argv: @@ -23,25 +26,7 @@ def main(argv=None): usage = "usage: python %prog genome rdsfile outfilename [--bins numbins] [--flank bp] [--upstream bp] [--downstream bp] [--nocds] [--regions acceptfile] [--cache] [--raw] [--force]" - parser = optparse.OptionParser(usage=usage) - parser.add_option("--bins", type="int", dest="bins", - help="number of bins to use [default: 10]") - parser.add_option("--flank", type="int", dest="flankBP", - help="number of flanking BP on both upstream and downstream [default: 0]") - parser.add_option("--upstream", type="int", dest="upstreamBP", - help="number of upstream flanking BP [default: 0]") - parser.add_option("--downstream", type="int", dest="downstreamBP", - help="number of downstream flanking BP [default: 0]") - parser.add_option("--nocds", action="store_false", dest="doCDS", - help="do not CDS") - parser.add_option("--raw", action="store_false", dest="normalizeBins", - help="do not normalize results") - parser.add_option("--force", action="store_false", dest="limitNeighbor", - help="limit neighbor region") - parser.add_option("--regions", dest="acceptfile") - parser.add_option("--cache", action="store_true", dest="doCache", - help="use cache") - parser.set_defaults(normalizeBins=True, doCache=False, bins=10, flankBP=None, upstreamBP=None, downstreamBP=None, doCDS=True, limitNeighbor=True) + parser = getParser(usage) (options, args) = parser.parse_args(argv[1:]) if len(args) < 3: @@ -71,64 +56,114 @@ def main(argv=None): geneLocusBins(genome, hitfile, outfilename, upstreamBp, downstreamBp, doFlank, options.normalizeBins, options.doCache, options.bins, options.doCDS, options.limitNeighbor, options.acceptfile) -def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, acceptfile=None): +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--bins", type="int", dest="bins", + help="number of bins to use [default: 10]") + parser.add_option("--flank", type="int", dest="flankBP", + help="number of flanking BP on both upstream and downstream [default: 0]") + parser.add_option("--upstream", type="int", dest="upstreamBP", + help="number of upstream flanking BP [default: 0]") + parser.add_option("--downstream", type="int", dest="downstreamBP", + help="number of downstream flanking BP [default: 0]") + parser.add_option("--nocds", action="store_false", dest="doCDS", + help="do not CDS") + parser.add_option("--raw", action="store_false", dest="normalizeBins", + help="do not normalize results") + parser.add_option("--force", action="store_false", dest="limitNeighbor", + help="limit neighbor region") + parser.add_option("--regions", dest="acceptfile") + parser.add_option("--cache", action="store_true", dest="doCache", + help="use cache") + + configParser = getConfigParser() + section = "geneLocusBins" + normalizeBins = getConfigBoolOption(configParser, section, "normalizeBins", True) + doCache = getConfigBoolOption(configParser, section, "doCache", False) + bins = getConfigIntOption(configParser, section, "bins", 10) + flankBP = getConfigOption(configParser, section, "flankBP", None) + upstreamBP = getConfigOption(configParser, section, "upstreamBP", None) + downstreamBP = getConfigOption(configParser, section, "downstreamBP", None) + doCDS = getConfigBoolOption(configParser, section, "doCDS", True) + limitNeighbor = getConfigBoolOption(configParser, section, "limitNeighbor", True) + + parser.set_defaults(normalizeBins=normalizeBins, doCache=doCache, bins=bins, flankBP=flankBP, + upstreamBP=upstreamBP, downstreamBP=downstreamBP, doCDS=doCDS, + limitNeighbor=limitNeighbor) + + return parser + + +def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, + normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, + acceptfile=None): + + + hg = Genome(genome) + gidList = hg.allGIDs() + gidList.sort() if acceptfile is None: acceptDict = {} else: acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True) - hitRDS = readDataset(hitfile, verbose = True, cache=doCache) - readlen = hitRDS.getReadSize() - normalizationFactor = 1.0 - if normalizeBins: - totalCount = len(hitRDS) - normalizationFactor = totalCount / 1000000. - - hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) - - hg = Genome(genome) - idb = geneinfoDB(cache=doCache) + for chrom in acceptDict: + for region in acceptDict[chrom]: + if region.label not in gidList: + gidList.append(region.label) - geneinfoDict = idb.getallGeneInfo(genome) if doFlank: - locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor) + locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor) else: locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True) - gidList = hg.allGIDs() - gidList.sort() - for chrom in acceptDict: - for (label, start, stop, length) in acceptDict[chrom]: - if label not in gidList: - gidList.append(label) + hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) + hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) + readlen = hitRDS.getReadSize() + normalizationFactor = 1.0 + if normalizeBins: + totalCount = len(hitRDS) + normalizationFactor = totalCount / 1000000. (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False) + geneinfoDict = getGeneInfoDict(genome, cache=doCache) + writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins) - outfile = open(outfilename,'w') +def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True): + outfile = open(outfilename, "w") for gid in gidList: - if 'FAR' not in gid: - symbol = 'LOC' + gid - geneinfo = '' + if "FAR" not in gid: + symbol = "LOC%s" % gid + geneinfo = "" try: geneinfo = geneinfoDict[gid] symbol = geneinfo[0][0] - except: + except (KeyError, IndexError): pass else: symbol = gid + if gid in gidBins and gid in gidLen: tagCount = 0. for binAmount in gidBins[gid]: tagCount += binAmount - outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid])) + + outputList = [gid, symbol, tagCount, gidLen[gid]] for binAmount in gidBins[gid]: if normalizeBins: - if tagCount == 0: - tagCount = 1 - outfile.write('\t%.1f' % (100. * binAmount / tagCount)) - else: - outfile.write('\t%.1f' % binAmount) - outfile.write('\n') + try: + normalizedValue = 100. * binAmount / tagCount + except ZeroDivisionError: + #TODO: this is the right way to refactor the original code, but I don't think this is the right answer + normalizedValue = 100. * binAmount + + binAmount = normalizedValue + + outputList.append("%.1f" % binAmount) + + outLine = string.join(outputList, "\t") + outfile.write("%s\n" % outLine) + outfile.close()