X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneLocusBins.py;h=14cb2b5ee2b5617fba45f1a69ef94386784c0b11;hp=6a1efc48e270d92a8453e8fcb6907db00185ed92;hb=HEAD;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/geneLocusBins.py b/geneLocusBins.py index 6a1efc4..14cb2b5 100755 --- a/geneLocusBins.py +++ b/geneLocusBins.py @@ -12,6 +12,7 @@ except: import sys import optparse +import string from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption import ReadDataset from cistematic.genomes import Genome @@ -92,66 +93,77 @@ def getParser(usage): return parser + def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, acceptfile=None): + + hg = Genome(genome) + gidList = hg.allGIDs() + gidList.sort() if acceptfile is None: acceptDict = {} else: acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True) + for chrom in acceptDict: + for region in acceptDict[chrom]: + if region.label not in gidList: + gidList.append(region.label) + + if doFlank: + locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor) + else: + locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True) - hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache) + hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) + hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) readlen = hitRDS.getReadSize() normalizationFactor = 1.0 if normalizeBins: totalCount = len(hitRDS) normalizationFactor = totalCount / 1000000. - hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) - - hg = Genome(genome) - geneinfoDict = getGeneInfoDict(genome, cache=doCache) - if doFlank: - locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor) - else: - locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True) - - gidList = hg.allGIDs() - gidList.sort() - for chrom in acceptDict: - for region in acceptDict[chrom]: - if region.label not in gidList: - gidList.append(region.label) - (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False) + geneinfoDict = getGeneInfoDict(genome, cache=doCache) + writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins) - outfile = open(outfilename,'w') +def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True): + outfile = open(outfilename, "w") for gid in gidList: - if 'FAR' not in gid: - symbol = 'LOC' + gid - geneinfo = '' + if "FAR" not in gid: + symbol = "LOC%s" % gid + geneinfo = "" try: geneinfo = geneinfoDict[gid] symbol = geneinfo[0][0] - except: + except (KeyError, IndexError): pass else: symbol = gid + if gid in gidBins and gid in gidLen: tagCount = 0. for binAmount in gidBins[gid]: tagCount += binAmount - outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid])) + + outputList = [gid, symbol, tagCount, gidLen[gid]] for binAmount in gidBins[gid]: if normalizeBins: - if tagCount == 0: - tagCount = 1 - outfile.write('\t%.1f' % (100. * binAmount / tagCount)) - else: - outfile.write('\t%.1f' % binAmount) - outfile.write('\n') + try: + normalizedValue = 100. * binAmount / tagCount + except ZeroDivisionError: + #TODO: this is the right way to refactor the original code, but I don't think this is the right answer + normalizedValue = 100. * binAmount + + binAmount = normalizedValue + + outputList.append("%.1f" % binAmount) + + outLine = string.join(outputList, "\t") + outfile.write("%s\n" % outLine) + outfile.close()