X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneLocusBins.py;fp=geneLocusBins.py;h=49745dc0c1361c2a8dbe4736c9c8630396fd93e4;hp=6a1efc48e270d92a8453e8fcb6907db00185ed92;hb=77dccd7c98d8cdb60caaf178b1123df71ea662c9;hpb=bc30aca13e5ec397c92e67002fbf7a103130b828 diff --git a/geneLocusBins.py b/geneLocusBins.py index 6a1efc4..49745dc 100755 --- a/geneLocusBins.py +++ b/geneLocusBins.py @@ -12,6 +12,7 @@ except: import sys import optparse +import string from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption import ReadDataset from cistematic.genomes import Genome @@ -92,68 +93,78 @@ def getParser(usage): return parser + def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, acceptfile=None): + + hg = Genome(genome) + gidList = hg.allGIDs() + gidList.sort() if acceptfile is None: acceptDict = {} else: acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True) + for chrom in acceptDict: + for region in acceptDict[chrom]: + if region.label not in gidList: + gidList.append(region.label) + + if doFlank: + locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor) + else: + locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True) - hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache) + hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) + hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) readlen = hitRDS.getReadSize() normalizationFactor = 1.0 if normalizeBins: totalCount = len(hitRDS) normalizationFactor = totalCount / 1000000. - hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) - - hg = Genome(genome) - geneinfoDict = getGeneInfoDict(genome, cache=doCache) - if doFlank: - locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor) - else: - locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True) - - gidList = hg.allGIDs() - gidList.sort() - for chrom in acceptDict: - for region in acceptDict[chrom]: - if region.label not in gidList: - gidList.append(region.label) - (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False) + geneinfoDict = getGeneInfoDict(genome, cache=doCache) + writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins) - outfile = open(outfilename,'w') +def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True): + outfile = open(outfilename, "w") for gid in gidList: - if 'FAR' not in gid: - symbol = 'LOC' + gid - geneinfo = '' + if "FAR" not in gid: + symbol = "LOC%s" % gid + geneinfo = "" try: geneinfo = geneinfoDict[gid] symbol = geneinfo[0][0] - except: + except KeyError: pass else: symbol = gid + if gid in gidBins and gid in gidLen: tagCount = 0. for binAmount in gidBins[gid]: tagCount += binAmount - outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid])) + + outputList = [gid, symbol, tagCount, gidLen[gid]] for binAmount in gidBins[gid]: if normalizeBins: - if tagCount == 0: - tagCount = 1 - outfile.write('\t%.1f' % (100. * binAmount / tagCount)) - else: - outfile.write('\t%.1f' % binAmount) - outfile.write('\n') + try: + normalizedValue = 100. * binAmount / tagCount + except ZeroDivisionError: + normalizedValue = 100. * binAmount + + binAmount = normalizedValue + + outputList.append("%.1f" % binAmount) + + outLine = string.join(outputList, "\t") + outfile.write("%s\n" % outLine) + outfile.close() if __name__ == "__main__": - main(sys.argv) \ No newline at end of file + main(sys.argv)