6 # originally from version 1.3 of geneDownstreamBins.py
16 from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
18 from cistematic.genomes import Genome
19 from commoncode import getGeneInfoDict
21 print "geneLocusBins: version 2.2"
27 usage = "usage: python %prog genome rdsfile outfilename [--bins numbins] [--flank bp] [--upstream bp] [--downstream bp] [--nocds] [--regions acceptfile] [--cache] [--raw] [--force]"
29 parser = getParser(usage)
30 (options, args) = parser.parse_args(argv[1:])
43 if options.flankBP is not None:
44 upstreamBp = options.flankBP
45 downstreamBp = options.flankBP
48 if options.upstreamBP is not None:
49 upstreamBp = options.upstreamBP
52 if options.downstreamBP is not None:
53 downstreamBp = options.downstreamBP
56 geneLocusBins(genome, hitfile, outfilename, upstreamBp, downstreamBp, doFlank, options.normalizeBins, options.doCache, options.bins, options.doCDS, options.limitNeighbor, options.acceptfile)
60 parser = optparse.OptionParser(usage=usage)
61 parser.add_option("--bins", type="int", dest="bins",
62 help="number of bins to use [default: 10]")
63 parser.add_option("--flank", type="int", dest="flankBP",
64 help="number of flanking BP on both upstream and downstream [default: 0]")
65 parser.add_option("--upstream", type="int", dest="upstreamBP",
66 help="number of upstream flanking BP [default: 0]")
67 parser.add_option("--downstream", type="int", dest="downstreamBP",
68 help="number of downstream flanking BP [default: 0]")
69 parser.add_option("--nocds", action="store_false", dest="doCDS",
71 parser.add_option("--raw", action="store_false", dest="normalizeBins",
72 help="do not normalize results")
73 parser.add_option("--force", action="store_false", dest="limitNeighbor",
74 help="limit neighbor region")
75 parser.add_option("--regions", dest="acceptfile")
76 parser.add_option("--cache", action="store_true", dest="doCache",
79 configParser = getConfigParser()
80 section = "geneLocusBins"
81 normalizeBins = getConfigBoolOption(configParser, section, "normalizeBins", True)
82 doCache = getConfigBoolOption(configParser, section, "doCache", False)
83 bins = getConfigIntOption(configParser, section, "bins", 10)
84 flankBP = getConfigOption(configParser, section, "flankBP", None)
85 upstreamBP = getConfigOption(configParser, section, "upstreamBP", None)
86 downstreamBP = getConfigOption(configParser, section, "downstreamBP", None)
87 doCDS = getConfigBoolOption(configParser, section, "doCDS", True)
88 limitNeighbor = getConfigBoolOption(configParser, section, "limitNeighbor", True)
90 parser.set_defaults(normalizeBins=normalizeBins, doCache=doCache, bins=bins, flankBP=flankBP,
91 upstreamBP=upstreamBP, downstreamBP=downstreamBP, doCDS=doCDS,
92 limitNeighbor=limitNeighbor)
97 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
98 normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
103 gidList = hg.allGIDs()
105 if acceptfile is None:
108 acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
109 for chrom in acceptDict:
110 for region in acceptDict[chrom]:
111 if region.label not in gidList:
112 gidList.append(region.label)
115 locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
117 locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
119 hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
120 hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
121 readlen = hitRDS.getReadSize()
122 normalizationFactor = 1.0
124 totalCount = len(hitRDS)
125 normalizationFactor = totalCount / 1000000.
127 (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
128 geneinfoDict = getGeneInfoDict(genome, cache=doCache)
129 writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins)
132 def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins=True):
133 outfile = open(outfilename, "w")
136 symbol = "LOC%s" % gid
139 geneinfo = geneinfoDict[gid]
140 symbol = geneinfo[0][0]
141 except (KeyError, IndexError):
146 if gid in gidBins and gid in gidLen:
148 for binAmount in gidBins[gid]:
149 tagCount += binAmount
151 outputList = [gid, symbol, tagCount, gidLen[gid]]
152 for binAmount in gidBins[gid]:
155 normalizedValue = 100. * binAmount / tagCount
156 except ZeroDivisionError:
157 #TODO: this is the right way to refactor the original code, but I don't think this is the right answer
158 normalizedValue = 100. * binAmount
160 binAmount = normalizedValue
162 outputList.append("%.1f" % binAmount)
164 outLine = string.join(outputList, "\t")
165 outfile.write("%s\n" % outLine)
170 if __name__ == "__main__":