7 # originally from geneLocusBins.py
16 from commoncode import getMergedRegions, computeRegionBins, getLocusByChromDict, getConfigParser, getConfigBoolOption, getConfigIntOption, getConfigOption
18 from cistematic.genomes import Genome
19 from commoncode import getGeneInfoDict
21 print "geneStallingBins: version 1.4"
28 usage = "usage: python %s genome rdsfile controlrdsfile outfilename [--upstream bp] [--downstream bp] [--regions acceptfile] [--cache] [--normalize] [--tagCount]"
30 parser = getParser(usage)
31 (options, args) = parser.parse_args(argv[1:])
42 geneStallingBins(genome, hitfile, controlfile, outfilename, options.upstreamBp,
43 options.downstreamBp, options.acceptfile, options.doCache,
44 options.normalize, options.doTagCount, options.bins)
48 parser = optparse.OptionParser(usage=usage)
49 parser.add_option("--upstream", type="int", dest="upstreamBp")
50 parser.add_option("--downstream", type="int", dest="downstreamBp")
51 parser.add_option("--regions", dest="acceptfile")
52 parser.add_option("--cache", action="store_true", dest="doCache")
53 parser.add_option("--normalize", action="store_true", dest="normalize")
54 parser.add_option("--tagCount", action="store_true", dest="doTagCount")
55 parser.add_option("--bins", type="int", dest="bins")
57 configParser = getConfigParser()
58 section = "geneStallingBins"
59 upstreamBp = getConfigIntOption(configParser, section, "upstreamBp", 300)
60 downstreamBp = getConfigIntOption(configParser, section, "downstreamBp", 0)
61 acceptfile = getConfigOption(configParser, section, "acceptfile", "")
62 doCache = getConfigBoolOption(configParser, section, "doCache", False)
63 normalize = getConfigBoolOption(configParser, section, "normalize", False)
64 doTagCount = getConfigBoolOption(configParser, section, "doTagCount", False)
65 bins = getConfigIntOption(configParser, section, "bins", 4)
67 parser.set_defaults(upstreamBp=upstreamBp, downstreamBp=downstreamBp, acceptfile=acceptfile,
68 doCache=doCache, normalize=normalize, doTagCount=doTagCount, bins=bins)
73 def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300,
74 downstreamBp=0, acceptfile="", doCache=False, normalize=False,
75 doTagCount=False, bins=4):
79 acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
84 hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
85 readlen = hitRDS.getReadSize()
86 hitNormalizationFactor = 1.0
88 hitDictSize = len(hitRDS)
89 hitNormalizationFactor = hitDictSize / 1000000.
91 controlRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
92 controlNormalizationFactor = 1.0
94 controlDictSize = len(hitRDS)
95 controlNormalizationFactor = controlDictSize / 1000000.
97 hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
98 controlDict = controlRDS.getReadsDict(doMulti=True, findallOptimize=True)
101 geneinfoDict = getGeneInfoDict(genome, cache=doCache)
102 locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
104 gidList = hg.allGIDs()
106 for chrom in acceptDict:
107 for region in acceptDict[chrom]:
108 if region.label not in gidList:
109 gidList.append(region.label)
111 (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, hitNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
112 (controlBins, gidLen) = computeRegionBins(locusByChromDict, controlDict, bins, readlen, gidList, controlNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
114 outfile = open(outfilename, "w")
121 geneinfo = geneinfoDict[gid]
122 symbol = geneinfo[0][0]
128 if gid in gidBins and gid in gidLen:
131 for binAmount in gidBins[gid]:
132 tagCount += binAmount
134 for binAmount in controlBins[gid]:
135 controlCount += abs(binAmount)
137 diffCount = tagCount + controlCount
141 outfile.write("%s\t%s\t%.1f\t%d" % (gid, symbol, diffCount, gidLen[gid]))
142 if (gidLen[gid] - 3 * upstreamBp) < upstreamBp:
143 outfile.write("\tshort\n")
146 TSSbins = (tagCount * (gidBins[gid][0] + gidBins[gid][1]) + controlCount * (controlBins[gid][0] + controlBins[gid][1])) / (upstreamBp / 50.)
147 finalbin = (tagCount * gidBins[gid][-1] + controlCount * controlBins[gid][-1]) / ((gidLen[gid] - 3. * upstreamBp) / 100.)
154 ratio = float(TSSbins)/float(finalbin)
155 for binAmount in gidBins[gid]:
157 binAmount = binAmount * tagCount / 100.
163 outfile.write("\t%.1f" % (100. * binAmount / tagCount))
165 outfile.write("\t%.1f" % binAmount)
167 outfile.write("\t%.2f\n" % ratio)
172 if __name__ == "__main__":