7 # originally from geneLocusBins.py
15 from commoncode import readDataset, getMergedRegions, computeRegionBins, getLocusByChromDict
16 from cistematic.genomes import Genome
17 from cistematic.core.geneinfo import geneinfoDB
19 print "%prog: version 1.3"
26 usage = "usage: python %s genome rdsfile controlrdsfile outfilename [--upstream bp] [--downstream bp] [--regions acceptfile] [--cache] [--normalize] [--tagCount]"
28 parser = optparse.OptionParser(usage=usage)
29 parser.add_option("--upstream", type="int", dest="upstreamBp")
30 parser.add_option("--downstream", type="int", dest="downstreamBp")
31 parser.add_option("--regions", dest="acceptfile")
32 parser.add_option("--cache", action="store_true", dest="doCache")
33 parser.add_option("--normalize", action="store_true", dest="normalize")
34 parser.add_option("--tagCount", action="store_true", dest="doTagCount")
35 parser.add_option("--bins", type="int", dest="bins")
36 parser.set_defaults(upstreamBp=300, downstreamBp=0, acceptfile="",
37 doCache=False, normalize=False, doTagCount=False, bins=4)
38 (options, args) = parser.parse_args(argv[1:])
49 geneStallingBins(genome, hitfile, controlfile, outfilename, options.upstreamBp,
50 options.downstreamBp, options.acceptfile, options.doCache,
51 options.normalize, options.doTagCount, options.bins)
54 def geneStallingBins(genome, hitfile, controlfile, outfilename, upstreamBp=300,
55 downstreamBp=0, acceptfile="", doCache=False, normalize=False,
56 doTagCount=False, bins=4):
60 acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
65 hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
66 readlen = hitRDS.getReadSize()
67 hitNormalizationFactor = 1.0
69 hitDictSize = len(hitRDS)
70 hitNormalizationFactor = hitDictSize / 1000000.
72 controlRDS = readDataset(hitfile, verbose=True, cache=doCache)
73 controlNormalizationFactor = 1.0
75 controlDictSize = len(hitRDS)
76 controlNormalizationFactor = controlDictSize / 1000000.
78 hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
79 controlDict = controlRDS.getReadsDict(doMulti=True, findallOptimize=True)
82 idb = geneinfoDB(cache=doCache)
84 geneinfoDict = idb.getallGeneInfo(genome)
85 locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor=limitNeighbor)
87 gidList = hg.allGIDs()
89 for chrom in acceptDict:
90 for (label, start, stop, length) in acceptDict[chrom]:
91 if label not in gidList:
94 (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, hitNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
95 (controlBins, gidLen) = computeRegionBins(locusByChromDict, controlDict, bins, readlen, gidList, controlNormalizationFactor, defaultRegionFormat=False, binLength=upstreamBp)
97 outfile = open(outfilename, "w")
104 geneinfo = geneinfoDict[gid]
105 symbol = geneinfo[0][0]
111 if gid in gidBins and gid in gidLen:
114 for binAmount in gidBins[gid]:
115 tagCount += binAmount
117 for binAmount in controlBins[gid]:
118 controlCount += abs(binAmount)
120 diffCount = tagCount + controlCount
124 outfile.write("%s\t%s\t%.1f\t%d" % (gid, symbol, diffCount, gidLen[gid]))
125 if (gidLen[gid] - 3 * upstreamBp) < upstreamBp:
126 outfile.write("\tshort\n")
129 TSSbins = (tagCount * (gidBins[gid][0] + gidBins[gid][1]) + controlCount * (controlBins[gid][0] + controlBins[gid][1])) / (upstreamBp / 50.)
130 finalbin = (tagCount * gidBins[gid][-1] + controlCount * controlBins[gid][-1]) / ((gidLen[gid] - 3. * upstreamBp) / 100.)
137 ratio = float(TSSbins)/float(finalbin)
138 for binAmount in gidBins[gid]:
140 binAmount = binAmount * tagCount / 100.
146 outfile.write("\t%.1f" % (100. * binAmount / tagCount))
148 outfile.write("\t%.1f" % binAmount)
150 outfile.write("\t%.2f\n" % ratio)
155 if __name__ == "__main__":