6 # originally from version 1.3 of geneDownstreamBins.py
14 from commoncode import readDataset, getMergedRegions, getLocusByChromDict, computeRegionBins
15 from cistematic.genomes import Genome
16 from cistematic.core.geneinfo import geneinfoDB
18 print '%s: version 2.1' % sys.argv[0]
24 usage = "usage: python %prog genome rdsfile outfilename [--bins numbins] [--flank bp] [--upstream bp] [--downstream bp] [--nocds] [--regions acceptfile] [--cache] [--raw] [--force]"
26 parser = optparse.OptionParser(usage=usage)
27 parser.add_option("--bins", type="int", dest="bins",
28 help="number of bins to use [default: 10]")
29 parser.add_option("--flank", type="int", dest="flankBP",
30 help="number of flanking BP on both upstream and downstream [default: 0]")
31 parser.add_option("--upstream", type="int", dest="upstreamBP",
32 help="number of upstream flanking BP [default: 0]")
33 parser.add_option("--downstream", type="int", dest="downstreamBP",
34 help="number of downstream flanking BP [default: 0]")
35 parser.add_option("--nocds", action="store_false", dest="doCDS",
37 parser.add_option("--raw", action="store_false", dest="normalizeBins",
38 help="do not normalize results")
39 parser.add_option("--force", action="store_false", dest="limitNeighbor",
40 help="limit neighbor region")
41 parser.add_option("--regions", dest="acceptfile")
42 parser.add_option("--cache", action="store_true", dest="doCache",
44 parser.set_defaults(normalizeBins=True, doCache=False, bins=10, flankBP=None, upstreamBP=None, downstreamBP=None, doCDS=True, limitNeighbor=True)
45 (options, args) = parser.parse_args(argv[1:])
58 if options.flankBP is not None:
59 upstreamBp = options.flankBP
60 downstreamBp = options.flankBP
63 if options.upstreamBP is not None:
64 upstreamBp = options.upstreamBP
67 if options.downstreamBP is not None:
68 downstreamBp = options.downstreamBP
71 geneLocusBins(genome, hitfile, outfilename, upstreamBp, downstreamBp, doFlank, options.normalizeBins, options.doCache, options.bins, options.doCDS, options.limitNeighbor, options.acceptfile)
74 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False, normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True, acceptfile=None):
75 if acceptfile is None:
78 acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
79 hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
80 readlen = hitRDS.getReadSize()
81 normalizationFactor = 1.0
83 totalCount = len(hitRDS)
84 normalizationFactor = totalCount / 1000000.
86 hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
89 idb = geneinfoDB(cache=doCache)
91 geneinfoDict = idb.getallGeneInfo(genome)
93 locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
95 locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
97 gidList = hg.allGIDs()
99 for chrom in acceptDict:
100 for (label, start, stop, length) in acceptDict[chrom]:
101 if label not in gidList:
102 gidList.append(label)
104 (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
106 outfile = open(outfilename,'w')
113 geneinfo = geneinfoDict[gid]
114 symbol = geneinfo[0][0]
119 if gid in gidBins and gid in gidLen:
121 for binAmount in gidBins[gid]:
122 tagCount += binAmount
123 outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
124 for binAmount in gidBins[gid]:
128 outfile.write('\t%.1f' % (100. * binAmount / tagCount))
130 outfile.write('\t%.1f' % binAmount)
135 if __name__ == "__main__":