6 # originally from version 1.3 of geneDownstreamBins.py
15 from commoncode import getMergedRegions, getLocusByChromDict, computeRegionBins, getConfigParser, getConfigIntOption, getConfigOption, getConfigBoolOption
17 from cistematic.genomes import Genome
18 from commoncode import getGeneInfoDict
20 print "geneLocusBins: version 2.2"
26 usage = "usage: python %prog genome rdsfile outfilename [--bins numbins] [--flank bp] [--upstream bp] [--downstream bp] [--nocds] [--regions acceptfile] [--cache] [--raw] [--force]"
28 parser = getParser(usage)
29 (options, args) = parser.parse_args(argv[1:])
42 if options.flankBP is not None:
43 upstreamBp = options.flankBP
44 downstreamBp = options.flankBP
47 if options.upstreamBP is not None:
48 upstreamBp = options.upstreamBP
51 if options.downstreamBP is not None:
52 downstreamBp = options.downstreamBP
55 geneLocusBins(genome, hitfile, outfilename, upstreamBp, downstreamBp, doFlank, options.normalizeBins, options.doCache, options.bins, options.doCDS, options.limitNeighbor, options.acceptfile)
59 parser = optparse.OptionParser(usage=usage)
60 parser.add_option("--bins", type="int", dest="bins",
61 help="number of bins to use [default: 10]")
62 parser.add_option("--flank", type="int", dest="flankBP",
63 help="number of flanking BP on both upstream and downstream [default: 0]")
64 parser.add_option("--upstream", type="int", dest="upstreamBP",
65 help="number of upstream flanking BP [default: 0]")
66 parser.add_option("--downstream", type="int", dest="downstreamBP",
67 help="number of downstream flanking BP [default: 0]")
68 parser.add_option("--nocds", action="store_false", dest="doCDS",
70 parser.add_option("--raw", action="store_false", dest="normalizeBins",
71 help="do not normalize results")
72 parser.add_option("--force", action="store_false", dest="limitNeighbor",
73 help="limit neighbor region")
74 parser.add_option("--regions", dest="acceptfile")
75 parser.add_option("--cache", action="store_true", dest="doCache",
78 configParser = getConfigParser()
79 section = "geneLocusBins"
80 normalizeBins = getConfigBoolOption(configParser, section, "normalizeBins", True)
81 doCache = getConfigBoolOption(configParser, section, "doCache", False)
82 bins = getConfigIntOption(configParser, section, "bins", 10)
83 flankBP = getConfigOption(configParser, section, "flankBP", None)
84 upstreamBP = getConfigOption(configParser, section, "upstreamBP", None)
85 downstreamBP = getConfigOption(configParser, section, "downstreamBP", None)
86 doCDS = getConfigBoolOption(configParser, section, "doCDS", True)
87 limitNeighbor = getConfigBoolOption(configParser, section, "limitNeighbor", True)
89 parser.set_defaults(normalizeBins=normalizeBins, doCache=doCache, bins=bins, flankBP=flankBP,
90 upstreamBP=upstreamBP, downstreamBP=downstreamBP, doCDS=doCDS,
91 limitNeighbor=limitNeighbor)
95 def geneLocusBins(genome, hitfile, outfilename, upstreamBp=0, downstreamBp=0, doFlank=False,
96 normalizeBins=True, doCache=False, bins=10, doCDS=True, limitNeighbor=True,
99 if acceptfile is None:
102 acceptDict = getMergedRegions(acceptfile, maxDist=0, keepLabel=True, verbose=True)
104 hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
105 readlen = hitRDS.getReadSize()
106 normalizationFactor = 1.0
108 totalCount = len(hitRDS)
109 normalizationFactor = totalCount / 1000000.
111 hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
114 geneinfoDict = getGeneInfoDict(genome, cache=doCache)
116 locusByChromDict = getLocusByChromDict(hg, upstream=upstreamBp, downstream=downstreamBp, useCDS=doCDS, additionalRegionsDict=acceptDict, keepSense=True, adjustToNeighbor = limitNeighbor)
118 locusByChromDict = getLocusByChromDict(hg, additionalRegionsDict=acceptDict, keepSense=True)
120 gidList = hg.allGIDs()
122 for chrom in acceptDict:
123 for region in acceptDict[chrom]:
124 if region.label not in gidList:
125 gidList.append(region.label)
127 (gidBins, gidLen) = computeRegionBins(locusByChromDict, hitDict, bins, readlen, gidList, normalizationFactor, defaultRegionFormat=False)
129 outfile = open(outfilename,'w')
136 geneinfo = geneinfoDict[gid]
137 symbol = geneinfo[0][0]
142 if gid in gidBins and gid in gidLen:
144 for binAmount in gidBins[gid]:
145 tagCount += binAmount
146 outfile.write('%s\t%s\t%.1f\t%d' % (gid, symbol, tagCount, gidLen[gid]))
147 for binAmount in gidBins[gid]:
151 outfile.write('\t%.1f' % (100. * binAmount / tagCount))
153 outfile.write('\t%.1f' % binAmount)
158 if __name__ == "__main__":