2 # geneDownstreamBins.py
12 # originally from version 1.3 of geneDnaDownstreamCounts.py
14 from commoncode import readDataset
15 from cistematic.genomes import Genome
16 from cistematic.core.geneinfo import geneinfoDB
18 print "%prog: version 2.0"
24 usage = "usage: %prog genome rdsfile outfilename [--max regionSize]"
26 parser = optparse.OptionParser(usage=usage)
27 parser.add_option("--max", type="int", dest="standardMinDist",
28 help="maximum region in bp")
29 parser.set_defaults(standardMinDist=3000)
30 (options, args) = parser.parse_args(argv[1:])
40 geneDownstreamBins(genome, hitfile, outfilename, options.standardMinDist)
43 def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
45 standardMinThresh = standardMinDist / bins
47 hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
48 normalizationFactor = 1.0
50 hitDictSize = len(hitRDS)
51 normalizationFactor = hitDictSize / 1000000.
53 hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
56 idb = geneinfoDB(cache=True)
58 geneinfoDict = idb.getallGeneInfo(genome)
59 featuresDict = hg.getallGeneFeatures()
61 outfile = open(outfilename, "w")
63 gidList = hg.allGIDs()
70 geneinfo = geneinfoDict[gid]
71 featureList = featuresDict[gid]
72 symbol = geneinfo[0][0]
76 if len(featureList) == 0:
80 for (ftype, chrom, start, stop, fsense) in featureList:
81 if (start, stop) not in newfeatureList:
82 newfeatureList.append((start, stop))
84 if chrom not in hitDict:
88 if len(newfeatureList) < 1:
91 glen = standardMinDist
93 nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
94 if nextGene < glen * 2:
100 gstart = newfeatureList[-1][1]
102 nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
103 if nextGene < glen * 2:
109 gstart = newfeatureList[0][0] - glen
114 if glen < standardMinDist:
117 binList = [0.] * bins
118 for (tagStart, sense, weight) in hitDict[chrom]:
126 # we are relying on python's integer division quirk
127 binID = tagStart / standardMinThresh
128 binList[binID] += weight
130 rdist = glen - tagStart
131 binID = rdist / standardMinThresh
132 binList[binID] += weight
137 tagCount *= normalizationFactor
138 print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList))
139 outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen))
140 for binAmount in binList:
141 outfile.write("\t%.2f" % binAmount)
148 if __name__ == "__main__":