2 # geneDownstreamBins.py
12 # originally from version 1.3 of geneDnaDownstreamCounts.py
16 from cistematic.genomes import Genome
17 from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption
19 print "geneDownstreamBins: version 2.1"
25 usage = "usage: %prog genome rdsfile outfilename [--max regionSize]"
27 parser = makeParser(usage)
28 (options, args) = parser.parse_args(argv[1:])
38 geneDownstreamBins(genome, hitfile, outfilename, options.standardMinDist)
41 def makeParser(usage=""):
42 parser = optparse.OptionParser(usage=usage)
43 parser.add_option("--max", type="int", dest="standardMinDist",
44 help="maximum region in bp")
46 configParser = getConfigParser()
47 section = "geneDownstreamBins"
48 standardMinDist = getConfigIntOption(configParser, section, "regionSize", 3000)
50 parser.set_defaults(standardMinDist=standardMinDist)
55 def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
57 standardMinThresh = standardMinDist / bins
59 hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
60 normalizationFactor = 1.0
62 hitDictSize = len(hitRDS)
63 normalizationFactor = hitDictSize / 1000000.
65 hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
66 geneinfoDict = getGeneInfoDict(genome, cache=True)
68 featuresDict = hg.getallGeneFeatures()
70 outfile = open(outfilename, "w")
72 gidList = hg.allGIDs()
79 geneinfo = geneinfoDict[gid]
80 featureList = featuresDict[gid]
81 symbol = geneinfo[0][0]
85 if len(featureList) == 0:
89 for (ftype, chrom, start, stop, fsense) in featureList:
90 if (start, stop) not in newfeatureList:
91 newfeatureList.append((start, stop))
93 if chrom not in hitDict:
97 if len(newfeatureList) < 1:
100 glen = standardMinDist
102 nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
103 if nextGene < glen * 2:
109 gstart = newfeatureList[-1][1]
111 nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
112 if nextGene < glen * 2:
118 gstart = newfeatureList[0][0] - glen
123 if glen < standardMinDist:
126 binList = [0.] * bins
127 for read in hitDict[chrom]:
128 tagStart = read["start"]
129 weight = read["weight"]
137 # we are relying on python's integer division quirk
138 binID = tagStart / standardMinThresh
139 binList[binID] += weight
141 rdist = glen - tagStart
142 binID = rdist / standardMinThresh
143 binList[binID] += weight
148 tagCount *= normalizationFactor
149 print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList))
150 outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen))
151 for binAmount in binList:
152 outfile.write("\t%.2f" % binAmount)
159 if __name__ == "__main__":