5 # originally from version 1.3 of geneDownstreamBins.py
15 from cistematic.genomes import Genome
16 from commoncode import getGeneInfoDict, getConfigParser, getConfigBoolOption, getConfigIntOption
18 print "geneUpstreamBins: version 2.1"
24 usage = "usage: python %prog genome rdsfile outfilename [--max regionSize] [--raw] [--cache]"
26 parser = makeParser(usage)
27 (options, args) = parser.parse_args(argv[1:])
37 geneUpstreamBins(genome, hitfile, outfilename, options.standardMinDist, options.normalize, options.doCache)
40 def makeParser(usage=""):
41 parser = optparse.OptionParser(usage=usage)
42 parser.add_option("--raw", action="store_false", dest="normalize",
43 help="maximum region in bp")
44 parser.add_option("--max", type="int", dest="standardMinDist")
45 parser.add_option("--cache", action="store_true", dest="doCache")
47 configParser = getConfigParser()
48 section = "geneUpstreamBins"
49 standardMinDist = getConfigIntOption(configParser, section, "regionSize", 3000)
50 normalize = getConfigBoolOption(configParser, section, "normalize", True)
51 doCache = getConfigBoolOption(configParser, section, "doCache", False)
53 parser.set_defaults(standardMinDist=standardMinDist, normalize=normalize, doCache=doCache)
58 def geneUpstreamBins(genome, hitfile, outfilename, standardMinDist=3000, normalize=True, doCache=False):
60 standardMinThresh = standardMinDist / bins
62 hitRDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache)
63 normalizationFactor = 1.0
65 totalCount = len(hitRDS)
66 normalizationFactor = totalCount / 1000000.
68 hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
71 geneinfoDict = getGeneInfoDict(genome, cache=True)
72 featuresDict = hg.getallGeneFeatures()
74 outfile = open(outfilename,"w")
76 gidList = hg.allGIDs()
83 geneinfo = geneinfoDict[gid]
84 featureList = featuresDict[gid]
85 symbol = geneinfo[0][0]
90 if len(featureList) == 0:
93 for (ftype, chrom, start, stop, fsense) in featureList:
94 if (start, stop) not in newfeatureList:
95 newfeatureList.append((start, stop))
97 if chrom not in hitDict:
100 newfeatureList.sort()
101 if len(newfeatureList) < 1:
104 glen = standardMinDist
106 nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
107 if nextGene < glen * 2:
113 gstart = newfeatureList[0][0] - glen
118 nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
119 if nextGene < glen * 2:
125 gstart = newfeatureList[-1][1]
128 if glen < standardMinDist:
132 for read in hitDict[chrom]:
133 tagStart = read["start"]
134 weight = read["weight"]
142 # we are relying on python's integer division quirk
143 binID = tagStart / standardMinThresh
144 binList[binID] += weight
146 rdist = glen - tagStart
147 binID = rdist / standardMinThresh
148 binList[binID] += weight
153 print "%s %s %d %d %s" % (gid, symbol, normalizationFactor * tagCount, glen, str(binList))
154 outfile.write("%s\t%s\t%d\t%d" % (gid, symbol, normalizationFactor * tagCount, glen))
155 for binAmount in binList:
156 outfile.write("\t%d" % binAmount)
162 if __name__ == "__main__":