5 # originally from version 1.3 of geneDownstreamBins.py
13 from commoncode import readDataset
14 from cistematic.genomes import Genome
15 from cistematic.core.geneinfo import geneinfoDB
17 print "%prog: version 2.0"
23 usage = "usage: python %prog genome rdsfile outfilename [--max regionSize] [--raw] [--cache]"
25 parser = optparse.OptionParser(usage=usage)
26 parser.add_option("--raw", action="store_false", dest="normalize",
27 help="maximum region in bp")
28 parser.add_option("--max", type="int", dest="standardMinDist")
29 parser.add_option("--cache", action="store_true", dest="doCache")
30 parser.set_defaults(standardMinDist=3000, normalize=True, doCache=False)
31 (options, args) = parser.parse_args(argv[1:])
41 geneUpstreamBins(genome, hitfile, outfilename, options.standardMinDist, options.normalize, options.doCache)
44 def geneUpstreamBins(genome, hitfile, outfilename, standardMinDist=3000, normalize=True, doCache=False):
46 standardMinThresh = standardMinDist / bins
48 hitRDS = readDataset(hitfile, verbose = True, cache=doCache)
49 normalizationFactor = 1.0
51 totalCount = len(hitRDS)
52 normalizationFactor = totalCount / 1000000.
54 hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
57 idb = geneinfoDB(cache=True)
59 geneinfoDict = idb.getallGeneInfo(genome)
60 featuresDict = hg.getallGeneFeatures()
62 outfile = open(outfilename,"w")
64 gidList = hg.allGIDs()
71 geneinfo = geneinfoDict[gid]
72 featureList = featuresDict[gid]
73 symbol = geneinfo[0][0]
78 if len(featureList) == 0:
81 for (ftype, chrom, start, stop, fsense) in featureList:
82 if (start, stop) not in newfeatureList:
83 newfeatureList.append((start, stop))
85 if chrom not in hitDict:
89 if len(newfeatureList) < 1:
92 glen = standardMinDist
94 nextGene = hg.leftGeneDistance((genome, gid), glen * 2)
95 if nextGene < glen * 2:
101 gstart = newfeatureList[0][0] - glen
106 nextGene = hg.rightGeneDistance((genome, gid), glen * 2)
107 if nextGene < glen * 2:
113 gstart = newfeatureList[-1][1]
116 if glen < standardMinDist:
120 for (tagStart, sense, weight) in hitDict[chrom]:
128 # we are relying on python's integer division quirk
129 binID = tagStart / standardMinThresh
130 binList[binID] += weight
132 rdist = glen - tagStart
133 binID = rdist / standardMinThresh
134 binList[binID] += weight
139 print "%s %s %d %d %s" % (gid, symbol, normalizationFactor * tagCount, glen, str(binList))
140 outfile.write("%s\t%s\t%d\t%d" % (gid, symbol, normalizationFactor * tagCount, glen))
141 for binAmount in binList:
142 outfile.write("\t%d" % binAmount)
148 if __name__ == "__main__":