2 # geneDownstreamBins.py
12 # originally from version 1.3 of geneDnaDownstreamCounts.py
16 from cistematic.genomes import Genome
17 from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption, ErangeError
19 print "geneDownstreamBins: version 2.1"
25 usage = "usage: %prog genome rdsfile outfilename [--max regionSize]"
27 parser = makeParser(usage)
28 (options, args) = parser.parse_args(argv[1:])
38 geneDownstreamBins(genome, hitfile, outfilename, options.standardMinDist)
41 def makeParser(usage=""):
42 parser = optparse.OptionParser(usage=usage)
43 parser.add_option("--max", type="int", dest="standardMinDist",
44 help="maximum region in bp")
46 configParser = getConfigParser()
47 section = "geneDownstreamBins"
48 standardMinDist = getConfigIntOption(configParser, section, "regionSize", 3000)
50 parser.set_defaults(standardMinDist=standardMinDist)
55 def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False):
57 hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
58 normalizationFactor = 1.0
60 hitDictSize = len(hitRDS)
61 normalizationFactor = hitDictSize / 1000000.
63 hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True)
64 geneinfoDict = getGeneInfoDict(genome, cache=True)
66 featuresDict = hg.getallGeneFeatures()
67 outfile = open(outfilename, "w")
68 gidList = hg.allGIDs()
72 featuresList = featuresDict[gid]
77 binList, symbol, geneLength, tagCount = getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist)
81 tagCount *= normalizationFactor
82 print "%s %s %.2f %d %s" % (gid, symbol, tagCount, geneLength, str(binList))
83 outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, geneLength))
84 for binAmount in binList:
85 outfile.write("\t%.2f" % binAmount)
92 def getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist):
94 symbol, featurePositionList, sense, chrom = getFeatureList(gid, geneinfoDict, featuresList, hitDict.keys())
95 geneStart, geneLength = getGeneStats(genome, gid, standardMinDist, featurePositionList, sense)
96 if geneLength < standardMinDist:
97 raise ErangeError("gene length less than minimum")
99 binList, tagCount = getBinList(hitDict[chrom], standardMinDist, geneStart, geneLength, sense)
101 raise ErangeError("tag count less than minimum")
103 return binList, symbol, geneLength, tagCount
106 def getFeatureList(gid, geneinfoDict, featureList, chromosomeList):
107 if len(featureList) == 0:
108 raise ErangeError("no features found")
110 symbol = "LOC%s" % gid
113 geneinfo = geneinfoDict[gid]
114 symbol = geneinfo[0][0]
119 for (ftype, chrom, start, stop, sense) in featureList:
120 if (start, stop) not in newfeatureList:
121 newfeatureList.append((start, stop))
123 if len(newfeatureList) < 1:
124 raise ErangeError("no features found")
126 if chrom not in chromosomeList:
127 raise ErangeError("chromosome not found in reads")
129 newfeatureList.sort()
131 return symbol, newfeatureList, sense, chrom
134 def getGeneStats(genome, gid, minDistance, featureList, sense):
135 geneLength = minDistance
137 nextGene = genome.rightGeneDistance((genome.genome, gid), geneLength * 2)
138 geneStart = featureList[-1][1]
140 nextGene = genome.leftGeneDistance((genome.genome, gid), geneLength * 2)
141 geneStart = max(featureList[0][0] - geneLength, 0)
143 if nextGene < geneLength * 2:
144 geneLength = nextGene / 2
146 geneLength = max(geneLength, 1)
148 return geneStart, geneLength
151 def getBinList(readList, standardMinDist, geneStart, geneLength, sense):
154 standardMinThresh = standardMinDist / bins
155 binList = [0.] * bins
156 for read in readList:
157 tagStart = read["start"]
158 if tagStart >= geneLength:
161 tagStart -= geneStart
162 weight = read["weight"]
166 # we are relying on python's integer division quirk
167 binID = tagStart / standardMinThresh
169 rdist = geneLength - tagStart
170 binID = rdist / standardMinThresh
172 binList[binID] += weight
174 return binList, tagCount
176 if __name__ == "__main__":