X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneDownstreamBins.py;h=91db4a1a44870602383a072d7a4585568026f165;hp=058ad8236a4f7df6d08a6ca293d0a27ab090c9bd;hb=HEAD;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/geneDownstreamBins.py b/geneDownstreamBins.py index 058ad82..91db4a1 100755 --- a/geneDownstreamBins.py +++ b/geneDownstreamBins.py @@ -10,12 +10,13 @@ except: pass # originally from version 1.3 of geneDnaDownstreamCounts.py -import sys, optparse -from commoncode import readDataset +import sys +import optparse +import ReadDataset from cistematic.genomes import Genome -from cistematic.core.geneinfo import geneinfoDB +from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption, ErangeError -print "%prog: version 2.0" +print "geneDownstreamBins: version 2.1" def main(argv=None): if not argv: @@ -23,10 +24,7 @@ def main(argv=None): usage = "usage: %prog genome rdsfile outfilename [--max regionSize]" - parser = optparse.OptionParser(usage=usage) - parser.add_option("--max", type="int", dest="standardMinDist", - help="maximum region in bp") - parser.set_defaults(standardMinDist=3000) + parser = makeParser(usage) (options, args) = parser.parse_args(argv[1:]) if len(args) < 3: @@ -40,110 +38,140 @@ def main(argv=None): geneDownstreamBins(genome, hitfile, outfilename, options.standardMinDist) +def makeParser(usage=""): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--max", type="int", dest="standardMinDist", + help="maximum region in bp") + + configParser = getConfigParser() + section = "geneDownstreamBins" + standardMinDist = getConfigIntOption(configParser, section, "regionSize", 3000) + + parser.set_defaults(standardMinDist=standardMinDist) + + return parser + + def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False): - bins = 10 - standardMinThresh = standardMinDist / bins - hitRDS = readDataset(hitfile, verbose=True, cache=doCache) + hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) normalizationFactor = 1.0 if normalize: hitDictSize = len(hitRDS) normalizationFactor = hitDictSize / 1000000. hitDict = hitRDS.getReadsDict(doMulti=True, findallOptimize=True) - + geneinfoDict = getGeneInfoDict(genome, cache=True) hg = Genome(genome) - idb = geneinfoDB(cache=True) - - geneinfoDict = idb.getallGeneInfo(genome) featuresDict = hg.getallGeneFeatures() - outfile = open(outfilename, "w") - gidList = hg.allGIDs() gidList.sort() for gid in gidList: - symbol = "LOC" + gid - geneinfo = "" - featureList = [] try: - geneinfo = geneinfoDict[gid] - featureList = featuresDict[gid] - symbol = geneinfo[0][0] - except: + featuresList = featuresDict[gid] + except KeyError: print gid - if len(featureList) == 0: + try: + binList, symbol, geneLength, tagCount = getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist) + except ErangeError: continue - newfeatureList = [] - for (ftype, chrom, start, stop, fsense) in featureList: - if (start, stop) not in newfeatureList: - newfeatureList.append((start, stop)) + tagCount *= normalizationFactor + print "%s %s %.2f %d %s" % (gid, symbol, tagCount, geneLength, str(binList)) + outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, geneLength)) + for binAmount in binList: + outfile.write("\t%.2f" % binAmount) - if chrom not in hitDict: - continue + outfile.write("\n") - newfeatureList.sort() - if len(newfeatureList) < 1: - continue + outfile.close() - glen = standardMinDist - if fsense == "F": - nextGene = hg.rightGeneDistance((genome, gid), glen * 2) - if nextGene < glen * 2: - glen = nextGene / 2 - if glen < 1: - glen = 1 +def getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist): - gstart = newfeatureList[-1][1] - else: - nextGene = hg.leftGeneDistance((genome, gid), glen * 2) - if nextGene < glen * 2: - glen = nextGene / 2 + symbol, featurePositionList, sense, chrom = getFeatureList(gid, geneinfoDict, featuresList, hitDict.keys()) + geneStart, geneLength = getGeneStats(genome, gid, standardMinDist, featurePositionList, sense) + if geneLength < standardMinDist: + raise ErangeError("gene length less than minimum") - if glen < 1: - glen = 1 + binList, tagCount = getBinList(hitDict[chrom], standardMinDist, geneStart, geneLength, sense) + if tagCount < 2: + raise ErangeError("tag count less than minimum") - gstart = newfeatureList[0][0] - glen - if gstart < 0: - gstart = 0 + return binList, symbol, geneLength, tagCount - tagCount = 0 - if glen < standardMinDist: - continue - binList = [0.] * bins - for (tagStart, sense, weight) in hitDict[chrom]: - tagStart -= gstart - if tagStart >= glen: - break - - if tagStart > 0: - tagCount += weight - if fsense == "F": - # we are relying on python's integer division quirk - binID = tagStart / standardMinThresh - binList[binID] += weight - else: - rdist = glen - tagStart - binID = rdist / standardMinThresh - binList[binID] += weight - - if tagCount < 2: - continue +def getFeatureList(gid, geneinfoDict, featureList, chromosomeList): + if len(featureList) == 0: + raise ErangeError("no features found") - tagCount *= normalizationFactor - print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList)) - outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen)) - for binAmount in binList: - outfile.write("\t%.2f" % binAmount) + symbol = "LOC%s" % gid + geneinfo = "" + try: + geneinfo = geneinfoDict[gid] + symbol = geneinfo[0][0] + except KeyError: + print gid - outfile.write("\n") + newfeatureList = [] + for (ftype, chrom, start, stop, sense) in featureList: + if (start, stop) not in newfeatureList: + newfeatureList.append((start, stop)) - outfile.close() + if len(newfeatureList) < 1: + raise ErangeError("no features found") + + if chrom not in chromosomeList: + raise ErangeError("chromosome not found in reads") + newfeatureList.sort() + + return symbol, newfeatureList, sense, chrom + + +def getGeneStats(genome, gid, minDistance, featureList, sense): + geneLength = minDistance + if sense == "F": + nextGene = genome.rightGeneDistance((genome.genome, gid), geneLength * 2) + geneStart = featureList[-1][1] + else: + nextGene = genome.leftGeneDistance((genome.genome, gid), geneLength * 2) + geneStart = max(featureList[0][0] - geneLength, 0) + + if nextGene < geneLength * 2: + geneLength = nextGene / 2 + + geneLength = max(geneLength, 1) + + return geneStart, geneLength + + +def getBinList(readList, standardMinDist, geneStart, geneLength, sense): + tagCount = 0 + bins = 10 + standardMinThresh = standardMinDist / bins + binList = [0.] * bins + for read in readList: + tagStart = read["start"] + if tagStart >= geneLength: + break + + tagStart -= geneStart + weight = read["weight"] + if tagStart > 0: + tagCount += weight + if sense == "F": + # we are relying on python's integer division quirk + binID = tagStart / standardMinThresh + else: + rdist = geneLength - tagStart + binID = rdist / standardMinThresh + + binList[binID] += weight + + return binList, tagCount if __name__ == "__main__": main(sys.argv) \ No newline at end of file