X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=geneDownstreamBins.py;fp=geneDownstreamBins.py;h=91db4a1a44870602383a072d7a4585568026f165;hp=4ce97e4fc1380517447153e59fba80e5531e3029;hb=77dccd7c98d8cdb60caaf178b1123df71ea662c9;hpb=bc30aca13e5ec397c92e67002fbf7a103130b828 diff --git a/geneDownstreamBins.py b/geneDownstreamBins.py index 4ce97e4..91db4a1 100755 --- a/geneDownstreamBins.py +++ b/geneDownstreamBins.py @@ -14,7 +14,7 @@ import sys import optparse import ReadDataset from cistematic.genomes import Genome -from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption +from commoncode import getGeneInfoDict, getConfigParser, getConfigIntOption, ErangeError print "geneDownstreamBins: version 2.1" @@ -53,8 +53,6 @@ def makeParser(usage=""): def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCache=False, normalize=False): - bins = 10 - standardMinThresh = standardMinDist / bins hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) normalizationFactor = 1.0 @@ -66,95 +64,114 @@ def geneDownstreamBins(genome, hitfile, outfilename, standardMinDist=3000, doCac geneinfoDict = getGeneInfoDict(genome, cache=True) hg = Genome(genome) featuresDict = hg.getallGeneFeatures() - outfile = open(outfilename, "w") - gidList = hg.allGIDs() gidList.sort() for gid in gidList: - symbol = "LOC" + gid - geneinfo = "" - featureList = [] try: - geneinfo = geneinfoDict[gid] - featureList = featuresDict[gid] - symbol = geneinfo[0][0] - except: + featuresList = featuresDict[gid] + except KeyError: print gid - if len(featureList) == 0: + try: + binList, symbol, geneLength, tagCount = getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist) + except ErangeError: continue - newfeatureList = [] - for (ftype, chrom, start, stop, fsense) in featureList: - if (start, stop) not in newfeatureList: - newfeatureList.append((start, stop)) + tagCount *= normalizationFactor + print "%s %s %.2f %d %s" % (gid, symbol, tagCount, geneLength, str(binList)) + outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, geneLength)) + for binAmount in binList: + outfile.write("\t%.2f" % binAmount) - if chrom not in hitDict: - continue + outfile.write("\n") - newfeatureList.sort() - if len(newfeatureList) < 1: - continue + outfile.close() - glen = standardMinDist - if fsense == "F": - nextGene = hg.rightGeneDistance((genome, gid), glen * 2) - if nextGene < glen * 2: - glen = nextGene / 2 - if glen < 1: - glen = 1 +def getDownstreamBins(genome, gid, hitDict, geneinfoDict, featuresList, standardMinDist): - gstart = newfeatureList[-1][1] - else: - nextGene = hg.leftGeneDistance((genome, gid), glen * 2) - if nextGene < glen * 2: - glen = nextGene / 2 + symbol, featurePositionList, sense, chrom = getFeatureList(gid, geneinfoDict, featuresList, hitDict.keys()) + geneStart, geneLength = getGeneStats(genome, gid, standardMinDist, featurePositionList, sense) + if geneLength < standardMinDist: + raise ErangeError("gene length less than minimum") - if glen < 1: - glen = 1 + binList, tagCount = getBinList(hitDict[chrom], standardMinDist, geneStart, geneLength, sense) + if tagCount < 2: + raise ErangeError("tag count less than minimum") - gstart = newfeatureList[0][0] - glen - if gstart < 0: - gstart = 0 + return binList, symbol, geneLength, tagCount - tagCount = 0 - if glen < standardMinDist: - continue - binList = [0.] * bins - for read in hitDict[chrom]: - tagStart = read["start"] - weight = read["weight"] - tagStart -= gstart - if tagStart >= glen: - break - - if tagStart > 0: - tagCount += weight - if fsense == "F": - # we are relying on python's integer division quirk - binID = tagStart / standardMinThresh - binList[binID] += weight - else: - rdist = glen - tagStart - binID = rdist / standardMinThresh - binList[binID] += weight - - if tagCount < 2: - continue +def getFeatureList(gid, geneinfoDict, featureList, chromosomeList): + if len(featureList) == 0: + raise ErangeError("no features found") - tagCount *= normalizationFactor - print "%s %s %.2f %d %s" % (gid, symbol, tagCount, glen, str(binList)) - outfile.write("%s\t%s\t%.2f\t%d" % (gid, symbol, tagCount, glen)) - for binAmount in binList: - outfile.write("\t%.2f" % binAmount) + symbol = "LOC%s" % gid + geneinfo = "" + try: + geneinfo = geneinfoDict[gid] + symbol = geneinfo[0][0] + except KeyError: + print gid - outfile.write("\n") + newfeatureList = [] + for (ftype, chrom, start, stop, sense) in featureList: + if (start, stop) not in newfeatureList: + newfeatureList.append((start, stop)) + + if len(newfeatureList) < 1: + raise ErangeError("no features found") + + if chrom not in chromosomeList: + raise ErangeError("chromosome not found in reads") + + newfeatureList.sort() + + return symbol, newfeatureList, sense, chrom - outfile.close() +def getGeneStats(genome, gid, minDistance, featureList, sense): + geneLength = minDistance + if sense == "F": + nextGene = genome.rightGeneDistance((genome.genome, gid), geneLength * 2) + geneStart = featureList[-1][1] + else: + nextGene = genome.leftGeneDistance((genome.genome, gid), geneLength * 2) + geneStart = max(featureList[0][0] - geneLength, 0) + + if nextGene < geneLength * 2: + geneLength = nextGene / 2 + + geneLength = max(geneLength, 1) + + return geneStart, geneLength + + +def getBinList(readList, standardMinDist, geneStart, geneLength, sense): + tagCount = 0 + bins = 10 + standardMinThresh = standardMinDist / bins + binList = [0.] * bins + for read in readList: + tagStart = read["start"] + if tagStart >= geneLength: + break + + tagStart -= geneStart + weight = read["weight"] + if tagStart > 0: + tagCount += weight + if sense == "F": + # we are relying on python's integer division quirk + binID = tagStart / standardMinThresh + else: + rdist = geneLength - tagStart + binID = rdist / standardMinThresh + + binList[binID] += weight + + return binList, tagCount if __name__ == "__main__": main(sys.argv) \ No newline at end of file