X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=findall.py;fp=findall.py;h=d20608f1997e2e6a0fd84d9169c78025485e4b57;hp=10f007bd7d340fcf3b30606046c84ef42ee6556d;hb=0d3e3112fd04c2e6b44a25cacef1d591658ad181;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4

diff --git a/findall.py b/findall.py
index 10f007b..d20608f 100755
--- a/findall.py
+++ b/findall.py
@@ -49,10 +49,12 @@ import sys
 import math
 import string
 import optparse
-from commoncode import readDataset, writeLog, findPeak, getBestShiftForRegion
+from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
+import ReadDataset
+import Region
 
-versionString = "%s: version 3.2" % sys.argv[0]
+versionString = "findall: version 3.2"
 print versionString
 
 
 def usage():
@@ -63,6 +65,27 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
+    parser = makeParser()
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        usage()
+        sys.exit(2)
+
+    factor = args[0]
+    hitfile = args[1]
+    outfilename = args[2]
+
+    findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
+            options.stringency, options.noshift, options.autoshift, options.reportshift,
+            options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
+            options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
+            options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
+            options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
+            options.strandfilter, options.combine5p)
+
+
+def makeParser():
     usage = __doc__
 
     parser = optparse.OptionParser(usage=usage)
@@ -94,31 +117,47 @@ def main(argv=None):
     parser.add_option("--append", action="store_true", dest="doAppend")
     parser.add_option("--RNA", action="store_true", dest="rnaSettings")
     parser.add_option("--combine5p", action="store_true", dest="combine5p")
-    parser.set_defaults(minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
-                        stringency=4.0, noshift=False, autoshift=False, reportshift=False,
-                        minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
-                        normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
-                        trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
-                        cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
-                        strandfilter=None, combine5p=False)
-    (options, args) = parser.parse_args(argv[1:])
-
-    if len(args) < 3:
-        usage()
-        sys.exit(2)
-
-    factor = args[0]
-    hitfile = args[1]
-    outfilename = args[2]
-
-    findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
-            options.stringency, options.noshift, options.autoshift, options.reportshift,
-            options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
-            options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
-            options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
-            options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
-            options.strandfilter, options.combine5p)
+    configParser = getConfigParser()
+    section = "findall"
+    minHits = getConfigFloatOption(configParser, section, "minHits", 4.0)
+    minRatio = getConfigFloatOption(configParser, section, "minRatio", 4.0)
+    maxSpacing = getConfigIntOption(configParser, section, "maxSpacing", 50)
+    listPeak = getConfigBoolOption(configParser, section, "listPeak", False)
+    shift = getConfigOption(configParser, section, "shift", None)
+    stringency = getConfigFloatOption(configParser, section, "stringency", 4.0)
+    noshift = getConfigBoolOption(configParser, section, "noshift", False)
+    autoshift = getConfigBoolOption(configParser, section, "autoshift", False)
+    reportshift = getConfigBoolOption(configParser, section, "reportshift", False)
+    minPlusRatio = getConfigFloatOption(configParser, section, "minPlusRatio", 0.25)
+    maxPlusRatio = getConfigFloatOption(configParser, section, "maxPlusRatio", 0.75)
+    leftPlusRatio = getConfigFloatOption(configParser, section, "leftPlusRatio", 0.3)
+    minPeak = getConfigFloatOption(configParser, section, "minPeak", 0.5)
+    normalize = getConfigBoolOption(configParser, section, "normalize", True)
+    logfilename = getConfigOption(configParser, section, "logfilename", "findall.log")
+    withFlag = getConfigOption(configParser, section, "withFlag", "")
+    doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True)
+    trimValue = getConfigOption(configParser, section, "trimValue", None)
+    doTrim = getConfigBoolOption(configParser, section, "doTrim", True)
+    doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
+    rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+    ptype = getConfigOption(configParser, section, "ptype", None)
+    mockfile = getConfigOption(configParser, section, "mockfile", None)
+    doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False)
+    noMulti = getConfigBoolOption(configParser, section, "noMulti", False)
+    strandfilter = getConfigOption(configParser, section, "strandfilter", None)
+    combine5p = getConfigBoolOption(configParser, section, "combine5p", False)
+
+    parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift,
+                        stringency=stringency, noshift=noshift, autoshift=autoshift, reportshift=reportshift,
+                        minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak,
+                        normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality,
+                        trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings,
+                        cachePages=cachePages, ptype=ptype, mockfile=mockfile, doRevBackground=doRevBackground, noMulti=noMulti,
+                        strandfilter=strandfilter, combine5p=combine5p)
+
+    return parser
 
 
 def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
@@ -129,20 +168,7 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
             cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
             strandfilter=None, combine5p=False):
 
-    shiftValue = 0
-    if autoshift:
-        shiftValue = "auto"
-
-    if shift is not None:
-        try:
-            shiftValue = int(shift)
-        except ValueError:
-            if shift == "learn":
-                shiftValue = "learn"
-                print "Will try to learn shift"
-
-    if noshift:
-        shiftValue = 0
+    shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
 
     if trimValue is not None:
        trimValue = float(trimValue) / 100.
@@ -198,7 +224,6 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
 
     if rnaSettings:
         print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
-        shiftValue = 0
         doTrim = False
         doDirectionality = False
 
@@ -219,48 +244,44 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
 
     if doControl:
         print "\ncontrol:"
-        mockRDS = readDataset(mockfile, verbose=True, cache=doCache)
+        mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
 
         if cachePages > mockRDS.getDefaultCacheSize():
             mockRDS.setDBcache(cachePages)
 
     print "\nsample:"
-    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
     readlen = hitRDS.getReadSize()
     if rnaSettings:
         maxSpacing = readlen
 
-    print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
-    print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
-    print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
-
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
-    hitRDSsize = len(hitRDS) / 1000000.
-    if doControl:
-        mockRDSsize = len(mockRDS) / 1000000.
-
-    if normalize:
-        if doControl:
-            mockSampleSize = mockRDSsize
-
-        hitSampleSize = hitRDSsize
-
     if doAppend:
-        outfile = open(outfilename, "a")
+        fileMode = "a"
     else:
-        outfile = open(outfilename, "w")
+        fileMode = "w"
+
+    outfile = open(outfilename, fileMode)
     outfile.write("#ERANGE %s\n" % versionString)
     if doControl:
-        outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)\n" % (hitfile, hitRDSsize, mockfile, mockRDSsize))
+        mockRDSsize = len(mockRDS) / 1000000.
+        controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
     else:
-        outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample: none\n" % (hitfile, hitRDSsize))
+        controlSampleString = " none"
+
+    hitRDSsize = len(hitRDS) / 1000000.
+    outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:%s\n" % (hitfile, hitRDSsize, controlSampleString))
 
     if withFlag != "":
         outfile.write("#restrict to Flag = %s\n" % withFlag)
 
+    print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
+    print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
+    print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
+
     outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
     outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
     outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
@@ -300,6 +321,12 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     if doControl:
         mockChromList = mockRDS.getChromosomes()
 
+    if normalize:
+        if doControl:
+            mockSampleSize = mockRDSsize
+
+        hitSampleSize = hitRDSsize
+
     hitChromList.sort()
 
     for chromosome in hitChromList:
@@ -307,16 +334,21 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
             continue
 
         print "chromosome %s" % (chromosome)
-        hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True, strand=stranded, combine5p=combine5p)
+        hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True,
+                                      doMulti=useMulti, findallOptimize=True, strand=stranded,
+                                      combine5p=combine5p)
         maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
         if shiftValue == "learn":
-            shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
-                                    stringency, readlen, minHits, logfilename, outfile, outfilename)
+            shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize,
+                                    mockSampleSize, minRatio, maxSpacing, maxCoord, stringency, readlen, minHits,
+                                    logfilename, outfile, outfilename)
 
-        regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize, chromosome, useMulti,
-                                                                  normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
-                                                                  shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
-                                                                  noMulti, doControl, factor, trimValue, outputRegionList=True)
+        regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize,
+                                                                  chromosome, useMulti, normalize, maxSpacing,
+                                                                  doDirectionality, doTrim, minHits, minRatio,
+                                                                  readlen, shiftValue, minPeak, minPlusRatio,
+                                                                  maxPlusRatio, leftPlusRatio, listPeak, noMulti,
+                                                                  doControl, factor, trimValue, outputRegionList=True)
 
         statistics["index"] += regionStats["index"]
         statistics["total"] += regionStats["total"]
@@ -332,10 +364,12 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
             #now do background swapping the two samples around
             print "calculating background..."
             backgroundTrimValue = 1/20.
-            backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, chromosome, useMulti,
-                                                                            normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
-                                                                            shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
-                                                                            noMulti, doControl, factor, backgroundTrimValue)
+            backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
+                                                                            chromosome, useMulti, normalize, maxSpacing,
+                                                                            doDirectionality, doTrim, minHits, minRatio,
+                                                                            readlen, shiftValue, minPeak, minPlusRatio,
+                                                                            maxPlusRatio, leftPlusRatio, listPeak, noMulti,
+                                                                            doControl, factor, backgroundTrimValue)
 
             statistics["mIndex"] += backgroundRegionStats["index"]
             statistics["mTotal"] += backgroundRegionStats["total"]
@@ -358,6 +392,25 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
 
 
+def determineShiftValue(autoshift, shift, noshift, rnaSettings):
+    shiftValue = 0
+    if autoshift:
+        shiftValue = "auto"
+
+    if shift is not None:
+        try:
+            shiftValue = int(shift)
+        except ValueError:
+            if shift == "learn":
+                shiftValue = "learn"
+                print "Will try to learn shift"
+
+    if noshift or rnaSettings:
+        shiftValue = 0
+
+    return shiftValue
+
+
 def doNotProcessChromosome(chromosome, doControl, mockChromList):
     skipChromosome = False
     if chromosome == "chrM":
@@ -384,19 +437,22 @@ def calculatePValue(dataList):
 
 def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
-              stringency, readlen, minHits, logfilename, outfile, outfilename):
+              stringency, readlen, minHits, logfilename, outfile, outfilename, minSites=30):
 
-    print "learning shift.... will need at least 30 training sites"
+    print "learning shift.... will need at least %d training sites" % minSites
 
     previousHit = -1 * maxSpacing
     hitList = [-1]
-    weightList = [0]
+    totalWeight = 0
     readList = []
     shiftDict = {}
     count = 0
     numStarts = 0
-    for (pos, sense, weight) in hitDict[chrom]:
+    for read in hitDict[chrom]:
+        pos = read["start"]
+        sense = read["sense"]
+        weight = read["weight"]
         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
-            sumAll = sum(weightList)
+            sumAll = totalWeight
             if normalize:
                 sumAll /= hitSampleSize
@@ -417,7 +473,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
             count += 1
 
             hitList = []
-            weightList = []
+            totalWeight = 0
             readList = []
             numStarts = 0
 
@@ -425,18 +481,21 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
         numStarts += 1
 
         hitList.append(pos)
-        weightList.append(weight)
-        readList.append((pos, sense, weight))
+        totalWeight += weight
+        readList.append({"start": pos, "sense": sense, "weight": weight})
        previousHit = pos
 
     bestShift = 0
     bestCount = 0
-    outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency, stringency * minHits, stringency * minRatio, stringency * readlen, count)
+    learningSettings = ["#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d" % (stringency, stringency * minHits,
+                                                                                                        stringency * minRatio, stringency * readlen),
+                        "#number of training examples: %d" % count]
+    outline = string.join(learningSettings, "\n")
     print outline
     writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
-    if count < 30:
+    if count < minSites:
         outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
-        print outline
+        print >> outfile, outline
         writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
         shiftValue = 0
     else:
@@ -481,20 +540,24 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                   noMulti, doControl, factor, trimValue, outputRegionList=False):
 
     index = 0
-    total = 0
+    totalRegionWeight = 0
     failedCounter = 0
     previousHit = - 1 * maxSpacing
     currentHitList = [-1]
-    currentWeightList = [0]
+    currentTotalWeight = 0
+    currentUniqReadCount = 0
     currentReadList = []
     regionWeights = []
     outregions = []
     numStarts = 0
     hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
     maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
-    for (pos, sense, weight) in hitDict[chrom]:
+    for read in hitDict[chrom]:
+        pos = read["start"]
+        sense = read["sense"]
+        weight = read["weight"]
         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
-            sumAll = sum(currentWeightList)
+            sumAll = currentTotalWeight
             if normalize:
                 sumAll /= rdsSampleSize
 
@@ -507,13 +570,14 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
             foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
             if foldRatio >= minRatio:
                 # first pass, with absolute numbers
-                if doDirectionality:
-                    (topPos, numHits, smoothArray, numPlus, numLeft, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue, returnShift=True)
-                else:
-                    (topPos, numHits, smoothArray, numPlus, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shiftValue, returnShift=True)
-
-                bestPos = topPos[0]
-                peakScore = smoothArray[bestPos]
+                peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
+
+                bestPos = peak.topPos[0]
+                numHits = peak.numHits
+                peakScore = peak.smoothArray[bestPos]
+                numPlus = peak.numPlus
+                shift = peak.shift
+                numLeft = peak.numLeft
                 if normalize:
                     peakScore /= rdsSampleSize
@@ -523,28 +587,25 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                 stop = regionStop - regionStart - 1
                 startFound = False
                 while not startFound:
-                    if smoothArray[start] >= minSignalThresh or start == bestPos:
+                    if peak.smoothArray[start] >= minSignalThresh or start == bestPos:
                         startFound = True
                     else:
                         start += 1
 
                 stopFound = False
                 while not stopFound:
-                    if smoothArray[stop] >= minSignalThresh or stop == bestPos:
+                    if peak.smoothArray[stop] >= minSignalThresh or stop == bestPos:
                         stopFound = True
                     else:
                         stop -= 1
 
                 regionStop = regionStart + stop
                 regionStart += start
-                try:
-                    if doDirectionality:
-                        (topPos, sumAll, smoothArray, numPlus, numLeft) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
-                    else:
-                        (topPos, sumAll, smoothArray, numPlus) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shift)
-                except:
-                    continue
+                trimPeak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
+                sumAll = trimPeak.numHits
+                numPlus = trimPeak.numPlus
+                numLeft = trimPeak.numLeft
 
                 if normalize:
                     sumAll /= rdsSampleSize
@@ -553,8 +614,8 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                 sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
 
                 # just in case it changed, use latest data
                 try:
-                    bestPos = topPos[0]
-                    peakScore = smoothArray[bestPos]
+                    bestPos = trimPeak.topPos[0]
+                    peakScore = trimPeak.smoothArray[bestPos]
                 except:
                     continue
 
@@ -563,7 +624,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                     peakScore /= rdsSampleSize
 
             elif outputRegionList:
-                sumMulti = sum(currentWeightList) - currentWeightList.count(1.0)
+                sumMulti = currentTotalWeight - currentUniqReadCount
 
             if outputRegionList:
                 # normalize to RPM
@@ -583,9 +644,9 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                 plusRatio = float(numPlus)/numHits
                 if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
                     if outputRegionList:
-                        peak = ""
+                        peakDescription = ""
                         if listPeak:
-                            peak = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
+                            peakDescription = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
 
                     if doDirectionality:
                         if leftPlusRatio < numLeft / numPlus:
@@ -594,20 +655,27 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                             plusP = plusRatio * 100.
                             leftP = 100. * numLeft / numPlus
                             # we have a region that passes all criteria
-                            outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, plusP, leftP, peak, shift))
+                            region = Region.DirectionalRegion(regionStart, regionStop + readlen - 1,
+                                                              factor, index, chrom, sumAll,
+                                                              foldRatio, multiP, plusP, leftP,
+                                                              peakDescription, shift)
+                            outregions.append(region)
-                            total += sumAll
+                            totalRegionWeight += sumAll
                         else:
                             failedCounter += 1
                 else:
                     # we have a region, but didn't check for directionality
                     index += 1
-                    total += sumAll
+                    totalRegionWeight += sumAll
                     if outputRegionList:
-                        outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, peak, shift))
+                        region = Region.Region(regionStart, regionStop + readlen - 1, factor, index, chrom,
+                                               sumAll, foldRatio, multiP, peakDescription, shift)
+                        outregions.append(region)
 
             currentHitList = []
-            currentWeightList = []
+            currentTotalWeight = 0
+            currentUniqReadCount = 0
            currentReadList = []
             numStarts = 0
 
@@ -615,12 +683,15 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
         numStarts += 1
 
         currentHitList.append(pos)
-        currentWeightList.append(weight)
-        currentReadList.append((pos, sense, weight))
+        currentTotalWeight += weight
+        if weight == 1.0:
+            currentUniqReadCount += 1
+
+        currentReadList.append({"start": pos, "sense": sense, "weight": weight})
         previousHit = pos
 
     statistics = {"index": index,
-                  "total": total,
+                  "total": totalRegionWeight,
                   "failed": failedCounter
     }
 
@@ -634,43 +705,35 @@ def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, repor
     bestShift = 0
     shiftDict = {}
     for region in outregions:
+        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
+        if reportshift:
+            outputList = [region.printRegionWithShift()]
+            if shiftValue == "auto":
+                try:
+                    shiftDict[region.shift] += 1
+                except KeyError:
+                    shiftDict[region.shift] = 1
+        else:
+            outputList = [region.printRegion()]
+
         # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
         if doPvalue:
-            sumAll = int(region[5])
+            sumAll = int(region.numReads)
             for i in xrange(sumAll):
                 pValue *= poissonmean
                 pValue /= i+1
 
-        if shiftValue == "auto" and reportshift:
-            try:
-                shiftDict[region[-1]] += 1
-            except KeyError:
-                shiftDict[region[-1]] = 1
-
-        try:
-            if reportshift:
-                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s\t%d" % region]
-            else:
-                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
-        except:
-            if reportshift:
-                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s\t%d" % region]
-            else:
-                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
-
-        if doPvalue:
-            outputList.append("%1.2g" % pValue)
+            outputList.append("%1.2f" % pValue)
 
         outline = string.join(outputList, "\t")
         print outline
         print >> outfile, outline
 
-    if shiftValue == "auto" and reportshift:
-        bestCount = 0
-        for shift in sorted(shiftDict):
-            if shiftDict[shift] > bestCount:
-                bestShift = shift
-                bestCount = shiftDict[shift]
+    bestCount = 0
+    for shift in sorted(shiftDict):
+        if shiftDict[shift] > bestCount:
+            bestShift = shift
+            bestCount = shiftDict[shift]
 
     return bestShift
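
Note on the config-driven defaults introduced above: the new makeParser() seeds its optparse defaults from a "findall" section read through getConfigParser() and the getConfig*Option helpers imported from commoncode. Those helpers live in commoncode.py and are not part of this diff; the sketch below only illustrates the pattern they appear to follow (ConfigParser lookups that fall back to the supplied default when the section or option is missing). The config file name and the exact signatures are assumptions made for illustration, not code taken from the erange repository.

from ConfigParser import SafeConfigParser, NoSectionError, NoOptionError

def getConfigParser(fileName="erange.config"):
    # hypothetical file name; commoncode.py defines the real location
    parser = SafeConfigParser()
    parser.read(fileName)
    return parser

def getConfigOption(parser, section, name, default=None):
    # return the configured string value, or the default if it is absent
    try:
        return parser.get(section, name)
    except (NoSectionError, NoOptionError):
        return default

def getConfigBoolOption(parser, section, name, default=False):
    try:
        return parser.getboolean(section, name)
    except (NoSectionError, NoOptionError, ValueError):
        return default

def getConfigIntOption(parser, section, name, default=None):
    try:
        return parser.getint(section, name)
    except (NoSectionError, NoOptionError, ValueError):
        return default

def getConfigFloatOption(parser, section, name, default=None):
    try:
        return parser.getfloat(section, name)
    except (NoSectionError, NoOptionError, ValueError):
        return default

Because these values are injected via parser.set_defaults() before parse_args() runs, options given on the command line still override anything set in the configuration file, which in turn overrides the hard-coded defaults shown in the diff.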