X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=findall.py;h=827ee503ba7956b37143481beabae171a35f7619;hp=663d7e075d5d30535da6eed6547d4da2b4970be2;hb=HEAD;hpb=03f1e0b3bab22d517ad75b9af4d54e8fcb8540fb diff --git a/findall.py b/findall.py index 663d7e0..827ee50 100755 --- a/findall.py +++ b/findall.py @@ -1,6 +1,6 @@ """ - usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile - [--control controlrdsfile] [--minimum minHits] [--ratio minRatio] + usage: python $ERANGEPATH/findall.py label samplebamfile regionoutfile + [--control controlbamfile] [--minimum minHits] [--ratio minRatio] [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num] [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction] [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw] @@ -50,8 +50,8 @@ import math import string import optparse import operator -from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption -import ReadDataset +import pysam +from commoncode import writeLog, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption, isSpliceEntry import Region @@ -60,11 +60,12 @@ print versionString class RegionDirectionError(Exception): pass - + class RegionFinder(): - def __init__(self, minRatio, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, strandfilter, minHits, trimValue, doTrim, doDirectionality, - shiftValue, maxSpacing): + def __init__(self, label, minRatio=4.0, minPeak=0.5, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, strandfilter="", + minHits=4.0, trimValue=0.1, doTrim=True, doDirectionality=True, shiftValue=0, maxSpacing=50, withFlag="", + normalize=True, listPeak=False, reportshift=False, stringency=1.0, controlfile=None, doRevBackground=False): self.statistics = {"index": 0, "total": 0, @@ -74,8 +75,10 @@ class RegionFinder(): "badRegionTrim": 0 } - self.controlRDSsize = 1 - self.sampleRDSsize = 1 + self.regionLabel = label + self.rnaSettings = False + self.controlRDSsize = 1.0 + self.sampleRDSsize = 1.0 self.minRatio = minRatio self.minPeak = minPeak self.leftPlusRatio = leftPlusRatio @@ -107,17 +110,93 @@ class RegionFinder(): self.shiftValue = shiftValue self.maxSpacing = maxSpacing - - - def useRNASettings(self): + self.withFlag = withFlag + self.normalize = normalize + self.listPeak = listPeak + self.reportshift = reportshift + self.stringency = max(stringency, 1.0) + self.controlfile = controlfile + self.doControl = self.controlfile is not None + self.doPvalue = False + self.doRevBackground = doRevBackground + + + def useRNASettings(self, readlen): + self.rnaSettings = True self.shiftValue = 0 self.doTrim = False self.doDirectionality = False + self.maxSpacing = readlen + + + def getHeader(self): + if self.normalize: + countType = "RPM" + else: + countType = "COUNT" + + headerFields = ["#regionID\tchrom\tstart\tstop", countType, "fold\tmulti%"] + + if self.doDirectionality: + headerFields.append("plus%\tleftPlus%") + + if self.listPeak: + headerFields.append("peakPos\tpeakHeight") + + if self.reportshift: + headerFields.append("readShift") + + if self.doPvalue: + headerFields.append("pValue") + + return string.join(headerFields, "\t") + + + def printSettings(self, ptype, useMulti, pValueType): + print + self.printStatusMessages(ptype, useMulti) + self.printOptionsSummary(useMulti, pValueType) + + + def printStatusMessages(self, ptype, useMulti): + if self.shiftValue == "learn": + print "Will try to learn shift" + + if self.normalize: + print "Normalizing to RPM" + + if self.doRevBackground: + print "Swapping IP and background to calculate FDR" + + if ptype != "": + if ptype in ["NONE", "SELF"]: + pass + elif ptype == "BACK": + if self.doControl and self.doRevBackground: + pass + else: + print "must have a control dataset and -revbackground for pValue type 'back'" + else: + print "could not use pValue type : %s" % ptype + + if self.withFlag != "": + print "restrict to flag = %s" % self.withFlag + + if not useMulti: + print "using unique reads only" + + if self.rnaSettings: + print "using settings appropriate for RNA: -nodirectionality -notrim -noshift" + + if self.strandfilter == "plus": + print "only analyzing reads on the plus strand" + elif self.strandfilter == "minus": + print "only analyzing reads on the minus strand" - def printOptionsSummary(self, listPeak, useMulti, doCache, pValueType): + def printOptionsSummary(self, useMulti, pValueType): - print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (self.doDirectionality, listPeak, not useMulti, doCache) + print "\nenforceDirectionality=%s listPeak=%s nomulti=%s " % (self.doDirectionality, self.listPeak, not useMulti) print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded) try: print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType) @@ -125,19 +204,18 @@ class RegionFinder(): print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType) - def getAnalysisDescription(self, hitfile, controlfile, doControl, - withFlag, listPeak, useMulti, doCache, pValueType): + def getAnalysisDescription(self, hitfile, useMulti, pValueType): description = ["#ERANGE %s" % versionString] - if doControl: - description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, controlfile, self.controlRDSsize)) + if self.doControl: + description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, self.controlfile, self.controlRDSsize)) else: description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize)) - if withFlag != "": - description.append("#restrict to Flag = %s" % withFlag) + if self.withFlag != "": + description.append("#restrict to Flag = %s" % self.withFlag) - description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s" % (self.doDirectionality, listPeak, not useMulti, doCache)) + description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s" % (self.doDirectionality, self.listPeak, not useMulti)) description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)) try: description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)) @@ -147,6 +225,47 @@ class RegionFinder(): return string.join(description, "\n") + def getFooter(self, bestShift): + index = self.statistics["index"] + mIndex = self.statistics["mIndex"] + footerLines = ["#stats:\t%.1f RPM in %d regions" % (self.statistics["total"], index)] + if self.doDirectionality: + footerLines.append("#\t\t%d additional regions failed directionality filter" % self.statistics["failed"]) + + if self.doRevBackground: + try: + percent = min(100. * (float(mIndex)/index), 100.) + except ZeroDivisionError: + percent = 0. + + footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, self.statistics["mTotal"], percent)) + + if self.shiftValue == "auto" and self.reportshift: + + footerLines.append("#mode of shift values: %d" % bestShift) + + if self.statistics["badRegionTrim"] > 0: + footerLines.append("#%d regions discarded due to trimming problems" % self.statistics["badRegionTrim"]) + + return string.join(footerLines, "\n") + + + def updateControlStatistics(self, peak, sumAll, peakScore): + + plusRatio = float(peak.numPlus)/peak.numHits + if peakScore >= self.minPeak and self.minPlusRatio <= plusRatio <= self.maxPlusRatio: + if self.doDirectionality: + if self.leftPlusRatio < peak.numLeftPlus / peak.numPlus: + self.statistics["mIndex"] += 1 + self.statistics["mTotal"] += sumAll + else: + self.statistics["failed"] += 1 + else: + # we have a region, but didn't check for directionality + self.statistics["mIndex"] += 1 + self.statistics["mTotal"] += sumAll + + def usage(): print __doc__ @@ -186,13 +305,16 @@ def main(argv=None): else: outputMode = "w" - findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, shiftValue, - options.stringency, options.reportshift, - options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak, - options.normalize, options.logfilename, options.withFlag, options.doDirectionality, - options.trimValue, options.doTrim, outputMode, options.rnaSettings, - options.cachePages, options.ptype, options.controlfile, options.doRevBackground, options.useMulti, - options.strandfilter, options.combine5p) + regionFinder = RegionFinder(factor, minRatio=options.minRatio, minPeak=options.minPeak, minPlusRatio=options.minPlusRatio, + maxPlusRatio=options.maxPlusRatio, leftPlusRatio=options.leftPlusRatio, strandfilter=options.strandfilter, + minHits=options.minHits, trimValue=options.trimValue, doTrim=options.doTrim, + doDirectionality=options.doDirectionality, shiftValue=shiftValue, maxSpacing=options.maxSpacing, + withFlag=options.withFlag, normalize=options.normalize, listPeak=options.listPeak, + reportshift=options.reportshift, stringency=options.stringency, controlfile=options.controlfile, + doRevBackground=options.doRevBackground) + + findall(regionFinder, hitfile, outfilename, options.logfilename, outputMode, options.rnaSettings, + options.ptype, options.useMulti, options.combine5p) def makeParser(): @@ -270,78 +392,67 @@ def makeParser(): return parser -def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shiftValue=0, - stringency=4.0, reportshift=False, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5, - normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True, trimValue=0.1, doTrim=True, - outputMode="w", rnaSettings=False, cachePages=None, ptype="", controlfile=None, doRevBackground=False, - useMulti=True, strandfilter="", combine5p=False): - - regionFinder = RegionFinder(minRatio, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, strandfilter, minHits, trimValue, - doTrim, doDirectionality, shiftValue, maxSpacing) - - doControl = controlfile is not None - pValueType = getPValueType(ptype, doControl, doRevBackground) - doPvalue = not pValueType == "none" +def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False, + ptype="", useMulti=True, combine5p=False): - if rnaSettings: - regionFinder.useRNASettings() - - doCache = cachePages is not None - printStatusMessages(regionFinder.shiftValue, normalize, doRevBackground, ptype, doControl, withFlag, useMulti, rnaSettings, regionFinder.strandfilter) - controlRDS = None - if doControl: + writeLog(logfilename, versionString, string.join(sys.argv[1:])) + controlBAM = None + if regionFinder.doControl: print "\ncontrol:" - controlRDS = openRDSFile(controlfile, cachePages=cachePages, doCache=doCache) + controlBAM = pysam.Samfile(regionFinder.controlfile, "rb") + regionFinder.controlRDSsize = int(getHeaderComment(controlBAM.header, "Total")) / 1000000. print "\nsample:" - hitRDS = openRDSFile(hitfile, cachePages=cachePages, doCache=doCache) - - print - regionFinder.readlen = hitRDS.getReadSize() + sampleBAM = pysam.Samfile(hitfile, "rb") + regionFinder.sampleRDSsize = int(getHeaderComment(sampleBAM.header, "Total")) / 1000000. + pValueType = getPValueType(ptype, regionFinder.doControl, regionFinder.doRevBackground) + regionFinder.doPvalue = not pValueType == "none" + regionFinder.readlen = int(getHeaderComment(sampleBAM.header, "ReadLength")) if rnaSettings: - regionFinder.maxSpacing = regionFinder.readlen - - writeLog(logfilename, versionString, string.join(sys.argv[1:])) - regionFinder.printOptionsSummary(listPeak, useMulti, doCache, pValueType) - - regionFinder.sampleRDSsize = len(hitRDS) / 1000000. - if doControl: - regionFinder.controlRDSsize = len(controlRDS) / 1000000. + regionFinder.useRNASettings(regionFinder.readlen) + regionFinder.printSettings(ptype, useMulti, pValueType) outfile = open(outfilename, outputMode) - print >> outfile, regionFinder.getAnalysisDescription(hitfile, controlfile, doControl, - withFlag, listPeak, useMulti, doCache, pValueType) - - header = getHeader(normalize, regionFinder.doDirectionality, listPeak, reportshift, doPvalue) - print >> outfile, header - if minRatio < minPeak: - minPeak = minRatio - + header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType) shiftDict = {} - hitChromList = hitRDS.getChromosomes() - stringency = max(stringency, 1.0) - chromosomeList = getChromosomeListToProcess(hitChromList, controlRDS, doControl) - for chromosome in chromosomeList: - allregions, outregions = findPeakRegions(regionFinder, hitRDS, controlRDS, chromosome, logfilename, outfilename, - outfile, stringency, normalize, useMulti, doControl, withFlag, combine5p, - factor, listPeak) - - if doRevBackground: - #TODO: this is *almost* the same calculation - there are small yet important differences - backregions = findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normalize, useMulti, factor) - writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict, - allregions, header, backregions=backregions, pValueType=pValueType) + chromList = getChromosomeListToProcess(sampleBAM, controlBAM) + for chromosome in chromList: + #TODO: Really? Use first chr shift value for all of them + maxSampleCoord = getMaxCoordinate(sampleBAM, chromosome, doMulti=useMulti) + if regionFinder.shiftValue == "learn": + regionFinder.shiftValue = learnShift(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p) + + allregions, outregions = findPeakRegions(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p) + if regionFinder.doRevBackground: + maxControlCoord = getMaxCoordinate(controlBAM, chromosome, doMulti=useMulti) + backregions = findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxControlCoord, chromosome, useMulti) else: - writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict, - allregions, header) + backregions = [] + pValueType = "self" + + writeChromosomeResults(regionFinder, outregions, outfile, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType) + + try: + bestShift = getBestShiftInDict(shiftDict) + except ValueError: + bestShift = 0 - footer = getFooter(regionFinder, shiftDict, reportshift, doRevBackground) + footer = regionFinder.getFooter(bestShift) print footer print >> outfile, footer outfile.close() writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1]) +def getHeaderComment(bamHeader, commentKey): + for comment in bamHeader["CO"]: + fields = comment.split("\t") + if fields[0] == commentKey: + return fields[1] + + raise KeyError + + def getPValueType(ptype, doControl, doRevBackground): pValueType = "self" if ptype in ["NONE", "SELF", "BACK"]: @@ -358,159 +469,91 @@ def getPValueType(ptype, doControl, doRevBackground): return pValueType -def printStatusMessages(shiftValue, normalize, doRevBackground, ptype, doControl, withFlag, useMulti, rnaSettings, strandfilter): - if shiftValue == "learn": - print "Will try to learn shift" - - if normalize: - print "Normalizing to RPM" - - if doRevBackground: - print "Swapping IP and background to calculate FDR" - - if ptype != "": - if ptype in ["NONE", "SELF"]: - pass - elif ptype == "BACK": - if doControl and doRevBackground: - pass - else: - print "must have a control dataset and -revbackground for pValue type 'back'" - else: - print "could not use pValue type : %s" % ptype - - if withFlag != "": - print "restrict to flag = %s" % withFlag - - if not useMulti: - print "using unique reads only" - - if rnaSettings: - print "using settings appropriate for RNA: -nodirectionality -notrim -noshift" - - if strandfilter == "plus": - print "only analyzing reads on the plus strand" - elif strandfilter == "minus": - print "only analyzing reads on the minus strand" - - -def openRDSFile(filename, cachePages=None, doCache=False): - rds = ReadDataset.ReadDataset(filename, verbose=True, cache=doCache) - if cachePages > rds.getDefaultCacheSize(): - rds.setDBcache(cachePages) - - return rds - - -def getHeader(normalize, doDirectionality, listPeak, reportshift, doPvalue): - if normalize: - countType = "RPM" - else: - countType = "COUNT" - - headerFields = ["#regionID\tchrom\tstart\tstop", countType, "fold\tmulti%"] - - if doDirectionality: - headerFields.append("plus%\tleftPlus%") - - if listPeak: - headerFields.append("peakPos\tpeakHeight") - - if reportshift: - headerFields.append("readShift") - - if doPvalue: - headerFields.append("pValue") +def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType): + print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, pValueType) + header = regionFinder.getHeader() + print >> outfile, header - return string.join(headerFields, "\t") + return header -def getChromosomeListToProcess(hitChromList, controlRDS, doControl): - if doControl: - controlChromList = controlRDS.getChromosomes() - chromosomeList = [chrom for chrom in hitChromList if chrom in controlChromList and chrom != "chrM"] +def getChromosomeListToProcess(sampleBAM, controlBAM=None): + if controlBAM is not None: + chromosomeList = [chrom for chrom in sampleBAM.references if chrom in controlBAM.references and chrom != "chrM"] else: - chromosomeList = [chrom for chrom in hitChromList if chrom != "chrM"] + chromosomeList = [chrom for chrom in sampleBAM.references if chrom != "chrM"] return chromosomeList -def findPeakRegions(regionFinder, hitRDS, controlRDS, chromosome, logfilename, outfilename, - outfile, stringency, normalize, useMulti, doControl, withFlag, combine5p, factor, listPeak): +def findPeakRegions(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename, + outfile, useMulti, controlBAM, combine5p): outregions = [] allregions = [] print "chromosome %s" % (chromosome) previousHit = - 1 * regionFinder.maxSpacing readStartPositions = [-1] - totalWeight = 0 + totalWeight = 0.0 uniqueReadCount = 0 reads = [] - numStarts = 0 - badRegion = False - hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True, - strand=regionFinder.stranded, combine5p=combine5p) + numStartsInRegion = 0 - maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti) - if regionFinder.shiftValue == "learn": - learnShift(regionFinder, hitDict, controlRDS, chromosome, maxCoord, logfilename, outfilename, - outfile, numStarts, stringency, normalize, useMulti, doControl) + for alignedread in sampleBAM.fetch(chromosome): + if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p): + continue - for read in hitDict[chromosome]: - pos = read["start"] + pos = alignedread.pos if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord): lastReadPos = readStartPositions[-1] lastBasePosition = lastReadPos + regionFinder.readlen - 1 newRegionIndex = regionFinder.statistics["index"] + 1 if regionFinder.doDirectionality: - region = Region.DirectionalRegion(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=factor, numReads=totalWeight) + region = Region.DirectionalRegion(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=regionFinder.regionLabel, + numReads=totalWeight) else: - region = Region.Region(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=factor, numReads=totalWeight) + region = Region.Region(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=regionFinder.regionLabel, numReads=totalWeight) - if normalize: + if regionFinder.normalize: region.numReads /= regionFinder.sampleRDSsize allregions.append(int(region.numReads)) regionLength = lastReadPos - region.start - if regionPassesCriteria(region.numReads, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen): - region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, - useMulti, doControl, normalize) + if regionPassesCriteria(regionFinder, region.numReads, numStartsInRegion, regionLength): + region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti) if region.foldRatio >= regionFinder.minRatio: # first pass, with absolute numbers peak = findPeak(reads, region.start, regionLength, regionFinder.readlen, doWeight=True, leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue) if regionFinder.doTrim: try: - lastReadPos = trimRegion(region, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.readlen, regionFinder.doDirectionality, - normalize, regionFinder.sampleRDSsize) + lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.sampleRDSsize) except IndexError: - badRegion = True + regionFinder.statistics["badRegionTrim"] += 1 continue - region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, - useMulti, doControl, normalize) + region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti) - # just in case it changed, use latest data try: bestPos = peak.topPos[0] peakScore = peak.smoothArray[bestPos] - if normalize: + if regionFinder.normalize: peakScore /= regionFinder.sampleRDSsize - except: + except (IndexError, AttributeError, ZeroDivisionError): continue - if listPeak: - region.peakDescription= "%d\t%.1f" % (region.start + bestPos, peakScore) + if regionFinder.listPeak: + region.peakDescription = "%d\t%.1f" % (region.start + bestPos, peakScore) if useMulti: - setMultireadPercentage(region, hitRDS, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos, normalize, regionFinder.doTrim) + setMultireadPercentage(region, sampleBAM, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos, + regionFinder.normalize, regionFinder.doTrim) region.shift = peak.shift # check that we still pass threshold regionLength = lastReadPos - region.start plusRatio = float(peak.numPlus)/peak.numHits - if regionAndPeakPass(region, regionFinder.minHits, regionFinder.minRatio, regionLength, regionFinder.readlen, peakScore, regionFinder.minPeak, - regionFinder.minPlusRatio, regionFinder.maxPlusRatio, plusRatio): + if regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio): try: updateRegion(region, regionFinder.doDirectionality, regionFinder.leftPlusRatio, peak.numLeftPlus, peak.numPlus, plusRatio) regionFinder.statistics["index"] += 1 @@ -520,56 +563,88 @@ def findPeakRegions(regionFinder, hitRDS, controlRDS, chromosome, logfilename, o regionFinder.statistics["failed"] += 1 readStartPositions = [] - totalWeight = 0 + totalWeight = 0.0 uniqueReadCount = 0 reads = [] - numStarts = 0 - if badRegion: - badRegion = False - regionFinder.statistics["badRegionTrim"] += 1 + numStartsInRegion = 0 if pos not in readStartPositions: - numStarts += 1 + numStartsInRegion += 1 readStartPositions.append(pos) - weight = read["weight"] + weight = 1.0/alignedread.opt('NH') totalWeight += weight if weight == 1.0: uniqueReadCount += 1 - reads.append({"start": pos, "sense": read["sense"], "weight": weight}) + reads.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight}) previousHit = pos return allregions, outregions -def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normalize, useMulti, factor): +def getReadSense(read): + if read.is_reverse: + sense = "-" + else: + sense = "+" + + return sense + + +def doNotProcessRead(read, doMulti=False, strand="both", combine5p=False): + if read.opt('NH') > 1 and not doMulti: + return True + + if strand == "+" and read.is_reverse: + return True + + if strand == "-" and not read.is_reverse: + return True + + return False + +def getMaxCoordinate(samfile, chr, doMulti=False): + maxCoord = 0 + for alignedread in samfile.fetch(chr): + if alignedread.opt('NH') > 1: + if doMulti: + maxCoord = max(maxCoord, alignedread.pos) + else: + maxCoord = max(maxCoord, alignedread.pos) + + return maxCoord + + +def findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxCoord, chromosome, useMulti): + #TODO: this is *almost* the same calculation - there are small yet important differences print "calculating background..." previousHit = - 1 * regionFinder.maxSpacing currentHitList = [-1] - currentTotalWeight = 0 + currentTotalWeight = 0.0 currentReadList = [] backregions = [] numStarts = 0 badRegion = False - hitDict = controlRDS.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, findallOptimize=True) - maxCoord = controlRDS.getMaxCoordinate(chromosome, doMulti=useMulti) - for read in hitDict[chromosome]: - pos = read["start"] + for alignedread in controlBAM.fetch(chromosome): + if doNotProcessRead(alignedread, doMulti=useMulti): + continue + + pos = alignedread.pos if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord): lastReadPos = currentHitList[-1] lastBasePosition = lastReadPos + regionFinder.readlen - 1 - region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=factor, numReads=currentTotalWeight) - if normalize: + region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight) + if regionFinder.normalize: region.numReads /= regionFinder.controlRDSsize backregions.append(int(region.numReads)) - region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=factor, numReads=currentTotalWeight) + region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight) regionLength = lastReadPos - region.start - if regionPassesCriteria(region.numReads, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen): - numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True) - if normalize: + if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength): + numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti) + if regionFinder.normalize: numMock /= regionFinder.sampleRDSsize foldRatio = region.numReads / numMock @@ -580,19 +655,17 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normaliz if regionFinder.doTrim: try: - lastReadPos = trimRegion(region, peak, lastReadPos, 20., currentReadList, regionFinder.readlen, regionFinder.doDirectionality, - normalize, regionFinder.controlRDSsize) + lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, 20., currentReadList, regionFinder.controlRDSsize) except IndexError: badRegion = True continue - numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True) - if normalize: + numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti) + if regionFinder.normalize: numMock /= regionFinder.sampleRDSsize foldRatio = region.numReads / numMock - # just in case it changed, use latest data try: bestPos = peak.topPos[0] peakScore = peak.smoothArray[bestPos] @@ -600,16 +673,16 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normaliz continue # normalize to RPM - if normalize: + if regionFinder.normalize: peakScore /= regionFinder.controlRDSsize # check that we still pass threshold regionLength = lastReadPos - region.start - if regionPassesCriteria(region.numReads, regionFinder.minHits, foldRatio, regionFinder.minRatio, regionLength, regionFinder.readlen): - updateControlStatistics(regionFinder, peak, region.numReads, peakScore) + if regionPassesCriteria(regionFinder, region.numReads, foldRatio, regionLength): + regionFinder.updateControlStatistics(peak, region.numReads, peakScore) currentHitList = [] - currentTotalWeight = 0 + currentTotalWeight = 0.0 currentReadList = [] numStarts = 0 if badRegion: @@ -620,50 +693,55 @@ def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normaliz numStarts += 1 currentHitList.append(pos) - weight = read["weight"] + weight = 1.0/alignedread.opt('NH') currentTotalWeight += weight - currentReadList.append({"start": pos, "sense": read["sense"], "weight": weight}) + currentReadList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight}) previousHit = pos return backregions -def learnShift(regionFinder, hitDict, controlRDS, chromosome, maxCoord, logfilename, outfilename, - outfile, numStarts, stringency, normalize, useMulti, doControl): +def learnShift(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename, + outfile, useMulti, controlBAM, combine5p): print "learning shift.... will need at least 30 training sites" + stringency = regionFinder.stringency previousHit = -1 * regionFinder.maxSpacing positionList = [-1] - totalWeight = 0 + totalWeight = 0.0 readList = [] shiftDict = {} count = 0 - for read in hitDict[chromosome]: - pos = read["start"] + numStarts = 0 + for alignedread in sampleBAM.fetch(chromosome): + if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p): + continue + + pos = alignedread.pos if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord): - if normalize: + if regionFinder.normalize: totalWeight /= regionFinder.sampleRDSsize regionStart = positionList[0] regionStop = positionList[-1] regionLength = regionStop - regionStart - if regionPassesCriteria(totalWeight, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen, stringency=stringency): - foldRatio = getFoldRatio(regionFinder, controlRDS, totalWeight, chromosome, regionStart, regionStop, useMulti, doControl, normalize) + if regionPassesCriteria(regionFinder, totalWeight, numStarts, regionLength, stringency=stringency): + foldRatio = getFoldRatio(regionFinder, controlBAM, totalWeight, chromosome, regionStart, regionStop, useMulti) if foldRatio >= regionFinder.minRatio: updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen) count += 1 positionList = [] - totalWeight = 0 + totalWeight = 0.0 readList = [] if pos not in positionList: numStarts += 1 positionList.append(pos) - weight = read["weight"] + weight = 1.0/alignedread.opt('NH') totalWeight += weight - readList.append({"start": pos, "sense": read["sense"], "weight": weight}) + readList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight}) previousHit = pos outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency, @@ -674,26 +752,32 @@ def learnShift(regionFinder, hitDict, controlRDS, chromosome, maxCoord, logfilen print outline writeLog(logfilename, versionString, outfilename + outline) - regionFinder.shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename) - outline = "#picked shiftValue to be %d" % regionFinder.shiftValue + shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename) + outline = "#picked shiftValue to be %d" % shiftValue print outline print >> outfile, outline writeLog(logfilename, versionString, outfilename + outline) + return shiftValue, shiftDict + def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord): return abs(pos - previousHit) > maxSpacing or pos == maxCoord -def regionPassesCriteria(sumAll, minHits, numStarts, minRatio, regionLength, readlen, stringency=1): - return sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen +def regionPassesCriteria(regionFinder, sumAll, numStarts, regionLength, stringency=1): + minTotalReads = stringency * regionFinder.minHits + minNumReadStarts = stringency * regionFinder.minRatio + minRegionLength = stringency * regionFinder.readlen + + return sumAll >= minTotalReads and numStarts >= minNumReadStarts and regionLength > minRegionLength -def trimRegion(region, peak, regionStop, trimValue, currentReadList, readlen, doDirectionality, normalize, hitRDSsize): +def trimRegion(region, regionFinder, peak, regionStop, trimValue, currentReadList, totalReadCount): bestPos = peak.topPos[0] peakScore = peak.smoothArray[bestPos] - if normalize: - peakScore /= hitRDSsize + if regionFinder.normalize: + peakScore /= totalReadCount minSignalThresh = trimValue * peakScore start = findStartEdgePosition(peak, minSignalThresh) @@ -702,18 +786,21 @@ def trimRegion(region, peak, regionStop, trimValue, currentReadList, readlen, do regionStop = region.start + stop region.start += start - trimmedPeak = findPeak(currentReadList, region.start, regionStop - region.start, readlen, doWeight=True, leftPlus=doDirectionality, shift=peak.shift) + + trimmedPeak = findPeak(currentReadList, region.start, regionStop - region.start, regionFinder.readlen, doWeight=True, + leftPlus=regionFinder.doDirectionality, shift=peak.shift) + peak.numPlus = trimmedPeak.numPlus peak.numLeftPlus = trimmedPeak.numLeftPlus peak.topPos = trimmedPeak.topPos peak.smoothArray = trimmedPeak.smoothArray region.numReads = trimmedPeak.numHits - if normalize: - region.numReads /= hitRDSsize + if regionFinder.normalize: + region.numReads /= totalReadCount - region.stop = regionStop + readlen - 1 - + region.stop = regionStop + regionFinder.readlen - 1 + return regionStop @@ -736,10 +823,13 @@ def peakEdgeLocated(peak, position, minSignalThresh): return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0] -def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regionStop, useMulti, doControl, normalize): - if doControl: - numMock = 1. + controlRDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=useMulti, splices=False, reportCombined=True) - if normalize: +def getFoldRatio(regionFinder, controlBAM, sumAll, chromosome, regionStart, regionStop, useMulti): + """ Fold ratio calculated is total read weight over control + """ + #TODO: this needs to be generalized as there is a point at which we want to use the sampleRDS instead of controlRDS + if regionFinder.doControl: + numMock = 1. + countReadsInRegion(controlBAM, chromosome, regionStart, regionStop, countMulti=useMulti) + if regionFinder.normalize: numMock /= regionFinder.controlRDSsize foldRatio = sumAll / numMock @@ -749,6 +839,25 @@ def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regi return foldRatio +def countReadsInRegion(bamfile, chr, start, end, uniqs=True, countMulti=False, countSplices=False): + count = 0.0 + for alignedread in bamfile.fetch(chr, start, end): + if alignedread.opt('NH') > 1: + if countMulti: + if isSpliceEntry(alignedread.cigar): + if countSplices: + count += 1.0/alignedread.opt('NH') + else: + count += 1.0/alignedread.opt('NH') + elif uniqs: + if isSpliceEntry(alignedread.cigar): + if countSplices: + count += 1.0 + else: + count += 1.0 + + return count + def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen): peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto") try: @@ -770,22 +879,6 @@ def getShiftValue(shiftDict, count, logfilename, outfilename): return shiftValue -def updateControlStatistics(regionFinder, peak, sumAll, peakScore): - - plusRatio = float(peak.numPlus)/peak.numHits - if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio: - if regionFinder.doDirectionality: - if regionFinder.leftPlusRatio < peak.numLeft / peak.numPlus: - regionFinder.statistics["mIndex"] += 1 - regionFinder.statistics["mTotal"] += sumAll - else: - regionFinder.statistics["failed"] += 1 - else: - # we have a region, but didn't check for directionality - regionFinder.statistics["mIndex"] += 1 - regionFinder.statistics["mTotal"] += sumAll - - def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRatio, multiP, peakDescription, shift, doDirectionality, leftPlusRatio, numLeft, numPlus, plusRatio): @@ -810,9 +903,12 @@ def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRa return region -def setMultireadPercentage(region, hitRDS, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim): +def setMultireadPercentage(region, sampleBAM, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim): if doTrim: - sumMulti = hitRDS.getMultiCount(chromosome, region.start, lastReadPos) + sumMulti = 0.0 + for alignedread in sampleBAM.fetch(chromosome, region.start, lastReadPos): + if alignedread.opt('NH') > 1: + sumMulti += 1.0/alignedread.opt('NH') else: sumMulti = currentTotalWeight - currentUniqueCount @@ -825,21 +921,20 @@ def setMultireadPercentage(region, hitRDS, hitRDSsize, currentTotalWeight, curre except ZeroDivisionError: return - region.multiP = multiP + region.multiP = min(multiP, 100.) -def regionAndPeakPass(region, minHits, minRatio, regionLength, readlen, peakScore, minPeak, minPlusRatio, maxPlusRatio, plusRatio): +def regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio): regionPasses = False - if regionPassesCriteria(region.numReads, minHits, region.foldRatio, minRatio, regionLength, readlen): - if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio: + if regionPassesCriteria(regionFinder, region.numReads, region.foldRatio, regionLength): + #TODO: here is where the test dataset is failing + if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio: regionPasses = True return regionPasses -def updateRegion(region, - doDirectionality, leftPlusRatio, numLeft, - numPlus, plusRatio): +def updateRegion(region, doDirectionality, leftPlusRatio, numLeft, numPlus, plusRatio): if doDirectionality: if leftPlusRatio < numLeft / numPlus: @@ -849,25 +944,33 @@ def updateRegion(region, raise RegionDirectionError -def writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict, - allregions, header): - - writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict, - allregions, header, backregions=[], pValueType="self") - - -def writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict, +def writeChromosomeResults(regionFinder, outregions, outfile, shiftDict, allregions, header, backregions=[], pValueType="none"): print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"] - if doPvalue: + if regionFinder.doPvalue: if pValueType == "self": poissonmean = calculatePoissonMean(allregions) else: poissonmean = calculatePoissonMean(backregions) print header - writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=regionFinder.shiftValue, reportshift=reportshift, shiftDict=shiftDict) + for region in outregions: + if regionFinder.shiftValue == "auto" and regionFinder.reportshift: + try: + shiftDict[region.shift] += 1 + except KeyError: + shiftDict[region.shift] = 1 + + outline = getRegionString(region, regionFinder.reportshift) + + # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest + if regionFinder.doPvalue: + pValue = calculatePValue(int(region.numReads), poissonmean) + outline += "\t%1.2g" % pValue + + print outline + print >> outfile, outline def calculatePoissonMean(dataList): @@ -883,29 +986,8 @@ def calculatePoissonMean(dataList): return poissonmean -def writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=0, reportshift=False, shiftDict={}): - for region in outregions: - if shiftValue == "auto" and reportshift: - try: - shiftDict[region.shift] += 1 - except KeyError: - shiftDict[region.shift] = 1 - - outline = getRegionString(region, reportshift) - - # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest - if doPvalue: - sumAll = int(region.numReads) - pValue = calculatePValue(sumAll, poissonmean) - outline += "\t%1.2g" % pValue - - print outline - print >> outfile, outline - - def calculatePValue(sum, poissonmean): pValue = math.exp(-poissonmean) - #TODO: 798: DeprecationWarning: integer argument expected, got float - for i in xrange(sum) for i in xrange(sum): pValue *= poissonmean pValue /= i+1 @@ -922,34 +1004,9 @@ def getRegionString(region, reportShift): return outline -def getFooter(regionFinder, shiftDict, reportshift, doRevBackground): - index = regionFinder.statistics["index"] - mIndex = regionFinder.statistics["mIndex"] - footerLines = ["#stats:\t%.1f RPM in %d regions" % (regionFinder.statistics["total"], index)] - if regionFinder.doDirectionality: - footerLines.append("#\t\t%d additional regions failed directionality filter" % regionFinder.statistics["failed"]) - - if doRevBackground: - try: - percent = min(100. * (float(mIndex)/index), 100.) - except ZeroDivisionError: - percent = 0. - - footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, regionFinder.statistics["mTotal"], percent)) - - if regionFinder.shiftValue == "auto" and reportshift: - bestShift = getBestShiftInDict(shiftDict) - footerLines.append("#mode of shift values: %d" % bestShift) - - if regionFinder.statistics["badRegionTrim"] > 0: - footerLines.append("#%d regions discarded due to trimming problems" % regionFinder.statistics["badRegionTrim"]) - - return string.join(footerLines, "\n") - - def getBestShiftInDict(shiftDict): return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0] if __name__ == "__main__": - main(sys.argv) + main(sys.argv) \ No newline at end of file