X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=findall.py;h=827ee503ba7956b37143481beabae171a35f7619;hp=aa820374b994d497fbc4dd336a98b795489f5900;hb=HEAD;hpb=a3c0e60eb30c924232d7baaa348a15c5554e3864 diff --git a/findall.py b/findall.py index aa82037..827ee50 100755 --- a/findall.py +++ b/findall.py @@ -1,6 +1,6 @@ """ - usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile - [--control controlrdsfile] [--minimum minHits] [--ratio minRatio] + usage: python $ERANGEPATH/findall.py label samplebamfile regionoutfile + [--control controlbamfile] [--minimum minHits] [--ratio minRatio] [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num] [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction] [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw] @@ -49,14 +49,223 @@ import sys import math import string import optparse -from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption -import ReadDataset +import operator +import pysam +from commoncode import writeLog, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption, isSpliceEntry import Region -versionString = "findall: version 3.2" +versionString = "findall: version 3.2.1" print versionString +class RegionDirectionError(Exception): + pass + + +class RegionFinder(): + def __init__(self, label, minRatio=4.0, minPeak=0.5, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, strandfilter="", + minHits=4.0, trimValue=0.1, doTrim=True, doDirectionality=True, shiftValue=0, maxSpacing=50, withFlag="", + normalize=True, listPeak=False, reportshift=False, stringency=1.0, controlfile=None, doRevBackground=False): + + self.statistics = {"index": 0, + "total": 0, + "mIndex": 0, + "mTotal": 0, + "failed": 0, + "badRegionTrim": 0 + } + + self.regionLabel = label + self.rnaSettings = False + self.controlRDSsize = 1.0 + self.sampleRDSsize = 1.0 + self.minRatio = minRatio + self.minPeak = minPeak + self.leftPlusRatio = leftPlusRatio + self.stranded = "both" + if strandfilter == "plus": + self.stranded = "+" + minPlusRatio = 0.9 + maxPlusRatio = 1.0 + elif strandfilter == "minus": + self.stranded = "-" + minPlusRatio = 0.0 + maxPlusRatio = 0.1 + + if minRatio < minPeak: + self.minPeak = minRatio + + self.minPlusRatio = minPlusRatio + self.maxPlusRatio = maxPlusRatio + self.strandfilter = strandfilter + self.minHits = minHits + self.trimValue = trimValue + self.doTrim = doTrim + self.doDirectionality = doDirectionality + + if self.doTrim: + self.trimString = string.join(["%2.1f" % (100. * self.trimValue), "%"], "") + else: + self.trimString = "none" + + self.shiftValue = shiftValue + self.maxSpacing = maxSpacing + self.withFlag = withFlag + self.normalize = normalize + self.listPeak = listPeak + self.reportshift = reportshift + self.stringency = max(stringency, 1.0) + self.controlfile = controlfile + self.doControl = self.controlfile is not None + self.doPvalue = False + self.doRevBackground = doRevBackground + + + def useRNASettings(self, readlen): + self.rnaSettings = True + self.shiftValue = 0 + self.doTrim = False + self.doDirectionality = False + self.maxSpacing = readlen + + + def getHeader(self): + if self.normalize: + countType = "RPM" + else: + countType = "COUNT" + + headerFields = ["#regionID\tchrom\tstart\tstop", countType, "fold\tmulti%"] + + if self.doDirectionality: + headerFields.append("plus%\tleftPlus%") + + if self.listPeak: + headerFields.append("peakPos\tpeakHeight") + + if self.reportshift: + headerFields.append("readShift") + + if self.doPvalue: + headerFields.append("pValue") + + return string.join(headerFields, "\t") + + + def printSettings(self, ptype, useMulti, pValueType): + print + self.printStatusMessages(ptype, useMulti) + self.printOptionsSummary(useMulti, pValueType) + + + def printStatusMessages(self, ptype, useMulti): + if self.shiftValue == "learn": + print "Will try to learn shift" + + if self.normalize: + print "Normalizing to RPM" + + if self.doRevBackground: + print "Swapping IP and background to calculate FDR" + + if ptype != "": + if ptype in ["NONE", "SELF"]: + pass + elif ptype == "BACK": + if self.doControl and self.doRevBackground: + pass + else: + print "must have a control dataset and -revbackground for pValue type 'back'" + else: + print "could not use pValue type : %s" % ptype + + if self.withFlag != "": + print "restrict to flag = %s" % self.withFlag + + if not useMulti: + print "using unique reads only" + + if self.rnaSettings: + print "using settings appropriate for RNA: -nodirectionality -notrim -noshift" + + if self.strandfilter == "plus": + print "only analyzing reads on the plus strand" + elif self.strandfilter == "minus": + print "only analyzing reads on the minus strand" + + + def printOptionsSummary(self, useMulti, pValueType): + + print "\nenforceDirectionality=%s listPeak=%s nomulti=%s " % (self.doDirectionality, self.listPeak, not useMulti) + print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded) + try: + print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType) + except: + print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType) + + + def getAnalysisDescription(self, hitfile, useMulti, pValueType): + + description = ["#ERANGE %s" % versionString] + if self.doControl: + description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, self.controlfile, self.controlRDSsize)) + else: + description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize)) + + if self.withFlag != "": + description.append("#restrict to Flag = %s" % self.withFlag) + + description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s" % (self.doDirectionality, self.listPeak, not useMulti)) + description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)) + try: + description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)) + except: + description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)) + + return string.join(description, "\n") + + + def getFooter(self, bestShift): + index = self.statistics["index"] + mIndex = self.statistics["mIndex"] + footerLines = ["#stats:\t%.1f RPM in %d regions" % (self.statistics["total"], index)] + if self.doDirectionality: + footerLines.append("#\t\t%d additional regions failed directionality filter" % self.statistics["failed"]) + + if self.doRevBackground: + try: + percent = min(100. * (float(mIndex)/index), 100.) + except ZeroDivisionError: + percent = 0. + + footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, self.statistics["mTotal"], percent)) + + if self.shiftValue == "auto" and self.reportshift: + + footerLines.append("#mode of shift values: %d" % bestShift) + + if self.statistics["badRegionTrim"] > 0: + footerLines.append("#%d regions discarded due to trimming problems" % self.statistics["badRegionTrim"]) + + return string.join(footerLines, "\n") + + + def updateControlStatistics(self, peak, sumAll, peakScore): + + plusRatio = float(peak.numPlus)/peak.numHits + if peakScore >= self.minPeak and self.minPlusRatio <= plusRatio <= self.maxPlusRatio: + if self.doDirectionality: + if self.leftPlusRatio < peak.numLeftPlus / peak.numPlus: + self.statistics["mIndex"] += 1 + self.statistics["mTotal"] += sumAll + else: + self.statistics["failed"] += 1 + else: + # we have a region, but didn't check for directionality + self.statistics["mIndex"] += 1 + self.statistics["mTotal"] += sumAll + + def usage(): print __doc__ @@ -76,20 +285,43 @@ def main(argv=None): hitfile = args[1] outfilename = args[2] - findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift, - options.stringency, options.noshift, options.autoshift, options.reportshift, - options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak, - options.normalize, options.logfilename, options.withFlag, options.doDirectionality, - options.trimValue, options.doTrim, options.doAppend, options.rnaSettings, - options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti, - options.strandfilter, options.combine5p) + shiftValue = 0 + + if options.autoshift: + shiftValue = "auto" + + if options.shift is not None: + try: + shiftValue = int(options.shift) + except ValueError: + if options.shift == "learn": + shiftValue = "learn" + + if options.noshift: + shiftValue = 0 + + if options.doAppend: + outputMode = "a" + else: + outputMode = "w" + + regionFinder = RegionFinder(factor, minRatio=options.minRatio, minPeak=options.minPeak, minPlusRatio=options.minPlusRatio, + maxPlusRatio=options.maxPlusRatio, leftPlusRatio=options.leftPlusRatio, strandfilter=options.strandfilter, + minHits=options.minHits, trimValue=options.trimValue, doTrim=options.doTrim, + doDirectionality=options.doDirectionality, shiftValue=shiftValue, maxSpacing=options.maxSpacing, + withFlag=options.withFlag, normalize=options.normalize, listPeak=options.listPeak, + reportshift=options.reportshift, stringency=options.stringency, controlfile=options.controlfile, + doRevBackground=options.doRevBackground) + + findall(regionFinder, hitfile, outfilename, options.logfilename, outputMode, options.rnaSettings, + options.ptype, options.useMulti, options.combine5p) def makeParser(): usage = __doc__ parser = optparse.OptionParser(usage=usage) - parser.add_option("--control", dest="mockfile") + parser.add_option("--control", dest="controlfile") parser.add_option("--minimum", type="float", dest="minHits") parser.add_option("--ratio", type="float", dest="minRatio") parser.add_option("--spacing", type="int", dest="maxSpacing") @@ -99,7 +331,7 @@ def makeParser(): parser.add_option("--noshift", action="store_true", dest="noShift") parser.add_option("--autoshift", action="store_true", dest="autoshift") parser.add_option("--reportshift", action="store_true", dest="reportshift") - parser.add_option("--nomulti", action="store_true", dest="noMulti") + parser.add_option("--nomulti", action="store_false", dest="useMulti") parser.add_option("--minPlus", type="float", dest="minPlusRatio") parser.add_option("--maxPlus", type="float", dest="maxPlusRatio") parser.add_option("--leftPlus", type="float", dest="leftPlusRatio") @@ -137,16 +369,16 @@ def makeParser(): logfilename = getConfigOption(configParser, section, "logfilename", "findall.log") withFlag = getConfigOption(configParser, section, "withFlag", "") doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True) - trimValue = getConfigOption(configParser, section, "trimValue", None) + trimValue = getConfigFloatOption(configParser, section, "trimValue", 0.1) doTrim = getConfigBoolOption(configParser, section, "doTrim", True) doAppend = getConfigBoolOption(configParser, section, "doAppend", False) rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False) cachePages = getConfigOption(configParser, section, "cachePages", None) - ptype = getConfigOption(configParser, section, "ptype", None) - mockfile = getConfigOption(configParser, section, "mockfile", None) + ptype = getConfigOption(configParser, section, "ptype", "") + controlfile = getConfigOption(configParser, section, "controlfile", None) doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False) - noMulti = getConfigBoolOption(configParser, section, "noMulti", False) - strandfilter = getConfigOption(configParser, section, "strandfilter", None) + useMulti = getConfigBoolOption(configParser, section, "useMulti", True) + strandfilter = getConfigOption(configParser, section, "strandfilter", "") combine5p = getConfigBoolOption(configParser, section, "combine5p", False) parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift, @@ -154,611 +386,627 @@ def makeParser(): minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak, normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality, trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings, - cachePages=cachePages, ptype=ptype, mockfile=mockfile, doRevBackground=doRevBackground, noMulti=noMulti, + cachePages=cachePages, ptype=ptype, controlfile=controlfile, doRevBackground=doRevBackground, useMulti=useMulti, strandfilter=strandfilter, combine5p=combine5p) return parser -def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None, - stringency=4.0, noshift=False, autoshift=False, reportshift=False, - minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5, - normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True, - trimValue=None, doTrim=True, doAppend=False, rnaSettings=False, - cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False, - strandfilter=None, combine5p=False): +def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False, + ptype="", useMulti=True, combine5p=False): - shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings) - doControl = False - if mockfile is not None: - doControl = True + writeLog(logfilename, versionString, string.join(sys.argv[1:])) + controlBAM = None + if regionFinder.doControl: + print "\ncontrol:" + controlBAM = pysam.Samfile(regionFinder.controlfile, "rb") + regionFinder.controlRDSsize = int(getHeaderComment(controlBAM.header, "Total")) / 1000000. - if doRevBackground: - print "Swapping IP and background to calculate FDR" - pValueType = "back" + print "\nsample:" + sampleBAM = pysam.Samfile(hitfile, "rb") + regionFinder.sampleRDSsize = int(getHeaderComment(sampleBAM.header, "Total")) / 1000000. + pValueType = getPValueType(ptype, regionFinder.doControl, regionFinder.doRevBackground) + regionFinder.doPvalue = not pValueType == "none" + regionFinder.readlen = int(getHeaderComment(sampleBAM.header, "ReadLength")) + if rnaSettings: + regionFinder.useRNASettings(regionFinder.readlen) + + regionFinder.printSettings(ptype, useMulti, pValueType) + outfile = open(outfilename, outputMode) + header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType) + shiftDict = {} + chromList = getChromosomeListToProcess(sampleBAM, controlBAM) + for chromosome in chromList: + #TODO: Really? Use first chr shift value for all of them + maxSampleCoord = getMaxCoordinate(sampleBAM, chromosome, doMulti=useMulti) + if regionFinder.shiftValue == "learn": + regionFinder.shiftValue = learnShift(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p) + + allregions, outregions = findPeakRegions(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p) + if regionFinder.doRevBackground: + maxControlCoord = getMaxCoordinate(controlBAM, chromosome, doMulti=useMulti) + backregions = findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxControlCoord, chromosome, useMulti) + else: + backregions = [] + pValueType = "self" + + writeChromosomeResults(regionFinder, outregions, outfile, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType) + + try: + bestShift = getBestShiftInDict(shiftDict) + except ValueError: + bestShift = 0 + + footer = regionFinder.getFooter(bestShift) + print footer + print >> outfile, footer + outfile.close() + writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1]) + + +def getHeaderComment(bamHeader, commentKey): + for comment in bamHeader["CO"]: + fields = comment.split("\t") + if fields[0] == commentKey: + return fields[1] + + raise KeyError - doPvalue = True - if ptype is not None: - ptype = ptype.upper() + +def getPValueType(ptype, doControl, doRevBackground): + pValueType = "self" + if ptype in ["NONE", "SELF", "BACK"]: if ptype == "NONE": - doPvalue = False pValueType = "none" - p = 1 - poissonmean = 0 elif ptype == "SELF": pValueType = "self" elif ptype == "BACK": if doControl and doRevBackground: pValueType = "back" - else: - print "must have a control dataset and -revbackground for pValue type 'back'" - else: - print "could not use pValue type : %s" % ptype - else: - pValueType = "self" + elif doRevBackground: + pValueType = "back" - if withFlag != "": - print "restrict to flag = %s" % withFlag + return pValueType - useMulti = True - if noMulti: - print "using unique reads only" - useMulti = False - if rnaSettings: - print "using settings appropriate for RNA: -nodirectionality -notrim -noshift" - doTrim = False - doDirectionality = False +def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType): + print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, pValueType) + header = regionFinder.getHeader() + print >> outfile, header - stranded = "" - if strandfilter is not None: - if strandfilter == "plus": - stranded = "+" - minPlusRatio = 0.9 - maxPlusRatio = 1.0 - print "only analyzing reads on the plus strand" - elif strandfilter == "minus": - stranded = "-" - minPlusRatio = 0.0 - maxPlusRatio = 0.1 - print "only analyzing reads on the minus strand" + return header - stringency = max(stringency, 1.0) - writeLog(logfilename, versionString, string.join(sys.argv[1:])) - if cachePages is not None: - doCache = True - else: - doCache = False - cachePages = -1 - mockRDS = None - if doControl: - print "\ncontrol:" - mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache) - - if cachePages > mockRDS.getDefaultCacheSize(): - mockRDS.setDBcache(cachePages) - - print "\nsample:" - hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) - readlen = hitRDS.getReadSize() - if rnaSettings: - maxSpacing = readlen +def getChromosomeListToProcess(sampleBAM, controlBAM=None): + if controlBAM is not None: + chromosomeList = [chrom for chrom in sampleBAM.references if chrom in controlBAM.references and chrom != "chrM"] + else: + chromosomeList = [chrom for chrom in sampleBAM.references if chrom != "chrM"] - if cachePages > hitRDS.getDefaultCacheSize(): - hitRDS.setDBcache(cachePages) + return chromosomeList - if trimValue is not None: - trimValue = float(trimValue) / 100. - trimString = "%2.1f%s" % ((100. * trimValue), "%") - else: - trimValue = 0.1 - trimString = "10%" - if not doTrim: - trimString = "none" +def findPeakRegions(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename, + outfile, useMulti, controlBAM, combine5p): - if doAppend: - fileMode = "a" - else: - fileMode = "w" + outregions = [] + allregions = [] + print "chromosome %s" % (chromosome) + previousHit = - 1 * regionFinder.maxSpacing + readStartPositions = [-1] + totalWeight = 0.0 + uniqueReadCount = 0 + reads = [] + numStartsInRegion = 0 + + for alignedread in sampleBAM.fetch(chromosome): + if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p): + continue - outfile = open(outfilename, fileMode) - outfile.write("#ERANGE %s\n" % versionString) - if doControl: - mockRDSsize = len(mockRDS) / 1000000. - controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize) - else: - mockRDSsize = 0 - controlSampleString = " none" + pos = alignedread.pos + if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord): + lastReadPos = readStartPositions[-1] + lastBasePosition = lastReadPos + regionFinder.readlen - 1 + newRegionIndex = regionFinder.statistics["index"] + 1 + if regionFinder.doDirectionality: + region = Region.DirectionalRegion(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=regionFinder.regionLabel, + numReads=totalWeight) + else: + region = Region.Region(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=regionFinder.regionLabel, numReads=totalWeight) - hitRDSsize = len(hitRDS) / 1000000. - outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:%s\n" % (hitfile, hitRDSsize, controlSampleString)) + if regionFinder.normalize: + region.numReads /= regionFinder.sampleRDSsize - if withFlag != "": - outfile.write("#restrict to Flag = %s\n" % withFlag) + allregions.append(int(region.numReads)) + regionLength = lastReadPos - region.start + if regionPassesCriteria(regionFinder, region.numReads, numStartsInRegion, regionLength): + region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti) - print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache) - print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded) - print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType) + if region.foldRatio >= regionFinder.minRatio: + # first pass, with absolute numbers + peak = findPeak(reads, region.start, regionLength, regionFinder.readlen, doWeight=True, leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue) + if regionFinder.doTrim: + try: + lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.sampleRDSsize) + except IndexError: + regionFinder.statistics["badRegionTrim"] += 1 + continue - outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache)) - outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)) - outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)) - if normalize: - print "Normalizing to RPM" - countLabel = "RPM" - else: - countLabel = "COUNT" + region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti) - headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"] - if doDirectionality: - headerList.append("plus%\tleftPlus%") + try: + bestPos = peak.topPos[0] + peakScore = peak.smoothArray[bestPos] + if regionFinder.normalize: + peakScore /= regionFinder.sampleRDSsize + except (IndexError, AttributeError, ZeroDivisionError): + continue - if listPeak: - headerList.append("peakPos\tpeakHeight") + if regionFinder.listPeak: + region.peakDescription = "%d\t%.1f" % (region.start + bestPos, peakScore) - if reportshift: - headerList.append("readShift") + if useMulti: + setMultireadPercentage(region, sampleBAM, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos, + regionFinder.normalize, regionFinder.doTrim) - if doPvalue: - headerList.append("pValue") + region.shift = peak.shift + # check that we still pass threshold + regionLength = lastReadPos - region.start + plusRatio = float(peak.numPlus)/peak.numHits + if regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio): + try: + updateRegion(region, regionFinder.doDirectionality, regionFinder.leftPlusRatio, peak.numLeftPlus, peak.numPlus, plusRatio) + regionFinder.statistics["index"] += 1 + outregions.append(region) + regionFinder.statistics["total"] += region.numReads + except RegionDirectionError: + regionFinder.statistics["failed"] += 1 + + readStartPositions = [] + totalWeight = 0.0 + uniqueReadCount = 0 + reads = [] + numStartsInRegion = 0 + + if pos not in readStartPositions: + numStartsInRegion += 1 + + readStartPositions.append(pos) + weight = 1.0/alignedread.opt('NH') + totalWeight += weight + if weight == 1.0: + uniqueReadCount += 1 - headline = string.join(headerList, "\t") - print >> outfile, headline + reads.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight}) + previousHit = pos - statistics = {"index": 0, - "total": 0, - "mIndex": 0, - "mTotal": 0, - "failed": 0 - } + return allregions, outregions - if minRatio < minPeak: - minPeak = minRatio - hitChromList = hitRDS.getChromosomes() - if doControl: - mockChromList = mockRDS.getChromosomes() +def getReadSense(read): + if read.is_reverse: + sense = "-" else: - mockChromList = [] - - mockSampleSize = 0 - hitSampleSize = 0 - if normalize: - if doControl: - mockSampleSize = mockRDSsize + sense = "+" - hitSampleSize = hitRDSsize + return sense - hitChromList.sort() - for chromosome in hitChromList: - if doNotProcessChromosome(chromosome, doControl, mockChromList): - continue +def doNotProcessRead(read, doMulti=False, strand="both", combine5p=False): + if read.opt('NH') > 1 and not doMulti: + return True - print "chromosome %s" % (chromosome) - hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, - doMulti=useMulti, findallOptimize=True, strand=stranded, - combine5p=combine5p) - maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti) - if shiftValue == "learn": - shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize, - mockSampleSize, minRatio, maxSpacing, maxCoord, stringency, readlen, minHits, - logfilename, outfile, outfilename) - - regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize, - chromosome, useMulti, normalize, maxSpacing, - doDirectionality, doTrim, minHits, minRatio, - readlen, shiftValue, minPeak, minPlusRatio, - maxPlusRatio, leftPlusRatio, listPeak, noMulti, - doControl, factor, trimValue, outputRegionList=True) - - statistics["index"] += regionStats["index"] - statistics["total"] += regionStats["total"] - statistics["failed"] += regionStats["failed"] - if not doRevBackground: - if doPvalue: - p, poissonmean = calculatePValue(allRegionWeights) - - print headline - shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue) - continue + if strand == "+" and read.is_reverse: + return True - #now do background swapping the two samples around - if mockRDS is not None: - print "calculating background..." - backgroundTrimValue = 1/20. - backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, - chromosome, useMulti, normalize, maxSpacing, - doDirectionality, doTrim, minHits, minRatio, - readlen, shiftValue, minPeak, minPlusRatio, - maxPlusRatio, leftPlusRatio, listPeak, noMulti, - doControl, factor, backgroundTrimValue) - - statistics["mIndex"] += backgroundRegionStats["index"] - statistics["mTotal"] += backgroundRegionStats["total"] - statistics["failed"] += backgroundRegionStats["failed"] - print statistics["mIndex"], statistics["mTotal"] - if doPvalue: - if pValueType == "self": - p, poissonmean = calculatePValue(allRegionWeights) - else: - p, poissonmean = calculatePValue(backgroundRegionWeights) + if strand == "-" and not read.is_reverse: + return True + + return False - print headline - shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue) - footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue) - print footer - outfile.write(footer) - outfile.close() +def getMaxCoordinate(samfile, chr, doMulti=False): + maxCoord = 0 + for alignedread in samfile.fetch(chr): + if alignedread.opt('NH') > 1: + if doMulti: + maxCoord = max(maxCoord, alignedread.pos) + else: + maxCoord = max(maxCoord, alignedread.pos) - writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | "))) + return maxCoord -def determineShiftValue(autoshift, shift, noshift, rnaSettings): - shiftValue = 0 - if autoshift: - shiftValue = "auto" +def findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxCoord, chromosome, useMulti): + #TODO: this is *almost* the same calculation - there are small yet important differences + print "calculating background..." + previousHit = - 1 * regionFinder.maxSpacing + currentHitList = [-1] + currentTotalWeight = 0.0 + currentReadList = [] + backregions = [] + numStarts = 0 + badRegion = False + for alignedread in controlBAM.fetch(chromosome): + if doNotProcessRead(alignedread, doMulti=useMulti): + continue - if shift is not None: - try: - shiftValue = int(shift) - except ValueError: - if shift == "learn": - shiftValue = "learn" - print "Will try to learn shift" + pos = alignedread.pos + if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord): + lastReadPos = currentHitList[-1] + lastBasePosition = lastReadPos + regionFinder.readlen - 1 + region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight) + if regionFinder.normalize: + region.numReads /= regionFinder.controlRDSsize + + backregions.append(int(region.numReads)) + region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight) + regionLength = lastReadPos - region.start + if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength): + numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti) + if regionFinder.normalize: + numMock /= regionFinder.sampleRDSsize + + foldRatio = region.numReads / numMock + if foldRatio >= regionFinder.minRatio: + # first pass, with absolute numbers + peak = findPeak(currentReadList, region.start, lastReadPos - region.start, regionFinder.readlen, doWeight=True, + leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue) - if noshift or rnaSettings: - shiftValue = 0 + if regionFinder.doTrim: + try: + lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, 20., currentReadList, regionFinder.controlRDSsize) + except IndexError: + badRegion = True + continue - return shiftValue + numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti) + if regionFinder.normalize: + numMock /= regionFinder.sampleRDSsize + foldRatio = region.numReads / numMock -def doNotProcessChromosome(chromosome, doControl, mockChromList): - skipChromosome = False - if chromosome == "chrM": - skipChromosome = True + try: + bestPos = peak.topPos[0] + peakScore = peak.smoothArray[bestPos] + except IndexError: + continue - if doControl and (chromosome not in mockChromList): - skipChromosome = True + # normalize to RPM + if regionFinder.normalize: + peakScore /= regionFinder.controlRDSsize - return skipChromosome + # check that we still pass threshold + regionLength = lastReadPos - region.start + if regionPassesCriteria(regionFinder, region.numReads, foldRatio, regionLength): + regionFinder.updateControlStatistics(peak, region.numReads, peakScore) + currentHitList = [] + currentTotalWeight = 0.0 + currentReadList = [] + numStarts = 0 + if badRegion: + badRegion = False + regionFinder.statistics["badRegionTrim"] += 1 -def calculatePValue(dataList): - dataList.sort() - listSize = float(len(dataList)) - try: - poissonmean = sum(dataList) / listSize - except ZeroDivisionError: - poissonmean = 0 + if pos not in currentHitList: + numStarts += 1 - print "Poisson n=%d, p=%f" % (listSize, poissonmean) - p = math.exp(-poissonmean) + currentHitList.append(pos) + weight = 1.0/alignedread.opt('NH') + currentTotalWeight += weight + currentReadList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight}) + previousHit = pos - return p, poissonmean + return backregions -def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord, - stringency, readlen, minHits, logfilename, outfile, outfilename, minSites=30): +def learnShift(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename, + outfile, useMulti, controlBAM, combine5p): - print "learning shift.... will need at least %d training sites" % minSites - previousHit = -1 * maxSpacing - hitList = [-1] - totalWeight = 0 + print "learning shift.... will need at least 30 training sites" + stringency = regionFinder.stringency + previousHit = -1 * regionFinder.maxSpacing + positionList = [-1] + totalWeight = 0.0 readList = [] shiftDict = {} count = 0 numStarts = 0 - for read in hitDict[chrom]: - pos = read["start"] - sense = read["sense"] - weight = read["weight"] - if abs(pos - previousHit) > maxSpacing or pos == maxCoord: - sumAll = totalWeight - if normalize: - sumAll /= hitSampleSize - - regionStart = hitList[0] - regionStop = hitList[-1] - regionLength = regionStop - regionStart - # we're going to require stringent settings - if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen: - foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio) + for alignedread in sampleBAM.fetch(chromosome): + if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p): + continue - if foldRatio >= minRatio: - localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True) - try: - shiftDict[localshift] += 1 - except KeyError: - shiftDict[localshift] = 1 + pos = alignedread.pos + if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord): + if regionFinder.normalize: + totalWeight /= regionFinder.sampleRDSsize + regionStart = positionList[0] + regionStop = positionList[-1] + regionLength = regionStop - regionStart + if regionPassesCriteria(regionFinder, totalWeight, numStarts, regionLength, stringency=stringency): + foldRatio = getFoldRatio(regionFinder, controlBAM, totalWeight, chromosome, regionStart, regionStop, useMulti) + if foldRatio >= regionFinder.minRatio: + updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen) count += 1 - hitList = [] - totalWeight = 0 + positionList = [] + totalWeight = 0.0 readList = [] - numStarts = 0 - if pos not in hitList: + if pos not in positionList: numStarts += 1 - hitList.append(pos) + positionList.append(pos) + weight = 1.0/alignedread.opt('NH') totalWeight += weight - readList.append({"start": pos, "sense": sense, "weight": weight}) + readList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight}) previousHit = pos - bestShift = 0 - bestCount = 0 - learningSettings = ["#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d" % (stringency, stringency * minHits, - stringency * minRatio, stringency * readlen), - "#number of training examples: %d" % count] - outline = string.join(learningSettings, "\n") - print outline - writeLog(logfilename, versionString, "%s%s" % (outfilename, outline)) - if count < minSites: - outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold" - print >> outfile, outline - writeLog(logfilename, versionString, "%s%s" % (outfilename, outline)) - shiftValue = 0 - else: - for shift in sorted(shiftDict): - if shiftDict[shift] > bestCount: - bestShift = shift - bestCount = shiftDict[shift] - - shiftValue = bestShift - print shiftDict + outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency, + stringency * regionFinder.minHits, + stringency * regionFinder.minRatio, + stringency * regionFinder.readlen, + count) + print outline + writeLog(logfilename, versionString, outfilename + outline) + shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename) outline = "#picked shiftValue to be %d" % shiftValue print outline print >> outfile, outline - writeLog(logfilename, versionString, "%s%s" % (outfilename, outline)) + writeLog(logfilename, versionString, outfilename + outline) + + return shiftValue, shiftDict + + +def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord): + return abs(pos - previousHit) > maxSpacing or pos == maxCoord + + +def regionPassesCriteria(regionFinder, sumAll, numStarts, regionLength, stringency=1): + minTotalReads = stringency * regionFinder.minHits + minNumReadStarts = stringency * regionFinder.minRatio + minRegionLength = stringency * regionFinder.readlen + + return sumAll >= minTotalReads and numStarts >= minNumReadStarts and regionLength > minRegionLength + + +def trimRegion(region, regionFinder, peak, regionStop, trimValue, currentReadList, totalReadCount): + bestPos = peak.topPos[0] + peakScore = peak.smoothArray[bestPos] + if regionFinder.normalize: + peakScore /= totalReadCount + + minSignalThresh = trimValue * peakScore + start = findStartEdgePosition(peak, minSignalThresh) + regionEndPoint = regionStop - region.start - 1 + stop = findStopEdgePosition(peak, regionEndPoint, minSignalThresh) + + regionStop = region.start + stop + region.start += start + + trimmedPeak = findPeak(currentReadList, region.start, regionStop - region.start, regionFinder.readlen, doWeight=True, + leftPlus=regionFinder.doDirectionality, shift=peak.shift) + + peak.numPlus = trimmedPeak.numPlus + peak.numLeftPlus = trimmedPeak.numLeftPlus + peak.topPos = trimmedPeak.topPos + peak.smoothArray = trimmedPeak.smoothArray + + region.numReads = trimmedPeak.numHits + if regionFinder.normalize: + region.numReads /= totalReadCount + + region.stop = regionStop + regionFinder.readlen - 1 + + return regionStop + + +def findStartEdgePosition(peak, minSignalThresh): + start = 0 + while not peakEdgeLocated(peak, start, minSignalThresh): + start += 1 + + return start + + +def findStopEdgePosition(peak, stop, minSignalThresh): + while not peakEdgeLocated(peak, stop, minSignalThresh): + stop -= 1 + + return stop - return shiftValue +def peakEdgeLocated(peak, position, minSignalThresh): + return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0] -def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio): - if doControl and rds is not None: - foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll) + +def getFoldRatio(regionFinder, controlBAM, sumAll, chromosome, regionStart, regionStop, useMulti): + """ Fold ratio calculated is total read weight over control + """ + #TODO: this needs to be generalized as there is a point at which we want to use the sampleRDS instead of controlRDS + if regionFinder.doControl: + numMock = 1. + countReadsInRegion(controlBAM, chromosome, regionStart, regionStop, countMulti=useMulti) + if regionFinder.normalize: + numMock /= regionFinder.controlRDSsize + + foldRatio = sumAll / numMock else: - foldRatio = minRatio + foldRatio = regionFinder.minRatio return foldRatio -def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll): - numMock = 1. + rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True) - if normalize: - numMock /= sampleSize +def countReadsInRegion(bamfile, chr, start, end, uniqs=True, countMulti=False, countSplices=False): + count = 0.0 + for alignedread in bamfile.fetch(chr, start, end): + if alignedread.opt('NH') > 1: + if countMulti: + if isSpliceEntry(alignedread.cigar): + if countSplices: + count += 1.0/alignedread.opt('NH') + else: + count += 1.0/alignedread.opt('NH') + elif uniqs: + if isSpliceEntry(alignedread.cigar): + if countSplices: + count += 1.0 + else: + count += 1.0 - foldRatio = sumAll / numMock + return count - return foldRatio +def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen): + peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto") + try: + shiftDict[peak.shift] += 1 + except KeyError: + shiftDict[peak.shift] = 1 -def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti, - normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen, - shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak, - noMulti, doControl, factor, trimValue, outputRegionList=False): +def getShiftValue(shiftDict, count, logfilename, outfilename): + if count < 30: + outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold" + print outline + writeLog(logfilename, versionString, outfilename + outline) + shiftValue = 0 + else: + shiftValue = getBestShiftInDict(shiftDict) + print shiftDict - index = 0 - totalRegionWeight = 0 - failedCounter = 0 - previousHit = - 1 * maxSpacing - currentHitList = [-1] - currentTotalWeight = 0 - currentUniqReadCount = 0 - currentReadList = [] - regionWeights = [] - outregions = [] - numStarts = 0 - hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True) - maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti) - for read in hitDict[chrom]: - pos = read["start"] - sense = read["sense"] - weight = read["weight"] - if abs(pos - previousHit) > maxSpacing or pos == maxCoord: - sumAll = currentTotalWeight - if normalizeToRPM: - sumAll /= rdsSampleSize - - regionStart = currentHitList[0] - regionStop = currentHitList[-1] - regionWeights.append(int(sumAll)) - if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen: - sumMulti = 0. - foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio) - if foldRatio >= minRatio: - # first pass, with absolute numbers - peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue) - - bestPos = peak.topPos[0] - numHits = peak.numHits - peakScore = peak.smoothArray[bestPos] - numPlus = peak.numPlus - shift = peak.shift - numLeft = peak.numLeftPlus - if normalizeToRPM: - peakScore /= rdsSampleSize - - if doTrim: - minSignalThresh = trimValue * peakScore - start = 0 - stop = regionStop - regionStart - 1 - startFound = False - while not startFound: - if peak.smoothArray[start] >= minSignalThresh or start == bestPos: - startFound = True - else: - start += 1 - - stopFound = False - while not stopFound: - if peak.smoothArray[stop] >= minSignalThresh or stop == bestPos: - stopFound = True - else: - stop -= 1 - - regionStop = regionStart + stop - regionStart += start - trimPeak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift) - - sumAll = trimPeak.numHits - numPlus = trimPeak.numPlus - numLeft = trimPeak.numLeftPlus - if normalizeToRPM: - sumAll /= rdsSampleSize - - foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio) - if outputRegionList: - sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True) - # just in case it changed, use latest data - try: - bestPos = trimPeak.topPos[0] - peakScore = trimPeak.smoothArray[bestPos] - except: - continue + return shiftValue - if normalizeToRPM: - peakScore /= rdsSampleSize - elif outputRegionList: - sumMulti = currentTotalWeight - currentUniqReadCount +def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRatio, multiP, + peakDescription, shift, doDirectionality, leftPlusRatio, numLeft, + numPlus, plusRatio): - if outputRegionList: - if normalizeToRPM: - sumMulti /= rdsSampleSize + if doDirectionality: + if leftPlusRatio < numLeft / numPlus: + plusP = plusRatio * 100. + leftP = 100. * numLeft / numPlus + # we have a region that passes all criteria + region = Region.DirectionalRegion(regionStart, regionStop, + factor, index, chromosome, sumAll, + foldRatio, multiP, plusP, leftP, + peakDescription, shift) - try: - multiP = 100. * (sumMulti / sumAll) - except: - break + else: + raise RegionDirectionError + else: + # we have a region, but didn't check for directionality + region = Region.Region(regionStart, regionStop, factor, index, chromosome, + sumAll, foldRatio, multiP, peakDescription, shift) - if noMulti: - multiP = 0. + return region - # check that we still pass threshold - if sumAll >= minHits and foldRatio >= minRatio and (regionStop - regionStart) > readlen: - plusRatio = float(numPlus)/numHits - if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio: - if outputRegionList: - peakDescription = "" - if listPeak: - peakDescription = "\t%d\t%.1f" % (regionStart + bestPos, peakScore) - - if doDirectionality: - if leftPlusRatio < numLeft / numPlus: - index += 1 - if outputRegionList: - plusP = plusRatio * 100. - leftP = 100. * numLeft / numPlus - # we have a region that passes all criteria - region = Region.DirectionalRegion(regionStart, regionStop + readlen - 1, - factor, index, chrom, sumAll, - foldRatio, multiP, plusP, leftP, - peakDescription, shift) - outregions.append(region) - - totalRegionWeight += sumAll - else: - failedCounter += 1 - else: - # we have a region, but didn't check for directionality - index += 1 - totalRegionWeight += sumAll - if outputRegionList: - region = Region.Region(regionStart, regionStop + readlen - 1, factor, index, chrom, - sumAll, foldRatio, multiP, peakDescription, shift) - outregions.append(region) - currentHitList = [] - currentTotalWeight = 0 - currentUniqReadCount = 0 - currentReadList = [] - numStarts = 0 +def setMultireadPercentage(region, sampleBAM, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim): + if doTrim: + sumMulti = 0.0 + for alignedread in sampleBAM.fetch(chromosome, region.start, lastReadPos): + if alignedread.opt('NH') > 1: + sumMulti += 1.0/alignedread.opt('NH') + else: + sumMulti = currentTotalWeight - currentUniqueCount - if pos not in currentHitList: - numStarts += 1 + # normalize to RPM + if normalize: + sumMulti /= hitRDSsize - currentHitList.append(pos) - currentTotalWeight += weight - if weight == 1.0: - currentUniqReadCount += 1 + try: + multiP = 100. * (sumMulti / region.numReads) + except ZeroDivisionError: + return - currentReadList.append({"start": pos, "sense": sense, "weight": weight}) - previousHit = pos + region.multiP = min(multiP, 100.) - statistics = {"index": index, - "total": totalRegionWeight, - "failed": failedCounter - } - if outputRegionList: - return statistics, regionWeights, outregions - else: - return statistics, regionWeights +def regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio): + regionPasses = False + if regionPassesCriteria(regionFinder, region.numReads, region.foldRatio, regionLength): + #TODO: here is where the test dataset is failing + if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio: + regionPasses = True + return regionPasses -def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue): - bestShift = 0 - shiftDict = {} - for region in outregions: - # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest - if reportshift: - outputList = [region.printRegionWithShift()] - if shiftValue == "auto": - try: - shiftDict[region.shift] += 1 - except KeyError: - shiftDict[region.shift] = 1 + +def updateRegion(region, doDirectionality, leftPlusRatio, numLeft, numPlus, plusRatio): + + if doDirectionality: + if leftPlusRatio < numLeft / numPlus: + region.plusP = plusRatio * 100. + region.leftP = 100. * numLeft / numPlus else: - outputList = [region.printRegion()] + raise RegionDirectionError - # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest - if doPvalue: - sumAll = int(region.numReads) - for i in xrange(sumAll): - pValue *= poissonmean - pValue /= i+1 - outputList.append("%1.2g" % pValue) +def writeChromosomeResults(regionFinder, outregions, outfile, shiftDict, + allregions, header, backregions=[], pValueType="none"): + + print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"] + if regionFinder.doPvalue: + if pValueType == "self": + poissonmean = calculatePoissonMean(allregions) + else: + poissonmean = calculatePoissonMean(backregions) + + print header + for region in outregions: + if regionFinder.shiftValue == "auto" and regionFinder.reportshift: + try: + shiftDict[region.shift] += 1 + except KeyError: + shiftDict[region.shift] = 1 + + outline = getRegionString(region, regionFinder.reportshift) + + # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest + if regionFinder.doPvalue: + pValue = calculatePValue(int(region.numReads), poissonmean) + outline += "\t%1.2g" % pValue - outline = string.join(outputList, "\t") print outline print >> outfile, outline - bestCount = 0 - for shift in sorted(shiftDict): - if shiftDict[shift] > bestCount: - bestShift = shift - bestCount = shiftDict[shift] - return bestShift +def calculatePoissonMean(dataList): + dataList.sort() + listSize = float(len(dataList)) + try: + poissonmean = sum(dataList) / listSize + except ZeroDivisionError: + poissonmean = 0 + + print "Poisson n=%d, p=%f" % (listSize, poissonmean) + + return poissonmean -def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue): - footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])] - if doDirectionality: - footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"]) +def calculatePValue(sum, poissonmean): + pValue = math.exp(-poissonmean) + for i in xrange(sum): + pValue *= poissonmean + pValue /= i+1 - if doRevBackground: - try: - percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100) - except (ValueError, ZeroDivisionError): - percent = 0. + return pValue + + +def getRegionString(region, reportShift): + if reportShift: + outline = region.printRegionWithShift() + else: + outline = region.printRegion() - footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent)) + return outline - if shiftValue == "auto" and reportshift: - footerList.append("#mode of shift values: %d" % shiftModeValue) - footer = string.join(footerList, "\n") +def getBestShiftInDict(shiftDict): + return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0] - return footer if __name__ == "__main__": - main(sys.argv) + main(sys.argv) \ No newline at end of file