"""
- usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
- [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
+ usage: python $ERANGEPATH/findall.py label samplebamfile regionoutfile
+ [--control controlbamfile] [--minimum minHits] [--ratio minRatio]
[--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
[--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
[--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
import math
import string
import optparse
-from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
-import ReadDataset
+import operator
+import pysam
+from commoncode import writeLog, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption, isSpliceEntry
import Region
-versionString = "findall: version 3.2"
+versionString = "findall: version 3.2.1"
print versionString
+class RegionDirectionError(Exception):
+ pass
+
+
+class RegionFinder():
+ def __init__(self, label, minRatio=4.0, minPeak=0.5, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, strandfilter="",
+ minHits=4.0, trimValue=0.1, doTrim=True, doDirectionality=True, shiftValue=0, maxSpacing=50, withFlag="",
+ normalize=True, listPeak=False, reportshift=False, stringency=1.0, controlfile=None, doRevBackground=False):
+
+ self.statistics = {"index": 0,
+ "total": 0,
+ "mIndex": 0,
+ "mTotal": 0,
+ "failed": 0,
+ "badRegionTrim": 0
+ }
+
+ self.regionLabel = label
+ self.rnaSettings = False
+ self.controlRDSsize = 1.0
+ self.sampleRDSsize = 1.0
+ self.minRatio = minRatio
+ self.minPeak = minPeak
+ self.leftPlusRatio = leftPlusRatio
+ self.stranded = "both"
+ if strandfilter == "plus":
+ self.stranded = "+"
+ minPlusRatio = 0.9
+ maxPlusRatio = 1.0
+ elif strandfilter == "minus":
+ self.stranded = "-"
+ minPlusRatio = 0.0
+ maxPlusRatio = 0.1
+
+ if minRatio < minPeak:
+ self.minPeak = minRatio
+
+ self.minPlusRatio = minPlusRatio
+ self.maxPlusRatio = maxPlusRatio
+ self.strandfilter = strandfilter
+ self.minHits = minHits
+ self.trimValue = trimValue
+ self.doTrim = doTrim
+ self.doDirectionality = doDirectionality
+
+ if self.doTrim:
+ self.trimString = string.join(["%2.1f" % (100. * self.trimValue), "%"], "")
+ else:
+ self.trimString = "none"
+
+ self.shiftValue = shiftValue
+ self.maxSpacing = maxSpacing
+ self.withFlag = withFlag
+ self.normalize = normalize
+ self.listPeak = listPeak
+ self.reportshift = reportshift
+ self.stringency = max(stringency, 1.0)
+ self.controlfile = controlfile
+ self.doControl = self.controlfile is not None
+ self.doPvalue = False
+ self.doRevBackground = doRevBackground
+
+
+ def useRNASettings(self, readlen):
+ self.rnaSettings = True
+ self.shiftValue = 0
+ self.doTrim = False
+ self.doDirectionality = False
+ self.maxSpacing = readlen
+
+
+ def getHeader(self):
+ if self.normalize:
+ countType = "RPM"
+ else:
+ countType = "COUNT"
+
+ headerFields = ["#regionID\tchrom\tstart\tstop", countType, "fold\tmulti%"]
+
+ if self.doDirectionality:
+ headerFields.append("plus%\tleftPlus%")
+
+ if self.listPeak:
+ headerFields.append("peakPos\tpeakHeight")
+
+ if self.reportshift:
+ headerFields.append("readShift")
+
+ if self.doPvalue:
+ headerFields.append("pValue")
+
+ return string.join(headerFields, "\t")
+
+
+ def printSettings(self, ptype, useMulti, pValueType):
+ print
+ self.printStatusMessages(ptype, useMulti)
+ self.printOptionsSummary(useMulti, pValueType)
+
+
+ def printStatusMessages(self, ptype, useMulti):
+ if self.shiftValue == "learn":
+ print "Will try to learn shift"
+
+ if self.normalize:
+ print "Normalizing to RPM"
+
+ if self.doRevBackground:
+ print "Swapping IP and background to calculate FDR"
+
+ if ptype != "":
+ if ptype in ["NONE", "SELF"]:
+ pass
+ elif ptype == "BACK":
+ if self.doControl and self.doRevBackground:
+ pass
+ else:
+ print "must have a control dataset and -revbackground for pValue type 'back'"
+ else:
+ print "could not use pValue type : %s" % ptype
+
+ if self.withFlag != "":
+ print "restrict to flag = %s" % self.withFlag
+
+ if not useMulti:
+ print "using unique reads only"
+
+ if self.rnaSettings:
+ print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
+
+ if self.strandfilter == "plus":
+ print "only analyzing reads on the plus strand"
+ elif self.strandfilter == "minus":
+ print "only analyzing reads on the minus strand"
+
+
+ def printOptionsSummary(self, useMulti, pValueType):
+
+ print "\nenforceDirectionality=%s listPeak=%s nomulti=%s " % (self.doDirectionality, self.listPeak, not useMulti)
+ print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)
+ try:
+ print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
+ except:
+ print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
+
+
+ def getAnalysisDescription(self, hitfile, useMulti, pValueType):
+
+ description = ["#ERANGE %s" % versionString]
+ if self.doControl:
+ description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, self.controlfile, self.controlRDSsize))
+ else:
+ description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize))
+
+ if self.withFlag != "":
+ description.append("#restrict to Flag = %s" % self.withFlag)
+
+ description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s" % (self.doDirectionality, self.listPeak, not useMulti))
+ description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded))
+ try:
+ description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
+ except:
+ description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
+
+ return string.join(description, "\n")
+
+
+ def getFooter(self, bestShift):
+ index = self.statistics["index"]
+ mIndex = self.statistics["mIndex"]
+ footerLines = ["#stats:\t%.1f RPM in %d regions" % (self.statistics["total"], index)]
+ if self.doDirectionality:
+ footerLines.append("#\t\t%d additional regions failed directionality filter" % self.statistics["failed"])
+
+ if self.doRevBackground:
+ try:
+ percent = min(100. * (float(mIndex)/index), 100.)
+ except ZeroDivisionError:
+ percent = 0.
+
+ footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, self.statistics["mTotal"], percent))
+
+ if self.shiftValue == "auto" and self.reportshift:
+
+ footerLines.append("#mode of shift values: %d" % bestShift)
+
+ if self.statistics["badRegionTrim"] > 0:
+ footerLines.append("#%d regions discarded due to trimming problems" % self.statistics["badRegionTrim"])
+
+ return string.join(footerLines, "\n")
+
+
+ def updateControlStatistics(self, peak, sumAll, peakScore):
+
+ plusRatio = float(peak.numPlus)/peak.numHits
+ if peakScore >= self.minPeak and self.minPlusRatio <= plusRatio <= self.maxPlusRatio:
+ if self.doDirectionality:
+ if self.leftPlusRatio < peak.numLeftPlus / peak.numPlus:
+ self.statistics["mIndex"] += 1
+ self.statistics["mTotal"] += sumAll
+ else:
+ self.statistics["failed"] += 1
+ else:
+ # we have a region, but didn't check for directionality
+ self.statistics["mIndex"] += 1
+ self.statistics["mTotal"] += sumAll
+
+
def usage():
print __doc__
hitfile = args[1]
outfilename = args[2]
- findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
- options.stringency, options.noshift, options.autoshift, options.reportshift,
- options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
- options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
- options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
- options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
- options.strandfilter, options.combine5p)
+ shiftValue = 0
+
+ if options.autoshift:
+ shiftValue = "auto"
+
+ if options.shift is not None:
+ try:
+ shiftValue = int(options.shift)
+ except ValueError:
+ if options.shift == "learn":
+ shiftValue = "learn"
+
+ if options.noshift:
+ shiftValue = 0
+
+ if options.doAppend:
+ outputMode = "a"
+ else:
+ outputMode = "w"
+
+ regionFinder = RegionFinder(factor, minRatio=options.minRatio, minPeak=options.minPeak, minPlusRatio=options.minPlusRatio,
+ maxPlusRatio=options.maxPlusRatio, leftPlusRatio=options.leftPlusRatio, strandfilter=options.strandfilter,
+ minHits=options.minHits, trimValue=options.trimValue, doTrim=options.doTrim,
+ doDirectionality=options.doDirectionality, shiftValue=shiftValue, maxSpacing=options.maxSpacing,
+ withFlag=options.withFlag, normalize=options.normalize, listPeak=options.listPeak,
+ reportshift=options.reportshift, stringency=options.stringency, controlfile=options.controlfile,
+ doRevBackground=options.doRevBackground)
+
+ findall(regionFinder, hitfile, outfilename, options.logfilename, outputMode, options.rnaSettings,
+ options.ptype, options.useMulti, options.combine5p)
def makeParser():
usage = __doc__
parser = optparse.OptionParser(usage=usage)
- parser.add_option("--control", dest="mockfile")
+ parser.add_option("--control", dest="controlfile")
parser.add_option("--minimum", type="float", dest="minHits")
parser.add_option("--ratio", type="float", dest="minRatio")
parser.add_option("--spacing", type="int", dest="maxSpacing")
parser.add_option("--noshift", action="store_true", dest="noShift")
parser.add_option("--autoshift", action="store_true", dest="autoshift")
parser.add_option("--reportshift", action="store_true", dest="reportshift")
- parser.add_option("--nomulti", action="store_true", dest="noMulti")
+ parser.add_option("--nomulti", action="store_false", dest="useMulti")
parser.add_option("--minPlus", type="float", dest="minPlusRatio")
parser.add_option("--maxPlus", type="float", dest="maxPlusRatio")
parser.add_option("--leftPlus", type="float", dest="leftPlusRatio")
logfilename = getConfigOption(configParser, section, "logfilename", "findall.log")
withFlag = getConfigOption(configParser, section, "withFlag", "")
doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True)
- trimValue = getConfigOption(configParser, section, "trimValue", None)
+ trimValue = getConfigFloatOption(configParser, section, "trimValue", 0.1)
doTrim = getConfigBoolOption(configParser, section, "doTrim", True)
doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False)
cachePages = getConfigOption(configParser, section, "cachePages", None)
- ptype = getConfigOption(configParser, section, "ptype", None)
- mockfile = getConfigOption(configParser, section, "mockfile", None)
+ ptype = getConfigOption(configParser, section, "ptype", "")
+ controlfile = getConfigOption(configParser, section, "controlfile", None)
doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False)
- noMulti = getConfigBoolOption(configParser, section, "noMulti", False)
- strandfilter = getConfigOption(configParser, section, "strandfilter", None)
+ useMulti = getConfigBoolOption(configParser, section, "useMulti", True)
+ strandfilter = getConfigOption(configParser, section, "strandfilter", "")
combine5p = getConfigBoolOption(configParser, section, "combine5p", False)
parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift,
minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak,
normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality,
trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings,
- cachePages=cachePages, ptype=ptype, mockfile=mockfile, doRevBackground=doRevBackground, noMulti=noMulti,
+ cachePages=cachePages, ptype=ptype, controlfile=controlfile, doRevBackground=doRevBackground, useMulti=useMulti,
strandfilter=strandfilter, combine5p=combine5p)
return parser
-def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
- stringency=4.0, noshift=False, autoshift=False, reportshift=False,
- minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
- normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
- trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
- cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
- strandfilter=None, combine5p=False):
+def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False,
+ ptype="", useMulti=True, combine5p=False):
- shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
- doControl = False
- if mockfile is not None:
- doControl = True
+ writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+ controlBAM = None
+ if regionFinder.doControl:
+ print "\ncontrol:"
+ controlBAM = pysam.Samfile(regionFinder.controlfile, "rb")
+ regionFinder.controlRDSsize = int(getHeaderComment(controlBAM.header, "Total")) / 1000000.
- if doRevBackground:
- print "Swapping IP and background to calculate FDR"
- pValueType = "back"
+ print "\nsample:"
+ sampleBAM = pysam.Samfile(hitfile, "rb")
+ regionFinder.sampleRDSsize = int(getHeaderComment(sampleBAM.header, "Total")) / 1000000.
+ pValueType = getPValueType(ptype, regionFinder.doControl, regionFinder.doRevBackground)
+ regionFinder.doPvalue = not pValueType == "none"
+ regionFinder.readlen = int(getHeaderComment(sampleBAM.header, "ReadLength"))
+ if rnaSettings:
+ regionFinder.useRNASettings(regionFinder.readlen)
+
+ regionFinder.printSettings(ptype, useMulti, pValueType)
+ outfile = open(outfilename, outputMode)
+ header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType)
+ shiftDict = {}
+ chromList = getChromosomeListToProcess(sampleBAM, controlBAM)
+ for chromosome in chromList:
+ #TODO: Really? Use first chr shift value for all of them
+ maxSampleCoord = getMaxCoordinate(sampleBAM, chromosome, doMulti=useMulti)
+ if regionFinder.shiftValue == "learn":
+ regionFinder.shiftValue = learnShift(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
+
+ allregions, outregions = findPeakRegions(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
+ if regionFinder.doRevBackground:
+ maxControlCoord = getMaxCoordinate(controlBAM, chromosome, doMulti=useMulti)
+ backregions = findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxControlCoord, chromosome, useMulti)
+ else:
+ backregions = []
+ pValueType = "self"
+
+ writeChromosomeResults(regionFinder, outregions, outfile, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType)
+
+ try:
+ bestShift = getBestShiftInDict(shiftDict)
+ except ValueError:
+ bestShift = 0
+
+ footer = regionFinder.getFooter(bestShift)
+ print footer
+ print >> outfile, footer
+ outfile.close()
+ writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1])
+
+
+def getHeaderComment(bamHeader, commentKey):
+ for comment in bamHeader["CO"]:
+ fields = comment.split("\t")
+ if fields[0] == commentKey:
+ return fields[1]
+
+ raise KeyError
- doPvalue = True
- if ptype is not None:
- ptype = ptype.upper()
+
+def getPValueType(ptype, doControl, doRevBackground):
+ pValueType = "self"
+ if ptype in ["NONE", "SELF", "BACK"]:
if ptype == "NONE":
- doPvalue = False
pValueType = "none"
- p = 1
- poissonmean = 0
elif ptype == "SELF":
pValueType = "self"
elif ptype == "BACK":
if doControl and doRevBackground:
pValueType = "back"
- else:
- print "must have a control dataset and -revbackground for pValue type 'back'"
- else:
- print "could not use pValue type : %s" % ptype
- else:
- pValueType = "self"
+ elif doRevBackground:
+ pValueType = "back"
- if withFlag != "":
- print "restrict to flag = %s" % withFlag
+ return pValueType
- useMulti = True
- if noMulti:
- print "using unique reads only"
- useMulti = False
- if rnaSettings:
- print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
- doTrim = False
- doDirectionality = False
+def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType):
+ print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, pValueType)
+ header = regionFinder.getHeader()
+ print >> outfile, header
- stranded = ""
- if strandfilter is not None:
- if strandfilter == "plus":
- stranded = "+"
- minPlusRatio = 0.9
- maxPlusRatio = 1.0
- print "only analyzing reads on the plus strand"
- elif strandfilter == "minus":
- stranded = "-"
- minPlusRatio = 0.0
- maxPlusRatio = 0.1
- print "only analyzing reads on the minus strand"
+ return header
- stringency = max(stringency, 1.0)
- writeLog(logfilename, versionString, string.join(sys.argv[1:]))
- if cachePages is not None:
- doCache = True
- else:
- doCache = False
- cachePages = -1
- mockRDS = None
- if doControl:
- print "\ncontrol:"
- mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
-
- if cachePages > mockRDS.getDefaultCacheSize():
- mockRDS.setDBcache(cachePages)
-
- print "\nsample:"
- hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
- readlen = hitRDS.getReadSize()
- if rnaSettings:
- maxSpacing = readlen
+def getChromosomeListToProcess(sampleBAM, controlBAM=None):
+ if controlBAM is not None:
+ chromosomeList = [chrom for chrom in sampleBAM.references if chrom in controlBAM.references and chrom != "chrM"]
+ else:
+ chromosomeList = [chrom for chrom in sampleBAM.references if chrom != "chrM"]
- if cachePages > hitRDS.getDefaultCacheSize():
- hitRDS.setDBcache(cachePages)
+ return chromosomeList
- if trimValue is not None:
- trimValue = float(trimValue) / 100.
- trimString = "%2.1f%s" % ((100. * trimValue), "%")
- else:
- trimValue = 0.1
- trimString = "10%"
- if not doTrim:
- trimString = "none"
+def findPeakRegions(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
+ outfile, useMulti, controlBAM, combine5p):
- if doAppend:
- fileMode = "a"
- else:
- fileMode = "w"
+ outregions = []
+ allregions = []
+ print "chromosome %s" % (chromosome)
+ previousHit = - 1 * regionFinder.maxSpacing
+ readStartPositions = [-1]
+ totalWeight = 0.0
+ uniqueReadCount = 0
+ reads = []
+ numStartsInRegion = 0
+
+ for alignedread in sampleBAM.fetch(chromosome):
+ if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+ continue
- outfile = open(outfilename, fileMode)
- outfile.write("#ERANGE %s\n" % versionString)
- if doControl:
- mockRDSsize = len(mockRDS) / 1000000.
- controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
- else:
- mockRDSsize = 0
- controlSampleString = " none"
+ pos = alignedread.pos
+ if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
+ lastReadPos = readStartPositions[-1]
+ lastBasePosition = lastReadPos + regionFinder.readlen - 1
+ newRegionIndex = regionFinder.statistics["index"] + 1
+ if regionFinder.doDirectionality:
+ region = Region.DirectionalRegion(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=regionFinder.regionLabel,
+ numReads=totalWeight)
+ else:
+ region = Region.Region(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=regionFinder.regionLabel, numReads=totalWeight)
- hitRDSsize = len(hitRDS) / 1000000.
- outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:%s\n" % (hitfile, hitRDSsize, controlSampleString))
+ if regionFinder.normalize:
+ region.numReads /= regionFinder.sampleRDSsize
- if withFlag != "":
- outfile.write("#restrict to Flag = %s\n" % withFlag)
+ allregions.append(int(region.numReads))
+ regionLength = lastReadPos - region.start
+ if regionPassesCriteria(regionFinder, region.numReads, numStartsInRegion, regionLength):
+ region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
- print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
- print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
- print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
+ if region.foldRatio >= regionFinder.minRatio:
+ # first pass, with absolute numbers
+ peak = findPeak(reads, region.start, regionLength, regionFinder.readlen, doWeight=True, leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
+ if regionFinder.doTrim:
+ try:
+ lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.sampleRDSsize)
+ except IndexError:
+ regionFinder.statistics["badRegionTrim"] += 1
+ continue
- outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
- outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
- outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
- if normalize:
- print "Normalizing to RPM"
- countLabel = "RPM"
- else:
- countLabel = "COUNT"
+ region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
- headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"]
- if doDirectionality:
- headerList.append("plus%\tleftPlus%")
+ try:
+ bestPos = peak.topPos[0]
+ peakScore = peak.smoothArray[bestPos]
+ if regionFinder.normalize:
+ peakScore /= regionFinder.sampleRDSsize
+ except (IndexError, AttributeError, ZeroDivisionError):
+ continue
- if listPeak:
- headerList.append("peakPos\tpeakHeight")
+ if regionFinder.listPeak:
+ region.peakDescription = "%d\t%.1f" % (region.start + bestPos, peakScore)
- if reportshift:
- headerList.append("readShift")
+ if useMulti:
+ setMultireadPercentage(region, sampleBAM, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos,
+ regionFinder.normalize, regionFinder.doTrim)
- if doPvalue:
- headerList.append("pValue")
+ region.shift = peak.shift
+ # check that we still pass threshold
+ regionLength = lastReadPos - region.start
+ plusRatio = float(peak.numPlus)/peak.numHits
+ if regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio):
+ try:
+ updateRegion(region, regionFinder.doDirectionality, regionFinder.leftPlusRatio, peak.numLeftPlus, peak.numPlus, plusRatio)
+ regionFinder.statistics["index"] += 1
+ outregions.append(region)
+ regionFinder.statistics["total"] += region.numReads
+ except RegionDirectionError:
+ regionFinder.statistics["failed"] += 1
+
+ readStartPositions = []
+ totalWeight = 0.0
+ uniqueReadCount = 0
+ reads = []
+ numStartsInRegion = 0
+
+ if pos not in readStartPositions:
+ numStartsInRegion += 1
+
+ readStartPositions.append(pos)
+ weight = 1.0/alignedread.opt('NH')
+ totalWeight += weight
+ if weight == 1.0:
+ uniqueReadCount += 1
- headline = string.join(headerList, "\t")
- print >> outfile, headline
+ reads.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
+ previousHit = pos
- statistics = {"index": 0,
- "total": 0,
- "mIndex": 0,
- "mTotal": 0,
- "failed": 0
- }
+ return allregions, outregions
- if minRatio < minPeak:
- minPeak = minRatio
- hitChromList = hitRDS.getChromosomes()
- if doControl:
- mockChromList = mockRDS.getChromosomes()
+def getReadSense(read):
+ if read.is_reverse:
+ sense = "-"
else:
- mockChromList = []
-
- mockSampleSize = 0
- hitSampleSize = 0
- if normalize:
- if doControl:
- mockSampleSize = mockRDSsize
+ sense = "+"
- hitSampleSize = hitRDSsize
+ return sense
- hitChromList.sort()
- for chromosome in hitChromList:
- if doNotProcessChromosome(chromosome, doControl, mockChromList):
- continue
+def doNotProcessRead(read, doMulti=False, strand="both", combine5p=False):
+ if read.opt('NH') > 1 and not doMulti:
+ return True
- print "chromosome %s" % (chromosome)
- hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True,
- doMulti=useMulti, findallOptimize=True, strand=stranded,
- combine5p=combine5p)
- maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
- if shiftValue == "learn":
- shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize,
- mockSampleSize, minRatio, maxSpacing, maxCoord, stringency, readlen, minHits,
- logfilename, outfile, outfilename)
-
- regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize,
- chromosome, useMulti, normalize, maxSpacing,
- doDirectionality, doTrim, minHits, minRatio,
- readlen, shiftValue, minPeak, minPlusRatio,
- maxPlusRatio, leftPlusRatio, listPeak, noMulti,
- doControl, factor, trimValue, outputRegionList=True)
-
- statistics["index"] += regionStats["index"]
- statistics["total"] += regionStats["total"]
- statistics["failed"] += regionStats["failed"]
- if not doRevBackground:
- if doPvalue:
- p, poissonmean = calculatePValue(allRegionWeights)
-
- print headline
- shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
- continue
+ if strand == "+" and read.is_reverse:
+ return True
- #now do background swapping the two samples around
- if mockRDS is not None:
- print "calculating background..."
- backgroundTrimValue = 1/20.
- backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
- chromosome, useMulti, normalize, maxSpacing,
- doDirectionality, doTrim, minHits, minRatio,
- readlen, shiftValue, minPeak, minPlusRatio,
- maxPlusRatio, leftPlusRatio, listPeak, noMulti,
- doControl, factor, backgroundTrimValue)
-
- statistics["mIndex"] += backgroundRegionStats["index"]
- statistics["mTotal"] += backgroundRegionStats["total"]
- statistics["failed"] += backgroundRegionStats["failed"]
- print statistics["mIndex"], statistics["mTotal"]
- if doPvalue:
- if pValueType == "self":
- p, poissonmean = calculatePValue(allRegionWeights)
- else:
- p, poissonmean = calculatePValue(backgroundRegionWeights)
+ if strand == "-" and not read.is_reverse:
+ return True
+
+ return False
- print headline
- shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
- footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
- print footer
- outfile.write(footer)
- outfile.close()
+def getMaxCoordinate(samfile, chr, doMulti=False):
+ maxCoord = 0
+ for alignedread in samfile.fetch(chr):
+ if alignedread.opt('NH') > 1:
+ if doMulti:
+ maxCoord = max(maxCoord, alignedread.pos)
+ else:
+ maxCoord = max(maxCoord, alignedread.pos)
- writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
+ return maxCoord
-def determineShiftValue(autoshift, shift, noshift, rnaSettings):
- shiftValue = 0
- if autoshift:
- shiftValue = "auto"
+def findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxCoord, chromosome, useMulti):
+ #TODO: this is *almost* the same calculation - there are small yet important differences
+ print "calculating background..."
+ previousHit = - 1 * regionFinder.maxSpacing
+ currentHitList = [-1]
+ currentTotalWeight = 0.0
+ currentReadList = []
+ backregions = []
+ numStarts = 0
+ badRegion = False
+ for alignedread in controlBAM.fetch(chromosome):
+ if doNotProcessRead(alignedread, doMulti=useMulti):
+ continue
- if shift is not None:
- try:
- shiftValue = int(shift)
- except ValueError:
- if shift == "learn":
- shiftValue = "learn"
- print "Will try to learn shift"
+ pos = alignedread.pos
+ if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
+ lastReadPos = currentHitList[-1]
+ lastBasePosition = lastReadPos + regionFinder.readlen - 1
+ region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
+ if regionFinder.normalize:
+ region.numReads /= regionFinder.controlRDSsize
+
+ backregions.append(int(region.numReads))
+ region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
+ regionLength = lastReadPos - region.start
+ if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
+ numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
+ if regionFinder.normalize:
+ numMock /= regionFinder.sampleRDSsize
+
+ foldRatio = region.numReads / numMock
+ if foldRatio >= regionFinder.minRatio:
+ # first pass, with absolute numbers
+ peak = findPeak(currentReadList, region.start, lastReadPos - region.start, regionFinder.readlen, doWeight=True,
+ leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
- if noshift or rnaSettings:
- shiftValue = 0
+ if regionFinder.doTrim:
+ try:
+ lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, 20., currentReadList, regionFinder.controlRDSsize)
+ except IndexError:
+ badRegion = True
+ continue
- return shiftValue
+ numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
+ if regionFinder.normalize:
+ numMock /= regionFinder.sampleRDSsize
+ foldRatio = region.numReads / numMock
-def doNotProcessChromosome(chromosome, doControl, mockChromList):
- skipChromosome = False
- if chromosome == "chrM":
- skipChromosome = True
+ try:
+ bestPos = peak.topPos[0]
+ peakScore = peak.smoothArray[bestPos]
+ except IndexError:
+ continue
- if doControl and (chromosome not in mockChromList):
- skipChromosome = True
+ # normalize to RPM
+ if regionFinder.normalize:
+ peakScore /= regionFinder.controlRDSsize
- return skipChromosome
+ # check that we still pass threshold
+ regionLength = lastReadPos - region.start
+ if regionPassesCriteria(regionFinder, region.numReads, foldRatio, regionLength):
+ regionFinder.updateControlStatistics(peak, region.numReads, peakScore)
+ currentHitList = []
+ currentTotalWeight = 0.0
+ currentReadList = []
+ numStarts = 0
+ if badRegion:
+ badRegion = False
+ regionFinder.statistics["badRegionTrim"] += 1
-def calculatePValue(dataList):
- dataList.sort()
- listSize = float(len(dataList))
- try:
- poissonmean = sum(dataList) / listSize
- except ZeroDivisionError:
- poissonmean = 0
+ if pos not in currentHitList:
+ numStarts += 1
- print "Poisson n=%d, p=%f" % (listSize, poissonmean)
- p = math.exp(-poissonmean)
+ currentHitList.append(pos)
+ weight = 1.0/alignedread.opt('NH')
+ currentTotalWeight += weight
+ currentReadList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
+ previousHit = pos
- return p, poissonmean
+ return backregions
-def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
- stringency, readlen, minHits, logfilename, outfile, outfilename, minSites=30):
+def learnShift(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
+ outfile, useMulti, controlBAM, combine5p):
- print "learning shift.... will need at least %d training sites" % minSites
- previousHit = -1 * maxSpacing
- hitList = [-1]
- totalWeight = 0
+ print "learning shift.... will need at least 30 training sites"
+ stringency = regionFinder.stringency
+ previousHit = -1 * regionFinder.maxSpacing
+ positionList = [-1]
+ totalWeight = 0.0
readList = []
shiftDict = {}
count = 0
numStarts = 0
- for read in hitDict[chrom]:
- pos = read["start"]
- sense = read["sense"]
- weight = read["weight"]
- if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
- sumAll = totalWeight
- if normalize:
- sumAll /= hitSampleSize
-
- regionStart = hitList[0]
- regionStop = hitList[-1]
- regionLength = regionStop - regionStart
- # we're going to require stringent settings
- if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen:
- foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
+ for alignedread in sampleBAM.fetch(chromosome):
+ if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+ continue
- if foldRatio >= minRatio:
- localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True)
- try:
- shiftDict[localshift] += 1
- except KeyError:
- shiftDict[localshift] = 1
+ pos = alignedread.pos
+ if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
+ if regionFinder.normalize:
+ totalWeight /= regionFinder.sampleRDSsize
+ regionStart = positionList[0]
+ regionStop = positionList[-1]
+ regionLength = regionStop - regionStart
+ if regionPassesCriteria(regionFinder, totalWeight, numStarts, regionLength, stringency=stringency):
+ foldRatio = getFoldRatio(regionFinder, controlBAM, totalWeight, chromosome, regionStart, regionStop, useMulti)
+ if foldRatio >= regionFinder.minRatio:
+ updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
count += 1
- hitList = []
- totalWeight = 0
+ positionList = []
+ totalWeight = 0.0
readList = []
- numStarts = 0
- if pos not in hitList:
+ if pos not in positionList:
numStarts += 1
- hitList.append(pos)
+ positionList.append(pos)
+ weight = 1.0/alignedread.opt('NH')
totalWeight += weight
- readList.append({"start": pos, "sense": sense, "weight": weight})
+ readList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
previousHit = pos
- bestShift = 0
- bestCount = 0
- learningSettings = ["#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d" % (stringency, stringency * minHits,
- stringency * minRatio, stringency * readlen),
- "#number of training examples: %d" % count]
- outline = string.join(learningSettings, "\n")
- print outline
- writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
- if count < minSites:
- outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
- print >> outfile, outline
- writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
- shiftValue = 0
- else:
- for shift in sorted(shiftDict):
- if shiftDict[shift] > bestCount:
- bestShift = shift
- bestCount = shiftDict[shift]
-
- shiftValue = bestShift
- print shiftDict
+ outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
+ stringency * regionFinder.minHits,
+ stringency * regionFinder.minRatio,
+ stringency * regionFinder.readlen,
+ count)
+ print outline
+ writeLog(logfilename, versionString, outfilename + outline)
+ shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
outline = "#picked shiftValue to be %d" % shiftValue
print outline
print >> outfile, outline
- writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
+ writeLog(logfilename, versionString, outfilename + outline)
+
+ return shiftValue, shiftDict
+
+
+def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
+ return abs(pos - previousHit) > maxSpacing or pos == maxCoord
+
+
+def regionPassesCriteria(regionFinder, sumAll, numStarts, regionLength, stringency=1):
+ minTotalReads = stringency * regionFinder.minHits
+ minNumReadStarts = stringency * regionFinder.minRatio
+ minRegionLength = stringency * regionFinder.readlen
+
+ return sumAll >= minTotalReads and numStarts >= minNumReadStarts and regionLength > minRegionLength
+
+
+def trimRegion(region, regionFinder, peak, regionStop, trimValue, currentReadList, totalReadCount):
+ bestPos = peak.topPos[0]
+ peakScore = peak.smoothArray[bestPos]
+ if regionFinder.normalize:
+ peakScore /= totalReadCount
+
+ minSignalThresh = trimValue * peakScore
+ start = findStartEdgePosition(peak, minSignalThresh)
+ regionEndPoint = regionStop - region.start - 1
+ stop = findStopEdgePosition(peak, regionEndPoint, minSignalThresh)
+
+ regionStop = region.start + stop
+ region.start += start
+
+ trimmedPeak = findPeak(currentReadList, region.start, regionStop - region.start, regionFinder.readlen, doWeight=True,
+ leftPlus=regionFinder.doDirectionality, shift=peak.shift)
+
+ peak.numPlus = trimmedPeak.numPlus
+ peak.numLeftPlus = trimmedPeak.numLeftPlus
+ peak.topPos = trimmedPeak.topPos
+ peak.smoothArray = trimmedPeak.smoothArray
+
+ region.numReads = trimmedPeak.numHits
+ if regionFinder.normalize:
+ region.numReads /= totalReadCount
+
+ region.stop = regionStop + regionFinder.readlen - 1
+
+ return regionStop
+
+
+def findStartEdgePosition(peak, minSignalThresh):
+ start = 0
+ while not peakEdgeLocated(peak, start, minSignalThresh):
+ start += 1
+
+ return start
+
+
+def findStopEdgePosition(peak, stop, minSignalThresh):
+ while not peakEdgeLocated(peak, stop, minSignalThresh):
+ stop -= 1
+
+ return stop
- return shiftValue
+def peakEdgeLocated(peak, position, minSignalThresh):
+ return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
-def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
- if doControl and rds is not None:
- foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
+
+def getFoldRatio(regionFinder, controlBAM, sumAll, chromosome, regionStart, regionStop, useMulti):
+ """ Fold ratio calculated is total read weight over control
+ """
+ #TODO: this needs to be generalized as there is a point at which we want to use the sampleRDS instead of controlRDS
+ if regionFinder.doControl:
+ numMock = 1. + countReadsInRegion(controlBAM, chromosome, regionStart, regionStop, countMulti=useMulti)
+ if regionFinder.normalize:
+ numMock /= regionFinder.controlRDSsize
+
+ foldRatio = sumAll / numMock
else:
- foldRatio = minRatio
+ foldRatio = regionFinder.minRatio
return foldRatio
-def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll):
- numMock = 1. + rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
- if normalize:
- numMock /= sampleSize
+def countReadsInRegion(bamfile, chr, start, end, uniqs=True, countMulti=False, countSplices=False):
+ count = 0.0
+ for alignedread in bamfile.fetch(chr, start, end):
+ if alignedread.opt('NH') > 1:
+ if countMulti:
+ if isSpliceEntry(alignedread.cigar):
+ if countSplices:
+ count += 1.0/alignedread.opt('NH')
+ else:
+ count += 1.0/alignedread.opt('NH')
+ elif uniqs:
+ if isSpliceEntry(alignedread.cigar):
+ if countSplices:
+ count += 1.0
+ else:
+ count += 1.0
- foldRatio = sumAll / numMock
+ return count
- return foldRatio
+def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen):
+ peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto")
+ try:
+ shiftDict[peak.shift] += 1
+ except KeyError:
+ shiftDict[peak.shift] = 1
-def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
- normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
- shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
- noMulti, doControl, factor, trimValue, outputRegionList=False):
+def getShiftValue(shiftDict, count, logfilename, outfilename):
+ if count < 30:
+ outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
+ print outline
+ writeLog(logfilename, versionString, outfilename + outline)
+ shiftValue = 0
+ else:
+ shiftValue = getBestShiftInDict(shiftDict)
+ print shiftDict
- index = 0
- totalRegionWeight = 0
- failedCounter = 0
- previousHit = - 1 * maxSpacing
- currentHitList = [-1]
- currentTotalWeight = 0
- currentUniqReadCount = 0
- currentReadList = []
- regionWeights = []
- outregions = []
- numStarts = 0
- hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
- maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
- for read in hitDict[chrom]:
- pos = read["start"]
- sense = read["sense"]
- weight = read["weight"]
- if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
- sumAll = currentTotalWeight
- if normalizeToRPM:
- sumAll /= rdsSampleSize
-
- regionStart = currentHitList[0]
- regionStop = currentHitList[-1]
- regionWeights.append(int(sumAll))
- if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
- sumMulti = 0.
- foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
- if foldRatio >= minRatio:
- # first pass, with absolute numbers
- peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
-
- bestPos = peak.topPos[0]
- numHits = peak.numHits
- peakScore = peak.smoothArray[bestPos]
- numPlus = peak.numPlus
- shift = peak.shift
- numLeft = peak.numLeftPlus
- if normalizeToRPM:
- peakScore /= rdsSampleSize
-
- if doTrim:
- minSignalThresh = trimValue * peakScore
- start = 0
- stop = regionStop - regionStart - 1
- startFound = False
- while not startFound:
- if peak.smoothArray[start] >= minSignalThresh or start == bestPos:
- startFound = True
- else:
- start += 1
-
- stopFound = False
- while not stopFound:
- if peak.smoothArray[stop] >= minSignalThresh or stop == bestPos:
- stopFound = True
- else:
- stop -= 1
-
- regionStop = regionStart + stop
- regionStart += start
- trimPeak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
-
- sumAll = trimPeak.numHits
- numPlus = trimPeak.numPlus
- numLeft = trimPeak.numLeftPlus
- if normalizeToRPM:
- sumAll /= rdsSampleSize
-
- foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
- if outputRegionList:
- sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
- # just in case it changed, use latest data
- try:
- bestPos = trimPeak.topPos[0]
- peakScore = trimPeak.smoothArray[bestPos]
- except:
- continue
+ return shiftValue
- if normalizeToRPM:
- peakScore /= rdsSampleSize
- elif outputRegionList:
- sumMulti = currentTotalWeight - currentUniqReadCount
+def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRatio, multiP,
+ peakDescription, shift, doDirectionality, leftPlusRatio, numLeft,
+ numPlus, plusRatio):
- if outputRegionList:
- if normalizeToRPM:
- sumMulti /= rdsSampleSize
+ if doDirectionality:
+ if leftPlusRatio < numLeft / numPlus:
+ plusP = plusRatio * 100.
+ leftP = 100. * numLeft / numPlus
+ # we have a region that passes all criteria
+ region = Region.DirectionalRegion(regionStart, regionStop,
+ factor, index, chromosome, sumAll,
+ foldRatio, multiP, plusP, leftP,
+ peakDescription, shift)
- try:
- multiP = 100. * (sumMulti / sumAll)
- except:
- break
+ else:
+ raise RegionDirectionError
+ else:
+ # we have a region, but didn't check for directionality
+ region = Region.Region(regionStart, regionStop, factor, index, chromosome,
+ sumAll, foldRatio, multiP, peakDescription, shift)
- if noMulti:
- multiP = 0.
+ return region
- # check that we still pass threshold
- if sumAll >= minHits and foldRatio >= minRatio and (regionStop - regionStart) > readlen:
- plusRatio = float(numPlus)/numHits
- if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
- if outputRegionList:
- peakDescription = ""
- if listPeak:
- peakDescription = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
-
- if doDirectionality:
- if leftPlusRatio < numLeft / numPlus:
- index += 1
- if outputRegionList:
- plusP = plusRatio * 100.
- leftP = 100. * numLeft / numPlus
- # we have a region that passes all criteria
- region = Region.DirectionalRegion(regionStart, regionStop + readlen - 1,
- factor, index, chrom, sumAll,
- foldRatio, multiP, plusP, leftP,
- peakDescription, shift)
- outregions.append(region)
-
- totalRegionWeight += sumAll
- else:
- failedCounter += 1
- else:
- # we have a region, but didn't check for directionality
- index += 1
- totalRegionWeight += sumAll
- if outputRegionList:
- region = Region.Region(regionStart, regionStop + readlen - 1, factor, index, chrom,
- sumAll, foldRatio, multiP, peakDescription, shift)
- outregions.append(region)
- currentHitList = []
- currentTotalWeight = 0
- currentUniqReadCount = 0
- currentReadList = []
- numStarts = 0
+def setMultireadPercentage(region, sampleBAM, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
+ if doTrim:
+ sumMulti = 0.0
+ for alignedread in sampleBAM.fetch(chromosome, region.start, lastReadPos):
+ if alignedread.opt('NH') > 1:
+ sumMulti += 1.0/alignedread.opt('NH')
+ else:
+ sumMulti = currentTotalWeight - currentUniqueCount
- if pos not in currentHitList:
- numStarts += 1
+ # normalize to RPM
+ if normalize:
+ sumMulti /= hitRDSsize
- currentHitList.append(pos)
- currentTotalWeight += weight
- if weight == 1.0:
- currentUniqReadCount += 1
+ try:
+ multiP = 100. * (sumMulti / region.numReads)
+ except ZeroDivisionError:
+ return
- currentReadList.append({"start": pos, "sense": sense, "weight": weight})
- previousHit = pos
+ region.multiP = min(multiP, 100.)
- statistics = {"index": index,
- "total": totalRegionWeight,
- "failed": failedCounter
- }
- if outputRegionList:
- return statistics, regionWeights, outregions
- else:
- return statistics, regionWeights
+def regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio):
+ regionPasses = False
+ if regionPassesCriteria(regionFinder, region.numReads, region.foldRatio, regionLength):
+ #TODO: here is where the test dataset is failing
+ if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio:
+ regionPasses = True
+ return regionPasses
-def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue):
- bestShift = 0
- shiftDict = {}
- for region in outregions:
- # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
- if reportshift:
- outputList = [region.printRegionWithShift()]
- if shiftValue == "auto":
- try:
- shiftDict[region.shift] += 1
- except KeyError:
- shiftDict[region.shift] = 1
+
+def updateRegion(region, doDirectionality, leftPlusRatio, numLeft, numPlus, plusRatio):
+
+ if doDirectionality:
+ if leftPlusRatio < numLeft / numPlus:
+ region.plusP = plusRatio * 100.
+ region.leftP = 100. * numLeft / numPlus
else:
- outputList = [region.printRegion()]
+ raise RegionDirectionError
- # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
- if doPvalue:
- sumAll = int(region.numReads)
- for i in xrange(sumAll):
- pValue *= poissonmean
- pValue /= i+1
- outputList.append("%1.2g" % pValue)
+def writeChromosomeResults(regionFinder, outregions, outfile, shiftDict,
+ allregions, header, backregions=[], pValueType="none"):
+
+ print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"]
+ if regionFinder.doPvalue:
+ if pValueType == "self":
+ poissonmean = calculatePoissonMean(allregions)
+ else:
+ poissonmean = calculatePoissonMean(backregions)
+
+ print header
+ for region in outregions:
+ if regionFinder.shiftValue == "auto" and regionFinder.reportshift:
+ try:
+ shiftDict[region.shift] += 1
+ except KeyError:
+ shiftDict[region.shift] = 1
+
+ outline = getRegionString(region, regionFinder.reportshift)
+
+ # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
+ if regionFinder.doPvalue:
+ pValue = calculatePValue(int(region.numReads), poissonmean)
+ outline += "\t%1.2g" % pValue
- outline = string.join(outputList, "\t")
print outline
print >> outfile, outline
- bestCount = 0
- for shift in sorted(shiftDict):
- if shiftDict[shift] > bestCount:
- bestShift = shift
- bestCount = shiftDict[shift]
- return bestShift
+def calculatePoissonMean(dataList):
+ dataList.sort()
+ listSize = float(len(dataList))
+ try:
+ poissonmean = sum(dataList) / listSize
+ except ZeroDivisionError:
+ poissonmean = 0
+
+ print "Poisson n=%d, p=%f" % (listSize, poissonmean)
+
+ return poissonmean
-def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue):
- footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])]
- if doDirectionality:
- footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"])
+def calculatePValue(sum, poissonmean):
+ pValue = math.exp(-poissonmean)
+ for i in xrange(sum):
+ pValue *= poissonmean
+ pValue /= i+1
- if doRevBackground:
- try:
- percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100)
- except (ValueError, ZeroDivisionError):
- percent = 0.
+ return pValue
+
+
+def getRegionString(region, reportShift):
+ if reportShift:
+ outline = region.printRegionWithShift()
+ else:
+ outline = region.printRegion()
- footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent))
+ return outline
- if shiftValue == "auto" and reportshift:
- footerList.append("#mode of shift values: %d" % shiftModeValue)
- footer = string.join(footerList, "\n")
+def getBestShiftInDict(shiftDict):
+ return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0]
- return footer
if __name__ == "__main__":
- main(sys.argv)
+ main(sys.argv)
\ No newline at end of file