"""
- usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
- [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
+ usage: python $ERANGEPATH/findall.py label samplebamfile regionoutfile
+ [--control controlbamfile] [--minimum minHits] [--ratio minRatio]
[--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
[--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
[--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
import string
import optparse
import operator
-from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
-import ReadDataset
+import pysam
+from commoncode import writeLog, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption, isSpliceEntry
import Region
class RegionDirectionError(Exception):
pass
-
+
class RegionFinder():
- def __init__(self, minRatio, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, strandfilter, minHits, trimValue, doTrim, doDirectionality,
- shiftValue, maxSpacing):
+ def __init__(self, label, minRatio=4.0, minPeak=0.5, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, strandfilter="",
+ minHits=4.0, trimValue=0.1, doTrim=True, doDirectionality=True, shiftValue=0, maxSpacing=50, withFlag="",
+ normalize=True, listPeak=False, reportshift=False, stringency=1.0, controlfile=None, doRevBackground=False):
self.statistics = {"index": 0,
"total": 0,
"badRegionTrim": 0
}
- self.controlRDSsize = 1
- self.sampleRDSsize = 1
+ self.regionLabel = label
+ self.rnaSettings = False
+ self.controlRDSsize = 1.0
+ self.sampleRDSsize = 1.0
self.minRatio = minRatio
self.minPeak = minPeak
self.leftPlusRatio = leftPlusRatio
self.shiftValue = shiftValue
self.maxSpacing = maxSpacing
-
-
- def useRNASettings(self):
+ self.withFlag = withFlag
+ self.normalize = normalize
+ self.listPeak = listPeak
+ self.reportshift = reportshift
+ self.stringency = max(stringency, 1.0)
+ self.controlfile = controlfile
+ self.doControl = self.controlfile is not None
+ self.doPvalue = False
+ self.doRevBackground = doRevBackground
+
+
+ def useRNASettings(self, readlen):
+ self.rnaSettings = True
self.shiftValue = 0
self.doTrim = False
self.doDirectionality = False
+ self.maxSpacing = readlen
+
+
+ def getHeader(self):
+ if self.normalize:
+ countType = "RPM"
+ else:
+ countType = "COUNT"
+
+ headerFields = ["#regionID\tchrom\tstart\tstop", countType, "fold\tmulti%"]
+
+ if self.doDirectionality:
+ headerFields.append("plus%\tleftPlus%")
+
+ if self.listPeak:
+ headerFields.append("peakPos\tpeakHeight")
+
+ if self.reportshift:
+ headerFields.append("readShift")
+
+ if self.doPvalue:
+ headerFields.append("pValue")
+
+ return string.join(headerFields, "\t")
+
+
+ def printSettings(self, ptype, useMulti, pValueType):
+ print
+ self.printStatusMessages(ptype, useMulti)
+ self.printOptionsSummary(useMulti, pValueType)
+
+
+ def printStatusMessages(self, ptype, useMulti):
+ if self.shiftValue == "learn":
+ print "Will try to learn shift"
+
+ if self.normalize:
+ print "Normalizing to RPM"
+
+ if self.doRevBackground:
+ print "Swapping IP and background to calculate FDR"
+
+ if ptype != "":
+ if ptype in ["NONE", "SELF"]:
+ pass
+ elif ptype == "BACK":
+ if self.doControl and self.doRevBackground:
+ pass
+ else:
+ print "must have a control dataset and -revbackground for pValue type 'back'"
+ else:
+ print "could not use pValue type : %s" % ptype
+
+ if self.withFlag != "":
+ print "restrict to flag = %s" % self.withFlag
+
+ if not useMulti:
+ print "using unique reads only"
+
+ if self.rnaSettings:
+ print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
+
+ if self.strandfilter == "plus":
+ print "only analyzing reads on the plus strand"
+ elif self.strandfilter == "minus":
+ print "only analyzing reads on the minus strand"
- def printOptionsSummary(self, listPeak, useMulti, doCache, pValueType):
+ def printOptionsSummary(self, useMulti, pValueType):
- print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (self.doDirectionality, listPeak, not useMulti, doCache)
+ print "\nenforceDirectionality=%s listPeak=%s nomulti=%s " % (self.doDirectionality, self.listPeak, not useMulti)
print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)
try:
print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
- def getAnalysisDescription(self, hitfile, controlfile, doControl,
- withFlag, listPeak, useMulti, doCache, pValueType):
+ def getAnalysisDescription(self, hitfile, useMulti, pValueType):
description = ["#ERANGE %s" % versionString]
- if doControl:
- description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, controlfile, self.controlRDSsize))
+ if self.doControl:
+ description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, self.controlfile, self.controlRDSsize))
else:
description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize))
- if withFlag != "":
- description.append("#restrict to Flag = %s" % withFlag)
+ if self.withFlag != "":
+ description.append("#restrict to Flag = %s" % self.withFlag)
- description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s" % (self.doDirectionality, listPeak, not useMulti, doCache))
+ description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s" % (self.doDirectionality, self.listPeak, not useMulti))
description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded))
try:
description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
return string.join(description, "\n")
+ def getFooter(self, bestShift):
+ index = self.statistics["index"]
+ mIndex = self.statistics["mIndex"]
+ footerLines = ["#stats:\t%.1f RPM in %d regions" % (self.statistics["total"], index)]
+ if self.doDirectionality:
+ footerLines.append("#\t\t%d additional regions failed directionality filter" % self.statistics["failed"])
+
+ if self.doRevBackground:
+ try:
+ percent = min(100. * (float(mIndex)/index), 100.)
+ except ZeroDivisionError:
+ percent = 0.
+
+ footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, self.statistics["mTotal"], percent))
+
+ if self.shiftValue == "auto" and self.reportshift:
+
+ footerLines.append("#mode of shift values: %d" % bestShift)
+
+ if self.statistics["badRegionTrim"] > 0:
+ footerLines.append("#%d regions discarded due to trimming problems" % self.statistics["badRegionTrim"])
+
+ return string.join(footerLines, "\n")
+
+
+ def updateControlStatistics(self, peak, sumAll, peakScore):
+
+ plusRatio = float(peak.numPlus)/peak.numHits
+ if peakScore >= self.minPeak and self.minPlusRatio <= plusRatio <= self.maxPlusRatio:
+ if self.doDirectionality:
+ if self.leftPlusRatio < peak.numLeftPlus / peak.numPlus:
+ self.statistics["mIndex"] += 1
+ self.statistics["mTotal"] += sumAll
+ else:
+ self.statistics["failed"] += 1
+ else:
+ # we have a region, but didn't check for directionality
+ self.statistics["mIndex"] += 1
+ self.statistics["mTotal"] += sumAll
+
+
def usage():
print __doc__
else:
outputMode = "w"
- findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, shiftValue,
- options.stringency, options.reportshift,
- options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
- options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
- options.trimValue, options.doTrim, outputMode, options.rnaSettings,
- options.cachePages, options.ptype, options.controlfile, options.doRevBackground, options.useMulti,
- options.strandfilter, options.combine5p)
+ regionFinder = RegionFinder(factor, minRatio=options.minRatio, minPeak=options.minPeak, minPlusRatio=options.minPlusRatio,
+ maxPlusRatio=options.maxPlusRatio, leftPlusRatio=options.leftPlusRatio, strandfilter=options.strandfilter,
+ minHits=options.minHits, trimValue=options.trimValue, doTrim=options.doTrim,
+ doDirectionality=options.doDirectionality, shiftValue=shiftValue, maxSpacing=options.maxSpacing,
+ withFlag=options.withFlag, normalize=options.normalize, listPeak=options.listPeak,
+ reportshift=options.reportshift, stringency=options.stringency, controlfile=options.controlfile,
+ doRevBackground=options.doRevBackground)
+
+ findall(regionFinder, hitfile, outfilename, options.logfilename, outputMode, options.rnaSettings,
+ options.ptype, options.useMulti, options.combine5p)
def makeParser():
return parser
-def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shiftValue=0,
- stringency=4.0, reportshift=False, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
- normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True, trimValue=0.1, doTrim=True,
- outputMode="w", rnaSettings=False, cachePages=None, ptype="", controlfile=None, doRevBackground=False,
- useMulti=True, strandfilter="", combine5p=False):
-
- regionFinder = RegionFinder(minRatio, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, strandfilter, minHits, trimValue,
- doTrim, doDirectionality, shiftValue, maxSpacing)
-
- doControl = controlfile is not None
- pValueType = getPValueType(ptype, doControl, doRevBackground)
- doPvalue = not pValueType == "none"
+def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False,
+ ptype="", useMulti=True, combine5p=False):
- if rnaSettings:
- regionFinder.useRNASettings()
-
- doCache = cachePages is not None
- printStatusMessages(regionFinder.shiftValue, normalize, doRevBackground, ptype, doControl, withFlag, useMulti, rnaSettings, regionFinder.strandfilter)
- controlRDS = None
- if doControl:
+ writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+ controlBAM = None
+ if regionFinder.doControl:
print "\ncontrol:"
- controlRDS = openRDSFile(controlfile, cachePages=cachePages, doCache=doCache)
+ controlBAM = pysam.Samfile(regionFinder.controlfile, "rb")
+ regionFinder.controlRDSsize = int(getHeaderComment(controlBAM.header, "Total")) / 1000000.
print "\nsample:"
- hitRDS = openRDSFile(hitfile, cachePages=cachePages, doCache=doCache)
-
- print
- regionFinder.readlen = hitRDS.getReadSize()
+ sampleBAM = pysam.Samfile(hitfile, "rb")
+ regionFinder.sampleRDSsize = int(getHeaderComment(sampleBAM.header, "Total")) / 1000000.
+ pValueType = getPValueType(ptype, regionFinder.doControl, regionFinder.doRevBackground)
+ regionFinder.doPvalue = not pValueType == "none"
+ regionFinder.readlen = int(getHeaderComment(sampleBAM.header, "ReadLength"))
if rnaSettings:
- regionFinder.maxSpacing = regionFinder.readlen
-
- writeLog(logfilename, versionString, string.join(sys.argv[1:]))
- regionFinder.printOptionsSummary(listPeak, useMulti, doCache, pValueType)
-
- regionFinder.sampleRDSsize = len(hitRDS) / 1000000.
- if doControl:
- regionFinder.controlRDSsize = len(controlRDS) / 1000000.
+ regionFinder.useRNASettings(regionFinder.readlen)
+ regionFinder.printSettings(ptype, useMulti, pValueType)
outfile = open(outfilename, outputMode)
- print >> outfile, regionFinder.getAnalysisDescription(hitfile, controlfile, doControl,
- withFlag, listPeak, useMulti, doCache, pValueType)
-
- header = getHeader(normalize, regionFinder.doDirectionality, listPeak, reportshift, doPvalue)
- print >> outfile, header
- if minRatio < minPeak:
- minPeak = minRatio
-
+ header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType)
shiftDict = {}
- hitChromList = hitRDS.getChromosomes()
- stringency = max(stringency, 1.0)
- chromosomeList = getChromosomeListToProcess(hitChromList, controlRDS, doControl)
- for chromosome in chromosomeList:
- allregions, outregions = findPeakRegions(regionFinder, hitRDS, controlRDS, chromosome, logfilename, outfilename,
- outfile, stringency, normalize, useMulti, doControl, withFlag, combine5p,
- factor, listPeak)
-
- if doRevBackground:
- #TODO: this is *almost* the same calculation - there are small yet important differences
- backregions = findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normalize, useMulti, factor)
- writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
- allregions, header, backregions=backregions, pValueType=pValueType)
+ chromList = getChromosomeListToProcess(sampleBAM, controlBAM)
+ for chromosome in chromList:
+ #TODO: Really? Use first chr shift value for all of them
+ maxSampleCoord = getMaxCoordinate(sampleBAM, chromosome, doMulti=useMulti)
+ if regionFinder.shiftValue == "learn":
+ regionFinder.shiftValue = learnShift(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
+
+ allregions, outregions = findPeakRegions(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
+ if regionFinder.doRevBackground:
+ maxControlCoord = getMaxCoordinate(controlBAM, chromosome, doMulti=useMulti)
+ backregions = findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxControlCoord, chromosome, useMulti)
else:
- writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
- allregions, header)
+ backregions = []
+ pValueType = "self"
+
+ writeChromosomeResults(regionFinder, outregions, outfile, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType)
+
+ try:
+ bestShift = getBestShiftInDict(shiftDict)
+ except ValueError:
+ bestShift = 0
- footer = getFooter(regionFinder, shiftDict, reportshift, doRevBackground)
+ footer = regionFinder.getFooter(bestShift)
print footer
print >> outfile, footer
outfile.close()
writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1])
+def getHeaderComment(bamHeader, commentKey):
+ for comment in bamHeader["CO"]:
+ fields = comment.split("\t")
+ if fields[0] == commentKey:
+ return fields[1]
+
+ raise KeyError
+
+
def getPValueType(ptype, doControl, doRevBackground):
pValueType = "self"
if ptype in ["NONE", "SELF", "BACK"]:
return pValueType
-def printStatusMessages(shiftValue, normalize, doRevBackground, ptype, doControl, withFlag, useMulti, rnaSettings, strandfilter):
- if shiftValue == "learn":
- print "Will try to learn shift"
-
- if normalize:
- print "Normalizing to RPM"
-
- if doRevBackground:
- print "Swapping IP and background to calculate FDR"
-
- if ptype != "":
- if ptype in ["NONE", "SELF"]:
- pass
- elif ptype == "BACK":
- if doControl and doRevBackground:
- pass
- else:
- print "must have a control dataset and -revbackground for pValue type 'back'"
- else:
- print "could not use pValue type : %s" % ptype
-
- if withFlag != "":
- print "restrict to flag = %s" % withFlag
-
- if not useMulti:
- print "using unique reads only"
-
- if rnaSettings:
- print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
-
- if strandfilter == "plus":
- print "only analyzing reads on the plus strand"
- elif strandfilter == "minus":
- print "only analyzing reads on the minus strand"
-
-
-def openRDSFile(filename, cachePages=None, doCache=False):
- rds = ReadDataset.ReadDataset(filename, verbose=True, cache=doCache)
- if cachePages > rds.getDefaultCacheSize():
- rds.setDBcache(cachePages)
-
- return rds
-
-
-def getHeader(normalize, doDirectionality, listPeak, reportshift, doPvalue):
- if normalize:
- countType = "RPM"
- else:
- countType = "COUNT"
-
- headerFields = ["#regionID\tchrom\tstart\tstop", countType, "fold\tmulti%"]
-
- if doDirectionality:
- headerFields.append("plus%\tleftPlus%")
-
- if listPeak:
- headerFields.append("peakPos\tpeakHeight")
-
- if reportshift:
- headerFields.append("readShift")
-
- if doPvalue:
- headerFields.append("pValue")
+def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType):
+ print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, pValueType)
+ header = regionFinder.getHeader()
+ print >> outfile, header
- return string.join(headerFields, "\t")
+ return header
-def getChromosomeListToProcess(hitChromList, controlRDS, doControl):
- if doControl:
- controlChromList = controlRDS.getChromosomes()
- chromosomeList = [chrom for chrom in hitChromList if chrom in controlChromList and chrom != "chrM"]
+def getChromosomeListToProcess(sampleBAM, controlBAM=None):
+ if controlBAM is not None:
+ chromosomeList = [chrom for chrom in sampleBAM.references if chrom in controlBAM.references and chrom != "chrM"]
else:
- chromosomeList = [chrom for chrom in hitChromList if chrom != "chrM"]
+ chromosomeList = [chrom for chrom in sampleBAM.references if chrom != "chrM"]
return chromosomeList
-def findPeakRegions(regionFinder, hitRDS, controlRDS, chromosome, logfilename, outfilename,
- outfile, stringency, normalize, useMulti, doControl, withFlag, combine5p, factor, listPeak):
+def findPeakRegions(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
+ outfile, useMulti, controlBAM, combine5p):
outregions = []
allregions = []
print "chromosome %s" % (chromosome)
previousHit = - 1 * regionFinder.maxSpacing
readStartPositions = [-1]
- totalWeight = 0
+ totalWeight = 0.0
uniqueReadCount = 0
reads = []
- numStarts = 0
- badRegion = False
- hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True,
- strand=regionFinder.stranded, combine5p=combine5p)
+ numStartsInRegion = 0
- maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
- if regionFinder.shiftValue == "learn":
- learnShift(regionFinder, hitDict, controlRDS, chromosome, maxCoord, logfilename, outfilename,
- outfile, numStarts, stringency, normalize, useMulti, doControl)
+ for alignedread in sampleBAM.fetch(chromosome):
+ if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+ continue
- for read in hitDict[chromosome]:
- pos = read["start"]
+ pos = alignedread.pos
if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
lastReadPos = readStartPositions[-1]
lastBasePosition = lastReadPos + regionFinder.readlen - 1
newRegionIndex = regionFinder.statistics["index"] + 1
if regionFinder.doDirectionality:
- region = Region.DirectionalRegion(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=factor, numReads=totalWeight)
+ region = Region.DirectionalRegion(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=regionFinder.regionLabel,
+ numReads=totalWeight)
else:
- region = Region.Region(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=factor, numReads=totalWeight)
+ region = Region.Region(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=regionFinder.regionLabel, numReads=totalWeight)
- if normalize:
+ if regionFinder.normalize:
region.numReads /= regionFinder.sampleRDSsize
allregions.append(int(region.numReads))
regionLength = lastReadPos - region.start
- if regionPassesCriteria(region.numReads, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen):
- region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos,
- useMulti, doControl, normalize)
+ if regionPassesCriteria(regionFinder, region.numReads, numStartsInRegion, regionLength):
+ region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
if region.foldRatio >= regionFinder.minRatio:
# first pass, with absolute numbers
peak = findPeak(reads, region.start, regionLength, regionFinder.readlen, doWeight=True, leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
if regionFinder.doTrim:
try:
- lastReadPos = trimRegion(region, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.readlen, regionFinder.doDirectionality,
- normalize, regionFinder.sampleRDSsize)
+ lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.sampleRDSsize)
except IndexError:
- badRegion = True
+ regionFinder.statistics["badRegionTrim"] += 1
continue
- region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos,
- useMulti, doControl, normalize)
+ region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
- # just in case it changed, use latest data
try:
bestPos = peak.topPos[0]
peakScore = peak.smoothArray[bestPos]
- if normalize:
+ if regionFinder.normalize:
peakScore /= regionFinder.sampleRDSsize
- except:
+ except (IndexError, AttributeError, ZeroDivisionError):
continue
- if listPeak:
- region.peakDescription= "%d\t%.1f" % (region.start + bestPos, peakScore)
+ if regionFinder.listPeak:
+ region.peakDescription = "%d\t%.1f" % (region.start + bestPos, peakScore)
if useMulti:
- setMultireadPercentage(region, hitRDS, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos, normalize, regionFinder.doTrim)
+ setMultireadPercentage(region, sampleBAM, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos,
+ regionFinder.normalize, regionFinder.doTrim)
region.shift = peak.shift
# check that we still pass threshold
regionLength = lastReadPos - region.start
plusRatio = float(peak.numPlus)/peak.numHits
- if regionAndPeakPass(region, regionFinder.minHits, regionFinder.minRatio, regionLength, regionFinder.readlen, peakScore, regionFinder.minPeak,
- regionFinder.minPlusRatio, regionFinder.maxPlusRatio, plusRatio):
+ if regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio):
try:
updateRegion(region, regionFinder.doDirectionality, regionFinder.leftPlusRatio, peak.numLeftPlus, peak.numPlus, plusRatio)
regionFinder.statistics["index"] += 1
regionFinder.statistics["failed"] += 1
readStartPositions = []
- totalWeight = 0
+ totalWeight = 0.0
uniqueReadCount = 0
reads = []
- numStarts = 0
- if badRegion:
- badRegion = False
- regionFinder.statistics["badRegionTrim"] += 1
+ numStartsInRegion = 0
if pos not in readStartPositions:
- numStarts += 1
+ numStartsInRegion += 1
readStartPositions.append(pos)
- weight = read["weight"]
+ weight = 1.0/alignedread.opt('NH')
totalWeight += weight
if weight == 1.0:
uniqueReadCount += 1
- reads.append({"start": pos, "sense": read["sense"], "weight": weight})
+ reads.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
previousHit = pos
return allregions, outregions
-def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normalize, useMulti, factor):
+def getReadSense(read):
+ if read.is_reverse:
+ sense = "-"
+ else:
+ sense = "+"
+
+ return sense
+
+
+def doNotProcessRead(read, doMulti=False, strand="both", combine5p=False):
+ if read.opt('NH') > 1 and not doMulti:
+ return True
+
+ if strand == "+" and read.is_reverse:
+ return True
+
+ if strand == "-" and not read.is_reverse:
+ return True
+
+ return False
+
+def getMaxCoordinate(samfile, chr, doMulti=False):
+ maxCoord = 0
+ for alignedread in samfile.fetch(chr):
+ if alignedread.opt('NH') > 1:
+ if doMulti:
+ maxCoord = max(maxCoord, alignedread.pos)
+ else:
+ maxCoord = max(maxCoord, alignedread.pos)
+
+ return maxCoord
+
+
+def findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxCoord, chromosome, useMulti):
+ #TODO: this is *almost* the same calculation - there are small yet important differences
print "calculating background..."
previousHit = - 1 * regionFinder.maxSpacing
currentHitList = [-1]
- currentTotalWeight = 0
+ currentTotalWeight = 0.0
currentReadList = []
backregions = []
numStarts = 0
badRegion = False
- hitDict = controlRDS.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, findallOptimize=True)
- maxCoord = controlRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
- for read in hitDict[chromosome]:
- pos = read["start"]
+ for alignedread in controlBAM.fetch(chromosome):
+ if doNotProcessRead(alignedread, doMulti=useMulti):
+ continue
+
+ pos = alignedread.pos
if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
lastReadPos = currentHitList[-1]
lastBasePosition = lastReadPos + regionFinder.readlen - 1
- region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=factor, numReads=currentTotalWeight)
- if normalize:
+ region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
+ if regionFinder.normalize:
region.numReads /= regionFinder.controlRDSsize
backregions.append(int(region.numReads))
- region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=factor, numReads=currentTotalWeight)
+ region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
regionLength = lastReadPos - region.start
- if regionPassesCriteria(region.numReads, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen):
- numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
- if normalize:
+ if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
+ numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
+ if regionFinder.normalize:
numMock /= regionFinder.sampleRDSsize
foldRatio = region.numReads / numMock
if regionFinder.doTrim:
try:
- lastReadPos = trimRegion(region, peak, lastReadPos, 20., currentReadList, regionFinder.readlen, regionFinder.doDirectionality,
- normalize, regionFinder.controlRDSsize)
+ lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, 20., currentReadList, regionFinder.controlRDSsize)
except IndexError:
badRegion = True
continue
- numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
- if normalize:
+ numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
+ if regionFinder.normalize:
numMock /= regionFinder.sampleRDSsize
foldRatio = region.numReads / numMock
- # just in case it changed, use latest data
try:
bestPos = peak.topPos[0]
peakScore = peak.smoothArray[bestPos]
continue
# normalize to RPM
- if normalize:
+ if regionFinder.normalize:
peakScore /= regionFinder.controlRDSsize
# check that we still pass threshold
regionLength = lastReadPos - region.start
- if regionPassesCriteria(region.numReads, regionFinder.minHits, foldRatio, regionFinder.minRatio, regionLength, regionFinder.readlen):
- updateControlStatistics(regionFinder, peak, region.numReads, peakScore)
+ if regionPassesCriteria(regionFinder, region.numReads, foldRatio, regionLength):
+ regionFinder.updateControlStatistics(peak, region.numReads, peakScore)
currentHitList = []
- currentTotalWeight = 0
+ currentTotalWeight = 0.0
currentReadList = []
numStarts = 0
if badRegion:
numStarts += 1
currentHitList.append(pos)
- weight = read["weight"]
+ weight = 1.0/alignedread.opt('NH')
currentTotalWeight += weight
- currentReadList.append({"start": pos, "sense": read["sense"], "weight": weight})
+ currentReadList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
previousHit = pos
return backregions
-def learnShift(regionFinder, hitDict, controlRDS, chromosome, maxCoord, logfilename, outfilename,
- outfile, numStarts, stringency, normalize, useMulti, doControl):
+def learnShift(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
+ outfile, useMulti, controlBAM, combine5p):
print "learning shift.... will need at least 30 training sites"
+ stringency = regionFinder.stringency
previousHit = -1 * regionFinder.maxSpacing
positionList = [-1]
- totalWeight = 0
+ totalWeight = 0.0
readList = []
shiftDict = {}
count = 0
- for read in hitDict[chromosome]:
- pos = read["start"]
+ numStarts = 0
+ for alignedread in sampleBAM.fetch(chromosome):
+ if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+ continue
+
+ pos = alignedread.pos
if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
- if normalize:
+ if regionFinder.normalize:
totalWeight /= regionFinder.sampleRDSsize
regionStart = positionList[0]
regionStop = positionList[-1]
regionLength = regionStop - regionStart
- if regionPassesCriteria(totalWeight, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen, stringency=stringency):
- foldRatio = getFoldRatio(regionFinder, controlRDS, totalWeight, chromosome, regionStart, regionStop, useMulti, doControl, normalize)
+ if regionPassesCriteria(regionFinder, totalWeight, numStarts, regionLength, stringency=stringency):
+ foldRatio = getFoldRatio(regionFinder, controlBAM, totalWeight, chromosome, regionStart, regionStop, useMulti)
if foldRatio >= regionFinder.minRatio:
updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
count += 1
positionList = []
- totalWeight = 0
+ totalWeight = 0.0
readList = []
if pos not in positionList:
numStarts += 1
positionList.append(pos)
- weight = read["weight"]
+ weight = 1.0/alignedread.opt('NH')
totalWeight += weight
- readList.append({"start": pos, "sense": read["sense"], "weight": weight})
+ readList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
previousHit = pos
outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
print outline
writeLog(logfilename, versionString, outfilename + outline)
- regionFinder.shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
- outline = "#picked shiftValue to be %d" % regionFinder.shiftValue
+ shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
+ outline = "#picked shiftValue to be %d" % shiftValue
print outline
print >> outfile, outline
writeLog(logfilename, versionString, outfilename + outline)
+ return shiftValue, shiftDict
+
def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
return abs(pos - previousHit) > maxSpacing or pos == maxCoord
-def regionPassesCriteria(sumAll, minHits, numStarts, minRatio, regionLength, readlen, stringency=1):
- return sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen
+def regionPassesCriteria(regionFinder, sumAll, numStarts, regionLength, stringency=1):
+ minTotalReads = stringency * regionFinder.minHits
+ minNumReadStarts = stringency * regionFinder.minRatio
+ minRegionLength = stringency * regionFinder.readlen
+
+ return sumAll >= minTotalReads and numStarts >= minNumReadStarts and regionLength > minRegionLength
-def trimRegion(region, peak, regionStop, trimValue, currentReadList, readlen, doDirectionality, normalize, hitRDSsize):
+def trimRegion(region, regionFinder, peak, regionStop, trimValue, currentReadList, totalReadCount):
bestPos = peak.topPos[0]
peakScore = peak.smoothArray[bestPos]
- if normalize:
- peakScore /= hitRDSsize
+ if regionFinder.normalize:
+ peakScore /= totalReadCount
minSignalThresh = trimValue * peakScore
start = findStartEdgePosition(peak, minSignalThresh)
regionStop = region.start + stop
region.start += start
- trimmedPeak = findPeak(currentReadList, region.start, regionStop - region.start, readlen, doWeight=True, leftPlus=doDirectionality, shift=peak.shift)
+
+ trimmedPeak = findPeak(currentReadList, region.start, regionStop - region.start, regionFinder.readlen, doWeight=True,
+ leftPlus=regionFinder.doDirectionality, shift=peak.shift)
+
peak.numPlus = trimmedPeak.numPlus
peak.numLeftPlus = trimmedPeak.numLeftPlus
peak.topPos = trimmedPeak.topPos
peak.smoothArray = trimmedPeak.smoothArray
region.numReads = trimmedPeak.numHits
- if normalize:
- region.numReads /= hitRDSsize
+ if regionFinder.normalize:
+ region.numReads /= totalReadCount
- region.stop = regionStop + readlen - 1
-
+ region.stop = regionStop + regionFinder.readlen - 1
+
return regionStop
return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
-def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regionStop, useMulti, doControl, normalize):
- if doControl:
- numMock = 1. + controlRDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
- if normalize:
+def getFoldRatio(regionFinder, controlBAM, sumAll, chromosome, regionStart, regionStop, useMulti):
+ """ Fold ratio calculated is total read weight over control
+ """
+ #TODO: this needs to be generalized as there is a point at which we want to use the sampleRDS instead of controlRDS
+ if regionFinder.doControl:
+ numMock = 1. + countReadsInRegion(controlBAM, chromosome, regionStart, regionStop, countMulti=useMulti)
+ if regionFinder.normalize:
numMock /= regionFinder.controlRDSsize
foldRatio = sumAll / numMock
return foldRatio
+def countReadsInRegion(bamfile, chr, start, end, uniqs=True, countMulti=False, countSplices=False):
+ count = 0.0
+ for alignedread in bamfile.fetch(chr, start, end):
+ if alignedread.opt('NH') > 1:
+ if countMulti:
+ if isSpliceEntry(alignedread.cigar):
+ if countSplices:
+ count += 1.0/alignedread.opt('NH')
+ else:
+ count += 1.0/alignedread.opt('NH')
+ elif uniqs:
+ if isSpliceEntry(alignedread.cigar):
+ if countSplices:
+ count += 1.0
+ else:
+ count += 1.0
+
+ return count
+
def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen):
peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto")
try:
return shiftValue
-def updateControlStatistics(regionFinder, peak, sumAll, peakScore):
-
- plusRatio = float(peak.numPlus)/peak.numHits
- if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio:
- if regionFinder.doDirectionality:
- if regionFinder.leftPlusRatio < peak.numLeft / peak.numPlus:
- regionFinder.statistics["mIndex"] += 1
- regionFinder.statistics["mTotal"] += sumAll
- else:
- regionFinder.statistics["failed"] += 1
- else:
- # we have a region, but didn't check for directionality
- regionFinder.statistics["mIndex"] += 1
- regionFinder.statistics["mTotal"] += sumAll
-
-
def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRatio, multiP,
peakDescription, shift, doDirectionality, leftPlusRatio, numLeft,
numPlus, plusRatio):
return region
-def setMultireadPercentage(region, hitRDS, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
+def setMultireadPercentage(region, sampleBAM, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
if doTrim:
- sumMulti = hitRDS.getMultiCount(chromosome, region.start, lastReadPos)
+ sumMulti = 0.0
+ for alignedread in sampleBAM.fetch(chromosome, region.start, lastReadPos):
+ if alignedread.opt('NH') > 1:
+ sumMulti += 1.0/alignedread.opt('NH')
else:
sumMulti = currentTotalWeight - currentUniqueCount
except ZeroDivisionError:
return
- region.multiP = multiP
+ region.multiP = min(multiP, 100.)
-def regionAndPeakPass(region, minHits, minRatio, regionLength, readlen, peakScore, minPeak, minPlusRatio, maxPlusRatio, plusRatio):
+def regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio):
regionPasses = False
- if regionPassesCriteria(region.numReads, minHits, region.foldRatio, minRatio, regionLength, readlen):
- if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
+ if regionPassesCriteria(regionFinder, region.numReads, region.foldRatio, regionLength):
+ #TODO: here is where the test dataset is failing
+ if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio:
regionPasses = True
return regionPasses
-def updateRegion(region,
- doDirectionality, leftPlusRatio, numLeft,
- numPlus, plusRatio):
+def updateRegion(region, doDirectionality, leftPlusRatio, numLeft, numPlus, plusRatio):
if doDirectionality:
if leftPlusRatio < numLeft / numPlus:
raise RegionDirectionError
-def writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
- allregions, header):
-
- writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
- allregions, header, backregions=[], pValueType="self")
-
-
-def writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+def writeChromosomeResults(regionFinder, outregions, outfile, shiftDict,
allregions, header, backregions=[], pValueType="none"):
print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"]
- if doPvalue:
+ if regionFinder.doPvalue:
if pValueType == "self":
poissonmean = calculatePoissonMean(allregions)
else:
poissonmean = calculatePoissonMean(backregions)
print header
- writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=regionFinder.shiftValue, reportshift=reportshift, shiftDict=shiftDict)
+ for region in outregions:
+ if regionFinder.shiftValue == "auto" and regionFinder.reportshift:
+ try:
+ shiftDict[region.shift] += 1
+ except KeyError:
+ shiftDict[region.shift] = 1
+
+ outline = getRegionString(region, regionFinder.reportshift)
+
+ # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
+ if regionFinder.doPvalue:
+ pValue = calculatePValue(int(region.numReads), poissonmean)
+ outline += "\t%1.2g" % pValue
+
+ print outline
+ print >> outfile, outline
def calculatePoissonMean(dataList):
return poissonmean
-def writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=0, reportshift=False, shiftDict={}):
- for region in outregions:
- if shiftValue == "auto" and reportshift:
- try:
- shiftDict[region.shift] += 1
- except KeyError:
- shiftDict[region.shift] = 1
-
- outline = getRegionString(region, reportshift)
-
- # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
- if doPvalue:
- sumAll = int(region.numReads)
- pValue = calculatePValue(sumAll, poissonmean)
- outline += "\t%1.2g" % pValue
-
- print outline
- print >> outfile, outline
-
-
def calculatePValue(sum, poissonmean):
pValue = math.exp(-poissonmean)
- #TODO: 798: DeprecationWarning: integer argument expected, got float - for i in xrange(sum)
for i in xrange(sum):
pValue *= poissonmean
pValue /= i+1
return outline
-def getFooter(regionFinder, shiftDict, reportshift, doRevBackground):
- index = regionFinder.statistics["index"]
- mIndex = regionFinder.statistics["mIndex"]
- footerLines = ["#stats:\t%.1f RPM in %d regions" % (regionFinder.statistics["total"], index)]
- if regionFinder.doDirectionality:
- footerLines.append("#\t\t%d additional regions failed directionality filter" % regionFinder.statistics["failed"])
-
- if doRevBackground:
- try:
- percent = min(100. * (float(mIndex)/index), 100.)
- except ZeroDivisionError:
- percent = 0.
-
- footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, regionFinder.statistics["mTotal"], percent))
-
- if regionFinder.shiftValue == "auto" and reportshift:
- bestShift = getBestShiftInDict(shiftDict)
- footerLines.append("#mode of shift values: %d" % bestShift)
-
- if regionFinder.statistics["badRegionTrim"] > 0:
- footerLines.append("#%d regions discarded due to trimming problems" % regionFinder.statistics["badRegionTrim"])
-
- return string.join(footerLines, "\n")
-
-
def getBestShiftInDict(shiftDict):
return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0]
if __name__ == "__main__":
- main(sys.argv)
+ main(sys.argv)
\ No newline at end of file