2 usage: python $ERANGEPATH/findall.py label samplebamfile regionoutfile
3 [--control controlbamfile] [--minimum minHits] [--ratio minRatio]
4 [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
5 [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
6 [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
7 [--revbackground] [--pvalue self|back|none] [--nodirectionality]
8 [--strandfilter plus/minus] [--trimvalue percent] [--notrim]
9 [--cache pages] [--log altlogfile] [--flag aflag] [--append] [--RNA]
11 where values in brackets are optional and label is an arbitrary string.
13 Use -ratio (default 4 fold) to set the minimum fold enrichment
14 over the control, -minimum (default 4) is the minimum number of reads
15 (RPM) within the region, and -spacing (default readlen) to set the maximum
16 distance between reads in the region. -listPeak lists the peak of the
17 region. Peaks mut be higher than -minPeak (default 0.5 RPM).
18 Pvalues are calculated from the sample (change with -pvalue),
19 unless the -revbackground flag and a control RDS file are provided.
21 By default, all numbers and parameters are on a reads per
22 million (RPM) basis. -raw will treat all settings, ratios and reported
23 numbers as raw counts rather than RPM. Use -notrim to turn off region
24 trimming and -trimvalue to control trimming (default 10% of peak signal)
26 The peak finder uses minimal directionality information that can
27 be turned off with -nodirectionality ; the fraction of + strand reads
28 required to be to the left of the peak (default 0.3) can be set with
29 -leftPlus ; -minPlus and -maxPlus change the minimum/maximum fraction
30 of plus reads in a region, which (defaults 0.25 and 0.75, respectively).
32 Use -shift to shift reads either by half the expected
33 fragment length (default 0 bp) or '-shift learn ' to learn the shift
34 based on the first chromosome. If you prefer to learn the shift
35 manually, use -autoshift to calculate a per-region shift value, which
36 can be reported using -reportshift. -strandfilter should only be used
37 when explicitely calling unshifted stranded peaks from non-ChIP-seq
38 data such as directional RNA-seq. regionoutfile is written over by
39 default unless given the -append flag.
54 from commoncode import writeLog, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption, isSpliceEntry
58 versionString = "findall: version 3.2.1"
61 class RegionDirectionError(Exception):
66 def __init__(self, label, minRatio=4.0, minPeak=0.5, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, strandfilter="",
67 minHits=4.0, trimValue=0.1, doTrim=True, doDirectionality=True, shiftValue=0, maxSpacing=50, withFlag="",
68 normalize=True, listPeak=False, reportshift=False, stringency=1.0, controlfile=None, doRevBackground=False):
70 self.statistics = {"index": 0,
78 self.regionLabel = label
79 self.rnaSettings = False
80 self.controlRDSsize = 1.0
81 self.sampleRDSsize = 1.0
82 self.minRatio = minRatio
83 self.minPeak = minPeak
84 self.leftPlusRatio = leftPlusRatio
85 self.stranded = "both"
86 if strandfilter == "plus":
90 elif strandfilter == "minus":
95 if minRatio < minPeak:
96 self.minPeak = minRatio
98 self.minPlusRatio = minPlusRatio
99 self.maxPlusRatio = maxPlusRatio
100 self.strandfilter = strandfilter
101 self.minHits = minHits
102 self.trimValue = trimValue
104 self.doDirectionality = doDirectionality
107 self.trimString = string.join(["%2.1f" % (100. * self.trimValue), "%"], "")
109 self.trimString = "none"
111 self.shiftValue = shiftValue
112 self.maxSpacing = maxSpacing
113 self.withFlag = withFlag
114 self.normalize = normalize
115 self.listPeak = listPeak
116 self.reportshift = reportshift
117 self.stringency = max(stringency, 1.0)
118 self.controlfile = controlfile
119 self.doControl = self.controlfile is not None
120 self.doPvalue = False
121 self.doRevBackground = doRevBackground
124 def useRNASettings(self, readlen):
125 self.rnaSettings = True
128 self.doDirectionality = False
129 self.maxSpacing = readlen
138 headerFields = ["#regionID\tchrom\tstart\tstop", countType, "fold\tmulti%"]
140 if self.doDirectionality:
141 headerFields.append("plus%\tleftPlus%")
144 headerFields.append("peakPos\tpeakHeight")
147 headerFields.append("readShift")
150 headerFields.append("pValue")
152 return string.join(headerFields, "\t")
155 def printSettings(self, ptype, useMulti, pValueType):
157 self.printStatusMessages(ptype, useMulti)
158 self.printOptionsSummary(useMulti, pValueType)
161 def printStatusMessages(self, ptype, useMulti):
162 if self.shiftValue == "learn":
163 print "Will try to learn shift"
166 print "Normalizing to RPM"
168 if self.doRevBackground:
169 print "Swapping IP and background to calculate FDR"
172 if ptype in ["NONE", "SELF"]:
174 elif ptype == "BACK":
175 if self.doControl and self.doRevBackground:
178 print "must have a control dataset and -revbackground for pValue type 'back'"
180 print "could not use pValue type : %s" % ptype
182 if self.withFlag != "":
183 print "restrict to flag = %s" % self.withFlag
186 print "using unique reads only"
189 print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
191 if self.strandfilter == "plus":
192 print "only analyzing reads on the plus strand"
193 elif self.strandfilter == "minus":
194 print "only analyzing reads on the minus strand"
197 def printOptionsSummary(self, useMulti, pValueType):
199 print "\nenforceDirectionality=%s listPeak=%s nomulti=%s " % (self.doDirectionality, self.listPeak, not useMulti)
200 print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)
202 print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
204 print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
207 def getAnalysisDescription(self, hitfile, useMulti, pValueType):
209 description = ["#ERANGE %s" % versionString]
211 description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, self.controlfile, self.controlRDSsize))
213 description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize))
215 if self.withFlag != "":
216 description.append("#restrict to Flag = %s" % self.withFlag)
218 description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s" % (self.doDirectionality, self.listPeak, not useMulti))
219 description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded))
221 description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
223 description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
225 return string.join(description, "\n")
228 def getFooter(self, bestShift):
229 index = self.statistics["index"]
230 mIndex = self.statistics["mIndex"]
231 footerLines = ["#stats:\t%.1f RPM in %d regions" % (self.statistics["total"], index)]
232 if self.doDirectionality:
233 footerLines.append("#\t\t%d additional regions failed directionality filter" % self.statistics["failed"])
235 if self.doRevBackground:
237 percent = min(100. * (float(mIndex)/index), 100.)
238 except ZeroDivisionError:
241 footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, self.statistics["mTotal"], percent))
243 if self.shiftValue == "auto" and self.reportshift:
245 footerLines.append("#mode of shift values: %d" % bestShift)
247 if self.statistics["badRegionTrim"] > 0:
248 footerLines.append("#%d regions discarded due to trimming problems" % self.statistics["badRegionTrim"])
250 return string.join(footerLines, "\n")
253 def updateControlStatistics(self, peak, sumAll, peakScore):
255 plusRatio = float(peak.numPlus)/peak.numHits
256 if peakScore >= self.minPeak and self.minPlusRatio <= plusRatio <= self.maxPlusRatio:
257 if self.doDirectionality:
258 if self.leftPlusRatio < peak.numLeftPlus / peak.numPlus:
259 self.statistics["mIndex"] += 1
260 self.statistics["mTotal"] += sumAll
262 self.statistics["failed"] += 1
264 # we have a region, but didn't check for directionality
265 self.statistics["mIndex"] += 1
266 self.statistics["mTotal"] += sumAll
277 parser = makeParser()
278 (options, args) = parser.parse_args(argv[1:])
286 outfilename = args[2]
290 if options.autoshift:
293 if options.shift is not None:
295 shiftValue = int(options.shift)
297 if options.shift == "learn":
308 regionFinder = RegionFinder(factor, minRatio=options.minRatio, minPeak=options.minPeak, minPlusRatio=options.minPlusRatio,
309 maxPlusRatio=options.maxPlusRatio, leftPlusRatio=options.leftPlusRatio, strandfilter=options.strandfilter,
310 minHits=options.minHits, trimValue=options.trimValue, doTrim=options.doTrim,
311 doDirectionality=options.doDirectionality, shiftValue=shiftValue, maxSpacing=options.maxSpacing,
312 withFlag=options.withFlag, normalize=options.normalize, listPeak=options.listPeak,
313 reportshift=options.reportshift, stringency=options.stringency, controlfile=options.controlfile,
314 doRevBackground=options.doRevBackground)
316 findall(regionFinder, hitfile, outfilename, options.logfilename, outputMode, options.rnaSettings,
317 options.ptype, options.useMulti, options.combine5p)
323 parser = optparse.OptionParser(usage=usage)
324 parser.add_option("--control", dest="controlfile")
325 parser.add_option("--minimum", type="float", dest="minHits")
326 parser.add_option("--ratio", type="float", dest="minRatio")
327 parser.add_option("--spacing", type="int", dest="maxSpacing")
328 parser.add_option("--listPeak", action="store_true", dest="listPeak")
329 parser.add_option("--shift", dest="shift")
330 parser.add_option("--learnFold", type="float", dest="stringency")
331 parser.add_option("--noshift", action="store_true", dest="noShift")
332 parser.add_option("--autoshift", action="store_true", dest="autoshift")
333 parser.add_option("--reportshift", action="store_true", dest="reportshift")
334 parser.add_option("--nomulti", action="store_false", dest="useMulti")
335 parser.add_option("--minPlus", type="float", dest="minPlusRatio")
336 parser.add_option("--maxPlus", type="float", dest="maxPlusRatio")
337 parser.add_option("--leftPlus", type="float", dest="leftPlusRatio")
338 parser.add_option("--minPeak", type="float", dest="minPeak")
339 parser.add_option("--raw", action="store_false", dest="normalize")
340 parser.add_option("--revbackground", action="store_true", dest="doRevBackground")
341 parser.add_option("--pvalue", dest="ptype")
342 parser.add_option("--nodirectionality", action="store_false", dest="doDirectionality")
343 parser.add_option("--strandfilter", dest="strandfilter")
344 parser.add_option("--trimvalue", type="float", dest="trimValue")
345 parser.add_option("--notrim", action="store_false", dest="doTrim")
346 parser.add_option("--cache", type="int", dest="cachePages")
347 parser.add_option("--log", dest="logfilename")
348 parser.add_option("--flag", dest="withFlag")
349 parser.add_option("--append", action="store_true", dest="doAppend")
350 parser.add_option("--RNA", action="store_true", dest="rnaSettings")
351 parser.add_option("--combine5p", action="store_true", dest="combine5p")
353 configParser = getConfigParser()
355 minHits = getConfigFloatOption(configParser, section, "minHits", 4.0)
356 minRatio = getConfigFloatOption(configParser, section, "minRatio", 4.0)
357 maxSpacing = getConfigIntOption(configParser, section, "maxSpacing", 50)
358 listPeak = getConfigBoolOption(configParser, section, "listPeak", False)
359 shift = getConfigOption(configParser, section, "shift", None)
360 stringency = getConfigFloatOption(configParser, section, "stringency", 4.0)
361 noshift = getConfigBoolOption(configParser, section, "noshift", False)
362 autoshift = getConfigBoolOption(configParser, section, "autoshift", False)
363 reportshift = getConfigBoolOption(configParser, section, "reportshift", False)
364 minPlusRatio = getConfigFloatOption(configParser, section, "minPlusRatio", 0.25)
365 maxPlusRatio = getConfigFloatOption(configParser, section, "maxPlusRatio", 0.75)
366 leftPlusRatio = getConfigFloatOption(configParser, section, "leftPlusRatio", 0.3)
367 minPeak = getConfigFloatOption(configParser, section, "minPeak", 0.5)
368 normalize = getConfigBoolOption(configParser, section, "normalize", True)
369 logfilename = getConfigOption(configParser, section, "logfilename", "findall.log")
370 withFlag = getConfigOption(configParser, section, "withFlag", "")
371 doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True)
372 trimValue = getConfigFloatOption(configParser, section, "trimValue", 0.1)
373 doTrim = getConfigBoolOption(configParser, section, "doTrim", True)
374 doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
375 rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False)
376 cachePages = getConfigOption(configParser, section, "cachePages", None)
377 ptype = getConfigOption(configParser, section, "ptype", "")
378 controlfile = getConfigOption(configParser, section, "controlfile", None)
379 doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False)
380 useMulti = getConfigBoolOption(configParser, section, "useMulti", True)
381 strandfilter = getConfigOption(configParser, section, "strandfilter", "")
382 combine5p = getConfigBoolOption(configParser, section, "combine5p", False)
384 parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift,
385 stringency=stringency, noshift=noshift, autoshift=autoshift, reportshift=reportshift,
386 minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak,
387 normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality,
388 trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings,
389 cachePages=cachePages, ptype=ptype, controlfile=controlfile, doRevBackground=doRevBackground, useMulti=useMulti,
390 strandfilter=strandfilter, combine5p=combine5p)
395 def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False,
396 ptype="", useMulti=True, combine5p=False):
398 writeLog(logfilename, versionString, string.join(sys.argv[1:]))
400 if regionFinder.doControl:
402 controlBAM = pysam.Samfile(regionFinder.controlfile, "rb")
403 regionFinder.controlRDSsize = int(getHeaderComment(controlBAM.header, "Total")) / 1000000.
406 sampleBAM = pysam.Samfile(hitfile, "rb")
407 regionFinder.sampleRDSsize = int(getHeaderComment(sampleBAM.header, "Total")) / 1000000.
408 pValueType = getPValueType(ptype, regionFinder.doControl, regionFinder.doRevBackground)
409 regionFinder.doPvalue = not pValueType == "none"
410 regionFinder.readlen = int(getHeaderComment(sampleBAM.header, "ReadLength"))
412 regionFinder.useRNASettings(regionFinder.readlen)
414 regionFinder.printSettings(ptype, useMulti, pValueType)
415 outfile = open(outfilename, outputMode)
416 header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType)
418 chromList = getChromosomeListToProcess(sampleBAM, controlBAM)
419 for chromosome in chromList:
420 #TODO: Really? Use first chr shift value for all of them
421 maxSampleCoord = getMaxCoordinate(sampleBAM, chromosome, doMulti=useMulti)
422 if regionFinder.shiftValue == "learn":
423 regionFinder.shiftValue = learnShift(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
425 allregions, outregions = findPeakRegions(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
426 if regionFinder.doRevBackground:
427 maxControlCoord = getMaxCoordinate(controlBAM, chromosome, doMulti=useMulti)
428 backregions = findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxControlCoord, chromosome, useMulti)
433 writeChromosomeResults(regionFinder, outregions, outfile, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType)
436 bestShift = getBestShiftInDict(shiftDict)
440 footer = regionFinder.getFooter(bestShift)
442 print >> outfile, footer
444 writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1])
447 def getHeaderComment(bamHeader, commentKey):
448 for comment in bamHeader["CO"]:
449 fields = comment.split("\t")
450 if fields[0] == commentKey:
456 def getPValueType(ptype, doControl, doRevBackground):
458 if ptype in ["NONE", "SELF", "BACK"]:
461 elif ptype == "SELF":
463 elif ptype == "BACK":
464 if doControl and doRevBackground:
466 elif doRevBackground:
472 def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType):
473 print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, pValueType)
474 header = regionFinder.getHeader()
475 print >> outfile, header
480 def getChromosomeListToProcess(sampleBAM, controlBAM=None):
481 if controlBAM is not None:
482 chromosomeList = [chrom for chrom in sampleBAM.references if chrom in controlBAM.references and chrom != "chrM"]
484 chromosomeList = [chrom for chrom in sampleBAM.references if chrom != "chrM"]
486 return chromosomeList
489 def findPeakRegions(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
490 outfile, useMulti, controlBAM, combine5p):
494 print "chromosome %s" % (chromosome)
495 previousHit = - 1 * regionFinder.maxSpacing
496 readStartPositions = [-1]
500 numStartsInRegion = 0
502 for alignedread in sampleBAM.fetch(chromosome):
503 if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
506 pos = alignedread.pos
507 if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
508 lastReadPos = readStartPositions[-1]
509 lastBasePosition = lastReadPos + regionFinder.readlen - 1
510 newRegionIndex = regionFinder.statistics["index"] + 1
511 if regionFinder.doDirectionality:
512 region = Region.DirectionalRegion(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=regionFinder.regionLabel,
513 numReads=totalWeight)
515 region = Region.Region(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=regionFinder.regionLabel, numReads=totalWeight)
517 if regionFinder.normalize:
518 region.numReads /= regionFinder.sampleRDSsize
520 allregions.append(int(region.numReads))
521 regionLength = lastReadPos - region.start
522 if regionPassesCriteria(regionFinder, region.numReads, numStartsInRegion, regionLength):
523 region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
525 if region.foldRatio >= regionFinder.minRatio:
526 # first pass, with absolute numbers
527 peak = findPeak(reads, region.start, regionLength, regionFinder.readlen, doWeight=True, leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
528 if regionFinder.doTrim:
530 lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.sampleRDSsize)
532 regionFinder.statistics["badRegionTrim"] += 1
535 region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
538 bestPos = peak.topPos[0]
539 peakScore = peak.smoothArray[bestPos]
540 if regionFinder.normalize:
541 peakScore /= regionFinder.sampleRDSsize
542 except (IndexError, AttributeError, ZeroDivisionError):
545 if regionFinder.listPeak:
546 region.peakDescription = "%d\t%.1f" % (region.start + bestPos, peakScore)
549 setMultireadPercentage(region, sampleBAM, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos,
550 regionFinder.normalize, regionFinder.doTrim)
552 region.shift = peak.shift
553 # check that we still pass threshold
554 regionLength = lastReadPos - region.start
555 plusRatio = float(peak.numPlus)/peak.numHits
556 if regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio):
558 updateRegion(region, regionFinder.doDirectionality, regionFinder.leftPlusRatio, peak.numLeftPlus, peak.numPlus, plusRatio)
559 regionFinder.statistics["index"] += 1
560 outregions.append(region)
561 regionFinder.statistics["total"] += region.numReads
562 except RegionDirectionError:
563 regionFinder.statistics["failed"] += 1
565 readStartPositions = []
569 numStartsInRegion = 0
571 if pos not in readStartPositions:
572 numStartsInRegion += 1
574 readStartPositions.append(pos)
575 weight = 1.0/alignedread.opt('NH')
576 totalWeight += weight
580 reads.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
583 return allregions, outregions
586 def getReadSense(read):
595 def doNotProcessRead(read, doMulti=False, strand="both", combine5p=False):
596 if read.opt('NH') > 1 and not doMulti:
599 if strand == "+" and read.is_reverse:
602 if strand == "-" and not read.is_reverse:
608 def getMaxCoordinate(samfile, chr, doMulti=False):
610 for alignedread in samfile.fetch(chr):
611 if alignedread.opt('NH') > 1:
613 maxCoord = max(maxCoord, alignedread.pos)
615 maxCoord = max(maxCoord, alignedread.pos)
620 def findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxCoord, chromosome, useMulti):
621 #TODO: this is *almost* the same calculation - there are small yet important differences
622 print "calculating background..."
623 previousHit = - 1 * regionFinder.maxSpacing
624 currentHitList = [-1]
625 currentTotalWeight = 0.0
630 for alignedread in controlBAM.fetch(chromosome):
631 if doNotProcessRead(alignedread, doMulti=useMulti):
634 pos = alignedread.pos
635 if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
636 lastReadPos = currentHitList[-1]
637 lastBasePosition = lastReadPos + regionFinder.readlen - 1
638 region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
639 if regionFinder.normalize:
640 region.numReads /= regionFinder.controlRDSsize
642 backregions.append(int(region.numReads))
643 region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
644 regionLength = lastReadPos - region.start
645 if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
646 numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
647 if regionFinder.normalize:
648 numMock /= regionFinder.sampleRDSsize
650 foldRatio = region.numReads / numMock
651 if foldRatio >= regionFinder.minRatio:
652 # first pass, with absolute numbers
653 peak = findPeak(currentReadList, region.start, lastReadPos - region.start, regionFinder.readlen, doWeight=True,
654 leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
656 if regionFinder.doTrim:
658 lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, 20., currentReadList, regionFinder.controlRDSsize)
663 numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
664 if regionFinder.normalize:
665 numMock /= regionFinder.sampleRDSsize
667 foldRatio = region.numReads / numMock
670 bestPos = peak.topPos[0]
671 peakScore = peak.smoothArray[bestPos]
676 if regionFinder.normalize:
677 peakScore /= regionFinder.controlRDSsize
679 # check that we still pass threshold
680 regionLength = lastReadPos - region.start
681 if regionPassesCriteria(regionFinder, region.numReads, foldRatio, regionLength):
682 regionFinder.updateControlStatistics(peak, region.numReads, peakScore)
685 currentTotalWeight = 0.0
690 regionFinder.statistics["badRegionTrim"] += 1
692 if pos not in currentHitList:
695 currentHitList.append(pos)
696 weight = 1.0/alignedread.opt('NH')
697 currentTotalWeight += weight
698 currentReadList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
704 def learnShift(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
705 outfile, useMulti, controlBAM, combine5p):
707 print "learning shift.... will need at least 30 training sites"
708 stringency = regionFinder.stringency
709 previousHit = -1 * regionFinder.maxSpacing
716 for alignedread in sampleBAM.fetch(chromosome):
717 if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
720 pos = alignedread.pos
721 if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
722 if regionFinder.normalize:
723 totalWeight /= regionFinder.sampleRDSsize
725 regionStart = positionList[0]
726 regionStop = positionList[-1]
727 regionLength = regionStop - regionStart
728 if regionPassesCriteria(regionFinder, totalWeight, numStarts, regionLength, stringency=stringency):
729 foldRatio = getFoldRatio(regionFinder, controlBAM, totalWeight, chromosome, regionStart, regionStop, useMulti)
730 if foldRatio >= regionFinder.minRatio:
731 updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
738 if pos not in positionList:
741 positionList.append(pos)
742 weight = 1.0/alignedread.opt('NH')
743 totalWeight += weight
744 readList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
747 outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
748 stringency * regionFinder.minHits,
749 stringency * regionFinder.minRatio,
750 stringency * regionFinder.readlen,
754 writeLog(logfilename, versionString, outfilename + outline)
755 shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
756 outline = "#picked shiftValue to be %d" % shiftValue
758 print >> outfile, outline
759 writeLog(logfilename, versionString, outfilename + outline)
761 return shiftValue, shiftDict
764 def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
765 return abs(pos - previousHit) > maxSpacing or pos == maxCoord
768 def regionPassesCriteria(regionFinder, sumAll, numStarts, regionLength, stringency=1):
769 minTotalReads = stringency * regionFinder.minHits
770 minNumReadStarts = stringency * regionFinder.minRatio
771 minRegionLength = stringency * regionFinder.readlen
773 return sumAll >= minTotalReads and numStarts >= minNumReadStarts and regionLength > minRegionLength
776 def trimRegion(region, regionFinder, peak, regionStop, trimValue, currentReadList, totalReadCount):
777 bestPos = peak.topPos[0]
778 peakScore = peak.smoothArray[bestPos]
779 if regionFinder.normalize:
780 peakScore /= totalReadCount
782 minSignalThresh = trimValue * peakScore
783 start = findStartEdgePosition(peak, minSignalThresh)
784 regionEndPoint = regionStop - region.start - 1
785 stop = findStopEdgePosition(peak, regionEndPoint, minSignalThresh)
787 regionStop = region.start + stop
788 region.start += start
790 trimmedPeak = findPeak(currentReadList, region.start, regionStop - region.start, regionFinder.readlen, doWeight=True,
791 leftPlus=regionFinder.doDirectionality, shift=peak.shift)
793 peak.numPlus = trimmedPeak.numPlus
794 peak.numLeftPlus = trimmedPeak.numLeftPlus
795 peak.topPos = trimmedPeak.topPos
796 peak.smoothArray = trimmedPeak.smoothArray
798 region.numReads = trimmedPeak.numHits
799 if regionFinder.normalize:
800 region.numReads /= totalReadCount
802 region.stop = regionStop + regionFinder.readlen - 1
807 def findStartEdgePosition(peak, minSignalThresh):
809 while not peakEdgeLocated(peak, start, minSignalThresh):
815 def findStopEdgePosition(peak, stop, minSignalThresh):
816 while not peakEdgeLocated(peak, stop, minSignalThresh):
822 def peakEdgeLocated(peak, position, minSignalThresh):
823 return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
826 def getFoldRatio(regionFinder, controlBAM, sumAll, chromosome, regionStart, regionStop, useMulti):
827 """ Fold ratio calculated is total read weight over control
829 #TODO: this needs to be generalized as there is a point at which we want to use the sampleRDS instead of controlRDS
830 if regionFinder.doControl:
831 numMock = 1. + countReadsInRegion(controlBAM, chromosome, regionStart, regionStop, countMulti=useMulti)
832 if regionFinder.normalize:
833 numMock /= regionFinder.controlRDSsize
835 foldRatio = sumAll / numMock
837 foldRatio = regionFinder.minRatio
842 def countReadsInRegion(bamfile, chr, start, end, uniqs=True, countMulti=False, countSplices=False):
844 for alignedread in bamfile.fetch(chr, start, end):
845 if alignedread.opt('NH') > 1:
847 if isSpliceEntry(alignedread.cigar):
849 count += 1.0/alignedread.opt('NH')
851 count += 1.0/alignedread.opt('NH')
853 if isSpliceEntry(alignedread.cigar):
861 def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen):
862 peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto")
864 shiftDict[peak.shift] += 1
866 shiftDict[peak.shift] = 1
869 def getShiftValue(shiftDict, count, logfilename, outfilename):
871 outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
873 writeLog(logfilename, versionString, outfilename + outline)
876 shiftValue = getBestShiftInDict(shiftDict)
882 def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRatio, multiP,
883 peakDescription, shift, doDirectionality, leftPlusRatio, numLeft,
887 if leftPlusRatio < numLeft / numPlus:
888 plusP = plusRatio * 100.
889 leftP = 100. * numLeft / numPlus
890 # we have a region that passes all criteria
891 region = Region.DirectionalRegion(regionStart, regionStop,
892 factor, index, chromosome, sumAll,
893 foldRatio, multiP, plusP, leftP,
894 peakDescription, shift)
897 raise RegionDirectionError
899 # we have a region, but didn't check for directionality
900 region = Region.Region(regionStart, regionStop, factor, index, chromosome,
901 sumAll, foldRatio, multiP, peakDescription, shift)
906 def setMultireadPercentage(region, sampleBAM, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
909 for alignedread in sampleBAM.fetch(chromosome, region.start, lastReadPos):
910 if alignedread.opt('NH') > 1:
911 sumMulti += 1.0/alignedread.opt('NH')
913 sumMulti = currentTotalWeight - currentUniqueCount
917 sumMulti /= hitRDSsize
920 multiP = 100. * (sumMulti / region.numReads)
921 except ZeroDivisionError:
924 region.multiP = min(multiP, 100.)
927 def regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio):
929 if regionPassesCriteria(regionFinder, region.numReads, region.foldRatio, regionLength):
930 if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio:
936 def updateRegion(region, doDirectionality, leftPlusRatio, numLeft, numPlus, plusRatio):
939 if leftPlusRatio < numLeft / numPlus:
940 region.plusP = plusRatio * 100.
941 region.leftP = 100. * numLeft / numPlus
943 raise RegionDirectionError
946 def writeChromosomeResults(regionFinder, outregions, outfile, shiftDict,
947 allregions, header, backregions=[], pValueType="none"):
949 print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"]
950 if regionFinder.doPvalue:
951 if pValueType == "self":
952 poissonmean = calculatePoissonMean(allregions)
954 poissonmean = calculatePoissonMean(backregions)
957 for region in outregions:
958 if regionFinder.shiftValue == "auto" and regionFinder.reportshift:
960 shiftDict[region.shift] += 1
962 shiftDict[region.shift] = 1
964 outline = getRegionString(region, regionFinder.reportshift)
966 # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
967 if regionFinder.doPvalue:
968 pValue = calculatePValue(int(region.numReads), poissonmean)
969 outline += "\t%1.2g" % pValue
972 print >> outfile, outline
975 def calculatePoissonMean(dataList):
977 listSize = float(len(dataList))
979 poissonmean = sum(dataList) / listSize
980 except ZeroDivisionError:
983 print "Poisson n=%d, p=%f" % (listSize, poissonmean)
988 def calculatePValue(sum, poissonmean):
989 pValue = math.exp(-poissonmean)
990 for i in xrange(sum):
991 pValue *= poissonmean
997 def getRegionString(region, reportShift):
999 outline = region.printRegionWithShift()
1001 outline = region.printRegion()
1006 def getBestShiftInDict(shiftDict):
1007 return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0]
1010 if __name__ == "__main__":