2 usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
3 [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
4 [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
5 [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
6 [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
7 [--revbackground] [--pvalue self|back|none] [--nodirectionality]
8 [--strandfilter plus/minus] [--trimvalue percent] [--notrim]
9 [--cache pages] [--log altlogfile] [--flag aflag] [--append] [--RNA]
11 where values in brackets are optional and label is an arbitrary string.
13 Use -ratio (default 4 fold) to set the minimum fold enrichment
14 over the control, -minimum (default 4) is the minimum number of reads
15 (RPM) within the region, and -spacing (default readlen) to set the maximum
16 distance between reads in the region. -listPeak lists the peak of the
17 region. Peaks mut be higher than -minPeak (default 0.5 RPM).
18 Pvalues are calculated from the sample (change with -pvalue),
19 unless the -revbackground flag and a control RDS file are provided.
21 By default, all numbers and parameters are on a reads per
22 million (RPM) basis. -raw will treat all settings, ratios and reported
23 numbers as raw counts rather than RPM. Use -notrim to turn off region
24 trimming and -trimvalue to control trimming (default 10% of peak signal)
26 The peak finder uses minimal directionality information that can
27 be turned off with -nodirectionality ; the fraction of + strand reads
28 required to be to the left of the peak (default 0.3) can be set with
29 -leftPlus ; -minPlus and -maxPlus change the minimum/maximum fraction
30 of plus reads in a region, which (defaults 0.25 and 0.75, respectively).
32 Use -shift to shift reads either by half the expected
33 fragment length (default 0 bp) or '-shift learn ' to learn the shift
34 based on the first chromosome. If you prefer to learn the shift
35 manually, use -autoshift to calculate a per-region shift value, which
36 can be reported using -reportshift. -strandfilter should only be used
37 when explicitely calling unshifted stranded peaks from non-ChIP-seq
38 data such as directional RNA-seq. regionoutfile is written over by
39 default unless given the -append flag.
53 from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
58 versionString = "findall: version 3.2.1"
61 class RegionDirectionError(Exception):
66 def __init__(self, label, minRatio=4.0, minPeak=0.5, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, strandfilter="",
67 minHits=4.0, trimValue=0.1, doTrim=True, doDirectionality=True, shiftValue=0, maxSpacing=50, withFlag="",
68 normalize=True, listPeak=False, reportshift=False, stringency=1.0):
70 self.statistics = {"index": 0,
78 self.regionLabel = label
79 self.rnaSettings = False
80 self.controlRDSsize = 1
81 self.sampleRDSsize = 1
82 self.minRatio = minRatio
83 self.minPeak = minPeak
84 self.leftPlusRatio = leftPlusRatio
85 self.stranded = "both"
86 if strandfilter == "plus":
90 elif strandfilter == "minus":
95 if minRatio < minPeak:
96 self.minPeak = minRatio
98 self.minPlusRatio = minPlusRatio
99 self.maxPlusRatio = maxPlusRatio
100 self.strandfilter = strandfilter
101 self.minHits = minHits
102 self.trimValue = trimValue
104 self.doDirectionality = doDirectionality
107 self.trimString = string.join(["%2.1f" % (100. * self.trimValue), "%"], "")
109 self.trimString = "none"
111 self.shiftValue = shiftValue
112 self.maxSpacing = maxSpacing
113 self.withFlag = withFlag
114 self.normalize = normalize
115 self.listPeak = listPeak
116 self.reportshift = reportshift
117 self.stringency = max(stringency, 1.0)
120 def useRNASettings(self, readlen):
121 self.rnaSettings = True
124 self.doDirectionality = False
125 self.maxSpacing = readlen
128 def getHeader(self, doPvalue):
134 headerFields = ["#regionID\tchrom\tstart\tstop", countType, "fold\tmulti%"]
136 if self.doDirectionality:
137 headerFields.append("plus%\tleftPlus%")
140 headerFields.append("peakPos\tpeakHeight")
143 headerFields.append("readShift")
146 headerFields.append("pValue")
148 return string.join(headerFields, "\t")
151 def printSettings(self, doRevBackground, ptype, doControl, useMulti, doCache, pValueType):
153 self.printStatusMessages(doRevBackground, ptype, doControl, useMulti)
154 self.printOptionsSummary(useMulti, doCache, pValueType)
157 def printStatusMessages(self, doRevBackground, ptype, doControl, useMulti):
158 if self.shiftValue == "learn":
159 print "Will try to learn shift"
162 print "Normalizing to RPM"
165 print "Swapping IP and background to calculate FDR"
168 if ptype in ["NONE", "SELF"]:
170 elif ptype == "BACK":
171 if doControl and doRevBackground:
174 print "must have a control dataset and -revbackground for pValue type 'back'"
176 print "could not use pValue type : %s" % ptype
178 if self.withFlag != "":
179 print "restrict to flag = %s" % self.withFlag
182 print "using unique reads only"
185 print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
187 if self.strandfilter == "plus":
188 print "only analyzing reads on the plus strand"
189 elif self.strandfilter == "minus":
190 print "only analyzing reads on the minus strand"
193 def printOptionsSummary(self, useMulti, doCache, pValueType):
195 print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (self.doDirectionality, self.listPeak, not useMulti, doCache)
196 print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)
198 print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
200 print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
203 def getAnalysisDescription(self, hitfile, useMulti, doCache, pValueType, controlfile, doControl):
205 description = ["#ERANGE %s" % versionString]
207 description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, controlfile, self.controlRDSsize))
209 description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize))
211 if self.withFlag != "":
212 description.append("#restrict to Flag = %s" % self.withFlag)
214 description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s" % (self.doDirectionality, self.listPeak, not useMulti, doCache))
215 description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded))
217 description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
219 description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
221 return string.join(description, "\n")
224 def updateControlStatistics(self, peak, sumAll, peakScore):
226 plusRatio = float(peak.numPlus)/peak.numHits
227 if peakScore >= self.minPeak and self.minPlusRatio <= plusRatio <= self.maxPlusRatio:
228 if self.doDirectionality:
229 if self.leftPlusRatio < peak.numLeftPlus / peak.numPlus:
230 self.statistics["mIndex"] += 1
231 self.statistics["mTotal"] += sumAll
233 self.statistics["failed"] += 1
235 # we have a region, but didn't check for directionality
236 self.statistics["mIndex"] += 1
237 self.statistics["mTotal"] += sumAll
248 parser = makeParser()
249 (options, args) = parser.parse_args(argv[1:])
257 outfilename = args[2]
261 if options.autoshift:
264 if options.shift is not None:
266 shiftValue = int(options.shift)
268 if options.shift == "learn":
279 regionFinder = RegionFinder(factor, minRatio=options.minRatio, minPeak=options.minPeak, minPlusRatio=options.minPlusRatio,
280 maxPlusRatio=options.maxPlusRatio, leftPlusRatio=options.leftPlusRatio, strandfilter=options.strandfilter,
281 minHits=options.minHits, trimValue=options.trimValue, doTrim=options.doTrim,
282 doDirectionality=options.doDirectionality, shiftValue=shiftValue, maxSpacing=options.maxSpacing,
283 withFlag=options.withFlag, normalize=options.normalize, listPeak=options.listPeak,
284 reportshift=options.reportshift, stringency=options.stringency)
286 findall(regionFinder, hitfile, outfilename, options.logfilename, outputMode, options.rnaSettings,
287 options.cachePages, options.ptype, options.controlfile, options.doRevBackground,
288 options.useMulti, options.combine5p)
294 parser = optparse.OptionParser(usage=usage)
295 parser.add_option("--control", dest="controlfile")
296 parser.add_option("--minimum", type="float", dest="minHits")
297 parser.add_option("--ratio", type="float", dest="minRatio")
298 parser.add_option("--spacing", type="int", dest="maxSpacing")
299 parser.add_option("--listPeak", action="store_true", dest="listPeak")
300 parser.add_option("--shift", dest="shift")
301 parser.add_option("--learnFold", type="float", dest="stringency")
302 parser.add_option("--noshift", action="store_true", dest="noShift")
303 parser.add_option("--autoshift", action="store_true", dest="autoshift")
304 parser.add_option("--reportshift", action="store_true", dest="reportshift")
305 parser.add_option("--nomulti", action="store_false", dest="useMulti")
306 parser.add_option("--minPlus", type="float", dest="minPlusRatio")
307 parser.add_option("--maxPlus", type="float", dest="maxPlusRatio")
308 parser.add_option("--leftPlus", type="float", dest="leftPlusRatio")
309 parser.add_option("--minPeak", type="float", dest="minPeak")
310 parser.add_option("--raw", action="store_false", dest="normalize")
311 parser.add_option("--revbackground", action="store_true", dest="doRevBackground")
312 parser.add_option("--pvalue", dest="ptype")
313 parser.add_option("--nodirectionality", action="store_false", dest="doDirectionality")
314 parser.add_option("--strandfilter", dest="strandfilter")
315 parser.add_option("--trimvalue", type="float", dest="trimValue")
316 parser.add_option("--notrim", action="store_false", dest="doTrim")
317 parser.add_option("--cache", type="int", dest="cachePages")
318 parser.add_option("--log", dest="logfilename")
319 parser.add_option("--flag", dest="withFlag")
320 parser.add_option("--append", action="store_true", dest="doAppend")
321 parser.add_option("--RNA", action="store_true", dest="rnaSettings")
322 parser.add_option("--combine5p", action="store_true", dest="combine5p")
324 configParser = getConfigParser()
326 minHits = getConfigFloatOption(configParser, section, "minHits", 4.0)
327 minRatio = getConfigFloatOption(configParser, section, "minRatio", 4.0)
328 maxSpacing = getConfigIntOption(configParser, section, "maxSpacing", 50)
329 listPeak = getConfigBoolOption(configParser, section, "listPeak", False)
330 shift = getConfigOption(configParser, section, "shift", None)
331 stringency = getConfigFloatOption(configParser, section, "stringency", 4.0)
332 noshift = getConfigBoolOption(configParser, section, "noshift", False)
333 autoshift = getConfigBoolOption(configParser, section, "autoshift", False)
334 reportshift = getConfigBoolOption(configParser, section, "reportshift", False)
335 minPlusRatio = getConfigFloatOption(configParser, section, "minPlusRatio", 0.25)
336 maxPlusRatio = getConfigFloatOption(configParser, section, "maxPlusRatio", 0.75)
337 leftPlusRatio = getConfigFloatOption(configParser, section, "leftPlusRatio", 0.3)
338 minPeak = getConfigFloatOption(configParser, section, "minPeak", 0.5)
339 normalize = getConfigBoolOption(configParser, section, "normalize", True)
340 logfilename = getConfigOption(configParser, section, "logfilename", "findall.log")
341 withFlag = getConfigOption(configParser, section, "withFlag", "")
342 doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True)
343 trimValue = getConfigFloatOption(configParser, section, "trimValue", 0.1)
344 doTrim = getConfigBoolOption(configParser, section, "doTrim", True)
345 doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
346 rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False)
347 cachePages = getConfigOption(configParser, section, "cachePages", None)
348 ptype = getConfigOption(configParser, section, "ptype", "")
349 controlfile = getConfigOption(configParser, section, "controlfile", None)
350 doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False)
351 useMulti = getConfigBoolOption(configParser, section, "useMulti", True)
352 strandfilter = getConfigOption(configParser, section, "strandfilter", "")
353 combine5p = getConfigBoolOption(configParser, section, "combine5p", False)
355 parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift,
356 stringency=stringency, noshift=noshift, autoshift=autoshift, reportshift=reportshift,
357 minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak,
358 normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality,
359 trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings,
360 cachePages=cachePages, ptype=ptype, controlfile=controlfile, doRevBackground=doRevBackground, useMulti=useMulti,
361 strandfilter=strandfilter, combine5p=combine5p)
366 def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False, cachePages=None,
367 ptype="", controlfile=None, doRevBackground=False, useMulti=True, combine5p=False):
369 writeLog(logfilename, versionString, string.join(sys.argv[1:]))
370 doCache = cachePages is not None
372 doControl = controlfile is not None
375 controlRDS = openRDSFile(controlfile, cachePages=cachePages, doCache=doCache)
376 regionFinder.controlRDSsize = len(controlRDS) / 1000000.
379 hitRDS = openRDSFile(hitfile, cachePages=cachePages, doCache=doCache)
380 regionFinder.sampleRDSsize = len(hitRDS) / 1000000.
381 pValueType = getPValueType(ptype, doControl, doRevBackground)
382 doPvalue = not pValueType == "none"
383 regionFinder.readlen = hitRDS.getReadSize()
385 regionFinder.useRNASettings(regionFinder.readlen)
387 regionFinder.printSettings(doRevBackground, ptype, doControl, useMulti, doCache, pValueType)
388 outfile = open(outfilename, outputMode)
389 header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, doCache, pValueType, doPvalue, controlfile, doControl)
391 chromosomeList = getChromosomeListToProcess(hitRDS, controlRDS, doControl)
392 for chromosome in chromosomeList:
393 if regionFinder.shiftValue == "learn":
394 learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p)
396 allregions, outregions = findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p)
398 backregions = findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti)
399 writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType)
401 writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, shiftDict, allregions, header)
403 footer = getFooter(regionFinder, shiftDict, doRevBackground)
405 print >> outfile, footer
407 writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1])
410 def getPValueType(ptype, doControl, doRevBackground):
412 if ptype in ["NONE", "SELF", "BACK"]:
415 elif ptype == "SELF":
417 elif ptype == "BACK":
418 if doControl and doRevBackground:
420 elif doRevBackground:
426 def openRDSFile(filename, cachePages=None, doCache=False):
427 rds = ReadDataset.ReadDataset(filename, verbose=True, cache=doCache)
428 if cachePages > rds.getDefaultCacheSize():
429 rds.setDBcache(cachePages)
434 def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, doCache, pValueType, doPvalue, controlfile, doControl):
435 print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, doCache, pValueType, controlfile, doControl)
436 header = regionFinder.getHeader(doPvalue)
437 print >> outfile, header
442 def getChromosomeListToProcess(hitRDS, controlRDS=None, doControl=False):
443 hitChromList = hitRDS.getChromosomes()
445 controlChromList = controlRDS.getChromosomes()
446 chromosomeList = [chrom for chrom in hitChromList if chrom in controlChromList and chrom != "chrM"]
448 chromosomeList = [chrom for chrom in hitChromList if chrom != "chrM"]
450 return chromosomeList
453 def findPeakRegions(regionFinder, hitRDS, chromosome, logfilename, outfilename,
454 outfile, useMulti, doControl, controlRDS, combine5p):
458 print "chromosome %s" % (chromosome)
459 previousHit = - 1 * regionFinder.maxSpacing
460 readStartPositions = [-1]
466 hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=regionFinder.withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True,
467 strand=regionFinder.stranded, combine5p=combine5p)
469 maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
470 for read in hitDict[chromosome]:
472 if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
473 lastReadPos = readStartPositions[-1]
474 lastBasePosition = lastReadPos + regionFinder.readlen - 1
475 newRegionIndex = regionFinder.statistics["index"] + 1
476 if regionFinder.doDirectionality:
477 region = Region.DirectionalRegion(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=regionFinder.regionLabel,
478 numReads=totalWeight)
480 region = Region.Region(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=regionFinder.regionLabel, numReads=totalWeight)
482 if regionFinder.normalize:
483 region.numReads /= regionFinder.sampleRDSsize
485 allregions.append(int(region.numReads))
486 regionLength = lastReadPos - region.start
487 if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
488 region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, useMulti, doControl)
490 if region.foldRatio >= regionFinder.minRatio:
491 # first pass, with absolute numbers
492 peak = findPeak(reads, region.start, regionLength, regionFinder.readlen, doWeight=True, leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
493 if regionFinder.doTrim:
495 lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.sampleRDSsize)
500 region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, useMulti, doControl)
502 # just in case it changed, use latest data
504 bestPos = peak.topPos[0]
505 peakScore = peak.smoothArray[bestPos]
506 if regionFinder.normalize:
507 peakScore /= regionFinder.sampleRDSsize
511 if regionFinder.listPeak:
512 region.peakDescription= "%d\t%.1f" % (region.start + bestPos, peakScore)
515 setMultireadPercentage(region, hitRDS, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos,
516 regionFinder.normalize, regionFinder.doTrim)
518 region.shift = peak.shift
519 # check that we still pass threshold
520 regionLength = lastReadPos - region.start
521 plusRatio = float(peak.numPlus)/peak.numHits
522 if regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio):
524 updateRegion(region, regionFinder.doDirectionality, regionFinder.leftPlusRatio, peak.numLeftPlus, peak.numPlus, plusRatio)
525 regionFinder.statistics["index"] += 1
526 outregions.append(region)
527 regionFinder.statistics["total"] += region.numReads
528 except RegionDirectionError:
529 regionFinder.statistics["failed"] += 1
531 readStartPositions = []
538 regionFinder.statistics["badRegionTrim"] += 1
540 if pos not in readStartPositions:
543 readStartPositions.append(pos)
544 weight = read["weight"]
545 totalWeight += weight
549 reads.append({"start": pos, "sense": read["sense"], "weight": weight})
552 return allregions, outregions
555 def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, useMulti):
556 #TODO: this is *almost* the same calculation - there are small yet important differences
557 print "calculating background..."
558 previousHit = - 1 * regionFinder.maxSpacing
559 currentHitList = [-1]
560 currentTotalWeight = 0
565 hitDict = controlRDS.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, findallOptimize=True)
566 maxCoord = controlRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
567 for read in hitDict[chromosome]:
569 if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
570 lastReadPos = currentHitList[-1]
571 lastBasePosition = lastReadPos + regionFinder.readlen - 1
572 region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
573 if regionFinder.normalize:
574 region.numReads /= regionFinder.controlRDSsize
576 backregions.append(int(region.numReads))
577 region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
578 regionLength = lastReadPos - region.start
579 if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
580 numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
581 if regionFinder.normalize:
582 numMock /= regionFinder.sampleRDSsize
584 foldRatio = region.numReads / numMock
585 if foldRatio >= regionFinder.minRatio:
586 # first pass, with absolute numbers
587 peak = findPeak(currentReadList, region.start, lastReadPos - region.start, regionFinder.readlen, doWeight=True,
588 leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
590 if regionFinder.doTrim:
592 lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, 20., currentReadList, regionFinder.controlRDSsize)
597 numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
598 if regionFinder.normalize:
599 numMock /= regionFinder.sampleRDSsize
601 foldRatio = region.numReads / numMock
603 # just in case it changed, use latest data
605 bestPos = peak.topPos[0]
606 peakScore = peak.smoothArray[bestPos]
611 if regionFinder.normalize:
612 peakScore /= regionFinder.controlRDSsize
614 # check that we still pass threshold
615 regionLength = lastReadPos - region.start
616 if regionPassesCriteria(regionFinder, region.numReads, foldRatio, regionLength):
617 regionFinder.updateControlStatistics(peak, region.numReads, peakScore)
620 currentTotalWeight = 0
625 regionFinder.statistics["badRegionTrim"] += 1
627 if pos not in currentHitList:
630 currentHitList.append(pos)
631 weight = read["weight"]
632 currentTotalWeight += weight
633 currentReadList.append({"start": pos, "sense": read["sense"], "weight": weight})
639 def learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename,
640 outfile, useMulti, doControl, controlRDS, combine5p):
642 hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=regionFinder.withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True,
643 strand=regionFinder.stranded, combine5p=combine5p)
645 maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
646 print "learning shift.... will need at least 30 training sites"
647 stringency = regionFinder.stringency
648 previousHit = -1 * regionFinder.maxSpacing
655 for read in hitDict[chromosome]:
657 if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
658 if regionFinder.normalize:
659 totalWeight /= regionFinder.sampleRDSsize
661 regionStart = positionList[0]
662 regionStop = positionList[-1]
663 regionLength = regionStop - regionStart
664 if regionPassesCriteria(regionFinder, totalWeight, numStarts, regionLength, stringency=stringency):
665 foldRatio = getFoldRatio(regionFinder, controlRDS, totalWeight, chromosome, regionStart, regionStop, useMulti, doControl)
666 if foldRatio >= regionFinder.minRatio:
667 updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
674 if pos not in positionList:
677 positionList.append(pos)
678 weight = read["weight"]
679 totalWeight += weight
680 readList.append({"start": pos, "sense": read["sense"], "weight": weight})
683 outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
684 stringency * regionFinder.minHits,
685 stringency * regionFinder.minRatio,
686 stringency * regionFinder.readlen,
690 writeLog(logfilename, versionString, outfilename + outline)
691 regionFinder.shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
692 outline = "#picked shiftValue to be %d" % regionFinder.shiftValue
694 print >> outfile, outline
695 writeLog(logfilename, versionString, outfilename + outline)
698 def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
699 return abs(pos - previousHit) > maxSpacing or pos == maxCoord
702 def regionPassesCriteria(regionFinder, sumAll, numStarts, regionLength, stringency=1):
703 minTotalReads = stringency * regionFinder.minHits
704 minNumReadStarts = stringency * regionFinder.minRatio
705 minRegionLength = stringency * regionFinder.readlen
707 return sumAll >= minTotalReads and numStarts > minNumReadStarts and regionLength > minRegionLength
710 def trimRegion(region, regionFinder, peak, regionStop, trimValue, currentReadList, totalReadCount):
711 bestPos = peak.topPos[0]
712 peakScore = peak.smoothArray[bestPos]
713 if regionFinder.normalize:
714 peakScore /= totalReadCount
716 minSignalThresh = trimValue * peakScore
717 start = findStartEdgePosition(peak, minSignalThresh)
718 regionEndPoint = regionStop - region.start - 1
719 stop = findStopEdgePosition(peak, regionEndPoint, minSignalThresh)
721 regionStop = region.start + stop
722 region.start += start
724 trimmedPeak = findPeak(currentReadList, region.start, regionStop - region.start, regionFinder.readlen, doWeight=True,
725 leftPlus=regionFinder.doDirectionality, shift=peak.shift)
727 peak.numPlus = trimmedPeak.numPlus
728 peak.numLeftPlus = trimmedPeak.numLeftPlus
729 peak.topPos = trimmedPeak.topPos
730 peak.smoothArray = trimmedPeak.smoothArray
732 region.numReads = trimmedPeak.numHits
733 if regionFinder.normalize:
734 region.numReads /= totalReadCount
736 region.stop = regionStop + regionFinder.readlen - 1
741 def findStartEdgePosition(peak, minSignalThresh):
743 while not peakEdgeLocated(peak, start, minSignalThresh):
749 def findStopEdgePosition(peak, stop, minSignalThresh):
750 while not peakEdgeLocated(peak, stop, minSignalThresh):
756 def peakEdgeLocated(peak, position, minSignalThresh):
757 return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
760 def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regionStop, useMulti, doControl):
761 """ Fold ratio calculated is total read weight over control
763 #TODO: this needs to be generalized as there is a point at which we want to use the sampleRDS instead of controlRDS
765 numMock = 1. + controlRDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
766 if regionFinder.normalize:
767 numMock /= regionFinder.controlRDSsize
769 foldRatio = sumAll / numMock
771 foldRatio = regionFinder.minRatio
776 def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen):
777 peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto")
779 shiftDict[peak.shift] += 1
781 shiftDict[peak.shift] = 1
784 def getShiftValue(shiftDict, count, logfilename, outfilename):
786 outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
788 writeLog(logfilename, versionString, outfilename + outline)
791 shiftValue = getBestShiftInDict(shiftDict)
797 def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRatio, multiP,
798 peakDescription, shift, doDirectionality, leftPlusRatio, numLeft,
802 if leftPlusRatio < numLeft / numPlus:
803 plusP = plusRatio * 100.
804 leftP = 100. * numLeft / numPlus
805 # we have a region that passes all criteria
806 region = Region.DirectionalRegion(regionStart, regionStop,
807 factor, index, chromosome, sumAll,
808 foldRatio, multiP, plusP, leftP,
809 peakDescription, shift)
812 raise RegionDirectionError
814 # we have a region, but didn't check for directionality
815 region = Region.Region(regionStart, regionStop, factor, index, chromosome,
816 sumAll, foldRatio, multiP, peakDescription, shift)
821 def setMultireadPercentage(region, hitRDS, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
823 sumMulti = hitRDS.getMultiCount(chromosome, region.start, lastReadPos)
825 sumMulti = currentTotalWeight - currentUniqueCount
829 sumMulti /= hitRDSsize
832 multiP = 100. * (sumMulti / region.numReads)
833 except ZeroDivisionError:
836 region.multiP = min(multiP, 100.)
839 def regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio):
841 if regionPassesCriteria(regionFinder, region.numReads, region.foldRatio, regionLength):
842 if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio:
848 def updateRegion(region, doDirectionality, leftPlusRatio, numLeft, numPlus, plusRatio):
851 if leftPlusRatio < numLeft / numPlus:
852 region.plusP = plusRatio * 100.
853 region.leftP = 100. * numLeft / numPlus
855 raise RegionDirectionError
858 def writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
861 writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
862 allregions, header, backregions=[], pValueType="self")
865 def writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, shiftDict,
866 allregions, header, backregions=[], pValueType="none"):
868 print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"]
870 if pValueType == "self":
871 poissonmean = calculatePoissonMean(allregions)
873 poissonmean = calculatePoissonMean(backregions)
876 writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=regionFinder.shiftValue, reportshift=regionFinder.reportshift, shiftDict=shiftDict)
879 def calculatePoissonMean(dataList):
881 listSize = float(len(dataList))
883 poissonmean = sum(dataList) / listSize
884 except ZeroDivisionError:
887 print "Poisson n=%d, p=%f" % (listSize, poissonmean)
892 def writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=0, reportshift=False, shiftDict={}):
893 for region in outregions:
894 if shiftValue == "auto" and reportshift:
896 shiftDict[region.shift] += 1
898 shiftDict[region.shift] = 1
900 outline = getRegionString(region, reportshift)
902 # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
904 sumAll = int(region.numReads)
905 pValue = calculatePValue(sumAll, poissonmean)
906 outline += "\t%1.2g" % pValue
909 print >> outfile, outline
912 def calculatePValue(sum, poissonmean):
913 pValue = math.exp(-poissonmean)
914 #TODO: 798: DeprecationWarning: integer argument expected, got float - for i in xrange(sum)
915 for i in xrange(sum):
916 pValue *= poissonmean
922 def getRegionString(region, reportShift):
924 outline = region.printRegionWithShift()
926 outline = region.printRegion()
931 def getFooter(regionFinder, shiftDict, doRevBackground):
932 index = regionFinder.statistics["index"]
933 mIndex = regionFinder.statistics["mIndex"]
934 footerLines = ["#stats:\t%.1f RPM in %d regions" % (regionFinder.statistics["total"], index)]
935 if regionFinder.doDirectionality:
936 footerLines.append("#\t\t%d additional regions failed directionality filter" % regionFinder.statistics["failed"])
940 percent = min(100. * (float(mIndex)/index), 100.)
941 except ZeroDivisionError:
944 footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, regionFinder.statistics["mTotal"], percent))
946 if regionFinder.shiftValue == "auto" and regionFinder.reportshift:
947 bestShift = getBestShiftInDict(shiftDict)
948 footerLines.append("#mode of shift values: %d" % bestShift)
950 if regionFinder.statistics["badRegionTrim"] > 0:
951 footerLines.append("#%d regions discarded due to trimming problems" % regionFinder.statistics["badRegionTrim"])
953 return string.join(footerLines, "\n")
956 def getBestShiftInDict(shiftDict):
957 return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0]
960 if __name__ == "__main__":