2 usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
3 [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
4 [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
5 [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
6 [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
7 [--revbackground] [--pvalue self|back|none] [--nodirectionality]
8 [--strandfilter plus/minus] [--trimvalue percent] [--notrim]
9 [--cache pages] [--log altlogfile] [--flag aflag] [--append] [--RNA]
11 where values in brackets are optional and label is an arbitrary string.
13 Use -ratio (default 4 fold) to set the minimum fold enrichment
14 over the control, -minimum (default 4) is the minimum number of reads
15 (RPM) within the region, and -spacing (default readlen) to set the maximum
16 distance between reads in the region. -listPeak lists the peak of the
17 region. Peaks mut be higher than -minPeak (default 0.5 RPM).
18 Pvalues are calculated from the sample (change with -pvalue),
19 unless the -revbackground flag and a control RDS file are provided.
21 By default, all numbers and parameters are on a reads per
22 million (RPM) basis. -raw will treat all settings, ratios and reported
23 numbers as raw counts rather than RPM. Use -notrim to turn off region
24 trimming and -trimvalue to control trimming (default 10% of peak signal)
26 The peak finder uses minimal directionality information that can
27 be turned off with -nodirectionality ; the fraction of + strand reads
28 required to be to the left of the peak (default 0.3) can be set with
29 -leftPlus ; -minPlus and -maxPlus change the minimum/maximum fraction
30 of plus reads in a region, which (defaults 0.25 and 0.75, respectively).
32 Use -shift to shift reads either by half the expected
33 fragment length (default 0 bp) or '-shift learn ' to learn the shift
34 based on the first chromosome. If you prefer to learn the shift
35 manually, use -autoshift to calculate a per-region shift value, which
36 can be reported using -reportshift. -strandfilter should only be used
37 when explicitely calling unshifted stranded peaks from non-ChIP-seq
38 data such as directional RNA-seq. regionoutfile is written over by
39 default unless given the -append flag.
52 from commoncode import readDataset, writeLog, findPeak, getBestShiftForRegion
55 versionString = "%s: version 3.2" % sys.argv[0]
68 parser = optparse.OptionParser(usage=usage)
69 parser.add_option("--control", dest="mockfile")
70 parser.add_option("--minimum", type="float", dest="minHits")
71 parser.add_option("--ratio", type="float", dest="minRatio")
72 parser.add_option("--spacing", type="int", dest="maxSpacing")
73 parser.add_option("--listPeak", action="store_true", dest="listPeak")
74 parser.add_option("--shift", dest="shift")
75 parser.add_option("--learnFold", type="float", dest="stringency")
76 parser.add_option("--noshift", action="store_true", dest="noShift")
77 parser.add_option("--autoshift", action="store_true", dest="autoshift")
78 parser.add_option("--reportshift", action="store_true", dest="reportshift")
79 parser.add_option("--nomulti", action="store_true", dest="noMulti")
80 parser.add_option("--minPlus", type="float", dest="minPlusRatio")
81 parser.add_option("--maxPlus", type="float", dest="maxPlusRatio")
82 parser.add_option("--leftPlus", type="float", dest="leftPlusRatio")
83 parser.add_option("--minPeak", type="float", dest="minPeak")
84 parser.add_option("--raw", action="store_false", dest="normalize")
85 parser.add_option("--revbackground", action="store_true", dest="doRevBackground")
86 parser.add_option("--pvalue", dest="ptype")
87 parser.add_option("--nodirectionality", action="store_false", dest="doDirectionality")
88 parser.add_option("--strandfilter", dest="strandfilter")
89 parser.add_option("--trimvalue", type="float", dest="trimValue")
90 parser.add_option("--notrim", action="store_false", dest="doTrim")
91 parser.add_option("--cache", type="int", dest="cachePages")
92 parser.add_option("--log", dest="logfilename")
93 parser.add_option("--flag", dest="withFlag")
94 parser.add_option("--append", action="store_true", dest="doAppend")
95 parser.add_option("--RNA", action="store_true", dest="rnaSettings")
96 parser.add_option("--combine5p", action="store_true", dest="combine5p")
97 parser.set_defaults(minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
98 stringency=4.0, noshift=False, autoshift=False, reportshift=False,
99 minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
100 normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
101 trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
102 cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
103 strandfilter=None, combine5p=False)
105 (options, args) = parser.parse_args(argv[1:])
113 outfilename = args[2]
115 findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
116 options.stringency, options.noshift, options.autoshift, options.reportshift,
117 options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
118 options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
119 options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
120 options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
121 options.strandfilter, options.combine5p)
124 def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
125 stringency=4.0, noshift=False, autoshift=False, reportshift=False,
126 minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
127 normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
128 trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
129 cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
130 strandfilter=None, combine5p=False):
136 if shift is not None:
138 shiftValue = int(shift)
142 print "Will try to learn shift"
147 if trimValue is not None:
148 trimValue = float(trimValue) / 100.
149 trimString = "%2.1f%s" % ((100. * trimValue), "%")
158 print "Swapping IP and background to calculate FDR"
162 if mockfile is not None:
166 if ptype is not None:
167 ptype = ptype.upper()
173 elif ptype == "SELF":
175 elif ptype == "BACK":
176 if doControl and doRevBackground:
179 print "must have a control dataset and -revbackground for pValue type 'back'"
181 print "could not use pValue type : %s" % ptype
185 if cachePages is not None:
192 print "restrict to flag = %s" % withFlag
196 print "using unique reads only"
200 print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
203 doDirectionality = False
206 if strandfilter is not None:
207 if strandfilter == "plus":
211 print "only analyzing reads on the plus strand"
212 elif strandfilter == "minus":
216 print "only analyzing reads on the minus strand"
218 stringency = max(stringency, 1.0)
219 writeLog(logfilename, versionString, string.join(sys.argv[1:]))
222 mockRDS = readDataset(mockfile, verbose=True, cache=doCache)
224 if cachePages > mockRDS.getDefaultCacheSize():
225 mockRDS.setDBcache(cachePages)
228 hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
229 readlen = hitRDS.getReadSize()
233 print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
234 print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
235 print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
237 if cachePages > hitRDS.getDefaultCacheSize():
238 hitRDS.setDBcache(cachePages)
240 hitRDSsize = len(hitRDS) / 1000000.
242 mockRDSsize = len(mockRDS) / 1000000.
246 mockSampleSize = mockRDSsize
248 hitSampleSize = hitRDSsize
251 outfile = open(outfilename, "a")
253 outfile = open(outfilename, "w")
255 outfile.write("#ERANGE %s\n" % versionString)
257 outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)\n" % (hitfile, hitRDSsize, mockfile, mockRDSsize))
259 outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample: none\n" % (hitfile, hitRDSsize))
262 outfile.write("#restrict to Flag = %s\n" % withFlag)
264 outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
265 outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
266 outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
268 print "Normalizing to RPM"
273 headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"]
275 headerList.append("plus%\tleftPlus%")
278 headerList.append("peakPos\tpeakHeight")
281 headerList.append("readShift")
284 headerList.append("pValue")
286 headline = string.join(headerList, "\t")
287 print >> outfile, headline
289 statistics = {"index": 0,
296 if minRatio < minPeak:
299 hitChromList = hitRDS.getChromosomes()
301 mockChromList = mockRDS.getChromosomes()
305 for chromosome in hitChromList:
306 if doNotProcessChromosome(chromosome, doControl, mockChromList):
309 print "chromosome %s" % (chromosome)
310 hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True, strand=stranded, combine5p=combine5p)
311 maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
312 if shiftValue == "learn":
313 shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
314 stringency, readlen, minHits, logfilename, outfile, outfilename)
316 regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize, chromosome, useMulti,
317 normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
318 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
319 noMulti, doControl, factor, trimValue, outputRegionList=True)
321 statistics["index"] += regionStats["index"]
322 statistics["total"] += regionStats["total"]
323 statistics["failed"] += regionStats["failed"]
324 if not doRevBackground:
326 p, poissonmean = calculatePValue(allRegionWeights)
329 shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
332 #now do background swapping the two samples around
333 print "calculating background..."
334 backgroundTrimValue = 1/20.
335 backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, chromosome, useMulti,
336 normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
337 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
338 noMulti, doControl, factor, backgroundTrimValue)
340 statistics["mIndex"] += backgroundRegionStats["index"]
341 statistics["mTotal"] += backgroundRegionStats["total"]
342 statistics["failed"] += backgroundRegionStats["failed"]
343 print statistics["mIndex"], statistics["mTotal"]
345 if pValueType == "self":
346 p, poissonmean = calculatePValue(allRegionWeights)
348 p, poissonmean = calculatePValue(backgroundRegionWeights)
351 shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
353 footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
355 outfile.write(footer)
358 writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
361 def doNotProcessChromosome(chromosome, doControl, mockChromList):
362 skipChromosome = False
363 if chromosome == "chrM":
364 skipChromosome = True
366 if doControl and (chromosome not in mockChromList):
367 skipChromosome = True
369 return skipChromosome
372 def calculatePValue(dataList):
374 listSize = float(len(dataList))
376 poissonmean = sum(dataList) / listSize
377 except ZeroDivisionError:
380 print "Poisson n=%d, p=%f" % (listSize, poissonmean)
381 p = math.exp(-poissonmean)
383 return p, poissonmean
386 def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
387 stringency, readlen, minHits, logfilename, outfile, outfilename):
389 print "learning shift.... will need at least 30 training sites"
390 previousHit = -1 * maxSpacing
397 for (pos, sense, weight) in hitDict[chrom]:
398 if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
399 sumAll = sum(weightList)
401 sumAll /= hitSampleSize
403 regionStart = hitList[0]
404 regionStop = hitList[-1]
405 regionLength = regionStop - regionStart
406 # we're going to require stringent settings
407 if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen:
408 foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
410 if foldRatio >= minRatio:
411 localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True)
413 shiftDict[localshift] += 1
415 shiftDict[localshift] = 1
424 if pos not in hitList:
428 weightList.append(weight)
429 readList.append((pos, sense, weight))
434 outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency, stringency * minHits, stringency * minRatio, stringency * readlen, count)
436 writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
438 outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
440 writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
443 for shift in sorted(shiftDict):
444 if shiftDict[shift] > bestCount:
446 bestCount = shiftDict[shift]
448 shiftValue = bestShift
451 outline = "#picked shiftValue to be %d" % shiftValue
453 print >> outfile, outline
454 writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
459 def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
461 foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
468 def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll):
469 numMock = 1. + rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
471 numMock /= sampleSize
473 foldRatio = sumAll / numMock
478 def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
479 normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
480 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
481 noMulti, doControl, factor, trimValue, outputRegionList=False):
486 previousHit = - 1 * maxSpacing
487 currentHitList = [-1]
488 currentWeightList = [0]
493 hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
494 maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
495 for (pos, sense, weight) in hitDict[chrom]:
496 if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
497 sumAll = sum(currentWeightList)
499 sumAll /= rdsSampleSize
501 regionStart = currentHitList[0]
502 regionStop = currentHitList[-1]
503 regionWeights.append(int(sumAll))
504 if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
506 #first pass uses getFoldRatio on mockRDS as there may not be control
507 foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
508 if foldRatio >= minRatio:
509 # first pass, with absolute numbers
511 (topPos, numHits, smoothArray, numPlus, numLeft, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue, returnShift=True)
513 (topPos, numHits, smoothArray, numPlus, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shiftValue, returnShift=True)
516 peakScore = smoothArray[bestPos]
518 peakScore /= rdsSampleSize
521 minSignalThresh = trimValue * peakScore
523 stop = regionStop - regionStart - 1
525 while not startFound:
526 if smoothArray[start] >= minSignalThresh or start == bestPos:
533 if smoothArray[stop] >= minSignalThresh or stop == bestPos:
538 regionStop = regionStart + stop
542 (topPos, sumAll, smoothArray, numPlus, numLeft) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
544 (topPos, sumAll, smoothArray, numPlus) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shift)
549 sumAll /= rdsSampleSize
551 foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio)
553 sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
554 # just in case it changed, use latest data
557 peakScore = smoothArray[bestPos]
563 peakScore /= rdsSampleSize
565 elif outputRegionList:
566 sumMulti = sum(currentWeightList) - currentWeightList.count(1.0)
571 sumMulti /= rdsSampleSize
574 multiP = 100. * (sumMulti / sumAll)
581 # check that we still pass threshold
582 if sumAll >= minHits and foldRatio >= minRatio and (regionStop - regionStart) > readlen:
583 plusRatio = float(numPlus)/numHits
584 if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
588 peak = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
591 if leftPlusRatio < numLeft / numPlus:
594 plusP = plusRatio * 100.
595 leftP = 100. * numLeft / numPlus
596 # we have a region that passes all criteria
597 outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, plusP, leftP, peak, shift))
603 # we have a region, but didn't check for directionality
607 outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, peak, shift))
610 currentWeightList = []
614 if pos not in currentHitList:
617 currentHitList.append(pos)
618 currentWeightList.append(weight)
619 currentReadList.append((pos, sense, weight))
622 statistics = {"index": index,
624 "failed": failedCounter
628 return statistics, regionWeights, outregions
630 return statistics, regionWeights
633 def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue):
636 for region in outregions:
637 # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
639 sumAll = int(region[5])
640 for i in xrange(sumAll):
641 pValue *= poissonmean
644 if shiftValue == "auto" and reportshift:
646 shiftDict[region[-1]] += 1
648 shiftDict[region[-1]] = 1
652 outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s\t%d" % region]
654 outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
657 outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s\t%d" % region]
659 outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
662 outputList.append("%1.2g" % pValue)
664 outline = string.join(outputList, "\t")
666 print >> outfile, outline
668 if shiftValue == "auto" and reportshift:
670 for shift in sorted(shiftDict):
671 if shiftDict[shift] > bestCount:
673 bestCount = shiftDict[shift]
678 def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue):
679 footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])]
681 footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"])
685 percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100)
686 except (ValueError, ZeroDivisionError):
689 footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent))
691 if shiftValue == "auto" and reportshift:
692 footerList.append("#mode of shift values: %d" % shiftModeValue)
694 footer = string.join(footerList, "\n")
698 if __name__ == "__main__":