erange version 4.0a dev release
[erange.git] / findall.py
1 """
2     usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
3            [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
4            [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
5            [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
6            [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
7            [--revbackground] [--pvalue self|back|none] [--nodirectionality]
8            [--strandfilter plus/minus] [--trimvalue percent] [--notrim]
9            [--cache pages] [--log altlogfile] [--flag aflag] [--append] [--RNA]
10
11            where values in brackets are optional and label is an arbitrary string.
12
13            Use -ratio (default 4 fold) to set the minimum fold enrichment
14            over the control, -minimum (default 4) is the minimum number of reads
15            (RPM) within the region, and -spacing (default readlen) to set the maximum
16            distance between reads in the region. -listPeak lists the peak of the
17            region. Peaks mut be higher than -minPeak (default 0.5 RPM).
18            Pvalues are calculated from the sample (change with -pvalue),
19            unless the -revbackground flag and a control RDS file are provided.
20
21            By default, all numbers and parameters are on a reads per
22            million (RPM) basis. -raw will treat all settings, ratios and reported
23            numbers as raw counts rather than RPM. Use -notrim to turn off region
24            trimming and -trimvalue to control trimming (default 10% of peak signal)
25
26            The peak finder uses minimal directionality information that can
27            be turned off with -nodirectionality ; the fraction of + strand reads
28            required to be to the left of the peak (default 0.3) can be set with
29            -leftPlus ; -minPlus and -maxPlus change the minimum/maximum fraction
30            of plus reads in a region, which (defaults 0.25 and 0.75, respectively).
31
32            Use -shift to shift reads either by half the expected
33            fragment length (default 0 bp) or '-shift learn ' to learn the shift
34            based on the first chromosome. If you prefer to learn the shift
35            manually, use -autoshift to calculate a per-region shift value, which
36            can be reported using -reportshift. -strandfilter should only be used
37            when explicitely calling unshifted stranded peaks from non-ChIP-seq
38            data such as directional RNA-seq. regionoutfile is written over by
39            default unless given the -append flag.
40 """
41
42 try:
43     import psyco
44     psyco.full()
45 except:
46     pass
47
48 import sys
49 import math
50 import string
51 import optparse
52 from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
53 import ReadDataset
54 import Region
55
56
57 versionString = "findall: version 3.2"
58 print versionString
59
60 def usage():
61     print __doc__
62
63
64 def main(argv=None):
65     if not argv:
66         argv = sys.argv
67
68     parser = makeParser()
69     (options, args) = parser.parse_args(argv[1:])
70
71     if len(args) < 3:
72         usage()
73         sys.exit(2)
74
75     factor = args[0]
76     hitfile = args[1]
77     outfilename = args[2]
78
79     findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
80             options.stringency, options.noshift, options.autoshift, options.reportshift,
81             options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
82             options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
83             options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
84             options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
85             options.strandfilter, options.combine5p)
86
87
88 def makeParser():
89     usage = __doc__
90
91     parser = optparse.OptionParser(usage=usage)
92     parser.add_option("--control", dest="mockfile")
93     parser.add_option("--minimum", type="float", dest="minHits")
94     parser.add_option("--ratio", type="float", dest="minRatio")
95     parser.add_option("--spacing", type="int", dest="maxSpacing")
96     parser.add_option("--listPeak", action="store_true", dest="listPeak")
97     parser.add_option("--shift", dest="shift")
98     parser.add_option("--learnFold", type="float", dest="stringency")
99     parser.add_option("--noshift", action="store_true", dest="noShift")
100     parser.add_option("--autoshift", action="store_true", dest="autoshift")
101     parser.add_option("--reportshift", action="store_true", dest="reportshift")
102     parser.add_option("--nomulti", action="store_true", dest="noMulti")
103     parser.add_option("--minPlus", type="float", dest="minPlusRatio")
104     parser.add_option("--maxPlus", type="float", dest="maxPlusRatio")
105     parser.add_option("--leftPlus", type="float", dest="leftPlusRatio")
106     parser.add_option("--minPeak", type="float", dest="minPeak")
107     parser.add_option("--raw", action="store_false", dest="normalize")
108     parser.add_option("--revbackground", action="store_true", dest="doRevBackground")
109     parser.add_option("--pvalue", dest="ptype")
110     parser.add_option("--nodirectionality", action="store_false", dest="doDirectionality")
111     parser.add_option("--strandfilter", dest="strandfilter")
112     parser.add_option("--trimvalue", type="float", dest="trimValue")
113     parser.add_option("--notrim", action="store_false", dest="doTrim")
114     parser.add_option("--cache", type="int", dest="cachePages")
115     parser.add_option("--log", dest="logfilename")
116     parser.add_option("--flag", dest="withFlag")
117     parser.add_option("--append", action="store_true", dest="doAppend")
118     parser.add_option("--RNA", action="store_true", dest="rnaSettings")
119     parser.add_option("--combine5p", action="store_true", dest="combine5p")
120
121     configParser = getConfigParser()
122     section = "findall"
123     minHits = getConfigFloatOption(configParser, section, "minHits", 4.0)
124     minRatio = getConfigFloatOption(configParser, section, "minRatio", 4.0)
125     maxSpacing = getConfigIntOption(configParser, section, "maxSpacing", 50)
126     listPeak = getConfigBoolOption(configParser, section, "listPeak", False)
127     shift = getConfigOption(configParser, section, "shift", None)
128     stringency = getConfigFloatOption(configParser, section, "stringency", 4.0)
129     noshift = getConfigBoolOption(configParser, section, "noshift", False)
130     autoshift = getConfigBoolOption(configParser, section, "autoshift", False)
131     reportshift = getConfigBoolOption(configParser, section, "reportshift", False)
132     minPlusRatio = getConfigFloatOption(configParser, section, "minPlusRatio", 0.25)
133     maxPlusRatio = getConfigFloatOption(configParser, section, "maxPlusRatio", 0.75)
134     leftPlusRatio = getConfigFloatOption(configParser, section, "leftPlusRatio", 0.3)
135     minPeak = getConfigFloatOption(configParser, section, "minPeak", 0.5)
136     normalize = getConfigBoolOption(configParser, section, "normalize", True)
137     logfilename = getConfigOption(configParser, section, "logfilename", "findall.log")
138     withFlag = getConfigOption(configParser, section, "withFlag", "")
139     doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True)
140     trimValue = getConfigOption(configParser, section, "trimValue", None)
141     doTrim = getConfigBoolOption(configParser, section, "doTrim", True)
142     doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
143     rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False)
144     cachePages = getConfigOption(configParser, section, "cachePages", None)
145     ptype = getConfigOption(configParser, section, "ptype", None)
146     mockfile = getConfigOption(configParser, section, "mockfile", None)
147     doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False)
148     noMulti = getConfigBoolOption(configParser, section, "noMulti", False)
149     strandfilter = getConfigOption(configParser, section, "strandfilter", None)
150     combine5p = getConfigBoolOption(configParser, section, "combine5p", False)
151
152     parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift,
153                         stringency=stringency, noshift=noshift, autoshift=autoshift, reportshift=reportshift,
154                         minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak,
155                         normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality,
156                         trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings,
157                         cachePages=cachePages, ptype=ptype, mockfile=mockfile, doRevBackground=doRevBackground, noMulti=noMulti,
158                         strandfilter=strandfilter, combine5p=combine5p)
159
160     return parser
161
162
163 def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
164             stringency=4.0, noshift=False, autoshift=False, reportshift=False,
165             minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
166             normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
167             trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
168             cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
169             strandfilter=None, combine5p=False):
170
171     shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
172
173     if trimValue is not None:
174         trimValue = float(trimValue) / 100.
175         trimString = "%2.1f%s" % ((100. * trimValue), "%")
176     else:
177         trimValue = 0.1
178         trimString = "10%"
179
180     if not doTrim:
181         trimString = "none"
182
183     if doRevBackground:
184         print "Swapping IP and background to calculate FDR"
185         pValueType = "back"
186
187     doControl = False
188     if mockfile is not None:
189         doControl = True
190
191     doPvalue = True
192     if ptype is not None:
193         ptype = ptype.upper()
194         if ptype == "NONE":
195             doPvalue = False
196             pValueType = "none"
197             p = 1
198             poissonmean = 0
199         elif ptype == "SELF":
200             pValueType = "self"
201         elif ptype == "BACK":
202             if doControl and doRevBackground:
203                 pValueType = "back"
204             else:
205                 print "must have a control dataset and -revbackground for pValue type 'back'"
206         else:
207             print "could not use pValue type : %s" % ptype
208     else:
209         pValueType = "self"
210
211     if cachePages is not None:
212         doCache = True
213     else:
214         doCache = False
215         cachePages = -1
216
217     if withFlag != "":
218         print "restrict to flag = %s" % withFlag
219
220     useMulti = True
221     if noMulti:
222         print "using unique reads only"
223         useMulti = False
224
225     if rnaSettings:
226         print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
227         doTrim = False
228         doDirectionality = False
229
230     stranded = ""
231     if strandfilter is not None:
232         if strandfilter == "plus":
233             stranded = "+"
234             minPlusRatio = 0.9
235             maxPlusRatio = 1.0
236             print "only analyzing reads on the plus strand"
237         elif strandfilter == "minus":
238             stranded = "-"
239             minPlusRatio = 0.0
240             maxPlusRatio = 0.1
241             print "only analyzing reads on the minus strand"
242
243     stringency = max(stringency, 1.0)
244     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
245     if doControl:
246         print "\ncontrol:" 
247         mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
248
249         if cachePages > mockRDS.getDefaultCacheSize():
250             mockRDS.setDBcache(cachePages)
251
252     print "\nsample:" 
253     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
254     readlen = hitRDS.getReadSize()
255     if rnaSettings:
256         maxSpacing = readlen
257
258     if cachePages > hitRDS.getDefaultCacheSize():
259         hitRDS.setDBcache(cachePages)
260
261     if doAppend:
262         fileMode = "a"
263     else:
264         fileMode = "w"
265
266     outfile = open(outfilename, fileMode)
267
268     outfile.write("#ERANGE %s\n" % versionString)
269     if doControl:
270         mockRDSsize = len(mockRDS) / 1000000.
271         controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
272     else:
273         controlSampleString = " none"
274
275     hitRDSsize = len(hitRDS) / 1000000.
276     outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:%s\n" % (hitfile, hitRDSsize, controlSampleString))
277
278     if withFlag != "":
279         outfile.write("#restrict to Flag = %s\n" % withFlag)
280
281     print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
282     print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
283     print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
284
285     outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
286     outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
287     outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
288     if normalize:
289         print "Normalizing to RPM"
290         countLabel = "RPM"
291     else:
292         countLabel = "COUNT"
293
294     headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"]
295     if doDirectionality:
296         headerList.append("plus%\tleftPlus%")
297
298     if listPeak:
299         headerList.append("peakPos\tpeakHeight")
300
301     if reportshift:
302         headerList.append("readShift")
303
304     if doPvalue:
305         headerList.append("pValue")
306
307     headline = string.join(headerList, "\t")
308     print >> outfile, headline
309
310     statistics = {"index": 0,
311                   "total": 0,
312                   "mIndex": 0,
313                   "mTotal": 0,
314                   "failed": 0
315     }
316
317     if minRatio < minPeak:
318         minPeak = minRatio
319
320     hitChromList = hitRDS.getChromosomes()
321     if doControl:
322         mockChromList = mockRDS.getChromosomes()
323
324     if normalize:
325         if doControl:
326             mockSampleSize = mockRDSsize
327
328         hitSampleSize = hitRDSsize
329
330     hitChromList.sort()
331
332     for chromosome in hitChromList:
333         if doNotProcessChromosome(chromosome, doControl, mockChromList):
334             continue
335
336         print "chromosome %s" % (chromosome)
337         hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True,
338                                       doMulti=useMulti, findallOptimize=True, strand=stranded,
339                                       combine5p=combine5p)
340         maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
341         if shiftValue == "learn":
342             shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize,
343                                     mockSampleSize, minRatio, maxSpacing, maxCoord, stringency, readlen, minHits,
344                                     logfilename, outfile, outfilename)
345
346         regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize,
347                                                                   chromosome, useMulti, normalize, maxSpacing,
348                                                                   doDirectionality, doTrim, minHits, minRatio,
349                                                                   readlen, shiftValue, minPeak, minPlusRatio,
350                                                                   maxPlusRatio, leftPlusRatio, listPeak, noMulti,
351                                                                   doControl, factor, trimValue, outputRegionList=True)
352
353         statistics["index"] += regionStats["index"]
354         statistics["total"] += regionStats["total"]
355         statistics["failed"] += regionStats["failed"]
356         if not doRevBackground:
357             if doPvalue:
358                 p, poissonmean = calculatePValue(allRegionWeights)
359
360             print headline
361             shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
362             continue
363
364         #now do background swapping the two samples around
365         print "calculating background..."
366         backgroundTrimValue = 1/20.
367         backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
368                                                                        chromosome, useMulti, normalize, maxSpacing,
369                                                                        doDirectionality, doTrim, minHits, minRatio,
370                                                                        readlen, shiftValue, minPeak, minPlusRatio,
371                                                                        maxPlusRatio, leftPlusRatio, listPeak, noMulti,
372                                                                        doControl, factor, backgroundTrimValue)
373
374         statistics["mIndex"] += backgroundRegionStats["index"]
375         statistics["mTotal"] += backgroundRegionStats["total"]
376         statistics["failed"] += backgroundRegionStats["failed"]
377         print statistics["mIndex"], statistics["mTotal"]
378         if doPvalue:
379             if pValueType == "self":
380                 p, poissonmean = calculatePValue(allRegionWeights)
381             else:
382                 p, poissonmean = calculatePValue(backgroundRegionWeights)
383
384         print headline
385         shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
386
387     footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
388     print footer
389     outfile.write(footer)
390     outfile.close()
391
392     writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
393
394
395 def determineShiftValue(autoshift, shift, noshift, rnaSettings):
396     shiftValue = 0
397     if autoshift:
398         shiftValue = "auto"
399
400     if shift is not None:
401         try:
402             shiftValue = int(shift)
403         except ValueError:
404             if shift == "learn":
405                 shiftValue = "learn"
406                 print "Will try to learn shift"
407
408     if noshift or rnaSettings:
409         shiftValue = 0
410
411     return shiftValue
412
413
414 def doNotProcessChromosome(chromosome, doControl, mockChromList):
415     skipChromosome = False
416     if chromosome == "chrM":
417         skipChromosome = True
418
419     if doControl and (chromosome not in mockChromList):
420         skipChromosome = True
421
422     return skipChromosome
423
424
425 def calculatePValue(dataList):
426     dataList.sort()
427     listSize = float(len(dataList))
428     try:
429         poissonmean = sum(dataList) / listSize
430     except ZeroDivisionError:
431         poissonmean = 0
432
433     print "Poisson n=%d, p=%f" % (listSize, poissonmean)
434     p = math.exp(-poissonmean)
435
436     return p, poissonmean
437
438
439 def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
440                stringency, readlen, minHits, logfilename, outfile, outfilename, minSites=30):
441
442     print "learning shift.... will need at least %d training sites" % minSites
443     previousHit = -1 * maxSpacing
444     hitList = [-1]
445     totalWeight = 0
446     readList = []
447     shiftDict = {}
448     count = 0
449     numStarts = 0
450     for read in hitDict[chrom]:
451         pos = read["start"]
452         sense = read["sense"]
453         weight = read["weight"]
454         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
455             sumAll = totalWeight
456             if normalize:
457                 sumAll /= hitSampleSize
458
459             regionStart = hitList[0]
460             regionStop = hitList[-1]
461             regionLength = regionStop - regionStart
462             # we're going to require stringent settings
463             if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen:
464                 foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
465
466                 if foldRatio >= minRatio:
467                     localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True)
468                     try:
469                         shiftDict[localshift] += 1
470                     except KeyError:
471                         shiftDict[localshift] = 1
472
473                     count += 1
474
475             hitList = []
476             totalWeight = 0
477             readList = []
478             numStarts = 0
479
480         if pos not in hitList:
481             numStarts += 1
482
483         hitList.append(pos)
484         totalWeight += weight
485         readList.append({"start": pos, "sense": sense, "weight": weight})
486         previousHit = pos
487
488     bestShift = 0
489     bestCount = 0
490     learningSettings = ["#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d" % (stringency, stringency * minHits,
491                                                                                                        stringency * minRatio, stringency * readlen),
492                         "#number of training examples: %d" % count]
493     outline = string.join(learningSettings, "\n")
494     print outline
495     writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
496     if count < minSites:
497         outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
498         print >> outfile, outline
499         writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
500         shiftValue = 0
501     else:
502         for shift in sorted(shiftDict):
503             if shiftDict[shift] > bestCount:
504                 bestShift = shift
505                 bestCount = shiftDict[shift]
506
507         shiftValue = bestShift
508         print shiftDict
509
510     outline = "#picked shiftValue to be %d" % shiftValue
511     print outline
512     print >> outfile, outline
513     writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
514
515     return shiftValue
516
517
518 def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
519     if doControl:
520         foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
521     else:
522         foldRatio = minRatio
523
524     return foldRatio
525
526
527 def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll):
528     numMock = 1. + rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
529     if normalize:
530         numMock /= sampleSize
531
532     foldRatio = sumAll / numMock
533
534     return foldRatio
535
536
537 def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
538                 normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
539                 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
540                 noMulti, doControl, factor, trimValue, outputRegionList=False):
541
542     index = 0
543     totalRegionWeight = 0
544     failedCounter = 0
545     previousHit = - 1 * maxSpacing
546     currentHitList = [-1]
547     currentTotalWeight = 0
548     currentUniqReadCount = 0
549     currentReadList = []
550     regionWeights = []
551     outregions = []
552     numStarts = 0
553     hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
554     maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
555     for read in hitDict[chrom]:
556         pos = read["start"]
557         sense = read["sense"]
558         weight = read["weight"]
559         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
560             sumAll = currentTotalWeight
561             if normalize:
562                 sumAll /= rdsSampleSize
563
564             regionStart = currentHitList[0]
565             regionStop = currentHitList[-1]
566             regionWeights.append(int(sumAll))
567             if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
568                 sumMulti = 0.
569                 #first pass uses getFoldRatio on mockRDS as there may not be control
570                 foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
571                 if foldRatio >= minRatio:
572                     # first pass, with absolute numbers
573                     peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
574
575                     bestPos = peak.topPos[0]
576                     numHits = peak.numHits
577                     peakScore = peak.smoothArray[bestPos]
578                     numPlus = peak.numPlus
579                     shift = peak.shift
580                     numLeft = peak.numLeft
581                     if normalize:
582                         peakScore /= rdsSampleSize
583
584                     if doTrim:
585                         minSignalThresh = trimValue * peakScore
586                         start = 0
587                         stop = regionStop - regionStart - 1
588                         startFound = False
589                         while not startFound:
590                             if peak.smoothArray[start] >= minSignalThresh or start == bestPos:
591                                 startFound = True
592                             else:
593                                 start += 1
594
595                         stopFound = False
596                         while not stopFound:
597                             if peak.smoothArray[stop] >= minSignalThresh or stop == bestPos:
598                                 stopFound = True
599                             else:
600                                 stop -= 1
601
602                         regionStop = regionStart + stop
603                         regionStart += start
604                         trimPeak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
605
606                         sumAll = trimPeak.numHits
607                         numPlus = trimPeak.numPlus
608                         numLeft = trimPeak.numLeft
609                         if normalize:
610                             sumAll /= rdsSampleSize
611
612                         foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio)
613                         if outputRegionList:
614                             sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
615                         # just in case it changed, use latest data
616                         try:
617                             bestPos = trimPeak.topPos[0]
618                             peakScore = trimPeak.smoothArray[bestPos]
619                         except:
620                             continue
621
622                         # normalize to RPM
623                         if normalize:
624                             peakScore /= rdsSampleSize
625
626                     elif outputRegionList:
627                         sumMulti = currentTotalWeight - currentUniqReadCount
628
629                     if outputRegionList:
630                         # normalize to RPM
631                         if normalize:
632                             sumMulti /= rdsSampleSize
633
634                         try:
635                             multiP = 100. * (sumMulti / sumAll)
636                         except:
637                             break
638
639                         if noMulti:
640                             multiP = 0.
641
642                     # check that we still pass threshold
643                     if sumAll >= minHits and  foldRatio >= minRatio and (regionStop - regionStart) > readlen:
644                         plusRatio = float(numPlus)/numHits
645                         if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
646                             if outputRegionList:
647                                 peakDescription = ""
648                                 if listPeak:
649                                     peakDescription = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
650
651                             if doDirectionality:
652                                 if leftPlusRatio < numLeft / numPlus:
653                                     index += 1
654                                     if outputRegionList:
655                                         plusP = plusRatio * 100.
656                                         leftP = 100. * numLeft / numPlus
657                                         # we have a region that passes all criteria
658                                         region = Region.DirectionalRegion(regionStart, regionStop + readlen - 1,
659                                                                           factor, index, chrom, sumAll,
660                                                                           foldRatio, multiP, plusP, leftP,
661                                                                           peakDescription, shift)
662                                         outregions.append(region)
663
664                                     totalRegionWeight += sumAll
665                                 else:
666                                     failedCounter += 1
667                             else:
668                                 # we have a region, but didn't check for directionality
669                                 index += 1
670                                 totalRegionWeight += sumAll
671                                 if outputRegionList:
672                                     region = Region.Region(regionStart, regionStop + readlen - 1, factor, index, chrom,
673                                                            sumAll, foldRatio, multiP, peakDescription, shift)
674                                     outregions.append(region)
675
676             currentHitList = []
677             currentTotalWeight = 0
678             currentUniqReadCount = 0
679             currentReadList = []
680             numStarts = 0
681
682         if pos not in currentHitList:
683             numStarts += 1
684
685         currentHitList.append(pos)
686         currentTotalWeight += weight
687         if weight == 1.0:
688             currentUniqReadCount += 1
689
690         currentReadList.append({"start": pos, "sense": sense, "weight": weight})
691         previousHit = pos
692
693     statistics = {"index": index,
694                   "total": totalRegionWeight,
695                   "failed": failedCounter
696     }
697
698     if outputRegionList:
699         return statistics, regionWeights, outregions
700     else:
701         return statistics, regionWeights
702
703
704 def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue):
705     bestShift = 0
706     shiftDict = {}
707     for region in outregions:
708         # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
709         if reportshift:
710             outputList = [region.printRegionWithShift()]
711             if shiftValue == "auto":
712                 try:
713                     shiftDict[region.shift] += 1
714                 except KeyError:
715                     shiftDict[region.shift] = 1
716         else:
717             outputList = [region.printRegion()]
718
719         # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
720         if doPvalue:
721             sumAll = int(region.numReads)
722             for i in xrange(sumAll):
723                 pValue *= poissonmean
724                 pValue /= i+1
725
726             outputList.append("%1.2f" % pValue)
727
728         outline = string.join(outputList, "\t")
729         print outline
730         print >> outfile, outline
731
732     bestCount = 0
733     for shift in sorted(shiftDict):
734         if shiftDict[shift] > bestCount:
735             bestShift = shift
736             bestCount = shiftDict[shift]
737
738     return bestShift
739
740
741 def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue):
742     footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])]
743     if doDirectionality:
744         footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"])
745
746     if doRevBackground:
747         try:
748             percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100)
749         except (ValueError, ZeroDivisionError):
750             percent = 0.
751
752         footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent))
753
754     if shiftValue == "auto" and reportshift:
755         footerList.append("#mode of shift values: %d" % shiftModeValue)
756
757     footer = string.join(footerList, "\n")
758
759     return footer
760
761 if __name__ == "__main__":
762     main(sys.argv)