development checkpoint
[erange.git] / findall.py
1 """
2     usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
3            [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
4            [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
5            [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
6            [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
7            [--revbackground] [--pvalue self|back|none] [--nodirectionality]
8            [--strandfilter plus/minus] [--trimvalue percent] [--notrim]
9            [--cache pages] [--log altlogfile] [--flag aflag] [--append] [--RNA]
10
11            where values in brackets are optional and label is an arbitrary string.
12
13            Use -ratio (default 4 fold) to set the minimum fold enrichment
14            over the control, -minimum (default 4) is the minimum number of reads
15            (RPM) within the region, and -spacing (default readlen) to set the maximum
16            distance between reads in the region. -listPeak lists the peak of the
17            region. Peaks mut be higher than -minPeak (default 0.5 RPM).
18            Pvalues are calculated from the sample (change with -pvalue),
19            unless the -revbackground flag and a control RDS file are provided.
20
21            By default, all numbers and parameters are on a reads per
22            million (RPM) basis. -raw will treat all settings, ratios and reported
23            numbers as raw counts rather than RPM. Use -notrim to turn off region
24            trimming and -trimvalue to control trimming (default 10% of peak signal)
25
26            The peak finder uses minimal directionality information that can
27            be turned off with -nodirectionality ; the fraction of + strand reads
28            required to be to the left of the peak (default 0.3) can be set with
29            -leftPlus ; -minPlus and -maxPlus change the minimum/maximum fraction
30            of plus reads in a region, which (defaults 0.25 and 0.75, respectively).
31
32            Use -shift to shift reads either by half the expected
33            fragment length (default 0 bp) or '-shift learn ' to learn the shift
34            based on the first chromosome. If you prefer to learn the shift
35            manually, use -autoshift to calculate a per-region shift value, which
36            can be reported using -reportshift. -strandfilter should only be used
37            when explicitely calling unshifted stranded peaks from non-ChIP-seq
38            data such as directional RNA-seq. regionoutfile is written over by
39            default unless given the -append flag.
40 """
41
42 try:
43     import psyco
44     psyco.full()
45 except:
46     pass
47
48 import sys
49 import math
50 import string
51 import optparse
52 from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
53 import ReadDataset
54 import Region
55
56
57 versionString = "findall: version 3.2"
58 print versionString
59
60 def usage():
61     print __doc__
62
63
64 def main(argv=None):
65     if not argv:
66         argv = sys.argv
67
68     parser = makeParser()
69     (options, args) = parser.parse_args(argv[1:])
70
71     if len(args) < 3:
72         usage()
73         sys.exit(2)
74
75     factor = args[0]
76     hitfile = args[1]
77     outfilename = args[2]
78
79     findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
80             options.stringency, options.noshift, options.autoshift, options.reportshift,
81             options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
82             options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
83             options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
84             options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
85             options.strandfilter, options.combine5p)
86
87
88 def makeParser():
89     usage = __doc__
90
91     parser = optparse.OptionParser(usage=usage)
92     parser.add_option("--control", dest="mockfile")
93     parser.add_option("--minimum", type="float", dest="minHits")
94     parser.add_option("--ratio", type="float", dest="minRatio")
95     parser.add_option("--spacing", type="int", dest="maxSpacing")
96     parser.add_option("--listPeak", action="store_true", dest="listPeak")
97     parser.add_option("--shift", dest="shift")
98     parser.add_option("--learnFold", type="float", dest="stringency")
99     parser.add_option("--noshift", action="store_true", dest="noShift")
100     parser.add_option("--autoshift", action="store_true", dest="autoshift")
101     parser.add_option("--reportshift", action="store_true", dest="reportshift")
102     parser.add_option("--nomulti", action="store_true", dest="noMulti")
103     parser.add_option("--minPlus", type="float", dest="minPlusRatio")
104     parser.add_option("--maxPlus", type="float", dest="maxPlusRatio")
105     parser.add_option("--leftPlus", type="float", dest="leftPlusRatio")
106     parser.add_option("--minPeak", type="float", dest="minPeak")
107     parser.add_option("--raw", action="store_false", dest="normalize")
108     parser.add_option("--revbackground", action="store_true", dest="doRevBackground")
109     parser.add_option("--pvalue", dest="ptype")
110     parser.add_option("--nodirectionality", action="store_false", dest="doDirectionality")
111     parser.add_option("--strandfilter", dest="strandfilter")
112     parser.add_option("--trimvalue", type="float", dest="trimValue")
113     parser.add_option("--notrim", action="store_false", dest="doTrim")
114     parser.add_option("--cache", type="int", dest="cachePages")
115     parser.add_option("--log", dest="logfilename")
116     parser.add_option("--flag", dest="withFlag")
117     parser.add_option("--append", action="store_true", dest="doAppend")
118     parser.add_option("--RNA", action="store_true", dest="rnaSettings")
119     parser.add_option("--combine5p", action="store_true", dest="combine5p")
120
121     configParser = getConfigParser()
122     section = "findall"
123     minHits = getConfigFloatOption(configParser, section, "minHits", 4.0)
124     minRatio = getConfigFloatOption(configParser, section, "minRatio", 4.0)
125     maxSpacing = getConfigIntOption(configParser, section, "maxSpacing", 50)
126     listPeak = getConfigBoolOption(configParser, section, "listPeak", False)
127     shift = getConfigOption(configParser, section, "shift", None)
128     stringency = getConfigFloatOption(configParser, section, "stringency", 4.0)
129     noshift = getConfigBoolOption(configParser, section, "noshift", False)
130     autoshift = getConfigBoolOption(configParser, section, "autoshift", False)
131     reportshift = getConfigBoolOption(configParser, section, "reportshift", False)
132     minPlusRatio = getConfigFloatOption(configParser, section, "minPlusRatio", 0.25)
133     maxPlusRatio = getConfigFloatOption(configParser, section, "maxPlusRatio", 0.75)
134     leftPlusRatio = getConfigFloatOption(configParser, section, "leftPlusRatio", 0.3)
135     minPeak = getConfigFloatOption(configParser, section, "minPeak", 0.5)
136     normalize = getConfigBoolOption(configParser, section, "normalize", True)
137     logfilename = getConfigOption(configParser, section, "logfilename", "findall.log")
138     withFlag = getConfigOption(configParser, section, "withFlag", "")
139     doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True)
140     trimValue = getConfigOption(configParser, section, "trimValue", None)
141     doTrim = getConfigBoolOption(configParser, section, "doTrim", True)
142     doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
143     rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False)
144     cachePages = getConfigOption(configParser, section, "cachePages", None)
145     ptype = getConfigOption(configParser, section, "ptype", None)
146     mockfile = getConfigOption(configParser, section, "mockfile", None)
147     doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False)
148     noMulti = getConfigBoolOption(configParser, section, "noMulti", False)
149     strandfilter = getConfigOption(configParser, section, "strandfilter", None)
150     combine5p = getConfigBoolOption(configParser, section, "combine5p", False)
151
152     parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift,
153                         stringency=stringency, noshift=noshift, autoshift=autoshift, reportshift=reportshift,
154                         minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak,
155                         normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality,
156                         trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings,
157                         cachePages=cachePages, ptype=ptype, mockfile=mockfile, doRevBackground=doRevBackground, noMulti=noMulti,
158                         strandfilter=strandfilter, combine5p=combine5p)
159
160     return parser
161
162
163 def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
164             stringency=4.0, noshift=False, autoshift=False, reportshift=False,
165             minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
166             normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
167             trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
168             cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
169             strandfilter=None, combine5p=False):
170
171     shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
172     doControl = False
173     if mockfile is not None:
174         doControl = True
175
176     if doRevBackground:
177         print "Swapping IP and background to calculate FDR"
178         pValueType = "back"
179
180     doPvalue = True
181     if ptype is not None:
182         ptype = ptype.upper()
183         if ptype == "NONE":
184             doPvalue = False
185             pValueType = "none"
186             p = 1
187             poissonmean = 0
188         elif ptype == "SELF":
189             pValueType = "self"
190         elif ptype == "BACK":
191             if doControl and doRevBackground:
192                 pValueType = "back"
193             else:
194                 print "must have a control dataset and -revbackground for pValue type 'back'"
195         else:
196             print "could not use pValue type : %s" % ptype
197     else:
198         pValueType = "self"
199
200     if withFlag != "":
201         print "restrict to flag = %s" % withFlag
202
203     useMulti = True
204     if noMulti:
205         print "using unique reads only"
206         useMulti = False
207
208     if rnaSettings:
209         print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
210         doTrim = False
211         doDirectionality = False
212
213     stranded = ""
214     if strandfilter is not None:
215         if strandfilter == "plus":
216             stranded = "+"
217             minPlusRatio = 0.9
218             maxPlusRatio = 1.0
219             print "only analyzing reads on the plus strand"
220         elif strandfilter == "minus":
221             stranded = "-"
222             minPlusRatio = 0.0
223             maxPlusRatio = 0.1
224             print "only analyzing reads on the minus strand"
225
226     stringency = max(stringency, 1.0)
227     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
228     if cachePages is not None:
229         doCache = True
230     else:
231         doCache = False
232         cachePages = -1
233
234     mockRDS = None
235     if doControl:
236         print "\ncontrol:" 
237         mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
238
239         if cachePages > mockRDS.getDefaultCacheSize():
240             mockRDS.setDBcache(cachePages)
241
242     print "\nsample:" 
243     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
244     readlen = hitRDS.getReadSize()
245     if rnaSettings:
246         maxSpacing = readlen
247
248     if cachePages > hitRDS.getDefaultCacheSize():
249         hitRDS.setDBcache(cachePages)
250
251     if trimValue is not None:
252         trimValue = float(trimValue) / 100.
253         trimString = "%2.1f%s" % ((100. * trimValue), "%")
254     else:
255         trimValue = 0.1
256         trimString = "10%"
257
258     if not doTrim:
259         trimString = "none"
260
261     if doAppend:
262         fileMode = "a"
263     else:
264         fileMode = "w"
265
266     outfile = open(outfilename, fileMode)
267     outfile.write("#ERANGE %s\n" % versionString)
268     if doControl:
269         mockRDSsize = len(mockRDS) / 1000000.
270         controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
271     else:
272         mockRDSsize = 0
273         controlSampleString = " none"
274
275     hitRDSsize = len(hitRDS) / 1000000.
276     outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:%s\n" % (hitfile, hitRDSsize, controlSampleString))
277
278     if withFlag != "":
279         outfile.write("#restrict to Flag = %s\n" % withFlag)
280
281     print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
282     print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
283     print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
284
285     outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
286     outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
287     outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
288     if normalize:
289         print "Normalizing to RPM"
290         countLabel = "RPM"
291     else:
292         countLabel = "COUNT"
293
294     headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"]
295     if doDirectionality:
296         headerList.append("plus%\tleftPlus%")
297
298     if listPeak:
299         headerList.append("peakPos\tpeakHeight")
300
301     if reportshift:
302         headerList.append("readShift")
303
304     if doPvalue:
305         headerList.append("pValue")
306
307     headline = string.join(headerList, "\t")
308     print >> outfile, headline
309
310     statistics = {"index": 0,
311                   "total": 0,
312                   "mIndex": 0,
313                   "mTotal": 0,
314                   "failed": 0
315     }
316
317     if minRatio < minPeak:
318         minPeak = minRatio
319
320     hitChromList = hitRDS.getChromosomes()
321     if doControl:
322         mockChromList = mockRDS.getChromosomes()
323     else:
324         mockChromList = []
325
326     mockSampleSize = 0
327     hitSampleSize = 0
328     if normalize:
329         if doControl:
330             mockSampleSize = mockRDSsize
331
332         hitSampleSize = hitRDSsize
333
334     hitChromList.sort()
335
336     for chromosome in hitChromList:
337         if doNotProcessChromosome(chromosome, doControl, mockChromList):
338             continue
339
340         print "chromosome %s" % (chromosome)
341         hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True,
342                                       doMulti=useMulti, findallOptimize=True, strand=stranded,
343                                       combine5p=combine5p)
344         maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
345         if shiftValue == "learn":
346             shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize,
347                                     mockSampleSize, minRatio, maxSpacing, maxCoord, stringency, readlen, minHits,
348                                     logfilename, outfile, outfilename)
349
350         regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize,
351                                                                   chromosome, useMulti, normalize, maxSpacing,
352                                                                   doDirectionality, doTrim, minHits, minRatio,
353                                                                   readlen, shiftValue, minPeak, minPlusRatio,
354                                                                   maxPlusRatio, leftPlusRatio, listPeak, noMulti,
355                                                                   doControl, factor, trimValue, outputRegionList=True)
356
357         statistics["index"] += regionStats["index"]
358         statistics["total"] += regionStats["total"]
359         statistics["failed"] += regionStats["failed"]
360         if not doRevBackground:
361             if doPvalue:
362                 p, poissonmean = calculatePValue(allRegionWeights)
363
364             print headline
365             shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
366             continue
367
368         #now do background swapping the two samples around
369         if mockRDS is not None:
370             print "calculating background..."
371             backgroundTrimValue = 1/20.
372             backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
373                                                                            chromosome, useMulti, normalize, maxSpacing,
374                                                                            doDirectionality, doTrim, minHits, minRatio,
375                                                                            readlen, shiftValue, minPeak, minPlusRatio,
376                                                                            maxPlusRatio, leftPlusRatio, listPeak, noMulti,
377                                                                            doControl, factor, backgroundTrimValue)
378
379             statistics["mIndex"] += backgroundRegionStats["index"]
380             statistics["mTotal"] += backgroundRegionStats["total"]
381             statistics["failed"] += backgroundRegionStats["failed"]
382             print statistics["mIndex"], statistics["mTotal"]
383             if doPvalue:
384                 if pValueType == "self":
385                     p, poissonmean = calculatePValue(allRegionWeights)
386                 else:
387                     p, poissonmean = calculatePValue(backgroundRegionWeights)
388
389             print headline
390             shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
391
392     footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
393     print footer
394     outfile.write(footer)
395     outfile.close()
396
397     writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
398
399
400 def determineShiftValue(autoshift, shift, noshift, rnaSettings):
401     shiftValue = 0
402     if autoshift:
403         shiftValue = "auto"
404
405     if shift is not None:
406         try:
407             shiftValue = int(shift)
408         except ValueError:
409             if shift == "learn":
410                 shiftValue = "learn"
411                 print "Will try to learn shift"
412
413     if noshift or rnaSettings:
414         shiftValue = 0
415
416     return shiftValue
417
418
419 def doNotProcessChromosome(chromosome, doControl, mockChromList):
420     skipChromosome = False
421     if chromosome == "chrM":
422         skipChromosome = True
423
424     if doControl and (chromosome not in mockChromList):
425         skipChromosome = True
426
427     return skipChromosome
428
429
430 def calculatePValue(dataList):
431     dataList.sort()
432     listSize = float(len(dataList))
433     try:
434         poissonmean = sum(dataList) / listSize
435     except ZeroDivisionError:
436         poissonmean = 0
437
438     print "Poisson n=%d, p=%f" % (listSize, poissonmean)
439     p = math.exp(-poissonmean)
440
441     return p, poissonmean
442
443
444 def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
445                stringency, readlen, minHits, logfilename, outfile, outfilename, minSites=30):
446
447     print "learning shift.... will need at least %d training sites" % minSites
448     previousHit = -1 * maxSpacing
449     hitList = [-1]
450     totalWeight = 0
451     readList = []
452     shiftDict = {}
453     count = 0
454     numStarts = 0
455     for read in hitDict[chrom]:
456         pos = read["start"]
457         sense = read["sense"]
458         weight = read["weight"]
459         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
460             sumAll = totalWeight
461             if normalize:
462                 sumAll /= hitSampleSize
463
464             regionStart = hitList[0]
465             regionStop = hitList[-1]
466             regionLength = regionStop - regionStart
467             # we're going to require stringent settings
468             if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen:
469                 foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
470
471                 if foldRatio >= minRatio:
472                     localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True)
473                     try:
474                         shiftDict[localshift] += 1
475                     except KeyError:
476                         shiftDict[localshift] = 1
477
478                     count += 1
479
480             hitList = []
481             totalWeight = 0
482             readList = []
483             numStarts = 0
484
485         if pos not in hitList:
486             numStarts += 1
487
488         hitList.append(pos)
489         totalWeight += weight
490         readList.append({"start": pos, "sense": sense, "weight": weight})
491         previousHit = pos
492
493     bestShift = 0
494     bestCount = 0
495     learningSettings = ["#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d" % (stringency, stringency * minHits,
496                                                                                                        stringency * minRatio, stringency * readlen),
497                         "#number of training examples: %d" % count]
498     outline = string.join(learningSettings, "\n")
499     print outline
500     writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
501     if count < minSites:
502         outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
503         print >> outfile, outline
504         writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
505         shiftValue = 0
506     else:
507         for shift in sorted(shiftDict):
508             if shiftDict[shift] > bestCount:
509                 bestShift = shift
510                 bestCount = shiftDict[shift]
511
512         shiftValue = bestShift
513         print shiftDict
514
515     outline = "#picked shiftValue to be %d" % shiftValue
516     print outline
517     print >> outfile, outline
518     writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
519
520     return shiftValue
521
522
523 def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
524     if doControl and rds is not None:
525         foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
526     else:
527         foldRatio = minRatio
528
529     return foldRatio
530
531
532 def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll):
533     numMock = 1. + rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
534     if normalize:
535         numMock /= sampleSize
536
537     foldRatio = sumAll / numMock
538
539     return foldRatio
540
541
542 def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
543                 normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
544                 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
545                 noMulti, doControl, factor, trimValue, outputRegionList=False):
546
547     index = 0
548     totalRegionWeight = 0
549     failedCounter = 0
550     previousHit = - 1 * maxSpacing
551     currentHitList = [-1]
552     currentTotalWeight = 0
553     currentUniqReadCount = 0
554     currentReadList = []
555     regionWeights = []
556     outregions = []
557     numStarts = 0
558     hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
559     maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
560     for read in hitDict[chrom]:
561         pos = read["start"]
562         sense = read["sense"]
563         weight = read["weight"]
564         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
565             sumAll = currentTotalWeight
566             if normalizeToRPM:
567                 sumAll /= rdsSampleSize
568
569             regionStart = currentHitList[0]
570             regionStop = currentHitList[-1]
571             regionWeights.append(int(sumAll))
572             if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
573                 sumMulti = 0.
574                 foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
575                 if foldRatio >= minRatio:
576                     # first pass, with absolute numbers
577                     peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
578
579                     bestPos = peak.topPos[0]
580                     numHits = peak.numHits
581                     peakScore = peak.smoothArray[bestPos]
582                     numPlus = peak.numPlus
583                     shift = peak.shift
584                     numLeft = peak.numLeftPlus
585                     if normalizeToRPM:
586                         peakScore /= rdsSampleSize
587
588                     if doTrim:
589                         minSignalThresh = trimValue * peakScore
590                         start = 0
591                         stop = regionStop - regionStart - 1
592                         startFound = False
593                         while not startFound:
594                             if peak.smoothArray[start] >= minSignalThresh or start == bestPos:
595                                 startFound = True
596                             else:
597                                 start += 1
598
599                         stopFound = False
600                         while not stopFound:
601                             if peak.smoothArray[stop] >= minSignalThresh or stop == bestPos:
602                                 stopFound = True
603                             else:
604                                 stop -= 1
605
606                         regionStop = regionStart + stop
607                         regionStart += start
608                         trimPeak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
609
610                         sumAll = trimPeak.numHits
611                         numPlus = trimPeak.numPlus
612                         numLeft = trimPeak.numLeftPlus
613                         if normalizeToRPM:
614                             sumAll /= rdsSampleSize
615
616                         foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
617                         if outputRegionList:
618                             sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
619                         # just in case it changed, use latest data
620                         try:
621                             bestPos = trimPeak.topPos[0]
622                             peakScore = trimPeak.smoothArray[bestPos]
623                         except:
624                             continue
625
626                         if normalizeToRPM:
627                             peakScore /= rdsSampleSize
628
629                     elif outputRegionList:
630                         sumMulti = currentTotalWeight - currentUniqReadCount
631
632                     if outputRegionList:
633                         if normalizeToRPM:
634                             sumMulti /= rdsSampleSize
635
636                         try:
637                             multiP = 100. * (sumMulti / sumAll)
638                         except:
639                             break
640
641                         if noMulti:
642                             multiP = 0.
643
644                     # check that we still pass threshold
645                     if sumAll >= minHits and  foldRatio >= minRatio and (regionStop - regionStart) > readlen:
646                         plusRatio = float(numPlus)/numHits
647                         if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
648                             if outputRegionList:
649                                 peakDescription = ""
650                                 if listPeak:
651                                     peakDescription = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
652
653                             if doDirectionality:
654                                 if leftPlusRatio < numLeft / numPlus:
655                                     index += 1
656                                     if outputRegionList:
657                                         plusP = plusRatio * 100.
658                                         leftP = 100. * numLeft / numPlus
659                                         # we have a region that passes all criteria
660                                         region = Region.DirectionalRegion(regionStart, regionStop + readlen - 1,
661                                                                           factor, index, chrom, sumAll,
662                                                                           foldRatio, multiP, plusP, leftP,
663                                                                           peakDescription, shift)
664                                         outregions.append(region)
665
666                                     totalRegionWeight += sumAll
667                                 else:
668                                     failedCounter += 1
669                             else:
670                                 # we have a region, but didn't check for directionality
671                                 index += 1
672                                 totalRegionWeight += sumAll
673                                 if outputRegionList:
674                                     region = Region.Region(regionStart, regionStop + readlen - 1, factor, index, chrom,
675                                                            sumAll, foldRatio, multiP, peakDescription, shift)
676                                     outregions.append(region)
677
678             currentHitList = []
679             currentTotalWeight = 0
680             currentUniqReadCount = 0
681             currentReadList = []
682             numStarts = 0
683
684         if pos not in currentHitList:
685             numStarts += 1
686
687         currentHitList.append(pos)
688         currentTotalWeight += weight
689         if weight == 1.0:
690             currentUniqReadCount += 1
691
692         currentReadList.append({"start": pos, "sense": sense, "weight": weight})
693         previousHit = pos
694
695     statistics = {"index": index,
696                   "total": totalRegionWeight,
697                   "failed": failedCounter
698     }
699
700     if outputRegionList:
701         return statistics, regionWeights, outregions
702     else:
703         return statistics, regionWeights
704
705
706 def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue):
707     bestShift = 0
708     shiftDict = {}
709     for region in outregions:
710         # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
711         if reportshift:
712             outputList = [region.printRegionWithShift()]
713             if shiftValue == "auto":
714                 try:
715                     shiftDict[region.shift] += 1
716                 except KeyError:
717                     shiftDict[region.shift] = 1
718         else:
719             outputList = [region.printRegion()]
720
721         # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
722         if doPvalue:
723             sumAll = int(region.numReads)
724             for i in xrange(sumAll):
725                 pValue *= poissonmean
726                 pValue /= i+1
727
728             outputList.append("%1.2g" % pValue)
729
730         outline = string.join(outputList, "\t")
731         print outline
732         print >> outfile, outline
733
734     bestCount = 0
735     for shift in sorted(shiftDict):
736         if shiftDict[shift] > bestCount:
737             bestShift = shift
738             bestCount = shiftDict[shift]
739
740     return bestShift
741
742
743 def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue):
744     footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])]
745     if doDirectionality:
746         footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"])
747
748     if doRevBackground:
749         try:
750             percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100)
751         except (ValueError, ZeroDivisionError):
752             percent = 0.
753
754         footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent))
755
756     if shiftValue == "auto" and reportshift:
757         footerList.append("#mode of shift values: %d" % shiftModeValue)
758
759     footer = string.join(footerList, "\n")
760
761     return footer
762
763 if __name__ == "__main__":
764     main(sys.argv)