snapshot of 4.0a development. initial git repo commit
[erange.git] / findall.py
1 """
2     usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
3            [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
4            [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
5            [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
6            [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
7            [--revbackground] [--pvalue self|back|none] [--nodirectionality]
8            [--strandfilter plus/minus] [--trimvalue percent] [--notrim]
9            [--cache pages] [--log altlogfile] [--flag aflag] [--append] [--RNA]
10
11            where values in brackets are optional and label is an arbitrary string.
12
13            Use -ratio (default 4 fold) to set the minimum fold enrichment
14            over the control, -minimum (default 4) is the minimum number of reads
15            (RPM) within the region, and -spacing (default readlen) to set the maximum
16            distance between reads in the region. -listPeak lists the peak of the
17            region. Peaks mut be higher than -minPeak (default 0.5 RPM).
18            Pvalues are calculated from the sample (change with -pvalue),
19            unless the -revbackground flag and a control RDS file are provided.
20
21            By default, all numbers and parameters are on a reads per
22            million (RPM) basis. -raw will treat all settings, ratios and reported
23            numbers as raw counts rather than RPM. Use -notrim to turn off region
24            trimming and -trimvalue to control trimming (default 10% of peak signal)
25
26            The peak finder uses minimal directionality information that can
27            be turned off with -nodirectionality ; the fraction of + strand reads
28            required to be to the left of the peak (default 0.3) can be set with
29            -leftPlus ; -minPlus and -maxPlus change the minimum/maximum fraction
30            of plus reads in a region, which (defaults 0.25 and 0.75, respectively).
31
32            Use -shift to shift reads either by half the expected
33            fragment length (default 0 bp) or '-shift learn ' to learn the shift
34            based on the first chromosome. If you prefer to learn the shift
35            manually, use -autoshift to calculate a per-region shift value, which
36            can be reported using -reportshift. -strandfilter should only be used
37            when explicitely calling unshifted stranded peaks from non-ChIP-seq
38            data such as directional RNA-seq. regionoutfile is written over by
39            default unless given the -append flag.
40 """
41
42 try:
43     import psyco
44     psyco.full()
45 except:
46     pass
47
48 import sys
49 import math
50 import string
51 import optparse
52 from commoncode import readDataset, writeLog, findPeak, getBestShiftForRegion
53
54
55 versionString = "%s: version 3.2" % sys.argv[0]
56 print versionString
57
58 def usage():
59     print __doc__
60
61
62 def main(argv=None):
63     if not argv:
64         argv = sys.argv
65
66     usage = __doc__
67
68     parser = optparse.OptionParser(usage=usage)
69     parser.add_option("--control", dest="mockfile")
70     parser.add_option("--minimum", type="float", dest="minHits")
71     parser.add_option("--ratio", type="float", dest="minRatio")
72     parser.add_option("--spacing", type="int", dest="maxSpacing")
73     parser.add_option("--listPeak", action="store_true", dest="listPeak")
74     parser.add_option("--shift", dest="shift")
75     parser.add_option("--learnFold", type="float", dest="stringency")
76     parser.add_option("--noshift", action="store_true", dest="noShift")
77     parser.add_option("--autoshift", action="store_true", dest="autoshift")
78     parser.add_option("--reportshift", action="store_true", dest="reportshift")
79     parser.add_option("--nomulti", action="store_true", dest="noMulti")
80     parser.add_option("--minPlus", type="float", dest="minPlusRatio")
81     parser.add_option("--maxPlus", type="float", dest="maxPlusRatio")
82     parser.add_option("--leftPlus", type="float", dest="leftPlusRatio")
83     parser.add_option("--minPeak", type="float", dest="minPeak")
84     parser.add_option("--raw", action="store_false", dest="normalize")
85     parser.add_option("--revbackground", action="store_true", dest="doRevBackground")
86     parser.add_option("--pvalue", dest="ptype")
87     parser.add_option("--nodirectionality", action="store_false", dest="doDirectionality")
88     parser.add_option("--strandfilter", dest="strandfilter")
89     parser.add_option("--trimvalue", type="float", dest="trimValue")
90     parser.add_option("--notrim", action="store_false", dest="doTrim")
91     parser.add_option("--cache", type="int", dest="cachePages")
92     parser.add_option("--log", dest="logfilename")
93     parser.add_option("--flag", dest="withFlag")
94     parser.add_option("--append", action="store_true", dest="doAppend")
95     parser.add_option("--RNA", action="store_true", dest="rnaSettings")
96     parser.add_option("--combine5p", action="store_true", dest="combine5p")
97     parser.set_defaults(minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
98                         stringency=4.0, noshift=False, autoshift=False, reportshift=False,
99                         minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
100                         normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
101                         trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
102                         cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
103                         strandfilter=None, combine5p=False)
104
105     (options, args) = parser.parse_args(argv[1:])
106
107     if len(args) < 3:
108         usage()
109         sys.exit(2)
110
111     factor = args[0]
112     hitfile = args[1]
113     outfilename = args[2]
114
115     findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
116             options.stringency, options.noshift, options.autoshift, options.reportshift,
117             options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
118             options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
119             options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
120             options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
121             options.strandfilter, options.combine5p)
122
123
124 def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
125             stringency=4.0, noshift=False, autoshift=False, reportshift=False,
126             minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
127             normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
128             trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
129             cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
130             strandfilter=None, combine5p=False):
131
132     shiftValue = 0
133     if autoshift:
134         shiftValue = "auto"
135
136     if shift is not None:
137         try:
138             shiftValue = int(shift)
139         except ValueError:
140             if shift == "learn":
141                 shiftValue = "learn"
142                 print "Will try to learn shift"
143
144     if noshift:
145         shiftValue = 0
146
147     if trimValue is not None:
148         trimValue = float(trimValue) / 100.
149         trimString = "%2.1f%s" % ((100. * trimValue), "%")
150     else:
151         trimValue = 0.1
152         trimString = "10%"
153
154     if not doTrim:
155         trimString = "none"
156
157     if doRevBackground:
158         print "Swapping IP and background to calculate FDR"
159         pValueType = "back"
160
161     doControl = False
162     if mockfile is not None:
163         doControl = True
164
165     doPvalue = True
166     if ptype is not None:
167         ptype = ptype.upper()
168         if ptype == "NONE":
169             doPvalue = False
170             pValueType = "none"
171             p = 1
172             poissonmean = 0
173         elif ptype == "SELF":
174             pValueType = "self"
175         elif ptype == "BACK":
176             if doControl and doRevBackground:
177                 pValueType = "back"
178             else:
179                 print "must have a control dataset and -revbackground for pValue type 'back'"
180         else:
181             print "could not use pValue type : %s" % ptype
182     else:
183         pValueType = "self"
184
185     if cachePages is not None:
186         doCache = True
187     else:
188         doCache = False
189         cachePages = -1
190
191     if withFlag != "":
192         print "restrict to flag = %s" % withFlag
193
194     useMulti = True
195     if noMulti:
196         print "using unique reads only"
197         useMulti = False
198
199     if rnaSettings:
200         print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
201         shiftValue = 0
202         doTrim = False
203         doDirectionality = False
204
205     stranded = ""
206     if strandfilter is not None:
207         if strandfilter == "plus":
208             stranded = "+"
209             minPlusRatio = 0.9
210             maxPlusRatio = 1.0
211             print "only analyzing reads on the plus strand"
212         elif strandfilter == "minus":
213             stranded = "-"
214             minPlusRatio = 0.0
215             maxPlusRatio = 0.1
216             print "only analyzing reads on the minus strand"
217
218     stringency = max(stringency, 1.0)
219     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
220     if doControl:
221         print "\ncontrol:" 
222         mockRDS = readDataset(mockfile, verbose=True, cache=doCache)
223
224         if cachePages > mockRDS.getDefaultCacheSize():
225             mockRDS.setDBcache(cachePages)
226
227     print "\nsample:" 
228     hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
229     readlen = hitRDS.getReadSize()
230     if rnaSettings:
231         maxSpacing = readlen
232
233     print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
234     print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
235     print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
236
237     if cachePages > hitRDS.getDefaultCacheSize():
238         hitRDS.setDBcache(cachePages)
239
240     hitRDSsize = len(hitRDS) / 1000000.
241     if doControl:
242         mockRDSsize = len(mockRDS) / 1000000.
243
244     if normalize:
245         if doControl:
246             mockSampleSize = mockRDSsize
247
248         hitSampleSize = hitRDSsize
249
250     if doAppend:
251         outfile = open(outfilename, "a")
252     else:
253         outfile = open(outfilename, "w")
254
255     outfile.write("#ERANGE %s\n" % versionString)
256     if doControl:
257         outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)\n" % (hitfile, hitRDSsize, mockfile, mockRDSsize))
258     else:
259         outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample: none\n" % (hitfile, hitRDSsize))
260
261     if withFlag != "":
262         outfile.write("#restrict to Flag = %s\n" % withFlag)
263
264     outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
265     outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
266     outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
267     if normalize:
268         print "Normalizing to RPM"
269         countLabel = "RPM"
270     else:
271         countLabel = "COUNT"
272
273     headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"]
274     if doDirectionality:
275         headerList.append("plus%\tleftPlus%")
276
277     if listPeak:
278         headerList.append("peakPos\tpeakHeight")
279
280     if reportshift:
281         headerList.append("readShift")
282
283     if doPvalue:
284         headerList.append("pValue")
285
286     headline = string.join(headerList, "\t")
287     print >> outfile, headline
288
289     statistics = {"index": 0,
290                   "total": 0,
291                   "mIndex": 0,
292                   "mTotal": 0,
293                   "failed": 0
294     }
295
296     if minRatio < minPeak:
297         minPeak = minRatio
298
299     hitChromList = hitRDS.getChromosomes()
300     if doControl:
301         mockChromList = mockRDS.getChromosomes()
302
303     hitChromList.sort()
304
305     for chromosome in hitChromList:
306         if doNotProcessChromosome(chromosome, doControl, mockChromList):
307             continue
308
309         print "chromosome %s" % (chromosome)
310         hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True, strand=stranded, combine5p=combine5p)
311         maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
312         if shiftValue == "learn":
313             shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
314                                     stringency, readlen, minHits, logfilename, outfile, outfilename)
315
316         regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize, chromosome, useMulti,
317                                                               normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
318                                                               shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
319                                                               noMulti, doControl, factor, trimValue, outputRegionList=True)
320
321         statistics["index"] += regionStats["index"]
322         statistics["total"] += regionStats["total"]
323         statistics["failed"] += regionStats["failed"]
324         if not doRevBackground:
325             if doPvalue:
326                 p, poissonmean = calculatePValue(allRegionWeights)
327
328             print headline
329             shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
330             continue
331
332         #now do background swapping the two samples around
333         print "calculating background..."
334         backgroundTrimValue = 1/20.
335         backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, chromosome, useMulti,
336                                                               normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
337                                                               shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
338                                                               noMulti, doControl, factor, backgroundTrimValue)
339
340         statistics["mIndex"] += backgroundRegionStats["index"]
341         statistics["mTotal"] += backgroundRegionStats["total"]
342         statistics["failed"] += backgroundRegionStats["failed"]
343         print statistics["mIndex"], statistics["mTotal"]
344         if doPvalue:
345             if pValueType == "self":
346                 p, poissonmean = calculatePValue(allRegionWeights)
347             else:
348                 p, poissonmean = calculatePValue(backgroundRegionWeights)
349
350         print headline
351         shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
352
353     footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
354     print footer
355     outfile.write(footer)
356     outfile.close()
357
358     writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
359
360
361 def doNotProcessChromosome(chromosome, doControl, mockChromList):
362     skipChromosome = False
363     if chromosome == "chrM":
364         skipChromosome = True
365
366     if doControl and (chromosome not in mockChromList):
367         skipChromosome = True
368
369     return skipChromosome
370
371
372 def calculatePValue(dataList):
373     dataList.sort()
374     listSize = float(len(dataList))
375     try:
376         poissonmean = sum(dataList) / listSize
377     except ZeroDivisionError:
378         poissonmean = 0
379
380     print "Poisson n=%d, p=%f" % (listSize, poissonmean)
381     p = math.exp(-poissonmean)
382
383     return p, poissonmean
384
385
386 def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
387                stringency, readlen, minHits, logfilename, outfile, outfilename):
388
389     print "learning shift.... will need at least 30 training sites"
390     previousHit = -1 * maxSpacing
391     hitList = [-1]
392     weightList = [0]
393     readList = []
394     shiftDict = {}
395     count = 0
396     numStarts = 0
397     for (pos, sense, weight) in hitDict[chrom]:
398         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
399             sumAll = sum(weightList)
400             if normalize:
401                 sumAll /= hitSampleSize
402
403             regionStart = hitList[0]
404             regionStop = hitList[-1]
405             regionLength = regionStop - regionStart
406             # we're going to require stringent settings
407             if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen:
408                 foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
409
410                 if foldRatio >= minRatio:
411                     localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True)
412                     try:
413                         shiftDict[localshift] += 1
414                     except KeyError:
415                         shiftDict[localshift] = 1
416
417                     count += 1
418
419             hitList = []
420             weightList = []
421             readList = []
422             numStarts = 0
423
424         if pos not in hitList:
425             numStarts += 1
426
427         hitList.append(pos)
428         weightList.append(weight)
429         readList.append((pos, sense, weight))
430         previousHit = pos
431
432     bestShift = 0
433     bestCount = 0
434     outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency, stringency * minHits, stringency * minRatio, stringency * readlen, count)
435     print outline
436     writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
437     if count < 30:
438         outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
439         print outline
440         writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
441         shiftValue = 0
442     else:
443         for shift in sorted(shiftDict):
444             if shiftDict[shift] > bestCount:
445                 bestShift = shift
446                 bestCount = shiftDict[shift]
447
448         shiftValue = bestShift
449         print shiftDict
450
451     outline = "#picked shiftValue to be %d" % shiftValue
452     print outline
453     print >> outfile, outline
454     writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
455
456     return shiftValue
457
458
459 def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
460     if doControl:
461         foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
462     else:
463         foldRatio = minRatio
464
465     return foldRatio
466
467
468 def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll):
469     numMock = 1. + rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
470     if normalize:
471         numMock /= sampleSize
472
473     foldRatio = sumAll / numMock
474
475     return foldRatio
476
477
478 def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
479                 normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
480                 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
481                 noMulti, doControl, factor, trimValue, outputRegionList=False):
482
483     index = 0
484     total = 0
485     failedCounter = 0
486     previousHit = - 1 * maxSpacing
487     currentHitList = [-1]
488     currentWeightList = [0]
489     currentReadList = []
490     regionWeights = []
491     outregions = []
492     numStarts = 0
493     hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
494     maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
495     for (pos, sense, weight) in hitDict[chrom]:
496         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
497             sumAll = sum(currentWeightList)
498             if normalize:
499                 sumAll /= rdsSampleSize
500
501             regionStart = currentHitList[0]
502             regionStop = currentHitList[-1]
503             regionWeights.append(int(sumAll))
504             if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
505                 sumMulti = 0.
506                 #first pass uses getFoldRatio on mockRDS as there may not be control
507                 foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
508                 if foldRatio >= minRatio:
509                     # first pass, with absolute numbers
510                     if doDirectionality:
511                         (topPos, numHits, smoothArray, numPlus, numLeft, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue, returnShift=True)
512                     else:
513                         (topPos, numHits, smoothArray, numPlus, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shiftValue, returnShift=True)
514
515                     bestPos = topPos[0]
516                     peakScore = smoothArray[bestPos]
517                     if normalize:
518                         peakScore /= rdsSampleSize
519
520                     if doTrim:
521                         minSignalThresh = trimValue * peakScore
522                         start = 0
523                         stop = regionStop - regionStart - 1
524                         startFound = False
525                         while not startFound:
526                             if smoothArray[start] >= minSignalThresh or start == bestPos:
527                                 startFound = True
528                             else:
529                                 start += 1
530
531                         stopFound = False
532                         while not stopFound:
533                             if smoothArray[stop] >= minSignalThresh or stop == bestPos:
534                                 stopFound = True
535                             else:
536                                 stop -= 1
537
538                         regionStop = regionStart + stop
539                         regionStart += start
540                         try:
541                             if doDirectionality:
542                                 (topPos, sumAll, smoothArray, numPlus, numLeft) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
543                             else:
544                                 (topPos, sumAll, smoothArray, numPlus) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shift)
545                         except:
546                             continue
547
548                         if normalize:
549                             sumAll /= rdsSampleSize
550
551                         foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio)
552                         if outputRegionList:
553                             sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
554                         # just in case it changed, use latest data
555                         try:
556                             bestPos = topPos[0]
557                             peakScore = smoothArray[bestPos]
558                         except:
559                             continue
560
561                         # normalize to RPM
562                         if normalize:
563                             peakScore /= rdsSampleSize
564
565                     elif outputRegionList:
566                         sumMulti = sum(currentWeightList) - currentWeightList.count(1.0)
567
568                     if outputRegionList:
569                         # normalize to RPM
570                         if normalize:
571                             sumMulti /= rdsSampleSize
572
573                         try:
574                             multiP = 100. * (sumMulti / sumAll)
575                         except:
576                             break
577
578                         if noMulti:
579                             multiP = 0.
580
581                     # check that we still pass threshold
582                     if sumAll >= minHits and  foldRatio >= minRatio and (regionStop - regionStart) > readlen:
583                         plusRatio = float(numPlus)/numHits
584                         if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
585                             if outputRegionList:
586                                 peak = ""
587                                 if listPeak:
588                                     peak = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
589
590                             if doDirectionality:
591                                 if leftPlusRatio < numLeft / numPlus:
592                                     index += 1
593                                     if outputRegionList:
594                                         plusP = plusRatio * 100.
595                                         leftP = 100. * numLeft / numPlus
596                                         # we have a region that passes all criteria
597                                         outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, plusP, leftP, peak, shift))
598
599                                     total += sumAll
600                                 else:
601                                     failedCounter += 1
602                             else:
603                                 # we have a region, but didn't check for directionality
604                                 index += 1
605                                 total += sumAll
606                                 if outputRegionList:
607                                     outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, peak, shift))
608
609             currentHitList = []
610             currentWeightList = []
611             currentReadList = []
612             numStarts = 0
613
614         if pos not in currentHitList:
615             numStarts += 1
616
617         currentHitList.append(pos)
618         currentWeightList.append(weight)
619         currentReadList.append((pos, sense, weight))
620         previousHit = pos
621
622     statistics = {"index": index,
623                   "total": total,
624                   "failed": failedCounter
625     }
626
627     if outputRegionList:
628         return statistics, regionWeights, outregions
629     else:
630         return statistics, regionWeights
631
632
633 def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue):
634     bestShift = 0
635     shiftDict = {}
636     for region in outregions:
637         # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
638         if doPvalue:
639             sumAll = int(region[5])
640             for i in xrange(sumAll):
641                 pValue *= poissonmean
642                 pValue /= i+1
643
644         if shiftValue == "auto" and reportshift:
645             try:
646                 shiftDict[region[-1]] += 1
647             except KeyError:
648                 shiftDict[region[-1]] = 1
649
650         try:
651             if reportshift:
652                 outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s\t%d" % region]
653             else:
654                 outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
655         except:
656             if reportshift:
657                 outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s\t%d" % region]
658             else:
659                 outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
660
661         if doPvalue:
662             outputList.append("%1.2g" % pValue)
663
664         outline = string.join(outputList, "\t")
665         print outline
666         print >> outfile, outline
667
668     if shiftValue == "auto" and reportshift:
669         bestCount = 0
670         for shift in sorted(shiftDict):
671             if shiftDict[shift] > bestCount:
672                 bestShift = shift
673                 bestCount = shiftDict[shift]
674
675     return bestShift
676
677
678 def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue):
679     footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])]
680     if doDirectionality:
681         footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"])
682
683     if doRevBackground:
684         try:
685             percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100)
686         except (ValueError, ZeroDivisionError):
687             percent = 0.
688
689         footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent))
690
691     if shiftValue == "auto" and reportshift:
692         footerList.append("#mode of shift values: %d" % shiftModeValue)
693
694     footer = string.join(footerList, "\n")
695
696     return footer
697
698 if __name__ == "__main__":
699     main(sys.argv)