Release version for Erange 4.0a
[erange.git] / findall.py
1 """
2     usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
3            [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
4            [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
5            [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
6            [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
7            [--revbackground] [--pvalue self|back|none] [--nodirectionality]
8            [--strandfilter plus/minus] [--trimvalue percent] [--notrim]
9            [--cache pages] [--log altlogfile] [--flag aflag] [--append] [--RNA]
10
11            where values in brackets are optional and label is an arbitrary string.
12
13            Use -ratio (default 4 fold) to set the minimum fold enrichment
14            over the control, -minimum (default 4) is the minimum number of reads
15            (RPM) within the region, and -spacing (default readlen) to set the maximum
16            distance between reads in the region. -listPeak lists the peak of the
17            region. Peaks mut be higher than -minPeak (default 0.5 RPM).
18            Pvalues are calculated from the sample (change with -pvalue),
19            unless the -revbackground flag and a control RDS file are provided.
20
21            By default, all numbers and parameters are on a reads per
22            million (RPM) basis. -raw will treat all settings, ratios and reported
23            numbers as raw counts rather than RPM. Use -notrim to turn off region
24            trimming and -trimvalue to control trimming (default 10% of peak signal)
25
26            The peak finder uses minimal directionality information that can
27            be turned off with -nodirectionality ; the fraction of + strand reads
28            required to be to the left of the peak (default 0.3) can be set with
29            -leftPlus ; -minPlus and -maxPlus change the minimum/maximum fraction
30            of plus reads in a region, which (defaults 0.25 and 0.75, respectively).
31
32            Use -shift to shift reads either by half the expected
33            fragment length (default 0 bp) or '-shift learn ' to learn the shift
34            based on the first chromosome. If you prefer to learn the shift
35            manually, use -autoshift to calculate a per-region shift value, which
36            can be reported using -reportshift. -strandfilter should only be used
37            when explicitely calling unshifted stranded peaks from non-ChIP-seq
38            data such as directional RNA-seq. regionoutfile is written over by
39            default unless given the -append flag.
40 """
41
42 try:
43     import psyco
44     psyco.full()
45 except:
46     pass
47
48 import sys
49 import math
50 import string
51 import optparse
52 from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
53 import ReadDataset
54 import Region
55
56
57 versionString = "findall: version 3.2"
58 print versionString
59
60 def usage():
61     print __doc__
62
63
64 def main(argv=None):
65     if not argv:
66         argv = sys.argv
67
68     parser = makeParser()
69     (options, args) = parser.parse_args(argv[1:])
70
71     if len(args) < 3:
72         usage()
73         sys.exit(2)
74
75     factor = args[0]
76     hitfile = args[1]
77     outfilename = args[2]
78
79     findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
80             options.stringency, options.noshift, options.autoshift, options.reportshift,
81             options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
82             options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
83             options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
84             options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
85             options.strandfilter, options.combine5p)
86
87
88 def makeParser():
89     usage = __doc__
90
91     parser = optparse.OptionParser(usage=usage)
92     parser.add_option("--control", dest="mockfile")
93     parser.add_option("--minimum", type="float", dest="minHits")
94     parser.add_option("--ratio", type="float", dest="minRatio")
95     parser.add_option("--spacing", type="int", dest="maxSpacing")
96     parser.add_option("--listPeak", action="store_true", dest="listPeak")
97     parser.add_option("--shift", dest="shift")
98     parser.add_option("--learnFold", type="float", dest="stringency")
99     parser.add_option("--noshift", action="store_true", dest="noShift")
100     parser.add_option("--autoshift", action="store_true", dest="autoshift")
101     parser.add_option("--reportshift", action="store_true", dest="reportshift")
102     parser.add_option("--nomulti", action="store_true", dest="noMulti")
103     parser.add_option("--minPlus", type="float", dest="minPlusRatio")
104     parser.add_option("--maxPlus", type="float", dest="maxPlusRatio")
105     parser.add_option("--leftPlus", type="float", dest="leftPlusRatio")
106     parser.add_option("--minPeak", type="float", dest="minPeak")
107     parser.add_option("--raw", action="store_false", dest="normalize")
108     parser.add_option("--revbackground", action="store_true", dest="doRevBackground")
109     parser.add_option("--pvalue", dest="ptype")
110     parser.add_option("--nodirectionality", action="store_false", dest="doDirectionality")
111     parser.add_option("--strandfilter", dest="strandfilter")
112     parser.add_option("--trimvalue", type="float", dest="trimValue")
113     parser.add_option("--notrim", action="store_false", dest="doTrim")
114     parser.add_option("--cache", type="int", dest="cachePages")
115     parser.add_option("--log", dest="logfilename")
116     parser.add_option("--flag", dest="withFlag")
117     parser.add_option("--append", action="store_true", dest="doAppend")
118     parser.add_option("--RNA", action="store_true", dest="rnaSettings")
119     parser.add_option("--combine5p", action="store_true", dest="combine5p")
120
121     configParser = getConfigParser()
122     section = "findall"
123     minHits = getConfigFloatOption(configParser, section, "minHits", 4.0)
124     minRatio = getConfigFloatOption(configParser, section, "minRatio", 4.0)
125     maxSpacing = getConfigIntOption(configParser, section, "maxSpacing", 50)
126     listPeak = getConfigBoolOption(configParser, section, "listPeak", False)
127     shift = getConfigOption(configParser, section, "shift", None)
128     stringency = getConfigFloatOption(configParser, section, "stringency", 4.0)
129     noshift = getConfigBoolOption(configParser, section, "noshift", False)
130     autoshift = getConfigBoolOption(configParser, section, "autoshift", False)
131     reportshift = getConfigBoolOption(configParser, section, "reportshift", False)
132     minPlusRatio = getConfigFloatOption(configParser, section, "minPlusRatio", 0.25)
133     maxPlusRatio = getConfigFloatOption(configParser, section, "maxPlusRatio", 0.75)
134     leftPlusRatio = getConfigFloatOption(configParser, section, "leftPlusRatio", 0.3)
135     minPeak = getConfigFloatOption(configParser, section, "minPeak", 0.5)
136     normalize = getConfigBoolOption(configParser, section, "normalize", True)
137     logfilename = getConfigOption(configParser, section, "logfilename", "findall.log")
138     withFlag = getConfigOption(configParser, section, "withFlag", "")
139     doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True)
140     trimValue = getConfigOption(configParser, section, "trimValue", None)
141     doTrim = getConfigBoolOption(configParser, section, "doTrim", True)
142     doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
143     rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False)
144     cachePages = getConfigOption(configParser, section, "cachePages", None)
145     ptype = getConfigOption(configParser, section, "ptype", None)
146     mockfile = getConfigOption(configParser, section, "mockfile", None)
147     doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False)
148     noMulti = getConfigBoolOption(configParser, section, "noMulti", False)
149     strandfilter = getConfigOption(configParser, section, "strandfilter", None)
150     combine5p = getConfigBoolOption(configParser, section, "combine5p", False)
151
152     parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift,
153                         stringency=stringency, noshift=noshift, autoshift=autoshift, reportshift=reportshift,
154                         minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak,
155                         normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality,
156                         trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings,
157                         cachePages=cachePages, ptype=ptype, mockfile=mockfile, doRevBackground=doRevBackground, noMulti=noMulti,
158                         strandfilter=strandfilter, combine5p=combine5p)
159
160     return parser
161
162
163 def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
164             stringency=4.0, noshift=False, autoshift=False, reportshift=False,
165             minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
166             normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
167             trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
168             cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
169             strandfilter=None, combine5p=False):
170
171     shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
172     doControl = False
173     if mockfile is not None:
174         doControl = True
175
176     if doRevBackground:
177         print "Swapping IP and background to calculate FDR"
178         pValueType = "back"
179
180     doPvalue = True
181     if ptype is not None:
182         ptype = ptype.upper()
183         if ptype == "NONE":
184             doPvalue = False
185             pValueType = "none"
186             p = 1
187             poissonmean = 0
188         elif ptype == "SELF":
189             pValueType = "self"
190         elif ptype == "BACK":
191             if doControl and doRevBackground:
192                 pValueType = "back"
193             else:
194                 print "must have a control dataset and -revbackground for pValue type 'back'"
195         else:
196             print "could not use pValue type : %s" % ptype
197     else:
198         pValueType = "self"
199
200     if withFlag != "":
201         print "restrict to flag = %s" % withFlag
202
203     useMulti = True
204     if noMulti:
205         print "using unique reads only"
206         useMulti = False
207
208     if rnaSettings:
209         print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
210         doTrim = False
211         doDirectionality = False
212
213     stranded = ""
214     if strandfilter is not None:
215         if strandfilter == "plus":
216             stranded = "+"
217             minPlusRatio = 0.9
218             maxPlusRatio = 1.0
219             print "only analyzing reads on the plus strand"
220         elif strandfilter == "minus":
221             stranded = "-"
222             minPlusRatio = 0.0
223             maxPlusRatio = 0.1
224             print "only analyzing reads on the minus strand"
225
226     stringency = max(stringency, 1.0)
227     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
228     if cachePages is not None:
229         doCache = True
230     else:
231         doCache = False
232         cachePages = -1
233
234     if doControl:
235         print "\ncontrol:" 
236         mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
237
238         if cachePages > mockRDS.getDefaultCacheSize():
239             mockRDS.setDBcache(cachePages)
240
241     print "\nsample:" 
242     hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
243     readlen = hitRDS.getReadSize()
244     if rnaSettings:
245         maxSpacing = readlen
246
247     if cachePages > hitRDS.getDefaultCacheSize():
248         hitRDS.setDBcache(cachePages)
249
250     if trimValue is not None:
251         trimValue = float(trimValue) / 100.
252         trimString = "%2.1f%s" % ((100. * trimValue), "%")
253     else:
254         trimValue = 0.1
255         trimString = "10%"
256
257     if not doTrim:
258         trimString = "none"
259
260     if doAppend:
261         fileMode = "a"
262     else:
263         fileMode = "w"
264
265     outfile = open(outfilename, fileMode)
266     outfile.write("#ERANGE %s\n" % versionString)
267     if doControl:
268         mockRDSsize = len(mockRDS) / 1000000.
269         controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
270     else:
271         controlSampleString = " none"
272
273     hitRDSsize = len(hitRDS) / 1000000.
274     outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:%s\n" % (hitfile, hitRDSsize, controlSampleString))
275
276     if withFlag != "":
277         outfile.write("#restrict to Flag = %s\n" % withFlag)
278
279     print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
280     print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
281     print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
282
283     outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
284     outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
285     outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
286     if normalize:
287         print "Normalizing to RPM"
288         countLabel = "RPM"
289     else:
290         countLabel = "COUNT"
291
292     headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"]
293     if doDirectionality:
294         headerList.append("plus%\tleftPlus%")
295
296     if listPeak:
297         headerList.append("peakPos\tpeakHeight")
298
299     if reportshift:
300         headerList.append("readShift")
301
302     if doPvalue:
303         headerList.append("pValue")
304
305     headline = string.join(headerList, "\t")
306     print >> outfile, headline
307
308     statistics = {"index": 0,
309                   "total": 0,
310                   "mIndex": 0,
311                   "mTotal": 0,
312                   "failed": 0
313     }
314
315     if minRatio < minPeak:
316         minPeak = minRatio
317
318     hitChromList = hitRDS.getChromosomes()
319     if doControl:
320         mockChromList = mockRDS.getChromosomes()
321
322     if normalize:
323         if doControl:
324             mockSampleSize = mockRDSsize
325
326         hitSampleSize = hitRDSsize
327
328     hitChromList.sort()
329
330     for chromosome in hitChromList:
331         if doNotProcessChromosome(chromosome, doControl, mockChromList):
332             continue
333
334         print "chromosome %s" % (chromosome)
335         hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True,
336                                       doMulti=useMulti, findallOptimize=True, strand=stranded,
337                                       combine5p=combine5p)
338         maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
339         if shiftValue == "learn":
340             shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize,
341                                     mockSampleSize, minRatio, maxSpacing, maxCoord, stringency, readlen, minHits,
342                                     logfilename, outfile, outfilename)
343
344         regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize,
345                                                                   chromosome, useMulti, normalize, maxSpacing,
346                                                                   doDirectionality, doTrim, minHits, minRatio,
347                                                                   readlen, shiftValue, minPeak, minPlusRatio,
348                                                                   maxPlusRatio, leftPlusRatio, listPeak, noMulti,
349                                                                   doControl, factor, trimValue, outputRegionList=True)
350
351         statistics["index"] += regionStats["index"]
352         statistics["total"] += regionStats["total"]
353         statistics["failed"] += regionStats["failed"]
354         if not doRevBackground:
355             if doPvalue:
356                 p, poissonmean = calculatePValue(allRegionWeights)
357
358             print headline
359             shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
360             continue
361
362         #now do background swapping the two samples around
363         print "calculating background..."
364         backgroundTrimValue = 1/20.
365         backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
366                                                                        chromosome, useMulti, normalize, maxSpacing,
367                                                                        doDirectionality, doTrim, minHits, minRatio,
368                                                                        readlen, shiftValue, minPeak, minPlusRatio,
369                                                                        maxPlusRatio, leftPlusRatio, listPeak, noMulti,
370                                                                        doControl, factor, backgroundTrimValue)
371
372         statistics["mIndex"] += backgroundRegionStats["index"]
373         statistics["mTotal"] += backgroundRegionStats["total"]
374         statistics["failed"] += backgroundRegionStats["failed"]
375         print statistics["mIndex"], statistics["mTotal"]
376         if doPvalue:
377             if pValueType == "self":
378                 p, poissonmean = calculatePValue(allRegionWeights)
379             else:
380                 p, poissonmean = calculatePValue(backgroundRegionWeights)
381
382         print headline
383         shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
384
385     footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
386     print footer
387     outfile.write(footer)
388     outfile.close()
389
390     writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
391
392
393 def determineShiftValue(autoshift, shift, noshift, rnaSettings):
394     shiftValue = 0
395     if autoshift:
396         shiftValue = "auto"
397
398     if shift is not None:
399         try:
400             shiftValue = int(shift)
401         except ValueError:
402             if shift == "learn":
403                 shiftValue = "learn"
404                 print "Will try to learn shift"
405
406     if noshift or rnaSettings:
407         shiftValue = 0
408
409     return shiftValue
410
411
412 def doNotProcessChromosome(chromosome, doControl, mockChromList):
413     skipChromosome = False
414     if chromosome == "chrM":
415         skipChromosome = True
416
417     if doControl and (chromosome not in mockChromList):
418         skipChromosome = True
419
420     return skipChromosome
421
422
423 def calculatePValue(dataList):
424     dataList.sort()
425     listSize = float(len(dataList))
426     try:
427         poissonmean = sum(dataList) / listSize
428     except ZeroDivisionError:
429         poissonmean = 0
430
431     print "Poisson n=%d, p=%f" % (listSize, poissonmean)
432     p = math.exp(-poissonmean)
433
434     return p, poissonmean
435
436
437 def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
438                stringency, readlen, minHits, logfilename, outfile, outfilename, minSites=30):
439
440     print "learning shift.... will need at least %d training sites" % minSites
441     previousHit = -1 * maxSpacing
442     hitList = [-1]
443     totalWeight = 0
444     readList = []
445     shiftDict = {}
446     count = 0
447     numStarts = 0
448     for read in hitDict[chrom]:
449         pos = read["start"]
450         sense = read["sense"]
451         weight = read["weight"]
452         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
453             sumAll = totalWeight
454             if normalize:
455                 sumAll /= hitSampleSize
456
457             regionStart = hitList[0]
458             regionStop = hitList[-1]
459             regionLength = regionStop - regionStart
460             # we're going to require stringent settings
461             if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen:
462                 foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
463
464                 if foldRatio >= minRatio:
465                     localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True)
466                     try:
467                         shiftDict[localshift] += 1
468                     except KeyError:
469                         shiftDict[localshift] = 1
470
471                     count += 1
472
473             hitList = []
474             totalWeight = 0
475             readList = []
476             numStarts = 0
477
478         if pos not in hitList:
479             numStarts += 1
480
481         hitList.append(pos)
482         totalWeight += weight
483         readList.append({"start": pos, "sense": sense, "weight": weight})
484         previousHit = pos
485
486     bestShift = 0
487     bestCount = 0
488     learningSettings = ["#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d" % (stringency, stringency * minHits,
489                                                                                                        stringency * minRatio, stringency * readlen),
490                         "#number of training examples: %d" % count]
491     outline = string.join(learningSettings, "\n")
492     print outline
493     writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
494     if count < minSites:
495         outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
496         print >> outfile, outline
497         writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
498         shiftValue = 0
499     else:
500         for shift in sorted(shiftDict):
501             if shiftDict[shift] > bestCount:
502                 bestShift = shift
503                 bestCount = shiftDict[shift]
504
505         shiftValue = bestShift
506         print shiftDict
507
508     outline = "#picked shiftValue to be %d" % shiftValue
509     print outline
510     print >> outfile, outline
511     writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
512
513     return shiftValue
514
515
516 def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
517     if doControl:
518         foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
519     else:
520         foldRatio = minRatio
521
522     return foldRatio
523
524
525 def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll):
526     numMock = 1. + rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
527     if normalize:
528         numMock /= sampleSize
529
530     foldRatio = sumAll / numMock
531
532     return foldRatio
533
534
535 def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
536                 normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
537                 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
538                 noMulti, doControl, factor, trimValue, outputRegionList=False):
539
540     index = 0
541     totalRegionWeight = 0
542     failedCounter = 0
543     previousHit = - 1 * maxSpacing
544     currentHitList = [-1]
545     currentTotalWeight = 0
546     currentUniqReadCount = 0
547     currentReadList = []
548     regionWeights = []
549     outregions = []
550     numStarts = 0
551     hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
552     maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
553     for read in hitDict[chrom]:
554         pos = read["start"]
555         sense = read["sense"]
556         weight = read["weight"]
557         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
558             sumAll = currentTotalWeight
559             if normalizeToRPM:
560                 sumAll /= rdsSampleSize
561
562             regionStart = currentHitList[0]
563             regionStop = currentHitList[-1]
564             regionWeights.append(int(sumAll))
565             if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
566                 sumMulti = 0.
567                 #first pass uses getFoldRatio on mockRDS as there may not be control
568                 foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalizeToRPM, referenceSampleSize, sumAll)
569                 if foldRatio >= minRatio:
570                     # first pass, with absolute numbers
571                     peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
572
573                     bestPos = peak.topPos[0]
574                     numHits = peak.numHits
575                     peakScore = peak.smoothArray[bestPos]
576                     numPlus = peak.numPlus
577                     shift = peak.shift
578                     numLeft = peak.numLeftPlus
579                     if normalizeToRPM:
580                         peakScore /= rdsSampleSize
581
582                     if doTrim:
583                         minSignalThresh = trimValue * peakScore
584                         start = 0
585                         stop = regionStop - regionStart - 1
586                         startFound = False
587                         while not startFound:
588                             if peak.smoothArray[start] >= minSignalThresh or start == bestPos:
589                                 startFound = True
590                             else:
591                                 start += 1
592
593                         stopFound = False
594                         while not stopFound:
595                             if peak.smoothArray[stop] >= minSignalThresh or stop == bestPos:
596                                 stopFound = True
597                             else:
598                                 stop -= 1
599
600                         regionStop = regionStart + stop
601                         regionStart += start
602                         trimPeak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
603
604                         sumAll = trimPeak.numHits
605                         numPlus = trimPeak.numPlus
606                         numLeft = trimPeak.numLeftPlus
607                         if normalizeToRPM:
608                             sumAll /= rdsSampleSize
609
610                         foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
611                         if outputRegionList:
612                             sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
613                         # just in case it changed, use latest data
614                         try:
615                             bestPos = trimPeak.topPos[0]
616                             peakScore = trimPeak.smoothArray[bestPos]
617                         except:
618                             continue
619
620                         if normalizeToRPM:
621                             peakScore /= rdsSampleSize
622
623                     elif outputRegionList:
624                         sumMulti = currentTotalWeight - currentUniqReadCount
625
626                     if outputRegionList:
627                         if normalizeToRPM:
628                             sumMulti /= rdsSampleSize
629
630                         try:
631                             multiP = 100. * (sumMulti / sumAll)
632                         except:
633                             break
634
635                         if noMulti:
636                             multiP = 0.
637
638                     # check that we still pass threshold
639                     if sumAll >= minHits and  foldRatio >= minRatio and (regionStop - regionStart) > readlen:
640                         plusRatio = float(numPlus)/numHits
641                         if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
642                             if outputRegionList:
643                                 peakDescription = ""
644                                 if listPeak:
645                                     peakDescription = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
646
647                             if doDirectionality:
648                                 if leftPlusRatio < numLeft / numPlus:
649                                     index += 1
650                                     if outputRegionList:
651                                         plusP = plusRatio * 100.
652                                         leftP = 100. * numLeft / numPlus
653                                         # we have a region that passes all criteria
654                                         region = Region.DirectionalRegion(regionStart, regionStop + readlen - 1,
655                                                                           factor, index, chrom, sumAll,
656                                                                           foldRatio, multiP, plusP, leftP,
657                                                                           peakDescription, shift)
658                                         outregions.append(region)
659
660                                     totalRegionWeight += sumAll
661                                 else:
662                                     failedCounter += 1
663                             else:
664                                 # we have a region, but didn't check for directionality
665                                 index += 1
666                                 totalRegionWeight += sumAll
667                                 if outputRegionList:
668                                     region = Region.Region(regionStart, regionStop + readlen - 1, factor, index, chrom,
669                                                            sumAll, foldRatio, multiP, peakDescription, shift)
670                                     outregions.append(region)
671
672             currentHitList = []
673             currentTotalWeight = 0
674             currentUniqReadCount = 0
675             currentReadList = []
676             numStarts = 0
677
678         if pos not in currentHitList:
679             numStarts += 1
680
681         currentHitList.append(pos)
682         currentTotalWeight += weight
683         if weight == 1.0:
684             currentUniqReadCount += 1
685
686         currentReadList.append({"start": pos, "sense": sense, "weight": weight})
687         previousHit = pos
688
689     statistics = {"index": index,
690                   "total": totalRegionWeight,
691                   "failed": failedCounter
692     }
693
694     if outputRegionList:
695         return statistics, regionWeights, outregions
696     else:
697         return statistics, regionWeights
698
699
700 def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue):
701     bestShift = 0
702     shiftDict = {}
703     for region in outregions:
704         # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
705         if reportshift:
706             outputList = [region.printRegionWithShift()]
707             if shiftValue == "auto":
708                 try:
709                     shiftDict[region.shift] += 1
710                 except KeyError:
711                     shiftDict[region.shift] = 1
712         else:
713             outputList = [region.printRegion()]
714
715         # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
716         if doPvalue:
717             sumAll = int(region.numReads)
718             for i in xrange(sumAll):
719                 pValue *= poissonmean
720                 pValue /= i+1
721
722             outputList.append("%1.2f" % pValue)
723
724         outline = string.join(outputList, "\t")
725         print outline
726         print >> outfile, outline
727
728     bestCount = 0
729     for shift in sorted(shiftDict):
730         if shiftDict[shift] > bestCount:
731             bestShift = shift
732             bestCount = shiftDict[shift]
733
734     return bestShift
735
736
737 def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue):
738     footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])]
739     if doDirectionality:
740         footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"])
741
742     if doRevBackground:
743         try:
744             percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100)
745         except (ValueError, ZeroDivisionError):
746             percent = 0.
747
748         footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent))
749
750     if shiftValue == "auto" and reportshift:
751         footerList.append("#mode of shift values: %d" % shiftModeValue)
752
753     footer = string.join(footerList, "\n")
754
755     return footer
756
757 if __name__ == "__main__":
758     main(sys.argv)