first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / findall.py
index ec75efb54f3680c892578a38811c1bd241c37241..827ee503ba7956b37143481beabae171a35f7619 100755 (executable)
@@ -1,6 +1,6 @@
 """
-    usage: python $ERANGEPATH/findall.py label samplerdsfile regionoutfile
-           [--control controlrdsfile] [--minimum minHits] [--ratio minRatio]
+    usage: python $ERANGEPATH/findall.py label samplebamfile regionoutfile
+           [--control controlbamfile] [--minimum minHits] [--ratio minRatio]
            [--spacing maxSpacing] [--listPeak] [--shift #bp | learn] [--learnFold num]
            [--noshift] [--autoshift] [--reportshift] [--nomulti] [--minPlus fraction]
            [--maxPlus fraction] [--leftPlus fraction] [--minPeak RPM] [--raw]
@@ -49,14 +49,223 @@ import sys
 import math
 import string
 import optparse
-from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
-import ReadDataset
+import operator
+import pysam
+from commoncode import writeLog, findPeak, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption, isSpliceEntry
 import Region
 
 
-versionString = "findall: version 3.2"
+versionString = "findall: version 3.2.1"
 print versionString
 
+class RegionDirectionError(Exception):
+    pass
+            
+
+class RegionFinder():
+    def __init__(self, label, minRatio=4.0, minPeak=0.5, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, strandfilter="",
+                 minHits=4.0, trimValue=0.1, doTrim=True, doDirectionality=True, shiftValue=0, maxSpacing=50, withFlag="",
+                 normalize=True, listPeak=False, reportshift=False, stringency=1.0, controlfile=None, doRevBackground=False):
+
+        self.statistics = {"index": 0,
+                           "total": 0,
+                           "mIndex": 0,
+                           "mTotal": 0,
+                           "failed": 0,
+                           "badRegionTrim": 0
+        }
+
+        self.regionLabel = label
+        self.rnaSettings = False
+        self.controlRDSsize = 1.0
+        self.sampleRDSsize = 1.0
+        self.minRatio = minRatio
+        self.minPeak = minPeak
+        self.leftPlusRatio = leftPlusRatio
+        self.stranded = "both"
+        if strandfilter == "plus":
+            self.stranded = "+"
+            minPlusRatio = 0.9
+            maxPlusRatio = 1.0
+        elif strandfilter == "minus":
+            self.stranded = "-"
+            minPlusRatio = 0.0
+            maxPlusRatio = 0.1
+
+        if minRatio < minPeak:
+            self.minPeak = minRatio
+
+        self.minPlusRatio = minPlusRatio
+        self.maxPlusRatio = maxPlusRatio
+        self.strandfilter = strandfilter
+        self.minHits = minHits
+        self.trimValue = trimValue
+        self.doTrim = doTrim
+        self.doDirectionality = doDirectionality
+
+        if self.doTrim:
+            self.trimString = string.join(["%2.1f" % (100. * self.trimValue), "%"], "")
+        else:
+            self.trimString = "none"
+
+        self.shiftValue = shiftValue
+        self.maxSpacing = maxSpacing
+        self.withFlag = withFlag
+        self.normalize = normalize
+        self.listPeak = listPeak
+        self.reportshift = reportshift
+        self.stringency = max(stringency, 1.0)
+        self.controlfile = controlfile
+        self.doControl = self.controlfile is not None
+        self.doPvalue = False
+        self.doRevBackground = doRevBackground
+
+
+    def useRNASettings(self, readlen):
+        self.rnaSettings = True
+        self.shiftValue = 0
+        self.doTrim = False
+        self.doDirectionality = False
+        self.maxSpacing = readlen
+
+
+    def getHeader(self):
+        if self.normalize:
+            countType = "RPM"
+        else:
+            countType = "COUNT"
+
+        headerFields = ["#regionID\tchrom\tstart\tstop", countType, "fold\tmulti%"]
+
+        if self.doDirectionality:
+            headerFields.append("plus%\tleftPlus%")
+
+        if self.listPeak:
+            headerFields.append("peakPos\tpeakHeight")
+
+        if self.reportshift:
+            headerFields.append("readShift")
+
+        if self.doPvalue:
+            headerFields.append("pValue")
+
+        return string.join(headerFields, "\t")
+
+
+    def printSettings(self, ptype, useMulti, pValueType):
+        print
+        self.printStatusMessages(ptype, useMulti)
+        self.printOptionsSummary(useMulti, pValueType)
+
+
+    def printStatusMessages(self, ptype, useMulti):
+        if self.shiftValue == "learn":
+            print "Will try to learn shift"
+
+        if self.normalize:
+            print "Normalizing to RPM"
+
+        if self.doRevBackground:
+            print "Swapping IP and background to calculate FDR"
+
+        if ptype != "":
+            if ptype in ["NONE", "SELF"]:
+                pass
+            elif ptype == "BACK":
+                if self.doControl and self.doRevBackground:
+                    pass
+                else:
+                    print "must have a control dataset and -revbackground for pValue type 'back'"
+            else:
+                print "could not use pValue type : %s" % ptype
+
+        if self.withFlag != "":
+            print "restrict to flag = %s" % self.withFlag
+
+        if not useMulti:
+            print "using unique reads only"
+
+        if self.rnaSettings:
+            print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
+
+        if self.strandfilter == "plus":
+            print "only analyzing reads on the plus strand"
+        elif self.strandfilter == "minus":
+            print "only analyzing reads on the minus strand"
+
+
+    def printOptionsSummary(self, useMulti, pValueType):
+
+        print "\nenforceDirectionality=%s listPeak=%s nomulti=%s " % (self.doDirectionality, self.listPeak, not useMulti)
+        print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)
+        try:
+            print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
+        except:
+            print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
+
+
+    def getAnalysisDescription(self, hitfile, useMulti, pValueType):
+
+        description = ["#ERANGE %s" % versionString]
+        if self.doControl:
+            description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, self.controlfile, self.controlRDSsize))
+        else:
+            description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize))
+
+        if self.withFlag != "":
+            description.append("#restrict to Flag = %s" % self.withFlag)
+
+        description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s" % (self.doDirectionality, self.listPeak, not useMulti))
+        description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded))
+        try:
+            description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
+        except:
+            description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
+
+        return string.join(description, "\n")
+
+
+    def getFooter(self, bestShift):
+        index = self.statistics["index"]
+        mIndex = self.statistics["mIndex"]
+        footerLines = ["#stats:\t%.1f RPM in %d regions" % (self.statistics["total"], index)]
+        if self.doDirectionality:
+            footerLines.append("#\t\t%d additional regions failed directionality filter" % self.statistics["failed"])
+
+        if self.doRevBackground:
+            try:
+                percent = min(100. * (float(mIndex)/index), 100.)
+            except ZeroDivisionError:
+                percent = 0.
+
+            footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, self.statistics["mTotal"], percent))
+
+        if self.shiftValue == "auto" and self.reportshift:
+            
+            footerLines.append("#mode of shift values: %d" % bestShift)
+
+        if self.statistics["badRegionTrim"] > 0:
+            footerLines.append("#%d regions discarded due to trimming problems" % self.statistics["badRegionTrim"])
+
+        return string.join(footerLines, "\n")
+
+
+    def updateControlStatistics(self, peak, sumAll, peakScore):
+
+        plusRatio = float(peak.numPlus)/peak.numHits
+        if peakScore >= self.minPeak and self.minPlusRatio <= plusRatio <= self.maxPlusRatio:
+            if self.doDirectionality:
+                if self.leftPlusRatio < peak.numLeftPlus / peak.numPlus:
+                    self.statistics["mIndex"] += 1
+                    self.statistics["mTotal"] += sumAll
+                else:
+                    self.statistics["failed"] += 1
+            else:
+                # we have a region, but didn't check for directionality
+                self.statistics["mIndex"] += 1
+                self.statistics["mTotal"] += sumAll
+
+
 def usage():
     print __doc__
 
@@ -76,20 +285,43 @@ def main(argv=None):
     hitfile = args[1]
     outfilename = args[2]
 
-    findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
-            options.stringency, options.noshift, options.autoshift, options.reportshift,
-            options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
-            options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
-            options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
-            options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
-            options.strandfilter, options.combine5p)
+    shiftValue = 0
+
+    if options.autoshift:
+        shiftValue = "auto"
+
+    if options.shift is not None:
+        try:
+            shiftValue = int(options.shift)
+        except ValueError:
+            if options.shift == "learn":
+                shiftValue = "learn"
+
+    if options.noshift:
+        shiftValue = 0
+
+    if options.doAppend:
+        outputMode = "a"
+    else:
+        outputMode = "w"
+
+    regionFinder = RegionFinder(factor, minRatio=options.minRatio, minPeak=options.minPeak, minPlusRatio=options.minPlusRatio,
+                                maxPlusRatio=options.maxPlusRatio, leftPlusRatio=options.leftPlusRatio, strandfilter=options.strandfilter,
+                                minHits=options.minHits, trimValue=options.trimValue, doTrim=options.doTrim,
+                                doDirectionality=options.doDirectionality, shiftValue=shiftValue, maxSpacing=options.maxSpacing,
+                                withFlag=options.withFlag, normalize=options.normalize, listPeak=options.listPeak,
+                                reportshift=options.reportshift, stringency=options.stringency, controlfile=options.controlfile,
+                                doRevBackground=options.doRevBackground)
+
+    findall(regionFinder, hitfile, outfilename, options.logfilename, outputMode, options.rnaSettings,
+            options.ptype, options.useMulti, options.combine5p)
 
 
 def makeParser():
     usage = __doc__
 
     parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--control", dest="mockfile")
+    parser.add_option("--control", dest="controlfile")
     parser.add_option("--minimum", type="float", dest="minHits")
     parser.add_option("--ratio", type="float", dest="minRatio")
     parser.add_option("--spacing", type="int", dest="maxSpacing")
@@ -99,7 +331,7 @@ def makeParser():
     parser.add_option("--noshift", action="store_true", dest="noShift")
     parser.add_option("--autoshift", action="store_true", dest="autoshift")
     parser.add_option("--reportshift", action="store_true", dest="reportshift")
-    parser.add_option("--nomulti", action="store_true", dest="noMulti")
+    parser.add_option("--nomulti", action="store_false", dest="useMulti")
     parser.add_option("--minPlus", type="float", dest="minPlusRatio")
     parser.add_option("--maxPlus", type="float", dest="maxPlusRatio")
     parser.add_option("--leftPlus", type="float", dest="leftPlusRatio")
@@ -137,16 +369,16 @@ def makeParser():
     logfilename = getConfigOption(configParser, section, "logfilename", "findall.log")
     withFlag = getConfigOption(configParser, section, "withFlag", "")
     doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True)
-    trimValue = getConfigOption(configParser, section, "trimValue", None)
+    trimValue = getConfigFloatOption(configParser, section, "trimValue", 0.1)
     doTrim = getConfigBoolOption(configParser, section, "doTrim", True)
     doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
     rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
-    ptype = getConfigOption(configParser, section, "ptype", None)
-    mockfile = getConfigOption(configParser, section, "mockfile", None)
+    ptype = getConfigOption(configParser, section, "ptype", "")
+    controlfile = getConfigOption(configParser, section, "controlfile", None)
     doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False)
-    noMulti = getConfigBoolOption(configParser, section, "noMulti", False)
-    strandfilter = getConfigOption(configParser, section, "strandfilter", None)
+    useMulti = getConfigBoolOption(configParser, section, "useMulti", True)
+    strandfilter = getConfigOption(configParser, section, "strandfilter", "")
     combine5p = getConfigBoolOption(configParser, section, "combine5p", False)
 
     parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift,
@@ -154,605 +386,627 @@ def makeParser():
                         minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak,
                         normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality,
                         trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings,
-                        cachePages=cachePages, ptype=ptype, mockfile=mockfile, doRevBackground=doRevBackground, noMulti=noMulti,
+                        cachePages=cachePages, ptype=ptype, controlfile=controlfile, doRevBackground=doRevBackground, useMulti=useMulti,
                         strandfilter=strandfilter, combine5p=combine5p)
 
     return parser
 
 
-def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
-            stringency=4.0, noshift=False, autoshift=False, reportshift=False,
-            minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
-            normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
-            trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
-            cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
-            strandfilter=None, combine5p=False):
+def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outputMode="w", rnaSettings=False,
+            ptype="", useMulti=True, combine5p=False):
 
-    shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
-    doControl = False
-    if mockfile is not None:
-        doControl = True
+    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+    controlBAM = None
+    if regionFinder.doControl:
+        print "\ncontrol:" 
+        controlBAM = pysam.Samfile(regionFinder.controlfile, "rb")
+        regionFinder.controlRDSsize = int(getHeaderComment(controlBAM.header, "Total")) / 1000000.
+
+    print "\nsample:" 
+    sampleBAM = pysam.Samfile(hitfile, "rb")
+    regionFinder.sampleRDSsize = int(getHeaderComment(sampleBAM.header, "Total")) / 1000000.
+    pValueType = getPValueType(ptype, regionFinder.doControl, regionFinder.doRevBackground)
+    regionFinder.doPvalue = not pValueType == "none"
+    regionFinder.readlen = int(getHeaderComment(sampleBAM.header, "ReadLength"))
+    if rnaSettings:
+        regionFinder.useRNASettings(regionFinder.readlen)
+
+    regionFinder.printSettings(ptype, useMulti, pValueType)
+    outfile = open(outfilename, outputMode)
+    header = writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType)
+    shiftDict = {}
+    chromList = getChromosomeListToProcess(sampleBAM, controlBAM)
+    for chromosome in chromList:
+        #TODO: Really? Use first chr shift value for all of them
+        maxSampleCoord = getMaxCoordinate(sampleBAM, chromosome, doMulti=useMulti)
+        if regionFinder.shiftValue == "learn":
+            regionFinder.shiftValue = learnShift(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
+
+        allregions, outregions = findPeakRegions(regionFinder, sampleBAM, maxSampleCoord, chromosome, logfilename, outfilename, outfile, useMulti, controlBAM, combine5p)
+        if regionFinder.doRevBackground:
+            maxControlCoord = getMaxCoordinate(controlBAM, chromosome, doMulti=useMulti)
+            backregions = findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxControlCoord, chromosome, useMulti)
+        else:
+            backregions = []
+            pValueType = "self"
+
+        writeChromosomeResults(regionFinder, outregions, outfile, shiftDict, allregions, header, backregions=backregions, pValueType=pValueType)
+
+    try:
+        bestShift = getBestShiftInDict(shiftDict)
+    except ValueError:
+        bestShift = 0
+
+    footer = regionFinder.getFooter(bestShift)
+    print footer
+    print >> outfile, footer
+    outfile.close()
+    writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1])
+
+
+def getHeaderComment(bamHeader, commentKey):
+    for comment in bamHeader["CO"]:
+        fields = comment.split("\t")
+        if fields[0] == commentKey:
+            return fields[1]
+
+    raise KeyError
 
-    if doRevBackground:
-        print "Swapping IP and background to calculate FDR"
-        pValueType = "back"
 
-    doPvalue = True
-    if ptype is not None:
-        ptype = ptype.upper()
+def getPValueType(ptype, doControl, doRevBackground):
+    pValueType = "self"
+    if ptype in ["NONE", "SELF", "BACK"]:
         if ptype == "NONE":
-            doPvalue = False
             pValueType = "none"
-            p = 1
-            poissonmean = 0
         elif ptype == "SELF":
             pValueType = "self"
         elif ptype == "BACK":
             if doControl and doRevBackground:
                 pValueType = "back"
-            else:
-                print "must have a control dataset and -revbackground for pValue type 'back'"
-        else:
-            print "could not use pValue type : %s" % ptype
-    else:
-        pValueType = "self"
-
-    if withFlag != "":
-        print "restrict to flag = %s" % withFlag
-
-    useMulti = True
-    if noMulti:
-        print "using unique reads only"
-        useMulti = False
-
-    if rnaSettings:
-        print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
-        doTrim = False
-        doDirectionality = False
-
-    stranded = ""
-    if strandfilter is not None:
-        if strandfilter == "plus":
-            stranded = "+"
-            minPlusRatio = 0.9
-            maxPlusRatio = 1.0
-            print "only analyzing reads on the plus strand"
-        elif strandfilter == "minus":
-            stranded = "-"
-            minPlusRatio = 0.0
-            maxPlusRatio = 0.1
-            print "only analyzing reads on the minus strand"
+    elif doRevBackground:
+        pValueType = "back"
 
-    stringency = max(stringency, 1.0)
-    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
-    if cachePages is not None:
-        doCache = True
-    else:
-        doCache = False
-        cachePages = -1
+    return pValueType
 
-    if doControl:
-        print "\ncontrol:" 
-        mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
 
-        if cachePages > mockRDS.getDefaultCacheSize():
-            mockRDS.setDBcache(cachePages)
+def writeOutputFileHeader(regionFinder, outfile, hitfile, useMulti, pValueType):
+    print >> outfile, regionFinder.getAnalysisDescription(hitfile, useMulti, pValueType)
+    header = regionFinder.getHeader()
+    print >> outfile, header
 
-    print "\nsample:" 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
-    readlen = hitRDS.getReadSize()
-    if rnaSettings:
-        maxSpacing = readlen
+    return header
 
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
 
-    if trimValue is not None:
-        trimValue = float(trimValue) / 100.
-        trimString = "%2.1f%s" % ((100. * trimValue), "%")
+def getChromosomeListToProcess(sampleBAM, controlBAM=None):
+    if controlBAM is not None:
+        chromosomeList = [chrom for chrom in sampleBAM.references if chrom in controlBAM.references and chrom != "chrM"]
     else:
-        trimValue = 0.1
-        trimString = "10%"
+        chromosomeList = [chrom for chrom in sampleBAM.references if chrom != "chrM"]
 
-    if not doTrim:
-        trimString = "none"
+    return chromosomeList
 
-    if doAppend:
-        fileMode = "a"
-    else:
-        fileMode = "w"
 
-    outfile = open(outfilename, fileMode)
-    outfile.write("#ERANGE %s\n" % versionString)
-    if doControl:
-        mockRDSsize = len(mockRDS) / 1000000.
-        controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
-    else:
-        controlSampleString = " none"
+def findPeakRegions(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
+                    outfile, useMulti, controlBAM, combine5p):
 
-    hitRDSsize = len(hitRDS) / 1000000.
-    outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:%s\n" % (hitfile, hitRDSsize, controlSampleString))
+    outregions = []
+    allregions = []
+    print "chromosome %s" % (chromosome)
+    previousHit = - 1 * regionFinder.maxSpacing
+    readStartPositions = [-1]
+    totalWeight = 0.0
+    uniqueReadCount = 0
+    reads = []
+    numStartsInRegion = 0
+
+    for alignedread in sampleBAM.fetch(chromosome):
+        if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+            continue
 
-    if withFlag != "":
-        outfile.write("#restrict to Flag = %s\n" % withFlag)
+        pos = alignedread.pos
+        if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
+            lastReadPos = readStartPositions[-1]
+            lastBasePosition = lastReadPos + regionFinder.readlen - 1
+            newRegionIndex = regionFinder.statistics["index"] + 1
+            if regionFinder.doDirectionality:
+                region = Region.DirectionalRegion(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=regionFinder.regionLabel,
+                                                  numReads=totalWeight)
+            else:
+                region = Region.Region(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=regionFinder.regionLabel, numReads=totalWeight)
 
-    print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
-    print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
-    print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
+            if regionFinder.normalize:
+                region.numReads /= regionFinder.sampleRDSsize
 
-    outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
-    outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
-    outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
-    if normalize:
-        print "Normalizing to RPM"
-        countLabel = "RPM"
-    else:
-        countLabel = "COUNT"
+            allregions.append(int(region.numReads))
+            regionLength = lastReadPos - region.start
+            if regionPassesCriteria(regionFinder, region.numReads, numStartsInRegion, regionLength):
+                region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
 
-    headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"]
-    if doDirectionality:
-        headerList.append("plus%\tleftPlus%")
+                if region.foldRatio >= regionFinder.minRatio:
+                    # first pass, with absolute numbers
+                    peak = findPeak(reads, region.start, regionLength, regionFinder.readlen, doWeight=True, leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
+                    if regionFinder.doTrim:
+                        try:
+                            lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.sampleRDSsize)
+                        except IndexError:
+                            regionFinder.statistics["badRegionTrim"] += 1
+                            continue
 
-    if listPeak:
-        headerList.append("peakPos\tpeakHeight")
+                        region.foldRatio = getFoldRatio(regionFinder, controlBAM, region.numReads, chromosome, region.start, lastReadPos, useMulti)
 
-    if reportshift:
-        headerList.append("readShift")
+                    try:
+                        bestPos = peak.topPos[0]
+                        peakScore = peak.smoothArray[bestPos]
+                        if regionFinder.normalize:
+                            peakScore /= regionFinder.sampleRDSsize
+                    except (IndexError, AttributeError, ZeroDivisionError):
+                        continue
 
-    if doPvalue:
-        headerList.append("pValue")
+                    if regionFinder.listPeak:
+                        region.peakDescription = "%d\t%.1f" % (region.start + bestPos, peakScore)
 
-    headline = string.join(headerList, "\t")
-    print >> outfile, headline
+                    if useMulti:
+                        setMultireadPercentage(region, sampleBAM, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos,
+                                               regionFinder.normalize, regionFinder.doTrim)
 
-    statistics = {"index": 0,
-                  "total": 0,
-                  "mIndex": 0,
-                  "mTotal": 0,
-                  "failed": 0
-    }
+                    region.shift = peak.shift
+                    # check that we still pass threshold
+                    regionLength = lastReadPos - region.start
+                    plusRatio = float(peak.numPlus)/peak.numHits
+                    if regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio):
+                        try:
+                            updateRegion(region, regionFinder.doDirectionality, regionFinder.leftPlusRatio, peak.numLeftPlus, peak.numPlus, plusRatio)
+                            regionFinder.statistics["index"] += 1
+                            outregions.append(region)
+                            regionFinder.statistics["total"] += region.numReads
+                        except RegionDirectionError:
+                            regionFinder.statistics["failed"] += 1
+
+            readStartPositions = []
+            totalWeight = 0.0
+            uniqueReadCount = 0
+            reads = []
+            numStartsInRegion = 0
+
+        if pos not in readStartPositions:
+            numStartsInRegion += 1
+
+        readStartPositions.append(pos)
+        weight = 1.0/alignedread.opt('NH')
+        totalWeight += weight
+        if weight == 1.0:
+            uniqueReadCount += 1
 
-    if minRatio < minPeak:
-        minPeak = minRatio
+        reads.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
+        previousHit = pos
 
-    hitChromList = hitRDS.getChromosomes()
-    if doControl:
-        mockChromList = mockRDS.getChromosomes()
+    return allregions, outregions
 
-    if normalize:
-        if doControl:
-            mockSampleSize = mockRDSsize
 
-        hitSampleSize = hitRDSsize
+def getReadSense(read):
+    if read.is_reverse:
+        sense = "-"
+    else:
+        sense = "+"
 
-    hitChromList.sort()
+    return sense
 
-    for chromosome in hitChromList:
-        if doNotProcessChromosome(chromosome, doControl, mockChromList):
-            continue
 
-        print "chromosome %s" % (chromosome)
-        hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True,
-                                      doMulti=useMulti, findallOptimize=True, strand=stranded,
-                                      combine5p=combine5p)
-        maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
-        if shiftValue == "learn":
-            shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize,
-                                    mockSampleSize, minRatio, maxSpacing, maxCoord, stringency, readlen, minHits,
-                                    logfilename, outfile, outfilename)
-
-        regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize,
-                                                                  chromosome, useMulti, normalize, maxSpacing,
-                                                                  doDirectionality, doTrim, minHits, minRatio,
-                                                                  readlen, shiftValue, minPeak, minPlusRatio,
-                                                                  maxPlusRatio, leftPlusRatio, listPeak, noMulti,
-                                                                  doControl, factor, trimValue, outputRegionList=True)
-
-        statistics["index"] += regionStats["index"]
-        statistics["total"] += regionStats["total"]
-        statistics["failed"] += regionStats["failed"]
-        if not doRevBackground:
-            if doPvalue:
-                p, poissonmean = calculatePValue(allRegionWeights)
-
-            print headline
-            shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
-            continue
+def doNotProcessRead(read, doMulti=False, strand="both", combine5p=False):
+    if read.opt('NH') > 1 and not doMulti:
+        return True
 
-        #now do background swapping the two samples around
-        print "calculating background..."
-        backgroundTrimValue = 1/20.
-        backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
-                                                                       chromosome, useMulti, normalize, maxSpacing,
-                                                                       doDirectionality, doTrim, minHits, minRatio,
-                                                                       readlen, shiftValue, minPeak, minPlusRatio,
-                                                                       maxPlusRatio, leftPlusRatio, listPeak, noMulti,
-                                                                       doControl, factor, backgroundTrimValue)
-
-        statistics["mIndex"] += backgroundRegionStats["index"]
-        statistics["mTotal"] += backgroundRegionStats["total"]
-        statistics["failed"] += backgroundRegionStats["failed"]
-        print statistics["mIndex"], statistics["mTotal"]
-        if doPvalue:
-            if pValueType == "self":
-                p, poissonmean = calculatePValue(allRegionWeights)
-            else:
-                p, poissonmean = calculatePValue(backgroundRegionWeights)
+    if strand == "+" and read.is_reverse:
+        return True
 
-        print headline
-        shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
+    if strand == "-" and not read.is_reverse:
+        return True
+        
+    return False
 
-    footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
-    print footer
-    outfile.write(footer)
-    outfile.close()
 
-    writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
+def getMaxCoordinate(samfile, chr, doMulti=False):
+    maxCoord = 0
+    for alignedread in samfile.fetch(chr):
+        if alignedread.opt('NH') > 1:
+            if doMulti:
+                maxCoord = max(maxCoord, alignedread.pos)
+        else:
+            maxCoord = max(maxCoord, alignedread.pos)
 
+    return maxCoord
 
-def determineShiftValue(autoshift, shift, noshift, rnaSettings):
-    shiftValue = 0
-    if autoshift:
-        shiftValue = "auto"
 
-    if shift is not None:
-        try:
-            shiftValue = int(shift)
-        except ValueError:
-            if shift == "learn":
-                shiftValue = "learn"
-                print "Will try to learn shift"
+def findBackgroundRegions(regionFinder, sampleBAM, controlBAM, maxCoord, chromosome, useMulti):
+    #TODO: this is *almost* the same calculation - there are small yet important differences
+    print "calculating background..."
+    previousHit = - 1 * regionFinder.maxSpacing
+    currentHitList = [-1]
+    currentTotalWeight = 0.0
+    currentReadList = []
+    backregions = []
+    numStarts = 0
+    badRegion = False
+    for alignedread in controlBAM.fetch(chromosome):
+        if doNotProcessRead(alignedread, doMulti=useMulti):
+            continue
 
-    if noshift or rnaSettings:
-        shiftValue = 0
+        pos = alignedread.pos
+        if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
+            lastReadPos = currentHitList[-1]
+            lastBasePosition = lastReadPos + regionFinder.readlen - 1
+            region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
+            if regionFinder.normalize:
+                region.numReads /= regionFinder.controlRDSsize
+
+            backregions.append(int(region.numReads))
+            region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=regionFinder.regionLabel, numReads=currentTotalWeight)
+            regionLength = lastReadPos - region.start
+            if regionPassesCriteria(regionFinder, region.numReads, numStarts, regionLength):
+                numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
+                if regionFinder.normalize:
+                    numMock /= regionFinder.sampleRDSsize
+
+                foldRatio = region.numReads / numMock
+                if foldRatio >= regionFinder.minRatio:
+                    # first pass, with absolute numbers
+                    peak = findPeak(currentReadList, region.start, lastReadPos - region.start, regionFinder.readlen, doWeight=True,
+                                    leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
 
-    return shiftValue
+                    if regionFinder.doTrim:
+                        try:
+                            lastReadPos = trimRegion(region, regionFinder, peak, lastReadPos, 20., currentReadList, regionFinder.controlRDSsize)
+                        except IndexError:
+                            badRegion = True
+                            continue
+
+                        numMock = 1. + countReadsInRegion(sampleBAM, chromosome, region.start, lastReadPos, countMulti=useMulti)
+                        if regionFinder.normalize:
+                            numMock /= regionFinder.sampleRDSsize
 
+                        foldRatio = region.numReads / numMock
 
-def doNotProcessChromosome(chromosome, doControl, mockChromList):
-    skipChromosome = False
-    if chromosome == "chrM":
-        skipChromosome = True
+                    try:
+                        bestPos = peak.topPos[0]
+                        peakScore = peak.smoothArray[bestPos]
+                    except IndexError:
+                        continue
 
-    if doControl and (chromosome not in mockChromList):
-        skipChromosome = True
+                    # normalize to RPM
+                    if regionFinder.normalize:
+                        peakScore /= regionFinder.controlRDSsize
 
-    return skipChromosome
+                    # check that we still pass threshold
+                    regionLength = lastReadPos - region.start
+                    if regionPassesCriteria(regionFinder, region.numReads, foldRatio, regionLength):
+                        regionFinder.updateControlStatistics(peak, region.numReads, peakScore)
 
+            currentHitList = []
+            currentTotalWeight = 0.0
+            currentReadList = []
+            numStarts = 0
+            if badRegion:
+                badRegion = False
+                regionFinder.statistics["badRegionTrim"] += 1
 
-def calculatePValue(dataList):
-    dataList.sort()
-    listSize = float(len(dataList))
-    try:
-        poissonmean = sum(dataList) / listSize
-    except ZeroDivisionError:
-        poissonmean = 0
+        if pos not in currentHitList:
+            numStarts += 1
 
-    print "Poisson n=%d, p=%f" % (listSize, poissonmean)
-    p = math.exp(-poissonmean)
+        currentHitList.append(pos)
+        weight = 1.0/alignedread.opt('NH')
+        currentTotalWeight += weight
+        currentReadList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
+        previousHit = pos
 
-    return p, poissonmean
+    return backregions
 
 
-def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
-               stringency, readlen, minHits, logfilename, outfile, outfilename, minSites=30):
+def learnShift(regionFinder, sampleBAM, maxCoord, chromosome, logfilename, outfilename,
+               outfile, useMulti, controlBAM, combine5p):
 
-    print "learning shift.... will need at least %d training sites" % minSites
-    previousHit = -1 * maxSpacing
-    hitList = [-1]
-    totalWeight = 0
+    print "learning shift.... will need at least 30 training sites"
+    stringency = regionFinder.stringency
+    previousHit = -1 * regionFinder.maxSpacing
+    positionList = [-1]
+    totalWeight = 0.0
     readList = []
     shiftDict = {}
     count = 0
     numStarts = 0
-    for read in hitDict[chrom]:
-        pos = read["start"]
-        sense = read["sense"]
-        weight = read["weight"]
-        if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
-            sumAll = totalWeight
-            if normalize:
-                sumAll /= hitSampleSize
-
-            regionStart = hitList[0]
-            regionStop = hitList[-1]
-            regionLength = regionStop - regionStart
-            # we're going to require stringent settings
-            if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen:
-                foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
+    for alignedread in sampleBAM.fetch(chromosome):
+        if doNotProcessRead(alignedread, doMulti=useMulti, strand=regionFinder.stranded, combine5p=combine5p):
+            continue
 
-                if foldRatio >= minRatio:
-                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True)
-                    try:
-                        shiftDict[localshift] += 1
-                    except KeyError:
-                        shiftDict[localshift] = 1
+        pos = alignedread.pos
+        if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
+            if regionFinder.normalize:
+                totalWeight /= regionFinder.sampleRDSsize
 
+            regionStart = positionList[0]
+            regionStop = positionList[-1]
+            regionLength = regionStop - regionStart
+            if regionPassesCriteria(regionFinder, totalWeight, numStarts, regionLength, stringency=stringency):
+                foldRatio = getFoldRatio(regionFinder, controlBAM, totalWeight, chromosome, regionStart, regionStop, useMulti)
+                if foldRatio >= regionFinder.minRatio:
+                    updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
                     count += 1
 
-            hitList = []
-            totalWeight = 0
+            positionList = []
+            totalWeight = 0.0
             readList = []
-            numStarts = 0
 
-        if pos not in hitList:
+        if pos not in positionList:
             numStarts += 1
 
-        hitList.append(pos)
+        positionList.append(pos)
+        weight = 1.0/alignedread.opt('NH')
         totalWeight += weight
-        readList.append({"start": pos, "sense": sense, "weight": weight})
+        readList.append({"start": pos, "sense": getReadSense(alignedread), "weight": weight})
         previousHit = pos
 
-    bestShift = 0
-    bestCount = 0
-    learningSettings = ["#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d" % (stringency, stringency * minHits,
-                                                                                                       stringency * minRatio, stringency * readlen),
-                        "#number of training examples: %d" % count]
-    outline = string.join(learningSettings, "\n")
-    print outline
-    writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
-    if count < minSites:
-        outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
-        print >> outfile, outline
-        writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
-        shiftValue = 0
-    else:
-        for shift in sorted(shiftDict):
-            if shiftDict[shift] > bestCount:
-                bestShift = shift
-                bestCount = shiftDict[shift]
-
-        shiftValue = bestShift
-        print shiftDict
+    outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
+                                                                                                                               stringency * regionFinder.minHits,
+                                                                                                                               stringency * regionFinder.minRatio,
+                                                                                                                               stringency * regionFinder.readlen,
+                                                                                                                               count)
 
+    print outline
+    writeLog(logfilename, versionString, outfilename + outline)
+    shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
     outline = "#picked shiftValue to be %d" % shiftValue
     print outline
     print >> outfile, outline
-    writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
+    writeLog(logfilename, versionString, outfilename + outline)
 
-    return shiftValue
+    return shiftValue, shiftDict
+
+
+def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
+    return abs(pos - previousHit) > maxSpacing or pos == maxCoord
+
+
+def regionPassesCriteria(regionFinder, sumAll, numStarts, regionLength, stringency=1):
+    minTotalReads = stringency * regionFinder.minHits
+    minNumReadStarts = stringency * regionFinder.minRatio
+    minRegionLength = stringency * regionFinder.readlen
+
+    return sumAll >= minTotalReads and numStarts >= minNumReadStarts and regionLength > minRegionLength
+
+
+def trimRegion(region, regionFinder, peak, regionStop, trimValue, currentReadList, totalReadCount):
+    bestPos = peak.topPos[0]
+    peakScore = peak.smoothArray[bestPos]
+    if regionFinder.normalize:
+        peakScore /= totalReadCount
+
+    minSignalThresh = trimValue * peakScore
+    start = findStartEdgePosition(peak, minSignalThresh)
+    regionEndPoint = regionStop - region.start - 1
+    stop = findStopEdgePosition(peak, regionEndPoint, minSignalThresh)
+
+    regionStop = region.start + stop
+    region.start += start
+
+    trimmedPeak = findPeak(currentReadList, region.start, regionStop - region.start, regionFinder.readlen, doWeight=True,
+                           leftPlus=regionFinder.doDirectionality, shift=peak.shift)
+
+    peak.numPlus = trimmedPeak.numPlus
+    peak.numLeftPlus = trimmedPeak.numLeftPlus
+    peak.topPos = trimmedPeak.topPos
+    peak.smoothArray = trimmedPeak.smoothArray
+
+    region.numReads = trimmedPeak.numHits
+    if regionFinder.normalize:
+        region.numReads /= totalReadCount
+
+    region.stop = regionStop + regionFinder.readlen - 1
+                          
+    return regionStop
+
+
+def findStartEdgePosition(peak, minSignalThresh):
+    start = 0
+    while not peakEdgeLocated(peak, start, minSignalThresh):
+        start += 1
+
+    return start
+
+
+def findStopEdgePosition(peak, stop, minSignalThresh):
+    while not peakEdgeLocated(peak, stop, minSignalThresh):
+        stop -= 1
+
+    return stop
 
 
-def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
-    if doControl:
-        foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
+def peakEdgeLocated(peak, position, minSignalThresh):
+    return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
+
+
+def getFoldRatio(regionFinder, controlBAM, sumAll, chromosome, regionStart, regionStop, useMulti):
+    """ Fold ratio calculated is total read weight over control
+    """
+    #TODO: this needs to be generalized as there is a point at which we want to use the sampleRDS instead of controlRDS
+    if regionFinder.doControl:
+        numMock = 1. + countReadsInRegion(controlBAM, chromosome, regionStart, regionStop, countMulti=useMulti)
+        if regionFinder.normalize:
+            numMock /= regionFinder.controlRDSsize
+
+        foldRatio = sumAll / numMock
     else:
-        foldRatio = minRatio
+        foldRatio = regionFinder.minRatio
 
     return foldRatio
 
 
-def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll):
-    numMock = 1. + rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
-    if normalize:
-        numMock /= sampleSize
+def countReadsInRegion(bamfile, chr, start, end, uniqs=True, countMulti=False, countSplices=False):
+    count = 0.0
+    for alignedread in bamfile.fetch(chr, start, end):
+        if alignedread.opt('NH') > 1:
+            if countMulti:
+                if isSpliceEntry(alignedread.cigar):
+                    if countSplices:
+                        count += 1.0/alignedread.opt('NH')
+                else:
+                    count += 1.0/alignedread.opt('NH')
+        elif uniqs:
+            if isSpliceEntry(alignedread.cigar):
+                if countSplices:
+                    count += 1.0
+            else:
+                count += 1.0
 
-    foldRatio = sumAll / numMock
+    return count
 
-    return foldRatio
+def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen):
+    peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto")
+    try:
+        shiftDict[peak.shift] += 1
+    except KeyError:
+        shiftDict[peak.shift] = 1
 
 
-def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
-                normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
-                shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
-                noMulti, doControl, factor, trimValue, outputRegionList=False):
+def getShiftValue(shiftDict, count, logfilename, outfilename):
+    if count < 30:
+        outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
+        print outline
+        writeLog(logfilename, versionString, outfilename + outline)
+        shiftValue = 0
+    else:
+        shiftValue = getBestShiftInDict(shiftDict)
+        print shiftDict
 
-    index = 0
-    totalRegionWeight = 0
-    failedCounter = 0
-    previousHit = - 1 * maxSpacing
-    currentHitList = [-1]
-    currentTotalWeight = 0
-    currentUniqReadCount = 0
-    currentReadList = []
-    regionWeights = []
-    outregions = []
-    numStarts = 0
-    hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
-    maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
-    for read in hitDict[chrom]:
-        pos = read["start"]
-        sense = read["sense"]
-        weight = read["weight"]
-        if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
-            sumAll = currentTotalWeight
-            if normalizeToRPM:
-                sumAll /= rdsSampleSize
-
-            regionStart = currentHitList[0]
-            regionStop = currentHitList[-1]
-            regionWeights.append(int(sumAll))
-            if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
-                sumMulti = 0.
-                #first pass uses getFoldRatio on mockRDS as there may not be control
-                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalizeToRPM, referenceSampleSize, sumAll)
-                if foldRatio >= minRatio:
-                    # first pass, with absolute numbers
-                    peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
-
-                    bestPos = peak.topPos[0]
-                    numHits = peak.numHits
-                    peakScore = peak.smoothArray[bestPos]
-                    numPlus = peak.numPlus
-                    shift = peak.shift
-                    numLeft = peak.numLeftPlus
-                    if normalizeToRPM:
-                        peakScore /= rdsSampleSize
-
-                    if doTrim:
-                        minSignalThresh = trimValue * peakScore
-                        start = 0
-                        stop = regionStop - regionStart - 1
-                        startFound = False
-                        while not startFound:
-                            if peak.smoothArray[start] >= minSignalThresh or start == bestPos:
-                                startFound = True
-                            else:
-                                start += 1
-
-                        stopFound = False
-                        while not stopFound:
-                            if peak.smoothArray[stop] >= minSignalThresh or stop == bestPos:
-                                stopFound = True
-                            else:
-                                stop -= 1
-
-                        regionStop = regionStart + stop
-                        regionStart += start
-                        trimPeak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
-
-                        sumAll = trimPeak.numHits
-                        numPlus = trimPeak.numPlus
-                        numLeft = trimPeak.numLeftPlus
-                        if normalizeToRPM:
-                            sumAll /= rdsSampleSize
-
-                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
-                        if outputRegionList:
-                            sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
-                        # just in case it changed, use latest data
-                        try:
-                            bestPos = trimPeak.topPos[0]
-                            peakScore = trimPeak.smoothArray[bestPos]
-                        except:
-                            continue
+    return shiftValue
 
-                        if normalizeToRPM:
-                            peakScore /= rdsSampleSize
 
-                    elif outputRegionList:
-                        sumMulti = currentTotalWeight - currentUniqReadCount
+def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRatio, multiP,
+              peakDescription, shift, doDirectionality, leftPlusRatio, numLeft,
+              numPlus, plusRatio):
 
-                    if outputRegionList:
-                        if normalizeToRPM:
-                            sumMulti /= rdsSampleSize
+    if doDirectionality:
+        if leftPlusRatio < numLeft / numPlus:
+            plusP = plusRatio * 100.
+            leftP = 100. * numLeft / numPlus
+            # we have a region that passes all criteria
+            region = Region.DirectionalRegion(regionStart, regionStop,
+                                              factor, index, chromosome, sumAll,
+                                              foldRatio, multiP, plusP, leftP,
+                                              peakDescription, shift)
 
-                        try:
-                            multiP = 100. * (sumMulti / sumAll)
-                        except:
-                            break
+        else:
+            raise RegionDirectionError
+    else:
+        # we have a region, but didn't check for directionality
+        region = Region.Region(regionStart, regionStop, factor, index, chromosome,
+                               sumAll, foldRatio, multiP, peakDescription, shift)
 
-                        if noMulti:
-                            multiP = 0.
+    return region
 
-                    # check that we still pass threshold
-                    if sumAll >= minHits and  foldRatio >= minRatio and (regionStop - regionStart) > readlen:
-                        plusRatio = float(numPlus)/numHits
-                        if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
-                            if outputRegionList:
-                                peakDescription = ""
-                                if listPeak:
-                                    peakDescription = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
-
-                            if doDirectionality:
-                                if leftPlusRatio < numLeft / numPlus:
-                                    index += 1
-                                    if outputRegionList:
-                                        plusP = plusRatio * 100.
-                                        leftP = 100. * numLeft / numPlus
-                                        # we have a region that passes all criteria
-                                        region = Region.DirectionalRegion(regionStart, regionStop + readlen - 1,
-                                                                          factor, index, chrom, sumAll,
-                                                                          foldRatio, multiP, plusP, leftP,
-                                                                          peakDescription, shift)
-                                        outregions.append(region)
-
-                                    totalRegionWeight += sumAll
-                                else:
-                                    failedCounter += 1
-                            else:
-                                # we have a region, but didn't check for directionality
-                                index += 1
-                                totalRegionWeight += sumAll
-                                if outputRegionList:
-                                    region = Region.Region(regionStart, regionStop + readlen - 1, factor, index, chrom,
-                                                           sumAll, foldRatio, multiP, peakDescription, shift)
-                                    outregions.append(region)
 
-            currentHitList = []
-            currentTotalWeight = 0
-            currentUniqReadCount = 0
-            currentReadList = []
-            numStarts = 0
+def setMultireadPercentage(region, sampleBAM, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
+    if doTrim:
+        sumMulti = 0.0
+        for alignedread in sampleBAM.fetch(chromosome, region.start, lastReadPos):
+            if alignedread.opt('NH') > 1:
+                sumMulti += 1.0/alignedread.opt('NH')
+    else:
+        sumMulti = currentTotalWeight - currentUniqueCount
 
-        if pos not in currentHitList:
-            numStarts += 1
+    # normalize to RPM
+    if normalize:
+        sumMulti /= hitRDSsize
 
-        currentHitList.append(pos)
-        currentTotalWeight += weight
-        if weight == 1.0:
-            currentUniqReadCount += 1
+    try:
+        multiP = 100. * (sumMulti / region.numReads)
+    except ZeroDivisionError:
+        return
 
-        currentReadList.append({"start": pos, "sense": sense, "weight": weight})
-        previousHit = pos
+    region.multiP = min(multiP, 100.)
 
-    statistics = {"index": index,
-                  "total": totalRegionWeight,
-                  "failed": failedCounter
-    }
 
-    if outputRegionList:
-        return statistics, regionWeights, outregions
-    else:
-        return statistics, regionWeights
+def regionAndPeakPass(regionFinder, region, regionLength, peakScore, plusRatio):
+    regionPasses = False
+    if regionPassesCriteria(regionFinder, region.numReads, region.foldRatio, regionLength):
+        #TODO: here is where the test dataset is failing
+        if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio:
+            regionPasses = True
 
+    return regionPasses
 
-def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue):
-    bestShift = 0
-    shiftDict = {}
-    for region in outregions:
-        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
-        if reportshift:
-            outputList = [region.printRegionWithShift()]
-            if shiftValue == "auto":
-                try:
-                    shiftDict[region.shift] += 1
-                except KeyError:
-                    shiftDict[region.shift] = 1
+
+def updateRegion(region, doDirectionality, leftPlusRatio, numLeft, numPlus, plusRatio):
+
+    if doDirectionality:
+        if leftPlusRatio < numLeft / numPlus:
+            region.plusP = plusRatio * 100.
+            region.leftP = 100. * numLeft / numPlus
         else:
-            outputList = [region.printRegion()]
+            raise RegionDirectionError
 
-        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
-        if doPvalue:
-            sumAll = int(region.numReads)
-            for i in xrange(sumAll):
-                pValue *= poissonmean
-                pValue /= i+1
 
-            outputList.append("%1.2f" % pValue)
+def writeChromosomeResults(regionFinder, outregions, outfile, shiftDict,
+                           allregions, header, backregions=[], pValueType="none"):
+
+    print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"]
+    if regionFinder.doPvalue:
+        if pValueType == "self":
+            poissonmean = calculatePoissonMean(allregions)
+        else:
+            poissonmean = calculatePoissonMean(backregions)
+
+    print header
+    for region in outregions:
+        if regionFinder.shiftValue == "auto" and regionFinder.reportshift:
+            try:
+                shiftDict[region.shift] += 1
+            except KeyError:
+                shiftDict[region.shift] = 1
+
+        outline = getRegionString(region, regionFinder.reportshift)
+
+        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
+        if regionFinder.doPvalue:
+            pValue = calculatePValue(int(region.numReads), poissonmean)
+            outline += "\t%1.2g" % pValue
 
-        outline = string.join(outputList, "\t")
         print outline
         print >> outfile, outline
 
-    bestCount = 0
-    for shift in sorted(shiftDict):
-        if shiftDict[shift] > bestCount:
-            bestShift = shift
-            bestCount = shiftDict[shift]
 
-    return bestShift
+def calculatePoissonMean(dataList):
+    dataList.sort()
+    listSize = float(len(dataList))
+    try:
+        poissonmean = sum(dataList) / listSize
+    except ZeroDivisionError:
+        poissonmean = 0
+
+    print "Poisson n=%d, p=%f" % (listSize, poissonmean)
+
+    return poissonmean
 
 
-def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue):
-    footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])]
-    if doDirectionality:
-        footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"])
+def calculatePValue(sum, poissonmean):
+    pValue = math.exp(-poissonmean)
+    for i in xrange(sum):
+        pValue *= poissonmean
+        pValue /= i+1
 
-    if doRevBackground:
-        try:
-            percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100)
-        except (ValueError, ZeroDivisionError):
-            percent = 0.
+    return pValue
+
+
+def getRegionString(region, reportShift):
+    if reportShift:
+        outline = region.printRegionWithShift()
+    else:
+        outline = region.printRegion()
 
-        footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent))
+    return outline
 
-    if shiftValue == "auto" and reportshift:
-        footerList.append("#mode of shift values: %d" % shiftModeValue)
 
-    footer = string.join(footerList, "\n")
+def getBestShiftInDict(shiftDict):
+    return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0]
 
-    return footer
 
 if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
\ No newline at end of file