erange version 4.0a dev release
[erange.git] / findall.py
index 10f007bd7d340fcf3b30606046c84ef42ee6556d..d20608f1997e2e6a0fd84d9169c78025485e4b57 100755 (executable)
@@ -49,10 +49,12 @@ import sys
 import math
 import string
 import optparse
-from commoncode import readDataset, writeLog, findPeak, getBestShiftForRegion
+from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
+import ReadDataset
+import Region
 
 
-versionString = "%s: version 3.2" % sys.argv[0]
+versionString = "findall: version 3.2"
 print versionString
 
 def usage():
@@ -63,6 +65,27 @@ def main(argv=None):
     if not argv:
         argv = sys.argv
 
+    parser = makeParser()
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        usage()
+        sys.exit(2)
+
+    factor = args[0]
+    hitfile = args[1]
+    outfilename = args[2]
+
+    findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
+            options.stringency, options.noshift, options.autoshift, options.reportshift,
+            options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
+            options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
+            options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
+            options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
+            options.strandfilter, options.combine5p)
+
+
+def makeParser():
     usage = __doc__
 
     parser = optparse.OptionParser(usage=usage)
@@ -94,31 +117,47 @@ def main(argv=None):
     parser.add_option("--append", action="store_true", dest="doAppend")
     parser.add_option("--RNA", action="store_true", dest="rnaSettings")
     parser.add_option("--combine5p", action="store_true", dest="combine5p")
-    parser.set_defaults(minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
-                        stringency=4.0, noshift=False, autoshift=False, reportshift=False,
-                        minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
-                        normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
-                        trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
-                        cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
-                        strandfilter=None, combine5p=False)
 
-    (options, args) = parser.parse_args(argv[1:])
-
-    if len(args) < 3:
-        usage()
-        sys.exit(2)
-
-    factor = args[0]
-    hitfile = args[1]
-    outfilename = args[2]
-
-    findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
-            options.stringency, options.noshift, options.autoshift, options.reportshift,
-            options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
-            options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
-            options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
-            options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
-            options.strandfilter, options.combine5p)
+    configParser = getConfigParser()
+    section = "findall"
+    minHits = getConfigFloatOption(configParser, section, "minHits", 4.0)
+    minRatio = getConfigFloatOption(configParser, section, "minRatio", 4.0)
+    maxSpacing = getConfigIntOption(configParser, section, "maxSpacing", 50)
+    listPeak = getConfigBoolOption(configParser, section, "listPeak", False)
+    shift = getConfigOption(configParser, section, "shift", None)
+    stringency = getConfigFloatOption(configParser, section, "stringency", 4.0)
+    noshift = getConfigBoolOption(configParser, section, "noshift", False)
+    autoshift = getConfigBoolOption(configParser, section, "autoshift", False)
+    reportshift = getConfigBoolOption(configParser, section, "reportshift", False)
+    minPlusRatio = getConfigFloatOption(configParser, section, "minPlusRatio", 0.25)
+    maxPlusRatio = getConfigFloatOption(configParser, section, "maxPlusRatio", 0.75)
+    leftPlusRatio = getConfigFloatOption(configParser, section, "leftPlusRatio", 0.3)
+    minPeak = getConfigFloatOption(configParser, section, "minPeak", 0.5)
+    normalize = getConfigBoolOption(configParser, section, "normalize", True)
+    logfilename = getConfigOption(configParser, section, "logfilename", "findall.log")
+    withFlag = getConfigOption(configParser, section, "withFlag", "")
+    doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True)
+    trimValue = getConfigOption(configParser, section, "trimValue", None)
+    doTrim = getConfigBoolOption(configParser, section, "doTrim", True)
+    doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
+    rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+    ptype = getConfigOption(configParser, section, "ptype", None)
+    mockfile = getConfigOption(configParser, section, "mockfile", None)
+    doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False)
+    noMulti = getConfigBoolOption(configParser, section, "noMulti", False)
+    strandfilter = getConfigOption(configParser, section, "strandfilter", None)
+    combine5p = getConfigBoolOption(configParser, section, "combine5p", False)
+
+    parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift,
+                        stringency=stringency, noshift=noshift, autoshift=autoshift, reportshift=reportshift,
+                        minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak,
+                        normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality,
+                        trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings,
+                        cachePages=cachePages, ptype=ptype, mockfile=mockfile, doRevBackground=doRevBackground, noMulti=noMulti,
+                        strandfilter=strandfilter, combine5p=combine5p)
+
+    return parser
 
 
 def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
@@ -129,20 +168,7 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
             cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
             strandfilter=None, combine5p=False):
 
-    shiftValue = 0
-    if autoshift:
-        shiftValue = "auto"
-
-    if shift is not None:
-        try:
-            shiftValue = int(shift)
-        except ValueError:
-            if shift == "learn":
-                shiftValue = "learn"
-                print "Will try to learn shift"
-
-    if noshift:
-        shiftValue = 0
+    shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
 
     if trimValue is not None:
         trimValue = float(trimValue) / 100.
@@ -198,7 +224,6 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
 
     if rnaSettings:
         print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
-        shiftValue = 0
         doTrim = False
         doDirectionality = False
 
@@ -219,48 +244,44 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
     if doControl:
         print "\ncontrol:" 
-        mockRDS = readDataset(mockfile, verbose=True, cache=doCache)
+        mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
 
         if cachePages > mockRDS.getDefaultCacheSize():
             mockRDS.setDBcache(cachePages)
 
     print "\nsample:" 
-    hitRDS = readDataset(hitfile, verbose=True, cache=doCache)
+    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
     readlen = hitRDS.getReadSize()
     if rnaSettings:
         maxSpacing = readlen
 
-    print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
-    print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
-    print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
-
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
-    hitRDSsize = len(hitRDS) / 1000000.
-    if doControl:
-        mockRDSsize = len(mockRDS) / 1000000.
-
-    if normalize:
-        if doControl:
-            mockSampleSize = mockRDSsize
-
-        hitSampleSize = hitRDSsize
-
     if doAppend:
-        outfile = open(outfilename, "a")
+        fileMode = "a"
     else:
-        outfile = open(outfilename, "w")
+        fileMode = "w"
+
+    outfile = open(outfilename, fileMode)
 
     outfile.write("#ERANGE %s\n" % versionString)
     if doControl:
-        outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)\n" % (hitfile, hitRDSsize, mockfile, mockRDSsize))
+        mockRDSsize = len(mockRDS) / 1000000.
+        controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
     else:
-        outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample: none\n" % (hitfile, hitRDSsize))
+        controlSampleString = " none"
+
+    hitRDSsize = len(hitRDS) / 1000000.
+    outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:%s\n" % (hitfile, hitRDSsize, controlSampleString))
 
     if withFlag != "":
         outfile.write("#restrict to Flag = %s\n" % withFlag)
 
+    print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
+    print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
+    print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
+
     outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
     outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
     outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
@@ -300,6 +321,12 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     if doControl:
         mockChromList = mockRDS.getChromosomes()
 
+    if normalize:
+        if doControl:
+            mockSampleSize = mockRDSsize
+
+        hitSampleSize = hitRDSsize
+
     hitChromList.sort()
 
     for chromosome in hitChromList:
@@ -307,16 +334,21 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
             continue
 
         print "chromosome %s" % (chromosome)
-        hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True, strand=stranded, combine5p=combine5p)
+        hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True,
+                                      doMulti=useMulti, findallOptimize=True, strand=stranded,
+                                      combine5p=combine5p)
         maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
         if shiftValue == "learn":
-            shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
-                                    stringency, readlen, minHits, logfilename, outfile, outfilename)
+            shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize,
+                                    mockSampleSize, minRatio, maxSpacing, maxCoord, stringency, readlen, minHits,
+                                    logfilename, outfile, outfilename)
 
-        regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize, chromosome, useMulti,
-                                                              normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
-                                                              shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
-                                                              noMulti, doControl, factor, trimValue, outputRegionList=True)
+        regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize,
+                                                                  chromosome, useMulti, normalize, maxSpacing,
+                                                                  doDirectionality, doTrim, minHits, minRatio,
+                                                                  readlen, shiftValue, minPeak, minPlusRatio,
+                                                                  maxPlusRatio, leftPlusRatio, listPeak, noMulti,
+                                                                  doControl, factor, trimValue, outputRegionList=True)
 
         statistics["index"] += regionStats["index"]
         statistics["total"] += regionStats["total"]
@@ -332,10 +364,12 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
         #now do background swapping the two samples around
         print "calculating background..."
         backgroundTrimValue = 1/20.
-        backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, chromosome, useMulti,
-                                                              normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
-                                                              shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
-                                                              noMulti, doControl, factor, backgroundTrimValue)
+        backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
+                                                                       chromosome, useMulti, normalize, maxSpacing,
+                                                                       doDirectionality, doTrim, minHits, minRatio,
+                                                                       readlen, shiftValue, minPeak, minPlusRatio,
+                                                                       maxPlusRatio, leftPlusRatio, listPeak, noMulti,
+                                                                       doControl, factor, backgroundTrimValue)
 
         statistics["mIndex"] += backgroundRegionStats["index"]
         statistics["mTotal"] += backgroundRegionStats["total"]
@@ -358,6 +392,25 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
 
 
+def determineShiftValue(autoshift, shift, noshift, rnaSettings):
+    shiftValue = 0
+    if autoshift:
+        shiftValue = "auto"
+
+    if shift is not None:
+        try:
+            shiftValue = int(shift)
+        except ValueError:
+            if shift == "learn":
+                shiftValue = "learn"
+                print "Will try to learn shift"
+
+    if noshift or rnaSettings:
+        shiftValue = 0
+
+    return shiftValue
+
+
 def doNotProcessChromosome(chromosome, doControl, mockChromList):
     skipChromosome = False
     if chromosome == "chrM":
@@ -384,19 +437,22 @@ def calculatePValue(dataList):
 
 
 def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
-               stringency, readlen, minHits, logfilename, outfile, outfilename):
+               stringency, readlen, minHits, logfilename, outfile, outfilename, minSites=30):
 
-    print "learning shift.... will need at least 30 training sites"
+    print "learning shift.... will need at least %d training sites" % minSites
     previousHit = -1 * maxSpacing
     hitList = [-1]
-    weightList = [0]
+    totalWeight = 0
     readList = []
     shiftDict = {}
     count = 0
     numStarts = 0
-    for (pos, sense, weight) in hitDict[chrom]:
+    for read in hitDict[chrom]:
+        pos = read["start"]
+        sense = read["sense"]
+        weight = read["weight"]
         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
-            sumAll = sum(weightList)
+            sumAll = totalWeight
             if normalize:
                 sumAll /= hitSampleSize
 
@@ -417,7 +473,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
                     count += 1
 
             hitList = []
-            weightList = []
+            totalWeight = 0
             readList = []
             numStarts = 0
 
@@ -425,18 +481,21 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
             numStarts += 1
 
         hitList.append(pos)
-        weightList.append(weight)
-        readList.append((pos, sense, weight))
+        totalWeight += weight
+        readList.append({"start": pos, "sense": sense, "weight": weight})
         previousHit = pos
 
     bestShift = 0
     bestCount = 0
-    outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency, stringency * minHits, stringency * minRatio, stringency * readlen, count)
+    learningSettings = ["#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d" % (stringency, stringency * minHits,
+                                                                                                       stringency * minRatio, stringency * readlen),
+                        "#number of training examples: %d" % count]
+    outline = string.join(learningSettings, "\n")
     print outline
     writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
-    if count < 30:
+    if count < minSites:
         outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
-        print outline
+        print >> outfile, outline
         writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
         shiftValue = 0
     else:
@@ -481,20 +540,24 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                 noMulti, doControl, factor, trimValue, outputRegionList=False):
 
     index = 0
-    total = 0
+    totalRegionWeight = 0
     failedCounter = 0
     previousHit = - 1 * maxSpacing
     currentHitList = [-1]
-    currentWeightList = [0]
+    currentTotalWeight = 0
+    currentUniqReadCount = 0
     currentReadList = []
     regionWeights = []
     outregions = []
     numStarts = 0
     hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
     maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
-    for (pos, sense, weight) in hitDict[chrom]:
+    for read in hitDict[chrom]:
+        pos = read["start"]
+        sense = read["sense"]
+        weight = read["weight"]
         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
-            sumAll = sum(currentWeightList)
+            sumAll = currentTotalWeight
             if normalize:
                 sumAll /= rdsSampleSize
 
@@ -507,13 +570,14 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                 foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
                 if foldRatio >= minRatio:
                     # first pass, with absolute numbers
-                    if doDirectionality:
-                        (topPos, numHits, smoothArray, numPlus, numLeft, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue, returnShift=True)
-                    else:
-                        (topPos, numHits, smoothArray, numPlus, shift) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shiftValue, returnShift=True)
-
-                    bestPos = topPos[0]
-                    peakScore = smoothArray[bestPos]
+                    peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
+
+                    bestPos = peak.topPos[0]
+                    numHits = peak.numHits
+                    peakScore = peak.smoothArray[bestPos]
+                    numPlus = peak.numPlus
+                    shift = peak.shift
+                    numLeft = peak.numLeft
                     if normalize:
                         peakScore /= rdsSampleSize
 
@@ -523,28 +587,25 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                         stop = regionStop - regionStart - 1
                         startFound = False
                         while not startFound:
-                            if smoothArray[start] >= minSignalThresh or start == bestPos:
+                            if peak.smoothArray[start] >= minSignalThresh or start == bestPos:
                                 startFound = True
                             else:
                                 start += 1
 
                         stopFound = False
                         while not stopFound:
-                            if smoothArray[stop] >= minSignalThresh or stop == bestPos:
+                            if peak.smoothArray[stop] >= minSignalThresh or stop == bestPos:
                                 stopFound = True
                             else:
                                 stop -= 1
 
                         regionStop = regionStart + stop
                         regionStart += start
-                        try:
-                            if doDirectionality:
-                                (topPos, sumAll, smoothArray, numPlus, numLeft) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
-                            else:
-                                (topPos, sumAll, smoothArray, numPlus) = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, shift=shift)
-                        except:
-                            continue
+                        trimPeak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
 
+                        sumAll = trimPeak.numHits
+                        numPlus = trimPeak.numPlus
+                        numLeft = trimPeak.numLeft
                         if normalize:
                             sumAll /= rdsSampleSize
 
@@ -553,8 +614,8 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                             sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
                         # just in case it changed, use latest data
                         try:
-                            bestPos = topPos[0]
-                            peakScore = smoothArray[bestPos]
+                            bestPos = trimPeak.topPos[0]
+                            peakScore = trimPeak.smoothArray[bestPos]
                         except:
                             continue
 
@@ -563,7 +624,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                             peakScore /= rdsSampleSize
 
                     elif outputRegionList:
-                        sumMulti = sum(currentWeightList) - currentWeightList.count(1.0)
+                        sumMulti = currentTotalWeight - currentUniqReadCount
 
                     if outputRegionList:
                         # normalize to RPM
@@ -583,9 +644,9 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                         plusRatio = float(numPlus)/numHits
                         if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
                             if outputRegionList:
-                                peak = ""
+                                peakDescription = ""
                                 if listPeak:
-                                    peak = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
+                                    peakDescription = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
 
                             if doDirectionality:
                                 if leftPlusRatio < numLeft / numPlus:
@@ -594,20 +655,27 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                                         plusP = plusRatio * 100.
                                         leftP = 100. * numLeft / numPlus
                                         # we have a region that passes all criteria
-                                        outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, plusP, leftP, peak, shift))
+                                        region = Region.DirectionalRegion(regionStart, regionStop + readlen - 1,
+                                                                          factor, index, chrom, sumAll,
+                                                                          foldRatio, multiP, plusP, leftP,
+                                                                          peakDescription, shift)
+                                        outregions.append(region)
 
-                                    total += sumAll
+                                    totalRegionWeight += sumAll
                                 else:
                                     failedCounter += 1
                             else:
                                 # we have a region, but didn't check for directionality
                                 index += 1
-                                total += sumAll
+                                totalRegionWeight += sumAll
                                 if outputRegionList:
-                                    outregions.append((factor, index, chrom, regionStart, regionStop + readlen - 1, sumAll, foldRatio, multiP, peak, shift))
+                                    region = Region.Region(regionStart, regionStop + readlen - 1, factor, index, chrom,
+                                                           sumAll, foldRatio, multiP, peakDescription, shift)
+                                    outregions.append(region)
 
             currentHitList = []
-            currentWeightList = []
+            currentTotalWeight = 0
+            currentUniqReadCount = 0
             currentReadList = []
             numStarts = 0
 
@@ -615,12 +683,15 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
             numStarts += 1
 
         currentHitList.append(pos)
-        currentWeightList.append(weight)
-        currentReadList.append((pos, sense, weight))
+        currentTotalWeight += weight
+        if weight == 1.0:
+            currentUniqReadCount += 1
+
+        currentReadList.append({"start": pos, "sense": sense, "weight": weight})
         previousHit = pos
 
     statistics = {"index": index,
-                  "total": total,
+                  "total": totalRegionWeight,
                   "failed": failedCounter
     }
 
@@ -634,43 +705,35 @@ def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, repor
     bestShift = 0
     shiftDict = {}
     for region in outregions:
+        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
+        if reportshift:
+            outputList = [region.printRegionWithShift()]
+            if shiftValue == "auto":
+                try:
+                    shiftDict[region.shift] += 1
+                except KeyError:
+                    shiftDict[region.shift] = 1
+        else:
+            outputList = [region.printRegion()]
+
         # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
         if doPvalue:
-            sumAll = int(region[5])
+            sumAll = int(region.numReads)
             for i in xrange(sumAll):
                 pValue *= poissonmean
                 pValue /= i+1
 
-        if shiftValue == "auto" and reportshift:
-            try:
-                shiftDict[region[-1]] += 1
-            except KeyError:
-                shiftDict[region[-1]] = 1
-
-        try:
-            if reportshift:
-                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s\t%d" % region]
-            else:
-                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
-        except:
-            if reportshift:
-                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s\t%d" % region]
-            else:
-                outputList = ["%s%d\t%s\t%d\t%d\t%.1f\t%.1f\t%.1f%s" % region[:-1]]
-
-        if doPvalue:
-            outputList.append("%1.2g" % pValue)
+            outputList.append("%1.2f" % pValue)
 
         outline = string.join(outputList, "\t")
         print outline
         print >> outfile, outline
 
-    if shiftValue == "auto" and reportshift:
-        bestCount = 0
-        for shift in sorted(shiftDict):
-            if shiftDict[shift] > bestCount:
-                bestShift = shift
-                bestCount = shiftDict[shift]
+    bestCount = 0
+    for shift in sorted(shiftDict):
+        if shiftDict[shift] > bestCount:
+            bestShift = shift
+            bestCount = shiftDict[shift]
 
     return bestShift