Rewrite of findall.py to clean code. Configuration tested using

author Sean Upchurch <sau@caltech.edu>

Mon, 13 Jun 2011 17:21:56 +0000 (10:21 -0700)

committer Sean Upchurch <sau@caltech.edu>

Mon, 13 Jun 2011 17:21:56 +0000 (10:21 -0700)
author Sean Upchurch <sau@caltech.edu>
Mon, 13 Jun 2011 17:21:56 +0000 (10:21 -0700)
committer Sean Upchurch <sau@caltech.edu>
Mon, 13 Jun 2011 17:21:56 +0000 (10:21 -0700)
diff --git a/ReadDataset.py b/ReadDataset.py

index 850a5ec602a7b6d7a9a22ac98c4bf3eef4b873e5..c9d2a0bf2b4eb56e0fa906be2319fcac866c30b3 100644 (file)
--- a/ReadDataset.py
+++ b/ReadDataset.py
@@ -207,15 +207,27 @@ class ReadDataset():
          return sql
  
  
+    def getMemCursor(self):
+        """ returns a cursor to memory database for low-level (SQL)
+        access to the data.
+        """
+        return self.memcon.cursor()
+
+
+    def getFileCursor(self):
+        """ returns a cursor to file database for low-level (SQL)
+        access to the data.
+        """
+        return self.dbcon.cursor()
+
+
      def hasIndex(self):
-        """ check whether the RDS file has at least one index.
+        """ return True if the RDS file has at least one index.
          """
          stmt = "select count(*) from sqlite_master where type='index'"
          count = int(self.execute(stmt, returnResults=True)[0][0])
-        if count > 0:
-            return True
  
-        return False
+        return count > 0
  
  
      def initializeTables(self, dbConnection, cache=100000):
@@ -237,20 +249,6 @@ class ReadDataset():
          dbConnection.commit()
  
  
-    def getFileCursor(self):
-        """ returns a cursor to file database for low-level (SQL)
-        access to the data.
-        """
-        return self.dbcon.cursor()
-
-
-    def getMemCursor(self):
-        """ returns a cursor to memory database for low-level (SQL)
-        access to the data.
-        """
-        return self.memcon.cursor()
-
-
      def getMetadata(self, valueName=""):
          """ returns a dictionary of metadata.
          """
@@ -309,7 +307,7 @@ class ReadDataset():
  
  
      def getChromosomes(self, table="uniqs", fullChrom=True):
-        """ returns a list of distinct chromosomes in table.
+        """ returns a sorted list of distinct chromosomes in table.
          """
          statement = "select distinct chrom from %s" % table
          sql = self.getSqlCursor()
@@ -330,7 +328,7 @@ class ReadDataset():
          return results
  
  
-    def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
+    def getMaxCoordinate(self, chrom, doUniqs=True,
                           doMulti=False, doSplices=False):
          """ returns the maximum coordinate for reads on a given chromosome.
          """
@@ -347,9 +345,6 @@ class ReadDataset():
              multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
              maxCoord = max(multiMax, maxCoord)
  
-        if verbose:
-            print "%s maxCoord: %d" % (chrom, maxCoord)
-
          return maxCoord
  
  
@@ -375,9 +370,9 @@ class ReadDataset():
          and which can be restricted by chromosome or custom-flag.
          Returns unique reads by default, but can return multireads
          with doMulti set to True.
-        
-        Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
+
          """
+        #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
  
          whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
          if findallOptimize:
@@ -421,27 +416,16 @@ class ReadDataset():
          if findallOptimize:
              if self.memBacked:
                  self.memcon.row_factory = None
-                sql = self.memcon.cursor()
              else:
                  self.dbcon.row_factory = None
-                sql = self.dbcon.cursor()
  
              stmt.append("order by start")
          elif readIDDict:
-            if self.memBacked:
-                sql = self.memcon.cursor()
-            else:
-                sql = self.dbcon.cursor()
-
              stmt.append("order by readID, start")
          else:
-            if self.memBacked:
-                sql = self.memcon.cursor()
-            else:
-                sql = self.dbcon.cursor()
-
              stmt.append("order by chrom, start")
  
+        sql = self.getSqlCursor()
          sqlQuery = string.join(stmt)
          sql.execute(sqlQuery)
  
@@ -602,10 +586,7 @@ class ReadDataset():
          whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
          selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
          selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
+        sql = self.getSqlCursor()
  
          stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
          sql.execute(stmt)
@@ -718,7 +699,7 @@ class ReadDataset():
  
  
      def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
-        """ returns the number of row in the uniqs table.
+        """ returns the number of row in the specified table.
          """
          whereClause = []
          count = 0
@@ -741,10 +722,7 @@ class ReadDataset():
          else:
              whereQuery = ""
  
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
+        sql = self.getSqlCursor()
  
          if distinct:
              sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
@@ -803,11 +781,7 @@ class ReadDataset():
              limitPart = "LIMIT %d" % limit
  
          sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
-
+        sql = self.getSqlCursor()
          sql.execute(sqlQuery)
          result = sql.fetchall()
  
diff --git a/analyzego.py b/analyzego.py

index 6542b1bf24c256e5907c799c785b0ab060fb9eaf..f8ff0017f9b51401411423b9ff8d5420b0714409 100755 (executable)
--- a/analyzego.py
+++ b/analyzego.py
@@ -6,7 +6,7 @@ except:
  
  import sys
  import optparse
-from commoncode import getGeneInfoDict, getConfigParser, getConfigOption
+from commoncode import getGeneInfoDict, getConfigParser, getConfigOption, getConfigIntOption
  from cistematic.cisstat.analyzego import calculateGOStats
  
  print "analyzego: version 2.2"
@@ -35,7 +35,7 @@ def main(argv=None):
      infilename = args[1]
      prefix = args[2]
  
-    analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID)
+    analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID, numTests=options.numTests)
  
  
  def makeParser(usage=""):
@@ -44,24 +44,27 @@ def makeParser(usage=""):
                        help="translate gene")
      parser.add_option("--field", type="int", dest="fieldID",
                        help="column containing gene ID/Name")
+    parser.add_option("--trials", type="int", dest="numTests",
+                      help="column containing gene ID/Name")
  
      configParser = getConfigParser()
      section = "analyzego"
      translateGene = getConfigOption(configParser, section, "translateGene", False)
      fieldID = getConfigOption(configParser, section, "fieldID", None)
+    numTests = getConfigIntOption(configParser, section, "numTests", 1)
  
-    parser.set_defaults(translateGene=translateGene, fieldID=fieldID)
+    parser.set_defaults(translateGene=translateGene, fieldID=fieldID, numTests=numTests)
  
      return parser
  
  
-def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1):
+def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1, numTests=1):
      infile = open(infilename)
-    analyzeGO(genome, infile, prefix, translateGene=False, fieldID=1)
+    analyzeGO(genome, infile, prefix, translateGene, fieldID, numTests)
      infile.close()
  
  
-def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1):
+def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1, numTests=1):
      if translateGene:
          symbolToGidDict = getSymbolDict(genome)
  
@@ -88,7 +91,7 @@ def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1):
              locusList.append((genome, gID))
  
      if len(locusList) > 0:
-        calculateGOStats(locusList, prefix)
+        calculateGOStats(locusList, prefix, trials=numTests)
  
  
  def getSymbolDict(genome):
diff --git a/commoncode.py b/commoncode.py

index f526daacca0dc91245e71205363216bc17eba2e9..6ef4edfcb3e10d75ae0eecea887f45581f9e4edb 100755 (executable)
--- a/commoncode.py
+++ b/commoncode.py
@@ -1,8 +1,3 @@
-#
-#  commoncode.py
-#  ENRAGE
-#
-
  import ConfigParser
  import os
  import string
@@ -192,6 +187,11 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
      mergeCount = 0
      chromField = int(chromField)
      count = 0
+    #TODO: Current algorithm processes input file line by line and compares with prior lines.  Problem is it
+    #      exits at the first merge.  This is not a problem when the input is sorted by start position, but in
+    #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
+    #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
+    #      be merged with A but that will exit the loop and the merge with C will be missed.
      for regionEntry in regionList:
          if regionEntry[0] == "#":
              if "pvalue" in regionEntry:
@@ -372,10 +372,11 @@ def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75)
          for read in readList:
              currentpos = read["start"] - start
              if read["sense"] == "+":
-                currentpos += testShift
+                senseModifier = 1
              else:
-                currentpos -= testShift
+                senseModifier = -1
  
+            currentpos += testShift * senseModifier
              if (currentpos < 1) or (currentpos >= length):
                  continue
  
@@ -384,16 +385,12 @@ def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75)
              else:
                  weight = 1.0
  
-            if read["sense"] == "+":
-                shiftArray[currentpos] += weight
-            else:
-                shiftArray[currentpos] -= weight
+            shiftArray[currentpos] += weight * senseModifier
  
          currentScore = 0
          for score in shiftArray:
              currentScore += abs(score)
  
-        print currentScore
          if currentScore < lowestScore:
              bestShift = testShift
              lowestScore = currentScore
@@ -494,40 +491,8 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
          if restrictGID and gid not in restrictList:
              continue
  
-        featureList = featuresDict[gid]
-        newFeatureList = []
-        isPseudo = False
-        for (ftype, chrom, start, stop, sense) in featureList:
-            if ftype == "PSEUDO":
-                isPseudo = True
-
-            if (start, stop, ftype) not in newFeatureList:
-                notContained = True
-                containedList = []
-                for (fstart, fstop, ftype2) in newFeatureList:
-                    if start >= fstart and stop <= fstop:
-                        notContained = False
-
-                    if start < fstart and stop > fstop:
-                        containedList.append((fstart, fstop))
-
-                if len(containedList) > 0:
-                    newFList = []
-                    notContained = True
-                    for (fstart, fstop, ftype2) in newFeatureList:
-                        if (fstart, fstop) not in containedList:
-                            newFList.append((fstart, fstop, ftype2))
-                            if start >= fstart and stop <= fstop:
-                                notContained = False
-
-                    newFeatureList = newFList
-                if notContained:
-                    newFeatureList.append((start, stop, ftype))
-
-        if ignorePseudo and isPseudo:
-            continue
-
-        if chrom not in featuresByChromDict:
+        newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid])
+        if newFeatureList and chrom not in featuresByChromDict:
              featuresByChromDict[chrom] = []
  
          for (start, stop, ftype) in newFeatureList:
@@ -560,6 +525,39 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
          return featuresByChromDict
  
  
+def getNewFeatureList(featureList, ignorePseudo=False):
+    newFeatureList = []
+    for (featureType, chrom, start, stop, sense) in featureList:
+        if featureType == "PSEUDO" and ignorePseudo:
+            return [], chrom, sense
+
+        if (start, stop, featureType) not in newFeatureList:
+            notContained = True
+            containedList = []
+            for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
+                if start >= newFeatureStart and stop <= newFeatureStop:
+                    notContained = False
+
+                if start < newFeatureStart and stop > newFeatureStop:
+                    containedList.append((newFeatureStart, newFeatureStop))
+
+            if len(containedList) > 0:
+                newFList = []
+                notContained = True
+                for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
+                    if (newFeatureStart, newFeatureStop) not in containedList:
+                        newFList.append((newFeatureStart, newFeatureStop, featureType2))
+                        if start >= newFeatureStart and stop <= newFeatureStop:
+                            notContained = False
+
+                newFeatureList = newFList
+
+            if notContained:
+                newFeatureList.append((start, stop, featureType))
+
+    return newFeatureList, chrom, sense
+
+
  def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
                          additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
                          lengthCDS=0, keepSense=False, adjustToNeighbor=True):
@@ -573,23 +571,7 @@ def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
      lengthCDS < 0bp, return only the last X bp from CDS.
      """ 
      locusByChromDict = {}
-    if upstream == 0 and downstream == 0 and not useCDS:
-        print "getLocusByChromDict: asked for no sequence - returning empty dict"
-        return locusByChromDict
-    elif upstream > 0 and downstream > 0 and not useCDS:
-        print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS != 0 and not useCDS:
-        print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
-        return locusByChromDict
-    elif upstreamSpanTSS and lengthCDS != 0:
-        print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS > 0 and downstream > 0:
-        print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS < 0 and upstream > 0:
-        print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+    if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
          return locusByChromDict
  
      genomeName = genome.genome
@@ -702,6 +684,29 @@ def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
      return locusByChromDict
  
  
+def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
+    if upstream == 0 and downstream == 0 and not useCDS:
+        print "getLocusByChromDict: asked for no sequence - returning empty dict"
+        return True
+    elif upstream > 0 and downstream > 0 and not useCDS:
+        print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
+        return True
+    elif lengthCDS != 0 and not useCDS:
+        print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
+        return True
+    elif upstreamSpanTSS and lengthCDS != 0:
+        print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
+        return True
+    elif lengthCDS > 0 and downstream > 0:
+        print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
+        return True
+    elif lengthCDS < 0 and upstream > 0:
+        print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+        return True
+
+    return False
+
+
  def getGeneFeatures(genome, additionalRegionsDict):
      featuresDict = genome.getallGeneFeatures()
      if len(additionalRegionsDict) > 0:
@@ -805,37 +810,25 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                      else:
                          regionBinLength = binLength
  
-                    startdist = tagStart - start
                      if rsense == "F":
-                        # we are relying on python's integer division quirk
-                        binID = startdist / regionBinLength
-                        if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
-                            binID = 0
-                        elif fixedFirstBin > 0:
-                            binID = 1
-
-                        if binID >= bins:
-                            binID = bins - 1
-
-                        try:
-                            regionsBins[regionID][binID] += normalizedTag * weight
-                        except KeyError:
-                            print "%s %s" % (regionID, str(binID))
+                        positionInRegion = tagStart - start
                      else:
-                        rdist = rlen - startdist
-                        binID = rdist / regionBinLength
-                        if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
-                            binID = 0
-                        elif fixedFirstBin > 0:
-                            binID = 1
-
-                        if binID >= bins:
-                            binID = bins - 1
-
-                        try:
-                            regionsBins[regionID][binID] += normalizedTag * weight
-                        except KeyError:
-                            print "%s %s" % (regionID, str(binID))
+                        positionInRegion = start + rlen - tagStart
+
+                    # we are relying on python's integer division quirk
+                    binID = positionInRegion / regionBinLength
+                    if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
+                        binID = 0
+                    elif fixedFirstBin > 0:
+                        binID = 1
+
+                    if binID >= bins:
+                        binID = bins - 1
+
+                    try:
+                        regionsBins[regionID][binID] += normalizedTag * weight
+                    except KeyError:
+                        print "%s %s" % (regionID, str(binID))
  
                      stopPoint = stop
  
diff --git a/findall.py b/findall.py

index aa820374b994d497fbc4dd336a98b795489f5900..663d7e075d5d30535da6eed6547d4da2b4970be2 100755 (executable)
--- a/findall.py
+++ b/findall.py
@@ -49,14 +49,104 @@ import sys
  import math
  import string
  import optparse
+import operator
  from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
  import ReadDataset
  import Region
  
  
-versionString = "findall: version 3.2"
+versionString = "findall: version 3.2.1"
  print versionString
  
+class RegionDirectionError(Exception):
+    pass
+
+
+class RegionFinder():
+    def __init__(self, minRatio, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, strandfilter, minHits, trimValue, doTrim, doDirectionality,
+                 shiftValue, maxSpacing):
+
+        self.statistics = {"index": 0,
+                           "total": 0,
+                           "mIndex": 0,
+                           "mTotal": 0,
+                           "failed": 0,
+                           "badRegionTrim": 0
+        }
+
+        self.controlRDSsize = 1
+        self.sampleRDSsize = 1
+        self.minRatio = minRatio
+        self.minPeak = minPeak
+        self.leftPlusRatio = leftPlusRatio
+        self.stranded = "both"
+        if strandfilter == "plus":
+            self.stranded = "+"
+            minPlusRatio = 0.9
+            maxPlusRatio = 1.0
+        elif strandfilter == "minus":
+            self.stranded = "-"
+            minPlusRatio = 0.0
+            maxPlusRatio = 0.1
+
+        if minRatio < minPeak:
+            self.minPeak = minRatio
+
+        self.minPlusRatio = minPlusRatio
+        self.maxPlusRatio = maxPlusRatio
+        self.strandfilter = strandfilter
+        self.minHits = minHits
+        self.trimValue = trimValue
+        self.doTrim = doTrim
+        self.doDirectionality = doDirectionality
+
+        if self.doTrim:
+            self.trimString = string.join(["%2.1f" % (100. * self.trimValue), "%"], "")
+        else:
+            self.trimString = "none"
+
+        self.shiftValue = shiftValue
+        self.maxSpacing = maxSpacing
+
+
+    def useRNASettings(self):
+        self.shiftValue = 0
+        self.doTrim = False
+        self.doDirectionality = False
+
+
+    def printOptionsSummary(self, listPeak, useMulti, doCache, pValueType):
+
+        print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (self.doDirectionality, listPeak, not useMulti, doCache)
+        print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)
+        try:
+            print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
+        except:
+            print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
+
+
+    def getAnalysisDescription(self, hitfile, controlfile, doControl,
+                                 withFlag, listPeak, useMulti, doCache, pValueType):
+
+        description = ["#ERANGE %s" % versionString]
+        if doControl:
+            description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, controlfile, self.controlRDSsize))
+        else:
+            description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize))
+
+        if withFlag != "":
+            description.append("#restrict to Flag = %s" % withFlag)
+
+        description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s" % (self.doDirectionality, listPeak, not useMulti, doCache))
+        description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded))
+        try:
+            description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
+        except:
+            description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
+
+        return string.join(description, "\n")
+
+
  def usage():
      print __doc__
  
@@ -76,12 +166,32 @@ def main(argv=None):
      hitfile = args[1]
      outfilename = args[2]
  
-    findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
-            options.stringency, options.noshift, options.autoshift, options.reportshift,
+    shiftValue = 0
+
+    if options.autoshift:
+        shiftValue = "auto"
+
+    if options.shift is not None:
+        try:
+            shiftValue = int(options.shift)
+        except ValueError:
+            if options.shift == "learn":
+                shiftValue = "learn"
+
+    if options.noshift:
+        shiftValue = 0
+
+    if options.doAppend:
+        outputMode = "a"
+    else:
+        outputMode = "w"
+
+    findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, shiftValue,
+            options.stringency, options.reportshift,
              options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
              options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
-            options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
-            options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
+            options.trimValue, options.doTrim, outputMode, options.rnaSettings,
+            options.cachePages, options.ptype, options.controlfile, options.doRevBackground, options.useMulti,
              options.strandfilter, options.combine5p)
  
  
@@ -89,7 +199,7 @@ def makeParser():
      usage = __doc__
  
      parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--control", dest="mockfile")
+    parser.add_option("--control", dest="controlfile")
      parser.add_option("--minimum", type="float", dest="minHits")
      parser.add_option("--ratio", type="float", dest="minRatio")
      parser.add_option("--spacing", type="int", dest="maxSpacing")
@@ -99,7 +209,7 @@ def makeParser():
      parser.add_option("--noshift", action="store_true", dest="noShift")
      parser.add_option("--autoshift", action="store_true", dest="autoshift")
      parser.add_option("--reportshift", action="store_true", dest="reportshift")
-    parser.add_option("--nomulti", action="store_true", dest="noMulti")
+    parser.add_option("--nomulti", action="store_false", dest="useMulti")
      parser.add_option("--minPlus", type="float", dest="minPlusRatio")
      parser.add_option("--maxPlus", type="float", dest="maxPlusRatio")
      parser.add_option("--leftPlus", type="float", dest="leftPlusRatio")
@@ -137,16 +247,16 @@ def makeParser():
      logfilename = getConfigOption(configParser, section, "logfilename", "findall.log")
      withFlag = getConfigOption(configParser, section, "withFlag", "")
      doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True)
-    trimValue = getConfigOption(configParser, section, "trimValue", None)
+    trimValue = getConfigFloatOption(configParser, section, "trimValue", 0.1)
      doTrim = getConfigBoolOption(configParser, section, "doTrim", True)
      doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
      rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False)
      cachePages = getConfigOption(configParser, section, "cachePages", None)
-    ptype = getConfigOption(configParser, section, "ptype", None)
-    mockfile = getConfigOption(configParser, section, "mockfile", None)
+    ptype = getConfigOption(configParser, section, "ptype", "")
+    controlfile = getConfigOption(configParser, section, "controlfile", None)
      doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False)
-    noMulti = getConfigBoolOption(configParser, section, "noMulti", False)
-    strandfilter = getConfigOption(configParser, section, "strandfilter", None)
+    useMulti = getConfigBoolOption(configParser, section, "useMulti", True)
+    strandfilter = getConfigOption(configParser, section, "strandfilter", "")
      combine5p = getConfigBoolOption(configParser, section, "combine5p", False)
  
      parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift,
@@ -154,611 +264,692 @@ def makeParser():
                          minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak,
                          normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality,
                          trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings,
-                        cachePages=cachePages, ptype=ptype, mockfile=mockfile, doRevBackground=doRevBackground, noMulti=noMulti,
+                        cachePages=cachePages, ptype=ptype, controlfile=controlfile, doRevBackground=doRevBackground, useMulti=useMulti,
                          strandfilter=strandfilter, combine5p=combine5p)
  
      return parser
  
  
-def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
-            stringency=4.0, noshift=False, autoshift=False, reportshift=False,
-            minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
-            normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
-            trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
-            cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
-            strandfilter=None, combine5p=False):
+def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shiftValue=0,
+            stringency=4.0, reportshift=False, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
+            normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True, trimValue=0.1, doTrim=True,
+            outputMode="w", rnaSettings=False, cachePages=None, ptype="", controlfile=None, doRevBackground=False,
+            useMulti=True, strandfilter="", combine5p=False):
  
-    shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
-    doControl = False
-    if mockfile is not None:
-        doControl = True
+    regionFinder = RegionFinder(minRatio, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, strandfilter, minHits, trimValue,
+                                doTrim, doDirectionality, shiftValue, maxSpacing)
  
-    if doRevBackground:
-        print "Swapping IP and background to calculate FDR"
-        pValueType = "back"
+    doControl = controlfile is not None
+    pValueType = getPValueType(ptype, doControl, doRevBackground)
+    doPvalue = not pValueType == "none"
  
-    doPvalue = True
-    if ptype is not None:
-        ptype = ptype.upper()
+    if rnaSettings:
+        regionFinder.useRNASettings()
+
+    doCache = cachePages is not None
+    printStatusMessages(regionFinder.shiftValue, normalize, doRevBackground, ptype, doControl, withFlag, useMulti, rnaSettings, regionFinder.strandfilter)
+    controlRDS = None
+    if doControl:
+        print "\ncontrol:" 
+        controlRDS = openRDSFile(controlfile, cachePages=cachePages, doCache=doCache)
+
+    print "\nsample:" 
+    hitRDS = openRDSFile(hitfile, cachePages=cachePages, doCache=doCache)
+
+    print
+    regionFinder.readlen = hitRDS.getReadSize()
+    if rnaSettings:
+        regionFinder.maxSpacing = regionFinder.readlen
+
+    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+    regionFinder.printOptionsSummary(listPeak, useMulti, doCache, pValueType)
+
+    regionFinder.sampleRDSsize = len(hitRDS) / 1000000.
+    if doControl:
+        regionFinder.controlRDSsize = len(controlRDS) / 1000000.
+
+    outfile = open(outfilename, outputMode)
+    print >> outfile, regionFinder.getAnalysisDescription(hitfile, controlfile, doControl,
+                                                          withFlag, listPeak, useMulti, doCache, pValueType)
+
+    header = getHeader(normalize, regionFinder.doDirectionality, listPeak, reportshift, doPvalue)
+    print >> outfile, header
+    if minRatio < minPeak:
+        minPeak = minRatio
+
+    shiftDict = {}
+    hitChromList = hitRDS.getChromosomes()
+    stringency = max(stringency, 1.0)
+    chromosomeList = getChromosomeListToProcess(hitChromList, controlRDS, doControl)
+    for chromosome in chromosomeList:
+        allregions, outregions = findPeakRegions(regionFinder, hitRDS, controlRDS, chromosome, logfilename, outfilename,
+                                                 outfile, stringency, normalize, useMulti, doControl, withFlag, combine5p,
+                                                 factor, listPeak)
+
+        if doRevBackground:
+            #TODO: this is *almost* the same calculation - there are small yet important differences
+            backregions = findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normalize, useMulti, factor)
+            writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                                   allregions, header, backregions=backregions, pValueType=pValueType)
+        else:
+            writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                                        allregions, header)
+
+    footer = getFooter(regionFinder, shiftDict, reportshift, doRevBackground)
+    print footer
+    print >> outfile, footer
+    outfile.close()
+    writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1])
+
+
+def getPValueType(ptype, doControl, doRevBackground):
+    pValueType = "self"
+    if ptype in ["NONE", "SELF", "BACK"]:
          if ptype == "NONE":
-            doPvalue = False
              pValueType = "none"
-            p = 1
-            poissonmean = 0
          elif ptype == "SELF":
              pValueType = "self"
          elif ptype == "BACK":
              if doControl and doRevBackground:
                  pValueType = "back"
+    elif doRevBackground:
+        pValueType = "back"
+
+    return pValueType
+
+
+def printStatusMessages(shiftValue, normalize, doRevBackground, ptype, doControl, withFlag, useMulti, rnaSettings, strandfilter):
+    if shiftValue == "learn":
+        print "Will try to learn shift"
+
+    if normalize:
+        print "Normalizing to RPM"
+
+    if doRevBackground:
+        print "Swapping IP and background to calculate FDR"
+
+    if ptype != "":
+        if ptype in ["NONE", "SELF"]:
+            pass
+        elif ptype == "BACK":
+            if doControl and doRevBackground:
+                pass
              else:
                  print "must have a control dataset and -revbackground for pValue type 'back'"
          else:
              print "could not use pValue type : %s" % ptype
-    else:
-        pValueType = "self"
  
      if withFlag != "":
          print "restrict to flag = %s" % withFlag
  
-    useMulti = True
-    if noMulti:
+    if not useMulti:
          print "using unique reads only"
-        useMulti = False
  
      if rnaSettings:
          print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
-        doTrim = False
-        doDirectionality = False
  
-    stranded = ""
-    if strandfilter is not None:
-        if strandfilter == "plus":
-            stranded = "+"
-            minPlusRatio = 0.9
-            maxPlusRatio = 1.0
-            print "only analyzing reads on the plus strand"
-        elif strandfilter == "minus":
-            stranded = "-"
-            minPlusRatio = 0.0
-            maxPlusRatio = 0.1
-            print "only analyzing reads on the minus strand"
+    if strandfilter == "plus":
+        print "only analyzing reads on the plus strand"
+    elif strandfilter == "minus":
+        print "only analyzing reads on the minus strand"
  
-    stringency = max(stringency, 1.0)
-    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
-    if cachePages is not None:
-        doCache = True
+
+def openRDSFile(filename, cachePages=None, doCache=False):
+    rds = ReadDataset.ReadDataset(filename, verbose=True, cache=doCache)
+    if cachePages > rds.getDefaultCacheSize():
+        rds.setDBcache(cachePages)
+
+    return rds
+
+
+def getHeader(normalize, doDirectionality, listPeak, reportshift, doPvalue):
+    if normalize:
+        countType = "RPM"
      else:
-        doCache = False
-        cachePages = -1
+        countType = "COUNT"
  
-    mockRDS = None
-    if doControl:
-        print "\ncontrol:" 
-        mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
+    headerFields = ["#regionID\tchrom\tstart\tstop", countType, "fold\tmulti%"]
  
-        if cachePages > mockRDS.getDefaultCacheSize():
-            mockRDS.setDBcache(cachePages)
+    if doDirectionality:
+        headerFields.append("plus%\tleftPlus%")
  
-    print "\nsample:" 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
-    readlen = hitRDS.getReadSize()
-    if rnaSettings:
-        maxSpacing = readlen
+    if listPeak:
+        headerFields.append("peakPos\tpeakHeight")
  
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
+    if reportshift:
+        headerFields.append("readShift")
  
-    if trimValue is not None:
-        trimValue = float(trimValue) / 100.
-        trimString = "%2.1f%s" % ((100. * trimValue), "%")
-    else:
-        trimValue = 0.1
-        trimString = "10%"
+    if doPvalue:
+        headerFields.append("pValue")
  
-    if not doTrim:
-        trimString = "none"
+    return string.join(headerFields, "\t")
  
-    if doAppend:
-        fileMode = "a"
-    else:
-        fileMode = "w"
  
-    outfile = open(outfilename, fileMode)
-    outfile.write("#ERANGE %s\n" % versionString)
+def getChromosomeListToProcess(hitChromList, controlRDS, doControl):
      if doControl:
-        mockRDSsize = len(mockRDS) / 1000000.
-        controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
+        controlChromList = controlRDS.getChromosomes()
+        chromosomeList = [chrom for chrom in hitChromList if chrom in controlChromList and chrom != "chrM"]
      else:
-        mockRDSsize = 0
-        controlSampleString = " none"
+        chromosomeList = [chrom for chrom in hitChromList if chrom != "chrM"]
  
-    hitRDSsize = len(hitRDS) / 1000000.
-    outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:%s\n" % (hitfile, hitRDSsize, controlSampleString))
+    return chromosomeList
  
-    if withFlag != "":
-        outfile.write("#restrict to Flag = %s\n" % withFlag)
  
-    print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
-    print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
-    print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
+def findPeakRegions(regionFinder, hitRDS, controlRDS, chromosome, logfilename, outfilename,
+                    outfile, stringency, normalize, useMulti, doControl, withFlag, combine5p, factor, listPeak):
  
-    outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
-    outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
-    outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
-    if normalize:
-        print "Normalizing to RPM"
-        countLabel = "RPM"
-    else:
-        countLabel = "COUNT"
+    outregions = []
+    allregions = []
+    print "chromosome %s" % (chromosome)
+    previousHit = - 1 * regionFinder.maxSpacing
+    readStartPositions = [-1]
+    totalWeight = 0
+    uniqueReadCount = 0
+    reads = []
+    numStarts = 0
+    badRegion = False
+    hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True,
+                                  strand=regionFinder.stranded, combine5p=combine5p)
  
-    headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"]
-    if doDirectionality:
-        headerList.append("plus%\tleftPlus%")
+    maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
+    if regionFinder.shiftValue == "learn":
+        learnShift(regionFinder, hitDict, controlRDS, chromosome, maxCoord, logfilename, outfilename,
+                                outfile, numStarts, stringency, normalize, useMulti, doControl)
  
-    if listPeak:
-        headerList.append("peakPos\tpeakHeight")
+    for read in hitDict[chromosome]:
+        pos = read["start"]
+        if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
+            lastReadPos = readStartPositions[-1]
+            lastBasePosition = lastReadPos + regionFinder.readlen - 1
+            newRegionIndex = regionFinder.statistics["index"] + 1
+            if regionFinder.doDirectionality:
+                region = Region.DirectionalRegion(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=factor, numReads=totalWeight)
+            else:
+                region = Region.Region(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=factor, numReads=totalWeight)
  
-    if reportshift:
-        headerList.append("readShift")
+            if normalize:
+                region.numReads /= regionFinder.sampleRDSsize
  
-    if doPvalue:
-        headerList.append("pValue")
+            allregions.append(int(region.numReads))
+            regionLength = lastReadPos - region.start
+            if regionPassesCriteria(region.numReads, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen):
+                region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos,
+                                                useMulti, doControl, normalize)
  
-    headline = string.join(headerList, "\t")
-    print >> outfile, headline
+                if region.foldRatio >= regionFinder.minRatio:
+                    # first pass, with absolute numbers
+                    peak = findPeak(reads, region.start, regionLength, regionFinder.readlen, doWeight=True, leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
+                    if regionFinder.doTrim:
+                        try:
+                            lastReadPos = trimRegion(region, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.readlen, regionFinder.doDirectionality,
+                                                     normalize, regionFinder.sampleRDSsize)
+                        except IndexError:
+                            badRegion = True
+                            continue
  
-    statistics = {"index": 0,
-                  "total": 0,
-                  "mIndex": 0,
-                  "mTotal": 0,
-                  "failed": 0
-    }
+                        region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos,
+                                                        useMulti, doControl, normalize)
  
-    if minRatio < minPeak:
-        minPeak = minRatio
+                    # just in case it changed, use latest data
+                    try:
+                        bestPos = peak.topPos[0]
+                        peakScore = peak.smoothArray[bestPos]
+                        if normalize:
+                            peakScore /= regionFinder.sampleRDSsize
+                    except:
+                        continue
  
-    hitChromList = hitRDS.getChromosomes()
-    if doControl:
-        mockChromList = mockRDS.getChromosomes()
-    else:
-        mockChromList = []
+                    if listPeak:
+                        region.peakDescription= "%d\t%.1f" % (region.start + bestPos, peakScore)
  
-    mockSampleSize = 0
-    hitSampleSize = 0
-    if normalize:
-        if doControl:
-            mockSampleSize = mockRDSsize
-
-        hitSampleSize = hitRDSsize
-
-    hitChromList.sort()
-
-    for chromosome in hitChromList:
-        if doNotProcessChromosome(chromosome, doControl, mockChromList):
-            continue
-
-        print "chromosome %s" % (chromosome)
-        hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True,
-                                      doMulti=useMulti, findallOptimize=True, strand=stranded,
-                                      combine5p=combine5p)
-        maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
-        if shiftValue == "learn":
-            shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize,
-                                    mockSampleSize, minRatio, maxSpacing, maxCoord, stringency, readlen, minHits,
-                                    logfilename, outfile, outfilename)
-
-        regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize,
-                                                                  chromosome, useMulti, normalize, maxSpacing,
-                                                                  doDirectionality, doTrim, minHits, minRatio,
-                                                                  readlen, shiftValue, minPeak, minPlusRatio,
-                                                                  maxPlusRatio, leftPlusRatio, listPeak, noMulti,
-                                                                  doControl, factor, trimValue, outputRegionList=True)
-
-        statistics["index"] += regionStats["index"]
-        statistics["total"] += regionStats["total"]
-        statistics["failed"] += regionStats["failed"]
-        if not doRevBackground:
-            if doPvalue:
-                p, poissonmean = calculatePValue(allRegionWeights)
-
-            print headline
-            shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
-            continue
-
-        #now do background swapping the two samples around
-        if mockRDS is not None:
-            print "calculating background..."
-            backgroundTrimValue = 1/20.
-            backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
-                                                                           chromosome, useMulti, normalize, maxSpacing,
-                                                                           doDirectionality, doTrim, minHits, minRatio,
-                                                                           readlen, shiftValue, minPeak, minPlusRatio,
-                                                                           maxPlusRatio, leftPlusRatio, listPeak, noMulti,
-                                                                           doControl, factor, backgroundTrimValue)
-
-            statistics["mIndex"] += backgroundRegionStats["index"]
-            statistics["mTotal"] += backgroundRegionStats["total"]
-            statistics["failed"] += backgroundRegionStats["failed"]
-            print statistics["mIndex"], statistics["mTotal"]
-            if doPvalue:
-                if pValueType == "self":
-                    p, poissonmean = calculatePValue(allRegionWeights)
-                else:
-                    p, poissonmean = calculatePValue(backgroundRegionWeights)
-
-            print headline
-            shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
-
-    footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
-    print footer
-    outfile.write(footer)
-    outfile.close()
+                    if useMulti:
+                        setMultireadPercentage(region, hitRDS, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos, normalize, regionFinder.doTrim)
  
-    writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
+                    region.shift = peak.shift
+                    # check that we still pass threshold
+                    regionLength = lastReadPos - region.start
+                    plusRatio = float(peak.numPlus)/peak.numHits
+                    if regionAndPeakPass(region, regionFinder.minHits, regionFinder.minRatio, regionLength, regionFinder.readlen, peakScore, regionFinder.minPeak,
+                                         regionFinder.minPlusRatio, regionFinder.maxPlusRatio, plusRatio):
+                        try:
+                            updateRegion(region, regionFinder.doDirectionality, regionFinder.leftPlusRatio, peak.numLeftPlus, peak.numPlus, plusRatio)
+                            regionFinder.statistics["index"] += 1
+                            outregions.append(region)
+                            regionFinder.statistics["total"] += region.numReads
+                        except RegionDirectionError:
+                            regionFinder.statistics["failed"] += 1
+
+            readStartPositions = []
+            totalWeight = 0
+            uniqueReadCount = 0
+            reads = []
+            numStarts = 0
+            if badRegion:
+                badRegion = False
+                regionFinder.statistics["badRegionTrim"] += 1
  
+        if pos not in readStartPositions:
+            numStarts += 1
  
-def determineShiftValue(autoshift, shift, noshift, rnaSettings):
-    shiftValue = 0
-    if autoshift:
-        shiftValue = "auto"
+        readStartPositions.append(pos)
+        weight = read["weight"]
+        totalWeight += weight
+        if weight == 1.0:
+            uniqueReadCount += 1
  
-    if shift is not None:
-        try:
-            shiftValue = int(shift)
-        except ValueError:
-            if shift == "learn":
-                shiftValue = "learn"
-                print "Will try to learn shift"
+        reads.append({"start": pos, "sense": read["sense"], "weight": weight})
+        previousHit = pos
  
-    if noshift or rnaSettings:
-        shiftValue = 0
+    return allregions, outregions
  
-    return shiftValue
  
+def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normalize, useMulti, factor):
  
-def doNotProcessChromosome(chromosome, doControl, mockChromList):
-    skipChromosome = False
-    if chromosome == "chrM":
-        skipChromosome = True
+    print "calculating background..."
+    previousHit = - 1 * regionFinder.maxSpacing
+    currentHitList = [-1]
+    currentTotalWeight = 0
+    currentReadList = []
+    backregions = []
+    numStarts = 0
+    badRegion = False
+    hitDict = controlRDS.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, findallOptimize=True)
+    maxCoord = controlRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
+    for read in hitDict[chromosome]:
+        pos = read["start"]
+        if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
+            lastReadPos = currentHitList[-1]
+            lastBasePosition = lastReadPos + regionFinder.readlen - 1
+            region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=factor, numReads=currentTotalWeight)
+            if normalize:
+                region.numReads /= regionFinder.controlRDSsize
+
+            backregions.append(int(region.numReads))
+            region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=factor, numReads=currentTotalWeight)
+            regionLength = lastReadPos - region.start
+            if regionPassesCriteria(region.numReads, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen):
+                numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+                if normalize:
+                    numMock /= regionFinder.sampleRDSsize
+
+                foldRatio = region.numReads / numMock
+                if foldRatio >= regionFinder.minRatio:
+                    # first pass, with absolute numbers
+                    peak = findPeak(currentReadList, region.start, lastReadPos - region.start, regionFinder.readlen, doWeight=True,
+                                    leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
  
-    if doControl and (chromosome not in mockChromList):
-        skipChromosome = True
+                    if regionFinder.doTrim:
+                        try:
+                            lastReadPos = trimRegion(region, peak, lastReadPos, 20., currentReadList, regionFinder.readlen, regionFinder.doDirectionality,
+                                                     normalize, regionFinder.controlRDSsize)
+                        except IndexError:
+                            badRegion = True
+                            continue
  
-    return skipChromosome
+                        numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+                        if normalize:
+                            numMock /= regionFinder.sampleRDSsize
  
+                        foldRatio = region.numReads / numMock
  
-def calculatePValue(dataList):
-    dataList.sort()
-    listSize = float(len(dataList))
-    try:
-        poissonmean = sum(dataList) / listSize
-    except ZeroDivisionError:
-        poissonmean = 0
+                    # just in case it changed, use latest data
+                    try:
+                        bestPos = peak.topPos[0]
+                        peakScore = peak.smoothArray[bestPos]
+                    except IndexError:
+                        continue
  
-    print "Poisson n=%d, p=%f" % (listSize, poissonmean)
-    p = math.exp(-poissonmean)
+                    # normalize to RPM
+                    if normalize:
+                        peakScore /= regionFinder.controlRDSsize
+
+                    # check that we still pass threshold
+                    regionLength = lastReadPos - region.start
+                    if regionPassesCriteria(region.numReads, regionFinder.minHits, foldRatio, regionFinder.minRatio, regionLength, regionFinder.readlen):
+                        updateControlStatistics(regionFinder, peak, region.numReads, peakScore)
+
+            currentHitList = []
+            currentTotalWeight = 0
+            currentReadList = []
+            numStarts = 0
+            if badRegion:
+                badRegion = False
+                regionFinder.statistics["badRegionTrim"] += 1
+
+        if pos not in currentHitList:
+            numStarts += 1
+
+        currentHitList.append(pos)
+        weight = read["weight"]
+        currentTotalWeight += weight
+        currentReadList.append({"start": pos, "sense": read["sense"], "weight": weight})
+        previousHit = pos
  
-    return p, poissonmean
+    return backregions
  
  
-def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
-               stringency, readlen, minHits, logfilename, outfile, outfilename, minSites=30):
+def learnShift(regionFinder, hitDict, controlRDS, chromosome, maxCoord, logfilename, outfilename,
+               outfile, numStarts, stringency, normalize, useMulti, doControl):
  
-    print "learning shift.... will need at least %d training sites" % minSites
-    previousHit = -1 * maxSpacing
-    hitList = [-1]
+    print "learning shift.... will need at least 30 training sites"
+    previousHit = -1 * regionFinder.maxSpacing
+    positionList = [-1]
      totalWeight = 0
      readList = []
      shiftDict = {}
      count = 0
-    numStarts = 0
-    for read in hitDict[chrom]:
+    for read in hitDict[chromosome]:
          pos = read["start"]
-        sense = read["sense"]
-        weight = read["weight"]
-        if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
-            sumAll = totalWeight
+        if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
              if normalize:
-                sumAll /= hitSampleSize
+                totalWeight /= regionFinder.sampleRDSsize
  
-            regionStart = hitList[0]
-            regionStop = hitList[-1]
+            regionStart = positionList[0]
+            regionStop = positionList[-1]
              regionLength = regionStop - regionStart
-            # we're going to require stringent settings
-            if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen:
-                foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
-
-                if foldRatio >= minRatio:
-                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True)
-                    try:
-                        shiftDict[localshift] += 1
-                    except KeyError:
-                        shiftDict[localshift] = 1
-
+            if regionPassesCriteria(totalWeight, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen, stringency=stringency):
+                foldRatio = getFoldRatio(regionFinder, controlRDS, totalWeight, chromosome, regionStart, regionStop, useMulti, doControl, normalize)
+                if foldRatio >= regionFinder.minRatio:
+                    updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
                      count += 1
  
-            hitList = []
+            positionList = []
              totalWeight = 0
              readList = []
-            numStarts = 0
  
-        if pos not in hitList:
+        if pos not in positionList:
              numStarts += 1
  
-        hitList.append(pos)
+        positionList.append(pos)
+        weight = read["weight"]
          totalWeight += weight
-        readList.append({"start": pos, "sense": sense, "weight": weight})
+        readList.append({"start": pos, "sense": read["sense"], "weight": weight})
          previousHit = pos
  
-    bestShift = 0
-    bestCount = 0
-    learningSettings = ["#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d" % (stringency, stringency * minHits,
-                                                                                                       stringency * minRatio, stringency * readlen),
-                        "#number of training examples: %d" % count]
-    outline = string.join(learningSettings, "\n")
+    outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
+                                                                                                                               stringency * regionFinder.minHits,
+                                                                                                                               stringency * regionFinder.minRatio,
+                                                                                                                               stringency * regionFinder.readlen,
+                                                                                                                               count)
+
      print outline
-    writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
-    if count < minSites:
+    writeLog(logfilename, versionString, outfilename + outline)
+    regionFinder.shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
+    outline = "#picked shiftValue to be %d" % regionFinder.shiftValue
+    print outline
+    print >> outfile, outline
+    writeLog(logfilename, versionString, outfilename + outline)
+
+
+def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
+    return abs(pos - previousHit) > maxSpacing or pos == maxCoord
+
+
+def regionPassesCriteria(sumAll, minHits, numStarts, minRatio, regionLength, readlen, stringency=1):
+    return sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen
+
+
+def trimRegion(region, peak, regionStop, trimValue, currentReadList, readlen, doDirectionality, normalize, hitRDSsize):
+    bestPos = peak.topPos[0]
+    peakScore = peak.smoothArray[bestPos]
+    if normalize:
+        peakScore /= hitRDSsize
+
+    minSignalThresh = trimValue * peakScore
+    start = findStartEdgePosition(peak, minSignalThresh)
+    regionEndPoint = regionStop - region.start - 1
+    stop = findStopEdgePosition(peak, regionEndPoint, minSignalThresh)
+
+    regionStop = region.start + stop
+    region.start += start
+    trimmedPeak = findPeak(currentReadList, region.start, regionStop - region.start, readlen, doWeight=True, leftPlus=doDirectionality, shift=peak.shift)
+    peak.numPlus = trimmedPeak.numPlus
+    peak.numLeftPlus = trimmedPeak.numLeftPlus
+    peak.topPos = trimmedPeak.topPos
+    peak.smoothArray = trimmedPeak.smoothArray
+
+    region.numReads = trimmedPeak.numHits
+    if normalize:
+        region.numReads /= hitRDSsize
+
+    region.stop = regionStop + readlen - 1
+                                
+    return regionStop
+
+
+def findStartEdgePosition(peak, minSignalThresh):
+    start = 0
+    while not peakEdgeLocated(peak, start, minSignalThresh):
+        start += 1
+
+    return start
+
+
+def findStopEdgePosition(peak, stop, minSignalThresh):
+    while not peakEdgeLocated(peak, stop, minSignalThresh):
+        stop -= 1
+
+    return stop
+
+
+def peakEdgeLocated(peak, position, minSignalThresh):
+    return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
+
+
+def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regionStop, useMulti, doControl, normalize):
+    if doControl:
+        numMock = 1. + controlRDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+        if normalize:
+            numMock /= regionFinder.controlRDSsize
+
+        foldRatio = sumAll / numMock
+    else:
+        foldRatio = regionFinder.minRatio
+
+    return foldRatio
+
+
+def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen):
+    peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto")
+    try:
+        shiftDict[peak.shift] += 1
+    except KeyError:
+        shiftDict[peak.shift] = 1
+
+
+def getShiftValue(shiftDict, count, logfilename, outfilename):
+    if count < 30:
          outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
-        print >> outfile, outline
-        writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
+        print outline
+        writeLog(logfilename, versionString, outfilename + outline)
          shiftValue = 0
      else:
-        for shift in sorted(shiftDict):
-            if shiftDict[shift] > bestCount:
-                bestShift = shift
-                bestCount = shiftDict[shift]
-
-        shiftValue = bestShift
+        shiftValue = getBestShiftInDict(shiftDict)
          print shiftDict
  
-    outline = "#picked shiftValue to be %d" % shiftValue
-    print outline
-    print >> outfile, outline
-    writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
-
      return shiftValue
  
  
-def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
-    if doControl and rds is not None:
-        foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
+def updateControlStatistics(regionFinder, peak, sumAll, peakScore):
+
+    plusRatio = float(peak.numPlus)/peak.numHits
+    if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio:
+        if regionFinder.doDirectionality:
+            if regionFinder.leftPlusRatio < peak.numLeft / peak.numPlus:
+                regionFinder.statistics["mIndex"] += 1
+                regionFinder.statistics["mTotal"] += sumAll
+            else:
+                regionFinder.statistics["failed"] += 1
+        else:
+            # we have a region, but didn't check for directionality
+            regionFinder.statistics["mIndex"] += 1
+            regionFinder.statistics["mTotal"] += sumAll
+
+
+def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRatio, multiP,
+              peakDescription, shift, doDirectionality, leftPlusRatio, numLeft,
+              numPlus, plusRatio):
+
+    if doDirectionality:
+        if leftPlusRatio < numLeft / numPlus:
+            plusP = plusRatio * 100.
+            leftP = 100. * numLeft / numPlus
+            # we have a region that passes all criteria
+            region = Region.DirectionalRegion(regionStart, regionStop,
+                                              factor, index, chromosome, sumAll,
+                                              foldRatio, multiP, plusP, leftP,
+                                              peakDescription, shift)
+
+        else:
+            raise RegionDirectionError
      else:
-        foldRatio = minRatio
+        # we have a region, but didn't check for directionality
+        region = Region.Region(regionStart, regionStop, factor, index, chromosome,
+                               sumAll, foldRatio, multiP, peakDescription, shift)
  
-    return foldRatio
+    return region
  
  
-def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll):
-    numMock = 1. + rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+def setMultireadPercentage(region, hitRDS, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
+    if doTrim:
+        sumMulti = hitRDS.getMultiCount(chromosome, region.start, lastReadPos)
+    else:
+        sumMulti = currentTotalWeight - currentUniqueCount
+
+    # normalize to RPM
      if normalize:
-        numMock /= sampleSize
+        sumMulti /= hitRDSsize
  
-    foldRatio = sumAll / numMock
+    try:
+        multiP = 100. * (sumMulti / region.numReads)
+    except ZeroDivisionError:
+        return
  
-    return foldRatio
+    region.multiP = multiP
  
  
-def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
-                normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
-                shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
-                noMulti, doControl, factor, trimValue, outputRegionList=False):
+def regionAndPeakPass(region, minHits, minRatio, regionLength, readlen, peakScore, minPeak, minPlusRatio, maxPlusRatio, plusRatio):
+    regionPasses = False
+    if regionPassesCriteria(region.numReads, minHits, region.foldRatio, minRatio, regionLength, readlen):
+        if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
+            regionPasses = True
  
-    index = 0
-    totalRegionWeight = 0
-    failedCounter = 0
-    previousHit = - 1 * maxSpacing
-    currentHitList = [-1]
-    currentTotalWeight = 0
-    currentUniqReadCount = 0
-    currentReadList = []
-    regionWeights = []
-    outregions = []
-    numStarts = 0
-    hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
-    maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
-    for read in hitDict[chrom]:
-        pos = read["start"]
-        sense = read["sense"]
-        weight = read["weight"]
-        if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
-            sumAll = currentTotalWeight
-            if normalizeToRPM:
-                sumAll /= rdsSampleSize
-
-            regionStart = currentHitList[0]
-            regionStop = currentHitList[-1]
-            regionWeights.append(int(sumAll))
-            if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
-                sumMulti = 0.
-                foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
-                if foldRatio >= minRatio:
-                    # first pass, with absolute numbers
-                    peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
-
-                    bestPos = peak.topPos[0]
-                    numHits = peak.numHits
-                    peakScore = peak.smoothArray[bestPos]
-                    numPlus = peak.numPlus
-                    shift = peak.shift
-                    numLeft = peak.numLeftPlus
-                    if normalizeToRPM:
-                        peakScore /= rdsSampleSize
-
-                    if doTrim:
-                        minSignalThresh = trimValue * peakScore
-                        start = 0
-                        stop = regionStop - regionStart - 1
-                        startFound = False
-                        while not startFound:
-                            if peak.smoothArray[start] >= minSignalThresh or start == bestPos:
-                                startFound = True
-                            else:
-                                start += 1
-
-                        stopFound = False
-                        while not stopFound:
-                            if peak.smoothArray[stop] >= minSignalThresh or stop == bestPos:
-                                stopFound = True
-                            else:
-                                stop -= 1
-
-                        regionStop = regionStart + stop
-                        regionStart += start
-                        trimPeak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
-
-                        sumAll = trimPeak.numHits
-                        numPlus = trimPeak.numPlus
-                        numLeft = trimPeak.numLeftPlus
-                        if normalizeToRPM:
-                            sumAll /= rdsSampleSize
-
-                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
-                        if outputRegionList:
-                            sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
-                        # just in case it changed, use latest data
-                        try:
-                            bestPos = trimPeak.topPos[0]
-                            peakScore = trimPeak.smoothArray[bestPos]
-                        except:
-                            continue
+    return regionPasses
  
-                        if normalizeToRPM:
-                            peakScore /= rdsSampleSize
  
-                    elif outputRegionList:
-                        sumMulti = currentTotalWeight - currentUniqReadCount
+def updateRegion(region,
+                 doDirectionality, leftPlusRatio, numLeft,
+                 numPlus, plusRatio):
  
-                    if outputRegionList:
-                        if normalizeToRPM:
-                            sumMulti /= rdsSampleSize
+    if doDirectionality:
+        if leftPlusRatio < numLeft / numPlus:
+            region.plusP = plusRatio * 100.
+            region.leftP = 100. * numLeft / numPlus
+        else:
+            raise RegionDirectionError
  
-                        try:
-                            multiP = 100. * (sumMulti / sumAll)
-                        except:
-                            break
  
-                        if noMulti:
-                            multiP = 0.
+def writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                                allregions, header):
  
-                    # check that we still pass threshold
-                    if sumAll >= minHits and  foldRatio >= minRatio and (regionStop - regionStart) > readlen:
-                        plusRatio = float(numPlus)/numHits
-                        if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
-                            if outputRegionList:
-                                peakDescription = ""
-                                if listPeak:
-                                    peakDescription = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
-
-                            if doDirectionality:
-                                if leftPlusRatio < numLeft / numPlus:
-                                    index += 1
-                                    if outputRegionList:
-                                        plusP = plusRatio * 100.
-                                        leftP = 100. * numLeft / numPlus
-                                        # we have a region that passes all criteria
-                                        region = Region.DirectionalRegion(regionStart, regionStop + readlen - 1,
-                                                                          factor, index, chrom, sumAll,
-                                                                          foldRatio, multiP, plusP, leftP,
-                                                                          peakDescription, shift)
-                                        outregions.append(region)
-
-                                    totalRegionWeight += sumAll
-                                else:
-                                    failedCounter += 1
-                            else:
-                                # we have a region, but didn't check for directionality
-                                index += 1
-                                totalRegionWeight += sumAll
-                                if outputRegionList:
-                                    region = Region.Region(regionStart, regionStop + readlen - 1, factor, index, chrom,
-                                                           sumAll, foldRatio, multiP, peakDescription, shift)
-                                    outregions.append(region)
+    writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                           allregions, header, backregions=[], pValueType="self")
  
-            currentHitList = []
-            currentTotalWeight = 0
-            currentUniqReadCount = 0
-            currentReadList = []
-            numStarts = 0
  
-        if pos not in currentHitList:
-            numStarts += 1
+def writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                           allregions, header, backregions=[], pValueType="none"):
  
-        currentHitList.append(pos)
-        currentTotalWeight += weight
-        if weight == 1.0:
-            currentUniqReadCount += 1
+    print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"]
+    if doPvalue:
+        if pValueType == "self":
+            poissonmean = calculatePoissonMean(allregions)
+        else:
+            poissonmean = calculatePoissonMean(backregions)
  
-        currentReadList.append({"start": pos, "sense": sense, "weight": weight})
-        previousHit = pos
+    print header
+    writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=regionFinder.shiftValue, reportshift=reportshift, shiftDict=shiftDict)
  
-    statistics = {"index": index,
-                  "total": totalRegionWeight,
-                  "failed": failedCounter
-    }
  
-    if outputRegionList:
-        return statistics, regionWeights, outregions
-    else:
-        return statistics, regionWeights
+def calculatePoissonMean(dataList):
+    dataList.sort()
+    listSize = float(len(dataList))
+    try:
+        poissonmean = sum(dataList) / listSize
+    except ZeroDivisionError:
+        poissonmean = 0
  
+    print "Poisson n=%d, p=%f" % (listSize, poissonmean)
  
-def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue):
-    bestShift = 0
-    shiftDict = {}
+    return poissonmean
+
+
+def writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=0, reportshift=False, shiftDict={}):
      for region in outregions:
-        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
-        if reportshift:
-            outputList = [region.printRegionWithShift()]
-            if shiftValue == "auto":
-                try:
-                    shiftDict[region.shift] += 1
-                except KeyError:
-                    shiftDict[region.shift] = 1
-        else:
-            outputList = [region.printRegion()]
+        if shiftValue == "auto" and reportshift:
+            try:
+                shiftDict[region.shift] += 1
+            except KeyError:
+                shiftDict[region.shift] = 1
+
+        outline = getRegionString(region, reportshift)
  
          # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
          if doPvalue:
              sumAll = int(region.numReads)
-            for i in xrange(sumAll):
-                pValue *= poissonmean
-                pValue /= i+1
-
-            outputList.append("%1.2g" % pValue)
+            pValue = calculatePValue(sumAll, poissonmean)
+            outline += "\t%1.2g" % pValue
  
-        outline = string.join(outputList, "\t")
          print outline
          print >> outfile, outline
  
-    bestCount = 0
-    for shift in sorted(shiftDict):
-        if shiftDict[shift] > bestCount:
-            bestShift = shift
-            bestCount = shiftDict[shift]
  
-    return bestShift
+def calculatePValue(sum, poissonmean):
+    pValue = math.exp(-poissonmean)
+    #TODO: 798: DeprecationWarning: integer argument expected, got float - for i in xrange(sum)
+    for i in xrange(sum):
+        pValue *= poissonmean
+        pValue /= i+1
  
+    return pValue
+
+
+def getRegionString(region, reportShift):
+    if reportShift:
+        outline = region.printRegionWithShift()
+    else:
+        outline = region.printRegion()
+
+    return outline
  
-def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue):
-    footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])]
-    if doDirectionality:
-        footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"])
+
+def getFooter(regionFinder, shiftDict, reportshift, doRevBackground):
+    index = regionFinder.statistics["index"]
+    mIndex = regionFinder.statistics["mIndex"]
+    footerLines = ["#stats:\t%.1f RPM in %d regions" % (regionFinder.statistics["total"], index)]
+    if regionFinder.doDirectionality:
+        footerLines.append("#\t\t%d additional regions failed directionality filter" % regionFinder.statistics["failed"])
  
      if doRevBackground:
          try:
-            percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100)
-        except (ValueError, ZeroDivisionError):
+            percent = min(100. * (float(mIndex)/index), 100.)
+        except ZeroDivisionError:
              percent = 0.
  
-        footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent))
+        footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, regionFinder.statistics["mTotal"], percent))
+
+    if regionFinder.shiftValue == "auto" and reportshift:
+        bestShift = getBestShiftInDict(shiftDict)
+        footerLines.append("#mode of shift values: %d" % bestShift)
+
+    if regionFinder.statistics["badRegionTrim"] > 0:
+        footerLines.append("#%d regions discarded due to trimming problems" % regionFinder.statistics["badRegionTrim"])
+
+    return string.join(footerLines, "\n")
  
-    if shiftValue == "auto" and reportshift:
-        footerList.append("#mode of shift values: %d" % shiftModeValue)
  
-    footer = string.join(footerList, "\n")
+def getBestShiftInDict(shiftDict):
+    return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0]
  
-    return footer
  
  if __name__ == "__main__":
      main(sys.argv)
diff --git a/geneLocusBins.py b/geneLocusBins.py

index 49745dc0c1361c2a8dbe4736c9c8630396fd93e4..03cade292225b55513b300a0fca2d8f2c4f5d50a 100755 (executable)
--- a/geneLocusBins.py
+++ b/geneLocusBins.py
@@ -138,7 +138,7 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins
              try:
                  geneinfo = geneinfoDict[gid]
                  symbol = geneinfo[0][0]
-            except KeyError:
+            except (KeyError, IndexError):
                  pass
          else:
              symbol = gid
@@ -154,6 +154,7 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins
                  try:
                      normalizedValue = 100. * binAmount / tagCount
                  except ZeroDivisionError:
+                    #TODO: QForALi - this is the right way to refactor the original code, but I don't think this is the right answer
                      normalizedValue = 100. * binAmount
  
                  binAmount = normalizedValue
@@ -167,4 +168,4 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins
  
  
  if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
+\ No newline at end of file
diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py

index 5299d27d26c23f8729063eb31f0da390730b5ce5..74e7a0cd817e699af6b398960578f576a030f7e4 100755 (executable)
--- a/geneMrnaCountsWeighted.py
+++ b/geneMrnaCountsWeighted.py
@@ -68,6 +68,12 @@ def makeParser(usage=""):
      return parser
  
  
+#TODO: Reported user performance issue. Long run times in conditions:
+#    small number of reads ~40-50M
+#    all features on single chromosome
+#
+#    User states has been a long time problem.
+
  def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
                             withUniqs=False, withMulti=False, acceptfile=None,
                             cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
@@ -200,6 +206,7 @@ def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict,
      for line in uniquecounts:
          fields = line.strip().split()
          # add a pseudo-count here to ease calculations below
+        #TODO: figure out why this was done in prior implementation...
          uniqueCountDict[fields[0]] = float(fields[-1]) + 1
  
      uniquecounts.close()
@@ -264,4 +271,4 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
  
  
  if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
+\ No newline at end of file
diff --git a/getNovelSNPs.py b/getNovelSNPs.py

index e95d72693c76df13871da4cf809c99e43c41d36a..4091281c8fe0f5410090b23db7276496fe49d6ed 100755 (executable)
--- a/getNovelSNPs.py
+++ b/getNovelSNPs.py
@@ -68,6 +68,7 @@ def doNotProcessLine(line):
  
  def getNovelSNPInfo(genome, snpEntry, hg):
      fields = snpEntry.split()
+    #TODO: refactor naming. is fields[8] rpkm?
      if fields[8].find("N\A") == -1: 
          return snpEntry
      else:
@@ -92,4 +93,4 @@ def getNovelSNPInfo(genome, snpEntry, hg):
  
  
  if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
+\ No newline at end of file
diff --git a/getSNPs.py b/getSNPs.py

index f6071ae444be9ca294ac4c4ff39b50a74e3d0cb4..f5826751fdecdc3643967ea8e04df8331fbf94e3 100755 (executable)
--- a/getSNPs.py
+++ b/getSNPs.py
@@ -75,7 +75,7 @@ def makeParser(usage=""):
      forceChr = getConfigBoolOption(configParser, section, "forceChr", False)
      cachePages = getConfigIntOption(configParser, section, "cachePages", 0)
  
-    parser.set_defaults(doSplices=True, forceChr=False, cachePages=0)
+    parser.set_defaults(doSplices=doSplices, forceChr=forceChr, cachePages=cachePages)
  
      return parser
  
diff --git a/getfasta.py b/getfasta.py

index ea347e4aa44f1a21a0b7c54d11b407352ce91b6f..acae47d68c896d14cf7b894e63adac2552483291 100755 (executable)
--- a/getfasta.py
+++ b/getfasta.py
@@ -36,7 +36,7 @@ def main(argv=None):
      outfilename = args[2]
  
      getfasta(genome, regionfile, outfilename, options.seqsize, options.minHitThresh,
-             options.topRegions, options.maxsize, options.usePeaks, options.hitFile,
+             options.topRegions, options.maxsize, options.usePeaks, options.hitfile,
               options.doCache, options.doCompact)
author	Sean Upchurch <sau@caltech.edu>
	Mon, 13 Jun 2011 17:21:56 +0000 (10:21 -0700)
committer	Sean Upchurch <sau@caltech.edu>
	Mon, 13 Jun 2011 17:21:56 +0000 (10:21 -0700)
ReadDataset.py		patch \| blob \| history
analyzego.py		patch \| blob \| history
commoncode.py		patch \| blob \| history
findall.py		patch \| blob \| history
geneLocusBins.py		patch \| blob \| history
geneMrnaCountsWeighted.py		patch \| blob \| history
getNovelSNPs.py		patch \| blob \| history
getSNPs.py		patch \| blob \| history
getfasta.py		patch \| blob \| history