Rewrite of findall.py to clean code.  Configuration tested using erange3.2 as basis.  Other minor code cleanups.

author     Sean Upchurch <sau@caltech.edu>
           Mon, 13 Jun 2011 17:21:56 +0000 (10:21 -0700)
committer  Sean Upchurch <sau@caltech.edu>
           Mon, 13 Jun 2011 17:21:56 +0000 (10:21 -0700)

ReadDataset.py
analyzego.py
commoncode.py
findall.py
geneLocusBins.py
geneMrnaCountsWeighted.py
getNovelSNPs.py
getSNPs.py
getfasta.py

ReadDataset.py
index 850a5ec602a7b6d7a9a22ac98c4bf3eef4b873e5..c9d2a0bf2b4eb56e0fa906be2319fcac866c30b3 100644 (file)
@@ -207,15 +207,27 @@ class ReadDataset():
         return sql
 
 
+    def getMemCursor(self):
+        """ returns a cursor to memory database for low-level (SQL)
+        access to the data.
+        """
+        return self.memcon.cursor()
+
+
+    def getFileCursor(self):
+        """ returns a cursor to file database for low-level (SQL)
+        access to the data.
+        """
+        return self.dbcon.cursor()
+
+
     def hasIndex(self):
-        """ check whether the RDS file has at least one index.
+        """ return True if the RDS file has at least one index.
         """
         stmt = "select count(*) from sqlite_master where type='index'"
         count = int(self.execute(stmt, returnResults=True)[0][0])
-        if count > 0:
-            return True
 
 
-        return False
+        return count > 0
 
 
     def initializeTables(self, dbConnection, cache=100000):
@@ -237,20 +249,6 @@ class ReadDataset():
         dbConnection.commit()
 
 
-    def getFileCursor(self):
-        """ returns a cursor to file database for low-level (SQL)
-        access to the data.
-        """
-        return self.dbcon.cursor()
-
-
-    def getMemCursor(self):
-        """ returns a cursor to memory database for low-level (SQL)
-        access to the data.
-        """
-        return self.memcon.cursor()
-
-
     def getMetadata(self, valueName=""):
         """ returns a dictionary of metadata.
         """
@@ -309,7 +307,7 @@ class ReadDataset():
 
 
     def getChromosomes(self, table="uniqs", fullChrom=True):
-        """ returns a list of distinct chromosomes in table.
+        """ returns a sorted list of distinct chromosomes in table.
         """
         statement = "select distinct chrom from %s" % table
         sql = self.getSqlCursor()
@@ -330,7 +328,7 @@ class ReadDataset():
         return results
 
 
-    def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
+    def getMaxCoordinate(self, chrom, doUniqs=True,
                          doMulti=False, doSplices=False):
         """ returns the maximum coordinate for reads on a given chromosome.
         """
@@ -347,9 +345,6 @@ class ReadDataset():
             multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
             maxCoord = max(multiMax, maxCoord)
 
-        if verbose:
-            print "%s maxCoord: %d" % (chrom, maxCoord)
-
         return maxCoord
 
 
@@ -375,9 +370,9 @@ class ReadDataset():
         and which can be restricted by chromosome or custom-flag.
         Returns unique reads by default, but can return multireads
         with doMulti set to True.
-        
-        Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
+
         """
+        #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
 
         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
         if findallOptimize:
@@ -421,27 +416,16 @@ class ReadDataset():
         if findallOptimize:
             if self.memBacked:
                 self.memcon.row_factory = None
-                sql = self.memcon.cursor()
             else:
                 self.dbcon.row_factory = None
-                sql = self.dbcon.cursor()
 
             stmt.append("order by start")
         elif readIDDict:
-            if self.memBacked:
-                sql = self.memcon.cursor()
-            else:
-                sql = self.dbcon.cursor()
-
             stmt.append("order by readID, start")
         else:
-            if self.memBacked:
-                sql = self.memcon.cursor()
-            else:
-                sql = self.dbcon.cursor()
-
             stmt.append("order by chrom, start")
 
+        sql = self.getSqlCursor()
         sqlQuery = string.join(stmt)
         sql.execute(sqlQuery)
 
@@ -602,10 +586,7 @@ class ReadDataset():
         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
         selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
         selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
+        sql = self.getSqlCursor()
 
         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
         sql.execute(stmt)
@@ -718,7 +699,7 @@ class ReadDataset():
 
 
     def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
-        """ returns the number of row in the uniqs table.
+        """ returns the number of rows in the specified table.
         """
         whereClause = []
         count = 0
@@ -741,10 +722,7 @@ class ReadDataset():
         else:
             whereQuery = ""
 
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
+        sql = self.getSqlCursor()
 
         if distinct:
             sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
@@ -803,11 +781,7 @@ class ReadDataset():
             limitPart = "LIMIT %d" % limit
 
         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
-
+        sql = self.getSqlCursor()
         sql.execute(sqlQuery)
         result = sql.fetchall()
 
analyzego.py
index 6542b1bf24c256e5907c799c785b0ab060fb9eaf..f8ff0017f9b51401411423b9ff8d5420b0714409 100755 (executable)
@@ -6,7 +6,7 @@ except:
 
 import sys
 import optparse
-from commoncode import getGeneInfoDict, getConfigParser, getConfigOption
+from commoncode import getGeneInfoDict, getConfigParser, getConfigOption, getConfigIntOption
 from cistematic.cisstat.analyzego import calculateGOStats
 
 print "analyzego: version 2.2"
@@ -35,7 +35,7 @@ def main(argv=None):
     infilename = args[1]
     prefix = args[2]
 
-    analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID)
+    analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID, numTests=options.numTests)
 
 
 def makeParser(usage=""):
@@ -44,24 +44,27 @@ def makeParser(usage=""):
                       help="translate gene")
     parser.add_option("--field", type="int", dest="fieldID",
                       help="column containing gene ID/Name")
+    parser.add_option("--trials", type="int", dest="numTests",
+                      help="number of trials")
 
     configParser = getConfigParser()
     section = "analyzego"
     translateGene = getConfigOption(configParser, section, "translateGene", False)
     fieldID = getConfigOption(configParser, section, "fieldID", None)
+    numTests = getConfigIntOption(configParser, section, "numTests", 1)
 
 
-    parser.set_defaults(translateGene=translateGene, fieldID=fieldID)
+    parser.set_defaults(translateGene=translateGene, fieldID=fieldID, numTests=numTests)
 
     return parser
 
 
-def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1):
+def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1, numTests=1):
     infile = open(infilename)
-    analyzeGO(genome, infile, prefix, translateGene=False, fieldID=1)
+    analyzeGO(genome, infile, prefix, translateGene, fieldID, numTests)
     infile.close()
 
 
-def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1):
+def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1, numTests=1):
     if translateGene:
         symbolToGidDict = getSymbolDict(genome)
 
@@ -88,7 +91,7 @@ def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1):
             locusList.append((genome, gID))
 
     if len(locusList) > 0:
-        calculateGOStats(locusList, prefix)
+        calculateGOStats(locusList, prefix, trials=numTests)
 
 
 def getSymbolDict(genome):
commoncode.py
index f526daacca0dc91245e71205363216bc17eba2e9..6ef4edfcb3e10d75ae0eecea887f45581f9e4edb 100755 (executable)
@@ -1,8 +1,3 @@
-#
-#  commoncode.py
-#  ENRAGE
-#
-
 import ConfigParser
 import os
 import string
@@ -192,6 +187,11 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
     mergeCount = 0
     chromField = int(chromField)
     count = 0
+    #TODO: The current algorithm processes the input file line by line and compares each entry with prior lines,
+    #      but it exits at the first merge.  That is fine when the input is sorted by start position.  However,
+    #      for 3 regions ABC that appear in the input as A, C, B: when C is processed it cannot merge with A
+    #      because B is needed to bridge the two, and when B is later processed it merges with A but the loop
+    #      exits, so the merge with C is missed.
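+    #      Illustration (hypothetical coordinates): with maxDist=100 and A=(100, 200), B=(250, 450), C=(500, 600)
+    #      listed in the order A, C, B, region C is too far from A to merge directly; B then merges into A and
+    #      the loop exits, so the merged A+B region is never extended to cover C.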
     for regionEntry in regionList:
         if regionEntry[0] == "#":
             if "pvalue" in regionEntry:
@@ -372,10 +372,11 @@ def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75)
         for read in readList:
             currentpos = read["start"] - start
             if read["sense"] == "+":
-                currentpos += testShift
+                senseModifier = 1
             else:
-                currentpos -= testShift
+                senseModifier = -1
 
 
+            currentpos += testShift * senseModifier
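+            # i.e. reads on the plus strand are shifted downstream and reads on the minus strand upstream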
             if (currentpos < 1) or (currentpos >= length):
                 continue
 
@@ -384,16 +385,12 @@ def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75)
             else:
                 weight = 1.0
 
-            if read["sense"] == "+":
-                shiftArray[currentpos] += weight
-            else:
-                shiftArray[currentpos] -= weight
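+            # plus reads add and minus reads subtract, so a well-chosen shift cancels the two strands at each position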
+            shiftArray[currentpos] += weight * senseModifier
 
         currentScore = 0
         for score in shiftArray:
             currentScore += abs(score)
 
-        print currentScore
         if currentScore < lowestScore:
             bestShift = testShift
             lowestScore = currentScore
@@ -494,40 +491,8 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
         if restrictGID and gid not in restrictList:
             continue
 
-        featureList = featuresDict[gid]
-        newFeatureList = []
-        isPseudo = False
-        for (ftype, chrom, start, stop, sense) in featureList:
-            if ftype == "PSEUDO":
-                isPseudo = True
-
-            if (start, stop, ftype) not in newFeatureList:
-                notContained = True
-                containedList = []
-                for (fstart, fstop, ftype2) in newFeatureList:
-                    if start >= fstart and stop <= fstop:
-                        notContained = False
-
-                    if start < fstart and stop > fstop:
-                        containedList.append((fstart, fstop))
-
-                if len(containedList) > 0:
-                    newFList = []
-                    notContained = True
-                    for (fstart, fstop, ftype2) in newFeatureList:
-                        if (fstart, fstop) not in containedList:
-                            newFList.append((fstart, fstop, ftype2))
-                            if start >= fstart and stop <= fstop:
-                                notContained = False
-
-                    newFeatureList = newFList
-                if notContained:
-                    newFeatureList.append((start, stop, ftype))
-
-        if ignorePseudo and isPseudo:
-            continue
-
-        if chrom not in featuresByChromDict:
+        newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid], ignorePseudo)
+        if newFeatureList and chrom not in featuresByChromDict:
             featuresByChromDict[chrom] = []
 
         for (start, stop, ftype) in newFeatureList:
@@ -560,6 +525,39 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
         return featuresByChromDict
 
 
+def getNewFeatureList(featureList, ignorePseudo=False):
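+    """ collapse a gene's feature list: drop duplicate (start, stop, type) entries and any feature fully
+        contained within another feature; returns ([], chrom, sense) for pseudogenes when ignorePseudo is True.
+    """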
+    newFeatureList = []
+    for (featureType, chrom, start, stop, sense) in featureList:
+        if featureType == "PSEUDO" and ignorePseudo:
+            return [], chrom, sense
+
+        if (start, stop, featureType) not in newFeatureList:
+            notContained = True
+            containedList = []
+            for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
+                if start >= newFeatureStart and stop <= newFeatureStop:
+                    notContained = False
+
+                if start < newFeatureStart and stop > newFeatureStop:
+                    containedList.append((newFeatureStart, newFeatureStop))
+
+            if len(containedList) > 0:
+                newFList = []
+                notContained = True
+                for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
+                    if (newFeatureStart, newFeatureStop) not in containedList:
+                        newFList.append((newFeatureStart, newFeatureStop, featureType2))
+                        if start >= newFeatureStart and stop <= newFeatureStop:
+                            notContained = False
+
+                newFeatureList = newFList
+
+            if notContained:
+                newFeatureList.append((start, stop, featureType))
+
+    return newFeatureList, chrom, sense
+
+
 def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
@@ -573,23 +571,7 @@ def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
     lengthCDS < 0bp, return only the last X bp from CDS.
     """ 
     locusByChromDict = {}
-    if upstream == 0 and downstream == 0 and not useCDS:
-        print "getLocusByChromDict: asked for no sequence - returning empty dict"
-        return locusByChromDict
-    elif upstream > 0 and downstream > 0 and not useCDS:
-        print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS != 0 and not useCDS:
-        print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
-        return locusByChromDict
-    elif upstreamSpanTSS and lengthCDS != 0:
-        print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS > 0 and downstream > 0:
-        print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS < 0 and upstream > 0:
-        print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+    if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
         return locusByChromDict
 
     genomeName = genome.genome
@@ -702,6 +684,29 @@ def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
     return locusByChromDict
 
 
+def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
+    if upstream == 0 and downstream == 0 and not useCDS:
+        print "getLocusByChromDict: asked for no sequence - returning empty dict"
+        return True
+    elif upstream > 0 and downstream > 0 and not useCDS:
+        print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
+        return True
+    elif lengthCDS != 0 and not useCDS:
+        print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
+        return True
+    elif upstreamSpanTSS and lengthCDS != 0:
+        print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
+        return True
+    elif lengthCDS > 0 and downstream > 0:
+        print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
+        return True
+    elif lengthCDS < 0 and upstream > 0:
+        print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+        return True
+
+    return False
+
+
 def getGeneFeatures(genome, additionalRegionsDict):
     featuresDict = genome.getallGeneFeatures()
     if len(additionalRegionsDict) > 0:
@@ -805,37 +810,25 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                     else:
                         regionBinLength = binLength
 
-                    startdist = tagStart - start
                     if rsense == "F":
-                        # we are relying on python's integer division quirk
-                        binID = startdist / regionBinLength
-                        if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
-                            binID = 0
-                        elif fixedFirstBin > 0:
-                            binID = 1
-
-                        if binID >= bins:
-                            binID = bins - 1
-
-                        try:
-                            regionsBins[regionID][binID] += normalizedTag * weight
-                        except KeyError:
-                            print "%s %s" % (regionID, str(binID))
+                        positionInRegion = tagStart - start
                     else:
-                        rdist = rlen - startdist
-                        binID = rdist / regionBinLength
-                        if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
-                            binID = 0
-                        elif fixedFirstBin > 0:
-                            binID = 1
-
-                        if binID >= bins:
-                            binID = bins - 1
-
-                        try:
-                            regionsBins[regionID][binID] += normalizedTag * weight
-                        except KeyError:
-                            print "%s %s" % (regionID, str(binID))
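+                        # on the reverse strand, measure the position from the right-hand edge of the region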
+                        positionInRegion = start + rlen - tagStart
+
+                    # we are relying on python's integer division quirk
+                    binID = positionInRegion / regionBinLength
+                    if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
+                        binID = 0
+                    elif fixedFirstBin > 0:
+                        binID = 1
+
+                    if binID >= bins:
+                        binID = bins - 1
+
+                    try:
+                        regionsBins[regionID][binID] += normalizedTag * weight
+                    except KeyError:
+                        print "%s %s" % (regionID, str(binID))
 
                     stopPoint = stop
 
findall.py
index aa820374b994d497fbc4dd336a98b795489f5900..663d7e075d5d30535da6eed6547d4da2b4970be2 100755 (executable)
@@ -49,14 +49,104 @@ import sys
 import math
 import string
 import optparse
+import operator
 from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
 import ReadDataset
 import Region
 
 
-versionString = "findall: version 3.2"
+versionString = "findall: version 3.2.1"
 print versionString
 
+class RegionDirectionError(Exception):
+    pass
+
+
+class RegionFinder():
+    def __init__(self, minRatio, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, strandfilter, minHits, trimValue, doTrim, doDirectionality,
+                 shiftValue, maxSpacing):
+
+        self.statistics = {"index": 0,
+                           "total": 0,
+                           "mIndex": 0,
+                           "mTotal": 0,
+                           "failed": 0,
+                           "badRegionTrim": 0
+        }
+
+        self.controlRDSsize = 1
+        self.sampleRDSsize = 1
+        self.minRatio = minRatio
+        self.minPeak = minPeak
+        self.leftPlusRatio = leftPlusRatio
+        self.stranded = "both"
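+        # the strand filter works by constraining the acceptable fraction of plus-strand reads in a region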
+        if strandfilter == "plus":
+            self.stranded = "+"
+            minPlusRatio = 0.9
+            maxPlusRatio = 1.0
+        elif strandfilter == "minus":
+            self.stranded = "-"
+            minPlusRatio = 0.0
+            maxPlusRatio = 0.1
+
+        if minRatio < minPeak:
+            self.minPeak = minRatio
+
+        self.minPlusRatio = minPlusRatio
+        self.maxPlusRatio = maxPlusRatio
+        self.strandfilter = strandfilter
+        self.minHits = minHits
+        self.trimValue = trimValue
+        self.doTrim = doTrim
+        self.doDirectionality = doDirectionality
+
+        if self.doTrim:
+            self.trimString = string.join(["%2.1f" % (100. * self.trimValue), "%"], "")
+        else:
+            self.trimString = "none"
+
+        self.shiftValue = shiftValue
+        self.maxSpacing = maxSpacing
+
+
+    def useRNASettings(self):
+        self.shiftValue = 0
+        self.doTrim = False
+        self.doDirectionality = False
+
+
+    def printOptionsSummary(self, listPeak, useMulti, doCache, pValueType):
+
+        print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (self.doDirectionality, listPeak, not useMulti, doCache)
+        print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)
+        try:
+            print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
+        except:
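+            # self.shiftValue may be a string ("auto" or "learn") that the %d format above cannot handle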
+            print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
+
+
+    def getAnalysisDescription(self, hitfile, controlfile, doControl,
+                                 withFlag, listPeak, useMulti, doCache, pValueType):
+
+        description = ["#ERANGE %s" % versionString]
+        if doControl:
+            description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, controlfile, self.controlRDSsize))
+        else:
+            description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize))
+
+        if withFlag != "":
+            description.append("#restrict to Flag = %s" % withFlag)
+
+        description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s" % (self.doDirectionality, listPeak, not useMulti, doCache))
+        description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded))
+        try:
+            description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
+        except:
+            description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
+
+        return string.join(description, "\n")
+
+
 def usage():
     print __doc__
 
@@ -76,12 +166,32 @@ def main(argv=None):
     hitfile = args[1]
     outfilename = args[2]
 
-    findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
-            options.stringency, options.noshift, options.autoshift, options.reportshift,
+    shiftValue = 0
+
+    if options.autoshift:
+        shiftValue = "auto"
+
+    if options.shift is not None:
+        try:
+            shiftValue = int(options.shift)
+        except ValueError:
+            if options.shift == "learn":
+                shiftValue = "learn"
+
+    if options.noShift:
+        shiftValue = 0
+
+    if options.doAppend:
+        outputMode = "a"
+    else:
+        outputMode = "w"
+
+    findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, shiftValue,
+            options.stringency, options.reportshift,
             options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
             options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
-            options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
-            options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
+            options.trimValue, options.doTrim, outputMode, options.rnaSettings,
+            options.cachePages, options.ptype, options.controlfile, options.doRevBackground, options.useMulti,
             options.strandfilter, options.combine5p)
 
 
@@ -89,7 +199,7 @@ def makeParser():
     usage = __doc__
 
     parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--control", dest="mockfile")
+    parser.add_option("--control", dest="controlfile")
     parser.add_option("--minimum", type="float", dest="minHits")
     parser.add_option("--ratio", type="float", dest="minRatio")
     parser.add_option("--spacing", type="int", dest="maxSpacing")
@@ -99,7 +209,7 @@ def makeParser():
     parser.add_option("--noshift", action="store_true", dest="noShift")
     parser.add_option("--autoshift", action="store_true", dest="autoshift")
     parser.add_option("--reportshift", action="store_true", dest="reportshift")
-    parser.add_option("--nomulti", action="store_true", dest="noMulti")
+    parser.add_option("--nomulti", action="store_false", dest="useMulti")
     parser.add_option("--minPlus", type="float", dest="minPlusRatio")
     parser.add_option("--maxPlus", type="float", dest="maxPlusRatio")
     parser.add_option("--leftPlus", type="float", dest="leftPlusRatio")
@@ -137,16 +247,16 @@ def makeParser():
     logfilename = getConfigOption(configParser, section, "logfilename", "findall.log")
     withFlag = getConfigOption(configParser, section, "withFlag", "")
     doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True)
-    trimValue = getConfigOption(configParser, section, "trimValue", None)
+    trimValue = getConfigFloatOption(configParser, section, "trimValue", 0.1)
     doTrim = getConfigBoolOption(configParser, section, "doTrim", True)
     doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
     rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
-    ptype = getConfigOption(configParser, section, "ptype", None)
-    mockfile = getConfigOption(configParser, section, "mockfile", None)
+    ptype = getConfigOption(configParser, section, "ptype", "")
+    controlfile = getConfigOption(configParser, section, "controlfile", None)
     doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False)
-    noMulti = getConfigBoolOption(configParser, section, "noMulti", False)
-    strandfilter = getConfigOption(configParser, section, "strandfilter", None)
+    useMulti = getConfigBoolOption(configParser, section, "useMulti", True)
+    strandfilter = getConfigOption(configParser, section, "strandfilter", "")
     combine5p = getConfigBoolOption(configParser, section, "combine5p", False)
 
     parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift,
@@ -154,611 +264,692 @@ def makeParser():
                         minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak,
                         normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality,
                         trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings,
-                        cachePages=cachePages, ptype=ptype, mockfile=mockfile, doRevBackground=doRevBackground, noMulti=noMulti,
+                        cachePages=cachePages, ptype=ptype, controlfile=controlfile, doRevBackground=doRevBackground, useMulti=useMulti,
                         strandfilter=strandfilter, combine5p=combine5p)
 
     return parser
 
 
-def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
-            stringency=4.0, noshift=False, autoshift=False, reportshift=False,
-            minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
-            normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
-            trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
-            cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
-            strandfilter=None, combine5p=False):
+def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shiftValue=0,
+            stringency=4.0, reportshift=False, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
+            normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True, trimValue=0.1, doTrim=True,
+            outputMode="w", rnaSettings=False, cachePages=None, ptype="", controlfile=None, doRevBackground=False,
+            useMulti=True, strandfilter="", combine5p=False):
 
 
-    shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
-    doControl = False
-    if mockfile is not None:
-        doControl = True
+    regionFinder = RegionFinder(minRatio, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, strandfilter, minHits, trimValue,
+                                doTrim, doDirectionality, shiftValue, maxSpacing)
 
 
-    if doRevBackground:
-        print "Swapping IP and background to calculate FDR"
-        pValueType = "back"
+    doControl = controlfile is not None
+    pValueType = getPValueType(ptype, doControl, doRevBackground)
+    doPvalue = not pValueType == "none"
 
 
-    doPvalue = True
-    if ptype is not None:
-        ptype = ptype.upper()
+    if rnaSettings:
+        regionFinder.useRNASettings()
+
+    doCache = cachePages is not None
+    printStatusMessages(regionFinder.shiftValue, normalize, doRevBackground, ptype, doControl, withFlag, useMulti, rnaSettings, regionFinder.strandfilter)
+    controlRDS = None
+    if doControl:
+        print "\ncontrol:" 
+        controlRDS = openRDSFile(controlfile, cachePages=cachePages, doCache=doCache)
+
+    print "\nsample:" 
+    hitRDS = openRDSFile(hitfile, cachePages=cachePages, doCache=doCache)
+
+    print
+    regionFinder.readlen = hitRDS.getReadSize()
+    if rnaSettings:
+        regionFinder.maxSpacing = regionFinder.readlen
+
+    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+    regionFinder.printOptionsSummary(listPeak, useMulti, doCache, pValueType)
+
+    regionFinder.sampleRDSsize = len(hitRDS) / 1000000.
+    if doControl:
+        regionFinder.controlRDSsize = len(controlRDS) / 1000000.
+
+    outfile = open(outfilename, outputMode)
+    print >> outfile, regionFinder.getAnalysisDescription(hitfile, controlfile, doControl,
+                                                          withFlag, listPeak, useMulti, doCache, pValueType)
+
+    header = getHeader(normalize, regionFinder.doDirectionality, listPeak, reportshift, doPvalue)
+    print >> outfile, header
+    if minRatio < minPeak:
+        minPeak = minRatio
+
+    shiftDict = {}
+    hitChromList = hitRDS.getChromosomes()
+    stringency = max(stringency, 1.0)
+    chromosomeList = getChromosomeListToProcess(hitChromList, controlRDS, doControl)
+    for chromosome in chromosomeList:
+        allregions, outregions = findPeakRegions(regionFinder, hitRDS, controlRDS, chromosome, logfilename, outfilename,
+                                                 outfile, stringency, normalize, useMulti, doControl, withFlag, combine5p,
+                                                 factor, listPeak)
+
+        if doRevBackground:
+            #TODO: this is *almost* the same calculation - there are small yet important differences
+            backregions = findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normalize, useMulti, factor)
+            writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                                   allregions, header, backregions=backregions, pValueType=pValueType)
+        else:
+            writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                                        allregions, header)
+
+    footer = getFooter(regionFinder, shiftDict, reportshift, doRevBackground)
+    print footer
+    print >> outfile, footer
+    outfile.close()
+    writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1])
+
+
+def getPValueType(ptype, doControl, doRevBackground):
+    pValueType = "self"
+    if ptype in ["NONE", "SELF", "BACK"]:
         if ptype == "NONE":
-            doPvalue = False
             pValueType = "none"
-            p = 1
-            poissonmean = 0
         elif ptype == "SELF":
             pValueType = "self"
         elif ptype == "BACK":
             if doControl and doRevBackground:
                 pValueType = "back"
+    elif doRevBackground:
+        pValueType = "back"
+
+    return pValueType
+
+
+def printStatusMessages(shiftValue, normalize, doRevBackground, ptype, doControl, withFlag, useMulti, rnaSettings, strandfilter):
+    if shiftValue == "learn":
+        print "Will try to learn shift"
+
+    if normalize:
+        print "Normalizing to RPM"
+
+    if doRevBackground:
+        print "Swapping IP and background to calculate FDR"
+
+    if ptype != "":
+        if ptype in ["NONE", "SELF"]:
+            pass
+        elif ptype == "BACK":
+            if doControl and doRevBackground:
+                pass
             else:
                 print "must have a control dataset and -revbackground for pValue type 'back'"
         else:
             print "could not use pValue type : %s" % ptype
-    else:
-        pValueType = "self"
 
     if withFlag != "":
         print "restrict to flag = %s" % withFlag
 
-    useMulti = True
-    if noMulti:
+    if not useMulti:
         print "using unique reads only"
-        useMulti = False
 
     if rnaSettings:
         print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
-        doTrim = False
-        doDirectionality = False
 
 
-    stranded = ""
-    if strandfilter is not None:
-        if strandfilter == "plus":
-            stranded = "+"
-            minPlusRatio = 0.9
-            maxPlusRatio = 1.0
-            print "only analyzing reads on the plus strand"
-        elif strandfilter == "minus":
-            stranded = "-"
-            minPlusRatio = 0.0
-            maxPlusRatio = 0.1
-            print "only analyzing reads on the minus strand"
+    if strandfilter == "plus":
+        print "only analyzing reads on the plus strand"
+    elif strandfilter == "minus":
+        print "only analyzing reads on the minus strand"
 
 
-    stringency = max(stringency, 1.0)
-    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
-    if cachePages is not None:
-        doCache = True
+
+def openRDSFile(filename, cachePages=None, doCache=False):
+    rds = ReadDataset.ReadDataset(filename, verbose=True, cache=doCache)
+    if cachePages > rds.getDefaultCacheSize():
+        rds.setDBcache(cachePages)
+
+    return rds
+
+
+def getHeader(normalize, doDirectionality, listPeak, reportshift, doPvalue):
+    if normalize:
+        countType = "RPM"
     else:
-        doCache = False
-        cachePages = -1
+        countType = "COUNT"
 
 
-    mockRDS = None
-    if doControl:
-        print "\ncontrol:" 
-        mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
+    headerFields = ["#regionID\tchrom\tstart\tstop", countType, "fold\tmulti%"]
 
 
-        if cachePages > mockRDS.getDefaultCacheSize():
-            mockRDS.setDBcache(cachePages)
+    if doDirectionality:
+        headerFields.append("plus%\tleftPlus%")
 
 
-    print "\nsample:" 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
-    readlen = hitRDS.getReadSize()
-    if rnaSettings:
-        maxSpacing = readlen
+    if listPeak:
+        headerFields.append("peakPos\tpeakHeight")
 
 
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
+    if reportshift:
+        headerFields.append("readShift")
 
 
-    if trimValue is not None:
-        trimValue = float(trimValue) / 100.
-        trimString = "%2.1f%s" % ((100. * trimValue), "%")
-    else:
-        trimValue = 0.1
-        trimString = "10%"
+    if doPvalue:
+        headerFields.append("pValue")
 
 
-    if not doTrim:
-        trimString = "none"
+    return string.join(headerFields, "\t")
 
 
-    if doAppend:
-        fileMode = "a"
-    else:
-        fileMode = "w"
 
 
-    outfile = open(outfilename, fileMode)
-    outfile.write("#ERANGE %s\n" % versionString)
+def getChromosomeListToProcess(hitChromList, controlRDS, doControl):
     if doControl:
-        mockRDSsize = len(mockRDS) / 1000000.
-        controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
+        controlChromList = controlRDS.getChromosomes()
+        chromosomeList = [chrom for chrom in hitChromList if chrom in controlChromList and chrom != "chrM"]
     else:
-        mockRDSsize = 0
-        controlSampleString = " none"
+        chromosomeList = [chrom for chrom in hitChromList if chrom != "chrM"]
 
 
-    hitRDSsize = len(hitRDS) / 1000000.
-    outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:%s\n" % (hitfile, hitRDSsize, controlSampleString))
+    return chromosomeList
 
 
-    if withFlag != "":
-        outfile.write("#restrict to Flag = %s\n" % withFlag)
 
 
-    print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
-    print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
-    print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
+def findPeakRegions(regionFinder, hitRDS, controlRDS, chromosome, logfilename, outfilename,
+                    outfile, stringency, normalize, useMulti, doControl, withFlag, combine5p, factor, listPeak):
 
 
-    outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
-    outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
-    outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
-    if normalize:
-        print "Normalizing to RPM"
-        countLabel = "RPM"
-    else:
-        countLabel = "COUNT"
+    outregions = []
+    allregions = []
+    print "chromosome %s" % (chromosome)
+    previousHit = - 1 * regionFinder.maxSpacing
+    readStartPositions = [-1]
+    totalWeight = 0
+    uniqueReadCount = 0
+    reads = []
+    numStarts = 0
+    badRegion = False
+    hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True,
+                                  strand=regionFinder.stranded, combine5p=combine5p)
 
 
-    headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"]
-    if doDirectionality:
-        headerList.append("plus%\tleftPlus%")
+    maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
+    if regionFinder.shiftValue == "learn":
+        learnShift(regionFinder, hitDict, controlRDS, chromosome, maxCoord, logfilename, outfilename,
+                                outfile, numStarts, stringency, normalize, useMulti, doControl)
 
 
-    if listPeak:
-        headerList.append("peakPos\tpeakHeight")
+    for read in hitDict[chromosome]:
+        pos = read["start"]
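+        # previousRegionIsDone checks the gap since the previous read (and maxCoord) to decide whether to close out the current candidate region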
+        if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
+            lastReadPos = readStartPositions[-1]
+            lastBasePosition = lastReadPos + regionFinder.readlen - 1
+            newRegionIndex = regionFinder.statistics["index"] + 1
+            if regionFinder.doDirectionality:
+                region = Region.DirectionalRegion(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=factor, numReads=totalWeight)
+            else:
+                region = Region.Region(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=factor, numReads=totalWeight)
 
 
-    if reportshift:
-        headerList.append("readShift")
+            if normalize:
+                region.numReads /= regionFinder.sampleRDSsize
 
 
-    if doPvalue:
-        headerList.append("pValue")
+            allregions.append(int(region.numReads))
+            regionLength = lastReadPos - region.start
+            if regionPassesCriteria(region.numReads, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen):
+                region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos,
+                                                useMulti, doControl, normalize)
 
 
-    headline = string.join(headerList, "\t")
-    print >> outfile, headline
+                if region.foldRatio >= regionFinder.minRatio:
+                    # first pass, with absolute numbers
+                    peak = findPeak(reads, region.start, regionLength, regionFinder.readlen, doWeight=True, leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
+                    if regionFinder.doTrim:
+                        try:
+                            lastReadPos = trimRegion(region, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.readlen, regionFinder.doDirectionality,
+                                                     normalize, regionFinder.sampleRDSsize)
+                        except IndexError:
+                            badRegion = True
+                            continue
 
 
-    statistics = {"index": 0,
-                  "total": 0,
-                  "mIndex": 0,
-                  "mTotal": 0,
-                  "failed": 0
-    }
+                        region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos,
+                                                        useMulti, doControl, normalize)
 
 
-    if minRatio < minPeak:
-        minPeak = minRatio
+                    # just in case it changed, use latest data
+                    try:
+                        bestPos = peak.topPos[0]
+                        peakScore = peak.smoothArray[bestPos]
+                        if normalize:
+                            peakScore /= regionFinder.sampleRDSsize
+                    except:
+                        continue
 
 
-    hitChromList = hitRDS.getChromosomes()
-    if doControl:
-        mockChromList = mockRDS.getChromosomes()
-    else:
-        mockChromList = []
+                    if listPeak:
+                        region.peakDescription= "%d\t%.1f" % (region.start + bestPos, peakScore)
 
 
-    mockSampleSize = 0
-    hitSampleSize = 0
-    if normalize:
-        if doControl:
-            mockSampleSize = mockRDSsize
-
-        hitSampleSize = hitRDSsize
-
-    hitChromList.sort()
-
-    for chromosome in hitChromList:
-        if doNotProcessChromosome(chromosome, doControl, mockChromList):
-            continue
-
-        print "chromosome %s" % (chromosome)
-        hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True,
-                                      doMulti=useMulti, findallOptimize=True, strand=stranded,
-                                      combine5p=combine5p)
-        maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
-        if shiftValue == "learn":
-            shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize,
-                                    mockSampleSize, minRatio, maxSpacing, maxCoord, stringency, readlen, minHits,
-                                    logfilename, outfile, outfilename)
-
-        regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize,
-                                                                  chromosome, useMulti, normalize, maxSpacing,
-                                                                  doDirectionality, doTrim, minHits, minRatio,
-                                                                  readlen, shiftValue, minPeak, minPlusRatio,
-                                                                  maxPlusRatio, leftPlusRatio, listPeak, noMulti,
-                                                                  doControl, factor, trimValue, outputRegionList=True)
-
-        statistics["index"] += regionStats["index"]
-        statistics["total"] += regionStats["total"]
-        statistics["failed"] += regionStats["failed"]
-        if not doRevBackground:
-            if doPvalue:
-                p, poissonmean = calculatePValue(allRegionWeights)
-
-            print headline
-            shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
-            continue
-
-        #now do background swapping the two samples around
-        if mockRDS is not None:
-            print "calculating background..."
-            backgroundTrimValue = 1/20.
-            backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
-                                                                           chromosome, useMulti, normalize, maxSpacing,
-                                                                           doDirectionality, doTrim, minHits, minRatio,
-                                                                           readlen, shiftValue, minPeak, minPlusRatio,
-                                                                           maxPlusRatio, leftPlusRatio, listPeak, noMulti,
-                                                                           doControl, factor, backgroundTrimValue)
-
-            statistics["mIndex"] += backgroundRegionStats["index"]
-            statistics["mTotal"] += backgroundRegionStats["total"]
-            statistics["failed"] += backgroundRegionStats["failed"]
-            print statistics["mIndex"], statistics["mTotal"]
-            if doPvalue:
-                if pValueType == "self":
-                    p, poissonmean = calculatePValue(allRegionWeights)
-                else:
-                    p, poissonmean = calculatePValue(backgroundRegionWeights)
-
-            print headline
-            shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
-
-    footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
-    print footer
-    outfile.write(footer)
-    outfile.close()
+                    if useMulti:
+                        setMultireadPercentage(region, hitRDS, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos, normalize, regionFinder.doTrim)
 
 
-    writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
+                    region.shift = peak.shift
+                    # check that we still pass threshold
+                    regionLength = lastReadPos - region.start
+                    plusRatio = float(peak.numPlus)/peak.numHits
+                    if regionAndPeakPass(region, regionFinder.minHits, regionFinder.minRatio, regionLength, regionFinder.readlen, peakScore, regionFinder.minPeak,
+                                         regionFinder.minPlusRatio, regionFinder.maxPlusRatio, plusRatio):
+                        try:
+                            updateRegion(region, regionFinder.doDirectionality, regionFinder.leftPlusRatio, peak.numLeftPlus, peak.numPlus, plusRatio)
+                            regionFinder.statistics["index"] += 1
+                            outregions.append(region)
+                            regionFinder.statistics["total"] += region.numReads
+                        except RegionDirectionError:
+                            regionFinder.statistics["failed"] += 1
+
+            readStartPositions = []
+            totalWeight = 0
+            uniqueReadCount = 0
+            reads = []
+            numStarts = 0
+            if badRegion:
+                badRegion = False
+                regionFinder.statistics["badRegionTrim"] += 1
 
 
+        if pos not in readStartPositions:
+            numStarts += 1
 
 
-def determineShiftValue(autoshift, shift, noshift, rnaSettings):
-    shiftValue = 0
-    if autoshift:
-        shiftValue = "auto"
+        readStartPositions.append(pos)
+        weight = read["weight"]
+        totalWeight += weight
+        if weight == 1.0:
+            uniqueReadCount += 1
 
 
-    if shift is not None:
-        try:
-            shiftValue = int(shift)
-        except ValueError:
-            if shift == "learn":
-                shiftValue = "learn"
-                print "Will try to learn shift"
+        reads.append({"start": pos, "sense": read["sense"], "weight": weight})
+        previousHit = pos
 
 
-    if noshift or rnaSettings:
-        shiftValue = 0
+    return allregions, outregions
 
 
-    return shiftValue
 
 
+def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normalize, useMulti, factor):
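+    # regions are called on the control reads and scored against the sample (roles swapped),
+    # so the footer can report an empirical FDR from these reverse-background counts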
 
 
-def doNotProcessChromosome(chromosome, doControl, mockChromList):
-    skipChromosome = False
-    if chromosome == "chrM":
-        skipChromosome = True
+    print "calculating background..."
+    previousHit = -1 * regionFinder.maxSpacing
+    currentHitList = [-1]
+    currentTotalWeight = 0
+    currentReadList = []
+    backregions = []
+    numStarts = 0
+    badRegion = False
+    hitDict = controlRDS.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, findallOptimize=True)
+    maxCoord = controlRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
+    for read in hitDict[chromosome]:
+        pos = read["start"]
+        if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
+            lastReadPos = currentHitList[-1]
+            lastBasePosition = lastReadPos + regionFinder.readlen - 1
+            region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=factor, numReads=currentTotalWeight)
+            if normalize:
+                region.numReads /= regionFinder.controlRDSsize
+
+            backregions.append(int(region.numReads))
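+            # rebuild the region so numReads holds the raw weight again; only the normalized copy goes into backregions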
+            region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=factor, numReads=currentTotalWeight)
+            regionLength = lastReadPos - region.start
+            if regionPassesCriteria(region.numReads, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen):
+                numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+                if normalize:
+                    numMock /= regionFinder.sampleRDSsize
+
+                foldRatio = region.numReads / numMock
+                if foldRatio >= regionFinder.minRatio:
+                    # first pass, with absolute numbers
+                    peak = findPeak(currentReadList, region.start, lastReadPos - region.start, regionFinder.readlen, doWeight=True,
+                                    leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
 
 
-    if doControl and (chromosome not in mockChromList):
-        skipChromosome = True
+                    if regionFinder.doTrim:
+                        try:
+                            lastReadPos = trimRegion(region, peak, lastReadPos, 1/20., currentReadList, regionFinder.readlen, regionFinder.doDirectionality,
+                                                     normalize, regionFinder.controlRDSsize)
+                        except IndexError:
+                            badRegion = True
+                            continue
 
 
-    return skipChromosome
+                        numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+                        if normalize:
+                            numMock /= regionFinder.sampleRDSsize
 
 
+                        foldRatio = region.numReads / numMock
 
 
-def calculatePValue(dataList):
-    dataList.sort()
-    listSize = float(len(dataList))
-    try:
-        poissonmean = sum(dataList) / listSize
-    except ZeroDivisionError:
-        poissonmean = 0
+                    # just in case it changed, use latest data
+                    try:
+                        bestPos = peak.topPos[0]
+                        peakScore = peak.smoothArray[bestPos]
+                    except IndexError:
+                        continue
 
 
-    print "Poisson n=%d, p=%f" % (listSize, poissonmean)
-    p = math.exp(-poissonmean)
+                    # normalize to RPM
+                    if normalize:
+                        peakScore /= regionFinder.controlRDSsize
+
+                    # check that we still pass threshold
+                    regionLength = lastReadPos - region.start
+                    if regionPassesCriteria(region.numReads, regionFinder.minHits, foldRatio, regionFinder.minRatio, regionLength, regionFinder.readlen):
+                        updateControlStatistics(regionFinder, peak, region.numReads, peakScore)
+
+            currentHitList = []
+            currentTotalWeight = 0
+            currentReadList = []
+            numStarts = 0
+            if badRegion:
+                badRegion = False
+                regionFinder.statistics["badRegionTrim"] += 1
+
+        if pos not in currentHitList:
+            numStarts += 1
+
+        currentHitList.append(pos)
+        weight = read["weight"]
+        currentTotalWeight += weight
+        currentReadList.append({"start": pos, "sense": read["sense"], "weight": weight})
+        previousHit = pos
 
 
-    return p, poissonmean
+    return backregions
 
 
 
 
-def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
-               stringency, readlen, minHits, logfilename, outfile, outfilename, minSites=30):
+def learnShift(regionFinder, hitDict, controlRDS, chromosome, maxCoord, logfilename, outfilename,
+               outfile, numStarts, stringency, normalize, useMulti, doControl):
 
 
-    print "learning shift.... will need at least %d training sites" % minSites
-    previousHit = -1 * maxSpacing
-    hitList = [-1]
+    print "learning shift.... will need at least 30 training sites"
+    previousHit = -1 * regionFinder.maxSpacing
+    positionList = [-1]
     totalWeight = 0
     readList = []
     shiftDict = {}
     count = 0
-    numStarts = 0
-    for read in hitDict[chrom]:
+    for read in hitDict[chromosome]:
         pos = read["start"]
-        sense = read["sense"]
-        weight = read["weight"]
-        if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
-            sumAll = totalWeight
+        if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
             if normalize:
-                sumAll /= hitSampleSize
+                totalWeight /= regionFinder.sampleRDSsize
 
 
-            regionStart = hitList[0]
-            regionStop = hitList[-1]
+            regionStart = positionList[0]
+            regionStop = positionList[-1]
             regionLength = regionStop - regionStart
-            # we're going to require stringent settings
-            if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen:
-                foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
-
-                if foldRatio >= minRatio:
-                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True)
-                    try:
-                        shiftDict[localshift] += 1
-                    except KeyError:
-                        shiftDict[localshift] = 1
-
+            if regionPassesCriteria(totalWeight, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen, stringency=stringency):
+                foldRatio = getFoldRatio(regionFinder, controlRDS, totalWeight, chromosome, regionStart, regionStop, useMulti, doControl, normalize)
+                if foldRatio >= regionFinder.minRatio:
+                    updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
                     count += 1
 
-            hitList = []
+            positionList = []
             totalWeight = 0
             readList = []
-            numStarts = 0
 
 
-        if pos not in hitList:
+        if pos not in positionList:
             numStarts += 1
 
-        hitList.append(pos)
+        positionList.append(pos)
+        weight = read["weight"]
         totalWeight += weight
-        readList.append({"start": pos, "sense": sense, "weight": weight})
+        readList.append({"start": pos, "sense": read["sense"], "weight": weight})
         previousHit = pos
 
-    bestShift = 0
-    bestCount = 0
-    learningSettings = ["#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d" % (stringency, stringency * minHits,
-                                                                                                       stringency * minRatio, stringency * readlen),
-                        "#number of training examples: %d" % count]
-    outline = string.join(learningSettings, "\n")
+    outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
+                                                                                                                               stringency * regionFinder.minHits,
+                                                                                                                               stringency * regionFinder.minRatio,
+                                                                                                                               stringency * regionFinder.readlen,
+                                                                                                                               count)
+
     print outline
-    writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
-    if count < minSites:
+    writeLog(logfilename, versionString, outfilename + outline)
+    regionFinder.shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
+    outline = "#picked shiftValue to be %d" % regionFinder.shiftValue
+    print outline
+    print >> outfile, outline
+    writeLog(logfilename, versionString, outfilename + outline)
+
+
+def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
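+    # a region is closed once the next read is more than maxSpacing away, or we hit the last coordinate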
+    return abs(pos - previousHit) > maxSpacing or pos == maxCoord
+
+
+def regionPassesCriteria(sumAll, minHits, numStarts, minRatio, regionLength, readlen, stringency=1):
+    return sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen
+
+
+def trimRegion(region, peak, regionStop, trimValue, currentReadList, readlen, doDirectionality, normalize, hitRDSsize):
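+    # walk each region edge in toward the peak until the smoothed signal reaches
+    # trimValue * peak height, then re-run findPeak over the trimmed span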
+    bestPos = peak.topPos[0]
+    peakScore = peak.smoothArray[bestPos]
+    if normalize:
+        peakScore /= hitRDSsize
+
+    minSignalThresh = trimValue * peakScore
+    start = findStartEdgePosition(peak, minSignalThresh)
+    regionEndPoint = regionStop - region.start - 1
+    stop = findStopEdgePosition(peak, regionEndPoint, minSignalThresh)
+
+    regionStop = region.start + stop
+    region.start += start
+    trimmedPeak = findPeak(currentReadList, region.start, regionStop - region.start, readlen, doWeight=True, leftPlus=doDirectionality, shift=peak.shift)
+    peak.numPlus = trimmedPeak.numPlus
+    peak.numLeftPlus = trimmedPeak.numLeftPlus
+    peak.topPos = trimmedPeak.topPos
+    peak.smoothArray = trimmedPeak.smoothArray
+
+    region.numReads = trimmedPeak.numHits
+    if normalize:
+        region.numReads /= hitRDSsize
+
+    region.stop = regionStop + readlen - 1
+
+    return regionStop
+
+
+def findStartEdgePosition(peak, minSignalThresh):
+    start = 0
+    while not peakEdgeLocated(peak, start, minSignalThresh):
+        start += 1
+
+    return start
+
+
+def findStopEdgePosition(peak, stop, minSignalThresh):
+    while not peakEdgeLocated(peak, stop, minSignalThresh):
+        stop -= 1
+
+    return stop
+
+
+def peakEdgeLocated(peak, position, minSignalThresh):
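+    # an edge is located where the smoothed signal first meets the threshold, or at the summit itself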
+    return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
+
+
+def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regionStop, useMulti, doControl, normalize):
+    if doControl:
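+        # the added 1. is a pseudo-count so the ratio stays finite when the control region is empty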
+        numMock = 1. + controlRDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+        if normalize:
+            numMock /= regionFinder.controlRDSsize
+
+        foldRatio = sumAll / numMock
+    else:
+        foldRatio = regionFinder.minRatio
+
+    return foldRatio
+
+
+def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen):
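+    # tally the auto-detected shift for this region; the mode over all regions becomes the learned shift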
+    peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto")
+    try:
+        shiftDict[peak.shift] += 1
+    except KeyError:
+        shiftDict[peak.shift] = 1
+
+
+def getShiftValue(shiftDict, count, logfilename, outfilename):
+    if count < 30:
         outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
-        print >> outfile, outline
-        writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
+        print outline
+        writeLog(logfilename, versionString, outfilename + outline)
         shiftValue = 0
     else:
-        for shift in sorted(shiftDict):
-            if shiftDict[shift] > bestCount:
-                bestShift = shift
-                bestCount = shiftDict[shift]
-
-        shiftValue = bestShift
+        shiftValue = getBestShiftInDict(shiftDict)
         print shiftDict
 
-    outline = "#picked shiftValue to be %d" % shiftValue
-    print outline
-    print >> outfile, outline
-    writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
-
     return shiftValue
 
 
-def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
-    if doControl and rds is not None:
-        foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
+def updateControlStatistics(regionFinder, peak, sumAll, peakScore):
+
+    plusRatio = float(peak.numPlus)/peak.numHits
+    if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio:
+        if regionFinder.doDirectionality:
+            if regionFinder.leftPlusRatio < peak.numLeftPlus / peak.numPlus:
+                regionFinder.statistics["mIndex"] += 1
+                regionFinder.statistics["mTotal"] += sumAll
+            else:
+                regionFinder.statistics["failed"] += 1
+        else:
+            # we have a region, but didn't check for directionality
+            regionFinder.statistics["mIndex"] += 1
+            regionFinder.statistics["mTotal"] += sumAll
+
+
+def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRatio, multiP,
+              peakDescription, shift, doDirectionality, leftPlusRatio, numLeft,
+              numPlus, plusRatio):
+
+    if doDirectionality:
+        if leftPlusRatio < numLeft / numPlus:
+            plusP = plusRatio * 100.
+            leftP = 100. * numLeft / numPlus
+            # we have a region that passes all criteria
+            region = Region.DirectionalRegion(regionStart, regionStop,
+                                              factor, index, chromosome, sumAll,
+                                              foldRatio, multiP, plusP, leftP,
+                                              peakDescription, shift)
+
+        else:
+            raise RegionDirectionError
     else:
-        foldRatio = minRatio
+        # we have a region, but didn't check for directionality
+        region = Region.Region(regionStart, regionStop, factor, index, chromosome,
+                               sumAll, foldRatio, multiP, peakDescription, shift)
 
 
-    return foldRatio
+    return region
 
 
 
 
-def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll):
-    numMock = 1. + rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+def setMultireadPercentage(region, hitRDS, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
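+    # record what percentage of the region's reads are multireads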
+    if doTrim:
+        sumMulti = hitRDS.getMultiCount(chromosome, region.start, lastReadPos)
+    else:
+        sumMulti = currentTotalWeight - currentUniqueCount
+
+    # normalize to RPM
     if normalize:
-        numMock /= sampleSize
+        sumMulti /= hitRDSsize
 
 
-    foldRatio = sumAll / numMock
+    try:
+        multiP = 100. * (sumMulti / region.numReads)
+    except ZeroDivisionError:
+        return
 
 
-    return foldRatio
+    region.multiP = multiP
 
 
 
 
-def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
-                normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
-                shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
-                noMulti, doControl, factor, trimValue, outputRegionList=False):
+def regionAndPeakPass(region, minHits, minRatio, regionLength, readlen, peakScore, minPeak, minPlusRatio, maxPlusRatio, plusRatio):
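+    # the region must meet the read count, fold ratio, and length criteria, and its peak must clear the height and plus-strand ratio thresholds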
+    regionPasses = False
+    if regionPassesCriteria(region.numReads, minHits, region.foldRatio, minRatio, regionLength, readlen):
+        if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
+            regionPasses = True
 
 
-    index = 0
-    totalRegionWeight = 0
-    failedCounter = 0
-    previousHit = - 1 * maxSpacing
-    currentHitList = [-1]
-    currentTotalWeight = 0
-    currentUniqReadCount = 0
-    currentReadList = []
-    regionWeights = []
-    outregions = []
-    numStarts = 0
-    hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
-    maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
-    for read in hitDict[chrom]:
-        pos = read["start"]
-        sense = read["sense"]
-        weight = read["weight"]
-        if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
-            sumAll = currentTotalWeight
-            if normalizeToRPM:
-                sumAll /= rdsSampleSize
-
-            regionStart = currentHitList[0]
-            regionStop = currentHitList[-1]
-            regionWeights.append(int(sumAll))
-            if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
-                sumMulti = 0.
-                foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
-                if foldRatio >= minRatio:
-                    # first pass, with absolute numbers
-                    peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
-
-                    bestPos = peak.topPos[0]
-                    numHits = peak.numHits
-                    peakScore = peak.smoothArray[bestPos]
-                    numPlus = peak.numPlus
-                    shift = peak.shift
-                    numLeft = peak.numLeftPlus
-                    if normalizeToRPM:
-                        peakScore /= rdsSampleSize
-
-                    if doTrim:
-                        minSignalThresh = trimValue * peakScore
-                        start = 0
-                        stop = regionStop - regionStart - 1
-                        startFound = False
-                        while not startFound:
-                            if peak.smoothArray[start] >= minSignalThresh or start == bestPos:
-                                startFound = True
-                            else:
-                                start += 1
-
-                        stopFound = False
-                        while not stopFound:
-                            if peak.smoothArray[stop] >= minSignalThresh or stop == bestPos:
-                                stopFound = True
-                            else:
-                                stop -= 1
-
-                        regionStop = regionStart + stop
-                        regionStart += start
-                        trimPeak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
-
-                        sumAll = trimPeak.numHits
-                        numPlus = trimPeak.numPlus
-                        numLeft = trimPeak.numLeftPlus
-                        if normalizeToRPM:
-                            sumAll /= rdsSampleSize
-
-                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
-                        if outputRegionList:
-                            sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
-                        # just in case it changed, use latest data
-                        try:
-                            bestPos = trimPeak.topPos[0]
-                            peakScore = trimPeak.smoothArray[bestPos]
-                        except:
-                            continue
+    return regionPasses
 
 
-                        if normalizeToRPM:
-                            peakScore /= rdsSampleSize
 
 
-                    elif outputRegionList:
-                        sumMulti = currentTotalWeight - currentUniqReadCount
+def updateRegion(region,
+                 doDirectionality, leftPlusRatio, numLeft,
+                 numPlus, plusRatio):
 
 
-                    if outputRegionList:
-                        if normalizeToRPM:
-                            sumMulti /= rdsSampleSize
+    if doDirectionality:
+        if leftPlusRatio < numLeft / numPlus:
+            region.plusP = plusRatio * 100.
+            region.leftP = 100. * numLeft / numPlus
+        else:
+            raise RegionDirectionError
 
 
-                        try:
-                            multiP = 100. * (sumMulti / sumAll)
-                        except:
-                            break
 
 
-                        if noMulti:
-                            multiP = 0.
+def writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                                allregions, header):
 
 
-                    # check that we still pass threshold
-                    if sumAll >= minHits and  foldRatio >= minRatio and (regionStop - regionStart) > readlen:
-                        plusRatio = float(numPlus)/numHits
-                        if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
-                            if outputRegionList:
-                                peakDescription = ""
-                                if listPeak:
-                                    peakDescription = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
-
-                            if doDirectionality:
-                                if leftPlusRatio < numLeft / numPlus:
-                                    index += 1
-                                    if outputRegionList:
-                                        plusP = plusRatio * 100.
-                                        leftP = 100. * numLeft / numPlus
-                                        # we have a region that passes all criteria
-                                        region = Region.DirectionalRegion(regionStart, regionStop + readlen - 1,
-                                                                          factor, index, chrom, sumAll,
-                                                                          foldRatio, multiP, plusP, leftP,
-                                                                          peakDescription, shift)
-                                        outregions.append(region)
-
-                                    totalRegionWeight += sumAll
-                                else:
-                                    failedCounter += 1
-                            else:
-                                # we have a region, but didn't check for directionality
-                                index += 1
-                                totalRegionWeight += sumAll
-                                if outputRegionList:
-                                    region = Region.Region(regionStart, regionStop + readlen - 1, factor, index, chrom,
-                                                           sumAll, foldRatio, multiP, peakDescription, shift)
-                                    outregions.append(region)
+    writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                           allregions, header, backregions=[], pValueType="self")
 
 
-            currentHitList = []
-            currentTotalWeight = 0
-            currentUniqReadCount = 0
-            currentReadList = []
-            numStarts = 0
 
 
-        if pos not in currentHitList:
-            numStarts += 1
+def writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                           allregions, header, backregions=[], pValueType="none"):
 
 
-        currentHitList.append(pos)
-        currentTotalWeight += weight
-        if weight == 1.0:
-            currentUniqReadCount += 1
+    print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"]
+    poissonmean = 0  # only used when doPvalue is set; default avoids a NameError below
+    if doPvalue:
+        if pValueType == "self":
+            poissonmean = calculatePoissonMean(allregions)
+        else:
+            poissonmean = calculatePoissonMean(backregions)
 
 
-        currentReadList.append({"start": pos, "sense": sense, "weight": weight})
-        previousHit = pos
+    print header
+    writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=regionFinder.shiftValue, reportshift=reportshift, shiftDict=shiftDict)
 
 
-    statistics = {"index": index,
-                  "total": totalRegionWeight,
-                  "failed": failedCounter
-    }
 
 
-    if outputRegionList:
-        return statistics, regionWeights, outregions
-    else:
-        return statistics, regionWeights
+def calculatePoissonMean(dataList):
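+    # mean of the per-region read counts, used as the Poisson rate when assigning region p-values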
+    dataList.sort()
+    listSize = float(len(dataList))
+    try:
+        poissonmean = sum(dataList) / listSize
+    except ZeroDivisionError:
+        poissonmean = 0
 
 
+    print "Poisson n=%d, p=%f" % (listSize, poissonmean)
 
 
-def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue):
-    bestShift = 0
-    shiftDict = {}
+    return poissonmean
+
+
+def writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=0, reportshift=False, shiftDict=None):
+    # avoid a mutable default argument; the dict is updated in place when shifts are reported
+    if shiftDict is None:
+        shiftDict = {}
     for region in outregions:
-        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
-        if reportshift:
-            outputList = [region.printRegionWithShift()]
-            if shiftValue == "auto":
-                try:
-                    shiftDict[region.shift] += 1
-                except KeyError:
-                    shiftDict[region.shift] = 1
-        else:
-            outputList = [region.printRegion()]
+        if shiftValue == "auto" and reportshift:
+            try:
+                shiftDict[region.shift] += 1
+            except KeyError:
+                shiftDict[region.shift] = 1
+
+        outline = getRegionString(region, reportshift)
 
         # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
         if doPvalue:
             sumAll = int(region.numReads)
-            for i in xrange(sumAll):
-                pValue *= poissonmean
-                pValue /= i+1
-
-            outputList.append("%1.2g" % pValue)
+            pValue = calculatePValue(sumAll, poissonmean)
+            outline += "\t%1.2g" % pValue
 
 
-        outline = string.join(outputList, "\t")
         print outline
         print >> outfile, outline
 
-    bestCount = 0
-    for shift in sorted(shiftDict):
-        if shiftDict[shift] > bestCount:
-            bestShift = shift
-            bestCount = shiftDict[shift]
 
 
-    return bestShift
+def calculatePValue(numReads, poissonmean):
+    # builds the Poisson probability of observing exactly numReads reads
+    # (poissonmean**n * exp(-poissonmean) / n!) iteratively
+    pValue = math.exp(-poissonmean)
+    # cast to int: normalized read counts can be floats, which xrange rejects with a DeprecationWarning
+    for i in xrange(int(numReads)):
+        pValue *= poissonmean
+        pValue /= i+1
 
 
+    return pValue
+
+
+def getRegionString(region, reportShift):
+    if reportShift:
+        outline = region.printRegionWithShift()
+    else:
+        outline = region.printRegion()
+
+    return outline
 
 
-def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue):
-    footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])]
-    if doDirectionality:
-        footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"])
+
+def getFooter(regionFinder, shiftDict, reportshift, doRevBackground):
+    index = regionFinder.statistics["index"]
+    mIndex = regionFinder.statistics["mIndex"]
+    footerLines = ["#stats:\t%.1f RPM in %d regions" % (regionFinder.statistics["total"], index)]
+    if regionFinder.doDirectionality:
+        footerLines.append("#\t\t%d additional regions failed directionality filter" % regionFinder.statistics["failed"])
 
     if doRevBackground:
         try:
-            percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100)
-        except (ValueError, ZeroDivisionError):
+            percent = min(100. * (float(mIndex)/index), 100.)
+        except ZeroDivisionError:
             percent = 0.
 
             percent = 0.
 
-        footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent))
+        footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, regionFinder.statistics["mTotal"], percent))
+
+    if regionFinder.shiftValue == "auto" and reportshift:
+        bestShift = getBestShiftInDict(shiftDict)
+        footerLines.append("#mode of shift values: %d" % bestShift)
+
+    if regionFinder.statistics["badRegionTrim"] > 0:
+        footerLines.append("#%d regions discarded due to trimming problems" % regionFinder.statistics["badRegionTrim"])
+
+    return string.join(footerLines, "\n")
 
 
-    if shiftValue == "auto" and reportshift:
-        footerList.append("#mode of shift values: %d" % shiftModeValue)
 
 
-    footer = string.join(footerList, "\n")
+def getBestShiftInDict(shiftDict):
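+    # return the shift value observed most often (the mode of the tally)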
+    return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0]
 
 
-    return footer
 
 if __name__ == "__main__":
     main(sys.argv)
index 49745dc0c1361c2a8dbe4736c9c8630396fd93e4..03cade292225b55513b300a0fca2d8f2c4f5d50a 100755 (executable)
@@ -138,7 +138,7 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins
             try:
                 geneinfo = geneinfoDict[gid]
                 symbol = geneinfo[0][0]
-            except KeyError:
+            except (KeyError, IndexError):
                 pass
         else:
             symbol = gid
@@ -154,6 +154,7 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins
                 try:
                     normalizedValue = 100. * binAmount / tagCount
                 except ZeroDivisionError:
+                    #TODO: QForALi - this is the right way to refactor the original code, but I don't think this is the right answer
                     normalizedValue = 100. * binAmount
 
                 binAmount = normalizedValue
@@ -167,4 +168,4 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins
 
 
 if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
\ No newline at end of file
index 5299d27d26c23f8729063eb31f0da390730b5ce5..74e7a0cd817e699af6b398960578f576a030f7e4 100755 (executable)
@@ -68,6 +68,12 @@ def makeParser(usage=""):
     return parser
 
 
+#TODO: Reported user performance issue. Long run times in conditions:
+#    small number of reads ~40-50M
+#    all features on single chromosome
+#
+#    User states this has been a long-standing problem.
+
 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
@@ -200,6 +206,7 @@ def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict,
     for line in uniquecounts:
         fields = line.strip().split()
         # add a pseudo-count here to ease calculations below
+        #TODO: figure out why this was done in prior implementation...
         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
 
     uniquecounts.close()
@@ -264,4 +271,4 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
 
 
 if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
\ No newline at end of file
index e95d72693c76df13871da4cf809c99e43c41d36a..4091281c8fe0f5410090b23db7276496fe49d6ed 100755 (executable)
@@ -68,6 +68,7 @@ def doNotProcessLine(line):
 
 def getNovelSNPInfo(genome, snpEntry, hg):
     fields = snpEntry.split()
+    #TODO: refactor naming. is fields[8] rpkm?
     if fields[8].find("N\A") == -1: 
         return snpEntry
     else:
@@ -92,4 +93,4 @@ def getNovelSNPInfo(genome, snpEntry, hg):
 
 
 if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
\ No newline at end of file
index f6071ae444be9ca294ac4c4ff39b50a74e3d0cb4..f5826751fdecdc3643967ea8e04df8331fbf94e3 100755 (executable)
@@ -75,7 +75,7 @@ def makeParser(usage=""):
     forceChr = getConfigBoolOption(configParser, section, "forceChr", False)
     cachePages = getConfigIntOption(configParser, section, "cachePages", 0)
 
-    parser.set_defaults(doSplices=True, forceChr=False, cachePages=0)
+    parser.set_defaults(doSplices=doSplices, forceChr=forceChr, cachePages=cachePages)
 
     return parser
 
index ea347e4aa44f1a21a0b7c54d11b407352ce91b6f..acae47d68c896d14cf7b894e63adac2552483291 100755 (executable)
@@ -36,7 +36,7 @@ def main(argv=None):
     outfilename = args[2]
 
     getfasta(genome, regionfile, outfilename, options.seqsize, options.minHitThresh,
-             options.topRegions, options.maxsize, options.usePeaks, options.hitFile,
+             options.topRegions, options.maxsize, options.usePeaks, options.hitfile,
              options.doCache, options.doCompact)
 
 