Rewrite of findall.py to clean code.  Configuration tested using erange3.2 as basis.  Other minor code cleanups.

author     Sean Upchurch <sau@caltech.edu>
           Mon, 13 Jun 2011 17:21:56 +0000 (10:21 -0700)
committer  Sean Upchurch <sau@caltech.edu>
           Mon, 13 Jun 2011 17:21:56 +0000 (10:21 -0700)

ReadDataset.py
analyzego.py
commoncode.py
findall.py
geneLocusBins.py
geneMrnaCountsWeighted.py
getNovelSNPs.py
getSNPs.py
getfasta.py

ReadDataset.py
index 850a5ec602a7b6d7a9a22ac98c4bf3eef4b873e5..c9d2a0bf2b4eb56e0fa906be2319fcac866c30b3 100644 (file)
@@ -207,15 +207,27 @@ class ReadDataset():
         return sql
 
 
+    def getMemCursor(self):
+        """ returns a cursor to memory database for low-level (SQL)
+        access to the data.
+        """
+        return self.memcon.cursor()
+
+
+    def getFileCursor(self):
+        """ returns a cursor to file database for low-level (SQL)
+        access to the data.
+        """
+        return self.dbcon.cursor()
+
+
     def hasIndex(self):
-        """ check whether the RDS file has at least one index.
+        """ return True if the RDS file has at least one index.
         """
         stmt = "select count(*) from sqlite_master where type='index'"
         count = int(self.execute(stmt, returnResults=True)[0][0])
-        if count > 0:
-            return True
 
 
-        return False
+        return count > 0
 
 
     def initializeTables(self, dbConnection, cache=100000):
@@ -237,20 +249,6 @@ class ReadDataset():
         dbConnection.commit()
 
 
-    def getFileCursor(self):
-        """ returns a cursor to file database for low-level (SQL)
-        access to the data.
-        """
-        return self.dbcon.cursor()
-
-
-    def getMemCursor(self):
-        """ returns a cursor to memory database for low-level (SQL)
-        access to the data.
-        """
-        return self.memcon.cursor()
-
-
     def getMetadata(self, valueName=""):
         """ returns a dictionary of metadata.
         """
@@ -309,7 +307,7 @@ class ReadDataset():
 
 
     def getChromosomes(self, table="uniqs", fullChrom=True):
-        """ returns a list of distinct chromosomes in table.
+        """ returns a sorted list of distinct chromosomes in table.
         """
         statement = "select distinct chrom from %s" % table
         sql = self.getSqlCursor()
@@ -330,7 +328,7 @@ class ReadDataset():
         return results
 
 
-    def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
+    def getMaxCoordinate(self, chrom, doUniqs=True,
                          doMulti=False, doSplices=False):
         """ returns the maximum coordinate for reads on a given chromosome.
         """
@@ -347,9 +345,6 @@ class ReadDataset():
             multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
             maxCoord = max(multiMax, maxCoord)
 
-        if verbose:
-            print "%s maxCoord: %d" % (chrom, maxCoord)
-
         return maxCoord
 
 
@@ -375,9 +370,9 @@ class ReadDataset():
         and which can be restricted by chromosome or custom-flag.
         Returns unique reads by default, but can return multireads
         with doMulti set to True.
-        
-        Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
+
         """
+        #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
 
         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
         if findallOptimize:
@@ -421,27 +416,16 @@ class ReadDataset():
         if findallOptimize:
             if self.memBacked:
                 self.memcon.row_factory = None
-                sql = self.memcon.cursor()
             else:
                 self.dbcon.row_factory = None
-                sql = self.dbcon.cursor()
 
             stmt.append("order by start")
         elif readIDDict:
-            if self.memBacked:
-                sql = self.memcon.cursor()
-            else:
-                sql = self.dbcon.cursor()
-
             stmt.append("order by readID, start")
         else:
-            if self.memBacked:
-                sql = self.memcon.cursor()
-            else:
-                sql = self.dbcon.cursor()
-
             stmt.append("order by chrom, start")
 
+        sql = self.getSqlCursor()
         sqlQuery = string.join(stmt)
         sql.execute(sqlQuery)
 
@@ -602,10 +586,7 @@ class ReadDataset():
         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
         selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
         selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
+        sql = self.getSqlCursor()
 
         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
         sql.execute(stmt)
@@ -718,7 +699,7 @@ class ReadDataset():
 
 
     def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
-        """ returns the number of row in the uniqs table.
+        """ returns the number of rows in the specified table.
         """
         whereClause = []
         count = 0
@@ -741,10 +722,7 @@ class ReadDataset():
         else:
             whereQuery = ""
 
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
+        sql = self.getSqlCursor()
 
         if distinct:
             sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
@@ -803,11 +781,7 @@ class ReadDataset():
             limitPart = "LIMIT %d" % limit
 
         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
-        if self.memBacked:
-            sql = self.memcon.cursor()
-        else:
-            sql = self.dbcon.cursor()
-
+        sql = self.getSqlCursor()
         sql.execute(sqlQuery)
         result = sql.fetchall()
 
analyzego.py
index 6542b1bf24c256e5907c799c785b0ab060fb9eaf..f8ff0017f9b51401411423b9ff8d5420b0714409 100755 (executable)
@@ -6,7 +6,7 @@ except:
 
 import sys
 import optparse
-from commoncode import getGeneInfoDict, getConfigParser, getConfigOption
+from commoncode import getGeneInfoDict, getConfigParser, getConfigOption, getConfigIntOption
 from cistematic.cisstat.analyzego import calculateGOStats
 
 print "analyzego: version 2.2"
@@ -35,7 +35,7 @@ def main(argv=None):
     infilename = args[1]
     prefix = args[2]
 
-    analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID)
+    analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID, numTests=options.numTests)
 
 
 def makeParser(usage=""):
@@ -44,24 +44,27 @@ def makeParser(usage=""):
                       help="translate gene")
     parser.add_option("--field", type="int", dest="fieldID",
                       help="column containing gene ID/Name")
+    parser.add_option("--trials", type="int", dest="numTests",
+                      help="number of trials")
 
     configParser = getConfigParser()
     section = "analyzego"
     translateGene = getConfigOption(configParser, section, "translateGene", False)
     fieldID = getConfigOption(configParser, section, "fieldID", None)
+    numTests = getConfigIntOption(configParser, section, "numTests", 1)
 
 
-    parser.set_defaults(translateGene=translateGene, fieldID=fieldID)
+    parser.set_defaults(translateGene=translateGene, fieldID=fieldID, numTests=numTests)
 
     return parser
 
 
-def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1):
+def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1, numTests=1):
     infile = open(infilename)
-    analyzeGO(genome, infile, prefix, translateGene=False, fieldID=1)
+    analyzeGO(genome, infile, prefix, translateGene, fieldID, numTests)
     infile.close()
 
 
-def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1):
+def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1, numTests=1):
     if translateGene:
         symbolToGidDict = getSymbolDict(genome)
 
@@ -88,7 +91,7 @@ def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1):
             locusList.append((genome, gID))
 
     if len(locusList) > 0:
-        calculateGOStats(locusList, prefix)
+        calculateGOStats(locusList, prefix, trials=numTests)
 
 
 def getSymbolDict(genome):
commoncode.py
index f526daacca0dc91245e71205363216bc17eba2e9..6ef4edfcb3e10d75ae0eecea887f45581f9e4edb 100755 (executable)
@@ -1,8 +1,3 @@
-#
-#  commoncode.py
-#  ENRAGE
-#
-
 import ConfigParser
 import os
 import string
@@ -192,6 +187,11 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
     mergeCount = 0
     chromField = int(chromField)
     count = 0
+    #TODO: The current algorithm processes the input file line by line and compares each entry with prior lines,
+    #      but it exits at the first merge.  That is fine when the input is sorted by start position.  However,
+    #      for 3 regions ABC that appear in the input as A, C, B: when C is processed it cannot merge with A
+    #      because B is needed to bridge the two, and when B is later processed it merges with A but the loop
+    #      exits, so the merge with C is missed.
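+    #      Illustration (hypothetical coordinates): with maxDist=100 and A=(100, 200), B=(250, 450), C=(500, 600)
+    #      listed in the order A, C, B, region C is too far from A to merge directly; B then merges into A and
+    #      the loop exits, so the merged A+B region is never extended to cover C.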
     for regionEntry in regionList:
         if regionEntry[0] == "#":
             if "pvalue" in regionEntry:
@@ -372,10 +372,11 @@ def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75)
         for read in readList:
             currentpos = read["start"] - start
             if read["sense"] == "+":
-                currentpos += testShift
+                senseModifier = 1
             else:
-                currentpos -= testShift
+                senseModifier = -1
 
 
+            currentpos += testShift * senseModifier
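+            # i.e. reads on the plus strand are shifted downstream and reads on the minus strand upstream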
             if (currentpos < 1) or (currentpos >= length):
                 continue
 
@@ -384,16 +385,12 @@ def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75)
             else:
                 weight = 1.0
 
-            if read["sense"] == "+":
-                shiftArray[currentpos] += weight
-            else:
-                shiftArray[currentpos] -= weight
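+            # plus reads add and minus reads subtract, so a well-chosen shift cancels the two strands at each position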
+            shiftArray[currentpos] += weight * senseModifier
 
         currentScore = 0
         for score in shiftArray:
             currentScore += abs(score)
 
-        print currentScore
         if currentScore < lowestScore:
             bestShift = testShift
             lowestScore = currentScore
@@ -494,40 +491,8 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
         if restrictGID and gid not in restrictList:
             continue
 
-        featureList = featuresDict[gid]
-        newFeatureList = []
-        isPseudo = False
-        for (ftype, chrom, start, stop, sense) in featureList:
-            if ftype == "PSEUDO":
-                isPseudo = True
-
-            if (start, stop, ftype) not in newFeatureList:
-                notContained = True
-                containedList = []
-                for (fstart, fstop, ftype2) in newFeatureList:
-                    if start >= fstart and stop <= fstop:
-                        notContained = False
-
-                    if start < fstart and stop > fstop:
-                        containedList.append((fstart, fstop))
-
-                if len(containedList) > 0:
-                    newFList = []
-                    notContained = True
-                    for (fstart, fstop, ftype2) in newFeatureList:
-                        if (fstart, fstop) not in containedList:
-                            newFList.append((fstart, fstop, ftype2))
-                            if start >= fstart and stop <= fstop:
-                                notContained = False
-
-                    newFeatureList = newFList
-                if notContained:
-                    newFeatureList.append((start, stop, ftype))
-
-        if ignorePseudo and isPseudo:
-            continue
-
-        if chrom not in featuresByChromDict:
+        newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid], ignorePseudo)
+        if newFeatureList and chrom not in featuresByChromDict:
             featuresByChromDict[chrom] = []
 
         for (start, stop, ftype) in newFeatureList:
@@ -560,6 +525,39 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=
         return featuresByChromDict
 
 
+def getNewFeatureList(featureList, ignorePseudo=False):
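+    """ collapse a gene's feature list: drop duplicate (start, stop, type) entries and any feature fully
+        contained within another feature; returns ([], chrom, sense) for pseudogenes when ignorePseudo is True.
+    """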
+    newFeatureList = []
+    for (featureType, chrom, start, stop, sense) in featureList:
+        if featureType == "PSEUDO" and ignorePseudo:
+            return [], chrom, sense
+
+        if (start, stop, featureType) not in newFeatureList:
+            notContained = True
+            containedList = []
+            for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
+                if start >= newFeatureStart and stop <= newFeatureStop:
+                    notContained = False
+
+                if start < newFeatureStart and stop > newFeatureStop:
+                    containedList.append((newFeatureStart, newFeatureStop))
+
+            if len(containedList) > 0:
+                newFList = []
+                notContained = True
+                for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList:
+                    if (newFeatureStart, newFeatureStop) not in containedList:
+                        newFList.append((newFeatureStart, newFeatureStop, featureType2))
+                        if start >= newFeatureStart and stop <= newFeatureStop:
+                            notContained = False
+
+                newFeatureList = newFList
+
+            if notContained:
+                newFeatureList.append((start, stop, featureType))
+
+    return newFeatureList, chrom, sense
+
+
 def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
                         additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False,
                         lengthCDS=0, keepSense=False, adjustToNeighbor=True):
@@ -573,23 +571,7 @@ def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
     lengthCDS < 0bp, return only the last X bp from CDS.
     """ 
     locusByChromDict = {}
-    if upstream == 0 and downstream == 0 and not useCDS:
-        print "getLocusByChromDict: asked for no sequence - returning empty dict"
-        return locusByChromDict
-    elif upstream > 0 and downstream > 0 and not useCDS:
-        print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS != 0 and not useCDS:
-        print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
-        return locusByChromDict
-    elif upstreamSpanTSS and lengthCDS != 0:
-        print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS > 0 and downstream > 0:
-        print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
-        return locusByChromDict
-    elif lengthCDS < 0 and upstream > 0:
-        print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+    if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
         return locusByChromDict
 
     genomeName = genome.genome
@@ -702,6 +684,29 @@ def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True,
     return locusByChromDict
 
 
+def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS):
+    if upstream == 0 and downstream == 0 and not useCDS:
+        print "getLocusByChromDict: asked for no sequence - returning empty dict"
+        return True
+    elif upstream > 0 and downstream > 0 and not useCDS:
+        print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict"
+        return True
+    elif lengthCDS != 0 and not useCDS:
+        print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict"
+        return True
+    elif upstreamSpanTSS and lengthCDS != 0:
+        print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict"
+        return True
+    elif lengthCDS > 0 and downstream > 0:
+        print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict"
+        return True
+    elif lengthCDS < 0 and upstream > 0:
+        print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict"
+        return True
+
+    return False
+
+
 def getGeneFeatures(genome, additionalRegionsDict):
     featuresDict = genome.getallGeneFeatures()
     if len(additionalRegionsDict) > 0:
@@ -805,37 +810,25 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[],
                     else:
                         regionBinLength = binLength
 
-                    startdist = tagStart - start
                     if rsense == "F":
-                        # we are relying on python's integer division quirk
-                        binID = startdist / regionBinLength
-                        if (fixedFirstBin > 0) and (startdist < fixedFirstBin):
-                            binID = 0
-                        elif fixedFirstBin > 0:
-                            binID = 1
-
-                        if binID >= bins:
-                            binID = bins - 1
-
-                        try:
-                            regionsBins[regionID][binID] += normalizedTag * weight
-                        except KeyError:
-                            print "%s %s" % (regionID, str(binID))
+                        positionInRegion = tagStart - start
                     else:
-                        rdist = rlen - startdist
-                        binID = rdist / regionBinLength
-                        if (fixedFirstBin > 0) and (rdist < fixedFirstBin):
-                            binID = 0
-                        elif fixedFirstBin > 0:
-                            binID = 1
-
-                        if binID >= bins:
-                            binID = bins - 1
-
-                        try:
-                            regionsBins[regionID][binID] += normalizedTag * weight
-                        except KeyError:
-                            print "%s %s" % (regionID, str(binID))
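+                        # on the reverse strand, measure the position from the right-hand edge of the region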
+                        positionInRegion = start + rlen - tagStart
+
+                    # we are relying on python's integer division quirk
+                    binID = positionInRegion / regionBinLength
+                    if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin):
+                        binID = 0
+                    elif fixedFirstBin > 0:
+                        binID = 1
+
+                    if binID >= bins:
+                        binID = bins - 1
+
+                    try:
+                        regionsBins[regionID][binID] += normalizedTag * weight
+                    except KeyError:
+                        print "%s %s" % (regionID, str(binID))
 
                     stopPoint = stop
 
findall.py
index aa820374b994d497fbc4dd336a98b795489f5900..663d7e075d5d30535da6eed6547d4da2b4970be2 100755 (executable)
@@ -49,14 +49,104 @@ import sys
 import math
 import string
 import optparse
+import operator
 from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption
 import ReadDataset
 import Region
 
 
-versionString = "findall: version 3.2"
+versionString = "findall: version 3.2.1"
 print versionString
 
+class RegionDirectionError(Exception):
+    pass
+
+
+class RegionFinder():
+    def __init__(self, minRatio, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, strandfilter, minHits, trimValue, doTrim, doDirectionality,
+                 shiftValue, maxSpacing):
+
+        self.statistics = {"index": 0,
+                           "total": 0,
+                           "mIndex": 0,
+                           "mTotal": 0,
+                           "failed": 0,
+                           "badRegionTrim": 0
+        }
+
+        self.controlRDSsize = 1
+        self.sampleRDSsize = 1
+        self.minRatio = minRatio
+        self.minPeak = minPeak
+        self.leftPlusRatio = leftPlusRatio
+        self.stranded = "both"
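+        # the strand filter works by constraining the acceptable fraction of plus-strand reads in a region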
+        if strandfilter == "plus":
+            self.stranded = "+"
+            minPlusRatio = 0.9
+            maxPlusRatio = 1.0
+        elif strandfilter == "minus":
+            self.stranded = "-"
+            minPlusRatio = 0.0
+            maxPlusRatio = 0.1
+
+        if minRatio < minPeak:
+            self.minPeak = minRatio
+
+        self.minPlusRatio = minPlusRatio
+        self.maxPlusRatio = maxPlusRatio
+        self.strandfilter = strandfilter
+        self.minHits = minHits
+        self.trimValue = trimValue
+        self.doTrim = doTrim
+        self.doDirectionality = doDirectionality
+
+        if self.doTrim:
+            self.trimString = string.join(["%2.1f" % (100. * self.trimValue), "%"], "")
+        else:
+            self.trimString = "none"
+
+        self.shiftValue = shiftValue
+        self.maxSpacing = maxSpacing
+
+
+    def useRNASettings(self):
+        self.shiftValue = 0
+        self.doTrim = False
+        self.doDirectionality = False
+
+
+    def printOptionsSummary(self, listPeak, useMulti, doCache, pValueType):
+
+        print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (self.doDirectionality, listPeak, not useMulti, doCache)
+        print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)
+        try:
+            print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
+        except:
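+            # self.shiftValue may be a string ("auto" or "learn") that the %d format above cannot handle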
+            print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)
+
+
+    def getAnalysisDescription(self, hitfile, controlfile, doControl,
+                                 withFlag, listPeak, useMulti, doCache, pValueType):
+
+        description = ["#ERANGE %s" % versionString]
+        if doControl:
+            description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, controlfile, self.controlRDSsize))
+        else:
+            description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize))
+
+        if withFlag != "":
+            description.append("#restrict to Flag = %s" % withFlag)
+
+        description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s" % (self.doDirectionality, listPeak, not useMulti, doCache))
+        description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded))
+        try:
+            description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
+        except:
+            description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType))
+
+        return string.join(description, "\n")
+
+
 def usage():
     print __doc__
 
@@ -76,12 +166,32 @@ def main(argv=None):
     hitfile = args[1]
     outfilename = args[2]
 
-    findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift,
-            options.stringency, options.noshift, options.autoshift, options.reportshift,
+    shiftValue = 0
+
+    if options.autoshift:
+        shiftValue = "auto"
+
+    if options.shift is not None:
+        try:
+            shiftValue = int(options.shift)
+        except ValueError:
+            if options.shift == "learn":
+                shiftValue = "learn"
+
+    if options.noShift:
+        shiftValue = 0
+
+    if options.doAppend:
+        outputMode = "a"
+    else:
+        outputMode = "w"
+
+    findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, shiftValue,
+            options.stringency, options.reportshift,
             options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak,
             options.normalize, options.logfilename, options.withFlag, options.doDirectionality,
-            options.trimValue, options.doTrim, options.doAppend, options.rnaSettings,
-            options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti,
+            options.trimValue, options.doTrim, outputMode, options.rnaSettings,
+            options.cachePages, options.ptype, options.controlfile, options.doRevBackground, options.useMulti,
             options.strandfilter, options.combine5p)
 
 
@@ -89,7 +199,7 @@ def makeParser():
     usage = __doc__
 
     parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--control", dest="mockfile")
+    parser.add_option("--control", dest="controlfile")
     parser.add_option("--minimum", type="float", dest="minHits")
     parser.add_option("--ratio", type="float", dest="minRatio")
     parser.add_option("--spacing", type="int", dest="maxSpacing")
@@ -99,7 +209,7 @@ def makeParser():
     parser.add_option("--noshift", action="store_true", dest="noShift")
     parser.add_option("--autoshift", action="store_true", dest="autoshift")
     parser.add_option("--reportshift", action="store_true", dest="reportshift")
-    parser.add_option("--nomulti", action="store_true", dest="noMulti")
+    parser.add_option("--nomulti", action="store_false", dest="useMulti")
     parser.add_option("--minPlus", type="float", dest="minPlusRatio")
     parser.add_option("--maxPlus", type="float", dest="maxPlusRatio")
     parser.add_option("--leftPlus", type="float", dest="leftPlusRatio")
@@ -137,16 +247,16 @@ def makeParser():
     logfilename = getConfigOption(configParser, section, "logfilename", "findall.log")
     withFlag = getConfigOption(configParser, section, "withFlag", "")
     doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True)
-    trimValue = getConfigOption(configParser, section, "trimValue", None)
+    trimValue = getConfigFloatOption(configParser, section, "trimValue", 0.1)
     doTrim = getConfigBoolOption(configParser, section, "doTrim", True)
     doAppend = getConfigBoolOption(configParser, section, "doAppend", False)
     rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
-    ptype = getConfigOption(configParser, section, "ptype", None)
-    mockfile = getConfigOption(configParser, section, "mockfile", None)
+    ptype = getConfigOption(configParser, section, "ptype", "")
+    controlfile = getConfigOption(configParser, section, "controlfile", None)
     doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False)
-    noMulti = getConfigBoolOption(configParser, section, "noMulti", False)
-    strandfilter = getConfigOption(configParser, section, "strandfilter", None)
+    useMulti = getConfigBoolOption(configParser, section, "useMulti", True)
+    strandfilter = getConfigOption(configParser, section, "strandfilter", "")
     combine5p = getConfigBoolOption(configParser, section, "combine5p", False)
 
     parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift,
@@ -154,611 +264,692 @@ def makeParser():
                         minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak,
                         normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality,
                         trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings,
-                        cachePages=cachePages, ptype=ptype, mockfile=mockfile, doRevBackground=doRevBackground, noMulti=noMulti,
+                        cachePages=cachePages, ptype=ptype, controlfile=controlfile, doRevBackground=doRevBackground, useMulti=useMulti,
                         strandfilter=strandfilter, combine5p=combine5p)
 
     return parser
 
 
-def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None,
-            stringency=4.0, noshift=False, autoshift=False, reportshift=False,
-            minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
-            normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True,
-            trimValue=None, doTrim=True, doAppend=False, rnaSettings=False,
-            cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False,
-            strandfilter=None, combine5p=False):
+def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shiftValue=0,
+            stringency=4.0, reportshift=False, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5,
+            normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True, trimValue=0.1, doTrim=True,
+            outputMode="w", rnaSettings=False, cachePages=None, ptype="", controlfile=None, doRevBackground=False,
+            useMulti=True, strandfilter="", combine5p=False):
 
 
-    shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
-    doControl = False
-    if mockfile is not None:
-        doControl = True
+    regionFinder = RegionFinder(minRatio, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, strandfilter, minHits, trimValue,
+                                doTrim, doDirectionality, shiftValue, maxSpacing)
 
 
-    if doRevBackground:
-        print "Swapping IP and background to calculate FDR"
-        pValueType = "back"
+    doControl = controlfile is not None
+    pValueType = getPValueType(ptype, doControl, doRevBackground)
+    doPvalue = not pValueType == "none"
 
 
-    doPvalue = True
-    if ptype is not None:
-        ptype = ptype.upper()
+    if rnaSettings:
+        regionFinder.useRNASettings()
+
+    doCache = cachePages is not None
+    printStatusMessages(regionFinder.shiftValue, normalize, doRevBackground, ptype, doControl, withFlag, useMulti, rnaSettings, regionFinder.strandfilter)
+    controlRDS = None
+    if doControl:
+        print "\ncontrol:" 
+        controlRDS = openRDSFile(controlfile, cachePages=cachePages, doCache=doCache)
+
+    print "\nsample:" 
+    hitRDS = openRDSFile(hitfile, cachePages=cachePages, doCache=doCache)
+
+    print
+    regionFinder.readlen = hitRDS.getReadSize()
+    if rnaSettings:
+        regionFinder.maxSpacing = regionFinder.readlen
+
+    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+    regionFinder.printOptionsSummary(listPeak, useMulti, doCache, pValueType)
+
+    regionFinder.sampleRDSsize = len(hitRDS) / 1000000.
+    if doControl:
+        regionFinder.controlRDSsize = len(controlRDS) / 1000000.
+
+    outfile = open(outfilename, outputMode)
+    print >> outfile, regionFinder.getAnalysisDescription(hitfile, controlfile, doControl,
+                                                          withFlag, listPeak, useMulti, doCache, pValueType)
+
+    header = getHeader(normalize, regionFinder.doDirectionality, listPeak, reportshift, doPvalue)
+    print >> outfile, header
+    if minRatio < minPeak:
+        minPeak = minRatio
+
+    shiftDict = {}
+    hitChromList = hitRDS.getChromosomes()
+    stringency = max(stringency, 1.0)
+    chromosomeList = getChromosomeListToProcess(hitChromList, controlRDS, doControl)
+    for chromosome in chromosomeList:
+        allregions, outregions = findPeakRegions(regionFinder, hitRDS, controlRDS, chromosome, logfilename, outfilename,
+                                                 outfile, stringency, normalize, useMulti, doControl, withFlag, combine5p,
+                                                 factor, listPeak)
+
+        if doRevBackground:
+            #TODO: this is *almost* the same calculation - there are small yet important differences
+            backregions = findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normalize, useMulti, factor)
+            writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                                   allregions, header, backregions=backregions, pValueType=pValueType)
+        else:
+            writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                                        allregions, header)
+
+    footer = getFooter(regionFinder, shiftDict, reportshift, doRevBackground)
+    print footer
+    print >> outfile, footer
+    outfile.close()
+    writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1])
+
+
+def getPValueType(ptype, doControl, doRevBackground):
+    pValueType = "self"
+    if ptype in ["NONE", "SELF", "BACK"]:
         if ptype == "NONE":
-            doPvalue = False
             pValueType = "none"
-            p = 1
-            poissonmean = 0
         elif ptype == "SELF":
             pValueType = "self"
         elif ptype == "BACK":
             if doControl and doRevBackground:
                 pValueType = "back"
+    elif doRevBackground:
+        pValueType = "back"
+
+    return pValueType
+
+
+def printStatusMessages(shiftValue, normalize, doRevBackground, ptype, doControl, withFlag, useMulti, rnaSettings, strandfilter):
+    if shiftValue == "learn":
+        print "Will try to learn shift"
+
+    if normalize:
+        print "Normalizing to RPM"
+
+    if doRevBackground:
+        print "Swapping IP and background to calculate FDR"
+
+    if ptype != "":
+        if ptype in ["NONE", "SELF"]:
+            pass
+        elif ptype == "BACK":
+            if doControl and doRevBackground:
+                pass
             else:
                 print "must have a control dataset and -revbackground for pValue type 'back'"
         else:
             print "could not use pValue type : %s" % ptype
-    else:
-        pValueType = "self"
 
     if withFlag != "":
         print "restrict to flag = %s" % withFlag
 
-    useMulti = True
-    if noMulti:
+    if not useMulti:
         print "using unique reads only"
-        useMulti = False
 
     if rnaSettings:
         print "using settings appropriate for RNA: -nodirectionality -notrim -noshift"
-        doTrim = False
-        doDirectionality = False
 
 
-    stranded = ""
-    if strandfilter is not None:
-        if strandfilter == "plus":
-            stranded = "+"
-            minPlusRatio = 0.9
-            maxPlusRatio = 1.0
-            print "only analyzing reads on the plus strand"
-        elif strandfilter == "minus":
-            stranded = "-"
-            minPlusRatio = 0.0
-            maxPlusRatio = 0.1
-            print "only analyzing reads on the minus strand"
+    if strandfilter == "plus":
+        print "only analyzing reads on the plus strand"
+    elif strandfilter == "minus":
+        print "only analyzing reads on the minus strand"
 
 
-    stringency = max(stringency, 1.0)
-    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
-    if cachePages is not None:
-        doCache = True
+
+def openRDSFile(filename, cachePages=None, doCache=False):
+    rds = ReadDataset.ReadDataset(filename, verbose=True, cache=doCache)
+    if cachePages > rds.getDefaultCacheSize():
+        rds.setDBcache(cachePages)
+
+    return rds
+
+
+def getHeader(normalize, doDirectionality, listPeak, reportshift, doPvalue):
+    if normalize:
+        countType = "RPM"
     else:
-        doCache = False
-        cachePages = -1
+        countType = "COUNT"
 
 
-    mockRDS = None
-    if doControl:
-        print "\ncontrol:" 
-        mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
+    headerFields = ["#regionID\tchrom\tstart\tstop", countType, "fold\tmulti%"]
 
 
-        if cachePages > mockRDS.getDefaultCacheSize():
-            mockRDS.setDBcache(cachePages)
+    if doDirectionality:
+        headerFields.append("plus%\tleftPlus%")
 
 
-    print "\nsample:" 
-    hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache)
-    readlen = hitRDS.getReadSize()
-    if rnaSettings:
-        maxSpacing = readlen
+    if listPeak:
+        headerFields.append("peakPos\tpeakHeight")
 
 
-    if cachePages > hitRDS.getDefaultCacheSize():
-        hitRDS.setDBcache(cachePages)
+    if reportshift:
+        headerFields.append("readShift")
 
 
-    if trimValue is not None:
-        trimValue = float(trimValue) / 100.
-        trimString = "%2.1f%s" % ((100. * trimValue), "%")
-    else:
-        trimValue = 0.1
-        trimString = "10%"
+    if doPvalue:
+        headerFields.append("pValue")
 
 
-    if not doTrim:
-        trimString = "none"
+    return string.join(headerFields, "\t")
 
 
-    if doAppend:
-        fileMode = "a"
-    else:
-        fileMode = "w"
 
 
-    outfile = open(outfilename, fileMode)
-    outfile.write("#ERANGE %s\n" % versionString)
+def getChromosomeListToProcess(hitChromList, controlRDS, doControl):
     if doControl:
-        mockRDSsize = len(mockRDS) / 1000000.
-        controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
+        controlChromList = controlRDS.getChromosomes()
+        chromosomeList = [chrom for chrom in hitChromList if chrom in controlChromList and chrom != "chrM"]
     else:
-        mockRDSsize = 0
-        controlSampleString = " none"
+        chromosomeList = [chrom for chrom in hitChromList if chrom != "chrM"]
 
 
-    hitRDSsize = len(hitRDS) / 1000000.
-    outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:%s\n" % (hitfile, hitRDSsize, controlSampleString))
+    return chromosomeList
 
 
-    if withFlag != "":
-        outfile.write("#restrict to Flag = %s\n" % withFlag)
 
 
-    print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache)
-    print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)
-    print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)
+def findPeakRegions(regionFinder, hitRDS, controlRDS, chromosome, logfilename, outfilename,
+                    outfile, stringency, normalize, useMulti, doControl, withFlag, combine5p, factor, listPeak):
 
 
-    outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache))
-    outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded))
-    outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType))
-    if normalize:
-        print "Normalizing to RPM"
-        countLabel = "RPM"
-    else:
-        countLabel = "COUNT"
+    outregions = []
+    allregions = []
+    print "chromosome %s" % (chromosome)
+    previousHit = - 1 * regionFinder.maxSpacing
+    readStartPositions = [-1]
+    totalWeight = 0
+    uniqueReadCount = 0
+    reads = []
+    numStarts = 0
+    badRegion = False
+    hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True,
+                                  strand=regionFinder.stranded, combine5p=combine5p)
 
 
-    headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"]
-    if doDirectionality:
-        headerList.append("plus%\tleftPlus%")
+    maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
+    if regionFinder.shiftValue == "learn":
+        learnShift(regionFinder, hitDict, controlRDS, chromosome, maxCoord, logfilename, outfilename,
+                                outfile, numStarts, stringency, normalize, useMulti, doControl)
 
 
-    if listPeak:
-        headerList.append("peakPos\tpeakHeight")
+    for read in hitDict[chromosome]:
+        pos = read["start"]
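+        # previousRegionIsDone checks the gap since the previous read (and maxCoord) to decide whether to close out the current candidate region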
+        if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
+            lastReadPos = readStartPositions[-1]
+            lastBasePosition = lastReadPos + regionFinder.readlen - 1
+            newRegionIndex = regionFinder.statistics["index"] + 1
+            if regionFinder.doDirectionality:
+                region = Region.DirectionalRegion(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=factor, numReads=totalWeight)
+            else:
+                region = Region.Region(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=factor, numReads=totalWeight)
 
 
-    if reportshift:
-        headerList.append("readShift")
+            if normalize:
+                region.numReads /= regionFinder.sampleRDSsize
 
 
-    if doPvalue:
-        headerList.append("pValue")
+            allregions.append(int(region.numReads))
+            regionLength = lastReadPos - region.start
+            if regionPassesCriteria(region.numReads, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen):
+                region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos,
+                                                useMulti, doControl, normalize)
 
 
-    headline = string.join(headerList, "\t")
-    print >> outfile, headline
+                if region.foldRatio >= regionFinder.minRatio:
+                    # first pass, with absolute numbers
+                    peak = findPeak(reads, region.start, regionLength, regionFinder.readlen, doWeight=True, leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
+                    if regionFinder.doTrim:
+                        try:
+                            lastReadPos = trimRegion(region, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.readlen, regionFinder.doDirectionality,
+                                                     normalize, regionFinder.sampleRDSsize)
+                        except IndexError:
+                            badRegion = True
+                            continue
 
 
-    statistics = {"index": 0,
-                  "total": 0,
-                  "mIndex": 0,
-                  "mTotal": 0,
-                  "failed": 0
-    }
+                        region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos,
+                                                        useMulti, doControl, normalize)
 
 
-    if minRatio < minPeak:
-        minPeak = minRatio
+                    # just in case it changed, use latest data
+                    try:
+                        bestPos = peak.topPos[0]
+                        peakScore = peak.smoothArray[bestPos]
+                        if normalize:
+                            peakScore /= regionFinder.sampleRDSsize
+                    except:
+                        continue
 
 
-    hitChromList = hitRDS.getChromosomes()
-    if doControl:
-        mockChromList = mockRDS.getChromosomes()
-    else:
-        mockChromList = []
+                    if listPeak:
+                        region.peakDescription= "%d\t%.1f" % (region.start + bestPos, peakScore)
 
 
-    mockSampleSize = 0
-    hitSampleSize = 0
-    if normalize:
-        if doControl:
-            mockSampleSize = mockRDSsize
-
-        hitSampleSize = hitRDSsize
-
-    hitChromList.sort()
-
-    for chromosome in hitChromList:
-        if doNotProcessChromosome(chromosome, doControl, mockChromList):
-            continue
-
-        print "chromosome %s" % (chromosome)
-        hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True,
-                                      doMulti=useMulti, findallOptimize=True, strand=stranded,
-                                      combine5p=combine5p)
-        maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
-        if shiftValue == "learn":
-            shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize,
-                                    mockSampleSize, minRatio, maxSpacing, maxCoord, stringency, readlen, minHits,
-                                    logfilename, outfile, outfilename)
-
-        regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize,
-                                                                  chromosome, useMulti, normalize, maxSpacing,
-                                                                  doDirectionality, doTrim, minHits, minRatio,
-                                                                  readlen, shiftValue, minPeak, minPlusRatio,
-                                                                  maxPlusRatio, leftPlusRatio, listPeak, noMulti,
-                                                                  doControl, factor, trimValue, outputRegionList=True)
-
-        statistics["index"] += regionStats["index"]
-        statistics["total"] += regionStats["total"]
-        statistics["failed"] += regionStats["failed"]
-        if not doRevBackground:
-            if doPvalue:
-                p, poissonmean = calculatePValue(allRegionWeights)
-
-            print headline
-            shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
-            continue
-
-        #now do background swapping the two samples around
-        if mockRDS is not None:
-            print "calculating background..."
-            backgroundTrimValue = 1/20.
-            backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
-                                                                           chromosome, useMulti, normalize, maxSpacing,
-                                                                           doDirectionality, doTrim, minHits, minRatio,
-                                                                           readlen, shiftValue, minPeak, minPlusRatio,
-                                                                           maxPlusRatio, leftPlusRatio, listPeak, noMulti,
-                                                                           doControl, factor, backgroundTrimValue)
-
-            statistics["mIndex"] += backgroundRegionStats["index"]
-            statistics["mTotal"] += backgroundRegionStats["total"]
-            statistics["failed"] += backgroundRegionStats["failed"]
-            print statistics["mIndex"], statistics["mTotal"]
-            if doPvalue:
-                if pValueType == "self":
-                    p, poissonmean = calculatePValue(allRegionWeights)
-                else:
-                    p, poissonmean = calculatePValue(backgroundRegionWeights)
-
-            print headline
-            shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
-
-    footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
-    print footer
-    outfile.write(footer)
-    outfile.close()
+                    if useMulti:
+                        setMultireadPercentage(region, hitRDS, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos, normalize, regionFinder.doTrim)
 
 
-    writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | ")))
+                    region.shift = peak.shift
+                    # check that we still pass threshold
+                    regionLength = lastReadPos - region.start
+                    plusRatio = float(peak.numPlus)/peak.numHits
+                    if regionAndPeakPass(region, regionFinder.minHits, regionFinder.minRatio, regionLength, regionFinder.readlen, peakScore, regionFinder.minPeak,
+                                         regionFinder.minPlusRatio, regionFinder.maxPlusRatio, plusRatio):
+                        try:
+                            updateRegion(region, regionFinder.doDirectionality, regionFinder.leftPlusRatio, peak.numLeftPlus, peak.numPlus, plusRatio)
+                            regionFinder.statistics["index"] += 1
+                            outregions.append(region)
+                            regionFinder.statistics["total"] += region.numReads
+                        except RegionDirectionError:
+                            regionFinder.statistics["failed"] += 1
+
+            readStartPositions = []
+            totalWeight = 0
+            uniqueReadCount = 0
+            reads = []
+            numStarts = 0
+            if badRegion:
+                badRegion = False
+                regionFinder.statistics["badRegionTrim"] += 1
 
 
+        if pos not in readStartPositions:
+            numStarts += 1
 
 
-def determineShiftValue(autoshift, shift, noshift, rnaSettings):
-    shiftValue = 0
-    if autoshift:
-        shiftValue = "auto"
+        readStartPositions.append(pos)
+        weight = read["weight"]
+        totalWeight += weight
+        if weight == 1.0:
+            uniqueReadCount += 1
 
 
-    if shift is not None:
-        try:
-            shiftValue = int(shift)
-        except ValueError:
-            if shift == "learn":
-                shiftValue = "learn"
-                print "Will try to learn shift"
+        reads.append({"start": pos, "sense": read["sense"], "weight": weight})
+        previousHit = pos
 
 
-    if noshift or rnaSettings:
-        shiftValue = 0
+    return allregions, outregions
 
 
-    return shiftValue
 
 
+def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normalize, useMulti, factor):
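+    # regions are called on the control reads and scored against the sample (roles swapped),
+    # so the footer can report an empirical FDR from these reverse-background counts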
 
 
-def doNotProcessChromosome(chromosome, doControl, mockChromList):
-    skipChromosome = False
-    if chromosome == "chrM":
-        skipChromosome = True
+    print "calculating background..."
+    previousHit = -1 * regionFinder.maxSpacing
+    currentHitList = [-1]
+    currentTotalWeight = 0
+    currentReadList = []
+    backregions = []
+    numStarts = 0
+    badRegion = False
+    hitDict = controlRDS.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, findallOptimize=True)
+    maxCoord = controlRDS.getMaxCoordinate(chromosome, doMulti=useMulti)
+    for read in hitDict[chromosome]:
+        pos = read["start"]
+        if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
+            lastReadPos = currentHitList[-1]
+            lastBasePosition = lastReadPos + regionFinder.readlen - 1
+            region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=factor, numReads=currentTotalWeight)
+            if normalize:
+                region.numReads /= regionFinder.controlRDSsize
+
+            backregions.append(int(region.numReads))
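+            # rebuild the region so numReads holds the raw weight again; only the normalized copy goes into backregions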
+            region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=factor, numReads=currentTotalWeight)
+            regionLength = lastReadPos - region.start
+            if regionPassesCriteria(region.numReads, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen):
+                numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+                if normalize:
+                    numMock /= regionFinder.sampleRDSsize
+
+                foldRatio = region.numReads / numMock
+                if foldRatio >= regionFinder.minRatio:
+                    # first pass, with absolute numbers
+                    peak = findPeak(currentReadList, region.start, lastReadPos - region.start, regionFinder.readlen, doWeight=True,
+                                    leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue)
 
 
-    if doControl and (chromosome not in mockChromList):
-        skipChromosome = True
+                    if regionFinder.doTrim:
+                        try:
+                            lastReadPos = trimRegion(region, peak, lastReadPos, 1/20., currentReadList, regionFinder.readlen, regionFinder.doDirectionality,
+                                                     normalize, regionFinder.controlRDSsize)
+                        except IndexError:
+                            badRegion = True
+                            continue
 
 
-    return skipChromosome
+                        numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+                        if normalize:
+                            numMock /= regionFinder.sampleRDSsize
 
 
+                        foldRatio = region.numReads / numMock
 
 
-def calculatePValue(dataList):
-    dataList.sort()
-    listSize = float(len(dataList))
-    try:
-        poissonmean = sum(dataList) / listSize
-    except ZeroDivisionError:
-        poissonmean = 0
+                    # just in case it changed, use latest data
+                    try:
+                        bestPos = peak.topPos[0]
+                        peakScore = peak.smoothArray[bestPos]
+                    except IndexError:
+                        continue
 
 
-    print "Poisson n=%d, p=%f" % (listSize, poissonmean)
-    p = math.exp(-poissonmean)
+                    # normalize to RPM
+                    if normalize:
+                        peakScore /= regionFinder.controlRDSsize
+
+                    # check that we still pass threshold
+                    regionLength = lastReadPos - region.start
+                    if regionPassesCriteria(region.numReads, regionFinder.minHits, foldRatio, regionFinder.minRatio, regionLength, regionFinder.readlen):
+                        updateControlStatistics(regionFinder, peak, region.numReads, peakScore)
+
+            currentHitList = []
+            currentTotalWeight = 0
+            currentReadList = []
+            numStarts = 0
+            if badRegion:
+                badRegion = False
+                regionFinder.statistics["badRegionTrim"] += 1
+
+        if pos not in currentHitList:
+            numStarts += 1
+
+        currentHitList.append(pos)
+        weight = read["weight"]
+        currentTotalWeight += weight
+        currentReadList.append({"start": pos, "sense": read["sense"], "weight": weight})
+        previousHit = pos
 
 
-    return p, poissonmean
+    return backregions
 
 
 
 
-def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord,
-               stringency, readlen, minHits, logfilename, outfile, outfilename, minSites=30):
+def learnShift(regionFinder, hitDict, controlRDS, chromosome, maxCoord, logfilename, outfilename,
+               outfile, numStarts, stringency, normalize, useMulti, doControl):
 
 
-    print "learning shift.... will need at least %d training sites" % minSites
-    previousHit = -1 * maxSpacing
-    hitList = [-1]
+    print "learning shift.... will need at least 30 training sites"
+    previousHit = -1 * regionFinder.maxSpacing
+    positionList = [-1]
     totalWeight = 0
     readList = []
     shiftDict = {}
     count = 0
-    numStarts = 0
-    for read in hitDict[chrom]:
+    for read in hitDict[chromosome]:
         pos = read["start"]
-        sense = read["sense"]
-        weight = read["weight"]
-        if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
-            sumAll = totalWeight
+        if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord):
             if normalize:
-                sumAll /= hitSampleSize
+                totalWeight /= regionFinder.sampleRDSsize
 
 
-            regionStart = hitList[0]
-            regionStop = hitList[-1]
+            regionStart = positionList[0]
+            regionStop = positionList[-1]
             regionLength = regionStop - regionStart
-            # we're going to require stringent settings
-            if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen:
-                foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
-
-                if foldRatio >= minRatio:
-                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True)
-                    try:
-                        shiftDict[localshift] += 1
-                    except KeyError:
-                        shiftDict[localshift] = 1
-
+            if regionPassesCriteria(totalWeight, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen, stringency=stringency):
+                foldRatio = getFoldRatio(regionFinder, controlRDS, totalWeight, chromosome, regionStart, regionStop, useMulti, doControl, normalize)
+                if foldRatio >= regionFinder.minRatio:
+                    updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen)
                     count += 1
 
-            hitList = []
+            positionList = []
             totalWeight = 0
             readList = []
-            numStarts = 0
 
 
-        if pos not in hitList:
+        if pos not in positionList:
             numStarts += 1
 
-        hitList.append(pos)
+        positionList.append(pos)
+        weight = read["weight"]
         totalWeight += weight
-        readList.append({"start": pos, "sense": sense, "weight": weight})
+        readList.append({"start": pos, "sense": read["sense"], "weight": weight})
         previousHit = pos
 
-    bestShift = 0
-    bestCount = 0
-    learningSettings = ["#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d" % (stringency, stringency * minHits,
-                                                                                                       stringency * minRatio, stringency * readlen),
-                        "#number of training examples: %d" % count]
-    outline = string.join(learningSettings, "\n")
+    outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency,
+                                                                                                                               stringency * regionFinder.minHits,
+                                                                                                                               stringency * regionFinder.minRatio,
+                                                                                                                               stringency * regionFinder.readlen,
+                                                                                                                               count)
+
     print outline
-    writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
-    if count < minSites:
+    writeLog(logfilename, versionString, outfilename + outline)
+    regionFinder.shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename)
+    outline = "#picked shiftValue to be %d" % regionFinder.shiftValue
+    print outline
+    print >> outfile, outline
+    writeLog(logfilename, versionString, outfilename + outline)
+
+
+def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord):
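+    # a region is closed once the next read is more than maxSpacing away, or we hit the last coordinate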
+    return abs(pos - previousHit) > maxSpacing or pos == maxCoord
+
+
+def regionPassesCriteria(sumAll, minHits, numStarts, minRatio, regionLength, readlen, stringency=1):
+    return sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen
+
+
+def trimRegion(region, peak, regionStop, trimValue, currentReadList, readlen, doDirectionality, normalize, hitRDSsize):
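+    # walk each region edge in toward the peak until the smoothed signal reaches
+    # trimValue * peak height, then re-run findPeak over the trimmed span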
+    bestPos = peak.topPos[0]
+    peakScore = peak.smoothArray[bestPos]
+    if normalize:
+        peakScore /= hitRDSsize
+
+    minSignalThresh = trimValue * peakScore
+    start = findStartEdgePosition(peak, minSignalThresh)
+    regionEndPoint = regionStop - region.start - 1
+    stop = findStopEdgePosition(peak, regionEndPoint, minSignalThresh)
+
+    regionStop = region.start + stop
+    region.start += start
+    trimmedPeak = findPeak(currentReadList, region.start, regionStop - region.start, readlen, doWeight=True, leftPlus=doDirectionality, shift=peak.shift)
+    peak.numPlus = trimmedPeak.numPlus
+    peak.numLeftPlus = trimmedPeak.numLeftPlus
+    peak.topPos = trimmedPeak.topPos
+    peak.smoothArray = trimmedPeak.smoothArray
+
+    region.numReads = trimmedPeak.numHits
+    if normalize:
+        region.numReads /= hitRDSsize
+
+    region.stop = regionStop + readlen - 1
+
+    return regionStop
+
+
+def findStartEdgePosition(peak, minSignalThresh):
+    start = 0
+    while not peakEdgeLocated(peak, start, minSignalThresh):
+        start += 1
+
+    return start
+
+
+def findStopEdgePosition(peak, stop, minSignalThresh):
+    while not peakEdgeLocated(peak, stop, minSignalThresh):
+        stop -= 1
+
+    return stop
+
+
+def peakEdgeLocated(peak, position, minSignalThresh):
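+    # an edge is located where the smoothed signal first meets the threshold, or at the summit itself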
+    return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0]
+
+
+def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regionStop, useMulti, doControl, normalize):
+    if doControl:
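+        # the added 1. is a pseudo-count so the ratio stays finite when the control region is empty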
+        numMock = 1. + controlRDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+        if normalize:
+            numMock /= regionFinder.controlRDSsize
+
+        foldRatio = sumAll / numMock
+    else:
+        foldRatio = regionFinder.minRatio
+
+    return foldRatio
+
+
+def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen):
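+    # tally the auto-detected shift for this region; the mode over all regions becomes the learned shift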
+    peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto")
+    try:
+        shiftDict[peak.shift] += 1
+    except KeyError:
+        shiftDict[peak.shift] = 1
+
+
+def getShiftValue(shiftDict, count, logfilename, outfilename):
+    if count < 30:
         outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold"
-        print >> outfile, outline
-        writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
+        print outline
+        writeLog(logfilename, versionString, outfilename + outline)
         shiftValue = 0
     else:
-        for shift in sorted(shiftDict):
-            if shiftDict[shift] > bestCount:
-                bestShift = shift
-                bestCount = shiftDict[shift]
-
-        shiftValue = bestShift
+        shiftValue = getBestShiftInDict(shiftDict)
         print shiftDict
 
-    outline = "#picked shiftValue to be %d" % shiftValue
-    print outline
-    print >> outfile, outline
-    writeLog(logfilename, versionString, "%s%s" % (outfilename, outline))
-
     return shiftValue
 
 
-def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
-    if doControl and rds is not None:
-        foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
+def updateControlStatistics(regionFinder, peak, sumAll, peakScore):
+
+    plusRatio = float(peak.numPlus)/peak.numHits
+    if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio:
+        if regionFinder.doDirectionality:
+            if regionFinder.leftPlusRatio < peak.numLeftPlus / peak.numPlus:
+                regionFinder.statistics["mIndex"] += 1
+                regionFinder.statistics["mTotal"] += sumAll
+            else:
+                regionFinder.statistics["failed"] += 1
+        else:
+            # we have a region, but didn't check for directionality
+            regionFinder.statistics["mIndex"] += 1
+            regionFinder.statistics["mTotal"] += sumAll
+
+
+def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRatio, multiP,
+              peakDescription, shift, doDirectionality, leftPlusRatio, numLeft,
+              numPlus, plusRatio):
+
+    if doDirectionality:
+        if leftPlusRatio < numLeft / numPlus:
+            plusP = plusRatio * 100.
+            leftP = 100. * numLeft / numPlus
+            # we have a region that passes all criteria
+            region = Region.DirectionalRegion(regionStart, regionStop,
+                                              factor, index, chromosome, sumAll,
+                                              foldRatio, multiP, plusP, leftP,
+                                              peakDescription, shift)
+
+        else:
+            raise RegionDirectionError
     else:
-        foldRatio = minRatio
+        # we have a region, but didn't check for directionality
+        region = Region.Region(regionStart, regionStop, factor, index, chromosome,
+                               sumAll, foldRatio, multiP, peakDescription, shift)
 
 
-    return foldRatio
+    return region
 
 
 
 
-def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll):
-    numMock = 1. + rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True)
+def setMultireadPercentage(region, hitRDS, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim):
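+    # record what percentage of the region's reads are multireads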
+    if doTrim:
+        sumMulti = hitRDS.getMultiCount(chromosome, region.start, lastReadPos)
+    else:
+        sumMulti = currentTotalWeight - currentUniqueCount
+
+    # normalize to RPM
     if normalize:
-        numMock /= sampleSize
+        sumMulti /= hitRDSsize
 
 
-    foldRatio = sumAll / numMock
+    try:
+        multiP = 100. * (sumMulti / region.numReads)
+    except ZeroDivisionError:
+        return
 
 
-    return foldRatio
+    region.multiP = multiP
 
 
 
 
-def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
-                normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
-                shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
-                noMulti, doControl, factor, trimValue, outputRegionList=False):
+def regionAndPeakPass(region, minHits, minRatio, regionLength, readlen, peakScore, minPeak, minPlusRatio, maxPlusRatio, plusRatio):
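+    # the region must meet the read count, fold ratio, and length criteria, and its peak must clear the height and plus-strand ratio thresholds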
+    regionPasses = False
+    if regionPassesCriteria(region.numReads, minHits, region.foldRatio, minRatio, regionLength, readlen):
+        if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
+            regionPasses = True
 
 
-    index = 0
-    totalRegionWeight = 0
-    failedCounter = 0
-    previousHit = - 1 * maxSpacing
-    currentHitList = [-1]
-    currentTotalWeight = 0
-    currentUniqReadCount = 0
-    currentReadList = []
-    regionWeights = []
-    outregions = []
-    numStarts = 0
-    hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True)
-    maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti)
-    for read in hitDict[chrom]:
-        pos = read["start"]
-        sense = read["sense"]
-        weight = read["weight"]
-        if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
-            sumAll = currentTotalWeight
-            if normalizeToRPM:
-                sumAll /= rdsSampleSize
-
-            regionStart = currentHitList[0]
-            regionStop = currentHitList[-1]
-            regionWeights.append(int(sumAll))
-            if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
-                sumMulti = 0.
-                foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
-                if foldRatio >= minRatio:
-                    # first pass, with absolute numbers
-                    peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
-
-                    bestPos = peak.topPos[0]
-                    numHits = peak.numHits
-                    peakScore = peak.smoothArray[bestPos]
-                    numPlus = peak.numPlus
-                    shift = peak.shift
-                    numLeft = peak.numLeftPlus
-                    if normalizeToRPM:
-                        peakScore /= rdsSampleSize
-
-                    if doTrim:
-                        minSignalThresh = trimValue * peakScore
-                        start = 0
-                        stop = regionStop - regionStart - 1
-                        startFound = False
-                        while not startFound:
-                            if peak.smoothArray[start] >= minSignalThresh or start == bestPos:
-                                startFound = True
-                            else:
-                                start += 1
-
-                        stopFound = False
-                        while not stopFound:
-                            if peak.smoothArray[stop] >= minSignalThresh or stop == bestPos:
-                                stopFound = True
-                            else:
-                                stop -= 1
-
-                        regionStop = regionStart + stop
-                        regionStart += start
-                        trimPeak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift)
-
-                        sumAll = trimPeak.numHits
-                        numPlus = trimPeak.numPlus
-                        numLeft = trimPeak.numLeftPlus
-                        if normalizeToRPM:
-                            sumAll /= rdsSampleSize
-
-                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
-                        if outputRegionList:
-                            sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
-                        # just in case it changed, use latest data
-                        try:
-                            bestPos = trimPeak.topPos[0]
-                            peakScore = trimPeak.smoothArray[bestPos]
-                        except:
-                            continue
+    return regionPasses
 
 
-                        if normalizeToRPM:
-                            peakScore /= rdsSampleSize
 
 
-                    elif outputRegionList:
-                        sumMulti = currentTotalWeight - currentUniqReadCount
+def updateRegion(region,
+                 doDirectionality, leftPlusRatio, numLeft,
+                 numPlus, plusRatio):
 
 
-                    if outputRegionList:
-                        if normalizeToRPM:
-                            sumMulti /= rdsSampleSize
+    if doDirectionality:
+        if leftPlusRatio < numLeft / numPlus:
+            region.plusP = plusRatio * 100.
+            region.leftP = 100. * numLeft / numPlus
+        else:
+            raise RegionDirectionError
 
 
-                        try:
-                            multiP = 100. * (sumMulti / sumAll)
-                        except:
-                            break
 
 
-                        if noMulti:
-                            multiP = 0.
+def writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                                allregions, header):
 
 
-                    # check that we still pass threshold
-                    if sumAll >= minHits and  foldRatio >= minRatio and (regionStop - regionStart) > readlen:
-                        plusRatio = float(numPlus)/numHits
-                        if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio:
-                            if outputRegionList:
-                                peakDescription = ""
-                                if listPeak:
-                                    peakDescription = "\t%d\t%.1f" % (regionStart + bestPos, peakScore)
-
-                            if doDirectionality:
-                                if leftPlusRatio < numLeft / numPlus:
-                                    index += 1
-                                    if outputRegionList:
-                                        plusP = plusRatio * 100.
-                                        leftP = 100. * numLeft / numPlus
-                                        # we have a region that passes all criteria
-                                        region = Region.DirectionalRegion(regionStart, regionStop + readlen - 1,
-                                                                          factor, index, chrom, sumAll,
-                                                                          foldRatio, multiP, plusP, leftP,
-                                                                          peakDescription, shift)
-                                        outregions.append(region)
-
-                                    totalRegionWeight += sumAll
-                                else:
-                                    failedCounter += 1
-                            else:
-                                # we have a region, but didn't check for directionality
-                                index += 1
-                                totalRegionWeight += sumAll
-                                if outputRegionList:
-                                    region = Region.Region(regionStart, regionStop + readlen - 1, factor, index, chrom,
-                                                           sumAll, foldRatio, multiP, peakDescription, shift)
-                                    outregions.append(region)
+    writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                           allregions, header, backregions=[], pValueType="self")
 
 
-            currentHitList = []
-            currentTotalWeight = 0
-            currentUniqReadCount = 0
-            currentReadList = []
-            numStarts = 0
 
 
-        if pos not in currentHitList:
-            numStarts += 1
+def writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict,
+                           allregions, header, backregions=[], pValueType="none"):
 
 
-        currentHitList.append(pos)
-        currentTotalWeight += weight
-        if weight == 1.0:
-            currentUniqReadCount += 1
+    print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"]
+    poissonmean = 0  # only used when doPvalue is set; default avoids a NameError below
+    if doPvalue:
+        if pValueType == "self":
+            poissonmean = calculatePoissonMean(allregions)
+        else:
+            poissonmean = calculatePoissonMean(backregions)
 
 
-        currentReadList.append({"start": pos, "sense": sense, "weight": weight})
-        previousHit = pos
+    print header
+    writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=regionFinder.shiftValue, reportshift=reportshift, shiftDict=shiftDict)
 
 
-    statistics = {"index": index,
-                  "total": totalRegionWeight,
-                  "failed": failedCounter
-    }
 
 
-    if outputRegionList:
-        return statistics, regionWeights, outregions
-    else:
-        return statistics, regionWeights
+def calculatePoissonMean(dataList):
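+    # mean of the per-region read counts, used as the Poisson rate when assigning region p-values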
+    dataList.sort()
+    listSize = float(len(dataList))
+    try:
+        poissonmean = sum(dataList) / listSize
+    except ZeroDivisionError:
+        poissonmean = 0
 
 
+    print "Poisson n=%d, p=%f" % (listSize, poissonmean)
 
 
-def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue):
-    bestShift = 0
-    shiftDict = {}
+    return poissonmean
+
+
+def writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=0, reportshift=False, shiftDict=None):
+    # avoid a mutable default argument; the dict is updated in place when shifts are reported
+    if shiftDict is None:
+        shiftDict = {}
     for region in outregions:
-        # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
-        if reportshift:
-            outputList = [region.printRegionWithShift()]
-            if shiftValue == "auto":
-                try:
-                    shiftDict[region.shift] += 1
-                except KeyError:
-                    shiftDict[region.shift] = 1
-        else:
-            outputList = [region.printRegion()]
+        if shiftValue == "auto" and reportshift:
+            try:
+                shiftDict[region.shift] += 1
+            except KeyError:
+                shiftDict[region.shift] = 1
+
+        outline = getRegionString(region, reportshift)
 
         # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest
         if doPvalue:
             sumAll = int(region.numReads)
-            for i in xrange(sumAll):
-                pValue *= poissonmean
-                pValue /= i+1
-
-            outputList.append("%1.2g" % pValue)
+            pValue = calculatePValue(sumAll, poissonmean)
+            outline += "\t%1.2g" % pValue
 
 
-        outline = string.join(outputList, "\t")
         print outline
         print >> outfile, outline
 
-    bestCount = 0
-    for shift in sorted(shiftDict):
-        if shiftDict[shift] > bestCount:
-            bestShift = shift
-            bestCount = shiftDict[shift]
 
 
-    return bestShift
+def calculatePValue(numReads, poissonmean):
+    # builds the Poisson probability of observing exactly numReads reads
+    # (poissonmean**n * exp(-poissonmean) / n!) iteratively
+    pValue = math.exp(-poissonmean)
+    # cast to int: normalized read counts can be floats, which xrange rejects with a DeprecationWarning
+    for i in xrange(int(numReads)):
+        pValue *= poissonmean
+        pValue /= i+1
 
 
+    return pValue
+
+
+def getRegionString(region, reportShift):
+    if reportShift:
+        outline = region.printRegionWithShift()
+    else:
+        outline = region.printRegion()
+
+    return outline
 
 
-def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue):
-    footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])]
-    if doDirectionality:
-        footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"])
+
+def getFooter(regionFinder, shiftDict, reportshift, doRevBackground):
+    index = regionFinder.statistics["index"]
+    mIndex = regionFinder.statistics["mIndex"]
+    footerLines = ["#stats:\t%.1f RPM in %d regions" % (regionFinder.statistics["total"], index)]
+    if regionFinder.doDirectionality:
+        footerLines.append("#\t\t%d additional regions failed directionality filter" % regionFinder.statistics["failed"])
 
     if doRevBackground:
         try:
-            percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100)
-        except (ValueError, ZeroDivisionError):
+            percent = min(100. * (float(mIndex)/index), 100.)
+        except ZeroDivisionError:
             percent = 0.
 
             percent = 0.
 
-        footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent))
+        footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, regionFinder.statistics["mTotal"], percent))
+
+    if regionFinder.shiftValue == "auto" and reportshift:
+        bestShift = getBestShiftInDict(shiftDict)
+        footerLines.append("#mode of shift values: %d" % bestShift)
+
+    if regionFinder.statistics["badRegionTrim"] > 0:
+        footerLines.append("#%d regions discarded due to trimming problems" % regionFinder.statistics["badRegionTrim"])
+
+    return string.join(footerLines, "\n")
 
 
-    if shiftValue == "auto" and reportshift:
-        footerList.append("#mode of shift values: %d" % shiftModeValue)
 
 
-    footer = string.join(footerList, "\n")
+def getBestShiftInDict(shiftDict):
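+    # return the shift value observed most often (the mode of the tally)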
+    return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0]
 
 
-    return footer
 
 if __name__ == "__main__":
     main(sys.argv)
index 49745dc0c1361c2a8dbe4736c9c8630396fd93e4..03cade292225b55513b300a0fca2d8f2c4f5d50a 100755 (executable)
@@ -138,7 +138,7 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins
             try:
                 geneinfo = geneinfoDict[gid]
                 symbol = geneinfo[0][0]
-            except KeyError:
+            except (KeyError, IndexError):
                 pass
         else:
             symbol = gid
@@ -154,6 +154,7 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins
                 try:
                     normalizedValue = 100. * binAmount / tagCount
                 except ZeroDivisionError:
+                    #TODO: QForALi - this is the right way to refactor the original code, but I don't think this is the right answer
                     normalizedValue = 100. * binAmount
 
                 binAmount = normalizedValue
@@ -167,4 +168,4 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins
 
 
 if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
\ No newline at end of file
index 5299d27d26c23f8729063eb31f0da390730b5ce5..74e7a0cd817e699af6b398960578f576a030f7e4 100755 (executable)
@@ -68,6 +68,12 @@ def makeParser(usage=""):
     return parser
 
 
+#TODO: Reported user performance issue. Long run times in conditions:
+#    small number of reads ~40-50M
+#    all features on single chromosome
+#
+#    User states this has been a long-standing problem.
+
 def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True,
                            withUniqs=False, withMulti=False, acceptfile=None,
                            cachePages=None, doVerbose=False, extendGenome="", replaceModels=False):
@@ -200,6 +206,7 @@ def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict,
     for line in uniquecounts:
         fields = line.strip().split()
         # add a pseudo-count here to ease calculations below
+        #TODO: figure out why this was done in prior implementation...
         uniqueCountDict[fields[0]] = float(fields[-1]) + 1
 
     uniquecounts.close()
@@ -264,4 +271,4 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
 
 
 if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
\ No newline at end of file
index e95d72693c76df13871da4cf809c99e43c41d36a..4091281c8fe0f5410090b23db7276496fe49d6ed 100755 (executable)
@@ -68,6 +68,7 @@ def doNotProcessLine(line):
 
 def getNovelSNPInfo(genome, snpEntry, hg):
     fields = snpEntry.split()
+    #TODO: refactor naming. is fields[8] rpkm?
     if fields[8].find("N\A") == -1: 
         return snpEntry
     else:
@@ -92,4 +93,4 @@ def getNovelSNPInfo(genome, snpEntry, hg):
 
 
 if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
\ No newline at end of file
index f6071ae444be9ca294ac4c4ff39b50a74e3d0cb4..f5826751fdecdc3643967ea8e04df8331fbf94e3 100755 (executable)
@@ -75,7 +75,7 @@ def makeParser(usage=""):
     forceChr = getConfigBoolOption(configParser, section, "forceChr", False)
     cachePages = getConfigIntOption(configParser, section, "cachePages", 0)
 
-    parser.set_defaults(doSplices=True, forceChr=False, cachePages=0)
+    parser.set_defaults(doSplices=doSplices, forceChr=forceChr, cachePages=cachePages)
 
     return parser
 
index ea347e4aa44f1a21a0b7c54d11b407352ce91b6f..acae47d68c896d14cf7b894e63adac2552483291 100755 (executable)
@@ -36,7 +36,7 @@ def main(argv=None):
     outfilename = args[2]
 
     getfasta(genome, regionfile, outfilename, options.seqsize, options.minHitThresh,
-             options.topRegions, options.maxsize, options.usePeaks, options.hitFile,
+             options.topRegions, options.maxsize, options.usePeaks, options.hitfile,
              options.doCache, options.doCompact)
 
 