Release version for Erange 4.0a
[erange.git] / findall.py
index d20608f1997e2e6a0fd84d9169c78025485e4b57..ec75efb54f3680c892578a38811c1bd241c37241 100755 (executable)
@@ -169,25 +169,14 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
             strandfilter=None, combine5p=False):
 
     shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
-
-    if trimValue is not None:
-        trimValue = float(trimValue) / 100.
-        trimString = "%2.1f%s" % ((100. * trimValue), "%")
-    else:
-        trimValue = 0.1
-        trimString = "10%"
-
-    if not doTrim:
-        trimString = "none"
+    doControl = False
+    if mockfile is not None:
+        doControl = True
 
     if doRevBackground:
         print "Swapping IP and background to calculate FDR"
         pValueType = "back"
 
-    doControl = False
-    if mockfile is not None:
-        doControl = True
-
     doPvalue = True
     if ptype is not None:
         ptype = ptype.upper()
@@ -208,12 +197,6 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     else:
         pValueType = "self"
 
-    if cachePages is not None:
-        doCache = True
-    else:
-        doCache = False
-        cachePages = -1
-
     if withFlag != "":
         print "restrict to flag = %s" % withFlag
 
@@ -242,6 +225,12 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
 
     stringency = max(stringency, 1.0)
     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+    if cachePages is not None:
+        doCache = True
+    else:
+        doCache = False
+        cachePages = -1
+
     if doControl:
         print "\ncontrol:" 
         mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
@@ -258,13 +247,22 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
+    if trimValue is not None:
+        trimValue = float(trimValue) / 100.
+        trimString = "%2.1f%s" % ((100. * trimValue), "%")
+    else:
+        trimValue = 0.1
+        trimString = "10%"
+
+    if not doTrim:
+        trimString = "none"
+
     if doAppend:
         fileMode = "a"
     else:
         fileMode = "w"
 
     outfile = open(outfilename, fileMode)
-
     outfile.write("#ERANGE %s\n" % versionString)
     if doControl:
         mockRDSsize = len(mockRDS) / 1000000.
@@ -464,7 +462,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
                 foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
 
                 if foldRatio >= minRatio:
-                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True)
+                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True)
                     try:
                         shiftDict[localshift] += 1
                     except KeyError:
@@ -535,7 +533,7 @@ def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize
 
 
 def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
-                normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
+                normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
                 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
                 noMulti, doControl, factor, trimValue, outputRegionList=False):
 
@@ -558,7 +556,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
         weight = read["weight"]
         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
             sumAll = currentTotalWeight
-            if normalize:
+            if normalizeToRPM:
                 sumAll /= rdsSampleSize
 
             regionStart = currentHitList[0]
@@ -567,7 +565,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
             if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
                 sumMulti = 0.
                 #first pass uses getFoldRatio on mockRDS as there may not be control
-                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
+                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalizeToRPM, referenceSampleSize, sumAll)
                 if foldRatio >= minRatio:
                     # first pass, with absolute numbers
                     peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
@@ -577,8 +575,8 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                     peakScore = peak.smoothArray[bestPos]
                     numPlus = peak.numPlus
                     shift = peak.shift
-                    numLeft = peak.numLeft
-                    if normalize:
+                    numLeft = peak.numLeftPlus
+                    if normalizeToRPM:
                         peakScore /= rdsSampleSize
 
                     if doTrim:
@@ -605,11 +603,11 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
 
                         sumAll = trimPeak.numHits
                         numPlus = trimPeak.numPlus
-                        numLeft = trimPeak.numLeft
-                        if normalize:
+                        numLeft = trimPeak.numLeftPlus
+                        if normalizeToRPM:
                             sumAll /= rdsSampleSize
 
-                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio)
+                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
                         if outputRegionList:
                             sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
                         # just in case it changed, use latest data
@@ -619,16 +617,14 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                         except:
                             continue
 
-                        # normalize to RPM
-                        if normalize:
+                        if normalizeToRPM:
                             peakScore /= rdsSampleSize
 
                     elif outputRegionList:
                         sumMulti = currentTotalWeight - currentUniqReadCount
 
                     if outputRegionList:
-                        # normalize to RPM
-                        if normalize:
+                        if normalizeToRPM:
                             sumMulti /= rdsSampleSize
 
                         try:
@@ -759,4 +755,4 @@ def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift,
     return footer
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)