development checkpoint
[erange.git] / findall.py
index d20608f1997e2e6a0fd84d9169c78025485e4b57..aa820374b994d497fbc4dd336a98b795489f5900 100755 (executable)
@@ -169,25 +169,14 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
             strandfilter=None, combine5p=False):
 
     shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings)
-
-    if trimValue is not None:
-        trimValue = float(trimValue) / 100.
-        trimString = "%2.1f%s" % ((100. * trimValue), "%")
-    else:
-        trimValue = 0.1
-        trimString = "10%"
-
-    if not doTrim:
-        trimString = "none"
+    doControl = False
+    if mockfile is not None:
+        doControl = True
 
     if doRevBackground:
         print "Swapping IP and background to calculate FDR"
         pValueType = "back"
 
-    doControl = False
-    if mockfile is not None:
-        doControl = True
-
     doPvalue = True
     if ptype is not None:
         ptype = ptype.upper()
@@ -208,12 +197,6 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     else:
         pValueType = "self"
 
-    if cachePages is not None:
-        doCache = True
-    else:
-        doCache = False
-        cachePages = -1
-
     if withFlag != "":
         print "restrict to flag = %s" % withFlag
 
@@ -242,6 +225,13 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
 
     stringency = max(stringency, 1.0)
     writeLog(logfilename, versionString, string.join(sys.argv[1:]))
+    if cachePages is not None:
+        doCache = True
+    else:
+        doCache = False
+        cachePages = -1
+
+    mockRDS = None
     if doControl:
         print "\ncontrol:" 
         mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
@@ -258,18 +248,28 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     if cachePages > hitRDS.getDefaultCacheSize():
         hitRDS.setDBcache(cachePages)
 
+    if trimValue is not None:
+        trimValue = float(trimValue) / 100.
+        trimString = "%2.1f%s" % ((100. * trimValue), "%")
+    else:
+        trimValue = 0.1
+        trimString = "10%"
+
+    if not doTrim:
+        trimString = "none"
+
     if doAppend:
         fileMode = "a"
     else:
         fileMode = "w"
 
     outfile = open(outfilename, fileMode)
-
     outfile.write("#ERANGE %s\n" % versionString)
     if doControl:
         mockRDSsize = len(mockRDS) / 1000000.
         controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
     else:
+        mockRDSsize = 0
         controlSampleString = " none"
 
     hitRDSsize = len(hitRDS) / 1000000.
@@ -320,7 +320,11 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     hitChromList = hitRDS.getChromosomes()
     if doControl:
         mockChromList = mockRDS.getChromosomes()
+    else:
+        mockChromList = []
 
+    mockSampleSize = 0
+    hitSampleSize = 0
     if normalize:
         if doControl:
             mockSampleSize = mockRDSsize
@@ -362,27 +366,28 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
             continue
 
         #now do background swapping the two samples around
-        print "calculating background..."
-        backgroundTrimValue = 1/20.
-        backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
-                                                                       chromosome, useMulti, normalize, maxSpacing,
-                                                                       doDirectionality, doTrim, minHits, minRatio,
-                                                                       readlen, shiftValue, minPeak, minPlusRatio,
-                                                                       maxPlusRatio, leftPlusRatio, listPeak, noMulti,
-                                                                       doControl, factor, backgroundTrimValue)
-
-        statistics["mIndex"] += backgroundRegionStats["index"]
-        statistics["mTotal"] += backgroundRegionStats["total"]
-        statistics["failed"] += backgroundRegionStats["failed"]
-        print statistics["mIndex"], statistics["mTotal"]
-        if doPvalue:
-            if pValueType == "self":
-                p, poissonmean = calculatePValue(allRegionWeights)
-            else:
-                p, poissonmean = calculatePValue(backgroundRegionWeights)
+        if mockRDS is not None:
+            print "calculating background..."
+            backgroundTrimValue = 1/20.
+            backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
+                                                                           chromosome, useMulti, normalize, maxSpacing,
+                                                                           doDirectionality, doTrim, minHits, minRatio,
+                                                                           readlen, shiftValue, minPeak, minPlusRatio,
+                                                                           maxPlusRatio, leftPlusRatio, listPeak, noMulti,
+                                                                           doControl, factor, backgroundTrimValue)
+
+            statistics["mIndex"] += backgroundRegionStats["index"]
+            statistics["mTotal"] += backgroundRegionStats["total"]
+            statistics["failed"] += backgroundRegionStats["failed"]
+            print statistics["mIndex"], statistics["mTotal"]
+            if doPvalue:
+                if pValueType == "self":
+                    p, poissonmean = calculatePValue(allRegionWeights)
+                else:
+                    p, poissonmean = calculatePValue(backgroundRegionWeights)
 
-        print headline
-        shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
+            print headline
+            shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
 
     footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
     print footer
@@ -464,7 +469,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
                 foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio)
 
                 if foldRatio >= minRatio:
-                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True)
+                    localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True)
                     try:
                         shiftDict[localshift] += 1
                     except KeyError:
@@ -516,7 +521,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
 
 
 def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
-    if doControl:
+    if doControl and rds is not None:
         foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
     else:
         foldRatio = minRatio
@@ -535,7 +540,7 @@ def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize
 
 
 def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti,
-                normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
+                normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen,
                 shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak,
                 noMulti, doControl, factor, trimValue, outputRegionList=False):
 
@@ -558,7 +563,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
         weight = read["weight"]
         if abs(pos - previousHit) > maxSpacing or pos == maxCoord:
             sumAll = currentTotalWeight
-            if normalize:
+            if normalizeToRPM:
                 sumAll /= rdsSampleSize
 
             regionStart = currentHitList[0]
@@ -566,8 +571,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
             regionWeights.append(int(sumAll))
             if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
                 sumMulti = 0.
-                #first pass uses getFoldRatio on mockRDS as there may not be control
-                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll)
+                foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
                 if foldRatio >= minRatio:
                     # first pass, with absolute numbers
                     peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
@@ -577,8 +581,8 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                     peakScore = peak.smoothArray[bestPos]
                     numPlus = peak.numPlus
                     shift = peak.shift
-                    numLeft = peak.numLeft
-                    if normalize:
+                    numLeft = peak.numLeftPlus
+                    if normalizeToRPM:
                         peakScore /= rdsSampleSize
 
                     if doTrim:
@@ -605,11 +609,11 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
 
                         sumAll = trimPeak.numHits
                         numPlus = trimPeak.numPlus
-                        numLeft = trimPeak.numLeft
-                        if normalize:
+                        numLeft = trimPeak.numLeftPlus
+                        if normalizeToRPM:
                             sumAll /= rdsSampleSize
 
-                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio)
+                        foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
                         if outputRegionList:
                             sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True)
                         # just in case it changed, use latest data
@@ -619,16 +623,14 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
                         except:
                             continue
 
-                        # normalize to RPM
-                        if normalize:
+                        if normalizeToRPM:
                             peakScore /= rdsSampleSize
 
                     elif outputRegionList:
                         sumMulti = currentTotalWeight - currentUniqReadCount
 
                     if outputRegionList:
-                        # normalize to RPM
-                        if normalize:
+                        if normalizeToRPM:
                             sumMulti /= rdsSampleSize
 
                         try:
@@ -723,7 +725,7 @@ def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, repor
                 pValue *= poissonmean
                 pValue /= i+1
 
-            outputList.append("%1.2f" % pValue)
+            outputList.append("%1.2g" % pValue)
 
         outline = string.join(outputList, "\t")
         print outline
@@ -759,4 +761,4 @@ def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift,
     return footer
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)