X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=findall.py;h=aa820374b994d497fbc4dd336a98b795489f5900;hp=d20608f1997e2e6a0fd84d9169c78025485e4b57;hb=a3c0e60eb30c924232d7baaa348a15c5554e3864;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/findall.py b/findall.py index d20608f..aa82037 100755 --- a/findall.py +++ b/findall.py @@ -169,25 +169,14 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= strandfilter=None, combine5p=False): shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings) - - if trimValue is not None: - trimValue = float(trimValue) / 100. - trimString = "%2.1f%s" % ((100. * trimValue), "%") - else: - trimValue = 0.1 - trimString = "10%" - - if not doTrim: - trimString = "none" + doControl = False + if mockfile is not None: + doControl = True if doRevBackground: print "Swapping IP and background to calculate FDR" pValueType = "back" - doControl = False - if mockfile is not None: - doControl = True - doPvalue = True if ptype is not None: ptype = ptype.upper() @@ -208,12 +197,6 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= else: pValueType = "self" - if cachePages is not None: - doCache = True - else: - doCache = False - cachePages = -1 - if withFlag != "": print "restrict to flag = %s" % withFlag @@ -242,6 +225,13 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= stringency = max(stringency, 1.0) writeLog(logfilename, versionString, string.join(sys.argv[1:])) + if cachePages is not None: + doCache = True + else: + doCache = False + cachePages = -1 + + mockRDS = None if doControl: print "\ncontrol:" mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache) @@ -258,18 +248,28 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= if cachePages > hitRDS.getDefaultCacheSize(): hitRDS.setDBcache(cachePages) + if trimValue is not None: + trimValue = float(trimValue) / 100. + trimString = "%2.1f%s" % ((100. * trimValue), "%") + else: + trimValue = 0.1 + trimString = "10%" + + if not doTrim: + trimString = "none" + if doAppend: fileMode = "a" else: fileMode = "w" outfile = open(outfilename, fileMode) - outfile.write("#ERANGE %s\n" % versionString) if doControl: mockRDSsize = len(mockRDS) / 1000000. controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize) else: + mockRDSsize = 0 controlSampleString = " none" hitRDSsize = len(hitRDS) / 1000000. @@ -320,7 +320,11 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= hitChromList = hitRDS.getChromosomes() if doControl: mockChromList = mockRDS.getChromosomes() + else: + mockChromList = [] + mockSampleSize = 0 + hitSampleSize = 0 if normalize: if doControl: mockSampleSize = mockRDSsize @@ -362,27 +366,28 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= continue #now do background swapping the two samples around - print "calculating background..." - backgroundTrimValue = 1/20. - backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, - chromosome, useMulti, normalize, maxSpacing, - doDirectionality, doTrim, minHits, minRatio, - readlen, shiftValue, minPeak, minPlusRatio, - maxPlusRatio, leftPlusRatio, listPeak, noMulti, - doControl, factor, backgroundTrimValue) - - statistics["mIndex"] += backgroundRegionStats["index"] - statistics["mTotal"] += backgroundRegionStats["total"] - statistics["failed"] += backgroundRegionStats["failed"] - print statistics["mIndex"], statistics["mTotal"] - if doPvalue: - if pValueType == "self": - p, poissonmean = calculatePValue(allRegionWeights) - else: - p, poissonmean = calculatePValue(backgroundRegionWeights) + if mockRDS is not None: + print "calculating background..." + backgroundTrimValue = 1/20. + backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, + chromosome, useMulti, normalize, maxSpacing, + doDirectionality, doTrim, minHits, minRatio, + readlen, shiftValue, minPeak, minPlusRatio, + maxPlusRatio, leftPlusRatio, listPeak, noMulti, + doControl, factor, backgroundTrimValue) + + statistics["mIndex"] += backgroundRegionStats["index"] + statistics["mTotal"] += backgroundRegionStats["total"] + statistics["failed"] += backgroundRegionStats["failed"] + print statistics["mIndex"], statistics["mTotal"] + if doPvalue: + if pValueType == "self": + p, poissonmean = calculatePValue(allRegionWeights) + else: + p, poissonmean = calculatePValue(backgroundRegionWeights) - print headline - shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue) + print headline + shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue) footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue) print footer @@ -464,7 +469,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio) if foldRatio >= minRatio: - localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True) + localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True) try: shiftDict[localshift] += 1 except KeyError: @@ -516,7 +521,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio): - if doControl: + if doControl and rds is not None: foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll) else: foldRatio = minRatio @@ -535,7 +540,7 @@ def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti, - normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen, + normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen, shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak, noMulti, doControl, factor, trimValue, outputRegionList=False): @@ -558,7 +563,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, weight = read["weight"] if abs(pos - previousHit) > maxSpacing or pos == maxCoord: sumAll = currentTotalWeight - if normalize: + if normalizeToRPM: sumAll /= rdsSampleSize regionStart = currentHitList[0] @@ -566,8 +571,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, regionWeights.append(int(sumAll)) if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen: sumMulti = 0. - #first pass uses getFoldRatio on mockRDS as there may not be control - foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll) + foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio) if foldRatio >= minRatio: # first pass, with absolute numbers peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue) @@ -577,8 +581,8 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, peakScore = peak.smoothArray[bestPos] numPlus = peak.numPlus shift = peak.shift - numLeft = peak.numLeft - if normalize: + numLeft = peak.numLeftPlus + if normalizeToRPM: peakScore /= rdsSampleSize if doTrim: @@ -605,11 +609,11 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, sumAll = trimPeak.numHits numPlus = trimPeak.numPlus - numLeft = trimPeak.numLeft - if normalize: + numLeft = trimPeak.numLeftPlus + if normalizeToRPM: sumAll /= rdsSampleSize - foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio) + foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio) if outputRegionList: sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True) # just in case it changed, use latest data @@ -619,16 +623,14 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, except: continue - # normalize to RPM - if normalize: + if normalizeToRPM: peakScore /= rdsSampleSize elif outputRegionList: sumMulti = currentTotalWeight - currentUniqReadCount if outputRegionList: - # normalize to RPM - if normalize: + if normalizeToRPM: sumMulti /= rdsSampleSize try: @@ -723,7 +725,7 @@ def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, repor pValue *= poissonmean pValue /= i+1 - outputList.append("%1.2f" % pValue) + outputList.append("%1.2g" % pValue) outline = string.join(outputList, "\t") print outline @@ -759,4 +761,4 @@ def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, return footer if __name__ == "__main__": - main(sys.argv) \ No newline at end of file + main(sys.argv)