X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=findall.py;fp=findall.py;h=ec75efb54f3680c892578a38811c1bd241c37241;hp=d20608f1997e2e6a0fd84d9169c78025485e4b57;hb=77dccd7c98d8cdb60caaf178b1123df71ea662c9;hpb=bc30aca13e5ec397c92e67002fbf7a103130b828 diff --git a/findall.py b/findall.py index d20608f..ec75efb 100755 --- a/findall.py +++ b/findall.py @@ -169,25 +169,14 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= strandfilter=None, combine5p=False): shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings) - - if trimValue is not None: - trimValue = float(trimValue) / 100. - trimString = "%2.1f%s" % ((100. * trimValue), "%") - else: - trimValue = 0.1 - trimString = "10%" - - if not doTrim: - trimString = "none" + doControl = False + if mockfile is not None: + doControl = True if doRevBackground: print "Swapping IP and background to calculate FDR" pValueType = "back" - doControl = False - if mockfile is not None: - doControl = True - doPvalue = True if ptype is not None: ptype = ptype.upper() @@ -208,12 +197,6 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= else: pValueType = "self" - if cachePages is not None: - doCache = True - else: - doCache = False - cachePages = -1 - if withFlag != "": print "restrict to flag = %s" % withFlag @@ -242,6 +225,12 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= stringency = max(stringency, 1.0) writeLog(logfilename, versionString, string.join(sys.argv[1:])) + if cachePages is not None: + doCache = True + else: + doCache = False + cachePages = -1 + if doControl: print "\ncontrol:" mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache) @@ -258,13 +247,22 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= if cachePages > hitRDS.getDefaultCacheSize(): hitRDS.setDBcache(cachePages) + if trimValue is not None: + trimValue = float(trimValue) / 100. + trimString = "%2.1f%s" % ((100. * trimValue), "%") + else: + trimValue = 0.1 + trimString = "10%" + + if not doTrim: + trimString = "none" + if doAppend: fileMode = "a" else: fileMode = "w" outfile = open(outfilename, fileMode) - outfile.write("#ERANGE %s\n" % versionString) if doControl: mockRDSsize = len(mockRDS) / 1000000. @@ -464,7 +462,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio) if foldRatio >= minRatio: - localshift = getBestShiftForRegion(readList, regionStart, regionLength, doWeight=True) + localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True) try: shiftDict[localshift] += 1 except KeyError: @@ -535,7 +533,7 @@ def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti, - normalize, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen, + normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen, shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak, noMulti, doControl, factor, trimValue, outputRegionList=False): @@ -558,7 +556,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, weight = read["weight"] if abs(pos - previousHit) > maxSpacing or pos == maxCoord: sumAll = currentTotalWeight - if normalize: + if normalizeToRPM: sumAll /= rdsSampleSize regionStart = currentHitList[0] @@ -567,7 +565,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen: sumMulti = 0. #first pass uses getFoldRatio on mockRDS as there may not be control - foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalize, referenceSampleSize, sumAll) + foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalizeToRPM, referenceSampleSize, sumAll) if foldRatio >= minRatio: # first pass, with absolute numbers peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue) @@ -577,8 +575,8 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, peakScore = peak.smoothArray[bestPos] numPlus = peak.numPlus shift = peak.shift - numLeft = peak.numLeft - if normalize: + numLeft = peak.numLeftPlus + if normalizeToRPM: peakScore /= rdsSampleSize if doTrim: @@ -605,11 +603,11 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, sumAll = trimPeak.numHits numPlus = trimPeak.numPlus - numLeft = trimPeak.numLeft - if normalize: + numLeft = trimPeak.numLeftPlus + if normalizeToRPM: sumAll /= rdsSampleSize - foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, referenceSampleSize, sumAll, minRatio) + foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio) if outputRegionList: sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True) # just in case it changed, use latest data @@ -619,16 +617,14 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, except: continue - # normalize to RPM - if normalize: + if normalizeToRPM: peakScore /= rdsSampleSize elif outputRegionList: sumMulti = currentTotalWeight - currentUniqReadCount if outputRegionList: - # normalize to RPM - if normalize: + if normalizeToRPM: sumMulti /= rdsSampleSize try: @@ -759,4 +755,4 @@ def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, return footer if __name__ == "__main__": - main(sys.argv) \ No newline at end of file + main(sys.argv)