From 03f1e0b3bab22d517ad75b9af4d54e8fcb8540fb Mon Sep 17 00:00:00 2001 From: Sean Upchurch Date: Mon, 13 Jun 2011 10:21:56 -0700 Subject: [PATCH] Rewrite of findall.py to clean code. Configuration tested using erange3.2 as basis. Other minor code cleanups. --- ReadDataset.py | 76 +-- analyzego.py | 17 +- commoncode.py | 177 +++--- findall.py | 1143 ++++++++++++++++++++++--------------- geneLocusBins.py | 5 +- geneMrnaCountsWeighted.py | 9 +- getNovelSNPs.py | 3 +- getSNPs.py | 2 +- getfasta.py | 2 +- 9 files changed, 802 insertions(+), 632 deletions(-) diff --git a/ReadDataset.py b/ReadDataset.py index 850a5ec..c9d2a0b 100644 --- a/ReadDataset.py +++ b/ReadDataset.py @@ -207,15 +207,27 @@ class ReadDataset(): return sql + def getMemCursor(self): + """ returns a cursor to memory database for low-level (SQL) + access to the data. + """ + return self.memcon.cursor() + + + def getFileCursor(self): + """ returns a cursor to file database for low-level (SQL) + access to the data. + """ + return self.dbcon.cursor() + + def hasIndex(self): - """ check whether the RDS file has at least one index. + """ return True if the RDS file has at least one index. """ stmt = "select count(*) from sqlite_master where type='index'" count = int(self.execute(stmt, returnResults=True)[0][0]) - if count > 0: - return True - return False + return count > 0 def initializeTables(self, dbConnection, cache=100000): @@ -237,20 +249,6 @@ class ReadDataset(): dbConnection.commit() - def getFileCursor(self): - """ returns a cursor to file database for low-level (SQL) - access to the data. - """ - return self.dbcon.cursor() - - - def getMemCursor(self): - """ returns a cursor to memory database for low-level (SQL) - access to the data. - """ - return self.memcon.cursor() - - def getMetadata(self, valueName=""): """ returns a dictionary of metadata. """ @@ -309,7 +307,7 @@ class ReadDataset(): def getChromosomes(self, table="uniqs", fullChrom=True): - """ returns a list of distinct chromosomes in table. + """ returns a sorted list of distinct chromosomes in table. """ statement = "select distinct chrom from %s" % table sql = self.getSqlCursor() @@ -330,7 +328,7 @@ class ReadDataset(): return results - def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True, + def getMaxCoordinate(self, chrom, doUniqs=True, doMulti=False, doSplices=False): """ returns the maximum coordinate for reads on a given chromosome. """ @@ -347,9 +345,6 @@ class ReadDataset(): multiMax = self.getMaxStartCoordinateInTable(chrom, "multi") maxCoord = max(multiMax, maxCoord) - if verbose: - print "%s maxCoord: %d" % (chrom, maxCoord) - return maxCoord @@ -375,9 +370,9 @@ class ReadDataset(): and which can be restricted by chromosome or custom-flag. Returns unique reads by default, but can return multireads with doMulti set to True. - - Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict + """ + #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike) if findallOptimize: @@ -421,27 +416,16 @@ class ReadDataset(): if findallOptimize: if self.memBacked: self.memcon.row_factory = None - sql = self.memcon.cursor() else: self.dbcon.row_factory = None - sql = self.dbcon.cursor() stmt.append("order by start") elif readIDDict: - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - stmt.append("order by readID, start") else: - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - stmt.append("order by chrom, start") + sql = self.getSqlCursor() sqlQuery = string.join(stmt) sql.execute(sqlQuery) @@ -602,10 +586,7 @@ class ReadDataset(): whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True) selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID" selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch) - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() + sql = self.getSqlCursor() stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery) sql.execute(stmt) @@ -718,7 +699,7 @@ class ReadDataset(): def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"): - """ returns the number of row in the uniqs table. + """ returns the number of row in the specified table. """ whereClause = [] count = 0 @@ -741,10 +722,7 @@ class ReadDataset(): else: whereQuery = "" - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() + sql = self.getSqlCursor() if distinct: sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery)) @@ -803,11 +781,7 @@ class ReadDataset(): limitPart = "LIMIT %d" % limit sqlQuery = "%s group by readID %s" % (selectPart, limitPart) - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - + sql = self.getSqlCursor() sql.execute(sqlQuery) result = sql.fetchall() diff --git a/analyzego.py b/analyzego.py index 6542b1b..f8ff001 100755 --- a/analyzego.py +++ b/analyzego.py @@ -6,7 +6,7 @@ except: import sys import optparse -from commoncode import getGeneInfoDict, getConfigParser, getConfigOption +from commoncode import getGeneInfoDict, getConfigParser, getConfigOption, getConfigIntOption from cistematic.cisstat.analyzego import calculateGOStats print "analyzego: version 2.2" @@ -35,7 +35,7 @@ def main(argv=None): infilename = args[1] prefix = args[2] - analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID) + analyzeGOFromFile(genome, infilename, prefix, options.translateGene, fieldID, numTests=options.numTests) def makeParser(usage=""): @@ -44,24 +44,27 @@ def makeParser(usage=""): help="translate gene") parser.add_option("--field", type="int", dest="fieldID", help="column containing gene ID/Name") + parser.add_option("--trials", type="int", dest="numTests", + help="column containing gene ID/Name") configParser = getConfigParser() section = "analyzego" translateGene = getConfigOption(configParser, section, "translateGene", False) fieldID = getConfigOption(configParser, section, "fieldID", None) + numTests = getConfigIntOption(configParser, section, "numTests", 1) - parser.set_defaults(translateGene=translateGene, fieldID=fieldID) + parser.set_defaults(translateGene=translateGene, fieldID=fieldID, numTests=numTests) return parser -def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1): +def analyzeGOFromFile(genome, infilename, prefix, translateGene=False, fieldID=1, numTests=1): infile = open(infilename) - analyzeGO(genome, infile, prefix, translateGene=False, fieldID=1) + analyzeGO(genome, infile, prefix, translateGene, fieldID, numTests) infile.close() -def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1): +def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1, numTests=1): if translateGene: symbolToGidDict = getSymbolDict(genome) @@ -88,7 +91,7 @@ def analyzeGO(genome, geneInfoList, prefix, translateGene=False, fieldID=1): locusList.append((genome, gID)) if len(locusList) > 0: - calculateGOStats(locusList, prefix) + calculateGOStats(locusList, prefix, trials=numTests) def getSymbolDict(genome): diff --git a/commoncode.py b/commoncode.py index f526daa..6ef4edf 100755 --- a/commoncode.py +++ b/commoncode.py @@ -1,8 +1,3 @@ -# -# commoncode.py -# ENRAGE -# - import ConfigParser import os import string @@ -192,6 +187,11 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, mergeCount = 0 chromField = int(chromField) count = 0 + #TODO: Current algorithm processes input file line by line and compares with prior lines. Problem is it + # exits at the first merge. This is not a problem when the input is sorted by start position, but in + # the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there + # will be no merge with A as B is needed to bridge the two. When it comes time to process B it will + # be merged with A but that will exit the loop and the merge with C will be missed. for regionEntry in regionList: if regionEntry[0] == "#": if "pvalue" in regionEntry: @@ -372,10 +372,11 @@ def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75) for read in readList: currentpos = read["start"] - start if read["sense"] == "+": - currentpos += testShift + senseModifier = 1 else: - currentpos -= testShift + senseModifier = -1 + currentpos += testShift * senseModifier if (currentpos < 1) or (currentpos >= length): continue @@ -384,16 +385,12 @@ def getBestShiftForRegion(readList, start, length, useWeight=False, maxShift=75) else: weight = 1.0 - if read["sense"] == "+": - shiftArray[currentpos] += weight - else: - shiftArray[currentpos] -= weight + shiftArray[currentpos] += weight * senseModifier currentScore = 0 for score in shiftArray: currentScore += abs(score) - print currentScore if currentScore < lowestScore: bestShift = testShift lowestScore = currentScore @@ -494,40 +491,8 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo= if restrictGID and gid not in restrictList: continue - featureList = featuresDict[gid] - newFeatureList = [] - isPseudo = False - for (ftype, chrom, start, stop, sense) in featureList: - if ftype == "PSEUDO": - isPseudo = True - - if (start, stop, ftype) not in newFeatureList: - notContained = True - containedList = [] - for (fstart, fstop, ftype2) in newFeatureList: - if start >= fstart and stop <= fstop: - notContained = False - - if start < fstart and stop > fstop: - containedList.append((fstart, fstop)) - - if len(containedList) > 0: - newFList = [] - notContained = True - for (fstart, fstop, ftype2) in newFeatureList: - if (fstart, fstop) not in containedList: - newFList.append((fstart, fstop, ftype2)) - if start >= fstart and stop <= fstop: - notContained = False - - newFeatureList = newFList - if notContained: - newFeatureList.append((start, stop, ftype)) - - if ignorePseudo and isPseudo: - continue - - if chrom not in featuresByChromDict: + newFeatureList, chrom, sense = getNewFeatureList(featuresDict[gid]) + if newFeatureList and chrom not in featuresByChromDict: featuresByChromDict[chrom] = [] for (start, stop, ftype) in newFeatureList: @@ -560,6 +525,39 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo= return featuresByChromDict +def getNewFeatureList(featureList, ignorePseudo=False): + newFeatureList = [] + for (featureType, chrom, start, stop, sense) in featureList: + if featureType == "PSEUDO" and ignorePseudo: + return [], chrom, sense + + if (start, stop, featureType) not in newFeatureList: + notContained = True + containedList = [] + for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList: + if start >= newFeatureStart and stop <= newFeatureStop: + notContained = False + + if start < newFeatureStart and stop > newFeatureStop: + containedList.append((newFeatureStart, newFeatureStop)) + + if len(containedList) > 0: + newFList = [] + notContained = True + for (newFeatureStart, newFeatureStop, featureType2) in newFeatureList: + if (newFeatureStart, newFeatureStop) not in containedList: + newFList.append((newFeatureStart, newFeatureStop, featureType2)) + if start >= newFeatureStart and stop <= newFeatureStop: + notContained = False + + newFeatureList = newFList + + if notContained: + newFeatureList.append((start, stop, featureType)) + + return newFeatureList, chrom, sense + + def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True, additionalRegionsDict={}, ignorePseudo=False, upstreamSpanTSS=False, lengthCDS=0, keepSense=False, adjustToNeighbor=True): @@ -573,23 +571,7 @@ def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True, lengthCDS < 0bp, return only the last X bp from CDS. """ locusByChromDict = {} - if upstream == 0 and downstream == 0 and not useCDS: - print "getLocusByChromDict: asked for no sequence - returning empty dict" - return locusByChromDict - elif upstream > 0 and downstream > 0 and not useCDS: - print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict" - return locusByChromDict - elif lengthCDS != 0 and not useCDS: - print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict" - return locusByChromDict - elif upstreamSpanTSS and lengthCDS != 0: - print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict" - return locusByChromDict - elif lengthCDS > 0 and downstream > 0: - print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict" - return locusByChromDict - elif lengthCDS < 0 and upstream > 0: - print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict" + if usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS): return locusByChromDict genomeName = genome.genome @@ -702,6 +684,29 @@ def getLocusByChromDict(genome, upstream=0, downstream=0, useCDS=True, return locusByChromDict +def usingBadLocusParameters(upstream, downstream, useCDS, lengthCDS, upstreamSpanTSS): + if upstream == 0 and downstream == 0 and not useCDS: + print "getLocusByChromDict: asked for no sequence - returning empty dict" + return True + elif upstream > 0 and downstream > 0 and not useCDS: + print "getLocusByChromDict: asked for only upstream and downstream - returning empty dict" + return True + elif lengthCDS != 0 and not useCDS: + print "getLocusByChromDict: asked for partial CDS but not useCDS - returning empty dict" + return True + elif upstreamSpanTSS and lengthCDS != 0: + print "getLocusByChromDict: asked for TSS spanning and partial CDS - returning empty dict" + return True + elif lengthCDS > 0 and downstream > 0: + print "getLocusByChromDict: asked for discontinuous partial CDS from start and downstream - returning empty dict" + return True + elif lengthCDS < 0 and upstream > 0: + print "getLocusByChromDict: asked for discontinuous partial CDS from stop and upstream - returning empty dict" + return True + + return False + + def getGeneFeatures(genome, additionalRegionsDict): featuresDict = genome.getallGeneFeatures() if len(additionalRegionsDict) > 0: @@ -805,37 +810,25 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[], else: regionBinLength = binLength - startdist = tagStart - start if rsense == "F": - # we are relying on python's integer division quirk - binID = startdist / regionBinLength - if (fixedFirstBin > 0) and (startdist < fixedFirstBin): - binID = 0 - elif fixedFirstBin > 0: - binID = 1 - - if binID >= bins: - binID = bins - 1 - - try: - regionsBins[regionID][binID] += normalizedTag * weight - except KeyError: - print "%s %s" % (regionID, str(binID)) + positionInRegion = tagStart - start else: - rdist = rlen - startdist - binID = rdist / regionBinLength - if (fixedFirstBin > 0) and (rdist < fixedFirstBin): - binID = 0 - elif fixedFirstBin > 0: - binID = 1 - - if binID >= bins: - binID = bins - 1 - - try: - regionsBins[regionID][binID] += normalizedTag * weight - except KeyError: - print "%s %s" % (regionID, str(binID)) + positionInRegion = start + rlen - tagStart + + # we are relying on python's integer division quirk + binID = positionInRegion / regionBinLength + if (fixedFirstBin > 0) and (positionInRegion < fixedFirstBin): + binID = 0 + elif fixedFirstBin > 0: + binID = 1 + + if binID >= bins: + binID = bins - 1 + + try: + regionsBins[regionID][binID] += normalizedTag * weight + except KeyError: + print "%s %s" % (regionID, str(binID)) stopPoint = stop diff --git a/findall.py b/findall.py index aa82037..663d7e0 100755 --- a/findall.py +++ b/findall.py @@ -49,14 +49,104 @@ import sys import math import string import optparse +import operator from commoncode import writeLog, findPeak, getBestShiftForRegion, getConfigParser, getConfigOption, getConfigIntOption, getConfigFloatOption, getConfigBoolOption import ReadDataset import Region -versionString = "findall: version 3.2" +versionString = "findall: version 3.2.1" print versionString +class RegionDirectionError(Exception): + pass + + +class RegionFinder(): + def __init__(self, minRatio, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, strandfilter, minHits, trimValue, doTrim, doDirectionality, + shiftValue, maxSpacing): + + self.statistics = {"index": 0, + "total": 0, + "mIndex": 0, + "mTotal": 0, + "failed": 0, + "badRegionTrim": 0 + } + + self.controlRDSsize = 1 + self.sampleRDSsize = 1 + self.minRatio = minRatio + self.minPeak = minPeak + self.leftPlusRatio = leftPlusRatio + self.stranded = "both" + if strandfilter == "plus": + self.stranded = "+" + minPlusRatio = 0.9 + maxPlusRatio = 1.0 + elif strandfilter == "minus": + self.stranded = "-" + minPlusRatio = 0.0 + maxPlusRatio = 0.1 + + if minRatio < minPeak: + self.minPeak = minRatio + + self.minPlusRatio = minPlusRatio + self.maxPlusRatio = maxPlusRatio + self.strandfilter = strandfilter + self.minHits = minHits + self.trimValue = trimValue + self.doTrim = doTrim + self.doDirectionality = doDirectionality + + if self.doTrim: + self.trimString = string.join(["%2.1f" % (100. * self.trimValue), "%"], "") + else: + self.trimString = "none" + + self.shiftValue = shiftValue + self.maxSpacing = maxSpacing + + + def useRNASettings(self): + self.shiftValue = 0 + self.doTrim = False + self.doDirectionality = False + + + def printOptionsSummary(self, listPeak, useMulti, doCache, pValueType): + + print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (self.doDirectionality, listPeak, not useMulti, doCache) + print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded) + try: + print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType) + except: + print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType) + + + def getAnalysisDescription(self, hitfile, controlfile, doControl, + withFlag, listPeak, useMulti, doCache, pValueType): + + description = ["#ERANGE %s" % versionString] + if doControl: + description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample:\t%s (%.1f M reads)" % (hitfile, self.sampleRDSsize, controlfile, self.controlRDSsize)) + else: + description.append("#enriched sample:\t%s (%.1f M reads)\n#control sample: none" % (hitfile, self.sampleRDSsize)) + + if withFlag != "": + description.append("#restrict to Flag = %s" % withFlag) + + description.append("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s" % (self.doDirectionality, listPeak, not useMulti, doCache)) + description.append("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s" % (self.maxSpacing, self.minHits, self.minRatio, self.minPeak, self.trimString, self.stranded)) + try: + description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%d pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)) + except: + description.append("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (self.minPlusRatio, self.maxPlusRatio, self.leftPlusRatio, self.shiftValue, pValueType)) + + return string.join(description, "\n") + + def usage(): print __doc__ @@ -76,12 +166,32 @@ def main(argv=None): hitfile = args[1] outfilename = args[2] - findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, options.shift, - options.stringency, options.noshift, options.autoshift, options.reportshift, + shiftValue = 0 + + if options.autoshift: + shiftValue = "auto" + + if options.shift is not None: + try: + shiftValue = int(options.shift) + except ValueError: + if options.shift == "learn": + shiftValue = "learn" + + if options.noshift: + shiftValue = 0 + + if options.doAppend: + outputMode = "a" + else: + outputMode = "w" + + findall(factor, hitfile, outfilename, options.minHits, options.minRatio, options.maxSpacing, options.listPeak, shiftValue, + options.stringency, options.reportshift, options.minPlusRatio, options.maxPlusRatio, options.leftPlusRatio, options.minPeak, options.normalize, options.logfilename, options.withFlag, options.doDirectionality, - options.trimValue, options.doTrim, options.doAppend, options.rnaSettings, - options.cachePages, options.ptype, options.mockfile, options.doRevBackground, options.noMulti, + options.trimValue, options.doTrim, outputMode, options.rnaSettings, + options.cachePages, options.ptype, options.controlfile, options.doRevBackground, options.useMulti, options.strandfilter, options.combine5p) @@ -89,7 +199,7 @@ def makeParser(): usage = __doc__ parser = optparse.OptionParser(usage=usage) - parser.add_option("--control", dest="mockfile") + parser.add_option("--control", dest="controlfile") parser.add_option("--minimum", type="float", dest="minHits") parser.add_option("--ratio", type="float", dest="minRatio") parser.add_option("--spacing", type="int", dest="maxSpacing") @@ -99,7 +209,7 @@ def makeParser(): parser.add_option("--noshift", action="store_true", dest="noShift") parser.add_option("--autoshift", action="store_true", dest="autoshift") parser.add_option("--reportshift", action="store_true", dest="reportshift") - parser.add_option("--nomulti", action="store_true", dest="noMulti") + parser.add_option("--nomulti", action="store_false", dest="useMulti") parser.add_option("--minPlus", type="float", dest="minPlusRatio") parser.add_option("--maxPlus", type="float", dest="maxPlusRatio") parser.add_option("--leftPlus", type="float", dest="leftPlusRatio") @@ -137,16 +247,16 @@ def makeParser(): logfilename = getConfigOption(configParser, section, "logfilename", "findall.log") withFlag = getConfigOption(configParser, section, "withFlag", "") doDirectionality = getConfigBoolOption(configParser, section, "doDirectionality", True) - trimValue = getConfigOption(configParser, section, "trimValue", None) + trimValue = getConfigFloatOption(configParser, section, "trimValue", 0.1) doTrim = getConfigBoolOption(configParser, section, "doTrim", True) doAppend = getConfigBoolOption(configParser, section, "doAppend", False) rnaSettings = getConfigBoolOption(configParser, section, "rnaSettings", False) cachePages = getConfigOption(configParser, section, "cachePages", None) - ptype = getConfigOption(configParser, section, "ptype", None) - mockfile = getConfigOption(configParser, section, "mockfile", None) + ptype = getConfigOption(configParser, section, "ptype", "") + controlfile = getConfigOption(configParser, section, "controlfile", None) doRevBackground = getConfigBoolOption(configParser, section, "doRevBackground", False) - noMulti = getConfigBoolOption(configParser, section, "noMulti", False) - strandfilter = getConfigOption(configParser, section, "strandfilter", None) + useMulti = getConfigBoolOption(configParser, section, "useMulti", True) + strandfilter = getConfigOption(configParser, section, "strandfilter", "") combine5p = getConfigBoolOption(configParser, section, "combine5p", False) parser.set_defaults(minHits=minHits, minRatio=minRatio, maxSpacing=maxSpacing, listPeak=listPeak, shift=shift, @@ -154,611 +264,692 @@ def makeParser(): minPlusRatio=minPlusRatio, maxPlusRatio=maxPlusRatio, leftPlusRatio=leftPlusRatio, minPeak=minPeak, normalize=normalize, logfilename=logfilename, withFlag=withFlag, doDirectionality=doDirectionality, trimValue=trimValue, doTrim=doTrim, doAppend=doAppend, rnaSettings=rnaSettings, - cachePages=cachePages, ptype=ptype, mockfile=mockfile, doRevBackground=doRevBackground, noMulti=noMulti, + cachePages=cachePages, ptype=ptype, controlfile=controlfile, doRevBackground=doRevBackground, useMulti=useMulti, strandfilter=strandfilter, combine5p=combine5p) return parser -def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shift=None, - stringency=4.0, noshift=False, autoshift=False, reportshift=False, - minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5, - normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True, - trimValue=None, doTrim=True, doAppend=False, rnaSettings=False, - cachePages=None, ptype=None, mockfile=None, doRevBackground=False, noMulti=False, - strandfilter=None, combine5p=False): +def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=50, listPeak=False, shiftValue=0, + stringency=4.0, reportshift=False, minPlusRatio=0.25, maxPlusRatio=0.75, leftPlusRatio=0.3, minPeak=0.5, + normalize=True, logfilename="findall.log", withFlag="", doDirectionality=True, trimValue=0.1, doTrim=True, + outputMode="w", rnaSettings=False, cachePages=None, ptype="", controlfile=None, doRevBackground=False, + useMulti=True, strandfilter="", combine5p=False): - shiftValue = determineShiftValue(autoshift, shift, noshift, rnaSettings) - doControl = False - if mockfile is not None: - doControl = True + regionFinder = RegionFinder(minRatio, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, strandfilter, minHits, trimValue, + doTrim, doDirectionality, shiftValue, maxSpacing) - if doRevBackground: - print "Swapping IP and background to calculate FDR" - pValueType = "back" + doControl = controlfile is not None + pValueType = getPValueType(ptype, doControl, doRevBackground) + doPvalue = not pValueType == "none" - doPvalue = True - if ptype is not None: - ptype = ptype.upper() + if rnaSettings: + regionFinder.useRNASettings() + + doCache = cachePages is not None + printStatusMessages(regionFinder.shiftValue, normalize, doRevBackground, ptype, doControl, withFlag, useMulti, rnaSettings, regionFinder.strandfilter) + controlRDS = None + if doControl: + print "\ncontrol:" + controlRDS = openRDSFile(controlfile, cachePages=cachePages, doCache=doCache) + + print "\nsample:" + hitRDS = openRDSFile(hitfile, cachePages=cachePages, doCache=doCache) + + print + regionFinder.readlen = hitRDS.getReadSize() + if rnaSettings: + regionFinder.maxSpacing = regionFinder.readlen + + writeLog(logfilename, versionString, string.join(sys.argv[1:])) + regionFinder.printOptionsSummary(listPeak, useMulti, doCache, pValueType) + + regionFinder.sampleRDSsize = len(hitRDS) / 1000000. + if doControl: + regionFinder.controlRDSsize = len(controlRDS) / 1000000. + + outfile = open(outfilename, outputMode) + print >> outfile, regionFinder.getAnalysisDescription(hitfile, controlfile, doControl, + withFlag, listPeak, useMulti, doCache, pValueType) + + header = getHeader(normalize, regionFinder.doDirectionality, listPeak, reportshift, doPvalue) + print >> outfile, header + if minRatio < minPeak: + minPeak = minRatio + + shiftDict = {} + hitChromList = hitRDS.getChromosomes() + stringency = max(stringency, 1.0) + chromosomeList = getChromosomeListToProcess(hitChromList, controlRDS, doControl) + for chromosome in chromosomeList: + allregions, outregions = findPeakRegions(regionFinder, hitRDS, controlRDS, chromosome, logfilename, outfilename, + outfile, stringency, normalize, useMulti, doControl, withFlag, combine5p, + factor, listPeak) + + if doRevBackground: + #TODO: this is *almost* the same calculation - there are small yet important differences + backregions = findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normalize, useMulti, factor) + writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict, + allregions, header, backregions=backregions, pValueType=pValueType) + else: + writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict, + allregions, header) + + footer = getFooter(regionFinder, shiftDict, reportshift, doRevBackground) + print footer + print >> outfile, footer + outfile.close() + writeLog(logfilename, versionString, outfilename + footer.replace("\n#"," | ")[:-1]) + + +def getPValueType(ptype, doControl, doRevBackground): + pValueType = "self" + if ptype in ["NONE", "SELF", "BACK"]: if ptype == "NONE": - doPvalue = False pValueType = "none" - p = 1 - poissonmean = 0 elif ptype == "SELF": pValueType = "self" elif ptype == "BACK": if doControl and doRevBackground: pValueType = "back" + elif doRevBackground: + pValueType = "back" + + return pValueType + + +def printStatusMessages(shiftValue, normalize, doRevBackground, ptype, doControl, withFlag, useMulti, rnaSettings, strandfilter): + if shiftValue == "learn": + print "Will try to learn shift" + + if normalize: + print "Normalizing to RPM" + + if doRevBackground: + print "Swapping IP and background to calculate FDR" + + if ptype != "": + if ptype in ["NONE", "SELF"]: + pass + elif ptype == "BACK": + if doControl and doRevBackground: + pass else: print "must have a control dataset and -revbackground for pValue type 'back'" else: print "could not use pValue type : %s" % ptype - else: - pValueType = "self" if withFlag != "": print "restrict to flag = %s" % withFlag - useMulti = True - if noMulti: + if not useMulti: print "using unique reads only" - useMulti = False if rnaSettings: print "using settings appropriate for RNA: -nodirectionality -notrim -noshift" - doTrim = False - doDirectionality = False - stranded = "" - if strandfilter is not None: - if strandfilter == "plus": - stranded = "+" - minPlusRatio = 0.9 - maxPlusRatio = 1.0 - print "only analyzing reads on the plus strand" - elif strandfilter == "minus": - stranded = "-" - minPlusRatio = 0.0 - maxPlusRatio = 0.1 - print "only analyzing reads on the minus strand" + if strandfilter == "plus": + print "only analyzing reads on the plus strand" + elif strandfilter == "minus": + print "only analyzing reads on the minus strand" - stringency = max(stringency, 1.0) - writeLog(logfilename, versionString, string.join(sys.argv[1:])) - if cachePages is not None: - doCache = True + +def openRDSFile(filename, cachePages=None, doCache=False): + rds = ReadDataset.ReadDataset(filename, verbose=True, cache=doCache) + if cachePages > rds.getDefaultCacheSize(): + rds.setDBcache(cachePages) + + return rds + + +def getHeader(normalize, doDirectionality, listPeak, reportshift, doPvalue): + if normalize: + countType = "RPM" else: - doCache = False - cachePages = -1 + countType = "COUNT" - mockRDS = None - if doControl: - print "\ncontrol:" - mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache) + headerFields = ["#regionID\tchrom\tstart\tstop", countType, "fold\tmulti%"] - if cachePages > mockRDS.getDefaultCacheSize(): - mockRDS.setDBcache(cachePages) + if doDirectionality: + headerFields.append("plus%\tleftPlus%") - print "\nsample:" - hitRDS = ReadDataset.ReadDataset(hitfile, verbose=True, cache=doCache) - readlen = hitRDS.getReadSize() - if rnaSettings: - maxSpacing = readlen + if listPeak: + headerFields.append("peakPos\tpeakHeight") - if cachePages > hitRDS.getDefaultCacheSize(): - hitRDS.setDBcache(cachePages) + if reportshift: + headerFields.append("readShift") - if trimValue is not None: - trimValue = float(trimValue) / 100. - trimString = "%2.1f%s" % ((100. * trimValue), "%") - else: - trimValue = 0.1 - trimString = "10%" + if doPvalue: + headerFields.append("pValue") - if not doTrim: - trimString = "none" + return string.join(headerFields, "\t") - if doAppend: - fileMode = "a" - else: - fileMode = "w" - outfile = open(outfilename, fileMode) - outfile.write("#ERANGE %s\n" % versionString) +def getChromosomeListToProcess(hitChromList, controlRDS, doControl): if doControl: - mockRDSsize = len(mockRDS) / 1000000. - controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize) + controlChromList = controlRDS.getChromosomes() + chromosomeList = [chrom for chrom in hitChromList if chrom in controlChromList and chrom != "chrM"] else: - mockRDSsize = 0 - controlSampleString = " none" + chromosomeList = [chrom for chrom in hitChromList if chrom != "chrM"] - hitRDSsize = len(hitRDS) / 1000000. - outfile.write("#enriched sample:\t%s (%.1f M reads)\n#control sample:%s\n" % (hitfile, hitRDSsize, controlSampleString)) + return chromosomeList - if withFlag != "": - outfile.write("#restrict to Flag = %s\n" % withFlag) - print "\nenforceDirectionality=%s listPeak=%s nomulti=%s cache=%s " % (doDirectionality, listPeak, noMulti, doCache) - print "spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f\ttrimmed=%s\tstrand=%s" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded) - print "minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType) +def findPeakRegions(regionFinder, hitRDS, controlRDS, chromosome, logfilename, outfilename, + outfile, stringency, normalize, useMulti, doControl, withFlag, combine5p, factor, listPeak): - outfile.write("#enforceDirectionality=%s listPeak=%s nomulti=%s cache=%s\n" % (doDirectionality, listPeak, noMulti, doCache)) - outfile.write("#spacing<%d minimum>%.1f ratio>%.1f minPeak=%.1f trimmed=%s strand=%s\n" % (maxSpacing, minHits, minRatio, minPeak, trimString, stranded)) - outfile.write("#minPlus=%.2f maxPlus=%.2f leftPlus=%.2f shift=%s pvalue=%s\n" % (minPlusRatio, maxPlusRatio, leftPlusRatio, str(shiftValue), pValueType)) - if normalize: - print "Normalizing to RPM" - countLabel = "RPM" - else: - countLabel = "COUNT" + outregions = [] + allregions = [] + print "chromosome %s" % (chromosome) + previousHit = - 1 * regionFinder.maxSpacing + readStartPositions = [-1] + totalWeight = 0 + uniqueReadCount = 0 + reads = [] + numStarts = 0 + badRegion = False + hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, doMulti=useMulti, findallOptimize=True, + strand=regionFinder.stranded, combine5p=combine5p) - headerList = ["#regionID\tchrom\tstart\tstop", countLabel, "fold\tmulti%"] - if doDirectionality: - headerList.append("plus%\tleftPlus%") + maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti) + if regionFinder.shiftValue == "learn": + learnShift(regionFinder, hitDict, controlRDS, chromosome, maxCoord, logfilename, outfilename, + outfile, numStarts, stringency, normalize, useMulti, doControl) - if listPeak: - headerList.append("peakPos\tpeakHeight") + for read in hitDict[chromosome]: + pos = read["start"] + if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord): + lastReadPos = readStartPositions[-1] + lastBasePosition = lastReadPos + regionFinder.readlen - 1 + newRegionIndex = regionFinder.statistics["index"] + 1 + if regionFinder.doDirectionality: + region = Region.DirectionalRegion(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=factor, numReads=totalWeight) + else: + region = Region.Region(readStartPositions[0], lastBasePosition, chrom=chromosome, index=newRegionIndex, label=factor, numReads=totalWeight) - if reportshift: - headerList.append("readShift") + if normalize: + region.numReads /= regionFinder.sampleRDSsize - if doPvalue: - headerList.append("pValue") + allregions.append(int(region.numReads)) + regionLength = lastReadPos - region.start + if regionPassesCriteria(region.numReads, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen): + region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, + useMulti, doControl, normalize) - headline = string.join(headerList, "\t") - print >> outfile, headline + if region.foldRatio >= regionFinder.minRatio: + # first pass, with absolute numbers + peak = findPeak(reads, region.start, regionLength, regionFinder.readlen, doWeight=True, leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue) + if regionFinder.doTrim: + try: + lastReadPos = trimRegion(region, peak, lastReadPos, regionFinder.trimValue, reads, regionFinder.readlen, regionFinder.doDirectionality, + normalize, regionFinder.sampleRDSsize) + except IndexError: + badRegion = True + continue - statistics = {"index": 0, - "total": 0, - "mIndex": 0, - "mTotal": 0, - "failed": 0 - } + region.foldRatio = getFoldRatio(regionFinder, controlRDS, region.numReads, chromosome, region.start, lastReadPos, + useMulti, doControl, normalize) - if minRatio < minPeak: - minPeak = minRatio + # just in case it changed, use latest data + try: + bestPos = peak.topPos[0] + peakScore = peak.smoothArray[bestPos] + if normalize: + peakScore /= regionFinder.sampleRDSsize + except: + continue - hitChromList = hitRDS.getChromosomes() - if doControl: - mockChromList = mockRDS.getChromosomes() - else: - mockChromList = [] + if listPeak: + region.peakDescription= "%d\t%.1f" % (region.start + bestPos, peakScore) - mockSampleSize = 0 - hitSampleSize = 0 - if normalize: - if doControl: - mockSampleSize = mockRDSsize - - hitSampleSize = hitRDSsize - - hitChromList.sort() - - for chromosome in hitChromList: - if doNotProcessChromosome(chromosome, doControl, mockChromList): - continue - - print "chromosome %s" % (chromosome) - hitDict = hitRDS.getReadsDict(fullChrom=True, chrom=chromosome, flag=withFlag, withWeight=True, - doMulti=useMulti, findallOptimize=True, strand=stranded, - combine5p=combine5p) - maxCoord = hitRDS.getMaxCoordinate(chromosome, doMulti=useMulti) - if shiftValue == "learn": - shiftValue = learnShift(hitDict, hitSampleSize, mockRDS, chromosome, doControl, useMulti, normalize, - mockSampleSize, minRatio, maxSpacing, maxCoord, stringency, readlen, minHits, - logfilename, outfile, outfilename) - - regionStats, allRegionWeights, outregions = locateRegions(hitRDS, hitSampleSize, mockRDS, mockSampleSize, - chromosome, useMulti, normalize, maxSpacing, - doDirectionality, doTrim, minHits, minRatio, - readlen, shiftValue, minPeak, minPlusRatio, - maxPlusRatio, leftPlusRatio, listPeak, noMulti, - doControl, factor, trimValue, outputRegionList=True) - - statistics["index"] += regionStats["index"] - statistics["total"] += regionStats["total"] - statistics["failed"] += regionStats["failed"] - if not doRevBackground: - if doPvalue: - p, poissonmean = calculatePValue(allRegionWeights) - - print headline - shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue) - continue - - #now do background swapping the two samples around - if mockRDS is not None: - print "calculating background..." - backgroundTrimValue = 1/20. - backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, - chromosome, useMulti, normalize, maxSpacing, - doDirectionality, doTrim, minHits, minRatio, - readlen, shiftValue, minPeak, minPlusRatio, - maxPlusRatio, leftPlusRatio, listPeak, noMulti, - doControl, factor, backgroundTrimValue) - - statistics["mIndex"] += backgroundRegionStats["index"] - statistics["mTotal"] += backgroundRegionStats["total"] - statistics["failed"] += backgroundRegionStats["failed"] - print statistics["mIndex"], statistics["mTotal"] - if doPvalue: - if pValueType == "self": - p, poissonmean = calculatePValue(allRegionWeights) - else: - p, poissonmean = calculatePValue(backgroundRegionWeights) - - print headline - shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue) - - footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue) - print footer - outfile.write(footer) - outfile.close() + if useMulti: + setMultireadPercentage(region, hitRDS, regionFinder.sampleRDSsize, totalWeight, uniqueReadCount, chromosome, lastReadPos, normalize, regionFinder.doTrim) - writeLog(logfilename, versionString, "%s%s" % (outfilename, footer.replace("\n#", " | "))) + region.shift = peak.shift + # check that we still pass threshold + regionLength = lastReadPos - region.start + plusRatio = float(peak.numPlus)/peak.numHits + if regionAndPeakPass(region, regionFinder.minHits, regionFinder.minRatio, regionLength, regionFinder.readlen, peakScore, regionFinder.minPeak, + regionFinder.minPlusRatio, regionFinder.maxPlusRatio, plusRatio): + try: + updateRegion(region, regionFinder.doDirectionality, regionFinder.leftPlusRatio, peak.numLeftPlus, peak.numPlus, plusRatio) + regionFinder.statistics["index"] += 1 + outregions.append(region) + regionFinder.statistics["total"] += region.numReads + except RegionDirectionError: + regionFinder.statistics["failed"] += 1 + + readStartPositions = [] + totalWeight = 0 + uniqueReadCount = 0 + reads = [] + numStarts = 0 + if badRegion: + badRegion = False + regionFinder.statistics["badRegionTrim"] += 1 + if pos not in readStartPositions: + numStarts += 1 -def determineShiftValue(autoshift, shift, noshift, rnaSettings): - shiftValue = 0 - if autoshift: - shiftValue = "auto" + readStartPositions.append(pos) + weight = read["weight"] + totalWeight += weight + if weight == 1.0: + uniqueReadCount += 1 - if shift is not None: - try: - shiftValue = int(shift) - except ValueError: - if shift == "learn": - shiftValue = "learn" - print "Will try to learn shift" + reads.append({"start": pos, "sense": read["sense"], "weight": weight}) + previousHit = pos - if noshift or rnaSettings: - shiftValue = 0 + return allregions, outregions - return shiftValue +def findBackgroundRegions(regionFinder, hitRDS, controlRDS, chromosome, normalize, useMulti, factor): -def doNotProcessChromosome(chromosome, doControl, mockChromList): - skipChromosome = False - if chromosome == "chrM": - skipChromosome = True + print "calculating background..." + previousHit = - 1 * regionFinder.maxSpacing + currentHitList = [-1] + currentTotalWeight = 0 + currentReadList = [] + backregions = [] + numStarts = 0 + badRegion = False + hitDict = controlRDS.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, findallOptimize=True) + maxCoord = controlRDS.getMaxCoordinate(chromosome, doMulti=useMulti) + for read in hitDict[chromosome]: + pos = read["start"] + if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord): + lastReadPos = currentHitList[-1] + lastBasePosition = lastReadPos + regionFinder.readlen - 1 + region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=factor, numReads=currentTotalWeight) + if normalize: + region.numReads /= regionFinder.controlRDSsize + + backregions.append(int(region.numReads)) + region = Region.Region(currentHitList[0], lastBasePosition, chrom=chromosome, label=factor, numReads=currentTotalWeight) + regionLength = lastReadPos - region.start + if regionPassesCriteria(region.numReads, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen): + numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True) + if normalize: + numMock /= regionFinder.sampleRDSsize + + foldRatio = region.numReads / numMock + if foldRatio >= regionFinder.minRatio: + # first pass, with absolute numbers + peak = findPeak(currentReadList, region.start, lastReadPos - region.start, regionFinder.readlen, doWeight=True, + leftPlus=regionFinder.doDirectionality, shift=regionFinder.shiftValue) - if doControl and (chromosome not in mockChromList): - skipChromosome = True + if regionFinder.doTrim: + try: + lastReadPos = trimRegion(region, peak, lastReadPos, 20., currentReadList, regionFinder.readlen, regionFinder.doDirectionality, + normalize, regionFinder.controlRDSsize) + except IndexError: + badRegion = True + continue - return skipChromosome + numMock = 1. + hitRDS.getCounts(chromosome, region.start, lastReadPos, uniqs=True, multi=useMulti, splices=False, reportCombined=True) + if normalize: + numMock /= regionFinder.sampleRDSsize + foldRatio = region.numReads / numMock -def calculatePValue(dataList): - dataList.sort() - listSize = float(len(dataList)) - try: - poissonmean = sum(dataList) / listSize - except ZeroDivisionError: - poissonmean = 0 + # just in case it changed, use latest data + try: + bestPos = peak.topPos[0] + peakScore = peak.smoothArray[bestPos] + except IndexError: + continue - print "Poisson n=%d, p=%f" % (listSize, poissonmean) - p = math.exp(-poissonmean) + # normalize to RPM + if normalize: + peakScore /= regionFinder.controlRDSsize + + # check that we still pass threshold + regionLength = lastReadPos - region.start + if regionPassesCriteria(region.numReads, regionFinder.minHits, foldRatio, regionFinder.minRatio, regionLength, regionFinder.readlen): + updateControlStatistics(regionFinder, peak, region.numReads, peakScore) + + currentHitList = [] + currentTotalWeight = 0 + currentReadList = [] + numStarts = 0 + if badRegion: + badRegion = False + regionFinder.statistics["badRegionTrim"] += 1 + + if pos not in currentHitList: + numStarts += 1 + + currentHitList.append(pos) + weight = read["weight"] + currentTotalWeight += weight + currentReadList.append({"start": pos, "sense": read["sense"], "weight": weight}) + previousHit = pos - return p, poissonmean + return backregions -def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, normalize, mockSampleSize, minRatio, maxSpacing, maxCoord, - stringency, readlen, minHits, logfilename, outfile, outfilename, minSites=30): +def learnShift(regionFinder, hitDict, controlRDS, chromosome, maxCoord, logfilename, outfilename, + outfile, numStarts, stringency, normalize, useMulti, doControl): - print "learning shift.... will need at least %d training sites" % minSites - previousHit = -1 * maxSpacing - hitList = [-1] + print "learning shift.... will need at least 30 training sites" + previousHit = -1 * regionFinder.maxSpacing + positionList = [-1] totalWeight = 0 readList = [] shiftDict = {} count = 0 - numStarts = 0 - for read in hitDict[chrom]: + for read in hitDict[chromosome]: pos = read["start"] - sense = read["sense"] - weight = read["weight"] - if abs(pos - previousHit) > maxSpacing or pos == maxCoord: - sumAll = totalWeight + if previousRegionIsDone(pos, previousHit, regionFinder.maxSpacing, maxCoord): if normalize: - sumAll /= hitSampleSize + totalWeight /= regionFinder.sampleRDSsize - regionStart = hitList[0] - regionStop = hitList[-1] + regionStart = positionList[0] + regionStop = positionList[-1] regionLength = regionStop - regionStart - # we're going to require stringent settings - if sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen: - foldRatio = getFoldRatio(mockRDS, chrom, regionStart, regionStop, doControl, useMulti, normalize, mockSampleSize, sumAll, minRatio) - - if foldRatio >= minRatio: - localshift = getBestShiftForRegion(readList, regionStart, regionLength, useWeight=True) - try: - shiftDict[localshift] += 1 - except KeyError: - shiftDict[localshift] = 1 - + if regionPassesCriteria(totalWeight, regionFinder.minHits, numStarts, regionFinder.minRatio, regionLength, regionFinder.readlen, stringency=stringency): + foldRatio = getFoldRatio(regionFinder, controlRDS, totalWeight, chromosome, regionStart, regionStop, useMulti, doControl, normalize) + if foldRatio >= regionFinder.minRatio: + updateShiftDict(shiftDict, readList, regionStart, regionLength, regionFinder.readlen) count += 1 - hitList = [] + positionList = [] totalWeight = 0 readList = [] - numStarts = 0 - if pos not in hitList: + if pos not in positionList: numStarts += 1 - hitList.append(pos) + positionList.append(pos) + weight = read["weight"] totalWeight += weight - readList.append({"start": pos, "sense": sense, "weight": weight}) + readList.append({"start": pos, "sense": read["sense"], "weight": weight}) previousHit = pos - bestShift = 0 - bestCount = 0 - learningSettings = ["#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d" % (stringency, stringency * minHits, - stringency * minRatio, stringency * readlen), - "#number of training examples: %d" % count] - outline = string.join(learningSettings, "\n") + outline = "#learn: stringency=%.2f min_signal=%2.f min_ratio=%.2f min_region_size=%d\n#number of training examples: %d" % (stringency, + stringency * regionFinder.minHits, + stringency * regionFinder.minRatio, + stringency * regionFinder.readlen, + count) + print outline - writeLog(logfilename, versionString, "%s%s" % (outfilename, outline)) - if count < minSites: + writeLog(logfilename, versionString, outfilename + outline) + regionFinder.shiftValue = getShiftValue(shiftDict, count, logfilename, outfilename) + outline = "#picked shiftValue to be %d" % regionFinder.shiftValue + print outline + print >> outfile, outline + writeLog(logfilename, versionString, outfilename + outline) + + +def previousRegionIsDone(pos, previousHit, maxSpacing, maxCoord): + return abs(pos - previousHit) > maxSpacing or pos == maxCoord + + +def regionPassesCriteria(sumAll, minHits, numStarts, minRatio, regionLength, readlen, stringency=1): + return sumAll >= stringency * minHits and numStarts > stringency * minRatio and regionLength > stringency * readlen + + +def trimRegion(region, peak, regionStop, trimValue, currentReadList, readlen, doDirectionality, normalize, hitRDSsize): + bestPos = peak.topPos[0] + peakScore = peak.smoothArray[bestPos] + if normalize: + peakScore /= hitRDSsize + + minSignalThresh = trimValue * peakScore + start = findStartEdgePosition(peak, minSignalThresh) + regionEndPoint = regionStop - region.start - 1 + stop = findStopEdgePosition(peak, regionEndPoint, minSignalThresh) + + regionStop = region.start + stop + region.start += start + trimmedPeak = findPeak(currentReadList, region.start, regionStop - region.start, readlen, doWeight=True, leftPlus=doDirectionality, shift=peak.shift) + peak.numPlus = trimmedPeak.numPlus + peak.numLeftPlus = trimmedPeak.numLeftPlus + peak.topPos = trimmedPeak.topPos + peak.smoothArray = trimmedPeak.smoothArray + + region.numReads = trimmedPeak.numHits + if normalize: + region.numReads /= hitRDSsize + + region.stop = regionStop + readlen - 1 + + return regionStop + + +def findStartEdgePosition(peak, minSignalThresh): + start = 0 + while not peakEdgeLocated(peak, start, minSignalThresh): + start += 1 + + return start + + +def findStopEdgePosition(peak, stop, minSignalThresh): + while not peakEdgeLocated(peak, stop, minSignalThresh): + stop -= 1 + + return stop + + +def peakEdgeLocated(peak, position, minSignalThresh): + return peak.smoothArray[position] >= minSignalThresh or position == peak.topPos[0] + + +def getFoldRatio(regionFinder, controlRDS, sumAll, chromosome, regionStart, regionStop, useMulti, doControl, normalize): + if doControl: + numMock = 1. + controlRDS.getCounts(chromosome, regionStart, regionStop, uniqs=True, multi=useMulti, splices=False, reportCombined=True) + if normalize: + numMock /= regionFinder.controlRDSsize + + foldRatio = sumAll / numMock + else: + foldRatio = regionFinder.minRatio + + return foldRatio + + +def updateShiftDict(shiftDict, readList, regionStart, regionLength, readlen): + peak = findPeak(readList, regionStart, regionLength, readlen, doWeight=True, shift="auto") + try: + shiftDict[peak.shift] += 1 + except KeyError: + shiftDict[peak.shift] = 1 + + +def getShiftValue(shiftDict, count, logfilename, outfilename): + if count < 30: outline = "#too few training examples to pick a shiftValue - defaulting to 0\n#consider picking a lower minimum or threshold" - print >> outfile, outline - writeLog(logfilename, versionString, "%s%s" % (outfilename, outline)) + print outline + writeLog(logfilename, versionString, outfilename + outline) shiftValue = 0 else: - for shift in sorted(shiftDict): - if shiftDict[shift] > bestCount: - bestShift = shift - bestCount = shiftDict[shift] - - shiftValue = bestShift + shiftValue = getBestShiftInDict(shiftDict) print shiftDict - outline = "#picked shiftValue to be %d" % shiftValue - print outline - print >> outfile, outline - writeLog(logfilename, versionString, "%s%s" % (outfilename, outline)) - return shiftValue -def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio): - if doControl and rds is not None: - foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll) +def updateControlStatistics(regionFinder, peak, sumAll, peakScore): + + plusRatio = float(peak.numPlus)/peak.numHits + if peakScore >= regionFinder.minPeak and regionFinder.minPlusRatio <= plusRatio <= regionFinder.maxPlusRatio: + if regionFinder.doDirectionality: + if regionFinder.leftPlusRatio < peak.numLeft / peak.numPlus: + regionFinder.statistics["mIndex"] += 1 + regionFinder.statistics["mTotal"] += sumAll + else: + regionFinder.statistics["failed"] += 1 + else: + # we have a region, but didn't check for directionality + regionFinder.statistics["mIndex"] += 1 + regionFinder.statistics["mTotal"] += sumAll + + +def getRegion(regionStart, regionStop, factor, index, chromosome, sumAll, foldRatio, multiP, + peakDescription, shift, doDirectionality, leftPlusRatio, numLeft, + numPlus, plusRatio): + + if doDirectionality: + if leftPlusRatio < numLeft / numPlus: + plusP = plusRatio * 100. + leftP = 100. * numLeft / numPlus + # we have a region that passes all criteria + region = Region.DirectionalRegion(regionStart, regionStop, + factor, index, chromosome, sumAll, + foldRatio, multiP, plusP, leftP, + peakDescription, shift) + + else: + raise RegionDirectionError else: - foldRatio = minRatio + # we have a region, but didn't check for directionality + region = Region.Region(regionStart, regionStop, factor, index, chromosome, + sumAll, foldRatio, multiP, peakDescription, shift) - return foldRatio + return region -def getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll): - numMock = 1. + rds.getCounts(chrom, start, stop, uniqs=True, multi=useMulti, splices=False, reportCombined=True) +def setMultireadPercentage(region, hitRDS, hitRDSsize, currentTotalWeight, currentUniqueCount, chromosome, lastReadPos, normalize, doTrim): + if doTrim: + sumMulti = hitRDS.getMultiCount(chromosome, region.start, lastReadPos) + else: + sumMulti = currentTotalWeight - currentUniqueCount + + # normalize to RPM if normalize: - numMock /= sampleSize + sumMulti /= hitRDSsize - foldRatio = sumAll / numMock + try: + multiP = 100. * (sumMulti / region.numReads) + except ZeroDivisionError: + return - return foldRatio + region.multiP = multiP -def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, useMulti, - normalizeToRPM, maxSpacing, doDirectionality, doTrim, minHits, minRatio, readlen, - shiftValue, minPeak, minPlusRatio, maxPlusRatio, leftPlusRatio, listPeak, - noMulti, doControl, factor, trimValue, outputRegionList=False): +def regionAndPeakPass(region, minHits, minRatio, regionLength, readlen, peakScore, minPeak, minPlusRatio, maxPlusRatio, plusRatio): + regionPasses = False + if regionPassesCriteria(region.numReads, minHits, region.foldRatio, minRatio, regionLength, readlen): + if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio: + regionPasses = True - index = 0 - totalRegionWeight = 0 - failedCounter = 0 - previousHit = - 1 * maxSpacing - currentHitList = [-1] - currentTotalWeight = 0 - currentUniqReadCount = 0 - currentReadList = [] - regionWeights = [] - outregions = [] - numStarts = 0 - hitDict = rds.getReadsDict(fullChrom=True, chrom=chrom, withWeight=True, doMulti=useMulti, findallOptimize=True) - maxCoord = rds.getMaxCoordinate(chrom, doMulti=useMulti) - for read in hitDict[chrom]: - pos = read["start"] - sense = read["sense"] - weight = read["weight"] - if abs(pos - previousHit) > maxSpacing or pos == maxCoord: - sumAll = currentTotalWeight - if normalizeToRPM: - sumAll /= rdsSampleSize - - regionStart = currentHitList[0] - regionStop = currentHitList[-1] - regionWeights.append(int(sumAll)) - if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen: - sumMulti = 0. - foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio) - if foldRatio >= minRatio: - # first pass, with absolute numbers - peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue) - - bestPos = peak.topPos[0] - numHits = peak.numHits - peakScore = peak.smoothArray[bestPos] - numPlus = peak.numPlus - shift = peak.shift - numLeft = peak.numLeftPlus - if normalizeToRPM: - peakScore /= rdsSampleSize - - if doTrim: - minSignalThresh = trimValue * peakScore - start = 0 - stop = regionStop - regionStart - 1 - startFound = False - while not startFound: - if peak.smoothArray[start] >= minSignalThresh or start == bestPos: - startFound = True - else: - start += 1 - - stopFound = False - while not stopFound: - if peak.smoothArray[stop] >= minSignalThresh or stop == bestPos: - stopFound = True - else: - stop -= 1 - - regionStop = regionStart + stop - regionStart += start - trimPeak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shift) - - sumAll = trimPeak.numHits - numPlus = trimPeak.numPlus - numLeft = trimPeak.numLeftPlus - if normalizeToRPM: - sumAll /= rdsSampleSize - - foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio) - if outputRegionList: - sumMulti = rds.getCounts(chrom, regionStart, regionStop, uniqs=False, multi=useMulti, splices=False, reportCombined=True) - # just in case it changed, use latest data - try: - bestPos = trimPeak.topPos[0] - peakScore = trimPeak.smoothArray[bestPos] - except: - continue + return regionPasses - if normalizeToRPM: - peakScore /= rdsSampleSize - elif outputRegionList: - sumMulti = currentTotalWeight - currentUniqReadCount +def updateRegion(region, + doDirectionality, leftPlusRatio, numLeft, + numPlus, plusRatio): - if outputRegionList: - if normalizeToRPM: - sumMulti /= rdsSampleSize + if doDirectionality: + if leftPlusRatio < numLeft / numPlus: + region.plusP = plusRatio * 100. + region.leftP = 100. * numLeft / numPlus + else: + raise RegionDirectionError - try: - multiP = 100. * (sumMulti / sumAll) - except: - break - if noMulti: - multiP = 0. +def writeNoRevBackgroundResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict, + allregions, header): - # check that we still pass threshold - if sumAll >= minHits and foldRatio >= minRatio and (regionStop - regionStart) > readlen: - plusRatio = float(numPlus)/numHits - if peakScore >= minPeak and minPlusRatio <= plusRatio <= maxPlusRatio: - if outputRegionList: - peakDescription = "" - if listPeak: - peakDescription = "\t%d\t%.1f" % (regionStart + bestPos, peakScore) - - if doDirectionality: - if leftPlusRatio < numLeft / numPlus: - index += 1 - if outputRegionList: - plusP = plusRatio * 100. - leftP = 100. * numLeft / numPlus - # we have a region that passes all criteria - region = Region.DirectionalRegion(regionStart, regionStop + readlen - 1, - factor, index, chrom, sumAll, - foldRatio, multiP, plusP, leftP, - peakDescription, shift) - outregions.append(region) - - totalRegionWeight += sumAll - else: - failedCounter += 1 - else: - # we have a region, but didn't check for directionality - index += 1 - totalRegionWeight += sumAll - if outputRegionList: - region = Region.Region(regionStart, regionStop + readlen - 1, factor, index, chrom, - sumAll, foldRatio, multiP, peakDescription, shift) - outregions.append(region) + writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict, + allregions, header, backregions=[], pValueType="self") - currentHitList = [] - currentTotalWeight = 0 - currentUniqReadCount = 0 - currentReadList = [] - numStarts = 0 - if pos not in currentHitList: - numStarts += 1 +def writeChromosomeResults(regionFinder, outregions, outfile, doPvalue, reportshift, shiftDict, + allregions, header, backregions=[], pValueType="none"): - currentHitList.append(pos) - currentTotalWeight += weight - if weight == 1.0: - currentUniqReadCount += 1 + print regionFinder.statistics["mIndex"], regionFinder.statistics["mTotal"] + if doPvalue: + if pValueType == "self": + poissonmean = calculatePoissonMean(allregions) + else: + poissonmean = calculatePoissonMean(backregions) - currentReadList.append({"start": pos, "sense": sense, "weight": weight}) - previousHit = pos + print header + writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=regionFinder.shiftValue, reportshift=reportshift, shiftDict=shiftDict) - statistics = {"index": index, - "total": totalRegionWeight, - "failed": failedCounter - } - if outputRegionList: - return statistics, regionWeights, outregions - else: - return statistics, regionWeights +def calculatePoissonMean(dataList): + dataList.sort() + listSize = float(len(dataList)) + try: + poissonmean = sum(dataList) / listSize + except ZeroDivisionError: + poissonmean = 0 + print "Poisson n=%d, p=%f" % (listSize, poissonmean) -def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, reportshift, shiftValue): - bestShift = 0 - shiftDict = {} + return poissonmean + + +def writeRegions(outregions, outfile, doPvalue, poissonmean, shiftValue=0, reportshift=False, shiftDict={}): for region in outregions: - # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest - if reportshift: - outputList = [region.printRegionWithShift()] - if shiftValue == "auto": - try: - shiftDict[region.shift] += 1 - except KeyError: - shiftDict[region.shift] = 1 - else: - outputList = [region.printRegion()] + if shiftValue == "auto" and reportshift: + try: + shiftDict[region.shift] += 1 + except KeyError: + shiftDict[region.shift] = 1 + + outline = getRegionString(region, reportshift) # iterative poisson from http://stackoverflow.com/questions/280797?sort=newest if doPvalue: sumAll = int(region.numReads) - for i in xrange(sumAll): - pValue *= poissonmean - pValue /= i+1 - - outputList.append("%1.2g" % pValue) + pValue = calculatePValue(sumAll, poissonmean) + outline += "\t%1.2g" % pValue - outline = string.join(outputList, "\t") print outline print >> outfile, outline - bestCount = 0 - for shift in sorted(shiftDict): - if shiftDict[shift] > bestCount: - bestShift = shift - bestCount = shiftDict[shift] - return bestShift +def calculatePValue(sum, poissonmean): + pValue = math.exp(-poissonmean) + #TODO: 798: DeprecationWarning: integer argument expected, got float - for i in xrange(sum) + for i in xrange(sum): + pValue *= poissonmean + pValue /= i+1 + return pValue + + +def getRegionString(region, reportShift): + if reportShift: + outline = region.printRegionWithShift() + else: + outline = region.printRegion() + + return outline -def getFooter(stats, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue): - footerList = ["#stats:\t%.1f RPM in %d regions" % (stats["total"], stats["index"])] - if doDirectionality: - footerList.append("#\t\t%d additional regions failed directionality filter" % stats["failed"]) + +def getFooter(regionFinder, shiftDict, reportshift, doRevBackground): + index = regionFinder.statistics["index"] + mIndex = regionFinder.statistics["mIndex"] + footerLines = ["#stats:\t%.1f RPM in %d regions" % (regionFinder.statistics["total"], index)] + if regionFinder.doDirectionality: + footerLines.append("#\t\t%d additional regions failed directionality filter" % regionFinder.statistics["failed"]) if doRevBackground: try: - percent = min(100. * (float(stats["mIndex"])/stats["index"]), 100) - except (ValueError, ZeroDivisionError): + percent = min(100. * (float(mIndex)/index), 100.) + except ZeroDivisionError: percent = 0. - footerList.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (stats["mIndex"], stats["mTotal"], percent)) + footerLines.append("#%d regions (%.1f RPM) found in background (FDR = %.2f percent)" % (mIndex, regionFinder.statistics["mTotal"], percent)) + + if regionFinder.shiftValue == "auto" and reportshift: + bestShift = getBestShiftInDict(shiftDict) + footerLines.append("#mode of shift values: %d" % bestShift) + + if regionFinder.statistics["badRegionTrim"] > 0: + footerLines.append("#%d regions discarded due to trimming problems" % regionFinder.statistics["badRegionTrim"]) + + return string.join(footerLines, "\n") - if shiftValue == "auto" and reportshift: - footerList.append("#mode of shift values: %d" % shiftModeValue) - footer = string.join(footerList, "\n") +def getBestShiftInDict(shiftDict): + return max(shiftDict.iteritems(), key=operator.itemgetter(1))[0] - return footer if __name__ == "__main__": main(sys.argv) diff --git a/geneLocusBins.py b/geneLocusBins.py index 49745dc..03cade2 100755 --- a/geneLocusBins.py +++ b/geneLocusBins.py @@ -138,7 +138,7 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins try: geneinfo = geneinfoDict[gid] symbol = geneinfo[0][0] - except KeyError: + except (KeyError, IndexError): pass else: symbol = gid @@ -154,6 +154,7 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins try: normalizedValue = 100. * binAmount / tagCount except ZeroDivisionError: + #TODO: QForALi - this is the right way to refactor the original code, but I don't think this is the right answer normalizedValue = 100. * binAmount binAmount = normalizedValue @@ -167,4 +168,4 @@ def writeBins(gidList, geneinfoDict, gidBins, gidLen, outfilename, normalizeBins if __name__ == "__main__": - main(sys.argv) + main(sys.argv) \ No newline at end of file diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py index 5299d27..74e7a0c 100755 --- a/geneMrnaCountsWeighted.py +++ b/geneMrnaCountsWeighted.py @@ -68,6 +68,12 @@ def makeParser(usage=""): return parser +#TODO: Reported user performance issue. Long run times in conditions: +# small number of reads ~40-50M +# all features on single chromosome +# +# User states has been a long time problem. + def geneMrnaCountsWeighted(genome, hitfile, countfile, outfilename, ignoreSense=True, withUniqs=False, withMulti=False, acceptfile=None, cachePages=None, doVerbose=False, extendGenome="", replaceModels=False): @@ -200,6 +206,7 @@ def writeCountsToFile(outFilename, countFilename, allGIDs, genome, gidReadDict, for line in uniquecounts: fields = line.strip().split() # add a pseudo-count here to ease calculations below + #TODO: figure out why this was done in prior implementation... uniqueCountDict[fields[0]] = float(fields[-1]) + 1 uniquecounts.close() @@ -264,4 +271,4 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict): if __name__ == "__main__": - main(sys.argv) + main(sys.argv) \ No newline at end of file diff --git a/getNovelSNPs.py b/getNovelSNPs.py index e95d726..4091281 100755 --- a/getNovelSNPs.py +++ b/getNovelSNPs.py @@ -68,6 +68,7 @@ def doNotProcessLine(line): def getNovelSNPInfo(genome, snpEntry, hg): fields = snpEntry.split() + #TODO: refactor naming. is fields[8] rpkm? if fields[8].find("N\A") == -1: return snpEntry else: @@ -92,4 +93,4 @@ def getNovelSNPInfo(genome, snpEntry, hg): if __name__ == "__main__": - main(sys.argv) + main(sys.argv) \ No newline at end of file diff --git a/getSNPs.py b/getSNPs.py index f6071ae..f582675 100755 --- a/getSNPs.py +++ b/getSNPs.py @@ -75,7 +75,7 @@ def makeParser(usage=""): forceChr = getConfigBoolOption(configParser, section, "forceChr", False) cachePages = getConfigIntOption(configParser, section, "cachePages", 0) - parser.set_defaults(doSplices=True, forceChr=False, cachePages=0) + parser.set_defaults(doSplices=doSplices, forceChr=forceChr, cachePages=cachePages) return parser diff --git a/getfasta.py b/getfasta.py index ea347e4..acae47d 100755 --- a/getfasta.py +++ b/getfasta.py @@ -36,7 +36,7 @@ def main(argv=None): outfilename = args[2] getfasta(genome, regionfile, outfilename, options.seqsize, options.minHitThresh, - options.topRegions, options.maxsize, options.usePeaks, options.hitFile, + options.topRegions, options.maxsize, options.usePeaks, options.hitfile, options.doCache, options.doCompact) -- 2.30.2