From a3c0e60eb30c924232d7baaa348a15c5554e3864 Mon Sep 17 00:00:00 2001 From: Sean Upchurch Date: Mon, 28 Mar 2011 16:40:47 -0700 Subject: [PATCH] development checkpoint --- ReadDataset.py | 3 ++- featureIntersects.py | 2 +- findall.py | 54 +++++++++++++++++++++++------------------ getallgenes.py | 2 +- getsplicefa.py | 4 +-- normalizeFinalExonic.py | 4 +-- 6 files changed, 38 insertions(+), 31 deletions(-) diff --git a/ReadDataset.py b/ReadDataset.py index 56e28a8..850a5ec 100644 --- a/ReadDataset.py +++ b/ReadDataset.py @@ -764,6 +764,7 @@ class ReadDataset(): def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): """ returns the number of row in the splices table. """ + # TODO: if the rds type is DNA should this just return zero? return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL") @@ -1221,5 +1222,5 @@ class ReadDataset(): destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"], row["weight"], row["flag"], row["mismatch"])) - self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries) + self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries) diff --git a/featureIntersects.py b/featureIntersects.py index c24b79e..7338499 100755 --- a/featureIntersects.py +++ b/featureIntersects.py @@ -54,7 +54,7 @@ def makeParser(usage=""): def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100): posList = getPositionList(tabFileName) - feats = featuresIntersecting("human", posList, radius, cistype) + feats = featuresIntersecting("hsapiens", posList, radius, cistype) featkeys = feats.keys() featkeys.sort() for (chrom, pos) in featkeys: diff --git a/findall.py b/findall.py index ec75efb..aa82037 100755 --- a/findall.py +++ b/findall.py @@ -231,6 +231,7 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= doCache = False cachePages = -1 + mockRDS = None if doControl: print "\ncontrol:" mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache) @@ -268,6 +269,7 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= mockRDSsize = len(mockRDS) / 1000000. controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize) else: + mockRDSsize = 0 controlSampleString = " none" hitRDSsize = len(hitRDS) / 1000000. @@ -318,7 +320,11 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= hitChromList = hitRDS.getChromosomes() if doControl: mockChromList = mockRDS.getChromosomes() + else: + mockChromList = [] + mockSampleSize = 0 + hitSampleSize = 0 if normalize: if doControl: mockSampleSize = mockRDSsize @@ -360,27 +366,28 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing= continue #now do background swapping the two samples around - print "calculating background..." - backgroundTrimValue = 1/20. - backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, - chromosome, useMulti, normalize, maxSpacing, - doDirectionality, doTrim, minHits, minRatio, - readlen, shiftValue, minPeak, minPlusRatio, - maxPlusRatio, leftPlusRatio, listPeak, noMulti, - doControl, factor, backgroundTrimValue) - - statistics["mIndex"] += backgroundRegionStats["index"] - statistics["mTotal"] += backgroundRegionStats["total"] - statistics["failed"] += backgroundRegionStats["failed"] - print statistics["mIndex"], statistics["mTotal"] - if doPvalue: - if pValueType == "self": - p, poissonmean = calculatePValue(allRegionWeights) - else: - p, poissonmean = calculatePValue(backgroundRegionWeights) + if mockRDS is not None: + print "calculating background..." + backgroundTrimValue = 1/20. + backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize, + chromosome, useMulti, normalize, maxSpacing, + doDirectionality, doTrim, minHits, minRatio, + readlen, shiftValue, minPeak, minPlusRatio, + maxPlusRatio, leftPlusRatio, listPeak, noMulti, + doControl, factor, backgroundTrimValue) + + statistics["mIndex"] += backgroundRegionStats["index"] + statistics["mTotal"] += backgroundRegionStats["total"] + statistics["failed"] += backgroundRegionStats["failed"] + print statistics["mIndex"], statistics["mTotal"] + if doPvalue: + if pValueType == "self": + p, poissonmean = calculatePValue(allRegionWeights) + else: + p, poissonmean = calculatePValue(backgroundRegionWeights) - print headline - shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue) + print headline + shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue) footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue) print footer @@ -514,7 +521,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio): - if doControl: + if doControl and rds is not None: foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll) else: foldRatio = minRatio @@ -564,8 +571,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom, regionWeights.append(int(sumAll)) if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen: sumMulti = 0. - #first pass uses getFoldRatio on mockRDS as there may not be control - foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalizeToRPM, referenceSampleSize, sumAll) + foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio) if foldRatio >= minRatio: # first pass, with absolute numbers peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue) @@ -719,7 +725,7 @@ def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, repor pValue *= poissonmean pValue /= i+1 - outputList.append("%1.2f" % pValue) + outputList.append("%1.2g" % pValue) outline = string.join(outputList, "\t") print outline diff --git a/getallgenes.py b/getallgenes.py index a717fe8..9433e2e 100755 --- a/getallgenes.py +++ b/getallgenes.py @@ -32,7 +32,7 @@ def main(argv=None): getallgenes(genome, infilename, outfilename, options.maxRadius, options.nomatchfilename, options.step, options.trackFar, options.trackStrand, options.compact, options.colID, - options.doCache, options.extendgenome, options.replaceModels) + options.doCache, options.extendGenome, options.replaceModels) def getParser(usage): diff --git a/getsplicefa.py b/getsplicefa.py index cbca805..197800e 100755 --- a/getsplicefa.py +++ b/getsplicefa.py @@ -19,7 +19,7 @@ def main(argv=None): print verstring delimiter = "|" - usage = "usage: python %prog genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\ + usage = "usage: python getsplicefa.py genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\ \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\ \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter @@ -31,7 +31,7 @@ def main(argv=None): genome = args[0] datafilename = args[1] outfilename = args[2] - maxBorder = args[3] + maxBorder = int(args[3]) except IndexError: print usage sys.exit(1) diff --git a/normalizeFinalExonic.py b/normalizeFinalExonic.py index d5a6b4b..6c1ae33 100755 --- a/normalizeFinalExonic.py +++ b/normalizeFinalExonic.py @@ -24,8 +24,8 @@ def main(argv=None): print usage sys.exit(1) - rdsfilename = argv[1] - expandedRPKMfile = args[3] + rdsfilename = args[0] + expandedRPKMfile = args[1] multicountfile = args[2] outfilename = args[3] -- 2.30.2