development checkpoint
authorSean Upchurch <sau@caltech.edu>
Mon, 28 Mar 2011 23:40:47 +0000 (16:40 -0700)
committerSean Upchurch <sau@caltech.edu>
Mon, 28 Mar 2011 23:40:47 +0000 (16:40 -0700)
ReadDataset.py
featureIntersects.py
findall.py
getallgenes.py
getsplicefa.py
normalizeFinalExonic.py

index 56e28a8995b734735ace06d1d990ac9b46753580..850a5ec602a7b6d7a9a22ac98c4bf3eef4b873e5 100644 (file)
@@ -764,6 +764,7 @@ class ReadDataset():
     def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
         """ returns the number of row in the splices table.
         """
+        # TODO: if the rds type is DNA should this just return zero?
         return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
 
 
@@ -1221,5 +1222,5 @@ class ReadDataset():
             destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
                                        row["weight"], row["flag"], row["mismatch"]))
 
-        self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries)
+        self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)
 
index c24b79edb312fae6c0b19f0d7e3d3ffb45d12984..733849902899ceb75017995ea50bd209aa66c2f9 100755 (executable)
@@ -54,7 +54,7 @@ def makeParser(usage=""):
 def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
     
     posList = getPositionList(tabFileName)
-    feats = featuresIntersecting("human", posList, radius, cistype)
+    feats = featuresIntersecting("hsapiens", posList, radius, cistype)
     featkeys = feats.keys()
     featkeys.sort()
     for (chrom, pos) in featkeys:
index ec75efb54f3680c892578a38811c1bd241c37241..aa820374b994d497fbc4dd336a98b795489f5900 100755 (executable)
@@ -231,6 +231,7 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
         doCache = False
         cachePages = -1
 
+    mockRDS = None
     if doControl:
         print "\ncontrol:" 
         mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
@@ -268,6 +269,7 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
         mockRDSsize = len(mockRDS) / 1000000.
         controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
     else:
+        mockRDSsize = 0
         controlSampleString = " none"
 
     hitRDSsize = len(hitRDS) / 1000000.
@@ -318,7 +320,11 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     hitChromList = hitRDS.getChromosomes()
     if doControl:
         mockChromList = mockRDS.getChromosomes()
+    else:
+        mockChromList = []
 
+    mockSampleSize = 0
+    hitSampleSize = 0
     if normalize:
         if doControl:
             mockSampleSize = mockRDSsize
@@ -360,27 +366,28 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
             continue
 
         #now do background swapping the two samples around
-        print "calculating background..."
-        backgroundTrimValue = 1/20.
-        backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
-                                                                       chromosome, useMulti, normalize, maxSpacing,
-                                                                       doDirectionality, doTrim, minHits, minRatio,
-                                                                       readlen, shiftValue, minPeak, minPlusRatio,
-                                                                       maxPlusRatio, leftPlusRatio, listPeak, noMulti,
-                                                                       doControl, factor, backgroundTrimValue)
-
-        statistics["mIndex"] += backgroundRegionStats["index"]
-        statistics["mTotal"] += backgroundRegionStats["total"]
-        statistics["failed"] += backgroundRegionStats["failed"]
-        print statistics["mIndex"], statistics["mTotal"]
-        if doPvalue:
-            if pValueType == "self":
-                p, poissonmean = calculatePValue(allRegionWeights)
-            else:
-                p, poissonmean = calculatePValue(backgroundRegionWeights)
+        if mockRDS is not None:
+            print "calculating background..."
+            backgroundTrimValue = 1/20.
+            backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
+                                                                           chromosome, useMulti, normalize, maxSpacing,
+                                                                           doDirectionality, doTrim, minHits, minRatio,
+                                                                           readlen, shiftValue, minPeak, minPlusRatio,
+                                                                           maxPlusRatio, leftPlusRatio, listPeak, noMulti,
+                                                                           doControl, factor, backgroundTrimValue)
+
+            statistics["mIndex"] += backgroundRegionStats["index"]
+            statistics["mTotal"] += backgroundRegionStats["total"]
+            statistics["failed"] += backgroundRegionStats["failed"]
+            print statistics["mIndex"], statistics["mTotal"]
+            if doPvalue:
+                if pValueType == "self":
+                    p, poissonmean = calculatePValue(allRegionWeights)
+                else:
+                    p, poissonmean = calculatePValue(backgroundRegionWeights)
 
-        print headline
-        shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
+            print headline
+            shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
 
     footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
     print footer
@@ -514,7 +521,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
 
 
 def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
-    if doControl:
+    if doControl and rds is not None:
         foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
     else:
         foldRatio = minRatio
@@ -564,8 +571,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
             regionWeights.append(int(sumAll))
             if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
                 sumMulti = 0.
-                #first pass uses getFoldRatio on mockRDS as there may not be control
-                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalizeToRPM, referenceSampleSize, sumAll)
+                foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
                 if foldRatio >= minRatio:
                     # first pass, with absolute numbers
                     peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
@@ -719,7 +725,7 @@ def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, repor
                 pValue *= poissonmean
                 pValue /= i+1
 
-            outputList.append("%1.2f" % pValue)
+            outputList.append("%1.2g" % pValue)
 
         outline = string.join(outputList, "\t")
         print outline
index a717fe86f0881268f8745791dc94d97f6d161d09..9433e2e1dd2ffeb0e299d378eb7ea46b3e10fc9b 100755 (executable)
@@ -32,7 +32,7 @@ def main(argv=None):
     getallgenes(genome, infilename, outfilename, options.maxRadius,
                 options.nomatchfilename, options.step, options.trackFar,
                 options.trackStrand, options.compact, options.colID,
-                options.doCache, options.extendgenome, options.replaceModels)
+                options.doCache, options.extendGenome, options.replaceModels)
 
 
 def getParser(usage):
index cbca805a2426ace12e5121668cffecf2c646d3d5..197800e09fab70bf2824bcf3e9caa1ecfb364ef1 100755 (executable)
@@ -19,7 +19,7 @@ def main(argv=None):
     print verstring
     delimiter = "|"
 
-    usage = "usage: python %prog genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
+    usage = "usage: python getsplicefa.py genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
             \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
             \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
 
@@ -31,7 +31,7 @@ def main(argv=None):
         genome = args[0]
         datafilename = args[1]
         outfilename = args[2]
-        maxBorder = args[3]
+        maxBorder = int(args[3])
     except IndexError:
         print usage
         sys.exit(1)
index d5a6b4b954d4a9df6cb7b749c97c513e9394dafb..6c1ae3323555722deea8bf91c5254ceb310560cb 100755 (executable)
@@ -24,8 +24,8 @@ def main(argv=None):
         print usage
         sys.exit(1)
 
-    rdsfilename = argv[1]
-    expandedRPKMfile = args[3]
+    rdsfilename = args[0]
+    expandedRPKMfile = args[1]
     multicountfile = args[2]
     outfilename = args[3]