From a3c0e60eb30c924232d7baaa348a15c5554e3864 Mon Sep 17 00:00:00 2001
From: Sean Upchurch <sau@caltech.edu>
Date: Mon, 28 Mar 2011 16:40:47 -0700
Subject: [PATCH] development checkpoint

---
 ReadDataset.py          |  3 ++-
 featureIntersects.py    |  2 +-
 findall.py              | 54 +++++++++++++++++++++++------------------
 getallgenes.py          |  2 +-
 getsplicefa.py          |  4 +--
 normalizeFinalExonic.py |  4 +--
 6 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/ReadDataset.py b/ReadDataset.py
index 56e28a8..850a5ec 100644
--- a/ReadDataset.py
+++ b/ReadDataset.py
@@ -764,6 +764,7 @@ class ReadDataset():
     def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
         """ returns the number of row in the splices table.
         """
+        # TODO: if the rds type is DNA should this just return zero?
         return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
 
 
@@ -1221,5 +1222,5 @@ class ReadDataset():
             destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
                                        row["weight"], row["flag"], row["mismatch"]))
 
-        self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries)
+        self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)
 
diff --git a/featureIntersects.py b/featureIntersects.py
index c24b79e..7338499 100755
--- a/featureIntersects.py
+++ b/featureIntersects.py
@@ -54,7 +54,7 @@ def makeParser(usage=""):
 def featureIntersects(tabFileName, cistype="TFBSCONSSITES", radius=100):
     
     posList = getPositionList(tabFileName)
-    feats = featuresIntersecting("human", posList, radius, cistype)
+    feats = featuresIntersecting("hsapiens", posList, radius, cistype)
     featkeys = feats.keys()
     featkeys.sort()
     for (chrom, pos) in featkeys:
diff --git a/findall.py b/findall.py
index ec75efb..aa82037 100755
--- a/findall.py
+++ b/findall.py
@@ -231,6 +231,7 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
         doCache = False
         cachePages = -1
 
+    mockRDS = None
     if doControl:
         print "\ncontrol:" 
         mockRDS = ReadDataset.ReadDataset(mockfile, verbose=True, cache=doCache)
@@ -268,6 +269,7 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
         mockRDSsize = len(mockRDS) / 1000000.
         controlSampleString = "\t%s (%.1f M reads)" % (mockfile, mockRDSsize)
     else:
+        mockRDSsize = 0
         controlSampleString = " none"
 
     hitRDSsize = len(hitRDS) / 1000000.
@@ -318,7 +320,11 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
     hitChromList = hitRDS.getChromosomes()
     if doControl:
         mockChromList = mockRDS.getChromosomes()
+    else:
+        mockChromList = []
 
+    mockSampleSize = 0
+    hitSampleSize = 0
     if normalize:
         if doControl:
             mockSampleSize = mockRDSsize
@@ -360,27 +366,28 @@ def findall(factor, hitfile, outfilename, minHits=4.0, minRatio=4.0, maxSpacing=
             continue
 
         #now do background swapping the two samples around
-        print "calculating background..."
-        backgroundTrimValue = 1/20.
-        backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
-                                                                       chromosome, useMulti, normalize, maxSpacing,
-                                                                       doDirectionality, doTrim, minHits, minRatio,
-                                                                       readlen, shiftValue, minPeak, minPlusRatio,
-                                                                       maxPlusRatio, leftPlusRatio, listPeak, noMulti,
-                                                                       doControl, factor, backgroundTrimValue)
-
-        statistics["mIndex"] += backgroundRegionStats["index"]
-        statistics["mTotal"] += backgroundRegionStats["total"]
-        statistics["failed"] += backgroundRegionStats["failed"]
-        print statistics["mIndex"], statistics["mTotal"]
-        if doPvalue:
-            if pValueType == "self":
-                p, poissonmean = calculatePValue(allRegionWeights)
-            else:
-                p, poissonmean = calculatePValue(backgroundRegionWeights)
+        if mockRDS is not None:
+            print "calculating background..."
+            backgroundTrimValue = 1/20.
+            backgroundRegionStats, backgroundRegionWeights = locateRegions(mockRDS, mockSampleSize, hitRDS, hitSampleSize,
+                                                                           chromosome, useMulti, normalize, maxSpacing,
+                                                                           doDirectionality, doTrim, minHits, minRatio,
+                                                                           readlen, shiftValue, minPeak, minPlusRatio,
+                                                                           maxPlusRatio, leftPlusRatio, listPeak, noMulti,
+                                                                           doControl, factor, backgroundTrimValue)
+
+            statistics["mIndex"] += backgroundRegionStats["index"]
+            statistics["mTotal"] += backgroundRegionStats["total"]
+            statistics["failed"] += backgroundRegionStats["failed"]
+            print statistics["mIndex"], statistics["mTotal"]
+            if doPvalue:
+                if pValueType == "self":
+                    p, poissonmean = calculatePValue(allRegionWeights)
+                else:
+                    p, poissonmean = calculatePValue(backgroundRegionWeights)
 
-        print headline
-        shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
+            print headline
+            shiftModeValue = writeRegionsToFile(outfile, outregions, doPvalue, p, poissonmean, reportshift, shiftValue)
 
     footer = getFooter(statistics, doDirectionality, doRevBackground, shiftValue, reportshift, shiftModeValue)
     print footer
@@ -514,7 +521,7 @@ def learnShift(hitDict, hitSampleSize, mockRDS, chrom, doControl, useMulti, norm
 
 
 def getFoldRatio(rds, chrom, start, stop, doControl, useMulti, normalize, sampleSize, sumAll, minRatio):
-    if doControl:
+    if doControl and rds is not None:
         foldRatio = getFoldRatioFromRDS(rds, chrom, start, stop, useMulti, normalize, sampleSize, sumAll)
     else:
         foldRatio = minRatio
@@ -564,8 +571,7 @@ def locateRegions(rds, rdsSampleSize, referenceRDS, referenceSampleSize, chrom,
             regionWeights.append(int(sumAll))
             if sumAll >= minHits and numStarts > minRatio and (regionStop - regionStart) > readlen:
                 sumMulti = 0.
-                #first pass uses getFoldRatio on mockRDS as there may not be control
-                foldRatio = getFoldRatioFromRDS(referenceRDS, chrom, regionStart, regionStop, useMulti, normalizeToRPM, referenceSampleSize, sumAll)
+                foldRatio = getFoldRatio(referenceRDS, chrom, regionStart, regionStop, doControl, useMulti, normalizeToRPM, referenceSampleSize, sumAll, minRatio)
                 if foldRatio >= minRatio:
                     # first pass, with absolute numbers
                     peak = findPeak(currentReadList, regionStart, regionStop - regionStart, readlen, doWeight=True, leftPlus=doDirectionality, shift=shiftValue)
@@ -719,7 +725,7 @@ def writeRegionsToFile(outfile, outregions, doPvalue, pValue, poissonmean, repor
                 pValue *= poissonmean
                 pValue /= i+1
 
-            outputList.append("%1.2f" % pValue)
+            outputList.append("%1.2g" % pValue)
 
         outline = string.join(outputList, "\t")
         print outline
diff --git a/getallgenes.py b/getallgenes.py
index a717fe8..9433e2e 100755
--- a/getallgenes.py
+++ b/getallgenes.py
@@ -32,7 +32,7 @@ def main(argv=None):
     getallgenes(genome, infilename, outfilename, options.maxRadius,
                 options.nomatchfilename, options.step, options.trackFar,
                 options.trackStrand, options.compact, options.colID,
-                options.doCache, options.extendgenome, options.replaceModels)
+                options.doCache, options.extendGenome, options.replaceModels)
 
 
 def getParser(usage):
diff --git a/getsplicefa.py b/getsplicefa.py
index cbca805..197800e 100755
--- a/getsplicefa.py
+++ b/getsplicefa.py
@@ -19,7 +19,7 @@ def main(argv=None):
     print verstring
     delimiter = "|"
 
-    usage = "usage: python %prog genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
+    usage = "usage: python getsplicefa.py genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
             \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
             \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter
 
@@ -31,7 +31,7 @@ def main(argv=None):
         genome = args[0]
         datafilename = args[1]
         outfilename = args[2]
-        maxBorder = args[3]
+        maxBorder = int(args[3])
     except IndexError:
         print usage
         sys.exit(1)
diff --git a/normalizeFinalExonic.py b/normalizeFinalExonic.py
index d5a6b4b..6c1ae33 100755
--- a/normalizeFinalExonic.py
+++ b/normalizeFinalExonic.py
@@ -24,8 +24,8 @@ def main(argv=None):
         print usage
         sys.exit(1)
 
-    rdsfilename = argv[1]
-    expandedRPKMfile = args[3]
+    rdsfilename = args[0]
+    expandedRPKMfile = args[1]
     multicountfile = args[2]
     outfilename = args[3]
 
-- 
2.30.2