first pass cleanup of cistematic/genomes; change bamPreprocessing

[erange.git] / distalPairs.py
diff --git a/distalPairs.py b/distalPairs.py

index d24781a0db7c0ee3702e71cf97f6c2d62ea21c41..dc836501c6b2b98f53b5a4a84ce8778c9673a1a7 100755 (executable)
--- a/distalPairs.py
+++ b/distalPairs.py
@@ -12,15 +12,18 @@ try:
  except:
      pass
  
-from commoncode import readDataset
-import sys, time, optparse
+import sys
+import time
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
  
  
  def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    print "%prog: version 3.3"
+    print "distalPairs: version 3.4"
      print "looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM"
      usage = "usage: python %prog minDist rdsfile outfile [--sameChrom] [--splices] [--maxDist bp] [--verbose] [--cache cachepages]"
  
@@ -44,6 +47,27 @@ def main(argv=None):
      distalPairs(minDist, rdsfile, outfilename, options.sameChromOnly, options.doSplices, options.doVerbose, options.maxDist, options.cachePages)
  
  
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--sameChrom", action="store_true", dest="sameChromOnly")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--maxDist", type="int", dest="maxDist")
+    parser.add_option("--cache", type="int", dest="cachePages")
+
+    configParser = getConfigParser()
+    section = "distalPairs"
+    sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
+    maxDist = getConfigIntOption(configParser, section, "maxDist", 1000000000)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+
+    parser.set_defaults(sameChromOnly=sameChromOnly, doSplices=doSplices, doVerbose=doVerbose, maxDist=maxDist, cachePages=cachePages)
+
+    return parser
+
+
  def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None):
      if cachePages is not None:
          doCache = True
@@ -51,7 +75,7 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
          doCache = False
          cachePages = -1
  
-    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
      if not RDS.hasIndex():
          print "Will not attempt to run on unIndexed dataset - please index with rdsmetadata.py and rerun"
          sys.exit(1)
@@ -61,33 +85,17 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
  
      print time.ctime()
  
-    if doSplices:
-        print "getting splices"
-        splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
-        print "got splices"
-
      print "getting uniq reads"    
      uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
      print "got uniqs"
  
      if doSplices:
-        for readID in splicesDict:
-            theRead = splicesDict[readID]
-            read0 = theRead[0]
-            del read0[1]
-            try:
-                uniqDict[readID].append(read0)
-            except:
-                if len(theRead) == 4:
-                    read2 = theRead[2]
-                    del read2[1]
-                    uniqDict[readID] = [read0,read2]
+        addSplicesToUniqReads(RDS, uniqDict)
  
      if doVerbose:
          print len(uniqDict), time.ctime()
  
      outfile = open(outfilename,"w")
-
      diffChrom = 0
      distal = 0
      total = 0
@@ -95,8 +103,12 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
          readList = uniqDict[readID]
          if len(readList) == 2:
              total += 1
-            (start1, sense1, chrom1, pair1) = readList[0]
-            (start2, sense2, chrom2, pair2) = readList[1]
+            start1 = readList[0]["start"]
+            sense1 = readList[0]["sense"]
+            chrom1 = readList[0]["chrom"]
+            start2 = readList[1]["start"]
+            sense2 = readList[1]["sense"]
+            chrom2 = readList[1]["chrom"]
  
              if chrom1 != chrom2:
                  diffChrom += 1
@@ -104,16 +116,15 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
                      continue
                  else:
                      outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2)
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
                      if doVerbose:
                          print diffChrom, outline
              else:
                  dist = abs(start1 - start2)
-
                  if minDist < dist < maxDist:
                      distal += 1
                      outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist)
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
                      if doVerbose:
                          print distal, outline
  
@@ -129,5 +140,24 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
      print time.ctime()
  
  
+def addSplicesToUniqReads(RDS, uniqDict):
+    print "getting splices"
+    splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
+    print "got splices"
+    for readID in splicesDict:
+        theRead = splicesDict[readID]
+        read0 = theRead[0]
+        read0["start"] = read0["startL"]
+        del read0["stopL"]
+        try:
+            uniqDict[readID].append(read0)
+        except:
+            if len(theRead) == 4:
+                read2 = theRead[2]
+                read2["start"] = read2["startL"]
+                del read2["stopL"]
+                uniqDict[readID] = [read0,read2]
+
+
  if __name__ == "__main__":
      main(sys.argv)
 \ No newline at end of file