first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / distalPairs.py
index d24781a0db7c0ee3702e71cf97f6c2d62ea21c41..dc836501c6b2b98f53b5a4a84ce8778c9673a1a7 100755 (executable)
@@ -12,15 +12,18 @@ try:
 except:
     pass
 
-from commoncode import readDataset
-import sys, time, optparse
+import sys
+import time
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigIntOption, getConfigBoolOption
 
 
 def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    print "%prog: version 3.3"
+    print "distalPairs: version 3.4"
     print "looks at all chromosomes simultaneously: is both slow and takes up large amount of RAM"
     usage = "usage: python %prog minDist rdsfile outfile [--sameChrom] [--splices] [--maxDist bp] [--verbose] [--cache cachepages]"
 
@@ -44,6 +47,27 @@ def main(argv=None):
     distalPairs(minDist, rdsfile, outfilename, options.sameChromOnly, options.doSplices, options.doVerbose, options.maxDist, options.cachePages)
 
 
+def makeParser(usage=""):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--sameChrom", action="store_true", dest="sameChromOnly")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+    parser.add_option("--maxDist", type="int", dest="maxDist")
+    parser.add_option("--cache", type="int", dest="cachePages")
+
+    configParser = getConfigParser()
+    section = "distalPairs"
+    sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
+    maxDist = getConfigIntOption(configParser, section, "maxDist", 1000000000)
+    cachePages = getConfigOption(configParser, section, "cachePages", None)
+
+    parser.set_defaults(sameChromOnly=sameChromOnly, doSplices=doSplices, doVerbose=doVerbose, maxDist=maxDist, cachePages=cachePages)
+
+    return parser
+
+
 def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=False, doVerbose=False, maxDist=1000000000, cachePages=None):
     if cachePages is not None:
         doCache = True
@@ -51,7 +75,7 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
         doCache = False
         cachePages = -1
 
-    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
     if not RDS.hasIndex():
         print "Will not attempt to run on unIndexed dataset - please index with rdsmetadata.py and rerun"
         sys.exit(1)
@@ -61,33 +85,17 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
 
     print time.ctime()
 
-    if doSplices:
-        print "getting splices"
-        splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
-        print "got splices"
-
     print "getting uniq reads"    
     uniqDict = RDS.getReadsDict(withChrom=True, withPairID=True, doUniqs=True, readIDDict=True)
     print "got uniqs"
 
     if doSplices:
-        for readID in splicesDict:
-            theRead = splicesDict[readID]
-            read0 = theRead[0]
-            del read0[1]
-            try:
-                uniqDict[readID].append(read0)
-            except:
-                if len(theRead) == 4:
-                    read2 = theRead[2]
-                    del read2[1]
-                    uniqDict[readID] = [read0,read2]
+        addSplicesToUniqReads(RDS, uniqDict)
 
     if doVerbose:
         print len(uniqDict), time.ctime()
 
     outfile = open(outfilename,"w")
-
     diffChrom = 0
     distal = 0
     total = 0
@@ -95,8 +103,12 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
         readList = uniqDict[readID]
         if len(readList) == 2:
             total += 1
-            (start1, sense1, chrom1, pair1) = readList[0]
-            (start2, sense2, chrom2, pair2) = readList[1]
+            start1 = readList[0]["start"]
+            sense1 = readList[0]["sense"]
+            chrom1 = readList[0]["chrom"]
+            start2 = readList[1]["start"]
+            sense2 = readList[1]["sense"]
+            chrom2 = readList[1]["chrom"]
 
             if chrom1 != chrom2:
                 diffChrom += 1
@@ -104,16 +116,15 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
                     continue
                 else:
                     outline = "%s\t%s\t%d\t%s\t%s\t%d\t%s" % (readID, chrom1, start1, sense1, chrom2, start2, sense2)
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
                     if doVerbose:
                         print diffChrom, outline
             else:
                 dist = abs(start1 - start2)
-
                 if minDist < dist < maxDist:
                     distal += 1
                     outline = "%s\t%s\t%d\t%s\t%d\t%s\t%d" % (readID, chrom1, start1, sense1, start2, sense2, dist)
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
                     if doVerbose:
                         print distal, outline
 
@@ -129,5 +140,24 @@ def distalPairs(minDist, rdsfile, outfilename, sameChromOnly=False, doSplices=Fa
     print time.ctime()
 
 
+def addSplicesToUniqReads(RDS, uniqDict):
+    print "getting splices"
+    splicesDict = RDS.getSplicesDict(withChrom=True, withPairID=True, readIDDict=True, splitRead=True)
+    print "got splices"
+    for readID in splicesDict:
+        theRead = splicesDict[readID]
+        read0 = theRead[0]
+        read0["start"] = read0["startL"]
+        del read0["stopL"]
+        try:
+            uniqDict[readID].append(read0)
+        except:
+            if len(theRead) == 4:
+                read2 = theRead[2]
+                read2["start"] = read2["startL"]
+                del read2["stopL"]
+                uniqDict[readID] = [read0,read2]
+
+
 if __name__ == "__main__":
     main(sys.argv)
\ No newline at end of file