Release version for Erange 4.0a
[erange.git] / farPairs.py
index 00cc91844b23084d712573261e996524739c9870..712e60eff9815f9728b6a03983385f83d96bc6f5 100644 (file)
@@ -14,8 +14,9 @@ except:
 import sys
 import time
 import optparse
+import string
 import ReadDataset
-from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption, countDuplicatesInList
 
 print "farPairs: version 1.4"
 
@@ -38,14 +39,13 @@ def main(argv=None):
     outfilename = args[1]
     outbedname = args[2]
 
-    farPairs(rdsfile, outfilename, outbedname, options.sameChromOnly, options.doVerbose,
+    farPairs(rdsfile, outfilename, outbedname, options.doVerbose,
              options.cachePages, options.minDist, options.maxDist, options.minCount,
              options.label)
 
 
 def getParser(usage):
     parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--sameChromOnly", action="store_true", dest="sameChromOnly")
     parser.add_option("--cache", type="int", dest="cachePages")
     parser.add_option("--verbose", action="store_true", dest="doVerbose")
     parser.add_option("--minDist", type="int", dest="minDist")
@@ -55,7 +55,6 @@ def getParser(usage):
 
     configParser = getConfigParser
     section = "farPairs"
-    sameChromOnly = getConfigBoolOption(configParser, section, "sameChromOnly", False)
     doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
     cachePages = getConfigOption(configParser, section, "cachePages", None)
     minDist = getConfigIntOption(configParser, section, "minDist", 1000)
@@ -63,15 +62,33 @@ def getParser(usage):
     minCount = getConfigIntOption(configParser, section, "minCount", 2)
     label = getConfigOption(configParser, section, "label", None)
 
-    parser.set_defaults(sameChromOnly=sameChromOnly, doVerbose=doVerbose, cachePages=cachePages,
+    parser.set_defaults(doVerbose=doVerbose, cachePages=cachePages,
                         minDist=minDist, maxDist=maxDist, minCount=minCount, label=label)
 
     return parser
 
 
-def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
+def farPairs(rdsfile, outfilename, outbedname, doVerbose=False,
              cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
 
+    flagDict = processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose)
+
+    outfile = open(outfilename, "w")
+    for region in flagDict:
+        regionConnections = countDuplicatesInList(flagDict[region])
+        for (connectedRegion, count) in regionConnections:
+            if count >= minCount:
+                outline = "%s\t%s\t%d" % (region, connectedRegion, count)
+                print >> outfile, outline
+                if doVerbose:
+                    print outline
+
+    outfile.close()
+    if doVerbose:
+        print "finished: ", time.ctime()
+
+
+def processRDSFile(rdsfile, outbedname, minDist, maxDist, cachePages, label, doVerbose):
     doCache = False
     if cachePages is not None:
         doCache = True
@@ -86,11 +103,10 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa
 
     if doVerbose:
         print time.ctime()
-
-    total = 0
-    outfile = open(outfilename, "w")
+    
     outbed = open(outbedname, "w")
     outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)
+    outbed.close()
 
     readlen = RDS.getReadSize()
     flagDict = {}
@@ -98,84 +114,84 @@ def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=Fa
         if doNotProcessChromosome(chromosome):
             continue
 
-        print chromosome
-        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
-        if doVerbose:
-            print len(uniqDict), time.ctime()
-
-        for readID in uniqDict:
-            readList = uniqDict[readID]
-            if len(readList) == 2:
-                total += 1
-                start1 = readList[0]["start"]
-                flag1 = readList[0]["flag"]
-                start2 = readList[1]["start"]
-                flag2 = readList[1]["flag"]
-
-                if flag1 != flag2:
-                    dist = abs(start1 - start2)
-                    startList = [start1, start2]
-                    stopList = [start1 + readlen, start2 + readlen]
-                    startList.sort()
-                    stopList.sort()
-                    if flag1 != "" and flag2 != "" and minDist < dist < maxDist:
-                        outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0")
-                        outbed.write(outputLine)
-                        if doVerbose:
-                            print flag1, flag2, dist
-
-                        try:
-                            flagDict[flag1].append((flag2, start1, start2))
-                        except KeyError:
-                            flagDict[flag1] = [(flag2, start1, start2)]
-
-                        try:
-                            flagDict[flag2].append((flag1, start1, start2))
-                        except KeyError:
-                            flagDict[flag2] = [(flag2, start1, start2)]
+        writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist)
 
     print "%d connected regions" % len(flagDict)
 
-    for region in flagDict:
-        flagDict[region].sort()
-        regionConnections = {}
-        for (region2, start1, start2) in flagDict[region]:
+    return flagDict
+
+
+def doNotProcessChromosome(chrom):
+    return chrom == "chrM"
+
+
+def writeFarPairs(flagDict, chromosome, RDS, readlen, outbedname, doVerbose, minDist, maxDist):
+    outbed = open(outbedname, "a")
+    print chromosome
+    uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True, withFlag=True, doUniqs=True, readIDDict=True)
+    if doVerbose:
+        print len(uniqDict), time.ctime()
+
+    for readID in uniqDict:
+        readList = uniqDict[readID]
+        if readsAreFarPair(readList, minDist, maxDist):
+            start1 = readList[0]["start"]
+            start2 = readList[1]["start"]
+            startList = [start1, start2]
+            startList.sort()
+            outputLine = splitReadWrite(chromosome, 2, startList, readlen, "+", readID, "0,255,0", "0,255,0")
+            outbed.write(outputLine)
+            flag1 = readList[0]["flag"]
+            flag2 = readList[1]["flag"]
+            if doVerbose:
+                print flag1, flag2, abs(start1 - start2)
+
             try:
-                regionConnections[region2] += 1
+                flagDict[flag1].append(flag2)
             except KeyError:
-                regionConnections[region2] = 1
+                flagDict[flag1] = [flag2]
 
-        for region2 in regionConnections:
-            if regionConnections[region2] >= minCount:
-                outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2]))
-                if doVerbose:
-                    print "%s\t%s\t%d" % (region, region2, regionConnections[region2])
+            try:
+                flagDict[flag2].append(flag1)
+            except KeyError:
+                flagDict[flag2] = [flag1]
 
-    outfile.close()
     outbed.close()
-    if doVerbose:
-        print "finished: ", time.ctime()
 
 
-def doNotProcessChromosome(chrom):
-    return chrom == "chrM"
+def readsAreFarPair(readList, minDist, maxDist):
+    isFarPair = False
+    if len(readList) == 2:
+        flag1 = readList[0]["flag"]
+        flag2 = readList[1]["flag"]
+        if flag1 != flag2 and flag1 != "" and flag2 != "":
+            start1 = readList[0]["start"]
+            start2 = readList[1]["start"]
+            dist = abs(start1 - start2)
+            if minDist < dist < maxDist:
+                isFarPair = True
 
+    return isFarPair
 
-def splitReadWrite(chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
-    readSizes = "%d" % (stopList[0] - startList[0])
-    readCoords = "0"
+
+def splitReadWrite(chrom, numPieces, startList, readlen, rsense, readName, plusSenseColor, minusSenseColor):
+    sizes = ["%d" % readlen]
+    coords = ["0"]
     leftStart = startList[0] - 1
-    rightStop = stopList[-1]
+    rightStop = startList[-1]
     for index in range(1, numPieces):
-        readSizes += ",%d" % (stopList[index] - startList[index] + 1)
-        readCoords += ",%d" % (startList[index] - startList[0])
+        sizes.append("%d" % (readlen + 1))
+        coords.append("%d" % (startList[index] - startList[0]))
 
     if rsense == "+":
-        senseCode = plusSense
+        senseCode = plusSenseColor
     else:
-        senseCode = minusSense
+        senseCode = minusSenseColor
 
+    readSizes = string.join(sizes, ",")
+    readCoords = string.join(coords, ",")
     outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
+
     return outline