erange version 4.0a dev release
[erange.git] / regionintersects.py
index 340d2f88f0cda1177cf0b8c772d5e808c8e39f94..49a0e328af4dc8bf09bf3c7dc3a511e56b4e4a38 100755 (executable)
@@ -8,10 +8,12 @@ try:
 except:
     pass
 
-import sys, optparse
-from commoncode import readDataset, getMergedRegions, findPeak
+import sys
+import optparse
+from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigBoolOption
+import ReadDataset
 
-print "%prog: version 3.0"
+print "regionintersects: version 3.1"
 
 def main(argv=None):
     if not argv:
@@ -19,16 +21,7 @@ def main(argv=None):
 
     usage = "usage: python %prog rdsfile1 regionfile1 rdsfile2 regionfile2 outfile [--reject1 File1] [--reject2 File2] [--union] [--cache] [--raw]"
 
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--reject1", dest="rejectOneName")
-    parser.add_option("--reject2", dest="rejectTwoName")
-    parser.add_option("--union", action="store_true", dest="trackReject")
-    parser.add_option("--cache", action="store_true", dest="doCache")
-    parser.add_option("--raw", action="store_false", dest="normalize")
-    parser.add_option("--verbose", action="store_true", dest="doVerbose")
-    parser.set_defaults(rejectOneName=None, rejectTwoName=None, trackReject=False,
-                        doCache=False, normalize=True, doVerbose=False)
-
+    parser = getParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
     if len(args) < 5:
@@ -47,6 +40,31 @@ def main(argv=None):
                      options.doVerbose)
 
 
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--reject1", dest="rejectOneName")
+    parser.add_option("--reject2", dest="rejectTwoName")
+    parser.add_option("--union", action="store_true", dest="trackReject")
+    parser.add_option("--cache", action="store_true", dest="doCache")
+    parser.add_option("--raw", action="store_false", dest="normalize")
+    parser.add_option("--verbose", action="store_true", dest="doVerbose")
+
+    configParser = getConfigParser()
+    section = "regionintersects"
+    rejectOneName = getConfigOption(configParser, section, "rejectOneName", None)
+    rejectTwoName = getConfigOption(configParser, section, "rejectTwoName", None)
+    trackReject = getConfigBoolOption(configParser, section, "trackReject", False)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    normalize = getConfigBoolOption(configParser, section, "normalize", True)
+    doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
+
+    parser.set_defaults(rejectOneName=rejectOneName, rejectTwoName=rejectTwoName,
+                        trackReject=trackReject, doCache=doCache, normalize=normalize,
+                        doVerbose=doVerbose)
+
+    return parser
+
+
 def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
                      outfilename, rejectOneName=None, rejectTwoName=None,
                      trackReject=False, doCache=False, normalize=True, doVerbose=False):
@@ -69,8 +87,8 @@ def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
     oneDict = getMergedRegions(regionOneName, mergedist, verbose=doVerbose)
     twoDict = getMergedRegions(regionTwoName, mergedist, verbose=doVerbose)
 
-    oneRDS = readDataset(readOneName, verbose=doVerbose, cache=doCache) 
-    twoRDS = readDataset(readTwoName, verbose=doVerbose, cache=doCache)
+    oneRDS = ReadDataset.ReadDataset(readOneName, verbose=doVerbose, cache=doCache) 
+    twoRDS = ReadDataset.ReadDataset(readTwoName, verbose=doVerbose, cache=doCache)
 
     if normalize:
         normalize1 = len(oneRDS) / 1000000.
@@ -88,36 +106,37 @@ def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
 
     numRegionsOne = 0
     numRegionsTwo = 0
+    commonChromosomeList = set(oneDict.keys())
     for rchrom in oneDict:
         numRegionsOne += len(oneDict[rchrom])
 
     for rchrom in twoDict:
+        commonChromosomeList.add(rchrom)
         numRegionsTwo += len(twoDict[rchrom])
 
     outfile.write("#%d\tregions in\t%s\n#%d\tregions in\t%s\n" % (numRegionsOne, regionOneName, numRegionsTwo, regionTwoName))
 
-    for rchrom in oneDict:
-        if rchrom not in twoDict:
-            continue
-
-        print rchrom
+    for chromosome in commonChromosomeList:
+        print chromosome
         rindex = 0
         rindex2 = 0
-        fullchrom = "chr" + rchrom
+        fullchrom = "chr%s" % chromosome
         oneReads = oneRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
         dictLen1 = len(oneReads[fullchrom])
         twoReads = twoRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
         dictLen2 = len(twoReads[fullchrom])
-        chrom = rchrom
-        onePeaksDict[chrom] = []
-        oneFoundDict[chrom] = []
-        for (start, stop, length) in oneDict[chrom]:
+        onePeaksDict[chromosome] = []
+        oneFoundDict[chromosome] = []
+        for region in oneDict[chromosome]:
+            start = region.start
+            stop = region.stop
+            length = region.length
             readList = []
             for localIndex in xrange(rindex, dictLen1):
                 read = oneReads[fullchrom][localIndex]
-                if read[0] < start:
+                if read["start"] < start:
                     rindex += 1
-                elif start <= read[0] <= stop:
+                elif start <= read["start"] <= stop:
                     readList.append(read)
                 else:
                     break
@@ -127,17 +146,20 @@ def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
 
             readList.sort()
 
-            (topPos, numHits, smoothArray, numPlus) = findPeak(readList, start, length, doWeight=True)
-            onePeakScore = smoothArray[topPos[0]]
-            onePeaksDict[chrom].append((topPos[0] + start, length/2, start, stop, numHits/normalize1, onePeakScore/normalize1))
+            peak = findPeak(readList, start, length, doWeight=True)
+            onePeakScore = peak.smoothArray[peak.topPos[0]]
+            onePeaksDict[chromosome].append((peak.topPos[0] + start, length/2, start, stop, peak.numHits/normalize1, onePeakScore/normalize1))
 
-        for (start, stop, length) in twoDict[chrom]:
+        for region in twoDict[chromosome]:
+            start = region.start
+            stop = region.stop
+            length = region.length
             readList2 = []
             for localIndex in xrange(rindex2, dictLen2):
                 read = twoReads[fullchrom][localIndex]
-                if read[0] < start:
+                if read["start"] < start:
                     rindex2 += 1
-                elif start <= read[0] <= stop:
+                elif start <= read["start"] <= stop:
                     readList2.append(read)
                 else:
                     break
@@ -146,45 +168,46 @@ def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
                 continue
 
             readList2.sort()
-            (topPos, numHits, smoothArray, numPlus) = findPeak(readList2, start, length, doWeight=True)
+            peak2 = findPeak(readList2, start, length, doWeight=True)
+            numHits = peak2.numHits
             numHits /= normalize2
             twoIsCommon = False
-            twoPeak = topPos[0] + start
+            twoPeak = peak2.topPos[0] + start
             twoRadius = length/2
-            twoPeakScore = smoothArray[topPos[0]] / normalize2
-            for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]:
+            twoPeakScore = peak2.smoothArray[peak2.topPos[0]] / normalize2
+            for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chromosome]:
                 if abs(twoPeak - onePeak) < (twoRadius + oneRadius):
                     if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict:
-                        oneFoundDict[chrom].append((onePeak, oneRadius, ostart, ostop, ohits))
+                        oneFoundDict[chromosome].append((onePeak, oneRadius, ostart, ostop, ohits))
 
                     twoIsCommon = True
                     commonRegions += 1
-                    outline = "common%d\tchr%s\t%d\t%d\t%.1f\t%.1f\tchr%s\t%d\t%d\t%.1f\t%.1f" % (commonRegions, chrom, ostart, ostop, ohits, opeakScore, chrom, start, stop, numHits, twoPeakScore)
+                    outline = "common%d\tchr%s\t%d\t%d\t%.1f\t%.1f\tchr%s\t%d\t%d\t%.1f\t%.1f" % (commonRegions, chromosome, ostart, ostop, ohits, opeakScore, chromosome, start, stop, numHits, twoPeakScore)
                     if doVerbose:
                         print outline
 
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
 
             if trackReject and not twoIsCommon:
                 twoRejectIndex += 1
-                outline = "rejectTwo%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (twoRejectIndex, chrom, start, stop, numHits, twoPeakScore)
+                outline = "rejectTwo%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (twoRejectIndex, chromosome, start, stop, numHits, twoPeakScore)
                 if doReject:
-                    rejectTwo.write(outline + "\n")
+                    print >> rejectTwo, outline
                 else:
-                    outfile.write(outline + "\n")
+                    print >> outfile, outline
 
                 if doVerbose:
                     print outline
 
         if trackReject:
-            for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]:
-                if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chrom]:
+            for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chromosome]:
+                if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chromosome]:
                     oneRejectIndex += 1
-                    outline = "rejectOne%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (oneRejectIndex, chrom, ostart, ostop, ohits, opeakScore)
+                    outline = "rejectOne%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (oneRejectIndex, chromosome, ostart, ostop, ohits, opeakScore)
                     if doReject:
-                        rejectOne.write(outline + "\n")
+                        print >> rejectOne, outline
                     else:
-                        outfile.write(outline + "\n")
+                        print >> outfile, outline
 
                     if doVerbose:
                         print outline