Release version for Erange 4.0a
[erange.git] / siteintersects.py
index 4826a13bbb31824ffea9b187d383f42b35bbf682..e012bb49436c92f50a1d8722182d0cd72a754897 100755 (executable)
@@ -4,10 +4,29 @@
 #
 
 import sys
+from commoncode import regionsOverlap
 
 print "siteintersects: version 2.1"
 
 
+class Site():
+    def __init__(self, line, doExpanded=False):
+        fields = line.strip().split()
+        if doExpanded:
+            self.chrom = fields[1][3:]
+            self.start = int(fields[2])
+            self.stop = int(fields[3])
+            self.rest = fields[4:]
+        else:
+            (chromosome, pos) = fields[0].split(":")
+            self.chrom = chromosome[3:]
+            (start, stop) = pos.split("-")
+            self.start = int(start)
+            self.stop = int(stop)
+            self.rest = fields[1:]
+
+
+
 def main(argv=None):
     if not argv:
         argv = sys.argv
@@ -33,115 +52,114 @@ def main(argv=None):
     siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded)
 
 
-def siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename=None, reject2filename=None, doReject=False, doExpanded=False):
+def siteintersects(siteFilename, siteCompareFilename, outfilename, siteRejectFilename=None, compareRejectFilename=None, doReject=False, doExpanded=False):
 
-    siteDict = {}
-    file1Dict = {}
+    siteDict, rejectSiteDict = getSiteDicts(siteFilename, doExpanded, doReject)
+    commonSiteCount = compareSites(siteCompareFilename, compareRejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject)
 
-    infile1count = 0
-    infile = open(sitefilename1)
+    if doReject and siteRejectFilename is not None:
+        writeRejectSiteFile(siteRejectFilename, rejectSiteDict)
+
+    print commonSiteCount
+
+
+def getSiteDicts(sitefilename, doExpanded=False, doReject=False):
+    siteDict = {}
+    rejectSiteDict = {}
+    processedLineCount = 0
+    infile = open(sitefilename)
     infile.readline()
     for line in infile:
-        if line[0] == "#":
+        if doNotProcessLine(line):
             continue
 
-        infile1count += 1
-        fields = line.strip().split()
-        if doExpanded:
-            chrom = fields[1][3:]
-            start = int(fields[2])
-            stop = int(fields[3])
-            rest = fields[4:]
-        else:
-            (chrom, pos) = fields[0].split(":")
-            chrom = chrom[3:]
-            (start, stop) = pos.split("-")
-            start = int(start)
-            stop = int(stop)
-            rest = fields[1:]
+        processedLineCount += 1
+        site = Site(line, doExpanded)
+        chrom = site.chrom
+        start = site.start
+        stop = site.stop
+        rest = site.fieldList
 
         try:
             siteDict[chrom].append((start, stop, rest))
-        except:
+        except KeyError:
             siteDict[chrom] = [(start, stop, rest)]
 
         if doReject:
-            file1Dict[str((chrom, start, stop, rest))] = line
+            rejectSiteDict[str((chrom, start, stop, rest))] = line
 
     infile.close()
+    print "file1: %d" % processedLineCount
 
-    print "file1: %d" % infile1count
+    return siteDict, rejectSiteDict
 
-    infile2count = 0
-    infile = open(sitefilename2)
+
+def doNotProcessLine(line):
+    return line[0] == "#"
+
+
+def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject):
+    processedLineCount = 0
+    infile = open(siteFilename)
     infile.readline()
 
     commonSites = 0
-    unique2List = []
+    if doReject and rejectFilename is not None:
+        rejectfile = open(rejectFilename, "w")
+    else:
+        doReject=False
+
     outfile = open(outfilename, "w")
     for line in infile:
-        if line[0] == "#":
+        if doNotProcessLine(line):
             continue
 
-        infile2count += 1
-        fields = line.strip().split()
-        if doExpanded:
-            chrom = fields[1][3:]
-            start = int(fields[2])
-            stop = int(fields[3])
-            rest = fields[4:]
-        else:
-            (chrom, pos) = fields[0].split(":")
-            chrom = chrom[3:]
-            (start, stop) = pos.split("-")
-            rest = str(fields[1:])
-
-        start = int(start)
-        stop = int(stop)
-        mid = start + abs(stop - start)/2
+        processedLineCount += 1
+        site = Site(line, doExpanded)
+        chrom = site.chrom
+        start = site.start
+        stop = site.stop
         if chrom not in siteDict:
             if doReject:
-                unique2List.append(line)
-                continue
+                rejectfile.write(line)
+
+            continue
 
-        twoNotCommon = True
+        siteNotCommon = True
         for (rstart, rstop, rline) in siteDict[chrom]:
-            rsize = abs(rstart - rstop) /2
-            rmid = rstart + abs(rstop - rstart)/2
-            if abs(mid - rmid) < rsize:
+            if regionsOverlap(start, stop, rstart, rstop):
                 commonSites += 1
-                if twoNotCommon:
-                    outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, rest))
-                    twoNotCommon = False
+                if siteNotCommon:
+                    outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, site.fieldList))
+                    siteNotCommon = False
 
                 try:
                     if doReject:
-                        del file1Dict[str((chrom, rstart, rstop, rline))]
-                except:
+                        del rejectSiteDict[str((chrom, rstart, rstop, rline))]
+                except KeyError:
                     pass
 
-        if doReject and twoNotCommon:
-            unique2List.append(line)
+        if doReject and siteNotCommon:
+            rejectfile.write(line)
 
-    outfile.close()
+    if doReject:
+        rejectfile.close()
 
-    print "file2: %d" % infile2count
+    outfile.close()
 
-    if doReject:
-        reject1file = open(reject1filename, "w")
-        reject2file = open(reject2filename, "w")
+    print "file2: %d" % processedLineCount
 
-        for key in file1Dict:
-            reject1file.write(file1Dict[key])
+    return commonSites
 
-        for line in unique2List:
-            reject2file.write(line)
 
-        reject1file.close()
-        reject2file.close()
+def writeRejectSiteFile(siteRejectFilename, rejectSiteDict):
+    rejectfile = open(siteRejectFilename, "w")
 
-    print commonSites
+    for key in rejectSiteDict:
+        rejectfile.write(rejectSiteDict[key])
 
+    rejectfile.close()
+    
 
 if __name__ == "__main__":
-    main(sys.argv)
\ No newline at end of file
+    main(sys.argv)