erange version 4.0a dev release
[erange.git] / siteintersects.py
1 #
2 #  siteintersects.py
3 #  ENRAGE
4 #
5
6 import sys
7
8 print "siteintersects: version 2.1"
9
10
11 def main(argv=None):
12     if not argv:
13         argv = sys.argv
14
15     if len(argv) < 4:
16         print "usage: python %s sitefile1 sitefile2 outfile [--reject rejectfile1 rejectfile2] [--expanded]" % argv[0]
17         sys.exit(1)
18
19     sitefilename1 =  argv[1]
20     sitefilename2 = argv[2]
21     outfilename = argv[3]
22
23     doReject = False
24     if "--reject" in sys.argv:    
25         reject1file = open(sys.argv[sys.argv.index("-reject") + 1], "w")
26         reject2file = open(sys.argv[sys.argv.index("-reject") + 2], "w")
27         doReject = True
28
29     doExpanded = False
30     if "--expanded" in sys.argv:
31         doExpanded = True
32
33     siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded)
34
35
36 def siteintersects(sitefilename1, sitefilename2, outfilename, reject1filename=None, reject2filename=None, doReject=False, doExpanded=False):
37
38     siteDict = {}
39     file1Dict = {}
40
41     infile1count = 0
42     infile = open(sitefilename1)
43     infile.readline()
44     for line in infile:
45         if line[0] == "#":
46             continue
47
48         infile1count += 1
49         fields = line.strip().split()
50         if doExpanded:
51             chrom = fields[1][3:]
52             start = int(fields[2])
53             stop = int(fields[3])
54             rest = fields[4:]
55         else:
56             (chrom, pos) = fields[0].split(":")
57             chrom = chrom[3:]
58             (start, stop) = pos.split("-")
59             start = int(start)
60             stop = int(stop)
61             rest = fields[1:]
62
63         try:
64             siteDict[chrom].append((start, stop, rest))
65         except:
66             siteDict[chrom] = [(start, stop, rest)]
67
68         if doReject:
69             file1Dict[str((chrom, start, stop, rest))] = line
70
71     infile.close()
72
73     print "file1: %d" % infile1count
74
75     infile2count = 0
76     infile = open(sitefilename2)
77     infile.readline()
78
79     commonSites = 0
80     unique2List = []
81     outfile = open(outfilename, "w")
82     for line in infile:
83         if line[0] == "#":
84             continue
85
86         infile2count += 1
87         fields = line.strip().split()
88         if doExpanded:
89             chrom = fields[1][3:]
90             start = int(fields[2])
91             stop = int(fields[3])
92             rest = fields[4:]
93         else:
94             (chrom, pos) = fields[0].split(":")
95             chrom = chrom[3:]
96             (start, stop) = pos.split("-")
97             rest = str(fields[1:])
98
99         start = int(start)
100         stop = int(stop)
101         mid = start + abs(stop - start)/2
102         if chrom not in siteDict:
103             if doReject:
104                 unique2List.append(line)
105                 continue
106
107         twoNotCommon = True
108         for (rstart, rstop, rline) in siteDict[chrom]:
109             rsize = abs(rstart - rstop) /2
110             rmid = rstart + abs(rstop - rstart)/2
111             if abs(mid - rmid) < rsize:
112                 commonSites += 1
113                 if twoNotCommon:
114                     outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, rest))
115                     twoNotCommon = False
116
117                 try:
118                     if doReject:
119                         del file1Dict[str((chrom, rstart, rstop, rline))]
120                 except:
121                     pass
122
123         if doReject and twoNotCommon:
124             unique2List.append(line)
125
126     outfile.close()
127
128     print "file2: %d" % infile2count
129
130     if doReject:
131         reject1file = open(reject1filename, "w")
132         reject2file = open(reject2filename, "w")
133
134         for key in file1Dict:
135             reject1file.write(file1Dict[key])
136
137         for line in unique2List:
138             reject2file.write(line)
139
140         reject1file.close()
141         reject2file.close()
142
143     print commonSites
144
145
146 if __name__ == "__main__":
147     main(sys.argv)