fix bug in RegionFinder.updateControlStatistics that was causing crash
[erange.git] / siteintersects.py
1 #
2 #  siteintersects.py
3 #  ENRAGE
4 #
5
6 import sys
7 from commoncode import regionsOverlap
8
9 print "siteintersects: version 2.1"
10
11
12 class Site():
13     def __init__(self, line, doExpanded=False):
14         fields = line.strip().split()
15         if doExpanded:
16             self.chrom = fields[1][3:]
17             self.start = int(fields[2])
18             self.stop = int(fields[3])
19             self.rest = fields[4:]
20         else:
21             (chromosome, pos) = fields[0].split(":")
22             self.chrom = chromosome[3:]
23             (start, stop) = pos.split("-")
24             self.start = int(start)
25             self.stop = int(stop)
26             self.rest = fields[1:]
27
28
29
30 def main(argv=None):
31     if not argv:
32         argv = sys.argv
33
34     if len(argv) < 4:
35         print "usage: python %s sitefile1 sitefile2 outfile [--reject rejectfile1 rejectfile2] [--expanded]" % argv[0]
36         sys.exit(1)
37
38     sitefilename1 =  argv[1]
39     sitefilename2 = argv[2]
40     outfilename = argv[3]
41
42     doReject = False
43     if "--reject" in sys.argv:    
44         reject1file = open(sys.argv[sys.argv.index("-reject") + 1], "w")
45         reject2file = open(sys.argv[sys.argv.index("-reject") + 2], "w")
46         doReject = True
47
48     doExpanded = False
49     if "--expanded" in sys.argv:
50         doExpanded = True
51
52     siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded)
53
54
55 def siteintersects(siteFilename, siteCompareFilename, outfilename, siteRejectFilename=None, compareRejectFilename=None, doReject=False, doExpanded=False):
56
57     siteDict, rejectSiteDict = getSiteDicts(siteFilename, doExpanded, doReject)
58     commonSiteCount = compareSites(siteCompareFilename, compareRejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject)
59
60     if doReject and siteRejectFilename is not None:
61         writeRejectSiteFile(siteRejectFilename, rejectSiteDict)
62
63     print commonSiteCount
64
65
66 def getSiteDicts(sitefilename, doExpanded=False, doReject=False):
67     siteDict = {}
68     rejectSiteDict = {}
69     processedLineCount = 0
70     infile = open(sitefilename)
71     infile.readline()
72     for line in infile:
73         if doNotProcessLine(line):
74             continue
75
76         processedLineCount += 1
77         site = Site(line, doExpanded)
78         chrom = site.chrom
79         start = site.start
80         stop = site.stop
81         rest = site.fieldList
82
83         try:
84             siteDict[chrom].append((start, stop, rest))
85         except KeyError:
86             siteDict[chrom] = [(start, stop, rest)]
87
88         if doReject:
89             rejectSiteDict[str((chrom, start, stop, rest))] = line
90
91     infile.close()
92     print "file1: %d" % processedLineCount
93
94     return siteDict, rejectSiteDict
95
96
97 def doNotProcessLine(line):
98     return line[0] == "#"
99
100
101 def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject):
102     processedLineCount = 0
103     infile = open(siteFilename)
104     infile.readline()
105
106     commonSites = 0
107     if doReject and rejectFilename is not None:
108         rejectfile = open(rejectFilename, "w")
109     else:
110         doReject=False
111
112     outfile = open(outfilename, "w")
113     for line in infile:
114         if doNotProcessLine(line):
115             continue
116
117         processedLineCount += 1
118         site = Site(line, doExpanded)
119         chrom = site.chrom
120         start = site.start
121         stop = site.stop
122         if chrom not in siteDict:
123             if doReject:
124                 rejectfile.write(line)
125
126             continue
127
128         siteNotCommon = True
129         for (rstart, rstop, rline) in siteDict[chrom]:
130             if regionsOverlap(start, stop, rstart, rstop):
131                 commonSites += 1
132                 if siteNotCommon:
133                     outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, site.fieldList))
134                     siteNotCommon = False
135
136                 try:
137                     if doReject:
138                         del rejectSiteDict[str((chrom, rstart, rstop, rline))]
139                 except KeyError:
140                     pass
141
142         if doReject and siteNotCommon:
143             rejectfile.write(line)
144
145     if doReject:
146         rejectfile.close()
147
148     outfile.close()
149
150     print "file2: %d" % processedLineCount
151
152     return commonSites
153
154
155 def writeRejectSiteFile(siteRejectFilename, rejectSiteDict):
156     rejectfile = open(siteRejectFilename, "w")
157
158     for key in rejectSiteDict:
159         rejectfile.write(rejectSiteDict[key])
160
161     rejectfile.close()
162     
163
164 if __name__ == "__main__":
165     main(sys.argv)