first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / siteintersects.py
1 #
2 #  siteintersects.py
3 #  ENRAGE
4 #
5
6 import sys
7 from commoncode import regionsOverlap
8
9 print "siteintersects: version 2.1"
10
11
12 class Site():
13     def __init__(self, line, doExpanded=False):
14         fields = line.strip().split()
15         if doExpanded:
16             self.chrom = fields[1][3:]
17             self.start = int(fields[2])
18             self.stop = int(fields[3])
19             self.rest = fields[4:]
20         else:
21             (chromosome, pos) = fields[0].split(":")
22             self.chrom = chromosome[3:]
23             (start, stop) = pos.split("-")
24             self.start = int(start)
25             self.stop = int(stop)
26             self.rest = fields[1:]
27
28
29
30 def main(argv=None):
31     if not argv:
32         argv = sys.argv
33
34     if len(argv) < 4:
35         print "usage: python %s sitefile1 sitefile2 outfile [--reject rejectfile1 rejectfile2] [--expanded]" % argv[0]
36         sys.exit(1)
37
38     sitefilename1 =  argv[1]
39     sitefilename2 = argv[2]
40     outfilename = argv[3]
41
42     doReject = False
43     if "--reject" in sys.argv:    
44         reject1file = open(sys.argv[sys.argv.index("-reject") + 1], "w")
45         reject2file = open(sys.argv[sys.argv.index("-reject") + 2], "w")
46         doReject = True
47
48     doExpanded = False
49     if "--expanded" in sys.argv:
50         doExpanded = True
51
52     siteintersects(sitefilename1, sitefilename2, outfilename, reject1file, reject2file, doReject, doExpanded)
53
54
55 def siteintersects(siteFilename, siteCompareFilename, outfilename, siteRejectFilename=None, compareRejectFilename=None, doReject=False, doExpanded=False):
56
57     siteDict, rejectSiteDict = getSiteDicts(siteFilename, doExpanded, doReject)
58     commonSiteCount = compareSites(siteCompareFilename, compareRejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject)
59
60     if doReject and siteRejectFilename is not None:
61         writeRejectSiteFile(siteRejectFilename, rejectSiteDict)
62
63     print commonSiteCount
64
65
66 def getSiteDicts(sitefilename, doExpanded=False, doReject=False):
67     siteDict = {}
68     rejectSiteDict = {}
69     processedLineCount = 0
70     infile = open(sitefilename)
71     infile.readline()
72     for line in infile:
73         if doNotProcessLine(line):
74             continue
75
76         processedLineCount += 1
77         site = Site(line, doExpanded)
78         chrom = site.chrom
79         start = site.start
80         stop = site.stop
81         rest = site.fieldList
82
83         try:
84             siteDict[chrom].append((start, stop, rest))
85         except KeyError:
86             siteDict[chrom] = [(start, stop, rest)]
87
88         if doReject:
89             rejectSiteDict[str((chrom, start, stop, rest))] = line
90
91     infile.close()
92     print "file1: %d" % processedLineCount
93
94     return siteDict, rejectSiteDict
95
96
97 def doNotProcessLine(line):
98     return line[0] == "#"
99
100
101 def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSiteDict, doExpanded, doReject):
102     processedLineCount = 0
103     infile = open(siteFilename)
104     infile.readline()
105
106     commonSites = 0
107     if doReject and rejectFilename is not None:
108         rejectfile = open(rejectFilename, "w")
109     else:
110         doReject=False
111
112     outfile = open(outfilename, "w")
113     for line in infile:
114         if doNotProcessLine(line):
115             continue
116
117         processedLineCount += 1
118         site = Site(line, doExpanded)
119         chrom = site.chrom
120         start = site.start
121         stop = site.stop
122         if chrom not in siteDict:
123             if doReject:
124                 rejectfile.write(line)
125
126             continue
127
128         siteNotCommon = True
129         #TODO: as handed to me this is O(n^m) - probably could be better...
130         #TODO: if there are multiple matches will count them but only write the first match to the file
131         for (rstart, rstop, rline) in siteDict[chrom]:
132             if regionsOverlap(start, stop, rstart, rstop):
133                 commonSites += 1
134                 if siteNotCommon:
135                     outfile.write("common%d\tchr%s\t%d\t%d\t%s\tchr%s\t%d\t%d\t%s\n" % (commonSites, chrom, rstart, rstop, str(rline), chrom, start, stop, site.fieldList))
136                     siteNotCommon = False
137
138                 try:
139                     if doReject:
140                         del rejectSiteDict[str((chrom, rstart, rstop, rline))]
141                 except KeyError:
142                     pass
143
144         if doReject and siteNotCommon:
145             rejectfile.write(line)
146
147     if doReject:
148         rejectfile.close()
149
150     outfile.close()
151
152     print "file2: %d" % processedLineCount
153
154     return commonSites
155
156
157 def writeRejectSiteFile(siteRejectFilename, rejectSiteDict):
158     rejectfile = open(siteRejectFilename, "w")
159
160     for key in rejectSiteDict:
161         rejectfile.write(rejectSiteDict[key])
162
163     rejectfile.close()
164     
165
166 if __name__ == "__main__":
167     main(sys.argv)