first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / regionintersects.py
1 #
2 #  regionintersects.py
3 #  ENRAGE
4 #
5 try:
6     import psyco
7     psyco.full()
8 except:
9     pass
10
11 import sys
12 import optparse
13 from commoncode import getMergedRegions, findPeak, getConfigParser, getConfigOption, getConfigBoolOption
14 import ReadDataset
15
16 print "regionintersects: version 3.1"
17
18 def main(argv=None):
19     if not argv:
20         argv = sys.argv
21
22     usage = "usage: python %prog rdsfile1 regionfile1 rdsfile2 regionfile2 outfile [--reject1 File1] [--reject2 File2] [--union] [--cache] [--raw]"
23
24     parser = getParser(usage)
25     (options, args) = parser.parse_args(argv[1:])
26
27     if len(args) < 5:
28         print usage
29         sys.exit(1)
30
31     readOneName =  args[0]
32     regionOneName = args[1]
33     readTwoName = args[2]
34     regionTwoName = args[3]
35     outfilename = args[4]
36
37     regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
38                      outfilename, options.rejectOneName, options.rejectTwoName,
39                      options.trackReject, options.doCache, options.normalize,
40                      options.doVerbose)
41
42
43 def getParser(usage):
44     parser = optparse.OptionParser(usage=usage)
45     parser.add_option("--reject1", dest="rejectOneName")
46     parser.add_option("--reject2", dest="rejectTwoName")
47     parser.add_option("--union", action="store_true", dest="trackReject")
48     parser.add_option("--cache", action="store_true", dest="doCache")
49     parser.add_option("--raw", action="store_false", dest="normalize")
50     parser.add_option("--verbose", action="store_true", dest="doVerbose")
51
52     configParser = getConfigParser()
53     section = "regionintersects"
54     rejectOneName = getConfigOption(configParser, section, "rejectOneName", None)
55     rejectTwoName = getConfigOption(configParser, section, "rejectTwoName", None)
56     trackReject = getConfigBoolOption(configParser, section, "trackReject", False)
57     doCache = getConfigBoolOption(configParser, section, "doCache", False)
58     normalize = getConfigBoolOption(configParser, section, "normalize", True)
59     doVerbose = getConfigBoolOption(configParser, section, "doVerbose", False)
60
61     parser.set_defaults(rejectOneName=rejectOneName, rejectTwoName=rejectTwoName,
62                         trackReject=trackReject, doCache=doCache, normalize=normalize,
63                         doVerbose=doVerbose)
64
65     return parser
66
67
68 def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
69                      outfilename, rejectOneName=None, rejectTwoName=None,
70                      trackReject=False, doCache=False, normalize=True, doVerbose=False):
71
72     mergedist=0
73
74     outfile = open(outfilename, "w")
75
76     doReject = False
77     if rejectOneName is not None:
78         trackReject = True
79         doReject = True
80         rejectOne = open(rejectOneName, "w")
81
82     if rejectTwoName is not None:
83         trackReject = True
84         doReject = True
85         rejectTwo = open(rejectTwoName, "w")
86
87     oneDict = getMergedRegions(regionOneName, mergedist, verbose=doVerbose)
88     twoDict = getMergedRegions(regionTwoName, mergedist, verbose=doVerbose)
89
90     oneRDS = ReadDataset.ReadDataset(readOneName, verbose=doVerbose, cache=doCache) 
91     twoRDS = ReadDataset.ReadDataset(readTwoName, verbose=doVerbose, cache=doCache)
92
93     if normalize:
94         normalize1 = len(oneRDS) / 1000000.
95         normalize2 = len(twoRDS) / 1000000.
96     else:
97         normalize1 = 1.
98         normalize2 = 1.
99
100     commonRegions = 0
101     oneRejectIndex = 0
102     twoRejectIndex = 0
103
104     onePeaksDict = {}
105     oneFoundDict = {}
106
107     numRegionsOne = 0
108     numRegionsTwo = 0
109     commonChromosomeList = set(oneDict.keys())
110     for rchrom in oneDict:
111         numRegionsOne += len(oneDict[rchrom])
112
113     for rchrom in twoDict:
114         commonChromosomeList.add(rchrom)
115         numRegionsTwo += len(twoDict[rchrom])
116
117     outfile.write("#%d\tregions in\t%s\n#%d\tregions in\t%s\n" % (numRegionsOne, regionOneName, numRegionsTwo, regionTwoName))
118
119     for chromosome in commonChromosomeList:
120         print chromosome
121         rindex = 0
122         rindex2 = 0
123         fullchrom = "chr%s" % chromosome
124         oneReads = oneRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
125         dictLen1 = len(oneReads[fullchrom])
126         twoReads = twoRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
127         dictLen2 = len(twoReads[fullchrom])
128         onePeaksDict[chromosome] = []
129         oneFoundDict[chromosome] = []
130         for region in oneDict[chromosome]:
131             start = region.start
132             stop = region.stop
133             length = region.length
134             readList = []
135             for localIndex in xrange(rindex, dictLen1):
136                 read = oneReads[fullchrom][localIndex]
137                 if read["start"] < start:
138                     rindex += 1
139                 elif start <= read["start"] <= stop:
140                     readList.append(read)
141                 else:
142                     break
143
144             if len(readList) < 1:
145                 continue
146
147             readList.sort()
148
149             peak = findPeak(readList, start, length, doWeight=True)
150             onePeakScore = peak.smoothArray[peak.topPos[0]]
151             onePeaksDict[chromosome].append((peak.topPos[0] + start, length/2, start, stop, peak.numHits/normalize1, onePeakScore/normalize1))
152
153         for region in twoDict[chromosome]:
154             start = region.start
155             stop = region.stop
156             length = region.length
157             readList2 = []
158             for localIndex in xrange(rindex2, dictLen2):
159                 read = twoReads[fullchrom][localIndex]
160                 if read["start"] < start:
161                     rindex2 += 1
162                 elif start <= read["start"] <= stop:
163                     readList2.append(read)
164                 else:
165                     break
166
167             if len(readList2) < 1:
168                 continue
169
170             readList2.sort()
171             peak2 = findPeak(readList2, start, length, doWeight=True)
172             numHits = peak2.numHits
173             numHits /= normalize2
174             twoIsCommon = False
175             twoPeak = peak2.topPos[0] + start
176             twoRadius = length/2
177             twoPeakScore = peak2.smoothArray[peak2.topPos[0]] / normalize2
178             for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chromosome]:
179                 if abs(twoPeak - onePeak) < (twoRadius + oneRadius):
180                     if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict:
181                         oneFoundDict[chromosome].append((onePeak, oneRadius, ostart, ostop, ohits))
182
183                     twoIsCommon = True
184                     commonRegions += 1
185                     outline = "common%d\tchr%s\t%d\t%d\t%.1f\t%.1f\tchr%s\t%d\t%d\t%.1f\t%.1f" % (commonRegions, chromosome, ostart, ostop, ohits, opeakScore, chromosome, start, stop, numHits, twoPeakScore)
186                     if doVerbose:
187                         print outline
188
189                     print >> outfile, outline
190
191             if trackReject and not twoIsCommon:
192                 twoRejectIndex += 1
193                 outline = "rejectTwo%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (twoRejectIndex, chromosome, start, stop, numHits, twoPeakScore)
194                 if doReject:
195                     print >> rejectTwo, outline
196                 else:
197                     print >> outfile, outline
198
199                 if doVerbose:
200                     print outline
201
202         if trackReject:
203             for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chromosome]:
204                 if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chromosome]:
205                     oneRejectIndex += 1
206                     outline = "rejectOne%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (oneRejectIndex, chromosome, ostart, ostop, ohits, opeakScore)
207                     if doReject:
208                         print >> rejectOne, outline
209                     else:
210                         print >> outfile, outline
211
212                     if doVerbose:
213                         print outline
214
215     if trackReject:
216         print "common: %d   one-only: %d   two-only: %d" % (commonRegions, oneRejectIndex, twoRejectIndex)
217         outfile.write("#common: %d\tone-only: %d\ttwo-only: %d\n" % (commonRegions, oneRejectIndex, twoRejectIndex))
218     else:
219         print "common: %d" % commonRegions
220         outfile.write("#common: %d\n" % commonRegions)
221
222     outfile.close()
223
224
225 if __name__ == "__main__":
226     main(sys.argv)