snapshot of 4.0a development. initial git repo commit
[erange.git] / regionintersects.py
1 #
2 #  regionintersects.py
3 #  ENRAGE
4 #
5 try:
6     import psyco
7     psyco.full()
8 except:
9     pass
10
11 import sys, optparse
12 from commoncode import readDataset, getMergedRegions, findPeak
13
14 print "%prog: version 3.0"
15
16 def main(argv=None):
17     if not argv:
18         argv = sys.argv
19
20     usage = "usage: python %prog rdsfile1 regionfile1 rdsfile2 regionfile2 outfile [--reject1 File1] [--reject2 File2] [--union] [--cache] [--raw]"
21
22     parser = optparse.OptionParser(usage=usage)
23     parser.add_option("--reject1", dest="rejectOneName")
24     parser.add_option("--reject2", dest="rejectTwoName")
25     parser.add_option("--union", action="store_true", dest="trackReject")
26     parser.add_option("--cache", action="store_true", dest="doCache")
27     parser.add_option("--raw", action="store_false", dest="normalize")
28     parser.add_option("--verbose", action="store_true", dest="doVerbose")
29     parser.set_defaults(rejectOneName=None, rejectTwoName=None, trackReject=False,
30                         doCache=False, normalize=True, doVerbose=False)
31
32     (options, args) = parser.parse_args(argv[1:])
33
34     if len(args) < 5:
35         print usage
36         sys.exit(1)
37
38     readOneName =  args[0]
39     regionOneName = args[1]
40     readTwoName = args[2]
41     regionTwoName = args[3]
42     outfilename = args[4]
43
44     regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
45                      outfilename, options.rejectOneName, options.rejectTwoName,
46                      options.trackReject, options.doCache, options.normalize,
47                      options.doVerbose)
48
49
50 def regionintersects(readOneName, regionOneName, readTwoName, regionTwoName,
51                      outfilename, rejectOneName=None, rejectTwoName=None,
52                      trackReject=False, doCache=False, normalize=True, doVerbose=False):
53
54     mergedist=0
55
56     outfile = open(outfilename, "w")
57
58     doReject = False
59     if rejectOneName is not None:
60         trackReject = True
61         doReject = True
62         rejectOne = open(rejectOneName, "w")
63
64     if rejectTwoName is not None:
65         trackReject = True
66         doReject = True
67         rejectTwo = open(rejectTwoName, "w")
68
69     oneDict = getMergedRegions(regionOneName, mergedist, verbose=doVerbose)
70     twoDict = getMergedRegions(regionTwoName, mergedist, verbose=doVerbose)
71
72     oneRDS = readDataset(readOneName, verbose=doVerbose, cache=doCache) 
73     twoRDS = readDataset(readTwoName, verbose=doVerbose, cache=doCache)
74
75     if normalize:
76         normalize1 = len(oneRDS) / 1000000.
77         normalize2 = len(twoRDS) / 1000000.
78     else:
79         normalize1 = 1.
80         normalize2 = 1.
81
82     commonRegions = 0
83     oneRejectIndex = 0
84     twoRejectIndex = 0
85
86     onePeaksDict = {}
87     oneFoundDict = {}
88
89     numRegionsOne = 0
90     numRegionsTwo = 0
91     for rchrom in oneDict:
92         numRegionsOne += len(oneDict[rchrom])
93
94     for rchrom in twoDict:
95         numRegionsTwo += len(twoDict[rchrom])
96
97     outfile.write("#%d\tregions in\t%s\n#%d\tregions in\t%s\n" % (numRegionsOne, regionOneName, numRegionsTwo, regionTwoName))
98
99     for rchrom in oneDict:
100         if rchrom not in twoDict:
101             continue
102
103         print rchrom
104         rindex = 0
105         rindex2 = 0
106         fullchrom = "chr" + rchrom
107         oneReads = oneRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
108         dictLen1 = len(oneReads[fullchrom])
109         twoReads = twoRDS.getReadsDict(fullChrom=True, chrom=fullchrom, withWeight=True, doMulti=True)
110         dictLen2 = len(twoReads[fullchrom])
111         chrom = rchrom
112         onePeaksDict[chrom] = []
113         oneFoundDict[chrom] = []
114         for (start, stop, length) in oneDict[chrom]:
115             readList = []
116             for localIndex in xrange(rindex, dictLen1):
117                 read = oneReads[fullchrom][localIndex]
118                 if read[0] < start:
119                     rindex += 1
120                 elif start <= read[0] <= stop:
121                     readList.append(read)
122                 else:
123                     break
124
125             if len(readList) < 1:
126                 continue
127
128             readList.sort()
129
130             (topPos, numHits, smoothArray, numPlus) = findPeak(readList, start, length, doWeight=True)
131             onePeakScore = smoothArray[topPos[0]]
132             onePeaksDict[chrom].append((topPos[0] + start, length/2, start, stop, numHits/normalize1, onePeakScore/normalize1))
133
134         for (start, stop, length) in twoDict[chrom]:
135             readList2 = []
136             for localIndex in xrange(rindex2, dictLen2):
137                 read = twoReads[fullchrom][localIndex]
138                 if read[0] < start:
139                     rindex2 += 1
140                 elif start <= read[0] <= stop:
141                     readList2.append(read)
142                 else:
143                     break
144
145             if len(readList2) < 1:
146                 continue
147
148             readList2.sort()
149             (topPos, numHits, smoothArray, numPlus) = findPeak(readList2, start, length, doWeight=True)
150             numHits /= normalize2
151             twoIsCommon = False
152             twoPeak = topPos[0] + start
153             twoRadius = length/2
154             twoPeakScore = smoothArray[topPos[0]] / normalize2
155             for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]:
156                 if abs(twoPeak - onePeak) < (twoRadius + oneRadius):
157                     if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict:
158                         oneFoundDict[chrom].append((onePeak, oneRadius, ostart, ostop, ohits))
159
160                     twoIsCommon = True
161                     commonRegions += 1
162                     outline = "common%d\tchr%s\t%d\t%d\t%.1f\t%.1f\tchr%s\t%d\t%d\t%.1f\t%.1f" % (commonRegions, chrom, ostart, ostop, ohits, opeakScore, chrom, start, stop, numHits, twoPeakScore)
163                     if doVerbose:
164                         print outline
165
166                     outfile.write(outline + "\n")
167
168             if trackReject and not twoIsCommon:
169                 twoRejectIndex += 1
170                 outline = "rejectTwo%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (twoRejectIndex, chrom, start, stop, numHits, twoPeakScore)
171                 if doReject:
172                     rejectTwo.write(outline + "\n")
173                 else:
174                     outfile.write(outline + "\n")
175
176                 if doVerbose:
177                     print outline
178
179         if trackReject:
180             for (onePeak, oneRadius, ostart, ostop, ohits, opeakScore) in onePeaksDict[chrom]:
181                 if (onePeak, oneRadius, ostart, ostop, ohits) not in oneFoundDict[chrom]:
182                     oneRejectIndex += 1
183                     outline = "rejectOne%d\tchr%s\t%d\t%d\t%.1f\t%.1f" % (oneRejectIndex, chrom, ostart, ostop, ohits, opeakScore)
184                     if doReject:
185                         rejectOne.write(outline + "\n")
186                     else:
187                         outfile.write(outline + "\n")
188
189                     if doVerbose:
190                         print outline
191
192     if trackReject:
193         print "common: %d   one-only: %d   two-only: %d" % (commonRegions, oneRejectIndex, twoRejectIndex)
194         outfile.write("#common: %d\tone-only: %d\ttwo-only: %d\n" % (commonRegions, oneRejectIndex, twoRejectIndex))
195     else:
196         print "common: %d" % commonRegions
197         outfile.write("#common: %d\n" % commonRegions)
198
199     outfile.close()
200
201
202 if __name__ == "__main__":
203     main(sys.argv)