snapshot of 4.0a development. initial git repo commit
[erange.git] / makebedfromrds.py
1 #
2 #  makebedfromrds.py
3 #  ENRAGE
4 #
5 #  Created by Ali Mortazavi on 7/19/08.
6 #
7
8 try:
9     import psyco
10     psyco.full()
11 except:
12     pass
13
14 import sys, optparse
15 from commoncode import readDataset
16
17 PLUS_COLOR = "0,0,255"
18 MINUS_COLOR = "255,0,0"
19 MULTI_PLUS_COLOR = "64,64,64"
20 MULTI_MINUS_COLOR = "192,192,192"
21 SPLICE_COLOR = "255,0,0"
22 UNIQUE_COLOR = "0,0,0"
23 MULTI_COLOR = "128,128,128"
24
25
26 def main(argv=None):
27     if not argv:
28         argv = sys.argv
29
30     verstring = "%prog: version 3.1"
31     print verstring
32
33     doPairs = False
34     
35     usage = "usage:  %prog trackLabel rdsFile bamFile [options]"
36
37     parser = optparse.OptionParser(usage=usage)
38     parser.add_option("--nouniq", action="store_false", dest="withUniqs")
39     parser.add_option("--nomulti", action="store_false", dest="withMulti")
40     parser.add_option("--splices", action="store_true", dest="doSplices")
41     parser.add_option("--spliceColor", action="store_true", dest="doSpliceColor")
42     parser.add_option("--flag", dest="withFlag")
43     parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
44     parser.add_option("--pairs", type="int", dest="pairDist")
45     parser.add_option("--cache", type="int", dest="cachePages")
46     parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
47     parser.add_option("--chrom", action="append", dest="chromList")
48     parser.add_option("--strand", dest="strand")
49     parser.add_option("-r", "--region", dest="region", type="string",
50                       help="samtools region string")
51     parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False, doSpliceColor=False,
52                         pairDist=None, withFlag="", useFlagLike=False, enforceChr=False,
53                         senseStrand="", allChrom=True, doCache=False, cachePages=100000,
54                         chromList=[])
55     (options, args) = parser.parse_args(argv[1:])
56
57     try:
58         trackType = args[0]
59     except IndexError:
60         print "no track specified - see --help for usage"
61         sys.exit(1)
62
63     try:
64         rdsfile = args[1]
65     except IndexError:
66         print "no RDS file specified - see --help for usage"
67         sys.exit(1)
68
69     try:
70         outfilename = args[2]
71     except IndexError:
72         print "no output file specified - see --help for usage"
73         sys.exit(1)
74
75     if options.pairDist is not None:
76         doPairs = True
77
78     if options.chromList:
79         options.allChrom = False
80
81     outputBedFromRds(trackType, rdsfile, outfilename, options.withUniqs, options.withMulti,
82                      options.doSplices, options.doSpliceColor, doPairs, options.pairDist,
83                      options.withFlag, options.useFlagLike, options.enforceChr, options.senseStrand,
84                      options.allChrom, options.doCache, options.cachePages, options.chromList)
85
86
87 def outputBedFromRds(trackType, rdsfile, outfilename, withUniqs=True, withMulti=True,
88                      doSplices=False, doSpliceColor=False, doPairs=False, pairDist=1000000,
89                      withFlag="", useFlagLike=False, enforceChr=False, senseStrand="",
90                      allChrom=True, doCache=False, cachePages=100000, chromList=[]):
91
92     if not withUniqs and not withMulti and not doSplices:
93         print "must be outputing at least one of uniqs, multi, or -splices - exiting"
94         sys.exit(1)
95
96     print "\nsample:"
97     RDS = readDataset(rdsfile, verbose = True, cache=doCache)
98
99     #check that this is better than the dataset's default cache size
100     if cachePages > RDS.getDefaultCacheSize():
101         RDS.setDBcache(cachePages)
102
103     readlength = RDS.getReadSize()
104     minDist = -1 * readlength
105
106     if allChrom:
107         if withUniqs:
108             chromList = RDS.getChromosomes()
109         elif withMulti:
110             chromList = RDS.getChromosomes(table="multi")
111         else:
112             chromList = RDS.getChromosomes(table="splices")
113
114         chromList.sort()
115
116     outfile = open(outfilename, "w")
117     outfile.write('track name="%s" visibility=4 itemRgb="On"\n' % (trackType))
118
119     if withUniqs or withMulti:
120         for achrom in chromList:
121             index = 0
122             if doNotOutputChromosome(achrom, enforceChr):
123                 continue
124
125             print "chromosome %s" % (achrom)
126
127             if doPairs:
128                 hitDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, flag=withFlag,
129                                            withWeight=True, withPairID=True, doUniqs=withUniqs,
130                                            doMulti=withMulti, readIDDict=True,
131                                            flagLike=useFlagLike, strand=senseStrand)
132
133                 readIDList = hitDict.keys()
134                 if doSplices:
135                     spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag,
136                                                     withPairID=True, readIDDict=True,
137                                                     flagLike=useFlagLike, strand=senseStrand)
138
139                     spliceIDList = spliceDict.keys()
140                     combDict = {}
141                     for readID in readIDList:
142                         combDict[readID] = 1
143
144                     for readID in spliceIDList:
145                         combDict[readID] = 1
146
147                     combinedIDList = combDict.keys()
148                 else:
149                     combinedIDList = readIDList
150
151                 for readID in combinedIDList:
152                     localList = []
153                     try:
154                         localList = hitDict[readID]
155                     except:
156                         pass
157
158                     if doSplices:
159                         try:
160                             localList += spliceDict[readID]
161                         except:
162                             pass
163
164                     localList.sort()
165                     listLen = len(localList) - 1
166                     localIndex = 0
167                     while localIndex <= listLen:
168                         try:
169                             (leftpos, leftsense, leftweight, lPairID) = localList[localIndex]
170                             leftstop = leftpos + readlength - 1
171                             lpart = 1
172                             startList = [leftpos]
173                             stopList = [leftstop]
174                         except:
175                             (leftpos, LLstop, LRstart, leftstop, leftsense, lPairID) = localList[localIndex]
176                             leftweight = 1.0
177                             lpart = 2
178                             startList = [leftpos, LRstart]
179                             stopList = [LLstop, leftstop]
180
181                         if localIndex < listLen:
182                             try:
183                                 (rightpos, rightsense, rightweight, rPairID) = localList[localIndex + 1]
184                                 rightstop = rightpos + readlength - 1
185                                 rpart = 1
186                                 rstartList = [rightpos]
187                                 rstopList = [rightstop]
188                             except:
189                                 (rightpos, RLstop, RRstart, rightstop, rightsense, rPairID) = localList[localIndex + 1]
190                                 rightweight = 1.0
191                                 rpart = 2
192                                 rstartList = [rightpos, RRstart]
193                                 rstopList = [RLstop, rightstop]
194                         else:
195                             rightsense = "+"
196                             rightpos = 0
197                             rstartList = []
198                             rstopList = []
199
200                         if leftsense == "+" and rightsense == "-" and minDist < (rightpos - leftstop) < pairDist and lPairID != rPairID:
201                             if doSpliceColor:
202                                 plusSenseColor, minusSenseColor = getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
203                             elif leftweight == 1.0 or rightweight == 1.0:
204                                 plusSenseColor = "0,0,0"
205                                 minusSenseColor = MINUS_COLOR
206                             else:
207                                 plusSenseColor = "128,128,128"
208                                 minusSenseColor = MULTI_MINUS_COLOR
209
210                             splitReadWrite(outfile, achrom, lpart + rpart, startList + rstartList, stopList + rstopList, "+", readID, plusSenseColor, minusSenseColor)
211                             localIndex += 2
212                             index += 2
213                         else:
214                             if doSpliceColor:
215                                 plusSenseColor, minusSenseColor = getSpliceColor(lpart, rpart, leftweight, rightweight)
216                                 outputSense = "+"
217                             elif leftweight == 1.0:
218                                 plusSenseColor = PLUS_COLOR
219                                 minusSenseColor = MINUS_COLOR
220                                 outputSense = leftsense
221                             else:
222                                 plusSenseColor = PLUS_COLOR
223                                 minusSenseColor = MINUS_COLOR
224                                 outputSense = leftsense
225
226                             splitReadWrite(outfile, achrom, lpart, startList, stopList, outputSense, readID, plusSenseColor, minusSenseColor)
227                             localIndex += 1
228                             index += 1
229             else:
230                 hitDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, flag=withFlag, withWeight=True, withID=True, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False, flagLike=useFlagLike)
231                 try:
232                     for (pos, sense, weight, readID) in hitDict[achrom]:
233                         splitReadWrite(outfile, achrom, 1, [pos], [pos + readlength - 1], sense, readID, PLUS_COLOR, MINUS_COLOR)
234                         index += 1
235                 except:
236                     pass
237
238                 if doSplices:
239                     spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, withID=True, flagLike=useFlagLike)
240                     if achrom not in spliceDict:
241                         continue
242                     for (readstart, Lstop, Rstart, readstop, rsense, readName) in spliceDict[achrom]:
243                         splitReadWrite(outfile, achrom, 2, [readstart, Rstart], [Lstop, readstop], rsense, readName, PLUS_COLOR, MINUS_COLOR)
244                         index += 1
245
246     elif doSplices:
247         for achrom in chromList:
248             index = 0
249             if doNotOutputChromosome(achrom, enforceChr):
250                 continue
251
252             print "chromosome %s" % (achrom)
253
254             spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, withID=True, flagLike=useFlagLike)
255             if achrom not in spliceDict:
256                 continue
257             for (readstart, Lstop, Rstart, readstop, rsense, readName) in spliceDict[achrom]:
258                 splitReadWrite(outfile, achrom, 2, [readstart, Rstart], [Lstop, readstop], rsense, readName, PLUS_COLOR, MINUS_COLOR)
259                 index += 1
260
261         print index
262
263     outfile.close()
264
265
266 def singleReadWrite(chrom, pos, sense, weight, readID, readlength, outfile):
267     start = pos
268     stop = pos + readlength - 1
269     senseColor = getSenseColor(sense, weight)
270     outfile.write("%s %d %d %s %.1f %s 0 0 %s\n" % (chrom, start, stop, readID, weight, sense, senseColor))
271
272
273 def getSenseColor(sense, weight):
274     if weight < 1.0:
275         senseColor = getMultiSenseColor(sense)
276     else:
277         senseColor = getSingleSenseColor(sense)
278
279     return senseColor
280
281
282 def getMultiSenseColor(sense):
283     if sense == "+":
284         senseColor = MULTI_PLUS_COLOR
285     else:
286         senseColor = MULTI_MINUS_COLOR
287
288     return senseColor
289
290
291 def getSingleSenseColor(sense):
292     if sense == "+":
293         senseColor = PLUS_COLOR
294     else:
295         senseColor = MINUS_COLOR
296
297     return senseColor
298
299
300 def splitReadWrite(outfile, chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
301     readSizes = getReadSizes(numPieces, startList, stopList)
302     readCoords = getReadCoords(numPieces, startList)
303     leftStart = startList[0]
304     rightStop = stopList[-1]
305
306     if rsense == "+":
307         senseCode = plusSense
308     else:
309         senseCode = minusSense
310     
311     outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
312     outfile.write(outline)
313
314
315 def getReadSizes(numPieces, startList, stopList):
316     readSizes = "%d" % (stopList[0] - startList[0])
317     for index in range(1, numPieces):
318         readSizes += ',%d' % (stopList[index] - startList[index])
319
320     return readSizes
321
322
323 def getReadCoords(numPieces, startList):
324     readCoords = "0"
325     for index in range(1, numPieces):
326         readCoords += ",%d" % (startList[index] - startList[0])
327
328     return readCoords
329
330
331 def getSpliceColor(lpart, rpart, leftweight, rightweight, hackType=None):
332     if hackType == "1":
333         if (lpart + rpart) > 2:
334             aColor = SPLICE_COLOR
335             bColor = SPLICE_COLOR
336         elif leftweight == 1.0 or rightweight == 1.0:
337             aColor = UNIQUE_COLOR
338             bColor = UNIQUE_COLOR
339         else:
340             aColor = MULTI_COLOR
341             bColor = MULTI_COLOR
342     else:
343         if lpart  > 1:
344             aColor = SPLICE_COLOR
345             bColor = SPLICE_COLOR
346         elif leftweight == 1.0:
347             aColor = UNIQUE_COLOR
348             bColor = UNIQUE_COLOR
349         else:
350             aColor = MULTI_COLOR
351             bColor = MULTI_COLOR
352
353     return aColor, bColor
354
355
356 def doNotOutputChromosome(achrom, enforceChr):
357     result = False
358
359     if achrom == "chrM":
360         result = True
361
362     if enforceChr and ("chr" not in achrom):
363         result = True
364
365     return result
366
367
368 if __name__ == "__main__":
369     main(sys.argv)