first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / makebedfromrds.py
1 #
2 #  makebedfromrds.py
3 #  ENRAGE
4 #
5 #  Created by Ali Mortazavi on 7/19/08.
6 #
7
8 try:
9     import psyco
10     psyco.full()
11 except:
12     pass
13
14 import sys
15 import optparse
16 import ReadDataset
17 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
18
19 PLUS_COLOR = "0,0,255"
20 MINUS_COLOR = "255,0,0"
21 MULTI_PLUS_COLOR = "64,64,64"
22 MULTI_MINUS_COLOR = "192,192,192"
23 SPLICE_COLOR = "255,0,0"
24 UNIQUE_COLOR = "0,0,0"
25 MULTI_COLOR = "128,128,128"
26
27
28 def main(argv=None):
29     if not argv:
30         argv = sys.argv
31
32     verstring = "makebedfromrds: version 3.2"
33     print verstring
34
35     doPairs = False
36     
37     usage = "usage:  %prog trackLabel rdsFile bamFile [options]"
38
39     parser = getParser(usage)
40     (options, args) = parser.parse_args(argv[1:])
41
42     try:
43         trackType = args[0]
44     except IndexError:
45         print "no track specified - see --help for usage"
46         sys.exit(1)
47
48     try:
49         rdsfile = args[1]
50     except IndexError:
51         print "no RDS file specified - see --help for usage"
52         sys.exit(1)
53
54     try:
55         outfilename = args[2]
56     except IndexError:
57         print "no output file specified - see --help for usage"
58         sys.exit(1)
59
60     if options.pairDist is not None:
61         doPairs = True
62
63     if options.chromList:
64         options.allChrom = False
65
66     outputBedFromRds(trackType, rdsfile, outfilename, options.withUniqs, options.withMulti,
67                      options.doSplices, options.doSpliceColor, doPairs, options.pairDist,
68                      options.withFlag, options.useFlagLike, options.enforceChr, options.senseStrand,
69                      options.allChrom, options.doCache, options.cachePages, options.chromList)
70
71
72 def getParser(usage):
73     parser = optparse.OptionParser(usage=usage)
74     parser.add_option("--nouniq", action="store_false", dest="withUniqs")
75     parser.add_option("--nomulti", action="store_false", dest="withMulti")
76     parser.add_option("--splices", action="store_true", dest="doSplices")
77     parser.add_option("--spliceColor", action="store_true", dest="doSpliceColor")
78     parser.add_option("--flag", dest="withFlag")
79     parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
80     parser.add_option("--pairs", type="int", dest="pairDist")
81     parser.add_option("--cache", type="int", dest="cachePages")
82     parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
83     parser.add_option("--chrom", action="append", dest="chromList")
84     parser.add_option("--strand", dest="strand")
85
86     configParser = getConfigParser()
87     section = "makebedfromrds"
88     withUniqs = getConfigBoolOption(configParser, section, "withUniqs", True)
89     withMulti = getConfigBoolOption(configParser, section, "withMulti", False)
90     doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
91     doSpliceColor = getConfigBoolOption(configParser, section, "doSpliceColor", False)
92     pairDist = getConfigOption(configParser, section, "pairDist", None)
93     withFlag = getConfigOption(configParser, section, "withFlag", "")
94     useFlagLike = getConfigBoolOption(configParser, section, "useFlagLike", False)
95     enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
96     senseStrand = getConfigOption(configParser, section, "senseStrand", "")
97     allChrom = getConfigBoolOption(configParser, section, "allChrom", True)
98     doCache = getConfigBoolOption(configParser, section, "doCache", False)
99     cachePages = getConfigOption(configParser, section, "cachePages", 100000)
100
101     parser.set_defaults(withUniqs=withUniqs, withMulti=withMulti, doSplices=doSplices, doSpliceColor=doSpliceColor,
102                         pairDist=pairDist, withFlag=withFlag, useFlagLike=useFlagLike, enforceChr=enforceChr,
103                         senseStrand=senseStrand, allChrom=allChrom, doCache=doCache, cachePages=cachePages,
104                         chromList=[])
105
106     return parser
107
108
109 def outputBedFromRds(trackType, rdsfile, outfilename, withUniqs=True, withMulti=True,
110                      doSplices=False, doSpliceColor=False, doPairs=False, pairDist=1000000,
111                      withFlag="", useFlagLike=False, enforceChr=False, senseStrand="",
112                      allChrom=True, doCache=False, cachePages=100000, chromList=[]):
113
114     if not withUniqs and not withMulti and not doSplices:
115         print "must be outputing at least one of uniqs, multi, or -splices - exiting"
116         sys.exit(1)
117
118     print "\nsample:"
119     RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
120
121     #check that this is better than the dataset's default cache size
122     if cachePages > RDS.getDefaultCacheSize():
123         RDS.setDBcache(cachePages)
124
125     readlength = RDS.getReadSize()
126     minDist = -1 * readlength
127
128     if allChrom:
129         if withUniqs:
130             chromList = RDS.getChromosomes()
131         elif withMulti:
132             chromList = RDS.getChromosomes(table="multi")
133         else:
134             chromList = RDS.getChromosomes(table="splices")
135
136         chromList.sort()
137
138     outfile = open(outfilename, "w")
139     outfile.write('track name="%s" visibility=4 itemRgb="On"\n' % (trackType))
140
141     if withUniqs or withMulti:
142         for achrom in chromList:
143             index = 0
144             if doNotOutputChromosome(achrom, enforceChr):
145                 continue
146
147             print "chromosome %s" % (achrom)
148
149             if doPairs:
150                 hitDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, flag=withFlag,
151                                            withWeight=True, withPairID=True, doUniqs=withUniqs,
152                                            doMulti=withMulti, readIDDict=True,
153                                            flagLike=useFlagLike, strand=senseStrand)
154
155                 readIDList = hitDict.keys()
156                 if doSplices:
157                     spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag,
158                                                     withPairID=True, readIDDict=True,
159                                                     flagLike=useFlagLike, strand=senseStrand)
160
161                     spliceIDList = spliceDict.keys()
162                     combDict = {}
163                     for readID in readIDList:
164                         combDict[readID] = 1
165
166                     for readID in spliceIDList:
167                         combDict[readID] = 1
168
169                     combinedIDList = combDict.keys()
170                 else:
171                     combinedIDList = readIDList
172
173                 for readID in combinedIDList:
174                     localList = []
175                     try:
176                         localList = hitDict[readID]
177                     except:
178                         pass
179
180                     if doSplices:
181                         try:
182                             localList += spliceDict[readID]
183                         except:
184                             pass
185
186                     localList.sort()
187                     listLen = len(localList) - 1
188                     localIndex = 0
189                     while localIndex <= listLen:
190                         read = localList[localIndex]
191                         try:
192                             leftpos = read["start"]
193                             leftsense = read["sense"]
194                             leftweight = read["weight"]
195                             lPairID = read["pairID"]
196                             leftstop = leftpos + readlength - 1
197                             lpart = 1
198                             startList = [leftpos]
199                             stopList = [leftstop]
200                         except KeyError:
201                             leftpos = read["startL"]
202                             LLstop = read["stopL"]
203                             LRstart = read["startR"]
204                             leftstop = read["stopL"]
205                             leftsense = read["sense"]
206                             lPairID = read["pairID"]
207                             leftweight = 1.0
208                             lpart = 2
209                             startList = [leftpos, LRstart]
210                             stopList = [LLstop, leftstop]
211
212                         if localIndex < listLen:
213                             read = localList[localIndex + 1]
214                             try:
215                                 rightpos = read["start"]
216                                 rightsense = read["sense"]
217                                 rightweight = read["weight"]
218                                 rPairID= read["pairID"]
219                                 rightstop = rightpos + readlength - 1
220                                 rpart = 1
221                                 rstartList = [rightpos]
222                                 rstopList = [rightstop]
223                             except KeyError:
224                                 rightpos = read["startL"]
225                                 RLstop = read["stopL"]
226                                 RRstart = read["startR"]
227                                 rightstop = read["stopR"]
228                                 rightsense = read["sense"]
229                                 rPairID = read["pairID"]
230                                 rightweight = 1.0
231                                 rpart = 2
232                                 rstartList = [rightpos, RRstart]
233                                 rstopList = [RLstop, rightstop]
234                         else:
235                             rightsense = "+"
236                             rightpos = 0
237                             rstartList = []
238                             rstopList = []
239
240                         if leftsense == "+" and rightsense == "-" and minDist < (rightpos - leftstop) < pairDist and lPairID != rPairID:
241                             if doSpliceColor:
242                                 plusSenseColor, minusSenseColor = getSpliceColor(lpart, rpart, leftweight, rightweight, hackType="1")
243                             elif leftweight == 1.0 or rightweight == 1.0:
244                                 plusSenseColor = "0,0,0"
245                                 minusSenseColor = MINUS_COLOR
246                             else:
247                                 plusSenseColor = "128,128,128"
248                                 minusSenseColor = MULTI_MINUS_COLOR
249
250                             splitReadWrite(outfile, achrom, lpart + rpart, startList + rstartList, stopList + rstopList, "+", readID, plusSenseColor, minusSenseColor)
251                             localIndex += 2
252                             index += 2
253                         else:
254                             if doSpliceColor:
255                                 plusSenseColor, minusSenseColor = getSpliceColor(lpart, rpart, leftweight, rightweight)
256                                 outputSense = "+"
257                             elif leftweight == 1.0:
258                                 plusSenseColor = PLUS_COLOR
259                                 minusSenseColor = MINUS_COLOR
260                                 outputSense = leftsense
261                             else:
262                                 plusSenseColor = PLUS_COLOR
263                                 minusSenseColor = MINUS_COLOR
264                                 outputSense = leftsense
265
266                             splitReadWrite(outfile, achrom, lpart, startList, stopList, outputSense, readID, plusSenseColor, minusSenseColor)
267                             localIndex += 1
268                             index += 1
269             else:
270                 hitDict = RDS.getReadsDict(fullChrom=True, chrom=achrom, flag=withFlag, withWeight=True, withID=True, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False, flagLike=useFlagLike)
271                 try:
272                     for read in hitDict[achrom]:
273                         pos = read["start"]
274                         sense = read["sense"]
275                         readID = read["readID"]
276                         splitReadWrite(outfile, achrom, 1, [pos], [pos + readlength - 1], sense, readID, PLUS_COLOR, MINUS_COLOR)
277                         index += 1
278                 except:
279                     pass
280
281                 if doSplices:
282                     spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, withID=True, flagLike=useFlagLike)
283                     if achrom not in spliceDict:
284                         continue
285                     for read in spliceDict[achrom]:
286                         readstart = read["startL"]
287                         Lstop = read["stopL"]
288                         Rstart = read["startR"]
289                         readstop = read["stopR"]
290                         rsense = read["sense"]
291                         readName = read["readID"]
292                         splitReadWrite(outfile, achrom, 2, [readstart, Rstart], [Lstop, readstop], rsense, readName, PLUS_COLOR, MINUS_COLOR)
293                         index += 1
294
295     elif doSplices:
296         for achrom in chromList:
297             index = 0
298             if doNotOutputChromosome(achrom, enforceChr):
299                 continue
300
301             print "chromosome %s" % (achrom)
302
303             spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=achrom, flag=withFlag, withID=True, flagLike=useFlagLike)
304             if achrom not in spliceDict:
305                 continue
306             for read in spliceDict[achrom]:
307                 readstart = read["startL"]
308                 Lstop = read["stopL"]
309                 Rstart = read["startR"]
310                 readstop = read["stopR"]
311                 rsense = read["sense"]
312                 readName = read["readID"]
313                 splitReadWrite(outfile, achrom, 2, [readstart, Rstart], [Lstop, readstop], rsense, readName, PLUS_COLOR, MINUS_COLOR)
314                 index += 1
315
316         print index
317
318     outfile.close()
319
320
321 def singleReadWrite(chrom, pos, sense, weight, readID, readlength, outfile):
322     start = pos
323     stop = pos + readlength - 1
324     senseColor = getSenseColor(sense, weight)
325     outfile.write("%s %d %d %s %.1f %s 0 0 %s\n" % (chrom, start, stop, readID, weight, sense, senseColor))
326
327
328 def getSenseColor(sense, weight):
329     if weight < 1.0:
330         senseColor = getMultiSenseColor(sense)
331     else:
332         senseColor = getSingleSenseColor(sense)
333
334     return senseColor
335
336
337 def getMultiSenseColor(sense):
338     if sense == "+":
339         senseColor = MULTI_PLUS_COLOR
340     else:
341         senseColor = MULTI_MINUS_COLOR
342
343     return senseColor
344
345
346 def getSingleSenseColor(sense):
347     if sense == "+":
348         senseColor = PLUS_COLOR
349     else:
350         senseColor = MINUS_COLOR
351
352     return senseColor
353
354
355 def splitReadWrite(outfile, chrom, numPieces, startList, stopList, rsense, readName, plusSense, minusSense):
356     readSizes = getReadSizes(numPieces, startList, stopList)
357     readCoords = getReadCoords(numPieces, startList)
358     leftStart = startList[0]
359     rightStop = stopList[-1]
360
361     if rsense == "+":
362         senseCode = plusSense
363     else:
364         senseCode = minusSense
365     
366     outline = "%s\t%d\t%d\t%s\t1000\t%s\t0\t0\t%s\t%d\t%s\t%s\n" % (chrom, leftStart, rightStop, readName, rsense, senseCode, numPieces, readSizes, readCoords)
367     outfile.write(outline)
368
369
370 def getReadSizes(numPieces, startList, stopList):
371     readSizes = "%d" % (stopList[0] - startList[0])
372     for index in range(1, numPieces):
373         readSizes += ',%d' % (stopList[index] - startList[index])
374
375     return readSizes
376
377
378 def getReadCoords(numPieces, startList):
379     readCoords = "0"
380     for index in range(1, numPieces):
381         readCoords += ",%d" % (startList[index] - startList[0])
382
383     return readCoords
384
385
386 def getSpliceColor(lpart, rpart, leftweight, rightweight, hackType=None):
387     if hackType == "1":
388         if (lpart + rpart) > 2:
389             aColor = SPLICE_COLOR
390             bColor = SPLICE_COLOR
391         elif leftweight == 1.0 or rightweight == 1.0:
392             aColor = UNIQUE_COLOR
393             bColor = UNIQUE_COLOR
394         else:
395             aColor = MULTI_COLOR
396             bColor = MULTI_COLOR
397     else:
398         if lpart  > 1:
399             aColor = SPLICE_COLOR
400             bColor = SPLICE_COLOR
401         elif leftweight == 1.0:
402             aColor = UNIQUE_COLOR
403             bColor = UNIQUE_COLOR
404         else:
405             aColor = MULTI_COLOR
406             bColor = MULTI_COLOR
407
408     return aColor, bColor
409
410
411 def doNotOutputChromosome(achrom, enforceChr):
412     result = False
413
414     if achrom == "chrM":
415         result = True
416
417     if enforceChr and ("chr" not in achrom):
418         result = True
419
420     return result
421
422
423 if __name__ == "__main__":
424     main(sys.argv)