erange version 4.0a dev release
[erange.git] / MakeBamFromRds.py
1 """
2 MakeBamFromRds
3
4 Converts ERANGE RDS zero based file to Bam zero based format.
5
6 Usage: python MakeBamFromRDS.py rdsFile bamFile [options]
7
8 """
9
10 try:
11     import psyco
12     psyco.full()
13 except:
14     pass
15
16 import sys
17 import re
18 import optparse
19 import random
20 import string
21 import pysam
22 import ReadDataset
23 from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
24
25
26 def main(argv=None):
27     if not argv:
28         argv = sys.argv
29
30     verstring = "makeBamFromRds: version 1.0"
31     print verstring
32
33     doPairs = False
34     
35     usage = "usage: python %prog rdsFile bamFile [options]"
36
37     parser = getParser(usage)
38     (options, args) = parser.parse_args(argv[1:])
39
40     if len(args) < 2:
41         print usage
42         sys.exit(1)
43
44     rdsfile = args[0]
45     outfilename = args[1]
46
47     allChrom = True
48     if options.chromList:
49         allChrom = False
50
51     makeBamFromRds(rdsfile, outfilename, options.withUniqs, options.withMulti,
52                      options.doSplices, doPairs, options.withFlag, options.useFlagLike,
53                      options.enforceChr, allChrom, options.doCache, options.cachePages,
54                      options.chromList, options.fastaFileName)
55
56
57 def getParser(usage):
58     parser = optparse.OptionParser(usage=usage)
59     parser.add_option("--nouniq", action="store_false", dest="withUniqs")
60     parser.add_option("--nomulti", action="store_false", dest="withMulti")
61     parser.add_option("--splices", action="store_true", dest="doSplices")
62     parser.add_option("--flag", dest="withFlag")
63     parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
64     parser.add_option("--pairs", action="store_true", dest="doPairs")
65     parser.add_option("--cache", type="int", dest="cachePages")
66     parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
67     parser.add_option("--chrom", action="append", dest="chromList")
68     parser.add_option("--fasta", dest="fastaFileName")
69
70     configParser = getConfigParser()
71     section = "MakeBamFromRds"
72     withUniqs = getConfigBoolOption(configParser, section, "withUniqs", True)
73     withMulti = getConfigBoolOption(configParser, section, "withMulti", True)
74     doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
75     doPairs = getConfigBoolOption(configParser, section, "doPairs", False)
76     withFlag = getConfigOption(configParser, section, "withFlag", "")
77     useFlagLike = getConfigBoolOption(configParser, section, "useFlagLike", False)
78     enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
79     doCache = getConfigBoolOption(configParser, section, "doCache", False)
80     cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
81     fastaFileName = getConfigOption(configParser, section, "fastaFileName", "")
82
83     parser.set_defaults(withUniqs=withUniqs, withMulti=withMulti, doSplices=doSplices,
84                         doPairs=doPairs, withFlag=withFlag, useFlagLike=useFlagLike, enforceChr=enforceChr,
85                         doCache=doCache, cachePages=cachePages, fastaFileName=fastaFileName,
86                         chromList=[])
87
88     return parser
89
90
91 def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
92                      doSplices=False, doPairs=False, withFlag="",
93                      useFlagLike=False, enforceChr=False, allChrom=True,
94                      doCache=False, cachePages=100000, chromList=[], fastaFileName=""):
95
96     if not withUniqs and not withMulti and not doSplices:
97         print "must be outputting at least one of uniqs, multi, or -splices - exiting"
98         sys.exit(1)
99
100     print "\nsample:"
101     RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
102
103     if cachePages > RDS.getDefaultCacheSize():
104         RDS.setDBcache(cachePages)
105
106     readlength = RDS.getReadSize()
107
108     if allChrom:
109         if withUniqs:
110             chromList = RDS.getChromosomes()
111         elif withMulti:
112             chromList = RDS.getChromosomes(table="multi")
113         else:
114             chromList = RDS.getChromosomes(table="splices")
115
116         chromList.sort()
117
118     fastaSequenceDict = {}
119     if fastaFileName:
120         fastafile = open(fastaFileName)
121         fastaSequenceDict = getFastaSequenceDictionary(fastaFileName)
122         fastafile.close()
123
124     referenceSequenceList = []
125     chromRemoveList = []
126     for chromosome in chromList:
127         if doNotOutputChromosome(chromosome, enforceChr):
128             chromRemoveList.append(chromosome)
129         else:
130             chromosomeLength = RDS.getMaxCoordinate(chromosome, doUniqs=withUniqs, doMulti=withMulti, doSplices=doSplices)
131             referenceDataDict = {"LN": int(chromosomeLength), "SN": str(chromosome)}
132             referenceSequenceList.append(referenceDataDict)
133
134     for chrom in chromRemoveList:
135         chromList.remove(chrom)
136
137     header = {"HD": {"VN": "1.0"}}
138     if referenceSequenceList:
139         header["SQ"] = referenceSequenceList
140
141     outfile = pysam.Samfile(outfilename, "wb", header=header)
142
143     totalWrites = 0
144     noncanonicalSpliceCount = 0
145     for chrom in chromList:
146         index = 0
147         print "chromosome %s" % (chrom)
148         if withUniqs or withMulti:
149             hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
150                                        doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
151                                        flagLike=useFlagLike, withMismatch=True)
152
153             for read in hitDict[chrom]:
154                 writeBAMEntry(outfile, chrom, read, readlength)
155                 index += 1
156
157         if doSplices:
158             numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict)
159             index += numSpliceReadsWritten
160             noncanonicalSpliceCount += noncanonical
161
162         print index
163         totalWrites += index
164
165     outfile.close()
166     print "%d total reads written" % totalWrites
167     print "%d non-canonical splices" % noncanonicalSpliceCount
168
169
170 def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict={}):
171     index = 0
172     noncanonicalSplices = 0
173     spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, withWeight=True,
174                                     withMismatch=True)
175     if chrom not in spliceDict:
176         pass
177     else:
178         for read in spliceDict[chrom]:
179             if fastaSequenceDict.has_key(chrom):
180                 read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], read["startR"], read["stopL"], read["sense"])
181                 noncanonicalSplices += noncanonical
182
183             writeBAMEntry(outfile, chrom, read, readlength)
184             index += 1
185
186     return index, noncanonicalSplices
187
188
189 def writeBAMEntry(outfile, chrom, outputDict, readlength):
190     """ We need to subtract 1 from the position because rds is 1 based and
191         most of the rest of the entire world is 0 based.
192     """
193     tagList = []
194     alignedRead = pysam.AlignedRead()
195     try:
196         (readID, pairID) = outputDict["readID"].split("/")
197         paired = True
198     except ValueError:
199         readID = outputDict["readID"]
200         paired = False
201
202     alignedRead.qname = readID
203     if outputDict["sense"] == "-":
204         alignedRead.is_reverse = True
205
206     alignedRead.rname = outfile.references.index(chrom)
207
208     if outputDict.has_key("startL"):
209         startL = outputDict["startL"] - 1
210         stopL = outputDict["stopL"] - 1
211         startR = outputDict["startR"] - 1
212         stopR = outputDict["stopR"] - 1
213         alignedRead.pos = startL
214         alignedRead.cigar = [(0,stopL - startL), (3, startR - stopL), (0, stopR - startR)]
215         tagList.append(("XS", str(outputDict["sense"])))
216     else:
217         alignedRead.pos = outputDict["start"] - 1
218         alignedRead.cigar = [(0, readlength)]
219
220     if paired:
221         if pairID == "1":
222             alignedRead.is_read1 = True
223             alignedRead.is_proper_pair = True
224         elif pairID == "2":
225             alignedRead.is_read2 = True
226             alignedRead.is_proper_pair = True
227         else:
228             pass
229
230     if outputDict.has_key("mismatch"):
231         mismatchTag = getMismatches(outputDict["mismatch"])
232         if mismatchTag:
233             tagList.append(("MD", str(mismatchTag)))
234
235     if tagList:
236         alignedRead.tags = tuple(tagList)
237
238     outfile.write(alignedRead)
239
240
241 def getMismatches(mismatchString):
242     mismatchList = []
243     positions = re.findall("\d+", mismatchString)
244     nucleotides = re.findall("([ACGTN])\d+", mismatchString)
245     for index in range(0, len(positions)):
246         mismatchList.append("%s%s" % (positions[index], nucleotides[index]))
247
248     mismatch = string.join(mismatchList, "")
249
250     return mismatch
251
252
253 def doNotOutputChromosome(chrom, enforceChr):
254     result = False
255
256     if chrom == "chrM":
257         result = True
258
259     if enforceChr and ("chr" not in chrom):
260         result = True
261
262     return result
263
264
265 def getFastaSequenceDictionary(fastaFileName):
266     fastaSeqDict = {}
267     fchrom = ""
268     fseq = ""
269
270     fastafile = open(fastaFileName)
271     for line in fastafile:
272         if line[0] == ">":
273             if fchrom != "":
274                 fastaSeqDict[fchrom] = fseq
275
276             fseq = ""
277             fchrom = line[1:-1]
278         else:
279             fseq += line.strip()
280
281     fastafile.close()
282
283     return fastaSeqDict
284
285
286 def fixSpliceSense(fastaSequence, startRight, stopLeft, sense=""):
287     spliceSense = {"GTAG": "+",
288                    "GCAG": "+",
289                    "ATAC": "+",
290                    "CTAC": "-",
291                    "CTGC": "-",
292                    "GTAT": "-"
293     }
294
295     noncanonical = 0
296     intronstart = stopLeft
297     intronlen = startRight - stopLeft
298     leftJunctionSig =fastaSequence[intronstart:intronstart+2]
299     rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen]
300     spliceJunction = string.join([leftJunctionSig, rightJunctionSig], "")
301     spliceJunction = spliceJunction.upper()
302     print spliceJunction
303     if spliceSense.has_key(spliceJunction):
304         sense = spliceSense[spliceJunction]
305     else:
306         noncanonical += 1
307         senses = ["+", "-"]
308         random.shuffle(senses)
309         sense = senses[0]
310
311     return sense, noncanonical
312
313
314 if __name__ == "__main__":
315     main(sys.argv)