erange version 4.0a dev release
[erange.git] / MakeRdsFromBam.py
1 """
2 MakeRdsFromBam
3
4 Created on Jun 3, 2010
5
6 @author: sau
7 """
8
# psyco is an optional Python 2 JIT compiler; when it is not installed the
# script simply runs uncompiled, so any import/initialization failure is ignored.
try:
    import psyco
    psyco.full()
except:
    pass
14
15 import sys
16 import string
17 import optparse
18 import re
19 import pysam
20 from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption, getReverseComplement
21 import ReadDataset
22
# Number of reads buffered in memory before each bulk insert into the RDS database.
INSERT_SIZE = 100000
# Version banner printed at startup and recorded in the log file.
verstring = "makeRdsFromBam: version 1.0"
25
26
27 def main(argv=None):
28     if not argv:
29         argv = sys.argv
30     
31     print verstring
32
33     usage = "usage:  %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
34             \ninput reads must be sorted to properly record multireads"
35
36     parser = getParser(usage)
37     (options, args) = parser.parse_args(argv[1:])
38
39     try:
40         label = args[0]
41     except IndexError:
42         print "no label specified - see --help for usage"
43         sys.exit(1)
44
45     try:
46         samFileName = args[1]
47     except IndexError:
48         print "no samfile specified - see --help for usage"
49         sys.exit(1)
50
51     try:
52         outDbName = args[2]
53     except IndexError:
54         print "no outrdsfile specified - see --help for usage"
55         sys.exit(1)
56
57     makeRdsFromBam(label, samFileName, outDbName, options.init, options.doIndex, options.useSamFile,
58                    options.cachePages, options.maxMultiReadCount, options.rnaDataType, options.trimReadID)
59
60
def getParser(usage):
    """Build the optparse parser, seeding option defaults from the erange config file."""
    # Read configured defaults first so the parser can be assembled in one pass.
    configParser = getConfigParser()
    section = "makeRdsFromBam"
    init = getConfigBoolOption(configParser, section, "init", True)
    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
    useSamFile = getConfigBoolOption(configParser, section, "useSamFile", False)
    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
    maxMultiReadCount = getConfigIntOption(configParser, section, "maxMultiReadCount", 10)
    rnaDataType = getConfigBoolOption(configParser, section, "rnaDataType", False)
    trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)

    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--append", action="store_false", dest="init",
                      help="append to existing rds file [default: create new]")
    parser.add_option("--RNA", action="store_true", dest="rnaDataType",
                      help="set data type to RNA [default: DNA]")
    parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
                      help="input file is in sam format")
    parser.add_option("--index", action="store_true", dest="doIndex",
                      help="index the output rds file")
    parser.add_option("--cache", type="int", dest="cachePages",
                      help="number of cache pages to use [default: 100000")
    parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
                      help="multi counts over this value are discarded [default: 10]")
    parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
                      help="use the raw read names")

    parser.set_defaults(init=init, doIndex=doIndex, useSamFile=useSamFile, cachePages=cachePages,
                        maxMultiReadCount=maxMultiReadCount, rnaDataType=rnaDataType, trimReadID=trimReadID)

    return parser
92
93
94 def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
95                    cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True):
96
97     if useSamFile:
98         fileMode = "r"
99     else:
100         fileMode = "rb"
101
102     try:
103         samfile = pysam.Samfile(samFileName, fileMode)
104     except ValueError:
105         print "samfile index not found"
106         sys.exit(1)
107
108     if rnaDataType:
109         dataType = "RNA"
110     else:
111         dataType = "DNA"
112
113     writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))
114
115     rds = ReadDataset.ReadDataset(outDbName, init, dataType, verbose=True)
116     if not init and doIndex:
117         try:
118             if rds.hasIndex():
119                 rds.dropIndex()
120         except:
121             pass
122
123     if "sam_mapped" not in rds.getMetadata():
124         rds.insertMetadata([("sam_mapped", "True")])
125
126     defaultCacheSize = rds.getDefaultCacheSize()
127
128     if cachePages > defaultCacheSize:
129         if init:
130             rds.setDBcache(cachePages, default=True)
131         else:
132             rds.setDBcache(cachePages)
133
134     propertyList = []
135     for arg in sys.argv:
136         if "::" in arg:
137             (pname, pvalue) = arg.strip().split("::")
138             propertyList.append((pname, pvalue))
139
140     if len(propertyList) > 0:
141         rds.insertMetadata(propertyList)
142
143     totalReadCounts = {"unmapped": 0,
144                        "total": 0,
145                        "unique": 0,
146                        "multi": 0,
147                        "multiDiscard": 0,
148                        "splice": 0
149     }
150
151     readsize = 0
152
153     uniqueInsertList = []
154     multiInsertList = []
155     spliceInsertList = []
156
157     processedEntryDict = {}
158     uniqueReadDict = {}
159     multiReadDict = {}
160     spliceReadDict = {}
161
162     samFileIterator = samfile.fetch(until_eof=True)
163
164     for read in samFileIterator:
165         if read.is_unmapped:
166             totalReadCounts["unmapped"] += 1
167             continue
168
169         if readsize == 0:
170             take = (0, 1) # CIGAR operation (M/match, I/insertion)
171             readsize = sum([length for op,length in read.cigar if op in take])
172             if init:
173                 rds.insertMetadata([("readsize", readsize)])
174
175         #Build the read dictionaries
176         try:
177             readSequence = read.seq
178         except KeyError:
179             readSequence = ""
180
181         pairReadSuffix = getPairedReadNumberSuffix(read)
182         readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
183         if trimReadID:
184             rdsEntryName = "%s:%s:%d%s" % (label, read.qname, totalReadCounts["total"], pairReadSuffix)
185         else:
186             rdsEntryName = read.qname
187
188         if processedEntryDict.has_key(readName):
189             if isSpliceEntry(read.cigar):
190                 if spliceReadDict.has_key(readName):
191                     del spliceReadDict[readName]
192             else:
193                 if uniqueReadDict.has_key(readName):
194                     del uniqueReadDict[readName]
195
196                 if multiReadDict.has_key(readName):
197                     (read, priorCount, rdsEntryName) = multiReadDict[readName]
198                     count = priorCount + 1
199                     multiReadDict[readName] = (read, count, rdsEntryName)
200                 else:
201                     multiReadDict[readName] = (read, 1, rdsEntryName)
202         else:
203             processedEntryDict[readName] = ""
204             if isSpliceEntry(read.cigar):
205                 spliceReadDict[readName] = (read,rdsEntryName)
206             else:
207                 uniqueReadDict[readName] = (read, rdsEntryName)
208
209         if totalReadCounts["total"] % INSERT_SIZE == 0:
210             for entry in uniqueReadDict.keys():
211                 (readData, rdsEntryName) = uniqueReadDict[entry]
212                 chrom = samfile.getrname(readData.rname)
213                 uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
214                 totalReadCounts["unique"] += 1
215
216             for entry in spliceReadDict.keys():
217                 (readData, rdsEntryName) = spliceReadDict[entry]
218                 chrom = samfile.getrname(readData.rname)
219                 spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
220                 totalReadCounts["splice"] += 1
221
222             for entry in multiReadDict.keys():
223                 (readData, count, rdsEntryName) = multiReadDict[entry]
224                 chrom = samfile.getrname(readData.rname)
225                 if count > maxMultiReadCount:
226                     totalReadCounts["multiDiscard"] += 1
227                 else:
228                     multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count)) 
229                     totalReadCounts["multi"] += 1
230
231             rds.insertUniqs(uniqueInsertList)
232             rds.insertMulti(multiInsertList)
233             uniqueInsertList = []
234             uniqueReadDict = {}
235             multiInsertList = []
236             multiReadDict = {}
237             if dataType == "RNA":
238                 rds.insertSplices(spliceInsertList)
239                 spliceInsertList = []
240                 spliceReadDict = {}
241
242             print ".",
243             sys.stdout.flush()
244             processedEntryDict = {}
245
246         totalReadCounts["total"] += 1
247
248     if len(uniqueReadDict.keys()) > 0:
249         for entry in uniqueReadDict.keys():
250             (readData, rdsEntryName) = uniqueReadDict[entry]
251             chrom = samfile.getrname(readData.rname)
252             uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
253             totalReadCounts["unique"] += 1
254
255         rds.insertUniqs(uniqueInsertList)
256
257     if len(multiReadDict.keys()) > 0:
258         for entry in multiReadDict.keys():
259             (readData, count, rdsEntryName) = multiReadDict[entry]
260             chrom = samfile.getrname(readData.rname)
261             if count > maxMultiReadCount:
262                 totalReadCounts["multiDiscard"] += 1
263             else:
264                 multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
265                 totalReadCounts["multi"] += 1
266
267         totalReadCounts["multi"] += len(multiInsertList)
268
269     if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
270         for entry in spliceReadDict.keys():
271             (readData, rdsEntryName) = spliceReadDict[entry]
272             chrom = samfile.getrname(readData.rname)
273             spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
274             totalReadCounts["splice"] += 1
275
276         rds.insertSplices(spliceInsertList)
277
278     countStringList = ["\n%d unmapped reads discarded" % totalReadCounts["unmapped"]]
279     countStringList.append("%d unique reads" % totalReadCounts["unique"])
280     countStringList.append("%d multi reads" % totalReadCounts["multi"])
281     countStringList.append("%d multi reads count > %d discarded" % (totalReadCounts["multiDiscard"], maxMultiReadCount))
282     if dataType == "RNA":
283         countStringList.append("%d spliced reads" % totalReadCounts["splice"])
284
285     print string.join(countStringList, "\n")
286     outputCountText = string.join(countStringList, "\t")
287     writeLog("%s.log" % outDbName, verstring, outputCountText)
288
289     if doIndex:
290         print "building index...."
291         if cachePages > defaultCacheSize:
292             rds.setDBcache(cachePages)
293             rds.buildIndex(cachePages)
294         else:
295             rds.buildIndex(defaultCacheSize)
296
297
def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
    """Convert one aligned read into an RDS read tuple.

    Returns (name, chrom, start, stop, sense, 1/weight, flag, mismatches);
    weight is the multiread copy count, so unique reads carry weight 1.0.
    """
    start = int(alignedRead.pos)
    stop = int(start + readSize)
    sense = getReadSense(alignedRead.is_reverse)

    try:
        # the MD tag describes mismatches against the reference
        mismatches = getMismatches(alignedRead.opt("MD"), alignedRead.seq, sense)
    except KeyError:
        # alignment has no MD tag
        mismatches = ""

    return (readName, chrom, start, stop, sense, 1.0/weight, '', mismatches)
309
310
def getRDSSpliceEntry(alignedRead, readName, chrom, readSize):
    """Convert a spliced read into an RDS splice tuple with left/right bounds."""
    entryName, entryChrom, start, stop, sense, weight, flag, mismatches = getRDSEntry(alignedRead, readName, chrom, readSize)
    startL, startR, stopL, stopR = getSpliceBounds(start, readSize, alignedRead.cigar)

    # splices always carry weight 1.0 and an empty flag
    return (entryName, entryChrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches)
316
317
def getPairedReadNumberSuffix(read):
    """Return "/1" or "/2" for a paired-end read, "" for unpaired reads."""
    if not isPairedRead(read):
        return ""

    if read.is_read1:
        return "/1"

    if read.is_read2:
        return "/2"

    return ""
329
330
def isPairedRead(read):
    """A read counts as paired only when it is a proper pair AND mate-numbered."""
    if not read.is_proper_pair:
        return False

    return read.is_read1 or read.is_read2
333
334
def isSpliceEntry(cigarTupleList):
    """True when the CIGAR tuple list contains operation 3 (N, reference skip)."""
    return any(operation == 3 for operation, length in cigarTupleList)
343
344
def getReadSense(reverse):
    """Map a reverse-strand flag onto the strand character "-" or "+"."""
    return "-" if reverse else "+"
352
353
def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
    """Translate a SAM MD tag into erange's comma-separated mismatch string.

    Each reported mismatch is formatted as refBase, 1-based position, readBase
    (Eland-compatible). Deletions (the "^" entries in the MD tag) occupy no
    query position and are skipped. On a minus-strand read both bases are
    reverse-complemented. Returns "" when a position falls outside the query
    sequence (optionally logging the error).
    """
    deletionMarker = "^"

    matchLengths = re.findall(r"\d+", mismatchTag)
    mismatchSequences = re.findall(r"\d+([ACGTN]|\^[ACGTN]+)", mismatchTag)

    output = []
    position = 0
    for mismatch, matchLength in zip(mismatchSequences, matchLengths):
        position += int(matchLength)
        if mismatch.startswith(deletionMarker):
            # deleted reference bases are absent from the query; skip them
            continue

        try:
            if querySequence:
                genomicNucleotide = querySequence[position]
            else:
                genomicNucleotide = "N"

            if sense == "-":
                mismatch = getReverseComplement(mismatch)
                genomicNucleotide = getReverseComplement(genomicNucleotide)

            # erange positions are 1-based, Eland style
            erange1BasedElandCompatiblePosition = int(position + 1)
            output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
            position += 1
        except IndexError:
            if logErrors:
                errorMessage = "getMismatch IndexError; tag: %s, seq: %s, pos: %d" % (mismatchTag, querySequence, position)
                writeLog("MakeRdsFromBamError.log", "1.0", errorMessage)

            return ""

    return ",".join(output)
389
390
def getSpliceBounds(start, readsize, cigarTupleList):
    """Compute the exon-fragment bounds of a spliced read.

    Walks the CIGAR until the first N operation (code 3, reference skip) and
    returns (startL, startR, stopL, stopR) where the left fragment spans
    startL..stopL and the right fragment spans startR..stopR.
    NOTE(review): implicitly returns None when no N operation is present —
    callers are expected to pass splice entries only.
    """
    stopR = int(start + readsize)
    offset = 0

    for operation, length in cigarTupleList:
        if operation != 3:
            offset += length
            continue

        stopL = int(start + offset)
        startR = int(stopL + length)
        return start, startR, stopL, stopR
403
404
# Script entry point: run main() with the raw command-line arguments.
if __name__ == "__main__":
    main(sys.argv)