# rewrite of findall.py and MakeRdsFromBam to fix bugs resulting from poor initial implementation
# [erange.git] / MakeRdsFromBam.py
1 """
2 MakeRdsFromBam
3
4 Created on Jun 3, 2010
5
6 @author: sau
7 """
8
9 try:
10     import psyco
11     psyco.full()
12 except:
13     pass
14
15 import sys
16 import string
17 import optparse
18 import re
19 import pysam
20 from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption, getReverseComplement
21 import ReadDataset
22
23 INSERT_SIZE = 100000
24 verstring = "makeRdsFromBam: version 1.1"
25
26
27 def main(argv=None):
28     if not argv:
29         argv = sys.argv
30     
31     print verstring
32
33     usage = "usage:  %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
34             \ninput reads must be sorted to properly record multireads"
35
36     parser = getParser(usage)
37     (options, args) = parser.parse_args(argv[1:])
38
39     try:
40         label = args[0]
41     except IndexError:
42         print "no label specified - see --help for usage"
43         sys.exit(1)
44
45     try:
46         samFileName = args[1]
47     except IndexError:
48         print "no samfile specified - see --help for usage"
49         sys.exit(1)
50
51     try:
52         outDbName = args[2]
53     except IndexError:
54         print "no outrdsfile specified - see --help for usage"
55         sys.exit(1)
56
57     if options.rnaDataType:
58         dataType = "RNA"
59     else:
60         dataType = "DNA"
61
62     makeRdsFromBam(label, samFileName, outDbName, options.init, options.doIndex, options.useSamFile,
63                    options.cachePages, options.maxMultiReadCount, dataType, options.trimReadID)
64
65
def getParser(usage):
    """Build the optparse parser for makeRdsFromBam.

    Option defaults are read from the [makeRdsFromBam] section of the
    erange configuration file via commoncode's config helpers.
    """
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--append", action="store_false", dest="init",
                      help="append to existing rds file [default: create new]")
    parser.add_option("--RNA", action="store_true", dest="rnaDataType",
                      help="set data type to RNA [default: DNA]")
    parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
                      help="input file is in sam format")
    parser.add_option("--index", action="store_true", dest="doIndex",
                      help="index the output rds file")
    # Fixed: help text was missing the closing bracket after 100000.
    parser.add_option("--cache", type="int", dest="cachePages",
                      help="number of cache pages to use [default: 100000]")
    parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
                      help="multi counts over this value are discarded [default: 10]")
    parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
                      help="use the raw read names")

    configParser = getConfigParser()
    section = "makeRdsFromBam"
    init = getConfigBoolOption(configParser, section, "init", True)
    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
    useSamFile = getConfigBoolOption(configParser, section, "useSamFile", False)
    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
    maxMultiReadCount = getConfigIntOption(configParser, section, "maxMultiReadCount", 10)
    rnaDataType = getConfigBoolOption(configParser, section, "rnaDataType", False)
    trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)

    parser.set_defaults(init=init, doIndex=doIndex, useSamFile=useSamFile, cachePages=cachePages,
                        maxMultiReadCount=maxMultiReadCount, rnaDataType=rnaDataType, trimReadID=trimReadID)

    return parser
97
98
def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
                   cachePages=100000, maxMultiReadCount=10, dataType="DNA", trimReadID=True):
    """Load a BAM/SAM alignment file into an erange RDS (ReadDataset) database.

    label -- prefix used when rewriting read IDs (see trimReadID)
    samFileName -- input BAM (or SAM with useSamFile=True); reads must be
                   sorted so multireads are recorded properly
    outDbName -- output RDS database; "<outDbName>.log" is also written
    init -- create a fresh RDS file; False appends to an existing one
    doIndex -- build the RDS index after loading
    useSamFile -- open the input as SAM text instead of BAM
    cachePages -- sqlite cache pages, applied only when larger than the
                  RDS default cache size
    maxMultiReadCount -- reads aligning more than this many times are discarded
    dataType -- "DNA" or "RNA"; splice tables are only written for RNA
    trimReadID -- replace read names with "label:name:total<pairsuffix>"

    The input is read twice: a first pass (getMultiReadIDCounts) counts how
    many times each read name aligns, then the main pass buffers entries
    and bulk-inserts them every INSERT_SIZE reads.
    """
    if useSamFile:
        fileMode = "r"   # SAM text input
    else:
        fileMode = "rb"  # BAM binary input

    writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))
    rds = ReadDataset.ReadDataset(outDbName, init, dataType, verbose=True)
    # When appending to an already-indexed RDS, drop the index first; it is
    # rebuilt at the end when doIndex is set.
    if not init and doIndex:
        try:
            if rds.hasIndex():
                rds.dropIndex()
        except:
            pass

    if "sam_mapped" not in rds.getMetadata():
        rds.insertMetadata([("sam_mapped", "True")])

    defaultCacheSize = rds.getDefaultCacheSize()

    if cachePages > defaultCacheSize:
        if init:
            rds.setDBcache(cachePages, default=True)
        else:
            rds.setDBcache(cachePages)

    # Any command-line argument of the form name::value becomes an RDS
    # metadata property.
    propertyList = []
    for arg in sys.argv:
        if "::" in arg:
            (pname, pvalue) = arg.strip().split("::")
            propertyList.append((pname, pvalue))

    if len(propertyList) > 0:
        rds.insertMetadata(propertyList)

    totalReadCounts = {"unmapped": 0,
                       "total": 0,
                       "unique": 0,
                       "multi": 0,
                       "multiDiscard": 0,
                       "splice": 0,
                       "multisplice": 0
    }

    readsize = 0

    uniqueInsertList = []
    multiInsertList = []
    spliceInsertList = []
    multispliceInsertList = []

    uniqueReadDict = {}
    multiReadDict = {}
    multispliceReadDict = {}
    spliceReadDict = {}
    # First pass: alignment counts for every read name seen more than once.
    multireadCounts = getMultiReadIDCounts(samFileName, fileMode)

    # Tally reads that aligned too many times; their alignments are dropped
    # in the main loop below.
    for readID in multireadCounts:
        if multireadCounts[readID] > maxMultiReadCount:
            totalReadCounts["multiDiscard"] += 1

    try:
        samfile = pysam.Samfile(samFileName, fileMode)
    except ValueError:
        print "samfile index not found"
        sys.exit(1)

    samFileIterator = samfile.fetch(until_eof=True)
    for read in samFileIterator:
        if read.is_unmapped:
            totalReadCounts["unmapped"] += 1
            continue

        # Derive the read size once, from the first mapped read's CIGAR
        # (sum of M and I operation lengths), and record it as metadata.
        if readsize == 0:
            take = (0, 1) # CIGAR operation (M/match, I/insertion)
            readsize = sum([length for op,length in read.cigar if op in take])
            if init:
                rds.insertMetadata([("readsize", readsize)])

        pairReadSuffix = getPairedReadNumberSuffix(read)
        readName = "%s%s" % (read.qname, pairReadSuffix)
        if trimReadID:
            rdsEntryName = "%s:%s:%d%s" % (label, read.qname, totalReadCounts["total"], pairReadSuffix)
        else:
            rdsEntryName = read.qname

        # Reads absent from multireadCounts aligned exactly once.
        try:
            count = multireadCounts[readName]
        except KeyError:
            count = 1

        # Buffer the read in the matching dict; counts above
        # maxMultiReadCount fall through (already tallied as multiDiscard).
        # NOTE(review): dicts are keyed by readName, so within one flush
        # window only the most recent alignment of a multiread is kept --
        # confirm this is the intended multiread handling.
        if count == 1:
            if isSpliceEntry(read.cigar):
                spliceReadDict[readName] = (read,rdsEntryName)
            else:
                uniqueReadDict[readName] = (read, rdsEntryName)
        elif count <= maxMultiReadCount:
            if isSpliceEntry(read.cigar):
                multispliceReadDict[readName] = (read, count, rdsEntryName)
            else:
                multiReadDict[readName] = (read, count, rdsEntryName)

        # Bulk-insert every INSERT_SIZE reads to bound memory use.
        # NOTE(review): this also fires when total == 0, flushing the very
        # first read in a batch of one -- harmless but presumably unintended.
        if totalReadCounts["total"] % INSERT_SIZE == 0:
            for entry in uniqueReadDict.keys():
                (readData, rdsEntryName) = uniqueReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
                totalReadCounts["unique"] += 1

            for entry in multiReadDict.keys():
                (readData, count, rdsEntryName) = multiReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))

            if dataType == "RNA":
                for entry in spliceReadDict.keys():
                    (readData, rdsEntryName) = spliceReadDict[entry]
                    chrom = samfile.getrname(readData.rname)
                    spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
                    totalReadCounts["splice"] += 1

                for entry in multispliceReadDict.keys():
                    (readData, count, rdsEntryName) = multispliceReadDict[entry]
                    chrom = samfile.getrname(readData.rname)
                    multispliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize, weight=count))
                    totalReadCounts["multisplice"] += 1

            rds.insertUniqs(uniqueInsertList)
            rds.insertMulti(multiInsertList)
            uniqueInsertList = []
            uniqueReadDict = {}
            multiInsertList = []
            multiReadDict = {}
            if dataType == "RNA":
                rds.insertSplices(spliceInsertList)
                spliceInsertList = []
                spliceReadDict = {}
                rds.insertMultisplices(multispliceInsertList)
                multispliceInsertList = []
                multispliceReadDict = {}

            # Progress marker, one dot per flush.
            print ".",
            sys.stdout.flush()

        totalReadCounts["total"] += 1

    # Final flush of whatever is still buffered after the loop.
    if len(uniqueReadDict.keys()) > 0:
        for entry in uniqueReadDict.keys():
            (readData, rdsEntryName) = uniqueReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
            totalReadCounts["unique"] += 1

        rds.insertUniqs(uniqueInsertList)

    if len(multiReadDict.keys()) > 0:
        for entry in multiReadDict.keys():
            (readData, count, rdsEntryName) = multiReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))

        rds.insertMulti(multiInsertList)

    if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
        for entry in spliceReadDict.keys():
            (readData, rdsEntryName) = spliceReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
            totalReadCounts["splice"] += 1

        rds.insertSplices(spliceInsertList)

    if len(multispliceReadDict.keys()) > 0 and dataType == "RNA":
        for entry in multispliceReadDict.keys():
            (readData, count, rdsEntryName) = multispliceReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            multispliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize, weight=count))
            totalReadCounts["multisplice"] += 1

        rds.insertMultisplices(multispliceInsertList)

    # multi is derived: distinct multiread names minus discarded minus
    # those that were spliced multireads.
    totalReadCounts["multi"] = len(multireadCounts) - totalReadCounts["multiDiscard"] - totalReadCounts["multisplice"]
    countStringList = ["\n%d unmapped reads discarded" % totalReadCounts["unmapped"]]
    countStringList.append("%d unique reads" % totalReadCounts["unique"])
    countStringList.append("%d multi reads" % totalReadCounts["multi"])
    countStringList.append("%d multi reads count > %d discarded" % (totalReadCounts["multiDiscard"], maxMultiReadCount))
    if dataType == "RNA":
        countStringList.append("%d spliced reads" % totalReadCounts["splice"])
        countStringList.append("%d spliced multireads" % totalReadCounts["multisplice"])

    print string.join(countStringList, "\n")
    outputCountText = string.join(countStringList, "\t")
    writeLog("%s.log" % outDbName, verstring, outputCountText)

    if doIndex:
        print "building index...."
        if cachePages > defaultCacheSize:
            rds.setDBcache(cachePages)
            rds.buildIndex(cachePages)
        else:
            rds.buildIndex(defaultCacheSize)
302
303
304 def getMultiReadIDCounts(samFileName, fileMode):
305     try:
306         samfile = pysam.Samfile(samFileName, fileMode)
307     except ValueError:
308         print "samfile index not found"
309         sys.exit(1)
310
311     readIDCounts = {}
312     for read in samfile.fetch(until_eof=True):
313         pairReadSuffix = getPairedReadNumberSuffix(read)
314         readName = "%s%s" % (read.qname, pairReadSuffix)
315         try:
316             readIDCounts[readName] += 1
317         except KeyError:
318             readIDCounts[readName] = 1
319
320     for readID in readIDCounts.keys():
321         if readIDCounts[readID] == 1:
322             del readIDCounts[readID]
323
324     return readIDCounts
325
326
def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
    """Build a non-spliced RDS tuple:
    (name, chrom, start, stop, sense, weight, flag, mismatches).

    Multireads carry a fractional weight of 1.0/count; alignments
    lacking an MD tag get an empty mismatch string.
    """
    start = int(alignedRead.pos)
    stop = int(start + readSize)
    sense = getReadSense(alignedRead.is_reverse)
    try:
        mismatches = getMismatches(alignedRead.opt("MD"), alignedRead.seq, sense)
    except KeyError:
        # pysam raises KeyError when the MD tag is absent.
        mismatches = ""

    return (readName, chrom, start, stop, sense, 1.0/weight, '', mismatches)
338
339
def getRDSSpliceEntry(alignedRead, readName, chrom, readSize, weight=1):
    """Build an RDS splice tuple for a spliced alignment:
    (name, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatches).
    """
    (entryName, entryChrom, start, stop,
     sense, entryWeight, flag, mismatches) = getRDSEntry(alignedRead, readName, chrom, readSize, weight)
    (startL, startR, stopL, stopR) = getSpliceBounds(start, readSize, alignedRead.cigar)

    return (entryName, entryChrom, startL, stopL, startR, stopR, sense, entryWeight, "", mismatches)
345
346
def getPairedReadNumberSuffix(read):
    """Return the Eland-style mate suffix for a read: "/1" for the first
    mate, "/2" for the second, "" for unpaired or improper reads.
    """
    if not isPairedRead(read):
        return ""

    if read.is_read1:
        return "/1"

    if read.is_read2:
        return "/2"

    return ""
358
359
def isPairedRead(read):
    """True when the alignment comes from a proper pair and is flagged as
    either the first or the second mate.
    """
    if not read.is_proper_pair:
        return False

    return read.is_read1 or read.is_read2
362
363
def isSpliceEntry(cigarTupleList):
    """True when the CIGAR contains an N (op code 3, skipped region /
    intron) operation, i.e. the alignment is spliced.
    """
    return any(operation == 3 for operation, length in cigarTupleList)
372
373
def getReadSense(reverse):
    """Map a reverse-strand flag to the RDS strand character."""
    return "-" if reverse else "+"
381
382
def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
    """Decode a SAM MD tag into erange's comma-separated Eland-style
    mismatch string.

    mismatchTag: value of the MD:Z tag, e.g. "2A10" or "5^AC3".
    querySequence: the read sequence, used to report the read base at
        each mismatch position ("N" when not supplied).
    sense: "+" or "-"; on "-" both bases are reverse-complemented.
    logErrors: when True, out-of-range positions are logged before
        returning "".

    Each mismatch is reported as <refBase><1-based position><readBase>;
    reference deletions ("^...") are skipped.  Returns "" when a
    position falls past the end of querySequence.
    """
    output = []
    deletionMarker = "^"
    position = 0

    # Raw strings avoid invalid-escape warnings; the second pattern captures
    # either a single mismatched reference base or a ^-prefixed deletion run.
    lengths = re.findall(r"\d+", mismatchTag)
    mismatchSequences = re.findall(r"\d+([ACGTN]|\^[ACGTN]+)", mismatchTag)

    for mismatchEntry in range(len(mismatchSequences)):
        mismatch = mismatchSequences[mismatchEntry]
        position = position + int(lengths[mismatchEntry])
        if mismatch.startswith(deletionMarker):
            # Deleted reference bases do not appear in the read, so the
            # read position does not advance for them.
            continue

        try:
            if querySequence:
                genomicNucleotide = querySequence[position]
            else:
                genomicNucleotide = "N"

            if sense == "-":
                mismatch = getReverseComplement(mismatch)
                genomicNucleotide  = getReverseComplement(genomicNucleotide)

            # erange positions are 1-based (Eland-compatible).
            erange1BasedElandCompatiblePosition = int(position + 1)
            output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
            position += 1
        except IndexError:
            if logErrors:
                errorMessage = "getMismatch IndexError; tag: %s, seq: %s, pos: %d" % (mismatchTag, querySequence, position)
                writeLog("MakeRdsFromBamError.log", "1.0", errorMessage)

            return ""

    return ",".join(output)
418
419
def getSpliceBounds(start, readsize, cigarTupleList):
    """Return (startL, startR, stopL, stopR) genomic bounds for the two
    exonic halves of a spliced read, splitting at the first N (intron)
    operation in the CIGAR.

    NOTE(review): only the first N operation is honored, stopR is
    start + readsize (it does not include the intron gap), and a CIGAR
    without any N falls through and implicitly returns None -- callers
    are expected to pre-filter with isSpliceEntry.  Confirm these are
    intentional before changing.
    """
    stopR = int(start + readsize)
    offset = 0

    for operation, length in cigarTupleList:
        if operation != 3:
            offset += length
            continue

        stopL = int(start + offset)
        startR = int(stopL + length)
        return start, startR, stopL, stopR
432
433
if __name__ == "__main__":
    # Script entry point: forward the full command line to main().
    main(sys.argv)