snapshot of 4.0a development. initial git repo commit
[erange.git] / MakeRdsFromBam.py
1 """
2 MakeRdsFromBam
3
4 Created on Jun 3, 2010
5
6 @author: sau
7 """
8
# Optional psyco JIT acceleration (Python 2 only); silently fall through
# when psyco is not installed.
try:
    import psyco
    psyco.full()
except:
    pass
14
15 import sys, string, optparse, re
16 import pysam
17 from commoncode import readDataset, writeLog
18
19 verstring = "%prog: version 1.0"
20
21
22 def main(argv=None):
23     if not argv:
24         argv = sys.argv
25     
26     print verstring
27
28     usage = "usage:  %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
29             \ninput reads must be sorted to properly record multireads"
30
31     parser = optparse.OptionParser(usage=usage)
32     parser.add_option("--append", action="store_false", dest="init",
33                       help="append to existing rds file [default: create new]")
34     parser.add_option("--RNA", action="store_true", dest="rnaDataType",
35                       help="set data type to RNA [default: DNA]")
36     parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
37                       help="input file is in sam format")
38     parser.add_option("--index", action="store_true", dest="doIndex",
39                       help="index the output rds file")
40     parser.add_option("--cache", type="int", dest="cachePages",
41                       help="number of cache pages to use [default: 100000")
42     parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
43                       help="multi counts over this value are discarded [default: 10]")
44     parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
45                       help="use the raw read names")
46     parser.set_defaults(init=True, doIndex=False, useSamFile=False, cachePages=100000,
47                         maxMultiReadCount=10, rnaDataType=False, trimReadID=True)
48
49     (options, args) = parser.parse_args(argv[1:])
50
51     try:
52         label = args[0]
53     except IndexError:
54         print "no label specified - see --help for usage"
55         sys.exit(1)
56
57     try:
58         samFileName = args[1]
59     except IndexError:
60         print "no samfile specified - see --help for usage"
61         sys.exit(1)
62
63     try:
64         outDbName = args[2]
65     except IndexError:
66         print "no outrdsfile specified - see --help for usage"
67         sys.exit(1)
68
69     makeRdsFromBam(label, samFileName, outDbName, options.init, options.doIndex, options.useSamFile,
70                    options.cachePages, options.maxMultiReadCount, options.rnaDataType, options.trimReadID)
71
72
73 def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
74                    cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True):
75
76     if useSamFile:
77         fileMode = "r"
78     else:
79         fileMode = "rb"
80
81     try:
82         samfile = pysam.Samfile(samFileName, fileMode)
83     except ValueError:
84         print "samfile index not found"
85         sys.exit(1)
86
87     if rnaDataType:
88         dataType = "RNA"
89     else:
90         dataType = "DNA"
91
92     writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))
93
94     rds = readDataset(outDbName, init, dataType, verbose=True)
95     if not init and doIndex:
96         try:
97             if rds.hasIndex():
98                 rds.dropIndex()
99         except:
100             pass
101
102     if "sam_mapped" not in rds.getMetadata():
103         rds.insertMetadata([("sam_mapped", "True")])
104
105     defaultCacheSize = rds.getDefaultCacheSize()
106
107     if cachePages > defaultCacheSize:
108         if init:
109             rds.setDBcache(cachePages, default=True)
110         else:
111             rds.setDBcache(cachePages)
112
113     propertyList = []
114     for arg in sys.argv:
115         if "::" in arg:
116             (pname, pvalue) = arg.strip().split("::")
117             propertyList.append((pname, pvalue))
118
119     if len(propertyList) > 0:
120         rds.insertMetadata(propertyList)
121
122     countReads = {"unmapped": 0,
123                   "total": 0,
124                   "unique": 0,
125                   "multi": 0,
126                   "multiDiscard": 0,
127                   "splice": 0
128     }
129
130     readsize = 0
131     insertSize = 100000
132
133     uniqueInsertList = []
134     multiInsertList = []
135     spliceInsertList = []
136
137     processedEntryDict = {}
138     uniqueReadDict = {}
139     multiReadDict = {}
140     spliceReadDict = {}
141
142     samFileIterator = samfile.fetch(until_eof=True)
143
144     for read in samFileIterator:
145         if read.is_unmapped:
146             countReads["unmapped"] += 1
147             continue
148
149         if readsize == 0:
150             take = (0, 2, 3) # CIGAR operation (M/match, D/del, N/ref_skip)
151             readsize = sum([length for op,length in read.cigar if op in take])
152             if init:
153                 rds.insertMetadata([("readsize", readsize)])
154
155         #Build the read dictionaries
156         try:
157             readSequence = read.seq
158         except KeyError:
159             readSequence = ""
160
161         pairReadSuffix = getPairedReadNumberSuffix(read)
162         readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
163         if trimReadID:
164             rdsEntryName = "%s:%s:%d%s" % (label, read.qname, countReads["total"], pairReadSuffix)
165         else:
166             rdsEntryName = read.qname
167
168         if processedEntryDict.has_key(readName):
169             if isSpliceEntry(read.cigar):
170                 if spliceReadDict.has_key(readName):
171                     del spliceReadDict[readName]
172             else:
173                 if uniqueReadDict.has_key(readName):
174                     del uniqueReadDict[readName]
175
176                 if multiReadDict.has_key(readName):
177                     (read, priorCount, rdsEntryName) = multiReadDict[readName]
178                     count = priorCount + 1
179                     multiReadDict[readName] = (read, count, rdsEntryName)
180                 else:
181                     multiReadDict[readName] = (read, 1, rdsEntryName)
182         else:
183             processedEntryDict[readName] = ""
184             if isSpliceEntry(read.cigar):
185                 spliceReadDict[readName] = (read,rdsEntryName)
186             else:
187                 uniqueReadDict[readName] = (read, rdsEntryName)
188
189         if countReads["total"] % insertSize == 0:
190             for entry in uniqueReadDict.keys():
191                 (readData, rdsEntryName) = uniqueReadDict[entry]
192                 chrom = samfile.getrname(readData.rname)
193                 uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
194                 countReads["unique"] += 1
195
196             for entry in spliceReadDict.keys():
197                 (readData, rdsEntryName) = spliceReadDict[entry]
198                 chrom = samfile.getrname(readData.rname)
199                 spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
200                 countReads["splice"] += 1
201
202             for entry in multiReadDict.keys():
203                 (readData, count, rdsEntryName) = multiReadDict[entry]
204                 chrom = samfile.getrname(readData.rname)
205                 if count > maxMultiReadCount:
206                     countReads["multiDiscard"] += 1
207                 else:
208                     multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count)) 
209                     countReads["multi"] += 1
210
211             rds.insertUniqs(uniqueInsertList)
212             rds.insertMulti(multiInsertList)
213             uniqueInsertList = []
214             uniqueReadDict = {}
215             multiInsertList = []
216             multiReadDict = {}
217             if dataType == "RNA":
218                 rds.insertSplices(spliceInsertList)
219                 spliceInsertList = []
220                 spliceReadDict = {}
221
222             print ".",
223             sys.stdout.flush()
224             processedEntryDict = {}
225
226         countReads["total"] += 1
227
228     if len(uniqueReadDict.keys()) > 0:
229         for entry in uniqueReadDict.keys():
230             (readData, rdsEntryName) = uniqueReadDict[entry]
231             chrom = samfile.getrname(readData.rname)
232             uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
233             countReads["unique"] += 1
234
235         rds.insertUniqs(uniqueInsertList)
236
237     if len(multiReadDict.keys()) > 0:
238         for entry in multiReadDict.keys():
239             (readData, count, rdsEntryName) = multiReadDict[entry]
240             chrom = samfile.getrname(readData.rname)
241             if count > maxMultiReadCount:
242                 countReads["multiDiscard"] += 1
243             else:
244                 multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
245                 countReads["multi"] += 1
246
247         countReads["multi"] += len(multiInsertList)
248
249     if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
250         for entry in spliceReadDict.keys():
251             (readData, rdsEntryName) = spliceReadDict[entry]
252             chrom = samfile.getrname(readData.rname)
253             spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
254             countReads["splice"] += 1
255
256         rds.insertSplices(spliceInsertList)
257
258     countString = "\n%d unmapped reads discarded" % countReads["unmapped"]
259     countString += "\t%d unique reads" % countReads["unique"]
260     countString += "\t%d multi reads" % countReads["multi"]
261     countString += "\t%d multi reads count > %d discarded" % (countReads["multiDiscard"], maxMultiReadCount)
262     if dataType == "RNA":
263         countString += "\t%d spliced reads" % countReads["splice"]
264
265     print countString.replace("\t", "\n")
266
267     writeLog("%s.log" % outDbName, verstring, countString)
268
269     if doIndex:
270         print "building index...."
271         if cachePages > defaultCacheSize:
272             rds.setDBcache(cachePages)
273             rds.buildIndex(cachePages)
274         else:
275             rds.buildIndex(defaultCacheSize)
276
277
def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
    """Build one RDS read tuple from an aligned read.

    Returns (readName, chrom, start, stop, sense, 1/weight, flag, mismatches);
    the mismatch string is empty when the read carries no MD tag.
    """
    startPosition = int(alignedRead.pos)
    stopPosition = int(startPosition + readSize)
    strand = getReadSense(alignedRead.is_reverse)

    try:
        mdTag = alignedRead.opt("MD")
        mismatchString = getMismatches(mdTag, alignedRead.seq, strand)
    except KeyError:
        # no MD tag present (or an unexpected base during complementing)
        mismatchString = ""

    return (readName, chrom, startPosition, stopPosition, strand, 1.0/weight, '', mismatchString)
289
290
def getRDSSpliceEntry(alignedRead, readName, chrom, readSize):
    """Build one RDS splice tuple from a spliced aligned read.

    Returns (readName, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches)
    using the bounds of the first ref-skip in the CIGAR.
    """
    plainEntry = getRDSEntry(alignedRead, readName, chrom, readSize)
    (entryName, entryChrom, start, stop, sense, weight, flag, mismatches) = plainEntry
    (startL, startR, stopL, stopR) = getSpliceBounds(start, readSize, alignedRead.cigar)

    return (entryName, entryChrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches)
296
297
def getPairedReadNumberSuffix(read):
    """Return "/1" or "/2" for a properly paired read, otherwise ""."""
    if not isPairedRead(read):
        return ""

    if read.is_read1:
        return "/1"

    if read.is_read2:
        return "/2"

    return ""
309
310
def isPairedRead(read):
    """A read counts as paired only when it is properly paired AND flagged
    as either the first or the second mate.
    """
    if not read.is_proper_pair:
        return False

    return read.is_read1 or read.is_read2
313
314
def isSpliceEntry(cigarTupleList):
    """True when the CIGAR contains an N operation (code 3, ref-skip),
    which marks a spliced alignment.
    """
    for (operation, operationLength) in cigarTupleList:
        if operation == 3:
            return True

    return False
323
324
def getReadSense(reverse):
    """Map pysam's is_reverse flag to the RDS strand character."""
    return "-" if reverse else "+"
332
333
def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
    """Convert a SAM MD tag into an eland-style mismatch string.

    Each mismatch is emitted as <refBase><1-basedPosition><readBase>, joined
    by commas.  Deletion entries ("^...") in the MD tag are skipped.  For a
    "-" sense read both bases are complemented.  When the read sequence is
    shorter than the MD tag implies, "" is returned (and the error is logged
    when logErrors is True).

    mismatchTag   -- value of the MD tag, e.g. "10A25"
    querySequence -- read sequence; "N" is reported when empty
    sense         -- "+" or "-"
    logErrors     -- write IndexErrors to MakeRdsFromBamError.log
    """
    output = []
    deletionMarker = "^"
    position = 0

    # FIX: use str methods instead of the Python-2-only string-module
    # functions (string.find / string.join), and raw regex strings
    lengths = re.findall(r"\d+", mismatchTag)
    mismatchSequences = re.findall(r"\d+([ACGTN]|\^[ACGTN]+)", mismatchTag)

    for mismatchEntry in range(len(mismatchSequences)):
        mismatch = mismatchSequences[mismatchEntry]
        position = position + int(lengths[mismatchEntry])
        if mismatch.startswith(deletionMarker):
            # deleted reference bases have no corresponding read base
            continue

        try:
            if querySequence:
                genomicNucleotide = querySequence[position]
            else:
                genomicNucleotide = "N"

            if sense == "-":
                mismatch = getComplementNucleotide(mismatch)
                genomicNucleotide = getComplementNucleotide(genomicNucleotide)

            elandCompatiblePosition = int(position + 1)
            output.append("%s%d%s" % (mismatch, elandCompatiblePosition, genomicNucleotide))
            position += 1
        except IndexError:
            if logErrors:
                errorMessage = "getMismatch IndexError; tag: %s, seq: %s, pos: %d" % (mismatchTag, querySequence, position)
                writeLog("MakeRdsFromBamError.log", "1.0", errorMessage)

            return ""

    return ",".join(output)
369
370
def getComplementNucleotide(nucleotide):
    """Return the Watson-Crick complement of one nucleotide; N maps to N.

    Raises KeyError for any character outside ACGTN.
    """
    complementTable = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}

    return complementTable[nucleotide]
380
381
def getSpliceBounds(start, readsize, cigarTupleList):
    """Return (startL, startR, stopL, stopR) splice bounds based on the
    first N (ref-skip, code 3) operation in the CIGAR.

    Implicitly returns None when the CIGAR contains no ref-skip.
    """
    stopR = int(start + readsize)
    offsetIntoRead = 0

    for (operation, operationLength) in cigarTupleList:
        if operation != 3:
            offsetIntoRead += operationLength
            continue

        stopL = int(start + offsetIntoRead)
        startR = int(stopL + operationLength)

        return start, startR, stopL, stopR
394
395
# script entry point
if __name__ == "__main__":
    main(sys.argv)