20 from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption, getReverseComplement
# Version banner; printed at startup and recorded in every log line written
# by writeLog below.
verstring = "makeRdsFromBam: version 1.1"

# --- main() fragment ---------------------------------------------------------
# NOTE(review): the enclosing "def main(argv=None):" line and several
# argument-extraction guards are missing from this listing -- recover the
# full text from version control.  The statements below parse the command
# line and dispatch to makeRdsFromBam().

# The input must be sorted so that all alignments of a multiread are
# adjacent; the usage string surfaces that requirement to the user.
usage = "usage: %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
\ninput reads must be sorted to properly record multireads"

parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])

# Three required positional arguments: label, samfile, outrdsfile.  Each is
# checked in turn; on absence a usage hint is printed (the surrounding
# guard/except lines are omitted from this excerpt).
print "no label specified - see --help for usage"

print "no samfile specified - see --help for usage"

print "no outrdsfile specified - see --help for usage"

# --RNA selects the "RNA" data type; the dataType assignments themselves
# ("RNA" / "DNA") are omitted from this excerpt.
if options.rnaDataType:

# Hand off to the worker with all option values (defaults come from the
# erange config file via getParser()).
makeRdsFromBam(label, samFileName, outDbName, options.init, options.doIndex, options.useSamFile,
               options.cachePages, options.maxMultiReadCount, dataType, options.trimReadID)
def getParser(usage):
    """Build the optparse parser for makeRdsFromBam.

    Defaults are not hard-coded at the option definitions: they are read
    from the [makeRdsFromBam] section of the erange configuration file
    (via commoncode.getConfigParser and friends) and installed with
    parser.set_defaults(), so a site-wide config can override them.

    Returns the configured optparse.OptionParser.
    """
    parser = optparse.OptionParser(usage=usage)
    # --append stores False into "init": append to an existing RDS file
    # instead of creating a fresh one.
    parser.add_option("--append", action="store_false", dest="init",
                      help="append to existing rds file [default: create new]")
    parser.add_option("--RNA", action="store_true", dest="rnaDataType",
                      help="set data type to RNA [default: DNA]")
    parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
                      help="input file is in sam format")
    parser.add_option("--index", action="store_true", dest="doIndex",
                      help="index the output rds file")
    # FIX: help text was missing the closing bracket after 100000.
    parser.add_option("--cache", type="int", dest="cachePages",
                      help="number of cache pages to use [default: 100000]")
    parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
                      help="multi counts over this value are discarded [default: 10]")
    parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
                      help="use the raw read names")

    # Site-configurable defaults, falling back to the values shown here.
    configParser = getConfigParser()
    section = "makeRdsFromBam"
    init = getConfigBoolOption(configParser, section, "init", True)
    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
    useSamFile = getConfigBoolOption(configParser, section, "useSamFile", False)
    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
    maxMultiReadCount = getConfigIntOption(configParser, section, "maxMultiReadCount", 10)
    rnaDataType = getConfigBoolOption(configParser, section, "rnaDataType", False)
    trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)

    parser.set_defaults(init=init, doIndex=doIndex, useSamFile=useSamFile, cachePages=cachePages,
                        maxMultiReadCount=maxMultiReadCount, rnaDataType=rnaDataType, trimReadID=trimReadID)

    return parser
def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
                   cachePages=100000, maxMultiReadCount=10, dataType="DNA", trimReadID=True):
    """Load a sorted SAM/BAM file into an erange RDS (sqlite) read dataset.

    label              -- prefix used when building per-read RDS entry names
    samFileName        -- input alignments (sorted so multiread copies are adjacent)
    outDbName          -- output RDS database; "<outDbName>.log" receives the run log
    init               -- create a fresh dataset (False appends to an existing one)
    doIndex            -- build the RDS index after loading
    useSamFile         -- input is text SAM rather than BAM
    cachePages         -- sqlite cache size used while loading/indexing
    maxMultiReadCount  -- reads aligning more than this many times are discarded
    dataType           -- "DNA" or "RNA"; only "RNA" stores splice entries
    trimReadID         -- rewrite read IDs as label:qname:index[/mate]

    NOTE(review): this listing is missing many scaffolding lines (try/except
    bodies, else branches, loop headers, dict initializations).  Each gap is
    flagged below -- recover the full text from version control before editing.
    """
    # [gap] fileMode selection omitted -- presumably "r" for SAM when
    # useSamFile else "rb" for BAM; it is used by getMultiReadIDCounts below.

    # Record this invocation in the dataset's log.
    writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))
    rds = ReadDataset.ReadDataset(outDbName, init, dataType, verbose=True)
    if not init and doIndex:
        # [gap] body omitted -- presumably drops an existing index before appending.

    # Mark the dataset as SAM-derived exactly once.
    if "sam_mapped" not in rds.getMetadata():
        rds.insertMetadata([("sam_mapped", "True")])

    defaultCacheSize = rds.getDefaultCacheSize()

    if cachePages > defaultCacheSize:
        # [gap] presumably "if init:" -- make the larger cache the default:
        rds.setDBcache(cachePages, default=True)
        # [gap] presumably "else:" -- temporary cache bump only:
        rds.setDBcache(cachePages)

    # Optional propertyName::propertyValue metadata from the command line.
    # [gap] propertyList initialization and the loop over sys.argv selecting
    # "::"-containing args are omitted.
        (pname, pvalue) = arg.strip().split("::")
        propertyList.append((pname, pvalue))

    if len(propertyList) > 0:
        rds.insertMetadata(propertyList)

    # Tallies reported at the end of the run.
    totalReadCounts = {"unmapped": 0,
    # [gap] remaining counter keys omitted -- at least "total", "unique",
    # "multi", "multiDiscard", "splice", "multisplice" are used below.

    # Batched insert buffers, flushed every INSERT_SIZE reads.
    uniqueInsertList = []
    spliceInsertList = []
    multispliceInsertList = []

    # [gap] uniqueReadDict / multiReadDict / spliceReadDict initializations omitted.
    multispliceReadDict = {}

    # First pass: occurrence count per suffixed read name (>1 means multiread).
    multireadCounts = getMultiReadIDCounts(samFileName, fileMode)

    for readID in multireadCounts:
        if multireadCounts[readID] > maxMultiReadCount:
            totalReadCounts["multiDiscard"] += 1

    # [gap] enclosing try: omitted.
    samfile = pysam.Samfile(samFileName, fileMode)
    # [gap] except branch for an unreadable/unindexed file:
    print "samfile index not found"

    # Second pass: classify every mapped read and buffer its RDS entry.
    samFileIterator = samfile.fetch(until_eof=True)
    for read in samFileIterator:
        # [gap] "if read.is_unmapped:" guard omitted -- unmapped reads are
        # only counted, then skipped.
        totalReadCounts["unmapped"] += 1

        # Read size derived from CIGAR M/I lengths (clips/introns excluded).
        # [gap] the run-once guard around this block is omitted.
        take = (0, 1) # CIGAR operation (M/match, I/insertion)
        readsize = sum([length for op,length in read.cigar if op in take])
        rds.insertMetadata([("readsize", readsize)])

        # Mate suffix keeps /1 and /2 alignments of a pair distinct.
        pairReadSuffix = getPairedReadNumberSuffix(read)
        readName = "%s%s" % (read.qname, pairReadSuffix)
        # [gap] "if trimReadID:" guard omitted:
        rdsEntryName = "%s:%s:%d%s" % (label, read.qname, totalReadCounts["total"], pairReadSuffix)
        # [gap] else branch -- keep the raw name:
        rdsEntryName = read.qname

        # [gap] try/except KeyError omitted -- count presumably defaults to 1
        # for reads absent from multireadCounts (i.e. unique reads).
        count = multireadCounts[readName]

        # Route into unique/multi x plain/splice buckets; counts above
        # maxMultiReadCount fall through and are dropped.
        # [gap] "if count == 1:" omitted:
        if isSpliceEntry(read.cigar):
            spliceReadDict[readName] = (read,rdsEntryName)
        # [gap] else:
            uniqueReadDict[readName] = (read, rdsEntryName)
        elif count <= maxMultiReadCount:
            if isSpliceEntry(read.cigar):
                multispliceReadDict[readName] = (read, count, rdsEntryName)
            # [gap] else:
                multiReadDict[readName] = (read, count, rdsEntryName)

        # Periodic flush of buffered entries into the database.
        if totalReadCounts["total"] % INSERT_SIZE == 0:
            for entry in uniqueReadDict.keys():
                (readData, rdsEntryName) = uniqueReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
                totalReadCounts["unique"] += 1

            for entry in multiReadDict.keys():
                (readData, count, rdsEntryName) = multiReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))

            if dataType == "RNA":
                for entry in spliceReadDict.keys():
                    (readData, rdsEntryName) = spliceReadDict[entry]
                    chrom = samfile.getrname(readData.rname)
                    spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
                    totalReadCounts["splice"] += 1

                for entry in multispliceReadDict.keys():
                    (readData, count, rdsEntryName) = multispliceReadDict[entry]
                    chrom = samfile.getrname(readData.rname)
                    multispliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize, weight=count))
                    totalReadCounts["multisplice"] += 1

            rds.insertUniqs(uniqueInsertList)
            rds.insertMulti(multiInsertList)
            uniqueInsertList = []
            # [gap] multiInsertList reset and read-dict resets omitted.

            if dataType == "RNA":
                rds.insertSplices(spliceInsertList)
                spliceInsertList = []
                # [gap] spliceReadDict reset omitted.
                rds.insertMultisplices(multispliceInsertList)
                multispliceInsertList = []
                multispliceReadDict = {}

        totalReadCounts["total"] += 1

    # Final flush of whatever remains buffered after the read loop.
    if len(uniqueReadDict.keys()) > 0:
        for entry in uniqueReadDict.keys():
            (readData, rdsEntryName) = uniqueReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
            totalReadCounts["unique"] += 1

        rds.insertUniqs(uniqueInsertList)

    if len(multiReadDict.keys()) > 0:
        for entry in multiReadDict.keys():
            (readData, count, rdsEntryName) = multiReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))

        rds.insertMulti(multiInsertList)

    if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
        for entry in spliceReadDict.keys():
            (readData, rdsEntryName) = spliceReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
            totalReadCounts["splice"] += 1

        rds.insertSplices(spliceInsertList)

    if len(multispliceReadDict.keys()) > 0 and dataType == "RNA":
        for entry in multispliceReadDict.keys():
            (readData, count, rdsEntryName) = multispliceReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            multispliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize, weight=count))
            totalReadCounts["multisplice"] += 1

        rds.insertMultisplices(multispliceInsertList)

    # Distinct multiread IDs kept = all multireads minus the discarded ones
    # and the splice-multis (tracked separately).
    totalReadCounts["multi"] = len(multireadCounts) - totalReadCounts["multiDiscard"] - totalReadCounts["multisplice"]
    countStringList = ["\n%d unmapped reads discarded" % totalReadCounts["unmapped"]]
    countStringList.append("%d unique reads" % totalReadCounts["unique"])
    countStringList.append("%d multi reads" % totalReadCounts["multi"])
    countStringList.append("%d multi reads count > %d discarded" % (totalReadCounts["multiDiscard"], maxMultiReadCount))
    if dataType == "RNA":
        countStringList.append("%d spliced reads" % totalReadCounts["splice"])
        countStringList.append("%d spliced multireads" % totalReadCounts["multisplice"])

    # Report to stdout (newline-separated) and to the log (tab-separated).
    print string.join(countStringList, "\n")
    outputCountText = string.join(countStringList, "\t")
    writeLog("%s.log" % outDbName, verstring, outputCountText)

    # [gap] "if doIndex:" guard omitted around the indexing step:
    print "building index...."
    if cachePages > defaultCacheSize:
        rds.setDBcache(cachePages)
        rds.buildIndex(cachePages)
    # [gap] else:
        rds.buildIndex(defaultCacheSize)
def getMultiReadIDCounts(samFileName, fileMode):
    """First pass over the alignment file: count occurrences of each
    pair-suffixed read name and keep only names seen more than once,
    i.e. the multireads.  fileMode is the pysam open mode ("r"/"rb").

    NOTE(review): the enclosing try/except, the dict initialization and
    the return statement are missing from this listing -- recover from VCS.
    """
    # [gap] try: omitted.
    samfile = pysam.Samfile(samFileName, fileMode)
    # [gap] except branch:
    print "samfile index not found"

    # [gap] readIDCounts = {} initialization omitted.
    for read in samfile.fetch(until_eof=True):
        # Suffix distinguishes the two mates of a proper pair.
        pairReadSuffix = getPairedReadNumberSuffix(read)
        readName = "%s%s" % (read.qname, pairReadSuffix)
        # EAFP counting ([gap] the try/except KeyError scaffolding omitted):
        readIDCounts[readName] += 1
        readIDCounts[readName] = 1

    # Drop names seen exactly once; only multireads are of interest.
    # Deleting while iterating keys() is safe only because Python 2's
    # keys() returns a list snapshot.
    for readID in readIDCounts.keys():
        if readIDCounts[readID] == 1:
            del readIDCounts[readID]
    # [gap] "return readIDCounts" omitted.
def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
    """Convert a pysam aligned read into an erange RDS tuple:
    (name, chrom, start, stop, sense, weight, flag, mismatches).

    Multireads carry fractional weight 1.0/count so that each read's
    alignments sum to one.
    """
    start = int(alignedRead.pos)      # pysam pos is 0-based leftmost coordinate
    stop = int(start + readSize)
    sense = getReadSense(alignedRead.is_reverse)
    # The MD tag encodes mismatches vs the reference; opt() raises KeyError
    # when the tag is absent.  [gap] the surrounding try/except (presumably
    # defaulting mismatches to "") is missing from this listing.
    mismatchTag = alignedRead.opt("MD")
    mismatches = getMismatches(mismatchTag, alignedRead.seq, sense)

    return (readName, chrom, start, stop, sense, 1.0/weight, '', mismatches)
def getRDSSpliceEntry(alignedRead, readName, chrom, readSize, weight=1):
    """Build the RDS tuple for a spliced read:
    (name, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatches),
    where L/R are the two exonic pieces flanking the skipped region.
    """
    # Reuse the plain-entry conversion for coordinates, sense and mismatches.
    entry = getRDSEntry(alignedRead, readName, chrom, readSize, weight)
    entryName, entryChrom, start, stop, sense, entryWeight, flag, mismatches = entry
    # Split the read around the CIGAR N operation into left/right bounds.
    startL, startR, stopL, stopR = getSpliceBounds(start, readSize, alignedRead.cigar)

    return (entryName, entryChrom, startL, stopL, startR, stopR, sense, entryWeight, "", mismatches)
def getPairedReadNumberSuffix(read):
    """Return a mate-number suffix for a paired read, or an empty string
    for unpaired reads.

    NOTE(review): only the unpaired guard is visible in this listing; the
    branches returning the actual suffixes (presumably "/1" and "/2" from
    is_read1/is_read2) are missing -- recover from VCS.
    """
    if not isPairedRead(read):
def isPairedRead(read):
    """True when the alignment is a properly paired mate (first or second read)."""
    if not read.is_proper_pair:
        return False

    return read.is_read1 or read.is_read2
def isSpliceEntry(cigarTupleList):
    """Report whether the CIGAR contains a reference-skip operation
    (splice junction).

    NOTE(review): only the loop header is visible in this listing; the
    operation test (presumably op code 3 == N) and the return are
    missing -- recover from VCS.
    """
    for operation,length in cigarTupleList:
def getReadSense(reverse):
    """Map pysam's is_reverse flag to an erange strand string.

    NOTE(review): the body is missing from this listing -- presumably
    returns "-" when reverse else "+"; confirm against VCS.
    """
def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
    """Translate a SAM MD tag into erange's comma-separated mismatch string
    (read base, 1-based position, genomic base per entry).

    NOTE(review): several scaffolding lines are missing from this listing
    (output/position/deletionMarker initialization, the deletion-skip
    branch body, the sense guard, and the try/except around sequence
    indexing) -- recover from VCS.
    """
    # The MD tag alternates match-run lengths with mismatched bases or
    # ^-prefixed deleted reference runs; parse both streams in parallel.
    lengths = re.findall("\d+", mismatchTag)
    mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag)

    for mismatchEntry in range(len(mismatchSequences)):
        mismatch = mismatchSequences[mismatchEntry]
        # Advance past the preceding run of matching bases.
        position = position + int(lengths[mismatchEntry])
        # A leading "^" marks a deletion from the reference
        # ([gap] branch body -- presumably skips the entry -- omitted).
        if string.find(mismatch, deletionMarker) == 0:

        # Base present in the read at this offset; falls back to "N"
        # ([gap] presumably when indexing past the stored sequence).
        genomicNucleotide = querySequence[position]
        genomicNucleotide = "N"

        # [gap] presumably guarded by: if sense == "-":  (minus-strand reads
        # are reported in reverse-complement space)
        mismatch = getReverseComplement(mismatch)
        genomicNucleotide = getReverseComplement(genomicNucleotide)

        # erange/Eland mismatch positions are 1-based.
        erange1BasedElandCompatiblePosition = int(position + 1)
        output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))

    # [gap] except IndexError handler around the loop:
    errorMessage = "getMismatch IndexError; tag: %s, seq: %s, pos: %d" % (mismatchTag, querySequence, position)
    writeLog("MakeRdsFromBamError.log", "1.0", errorMessage)

    return string.join(output, ",")
def getSpliceBounds(start, readsize, cigarTupleList):
    """For a spliced alignment, compute the two exonic pieces around the
    skipped region: returns (start, startR, stopL, stopR) with the left
    piece spanning [start, stopL) and the right piece [startR, stopR).

    NOTE(review): the offset initialization, the reference-skip test
    (presumably "if operation == 3:") and the non-skip branch (presumably
    "offset += length") are missing from this listing -- recover from VCS.
    """
    for operation,length in cigarTupleList:
        # [gap] skip-operation test omitted:
        stopL = int(start + offset)            # left exon ends where the skip begins
        startR = int(stopL + length)           # right exon starts after the skipped run
        stopR = int(startR + readsize - offset)

        return start, startR, stopL, stopR
434 if __name__ == "__main__":