20 from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption, getReverseComplement
# Version banner written to the RDS log file on every run.
verstring = "makeRdsFromBam: version 1.0"
# Interior of the CLI entry point (the enclosing def, presumably main(argv),
# is outside this view).  Builds the option parser, pulls the three
# positional arguments, and hands everything to makeRdsFromBam().
usage = "usage: %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
\ninput reads must be sorted to properly record multireads"

parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])

# NOTE(review): each print below is the error branch of an elided
# positional-argument check (label / samfile / outrdsfile); the surrounding
# guard and exit lines are not visible in this view.
print "no label specified - see --help for usage"

print "no samfile specified - see --help for usage"

print "no outrdsfile specified - see --help for usage"

# Forward all parsed options positionally to the loader.
makeRdsFromBam(label, samFileName, outDbName, options.init, options.doIndex, options.useSamFile,
               options.cachePages, options.maxMultiReadCount, options.rnaDataType, options.trimReadID)
# Interior of the option-parser factory (the enclosing def, presumably
# getParser(usage), and its `return parser` are outside this view).
# Command-line flags mirror the keyword arguments of makeRdsFromBam().
# Fix: the --cache help text was missing its closing bracket
# ("[default: 100000" -> "[default: 100000]").
parser = optparse.OptionParser(usage=usage)
parser.add_option("--append", action="store_false", dest="init",
                  help="append to existing rds file [default: create new]")
parser.add_option("--RNA", action="store_true", dest="rnaDataType",
                  help="set data type to RNA [default: DNA]")
parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
                  help="input file is in sam format")
parser.add_option("--index", action="store_true", dest="doIndex",
                  help="index the output rds file")
parser.add_option("--cache", type="int", dest="cachePages",
                  help="number of cache pages to use [default: 100000]")
parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
                  help="multi counts over this value are discarded [default: 10]")
parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
                  help="use the raw read names")

# Option defaults come from the [makeRdsFromBam] section of the erange
# config file, falling back to the hard-coded values below.
configParser = getConfigParser()
section = "makeRdsFromBam"
init = getConfigBoolOption(configParser, section, "init", True)
doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
useSamFile = getConfigBoolOption(configParser, section, "useSamFile", False)
cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
maxMultiReadCount = getConfigIntOption(configParser, section, "maxMultiReadCount", 10)
rnaDataType = getConfigBoolOption(configParser, section, "rnaDataType", False)
trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)

parser.set_defaults(init=init, doIndex=doIndex, useSamFile=useSamFile, cachePages=cachePages,
                    maxMultiReadCount=maxMultiReadCount, rnaDataType=rnaDataType, trimReadID=trimReadID)
def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
                   cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True):
    """Load a sorted BAM/SAM file into an erange RDS read dataset.

    label       -- experiment label prepended to generated read IDs
    samFileName -- input BAM (or SAM when useSamFile is True); reads must be
                   sorted so all alignments of a multiread are adjacent
    outDbName   -- output RDS database; "<outDbName>.log" is also written
    init        -- create a new RDS file (False appends to an existing one)
    doIndex     -- build the RDS index after loading
    useSamFile  -- input is text SAM instead of binary BAM
    cachePages  -- sqlite cache size to use while bulk inserting
    maxMultiReadCount -- discard multireads seen more than this many times
    rnaDataType -- store splice entries (dataType "RNA") instead of "DNA"
    trimReadID  -- replace raw read names with "<label>:<name>:<count><suffix>"

    NOTE(review): this view of the function is elided -- several guard /
    else / try lines are missing; comments below mark the gaps.
    """
    # (elided) fileMode selection from useSamFile; the open is wrapped in a
    # try/except in the original, with the print below as the error branch.
    samfile = pysam.Samfile(samFileName, fileMode)
    print "samfile index not found"

    # (elided) dataType is set to "RNA"/"DNA" from rnaDataType before this.
    # Record the exact command line in the per-database log.
    writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))

    rds = ReadDataset.ReadDataset(outDbName, init, dataType, verbose=True)
    if not init and doIndex:
        # (elided) body: drop the existing RDS index before appending

    # Mark the dataset as SAM/BAM-derived, once only.
    if "sam_mapped" not in rds.getMetadata():
        rds.insertMetadata([("sam_mapped", "True")])

    defaultCacheSize = rds.getDefaultCacheSize()

    # Enlarge the sqlite page cache for the bulk load when asked for more
    # than the default.
    if cachePages > defaultCacheSize:
        # (elided) if init:
        rds.setDBcache(cachePages, default=True)
        # (elided) else:
        rds.setDBcache(cachePages)

    # (elided) propertyList = [] and the loop header over the trailing
    # "propertyName::propertyValue" CLI arguments:
        (pname, pvalue) = arg.strip().split("::")
        propertyList.append((pname, pvalue))

    if len(propertyList) > 0:
        rds.insertMetadata(propertyList)

    # Per-category read counters; the remaining keys ("total", "unique",
    # "multi", "multiDiscard", "splice") are initialized on elided lines.
    totalReadCounts = {"unmapped": 0,

    # Batched insert buffers; the bookkeeping dictionaries (uniqueReadDict,
    # multiReadDict, spliceReadDict, multiInsertList) are initialized on
    # elided lines next to these.
    uniqueInsertList = []
    spliceInsertList = []

    processedEntryDict = {}

    # until_eof=True iterates every record in file order, which is what lets
    # adjacent alignments of one (sorted) multiread be grouped below.
    samFileIterator = samfile.fetch(until_eof=True)

    for read in samFileIterator:
        # (elided) unmapped-read guard around this counter:
        totalReadCounts["unmapped"] += 1

        # (elided) guard: readsize is computed once, from the first mapped read.
        take = (0, 1) # CIGAR operation (M/match, I/insertion)
        readsize = sum([length for op,length in read.cigar if op in take])
        # (elided) guard: record the read length in metadata only when init
        rds.insertMetadata([("readsize", readsize)])

        #Build the read dictionaries
        readSequence = read.seq

        # Paired-end mates get a distinguishing suffix.
        pairReadSuffix = getPairedReadNumberSuffix(read)
        # Key on name+sequence+mate so repeated alignments of the same read
        # (multireads) collide in the dictionaries below.
        readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
        # (elided) if trimReadID:
        rdsEntryName = "%s:%s:%d%s" % (label, read.qname, totalReadCounts["total"], pairReadSuffix)
        # (elided) else: keep the raw read name
        rdsEntryName = read.qname

        if processedEntryDict.has_key(readName):
            # Seen before: this is a multiread.  Evict it from the
            # unique/splice buckets and bump its alignment count.
            if isSpliceEntry(read.cigar):
                if spliceReadDict.has_key(readName):
                    del spliceReadDict[readName]
            # (elided) else:
                if uniqueReadDict.has_key(readName):
                    del uniqueReadDict[readName]

            if multiReadDict.has_key(readName):
                (read, priorCount, rdsEntryName) = multiReadDict[readName]
                count = priorCount + 1
                multiReadDict[readName] = (read, count, rdsEntryName)
            # (elided) else: first duplicate -> count starts at 1
                multiReadDict[readName] = (read, 1, rdsEntryName)
        # (elided) else: first sighting of this read
            processedEntryDict[readName] = ""
            if isSpliceEntry(read.cigar):
                spliceReadDict[readName] = (read,rdsEntryName)
            # (elided) else:
                uniqueReadDict[readName] = (read, rdsEntryName)

        # Flush the accumulated buckets to sqlite every INSERT_SIZE records.
        if totalReadCounts["total"] % INSERT_SIZE == 0:
            for entry in uniqueReadDict.keys():
                (readData, rdsEntryName) = uniqueReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
                totalReadCounts["unique"] += 1

            for entry in spliceReadDict.keys():
                (readData, rdsEntryName) = spliceReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
                totalReadCounts["splice"] += 1

            for entry in multiReadDict.keys():
                (readData, count, rdsEntryName) = multiReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                if count > maxMultiReadCount:
                    totalReadCounts["multiDiscard"] += 1
                # (elided) else: kept multiread, weighted 1/count downstream
                    multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
                    totalReadCounts["multi"] += 1

            rds.insertUniqs(uniqueInsertList)
            rds.insertMulti(multiInsertList)
            uniqueInsertList = []
            # (elided) multiInsertList is reset here as well
            if dataType == "RNA":
                rds.insertSplices(spliceInsertList)
                spliceInsertList = []

            # (elided) the per-batch read dictionaries are also reset here
            processedEntryDict = {}

        totalReadCounts["total"] += 1

    # Final flush: whatever remains in the buckets after the loop.
    if len(uniqueReadDict.keys()) > 0:
        for entry in uniqueReadDict.keys():
            (readData, rdsEntryName) = uniqueReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
            totalReadCounts["unique"] += 1

        rds.insertUniqs(uniqueInsertList)

    if len(multiReadDict.keys()) > 0:
        for entry in multiReadDict.keys():
            (readData, count, rdsEntryName) = multiReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            if count > maxMultiReadCount:
                totalReadCounts["multiDiscard"] += 1
            # (elided) else:
                multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
                totalReadCounts["multi"] += 1

        # NOTE(review): "multi" appears to be incremented per read above AND
        # again by len(multiInsertList) here -- looks like double counting;
        # confirm against the elided lines before trusting reported totals.
        totalReadCounts["multi"] += len(multiInsertList)

    if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
        for entry in spliceReadDict.keys():
            (readData, rdsEntryName) = spliceReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
            totalReadCounts["splice"] += 1

        rds.insertSplices(spliceInsertList)

    # Summary: printed to stdout and appended to the per-database log.
    countStringList = ["\n%d unmapped reads discarded" % totalReadCounts["unmapped"]]
    countStringList.append("%d unique reads" % totalReadCounts["unique"])
    countStringList.append("%d multi reads" % totalReadCounts["multi"])
    countStringList.append("%d multi reads count > %d discarded" % (totalReadCounts["multiDiscard"], maxMultiReadCount))
    if dataType == "RNA":
        countStringList.append("%d spliced reads" % totalReadCounts["splice"])

    print string.join(countStringList, "\n")
    outputCountText = string.join(countStringList, "\t")
    writeLog("%s.log" % outDbName, verstring, outputCountText)

    # (elided) guard: only when doIndex was requested
    print "building index...."
    if cachePages > defaultCacheSize:
        rds.setDBcache(cachePages)
        rds.buildIndex(cachePages)
    # (elided) else:
        rds.buildIndex(defaultCacheSize)
def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
    """Convert one pysam aligned read into an RDS unique/multi row tuple.

    Returns (name, chrom, start, stop, sense, 1.0/weight, flag, mismatches);
    weight is the multiread alignment count, so a read's N alignments sum
    to a total weight of 1.
    """
    start = int(alignedRead.pos)
    stop = int(start + readSize)
    sense = getReadSense(alignedRead.is_reverse)
    # The MD tag carries the mismatch string.  (elided) the original wraps
    # this lookup in a try/except for reads without an MD tag -- confirm.
    mismatchTag = alignedRead.opt("MD")
    mismatches = getMismatches(mismatchTag, alignedRead.seq, sense)

    return (readName, chrom, start, stop, sense, 1.0/weight, '', mismatches)
def getRDSSpliceEntry(alignedRead, readName, chrom, readSize):
    """Convert one spliced pysam read into an RDS splice-row tuple.

    Reuses getRDSEntry() for sense and mismatches, then splits the span
    into left/right exonic pieces via getSpliceBounds().  Splice rows
    always carry weight 1.0 and an empty flag.
    """
    baseEntry = getRDSEntry(alignedRead, readName, chrom, readSize)
    sense = baseEntry[4]
    mismatches = baseEntry[7]
    startL, startR, stopL, stopR = getSpliceBounds(baseEntry[2], readSize, alignedRead.cigar)

    return (readName, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches)
def getPairedReadNumberSuffix(read):
    """Return a mate-number suffix for paired reads.

    NOTE(review): the branches producing the actual suffix are elided in
    this view; presumably "/1" or "/2" keyed on is_read1/is_read2 --
    confirm against the missing lines.
    """
    if not isPairedRead(read):
        # (elided) body: unpaired reads get the empty suffix
def isPairedRead(read):
    """True when this alignment is a properly paired mate (read 1 or read 2)."""
    if not read.is_proper_pair:
        return False

    return read.is_read1 or read.is_read2
def isSpliceEntry(cigarTupleList):
    """Scan a pysam CIGAR tuple list for a splice.

    NOTE(review): the loop body and return are elided in this view;
    presumably returns True when an N/reference-skip operation is present
    -- confirm against the missing lines.
    """
    for operation,length in cigarTupleList:
# Maps pysam's is_reverse flag to an RDS strand string (body elided in this
# view; presumably "-" for reverse, "+" otherwise -- confirm).
def getReadSense(reverse):
def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
    """Turn a SAM MD tag into erange's comma-separated mismatch string.

    Each entry is "<read base><1-based position><genomic base>"; deletion
    runs (MD "^...") carry no read base and are skipped.  Several lines of
    this function are elided in this view (initialization of output /
    position / deletionMarker and parts of the error handling).
    """
    # An MD tag alternates match lengths with mismatched/deleted bases,
    # e.g. "10A5^AC6" -- parse the two streams separately.
    lengths = re.findall("\d+", mismatchTag)
    mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag)

    for mismatchEntry in range(len(mismatchSequences)):
        mismatch = mismatchSequences[mismatchEntry]
        # Advance past the matching bases preceding this mismatch.
        position = position + int(lengths[mismatchEntry])
        # Deletions from the reference start with "^" -- skip them.
        if string.find(mismatch, deletionMarker) == 0:
            # (elided) body: continue to the next entry

        # Read base at the mismatch position (0-based into the query).
        genomicNucleotide = querySequence[position]
        # (elided) except-branch fallback when position is out of range:
        genomicNucleotide = "N"

        # (elided) guard: if sense == "-", report both bases on the + strand.
        mismatch = getReverseComplement(mismatch)
        genomicNucleotide = getReverseComplement(genomicNucleotide)

        # The eland/erange convention is 1-based positions.
        erange1BasedElandCompatiblePosition = int(position + 1)
        output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))

    # (elided) IndexError handler, active only when logErrors is set:
        errorMessage = "getMismatch IndexError; tag: %s, seq: %s, pos: %d" % (mismatchTag, querySequence, position)
        writeLog("MakeRdsFromBamError.log", "1.0", errorMessage)

    return string.join(output, ",")
def getSpliceBounds(start, readsize, cigarTupleList):
    """Split a spliced alignment into its left and right exonic pieces.

    Returns (startL, startR, stopL, stopR).  NOTE(review): the offset
    accumulator and the CIGAR-op guard selecting the splice operation are
    elided in this view -- confirm against the missing lines.
    """
    stopR = int(start + readsize)

    for operation,length in cigarTupleList:
        # (elided) guard selecting the splice (reference-skip) operation
        stopL = int(start + offset)
        startR = int(stopL + length)

    return start, startR, stopL, stopR
# Script entry point (body elided in this view).
if __name__ == "__main__":