15 import sys, string, optparse, re
17 from commoncode import readDataset, writeLog
# Version banner: optparse expands %prog to the script name; also written to run logs.
verstring = "%prog: version 1.0"
# optparse usage text; the trailing backslash continues the string literal
# onto the next physical line.  Input must be name-sorted so repeated
# alignments of one read are adjacent and can be recorded as multireads.
usage = "usage: %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
\ninput reads must be sorted to properly record multireads"
# Build the command-line interface.  Option destinations line up one-for-one
# with the keyword arguments of makeRdsFromBam() defined below, and
# set_defaults mirrors that function's default values.
parser = optparse.OptionParser(usage=usage)
parser.add_option("--append", action="store_false", dest="init",
                  help="append to existing rds file [default: create new]")
parser.add_option("--RNA", action="store_true", dest="rnaDataType",
                  help="set data type to RNA [default: DNA]")
parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
                  help="input file is in sam format")
parser.add_option("--index", action="store_true", dest="doIndex",
                  help="index the output rds file")
# Fixed: help text was missing the closing "]" after the default value.
parser.add_option("--cache", type="int", dest="cachePages",
                  help="number of cache pages to use [default: 100000]")
parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
                  help="multi counts over this value are discarded [default: 10]")
parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
                  help="use the raw read names")
parser.set_defaults(init=True, doIndex=False, useSamFile=False, cachePages=100000,
                    maxMultiReadCount=10, rnaDataType=False, trimReadID=True)
# Parse everything after the program name.
(options, args) = parser.parse_args(argv[1:])

# Positional-argument checks.  The surrounding scaffolding (try/except or
# len(args) tests, and the exits that presumably follow each message) is
# elided in this view -- TODO confirm against the full source.
print "no label specified - see --help for usage"
print "no samfile specified - see --help for usage"
print "no outrdsfile specified - see --help for usage"

# Hand the validated arguments over to the worker function.
makeRdsFromBam(label, samFileName, outDbName, options.init, options.doIndex, options.useSamFile,
               options.cachePages, options.maxMultiReadCount, options.rnaDataType, options.trimReadID)
def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
                   cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True):
    """Load aligned reads from a BAM/SAM file into an RDS dataset.

    Reads are classified as unique, multi (same read name seen more than once)
    or spliced (per isSpliceEntry on the CIGAR), buffered in dictionaries, and
    flushed to the dataset in batches.

    NOTE(review): this listing is an elided extract; several lines (variable
    initialisations, else-branches, try/except scaffolding) are missing from
    this view, so comments below hedge wherever behavior cannot be read
    directly from the visible code.
    """
    # fileMode is assigned in elided lines -- presumably text vs binary mode
    # depending on useSamFile; TODO confirm.
    samfile = pysam.Samfile(samFileName, fileMode)
    print "samfile index not found"  # error path; the enclosing try/except is elided

    # Record the exact command line in the run log.
    writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))

    # dataType is assigned in elided lines (presumably "RNA"/"DNA" from rnaDataType).
    rds = readDataset(outDbName, init, dataType, verbose=True)
    if not init and doIndex:  # appending with a reindex requested; body elided in this view

    if "sam_mapped" not in rds.getMetadata():
        rds.insertMetadata([("sam_mapped", "True")])  # tag dataset as SAM/BAM-derived

    defaultCacheSize = rds.getDefaultCacheSize()

    if cachePages > defaultCacheSize:
        # NOTE(review): an if/else between the next two calls is elided --
        # presumably default=True only on a fresh dataset; TODO confirm.
        rds.setDBcache(cachePages, default=True)
        rds.setDBcache(cachePages)

    # Collect name::value metadata pairs from the command line (the enclosing
    # loop and propertyList initialisation are elided in this view).
    (pname, pvalue) = arg.strip().split("::")
    propertyList.append((pname, pvalue))

    if len(propertyList) > 0:
        rds.insertMetadata(propertyList)

    # Counters for the final summary (remaining keys elided in this view).
    countReads = {"unmapped": 0,

    # Per-category insert buffers (multiInsertList initialisation elided).
    uniqueInsertList = []

    spliceInsertList = []

    # Read names already seen in the current batch; a repeat means multiread.
    processedEntryDict = {}

    # until_eof iterates the file sequentially, not via the index.
    samFileIterator = samfile.fetch(until_eof=True)

    for read in samFileIterator:
        countReads["unmapped"] += 1  # guarded by an elided unmapped-read test

        # Derive the read length once from the CIGAR of the first counted read.
        take = (0, 2, 3) # CIGAR operation (M/match, D/del, N/ref_skip)
        readsize = sum([length for op,length in read.cigar if op in take])

        rds.insertMetadata([("readsize", readsize)])  # stored once per dataset (guard elided)

        #Build the read dictionaries
        readSequence = read.seq

        pairReadSuffix = getPairedReadNumberSuffix(read)
        # readName keys the dedup dictionaries: qname + sequence + mate suffix.
        readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
        # rdsEntryName is what gets stored; the if/else selecting between the
        # trimmed and the raw form (trimReadID) is elided in this view.
        rdsEntryName = "%s:%s:%d%s" % (label, read.qname, countReads["total"], pairReadSuffix)
        rdsEntryName = read.qname

        if processedEntryDict.has_key(readName):
            # Second or later sighting: promote the read to a multiread and
            # remove it from the single-hit dictionaries.
            if isSpliceEntry(read.cigar):
                if spliceReadDict.has_key(readName):
                    del spliceReadDict[readName]

                if uniqueReadDict.has_key(readName):
                    del uniqueReadDict[readName]

            if multiReadDict.has_key(readName):
                (read, priorCount, rdsEntryName) = multiReadDict[readName]
                count = priorCount + 1
                multiReadDict[readName] = (read, count, rdsEntryName)

                multiReadDict[readName] = (read, 1, rdsEntryName)  # first-multiread arm; its else is elided

        # First sighting: remember the name and file the read (the else
        # pairing this with the has_key branch above is elided).
        processedEntryDict[readName] = ""
        if isSpliceEntry(read.cigar):
            spliceReadDict[readName] = (read,rdsEntryName)

            uniqueReadDict[readName] = (read, rdsEntryName)  # non-splice arm; its else is elided

        if countReads["total"] % insertSize == 0:
            # Periodic flush: convert buffered reads to RDS tuples and insert.
            for entry in uniqueReadDict.keys():
                (readData, rdsEntryName) = uniqueReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
                countReads["unique"] += 1

            for entry in spliceReadDict.keys():
                (readData, rdsEntryName) = spliceReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
                countReads["splice"] += 1

            for entry in multiReadDict.keys():
                (readData, count, rdsEntryName) = multiReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                if count > maxMultiReadCount:
                    countReads["multiDiscard"] += 1  # seen too often: drop it

                multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
                countReads["multi"] += 1

            rds.insertUniqs(uniqueInsertList)
            rds.insertMulti(multiInsertList)
            uniqueInsertList = []  # multiInsertList reset is elided in this view

            if dataType == "RNA":
                rds.insertSplices(spliceInsertList)
                spliceInsertList = []

            # Reset the dedup dictionaries between batches.
            processedEntryDict = {}

        countReads["total"] += 1

    # Final flush for whatever is left in the buffers after the loop.
    if len(uniqueReadDict.keys()) > 0:
        for entry in uniqueReadDict.keys():
            (readData, rdsEntryName) = uniqueReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
            countReads["unique"] += 1

        rds.insertUniqs(uniqueInsertList)

    if len(multiReadDict.keys()) > 0:
        for entry in multiReadDict.keys():
            (readData, count, rdsEntryName) = multiReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            if count > maxMultiReadCount:
                countReads["multiDiscard"] += 1

            multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
            countReads["multi"] += 1

        # NOTE(review): "multi" was already incremented once per kept read in
        # the loop just above; adding len(multiInsertList) again looks like a
        # double count -- verify against the full source.
        countReads["multi"] += len(multiInsertList)

    if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
        for entry in spliceReadDict.keys():
            (readData, rdsEntryName) = spliceReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
            countReads["splice"] += 1

        rds.insertSplices(spliceInsertList)

    # Human-readable summary: built tab-separated, printed newline-separated.
    countString = "\n%d unmapped reads discarded" % countReads["unmapped"]
    countString += "\t%d unique reads" % countReads["unique"]
    countString += "\t%d multi reads" % countReads["multi"]
    countString += "\t%d multi reads count > %d discarded" % (countReads["multiDiscard"], maxMultiReadCount)
    if dataType == "RNA":
        countString += "\t%d spliced reads" % countReads["splice"]

    print countString.replace("\t", "\n")

    writeLog("%s.log" % outDbName, verstring, countString)

    # Optional index build (the doIndex guard is elided in this view); use the
    # larger cache when one was requested.
    print "building index...."
    if cachePages > defaultCacheSize:
        rds.setDBcache(cachePages)
        rds.buildIndex(cachePages)

        rds.buildIndex(defaultCacheSize)  # else-arm of the cache check (else elided)
def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
    """Convert a pysam aligned read into an RDS insert tuple.

    Returns (name, chrom, start, stop, sense, 1.0/weight, flag, mismatches);
    weight > 1 spreads a multiread's unit weight across its alignments.

    NOTE(review): lines are elided around the MD-tag lookup in this view --
    presumably a try/except for reads without an MD tag; confirm.
    """
    start = int(alignedRead.pos)
    stop = int(start+readSize)  # end derived from the shared read size, not this read's CIGAR
    sense = getReadSense(alignedRead.is_reverse)
    mismatchTag = alignedRead.opt("MD")  # pysam raises KeyError when the tag is absent
    mismatches = getMismatches(mismatchTag, alignedRead.seq, sense)

    return (readName, chrom, start, stop, sense, 1.0/weight, '', mismatches)
def getRDSSpliceEntry(alignedRead, readName, chrom, readSize):
    """Build an RDS splice-insert tuple for a spliced alignment.

    Delegates the shared fields to getRDSEntry(), then splits the alignment
    into its left/right blocks with getSpliceBounds().  Spliced entries are
    always stored with unit weight and an empty flag.
    """
    entryName, entryChrom, start, _stop, sense, _weight, _flag, mismatches = \
        getRDSEntry(alignedRead, readName, chrom, readSize)
    leftStart, rightStart, leftStop, rightStop = getSpliceBounds(start, readSize, alignedRead.cigar)

    return (entryName, entryChrom, leftStart, leftStop, rightStart, rightStop, sense, 1.0, "", mismatches)
def getPairedReadNumberSuffix(read):
    """Return the mate-number suffix used to disambiguate paired reads.

    NOTE(review): the lines building the suffix for paired reads are elided
    in this view (presumably "/1" or "/2" from is_read1/is_read2, and an
    empty string for unpaired reads -- confirm).
    """
    if not isPairedRead(read):
def isPairedRead(read):
    """Return whether *read* is a properly paired mate carrying a mate number.

    A read only counts as paired when the aligner flagged it as a proper pair
    and it is marked as either the first or the second mate.
    """
    if not read.is_proper_pair:
        return False

    return read.is_read1 or read.is_read2
def isSpliceEntry(cigarTupleList):
    """Return whether a CIGAR tuple list represents a spliced alignment.

    NOTE(review): the loop body and return are elided in this view --
    presumably flags True when an N/ref-skip operation (BAM op code 3) is
    present; confirm against the full source.
    """
    for operation,length in cigarTupleList:
# Map pysam's is_reverse flag to a strand character.  Body elided in this
# view -- presumably "-" when reverse is set, else "+"; TODO confirm.
def getReadSense(reverse):
def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
    """Decode a SAM MD tag into a comma-joined ELAND-style mismatch string.

    Each emitted entry is "<MD base><1-based position><query base>"; on "-"
    strand reads both bases are complemented first.

    NOTE(review): several lines are elided in this view (position/output/
    deletionMarker initialisation, the deletion branch body, the try/except
    around the main loop), so the visible flow is partial.
    """
    # An MD tag alternates run-lengths of matches with mismatched (or ^-prefixed
    # deleted) reference bases; pull the two streams out separately.
    lengths = re.findall("\d+", mismatchTag)
    mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag)

    for mismatchEntry in range(len(mismatchSequences)):
        mismatch = mismatchSequences[mismatchEntry]
        # Advance past the preceding run of matching bases.
        position = position + int(lengths[mismatchEntry])
        if string.find(mismatch, deletionMarker) == 0:  # "^"-prefixed entries are deletions (branch body elided)

        # NOTE(review): despite the name, this is taken from the query (read)
        # sequence; the MD-derived `mismatch` is the reference base -- confirm
        # the intended output order.
        genomicNucleotide = querySequence[position]

        genomicNucleotide = "N"  # fallback value; its guarding except is elided

        # Reverse-strand reads report complemented bases (sense test elided).
        mismatch = getComplementNucleotide(mismatch)
        genomicNucleotide = getComplementNucleotide(genomicNucleotide)

        elandCompatiblePosition = int(position + 1)  # ELAND positions are 1-based
        output.append("%s%d%s" % (mismatch, elandCompatiblePosition, genomicNucleotide))

    # IndexError path for malformed tag/sequence combinations; written to a
    # dedicated error log (the logErrors guard placement is elided).
    errorMessage = "getMismatch IndexError; tag: %s, seq: %s, pos: %d" % (mismatchTag, querySequence, position)
    writeLog("MakeRdsFromBamError.log", "1.0", errorMessage)

    return string.join(output, ",")
def getComplementNucleotide(nucleotide):
    """Return the complement of a single nucleotide letter via table lookup.

    NOTE(review): the remaining dictionary entries are elided in this view
    (presumably T/G/C/N mappings -- confirm).  Letters outside the table
    raise KeyError.
    """
    complement = {"A": "T",

    return complement[nucleotide]
def getSpliceBounds(start, readsize, cigarTupleList):
    """Compute the left/right block bounds of a spliced read.

    Returns (start, startR, stopL, stopR): the left block runs start..stopL
    and the right block startR..stopR, split around the reference-skip gap
    in the CIGAR.

    NOTE(review): the offset initialisation/accumulation and the CIGAR
    operation test are elided in this view -- the assignments below
    presumably fire only on the N (ref-skip) operation; confirm.
    """
    stopR = int(start + readsize)  # right block end, assuming readsize spans both blocks

    for operation,length in cigarTupleList:
        stopL = int(start + offset)   # end of the left block
        startR = int(stopL + length)  # right block starts after the skipped gap

    return start, startR, stopL, stopR
# Script entry point; the call it guards (presumably main(sys.argv)) is
# elided in this view.
if __name__ == "__main__":