erange version 4.0a dev release
[erange.git] / MakeRdsFromBam.py
1 """
2 MakeRdsFromBam
3
4 Created on Jun 3, 2010
5
6 @author: sau
7 """
8
# psyco is an optional Python 2 JIT compiler; when it is not installed the
# script simply runs uncompiled, so any import/initialization failure is ignored.
try:
    import psyco
    psyco.full()
except:
    pass
14
15 import sys
16 import string
17 import optparse
18 import re
19 import pysam
20 from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption, getReverseComplement
21 import ReadDataset
22
# Number of reads buffered in memory before each bulk insert into the RDS database.
INSERT_SIZE = 100000
# Version banner printed at startup and recorded in the log file.
verstring = "makeRdsFromBam: version 1.0"
25
26
27 def main(argv=None):
28     if not argv:
29         argv = sys.argv
30     
31     print verstring
32
33     usage = "usage:  %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
34             \ninput reads must be sorted to properly record multireads"
35
36     parser = getParser(usage)
37     (options, args) = parser.parse_args(argv[1:])
38
39     try:
40         label = args[0]
41     except IndexError:
42         print "no label specified - see --help for usage"
43         sys.exit(1)
44
45     try:
46         samFileName = args[1]
47     except IndexError:
48         print "no samfile specified - see --help for usage"
49         sys.exit(1)
50
51     try:
52         outDbName = args[2]
53     except IndexError:
54         print "no outrdsfile specified - see --help for usage"
55         sys.exit(1)
56
57     makeRdsFromBam(label, samFileName, outDbName, options.init, options.doIndex, options.useSamFile,
58                    options.cachePages, options.maxMultiReadCount, options.rnaDataType, options.trimReadID)
59
60
def getParser(usage):
    """Build the optparse parser, seeding option defaults from the erange config file."""
    # Read configured defaults first so the parser can be assembled in one pass.
    configParser = getConfigParser()
    section = "makeRdsFromBam"
    init = getConfigBoolOption(configParser, section, "init", True)
    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
    useSamFile = getConfigBoolOption(configParser, section, "useSamFile", False)
    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
    maxMultiReadCount = getConfigIntOption(configParser, section, "maxMultiReadCount", 10)
    rnaDataType = getConfigBoolOption(configParser, section, "rnaDataType", False)
    trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)

    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--append", action="store_false", dest="init",
                      help="append to existing rds file [default: create new]")
    parser.add_option("--RNA", action="store_true", dest="rnaDataType",
                      help="set data type to RNA [default: DNA]")
    parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
                      help="input file is in sam format")
    parser.add_option("--index", action="store_true", dest="doIndex",
                      help="index the output rds file")
    parser.add_option("--cache", type="int", dest="cachePages",
                      help="number of cache pages to use [default: 100000")
    parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
                      help="multi counts over this value are discarded [default: 10]")
    parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
                      help="use the raw read names")

    parser.set_defaults(init=init, doIndex=doIndex, useSamFile=useSamFile, cachePages=cachePages,
                        maxMultiReadCount=maxMultiReadCount, rnaDataType=rnaDataType, trimReadID=trimReadID)

    return parser
92
93
94 def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
95                    cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True):
96
97     if useSamFile:
98         fileMode = "r"
99     else:
100         fileMode = "rb"
101
102     try:
103         samfile = pysam.Samfile(samFileName, fileMode)
104     except ValueError:
105         print "samfile index not found"
106         sys.exit(1)
107
108     if rnaDataType:
109         dataType = "RNA"
110     else:
111         dataType = "DNA"
112
113     writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))
114
115     rds = ReadDataset.ReadDataset(outDbName, init, dataType, verbose=True)
116     if not init and doIndex:
117         try:
118             if rds.hasIndex():
119                 rds.dropIndex()
120         except:
121             pass
122
123     if "sam_mapped" not in rds.getMetadata():
124         rds.insertMetadata([("sam_mapped", "True")])
125
126     defaultCacheSize = rds.getDefaultCacheSize()
127
128     if cachePages > defaultCacheSize:
129         if init:
130             rds.setDBcache(cachePages, default=True)
131         else:
132             rds.setDBcache(cachePages)
133
134     propertyList = []
135     for arg in sys.argv:
136         if "::" in arg:
137             (pname, pvalue) = arg.strip().split("::")
138             propertyList.append((pname, pvalue))
139
140     if len(propertyList) > 0:
141         rds.insertMetadata(propertyList)
142
143     totalReadCounts = {"unmapped": 0,
144                        "total": 0,
145                        "unique": 0,
146                        "multi": 0,
147                        "multiDiscard": 0,
148                        "splice": 0
149     }
150
151     readsize = 0
152
153     uniqueInsertList = []
154     multiInsertList = []
155     spliceInsertList = []
156
157     processedEntryDict = {}
158     uniqueReadDict = {}
159     multiReadDict = {}
160     spliceReadDict = {}
161
162     samFileIterator = samfile.fetch(until_eof=True)
163
164     for read in samFileIterator:
165         if read.is_unmapped:
166             totalReadCounts["unmapped"] += 1
167             continue
168
169         if readsize == 0:
170             take = (0, 1) # CIGAR operation (M/match, I/insertion)
171             readsize = sum([length for op,length in read.cigar if op in take])
172             if init:
173                 rds.insertMetadata([("readsize", readsize)])
174
175         #Build the read dictionaries
176         try:
177             readSequence = read.seq
178         except KeyError:
179             readSequence = ""
180
181         pairReadSuffix = getPairedReadNumberSuffix(read)
182         readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
183         if trimReadID:
184             rdsEntryName = "%s:%s:%d%s" % (label, read.qname, totalReadCounts["total"], pairReadSuffix)
185         else:
186             rdsEntryName = read.qname
187
188         if processedEntryDict.has_key(readName):
189             if isSpliceEntry(read.cigar):
190                 if spliceReadDict.has_key(readName):
191                     del spliceReadDict[readName]
192             else:
193                 if uniqueReadDict.has_key(readName):
194                     del uniqueReadDict[readName]
195
196                 if multiReadDict.has_key(readName):
197                     (read, priorCount, rdsEntryName) = multiReadDict[readName]
198                     count = priorCount + 1
199                     multiReadDict[readName] = (read, count, rdsEntryName)
200                 else:
201                     multiReadDict[readName] = (read, 1, rdsEntryName)
202         else:
203             processedEntryDict[readName] = ""
204             if isSpliceEntry(read.cigar):
205                 spliceReadDict[readName] = (read,rdsEntryName)
206             else:
207                 uniqueReadDict[readName] = (read, rdsEntryName)
208
209         if totalReadCounts["total"] % INSERT_SIZE == 0:
210             for entry in uniqueReadDict.keys():
211                 (readData, rdsEntryName) = uniqueReadDict[entry]
212                 chrom = samfile.getrname(readData.rname)
213                 uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
214                 totalReadCounts["unique"] += 1
215
216             for entry in spliceReadDict.keys():
217                 (readData, rdsEntryName) = spliceReadDict[entry]
218                 chrom = samfile.getrname(readData.rname)
219                 spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
220                 totalReadCounts["splice"] += 1
221
222             for entry in multiReadDict.keys():
223                 (readData, count, rdsEntryName) = multiReadDict[entry]
224                 chrom = samfile.getrname(readData.rname)
225                 if count > maxMultiReadCount:
226                     totalReadCounts["multiDiscard"] += 1
227                 else:
228                     multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count)) 
229                     totalReadCounts["multi"] += 1
230
231             rds.insertUniqs(uniqueInsertList)
232             rds.insertMulti(multiInsertList)
233             uniqueInsertList = []
234             uniqueReadDict = {}
235             multiInsertList = []
236             multiReadDict = {}
237             if dataType == "RNA":
238                 rds.insertSplices(spliceInsertList)
239                 spliceInsertList = []
240                 spliceReadDict = {}
241
242             print ".",
243             sys.stdout.flush()
244             processedEntryDict = {}
245
246         totalReadCounts["total"] += 1
247
248     if len(uniqueReadDict.keys()) > 0:
249         for entry in uniqueReadDict.keys():
250             (readData, rdsEntryName) = uniqueReadDict[entry]
251             chrom = samfile.getrname(readData.rname)
252             uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
253             totalReadCounts["unique"] += 1
254
255         rds.insertUniqs(uniqueInsertList)
256
257     if len(multiReadDict.keys()) > 0:
258         for entry in multiReadDict.keys():
259             (readData, count, rdsEntryName) = multiReadDict[entry]
260             chrom = samfile.getrname(readData.rname)
261             if count > maxMultiReadCount:
262                 totalReadCounts["multiDiscard"] += 1
263             else:
264                 multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
265                 totalReadCounts["multi"] += 1
266
267         totalReadCounts["multi"] += len(multiInsertList)
268
269     if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
270         for entry in spliceReadDict.keys():
271             (readData, rdsEntryName) = spliceReadDict[entry]
272             chrom = samfile.getrname(readData.rname)
273             spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
274             totalReadCounts["splice"] += 1
275
276         rds.insertSplices(spliceInsertList)
277
278     countStringList = ["\n%d unmapped reads discarded" % totalReadCounts["unmapped"]]
279     countStringList.append("%d unique reads" % totalReadCounts["unique"])
280     countStringList.append("%d multi reads" % totalReadCounts["multi"])
281     countStringList.append("%d multi reads count > %d discarded" % (totalReadCounts["multiDiscard"], maxMultiReadCount))
282     if dataType == "RNA":
283         countStringList.append("%d spliced reads" % totalReadCounts["splice"])
284
285     print string.join(countStringList, "\n")
286     outputCountText = string.join(countStringList, "\t")
287     writeLog("%s.log" % outDbName, verstring, outputCountText)
288
289     if doIndex:
290         print "building index...."
291         if cachePages > defaultCacheSize:
292             rds.setDBcache(cachePages)
293             rds.buildIndex(cachePages)
294         else:
295             rds.buildIndex(defaultCacheSize)
296
297
def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
    """Convert one aligned read into an RDS read tuple.

    Returns (name, chrom, start, stop, sense, 1/weight, flag, mismatches);
    weight is the multiread copy count, so unique reads carry weight 1.0.
    """
    start = int(alignedRead.pos)
    stop = int(start + readSize)
    sense = getReadSense(alignedRead.is_reverse)

    try:
        # the MD tag describes mismatches against the reference
        mismatches = getMismatches(alignedRead.opt("MD"), alignedRead.seq, sense)
    except KeyError:
        # alignment has no MD tag
        mismatches = ""

    return (readName, chrom, start, stop, sense, 1.0/weight, '', mismatches)
309
310
def getRDSSpliceEntry(alignedRead, readName, chrom, readSize):
    """Convert a spliced read into an RDS splice tuple with left/right bounds."""
    entryName, entryChrom, start, stop, sense, weight, flag, mismatches = getRDSEntry(alignedRead, readName, chrom, readSize)
    startL, startR, stopL, stopR = getSpliceBounds(start, readSize, alignedRead.cigar)

    # splices always carry weight 1.0 and an empty flag
    return (entryName, entryChrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches)
316
317
def getPairedReadNumberSuffix(read):
    """Return "/1" or "/2" for a paired-end read, "" for unpaired reads."""
    if not isPairedRead(read):
        return ""

    if read.is_read1:
        return "/1"

    if read.is_read2:
        return "/2"

    return ""
329
330
def isPairedRead(read):
    """A read counts as paired only when it is a proper pair AND mate-numbered."""
    if not read.is_proper_pair:
        return False

    return read.is_read1 or read.is_read2
333
334
def isSpliceEntry(cigarTupleList):
    """True when the CIGAR tuple list contains operation 3 (N, reference skip)."""
    return any(operation == 3 for operation, length in cigarTupleList)
343
344
def getReadSense(reverse):
    """Map a reverse-strand flag onto the strand character "-" or "+"."""
    return "-" if reverse else "+"
352
353
def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
    """Translate a SAM MD tag into erange's comma-separated mismatch string.

    Each reported mismatch is formatted as refBase, 1-based position, readBase
    (Eland-compatible). Deletions (the "^" entries in the MD tag) occupy no
    query position and are skipped. On a minus-strand read both bases are
    reverse-complemented. Returns "" when a position falls outside the query
    sequence (optionally logging the error).
    """
    deletionMarker = "^"

    matchLengths = re.findall(r"\d+", mismatchTag)
    mismatchSequences = re.findall(r"\d+([ACGTN]|\^[ACGTN]+)", mismatchTag)

    output = []
    position = 0
    for mismatch, matchLength in zip(mismatchSequences, matchLengths):
        position += int(matchLength)
        if mismatch.startswith(deletionMarker):
            # deleted reference bases are absent from the query; skip them
            continue

        try:
            if querySequence:
                genomicNucleotide = querySequence[position]
            else:
                genomicNucleotide = "N"

            if sense == "-":
                mismatch = getReverseComplement(mismatch)
                genomicNucleotide = getReverseComplement(genomicNucleotide)

            # erange positions are 1-based, Eland style
            erange1BasedElandCompatiblePosition = int(position + 1)
            output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
            position += 1
        except IndexError:
            if logErrors:
                errorMessage = "getMismatch IndexError; tag: %s, seq: %s, pos: %d" % (mismatchTag, querySequence, position)
                writeLog("MakeRdsFromBamError.log", "1.0", errorMessage)

            return ""

    return ",".join(output)
389
390
def getSpliceBounds(start, readsize, cigarTupleList):
    """Compute the exon-fragment bounds of a spliced read.

    Walks the CIGAR until the first N operation (code 3, reference skip) and
    returns (startL, startR, stopL, stopR) where the left fragment spans
    startL..stopL and the right fragment spans startR..stopR.
    NOTE(review): implicitly returns None when no N operation is present —
    callers are expected to pass splice entries only.
    """
    stopR = int(start + readsize)
    offset = 0

    for operation, length in cigarTupleList:
        if operation != 3:
            offset += length
            continue

        stopL = int(start + offset)
        startR = int(stopL + length)
        return start, startR, stopL, stopR
403
404
# Script entry point: run main() with the raw command-line arguments.
if __name__ == "__main__":
    main(sys.argv)