# rewrite of findall.py and MakeRdsFromBam to fix bugs resulting from poor initial implementation
# [erange.git] / MakeRdsFromBam.py
1 """
2 MakeRdsFromBam
3
4 Created on Jun 3, 2010
5
6 @author: sau
7 """
8
9 try:
10     import psyco
11     psyco.full()
12 except:
13     pass
14
15 import sys
16 import string
17 import optparse
18 import re
19 import pysam
20 from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption, getReverseComplement
21 import ReadDataset
22
23 INSERT_SIZE = 100000
24 verstring = "makeRdsFromBam: version 1.1"
25
26
27 def main(argv=None):
28     if not argv:
29         argv = sys.argv
30     
31     print verstring
32
33     usage = "usage:  %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
34             \ninput reads must be sorted to properly record multireads"
35
36     parser = getParser(usage)
37     (options, args) = parser.parse_args(argv[1:])
38
39     try:
40         label = args[0]
41     except IndexError:
42         print "no label specified - see --help for usage"
43         sys.exit(1)
44
45     try:
46         samFileName = args[1]
47     except IndexError:
48         print "no samfile specified - see --help for usage"
49         sys.exit(1)
50
51     try:
52         outDbName = args[2]
53     except IndexError:
54         print "no outrdsfile specified - see --help for usage"
55         sys.exit(1)
56
57     if options.rnaDataType:
58         dataType = "RNA"
59     else:
60         dataType = "DNA"
61
62     makeRdsFromBam(label, samFileName, outDbName, options.init, options.doIndex, options.useSamFile,
63                    options.cachePages, options.maxMultiReadCount, dataType, options.trimReadID)
64
65
def getParser(usage):
    """Build the optparse parser for makeRdsFromBam.

    Option defaults are read from the [makeRdsFromBam] section of the
    erange configuration file via commoncode's config helpers.
    """
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--append", action="store_false", dest="init",
                      help="append to existing rds file [default: create new]")
    parser.add_option("--RNA", action="store_true", dest="rnaDataType",
                      help="set data type to RNA [default: DNA]")
    parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
                      help="input file is in sam format")
    parser.add_option("--index", action="store_true", dest="doIndex",
                      help="index the output rds file")
    # Fixed: help text was missing the closing bracket after 100000.
    parser.add_option("--cache", type="int", dest="cachePages",
                      help="number of cache pages to use [default: 100000]")
    parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
                      help="multi counts over this value are discarded [default: 10]")
    parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
                      help="use the raw read names")

    configParser = getConfigParser()
    section = "makeRdsFromBam"
    init = getConfigBoolOption(configParser, section, "init", True)
    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
    useSamFile = getConfigBoolOption(configParser, section, "useSamFile", False)
    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
    maxMultiReadCount = getConfigIntOption(configParser, section, "maxMultiReadCount", 10)
    rnaDataType = getConfigBoolOption(configParser, section, "rnaDataType", False)
    trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)

    parser.set_defaults(init=init, doIndex=doIndex, useSamFile=useSamFile, cachePages=cachePages,
                        maxMultiReadCount=maxMultiReadCount, rnaDataType=rnaDataType, trimReadID=trimReadID)

    return parser
97
98
def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
                   cachePages=100000, maxMultiReadCount=10, dataType="DNA", trimReadID=True):
    """Load a BAM/SAM alignment file into an erange RDS (ReadDataset) database.

    label -- prefix used when rewriting read IDs (see trimReadID)
    samFileName -- input BAM (or SAM with useSamFile=True); reads must be
                   sorted so multireads are recorded properly
    outDbName -- output RDS database; "<outDbName>.log" is also written
    init -- create a fresh RDS file; False appends to an existing one
    doIndex -- build the RDS index after loading
    useSamFile -- open the input as SAM text instead of BAM
    cachePages -- sqlite cache pages, applied only when larger than the
                  RDS default cache size
    maxMultiReadCount -- reads aligning more than this many times are discarded
    dataType -- "DNA" or "RNA"; splice tables are only written for RNA
    trimReadID -- replace read names with "label:name:total<pairsuffix>"

    The input is read twice: a first pass (getMultiReadIDCounts) counts how
    many times each read name aligns, then the main pass buffers entries
    and bulk-inserts them every INSERT_SIZE reads.
    """
    if useSamFile:
        fileMode = "r"   # SAM text input
    else:
        fileMode = "rb"  # BAM binary input

    writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))
    rds = ReadDataset.ReadDataset(outDbName, init, dataType, verbose=True)
    # When appending to an already-indexed RDS, drop the index first; it is
    # rebuilt at the end when doIndex is set.
    if not init and doIndex:
        try:
            if rds.hasIndex():
                rds.dropIndex()
        except:
            pass

    if "sam_mapped" not in rds.getMetadata():
        rds.insertMetadata([("sam_mapped", "True")])

    defaultCacheSize = rds.getDefaultCacheSize()

    if cachePages > defaultCacheSize:
        if init:
            rds.setDBcache(cachePages, default=True)
        else:
            rds.setDBcache(cachePages)

    # Any command-line argument of the form name::value becomes an RDS
    # metadata property.
    propertyList = []
    for arg in sys.argv:
        if "::" in arg:
            (pname, pvalue) = arg.strip().split("::")
            propertyList.append((pname, pvalue))

    if len(propertyList) > 0:
        rds.insertMetadata(propertyList)

    totalReadCounts = {"unmapped": 0,
                       "total": 0,
                       "unique": 0,
                       "multi": 0,
                       "multiDiscard": 0,
                       "splice": 0,
                       "multisplice": 0
    }

    readsize = 0

    uniqueInsertList = []
    multiInsertList = []
    spliceInsertList = []
    multispliceInsertList = []

    uniqueReadDict = {}
    multiReadDict = {}
    multispliceReadDict = {}
    spliceReadDict = {}
    # First pass: alignment counts for every read name seen more than once.
    multireadCounts = getMultiReadIDCounts(samFileName, fileMode)

    # Tally reads that aligned too many times; their alignments are dropped
    # in the main loop below.
    for readID in multireadCounts:
        if multireadCounts[readID] > maxMultiReadCount:
            totalReadCounts["multiDiscard"] += 1

    try:
        samfile = pysam.Samfile(samFileName, fileMode)
    except ValueError:
        print "samfile index not found"
        sys.exit(1)

    samFileIterator = samfile.fetch(until_eof=True)
    for read in samFileIterator:
        if read.is_unmapped:
            totalReadCounts["unmapped"] += 1
            continue

        # Derive the read size once, from the first mapped read's CIGAR
        # (sum of M and I operation lengths), and record it as metadata.
        if readsize == 0:
            take = (0, 1) # CIGAR operation (M/match, I/insertion)
            readsize = sum([length for op,length in read.cigar if op in take])
            if init:
                rds.insertMetadata([("readsize", readsize)])

        pairReadSuffix = getPairedReadNumberSuffix(read)
        readName = "%s%s" % (read.qname, pairReadSuffix)
        if trimReadID:
            rdsEntryName = "%s:%s:%d%s" % (label, read.qname, totalReadCounts["total"], pairReadSuffix)
        else:
            rdsEntryName = read.qname

        # Reads absent from multireadCounts aligned exactly once.
        try:
            count = multireadCounts[readName]
        except KeyError:
            count = 1

        # Buffer the read in the matching dict; counts above
        # maxMultiReadCount fall through (already tallied as multiDiscard).
        # NOTE(review): dicts are keyed by readName, so within one flush
        # window only the most recent alignment of a multiread is kept --
        # confirm this is the intended multiread handling.
        if count == 1:
            if isSpliceEntry(read.cigar):
                spliceReadDict[readName] = (read,rdsEntryName)
            else:
                uniqueReadDict[readName] = (read, rdsEntryName)
        elif count <= maxMultiReadCount:
            if isSpliceEntry(read.cigar):
                multispliceReadDict[readName] = (read, count, rdsEntryName)
            else:
                multiReadDict[readName] = (read, count, rdsEntryName)

        # Bulk-insert every INSERT_SIZE reads to bound memory use.
        # NOTE(review): this also fires when total == 0, flushing the very
        # first read in a batch of one -- harmless but presumably unintended.
        if totalReadCounts["total"] % INSERT_SIZE == 0:
            for entry in uniqueReadDict.keys():
                (readData, rdsEntryName) = uniqueReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
                totalReadCounts["unique"] += 1

            for entry in multiReadDict.keys():
                (readData, count, rdsEntryName) = multiReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))

            if dataType == "RNA":
                for entry in spliceReadDict.keys():
                    (readData, rdsEntryName) = spliceReadDict[entry]
                    chrom = samfile.getrname(readData.rname)
                    spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
                    totalReadCounts["splice"] += 1

                for entry in multispliceReadDict.keys():
                    (readData, count, rdsEntryName) = multispliceReadDict[entry]
                    chrom = samfile.getrname(readData.rname)
                    multispliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize, weight=count))
                    totalReadCounts["multisplice"] += 1

            rds.insertUniqs(uniqueInsertList)
            rds.insertMulti(multiInsertList)
            uniqueInsertList = []
            uniqueReadDict = {}
            multiInsertList = []
            multiReadDict = {}
            if dataType == "RNA":
                rds.insertSplices(spliceInsertList)
                spliceInsertList = []
                spliceReadDict = {}
                rds.insertMultisplices(multispliceInsertList)
                multispliceInsertList = []
                multispliceReadDict = {}

            # Progress marker, one dot per flush.
            print ".",
            sys.stdout.flush()

        totalReadCounts["total"] += 1

    # Final flush of whatever is still buffered after the loop.
    if len(uniqueReadDict.keys()) > 0:
        for entry in uniqueReadDict.keys():
            (readData, rdsEntryName) = uniqueReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
            totalReadCounts["unique"] += 1

        rds.insertUniqs(uniqueInsertList)

    if len(multiReadDict.keys()) > 0:
        for entry in multiReadDict.keys():
            (readData, count, rdsEntryName) = multiReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))

        rds.insertMulti(multiInsertList)

    if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
        for entry in spliceReadDict.keys():
            (readData, rdsEntryName) = spliceReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
            totalReadCounts["splice"] += 1

        rds.insertSplices(spliceInsertList)

    if len(multispliceReadDict.keys()) > 0 and dataType == "RNA":
        for entry in multispliceReadDict.keys():
            (readData, count, rdsEntryName) = multispliceReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            multispliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize, weight=count))
            totalReadCounts["multisplice"] += 1

        rds.insertMultisplices(multispliceInsertList)

    # multi is derived: distinct multiread names minus discarded minus
    # those that were spliced multireads.
    totalReadCounts["multi"] = len(multireadCounts) - totalReadCounts["multiDiscard"] - totalReadCounts["multisplice"]
    countStringList = ["\n%d unmapped reads discarded" % totalReadCounts["unmapped"]]
    countStringList.append("%d unique reads" % totalReadCounts["unique"])
    countStringList.append("%d multi reads" % totalReadCounts["multi"])
    countStringList.append("%d multi reads count > %d discarded" % (totalReadCounts["multiDiscard"], maxMultiReadCount))
    if dataType == "RNA":
        countStringList.append("%d spliced reads" % totalReadCounts["splice"])
        countStringList.append("%d spliced multireads" % totalReadCounts["multisplice"])

    print string.join(countStringList, "\n")
    outputCountText = string.join(countStringList, "\t")
    writeLog("%s.log" % outDbName, verstring, outputCountText)

    if doIndex:
        print "building index...."
        if cachePages > defaultCacheSize:
            rds.setDBcache(cachePages)
            rds.buildIndex(cachePages)
        else:
            rds.buildIndex(defaultCacheSize)
302
303
304 def getMultiReadIDCounts(samFileName, fileMode):
305     try:
306         samfile = pysam.Samfile(samFileName, fileMode)
307     except ValueError:
308         print "samfile index not found"
309         sys.exit(1)
310
311     readIDCounts = {}
312     for read in samfile.fetch(until_eof=True):
313         pairReadSuffix = getPairedReadNumberSuffix(read)
314         readName = "%s%s" % (read.qname, pairReadSuffix)
315         try:
316             readIDCounts[readName] += 1
317         except KeyError:
318             readIDCounts[readName] = 1
319
320     for readID in readIDCounts.keys():
321         if readIDCounts[readID] == 1:
322             del readIDCounts[readID]
323
324     return readIDCounts
325
326
def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
    """Build a non-spliced RDS tuple:
    (name, chrom, start, stop, sense, weight, flag, mismatches).

    Multireads carry a fractional weight of 1.0/count; alignments
    lacking an MD tag get an empty mismatch string.
    """
    start = int(alignedRead.pos)
    stop = int(start + readSize)
    sense = getReadSense(alignedRead.is_reverse)
    try:
        mismatches = getMismatches(alignedRead.opt("MD"), alignedRead.seq, sense)
    except KeyError:
        # pysam raises KeyError when the MD tag is absent.
        mismatches = ""

    return (readName, chrom, start, stop, sense, 1.0/weight, '', mismatches)
338
339
def getRDSSpliceEntry(alignedRead, readName, chrom, readSize, weight=1):
    """Build an RDS splice tuple for a spliced alignment:
    (name, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatches).
    """
    (entryName, entryChrom, start, stop,
     sense, entryWeight, flag, mismatches) = getRDSEntry(alignedRead, readName, chrom, readSize, weight)
    (startL, startR, stopL, stopR) = getSpliceBounds(start, readSize, alignedRead.cigar)

    return (entryName, entryChrom, startL, stopL, startR, stopR, sense, entryWeight, "", mismatches)
345
346
def getPairedReadNumberSuffix(read):
    """Return the Eland-style mate suffix for a read: "/1" for the first
    mate, "/2" for the second, "" for unpaired or improper reads.
    """
    if not isPairedRead(read):
        return ""

    if read.is_read1:
        return "/1"

    if read.is_read2:
        return "/2"

    return ""
358
359
def isPairedRead(read):
    """True when the alignment comes from a proper pair and is flagged as
    either the first or the second mate.
    """
    if not read.is_proper_pair:
        return False

    return read.is_read1 or read.is_read2
362
363
def isSpliceEntry(cigarTupleList):
    """True when the CIGAR contains an N (op code 3, skipped region /
    intron) operation, i.e. the alignment is spliced.
    """
    return any(operation == 3 for operation, length in cigarTupleList)
372
373
def getReadSense(reverse):
    """Map a reverse-strand flag to the RDS strand character."""
    return "-" if reverse else "+"
381
382
def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
    """Decode a SAM MD tag into erange's comma-separated Eland-style
    mismatch string.

    mismatchTag: value of the MD:Z tag, e.g. "2A10" or "5^AC3".
    querySequence: the read sequence, used to report the read base at
        each mismatch position ("N" when not supplied).
    sense: "+" or "-"; on "-" both bases are reverse-complemented.
    logErrors: when True, out-of-range positions are logged before
        returning "".

    Each mismatch is reported as <refBase><1-based position><readBase>;
    reference deletions ("^...") are skipped.  Returns "" when a
    position falls past the end of querySequence.
    """
    output = []
    deletionMarker = "^"
    position = 0

    # Raw strings avoid invalid-escape warnings; the second pattern captures
    # either a single mismatched reference base or a ^-prefixed deletion run.
    lengths = re.findall(r"\d+", mismatchTag)
    mismatchSequences = re.findall(r"\d+([ACGTN]|\^[ACGTN]+)", mismatchTag)

    for mismatchEntry in range(len(mismatchSequences)):
        mismatch = mismatchSequences[mismatchEntry]
        position = position + int(lengths[mismatchEntry])
        if mismatch.startswith(deletionMarker):
            # Deleted reference bases do not appear in the read, so the
            # read position does not advance for them.
            continue

        try:
            if querySequence:
                genomicNucleotide = querySequence[position]
            else:
                genomicNucleotide = "N"

            if sense == "-":
                mismatch = getReverseComplement(mismatch)
                genomicNucleotide  = getReverseComplement(genomicNucleotide)

            # erange positions are 1-based (Eland-compatible).
            erange1BasedElandCompatiblePosition = int(position + 1)
            output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
            position += 1
        except IndexError:
            if logErrors:
                errorMessage = "getMismatch IndexError; tag: %s, seq: %s, pos: %d" % (mismatchTag, querySequence, position)
                writeLog("MakeRdsFromBamError.log", "1.0", errorMessage)

            return ""

    return ",".join(output)
418
419
def getSpliceBounds(start, readsize, cigarTupleList):
    """Return (startL, startR, stopL, stopR) genomic bounds for the two
    exonic halves of a spliced read, splitting at the first N (intron)
    operation in the CIGAR.

    NOTE(review): only the first N operation is honored, stopR is
    start + readsize (it does not include the intron gap), and a CIGAR
    without any N falls through and implicitly returns None -- callers
    are expected to pre-filter with isSpliceEntry.  Confirm these are
    intentional before changing.
    """
    stopR = int(start + readsize)
    offset = 0

    for operation, length in cigarTupleList:
        if operation != 3:
            offset += length
            continue

        stopL = int(start + offset)
        startR = int(stopL + length)
        return start, startR, stopL, stopR
432
433
if __name__ == "__main__":
    # Script entry point: forward the full command line to main().
    main(sys.argv)