snapshot of 4.0a development. initial git repo commit
[erange.git] / MakeRdsFromBam.py
1 """
2 MakeRdsFromBam
3
4 Created on Jun 3, 2010
5
6 @author: sau
7 """
8
# Optional psyco JIT acceleration (Python 2 only); silently fall through
# when psyco is not installed.
try:
    import psyco
    psyco.full()
except:
    pass
14
15 import sys, string, optparse, re
16 import pysam
17 from commoncode import readDataset, writeLog
18
19 verstring = "%prog: version 1.0"
20
21
22 def main(argv=None):
23     if not argv:
24         argv = sys.argv
25     
26     print verstring
27
28     usage = "usage:  %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
29             \ninput reads must be sorted to properly record multireads"
30
31     parser = optparse.OptionParser(usage=usage)
32     parser.add_option("--append", action="store_false", dest="init",
33                       help="append to existing rds file [default: create new]")
34     parser.add_option("--RNA", action="store_true", dest="rnaDataType",
35                       help="set data type to RNA [default: DNA]")
36     parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
37                       help="input file is in sam format")
38     parser.add_option("--index", action="store_true", dest="doIndex",
39                       help="index the output rds file")
40     parser.add_option("--cache", type="int", dest="cachePages",
41                       help="number of cache pages to use [default: 100000")
42     parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
43                       help="multi counts over this value are discarded [default: 10]")
44     parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
45                       help="use the raw read names")
46     parser.set_defaults(init=True, doIndex=False, useSamFile=False, cachePages=100000,
47                         maxMultiReadCount=10, rnaDataType=False, trimReadID=True)
48
49     (options, args) = parser.parse_args(argv[1:])
50
51     try:
52         label = args[0]
53     except IndexError:
54         print "no label specified - see --help for usage"
55         sys.exit(1)
56
57     try:
58         samFileName = args[1]
59     except IndexError:
60         print "no samfile specified - see --help for usage"
61         sys.exit(1)
62
63     try:
64         outDbName = args[2]
65     except IndexError:
66         print "no outrdsfile specified - see --help for usage"
67         sys.exit(1)
68
69     makeRdsFromBam(label, samFileName, outDbName, options.init, options.doIndex, options.useSamFile,
70                    options.cachePages, options.maxMultiReadCount, options.rnaDataType, options.trimReadID)
71
72
73 def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
74                    cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True):
75
76     if useSamFile:
77         fileMode = "r"
78     else:
79         fileMode = "rb"
80
81     try:
82         samfile = pysam.Samfile(samFileName, fileMode)
83     except ValueError:
84         print "samfile index not found"
85         sys.exit(1)
86
87     if rnaDataType:
88         dataType = "RNA"
89     else:
90         dataType = "DNA"
91
92     writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))
93
94     rds = readDataset(outDbName, init, dataType, verbose=True)
95     if not init and doIndex:
96         try:
97             if rds.hasIndex():
98                 rds.dropIndex()
99         except:
100             pass
101
102     if "sam_mapped" not in rds.getMetadata():
103         rds.insertMetadata([("sam_mapped", "True")])
104
105     defaultCacheSize = rds.getDefaultCacheSize()
106
107     if cachePages > defaultCacheSize:
108         if init:
109             rds.setDBcache(cachePages, default=True)
110         else:
111             rds.setDBcache(cachePages)
112
113     propertyList = []
114     for arg in sys.argv:
115         if "::" in arg:
116             (pname, pvalue) = arg.strip().split("::")
117             propertyList.append((pname, pvalue))
118
119     if len(propertyList) > 0:
120         rds.insertMetadata(propertyList)
121
122     countReads = {"unmapped": 0,
123                   "total": 0,
124                   "unique": 0,
125                   "multi": 0,
126                   "multiDiscard": 0,
127                   "splice": 0
128     }
129
130     readsize = 0
131     insertSize = 100000
132
133     uniqueInsertList = []
134     multiInsertList = []
135     spliceInsertList = []
136
137     processedEntryDict = {}
138     uniqueReadDict = {}
139     multiReadDict = {}
140     spliceReadDict = {}
141
142     samFileIterator = samfile.fetch(until_eof=True)
143
144     for read in samFileIterator:
145         if read.is_unmapped:
146             countReads["unmapped"] += 1
147             continue
148
149         if readsize == 0:
150             take = (0, 2, 3) # CIGAR operation (M/match, D/del, N/ref_skip)
151             readsize = sum([length for op,length in read.cigar if op in take])
152             if init:
153                 rds.insertMetadata([("readsize", readsize)])
154
155         #Build the read dictionaries
156         try:
157             readSequence = read.seq
158         except KeyError:
159             readSequence = ""
160
161         pairReadSuffix = getPairedReadNumberSuffix(read)
162         readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
163         if trimReadID:
164             rdsEntryName = "%s:%s:%d%s" % (label, read.qname, countReads["total"], pairReadSuffix)
165         else:
166             rdsEntryName = read.qname
167
168         if processedEntryDict.has_key(readName):
169             if isSpliceEntry(read.cigar):
170                 if spliceReadDict.has_key(readName):
171                     del spliceReadDict[readName]
172             else:
173                 if uniqueReadDict.has_key(readName):
174                     del uniqueReadDict[readName]
175
176                 if multiReadDict.has_key(readName):
177                     (read, priorCount, rdsEntryName) = multiReadDict[readName]
178                     count = priorCount + 1
179                     multiReadDict[readName] = (read, count, rdsEntryName)
180                 else:
181                     multiReadDict[readName] = (read, 1, rdsEntryName)
182         else:
183             processedEntryDict[readName] = ""
184             if isSpliceEntry(read.cigar):
185                 spliceReadDict[readName] = (read,rdsEntryName)
186             else:
187                 uniqueReadDict[readName] = (read, rdsEntryName)
188
189         if countReads["total"] % insertSize == 0:
190             for entry in uniqueReadDict.keys():
191                 (readData, rdsEntryName) = uniqueReadDict[entry]
192                 chrom = samfile.getrname(readData.rname)
193                 uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
194                 countReads["unique"] += 1
195
196             for entry in spliceReadDict.keys():
197                 (readData, rdsEntryName) = spliceReadDict[entry]
198                 chrom = samfile.getrname(readData.rname)
199                 spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
200                 countReads["splice"] += 1
201
202             for entry in multiReadDict.keys():
203                 (readData, count, rdsEntryName) = multiReadDict[entry]
204                 chrom = samfile.getrname(readData.rname)
205                 if count > maxMultiReadCount:
206                     countReads["multiDiscard"] += 1
207                 else:
208                     multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count)) 
209                     countReads["multi"] += 1
210
211             rds.insertUniqs(uniqueInsertList)
212             rds.insertMulti(multiInsertList)
213             uniqueInsertList = []
214             uniqueReadDict = {}
215             multiInsertList = []
216             multiReadDict = {}
217             if dataType == "RNA":
218                 rds.insertSplices(spliceInsertList)
219                 spliceInsertList = []
220                 spliceReadDict = {}
221
222             print ".",
223             sys.stdout.flush()
224             processedEntryDict = {}
225
226         countReads["total"] += 1
227
228     if len(uniqueReadDict.keys()) > 0:
229         for entry in uniqueReadDict.keys():
230             (readData, rdsEntryName) = uniqueReadDict[entry]
231             chrom = samfile.getrname(readData.rname)
232             uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
233             countReads["unique"] += 1
234
235         rds.insertUniqs(uniqueInsertList)
236
237     if len(multiReadDict.keys()) > 0:
238         for entry in multiReadDict.keys():
239             (readData, count, rdsEntryName) = multiReadDict[entry]
240             chrom = samfile.getrname(readData.rname)
241             if count > maxMultiReadCount:
242                 countReads["multiDiscard"] += 1
243             else:
244                 multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
245                 countReads["multi"] += 1
246
247         countReads["multi"] += len(multiInsertList)
248
249     if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
250         for entry in spliceReadDict.keys():
251             (readData, rdsEntryName) = spliceReadDict[entry]
252             chrom = samfile.getrname(readData.rname)
253             spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
254             countReads["splice"] += 1
255
256         rds.insertSplices(spliceInsertList)
257
258     countString = "\n%d unmapped reads discarded" % countReads["unmapped"]
259     countString += "\t%d unique reads" % countReads["unique"]
260     countString += "\t%d multi reads" % countReads["multi"]
261     countString += "\t%d multi reads count > %d discarded" % (countReads["multiDiscard"], maxMultiReadCount)
262     if dataType == "RNA":
263         countString += "\t%d spliced reads" % countReads["splice"]
264
265     print countString.replace("\t", "\n")
266
267     writeLog("%s.log" % outDbName, verstring, countString)
268
269     if doIndex:
270         print "building index...."
271         if cachePages > defaultCacheSize:
272             rds.setDBcache(cachePages)
273             rds.buildIndex(cachePages)
274         else:
275             rds.buildIndex(defaultCacheSize)
276
277
def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
    """Build one RDS read tuple from an aligned read.

    Returns (readName, chrom, start, stop, sense, 1/weight, flag, mismatches);
    the mismatch string is empty when the read carries no MD tag.
    """
    startPosition = int(alignedRead.pos)
    stopPosition = int(startPosition + readSize)
    strand = getReadSense(alignedRead.is_reverse)

    try:
        mdTag = alignedRead.opt("MD")
        mismatchString = getMismatches(mdTag, alignedRead.seq, strand)
    except KeyError:
        # no MD tag present (or an unexpected base during complementing)
        mismatchString = ""

    return (readName, chrom, startPosition, stopPosition, strand, 1.0/weight, '', mismatchString)
289
290
def getRDSSpliceEntry(alignedRead, readName, chrom, readSize):
    """Build one RDS splice tuple from a spliced aligned read.

    Returns (readName, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches)
    using the bounds of the first ref-skip in the CIGAR.
    """
    plainEntry = getRDSEntry(alignedRead, readName, chrom, readSize)
    (entryName, entryChrom, start, stop, sense, weight, flag, mismatches) = plainEntry
    (startL, startR, stopL, stopR) = getSpliceBounds(start, readSize, alignedRead.cigar)

    return (entryName, entryChrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches)
296
297
def getPairedReadNumberSuffix(read):
    """Return "/1" or "/2" for a properly paired read, otherwise ""."""
    if not isPairedRead(read):
        return ""

    if read.is_read1:
        return "/1"

    if read.is_read2:
        return "/2"

    return ""
309
310
def isPairedRead(read):
    """A read counts as paired only when it is properly paired AND flagged
    as either the first or the second mate.
    """
    if not read.is_proper_pair:
        return False

    return read.is_read1 or read.is_read2
313
314
def isSpliceEntry(cigarTupleList):
    """True when the CIGAR contains an N operation (code 3, ref-skip),
    which marks a spliced alignment.
    """
    for (operation, operationLength) in cigarTupleList:
        if operation == 3:
            return True

    return False
323
324
def getReadSense(reverse):
    """Map pysam's is_reverse flag to the RDS strand character."""
    return "-" if reverse else "+"
332
333
def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
    """Convert a SAM MD tag into an eland-style mismatch string.

    Each mismatch is emitted as <refBase><1-basedPosition><readBase>, joined
    by commas.  Deletion entries ("^...") in the MD tag are skipped.  For a
    "-" sense read both bases are complemented.  When the read sequence is
    shorter than the MD tag implies, "" is returned (and the error is logged
    when logErrors is True).

    mismatchTag   -- value of the MD tag, e.g. "10A25"
    querySequence -- read sequence; "N" is reported when empty
    sense         -- "+" or "-"
    logErrors     -- write IndexErrors to MakeRdsFromBamError.log
    """
    output = []
    deletionMarker = "^"
    position = 0

    # FIX: use str methods instead of the Python-2-only string-module
    # functions (string.find / string.join), and raw regex strings
    lengths = re.findall(r"\d+", mismatchTag)
    mismatchSequences = re.findall(r"\d+([ACGTN]|\^[ACGTN]+)", mismatchTag)

    for mismatchEntry in range(len(mismatchSequences)):
        mismatch = mismatchSequences[mismatchEntry]
        position = position + int(lengths[mismatchEntry])
        if mismatch.startswith(deletionMarker):
            # deleted reference bases have no corresponding read base
            continue

        try:
            if querySequence:
                genomicNucleotide = querySequence[position]
            else:
                genomicNucleotide = "N"

            if sense == "-":
                mismatch = getComplementNucleotide(mismatch)
                genomicNucleotide = getComplementNucleotide(genomicNucleotide)

            elandCompatiblePosition = int(position + 1)
            output.append("%s%d%s" % (mismatch, elandCompatiblePosition, genomicNucleotide))
            position += 1
        except IndexError:
            if logErrors:
                errorMessage = "getMismatch IndexError; tag: %s, seq: %s, pos: %d" % (mismatchTag, querySequence, position)
                writeLog("MakeRdsFromBamError.log", "1.0", errorMessage)

            return ""

    return ",".join(output)
369
370
def getComplementNucleotide(nucleotide):
    """Return the Watson-Crick complement of one nucleotide; N maps to N.

    Raises KeyError for any character outside ACGTN.
    """
    complementTable = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}

    return complementTable[nucleotide]
380
381
def getSpliceBounds(start, readsize, cigarTupleList):
    """Return (startL, startR, stopL, stopR) splice bounds based on the
    first N (ref-skip, code 3) operation in the CIGAR.

    Implicitly returns None when the CIGAR contains no ref-skip.
    """
    stopR = int(start + readsize)
    offsetIntoRead = 0

    for (operation, operationLength) in cigarTupleList:
        if operation != 3:
            offsetIntoRead += operationLength
            continue

        stopL = int(start + offsetIntoRead)
        startR = int(stopL + operationLength)

        return start, startR, stopL, stopR
394
395
# script entry point
if __name__ == "__main__":
    main(sys.argv)