first pass cleanup of cistematic/genomes; change bamPreprocessing

[erange.git] / MakeRdsFromBam.py
diff --git a/MakeRdsFromBam.py b/MakeRdsFromBam.py

index e9df847ab167825f972b670690c953d443eff806..b16bccd2da0fae8b7e6ed85c20237286e9623823 100644 (file)
--- a/MakeRdsFromBam.py
+++ b/MakeRdsFromBam.py
@@ -12,11 +12,16 @@ try:
  except:
      pass
  
-import sys, string, optparse, re
+import sys
+import string
+import optparse
+import re
  import pysam
-from commoncode import readDataset, writeLog
+from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption, getReverseComplement
+import ReadDataset
  
-verstring = "%prog: version 1.0"
+INSERT_SIZE = 100000
+verstring = "makeRdsFromBam: version 1.1"
  
  
  def main(argv=None):
@@ -28,24 +33,7 @@ def main(argv=None):
      usage = "usage:  %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
              \ninput reads must be sorted to properly record multireads"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--append", action="store_false", dest="init",
-                      help="append to existing rds file [default: create new]")
-    parser.add_option("--RNA", action="store_true", dest="rnaDataType",
-                      help="set data type to RNA [default: DNA]")
-    parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
-                      help="input file is in sam format")
-    parser.add_option("--index", action="store_true", dest="doIndex",
-                      help="index the output rds file")
-    parser.add_option("--cache", type="int", dest="cachePages",
-                      help="number of cache pages to use [default: 100000")
-    parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
-                      help="multi counts over this value are discarded [default: 10]")
-    parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
-                      help="use the raw read names")
-    parser.set_defaults(init=True, doIndex=False, useSamFile=False, cachePages=100000,
-                        maxMultiReadCount=10, rnaDataType=False, trimReadID=True)
-
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      try:
@@ -66,32 +54,58 @@ def main(argv=None):
          print "no outrdsfile specified - see --help for usage"
          sys.exit(1)
  
+    if options.rnaDataType:
+        dataType = "RNA"
+    else:
+        dataType = "DNA"
+
      makeRdsFromBam(label, samFileName, outDbName, options.init, options.doIndex, options.useSamFile,
-                   options.cachePages, options.maxMultiReadCount, options.rnaDataType, options.trimReadID)
+                   options.cachePages, options.maxMultiReadCount, dataType, options.trimReadID)
+
+
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--append", action="store_false", dest="init",
+                      help="append to existing rds file [default: create new]")
+    parser.add_option("--RNA", action="store_true", dest="rnaDataType",
+                      help="set data type to RNA [default: DNA]")
+    parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
+                      help="input file is in sam format")
+    parser.add_option("--index", action="store_true", dest="doIndex",
+                      help="index the output rds file")
+    parser.add_option("--cache", type="int", dest="cachePages",
+                      help="number of cache pages to use [default: 100000")
+    parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
+                      help="multi counts over this value are discarded [default: 10]")
+    parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
+                      help="use the raw read names")
+
+    configParser = getConfigParser()
+    section = "makeRdsFromBam"
+    init = getConfigBoolOption(configParser, section, "init", True)
+    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
+    useSamFile = getConfigBoolOption(configParser, section, "useSamFile", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+    maxMultiReadCount = getConfigIntOption(configParser, section, "maxMultiReadCount", 10)
+    rnaDataType = getConfigBoolOption(configParser, section, "rnaDataType", False)
+    trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
+
+    parser.set_defaults(init=init, doIndex=doIndex, useSamFile=useSamFile, cachePages=cachePages,
+                        maxMultiReadCount=maxMultiReadCount, rnaDataType=rnaDataType, trimReadID=trimReadID)
+
+    return parser
  
  
  def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
-                   cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True):
+                   cachePages=100000, maxMultiReadCount=10, dataType="DNA", trimReadID=True):
  
      if useSamFile:
          fileMode = "r"
      else:
          fileMode = "rb"
  
-    try:
-        samfile = pysam.Samfile(samFileName, fileMode)
-    except ValueError:
-        print "samfile index not found"
-        sys.exit(1)
-
-    if rnaDataType:
-        dataType = "RNA"
-    else:
-        dataType = "DNA"
-
      writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))
-
-    rds = readDataset(outDbName, init, dataType, verbose=True)
+    rds = ReadDataset.ReadDataset(outDbName, init, dataType, verbose=True)
      if not init and doIndex:
          try:
              if rds.hasIndex():
@@ -119,94 +133,97 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
      if len(propertyList) > 0:
          rds.insertMetadata(propertyList)
  
-    countReads = {"unmapped": 0,
-                  "total": 0,
-                  "unique": 0,
-                  "multi": 0,
-                  "multiDiscard": 0,
-                  "splice": 0
+    totalReadCounts = {"unmapped": 0,
+                       "total": 0,
+                       "unique": 0,
+                       "multi": 0,
+                       "multiDiscard": 0,
+                       "splice": 0,
+                       "multisplice": 0
      }
  
      readsize = 0
-    insertSize = 100000
  
      uniqueInsertList = []
      multiInsertList = []
      spliceInsertList = []
+    multispliceInsertList = []
  
-    processedEntryDict = {}
      uniqueReadDict = {}
      multiReadDict = {}
+    multispliceReadDict = {}
      spliceReadDict = {}
+    multireadCounts = getMultiReadIDCounts(samFileName, fileMode)
  
-    samFileIterator = samfile.fetch(until_eof=True)
+    for readID in multireadCounts:
+        if multireadCounts[readID] > maxMultiReadCount:
+            totalReadCounts["multiDiscard"] += 1
+
+    try:
+        samfile = pysam.Samfile(samFileName, fileMode)
+    except ValueError:
+        print "samfile index not found"
+        sys.exit(1)
  
+    samFileIterator = samfile.fetch(until_eof=True)
      for read in samFileIterator:
          if read.is_unmapped:
-            countReads["unmapped"] += 1
+            totalReadCounts["unmapped"] += 1
              continue
  
          if readsize == 0:
-            take = (0, 2, 3) # CIGAR operation (M/match, D/del, N/ref_skip)
+            take = (0, 1) # CIGAR operation (M/match, I/insertion)
              readsize = sum([length for op,length in read.cigar if op in take])
              if init:
                  rds.insertMetadata([("readsize", readsize)])
  
-        #Build the read dictionaries
-        try:
-            readSequence = read.seq
-        except KeyError:
-            readSequence = ""
-
          pairReadSuffix = getPairedReadNumberSuffix(read)
-        readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
+        readName = "%s%s" % (read.qname, pairReadSuffix)
          if trimReadID:
-            rdsEntryName = "%s:%s:%d%s" % (label, read.qname, countReads["total"], pairReadSuffix)
+            rdsEntryName = "%s:%s:%d%s" % (label, read.qname, totalReadCounts["total"], pairReadSuffix)
          else:
              rdsEntryName = read.qname
  
-        if processedEntryDict.has_key(readName):
-            if isSpliceEntry(read.cigar):
-                if spliceReadDict.has_key(readName):
-                    del spliceReadDict[readName]
-            else:
-                if uniqueReadDict.has_key(readName):
-                    del uniqueReadDict[readName]
-
-                if multiReadDict.has_key(readName):
-                    (read, priorCount, rdsEntryName) = multiReadDict[readName]
-                    count = priorCount + 1
-                    multiReadDict[readName] = (read, count, rdsEntryName)
-                else:
-                    multiReadDict[readName] = (read, 1, rdsEntryName)
-        else:
-            processedEntryDict[readName] = ""
+        try:
+            count = multireadCounts[readName]
+        except KeyError:
+            count = 1
+
+        if count == 1:
              if isSpliceEntry(read.cigar):
                  spliceReadDict[readName] = (read,rdsEntryName)
              else:
                  uniqueReadDict[readName] = (read, rdsEntryName)
+        elif count <= maxMultiReadCount:
+            if isSpliceEntry(read.cigar):
+                multispliceReadDict[readName] = (read, count, rdsEntryName)
+            else:
+                multiReadDict[readName] = (read, count, rdsEntryName)
  
-        if countReads["total"] % insertSize == 0:
+        if totalReadCounts["total"] % INSERT_SIZE == 0:
              for entry in uniqueReadDict.keys():
                  (readData, rdsEntryName) = uniqueReadDict[entry]
                  chrom = samfile.getrname(readData.rname)
                  uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
-                countReads["unique"] += 1
-
-            for entry in spliceReadDict.keys():
-                (readData, rdsEntryName) = spliceReadDict[entry]
-                chrom = samfile.getrname(readData.rname)
-                spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
-                countReads["splice"] += 1
+                totalReadCounts["unique"] += 1
  
              for entry in multiReadDict.keys():
                  (readData, count, rdsEntryName) = multiReadDict[entry]
                  chrom = samfile.getrname(readData.rname)
-                if count > maxMultiReadCount:
-                    countReads["multiDiscard"] += 1
-                else:
-                    multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count)) 
-                    countReads["multi"] += 1
+                multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count)) 
+
+            if dataType == "RNA":
+                for entry in spliceReadDict.keys():
+                    (readData, rdsEntryName) = spliceReadDict[entry]
+                    chrom = samfile.getrname(readData.rname)
+                    spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
+                    totalReadCounts["splice"] += 1
+
+                for entry in multispliceReadDict.keys():
+                    (readData, count, rdsEntryName) = multispliceReadDict[entry]
+                    chrom = samfile.getrname(readData.rname)
+                    multispliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize, weight=count))
+                    totalReadCounts["multisplice"] += 1
  
              rds.insertUniqs(uniqueInsertList)
              rds.insertMulti(multiInsertList)
@@ -218,19 +235,21 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
                  rds.insertSplices(spliceInsertList)
                  spliceInsertList = []
                  spliceReadDict = {}
+                rds.insertMultisplices(multispliceInsertList)
+                multispliceInsertList = []
+                multispliceReadDict = {}
  
              print ".",
              sys.stdout.flush()
-            processedEntryDict = {}
  
-        countReads["total"] += 1
+        totalReadCounts["total"] += 1
  
      if len(uniqueReadDict.keys()) > 0:
          for entry in uniqueReadDict.keys():
              (readData, rdsEntryName) = uniqueReadDict[entry]
              chrom = samfile.getrname(readData.rname)
              uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
-            countReads["unique"] += 1
+            totalReadCounts["unique"] += 1
  
          rds.insertUniqs(uniqueInsertList)
  
@@ -238,33 +257,40 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
          for entry in multiReadDict.keys():
              (readData, count, rdsEntryName) = multiReadDict[entry]
              chrom = samfile.getrname(readData.rname)
-            if count > maxMultiReadCount:
-                countReads["multiDiscard"] += 1
-            else:
-                multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
-                countReads["multi"] += 1
+            multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
  
-        countReads["multi"] += len(multiInsertList)
+        rds.insertMulti(multiInsertList)
  
      if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
          for entry in spliceReadDict.keys():
              (readData, rdsEntryName) = spliceReadDict[entry]
              chrom = samfile.getrname(readData.rname)
              spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
-            countReads["splice"] += 1
+            totalReadCounts["splice"] += 1
  
          rds.insertSplices(spliceInsertList)
  
-    countString = "\n%d unmapped reads discarded" % countReads["unmapped"]
-    countString += "\t%d unique reads" % countReads["unique"]
-    countString += "\t%d multi reads" % countReads["multi"]
-    countString += "\t%d multi reads count > %d discarded" % (countReads["multiDiscard"], maxMultiReadCount)
-    if dataType == "RNA":
-        countString += "\t%d spliced reads" % countReads["splice"]
+    if len(multispliceReadDict.keys()) > 0 and dataType == "RNA":
+        for entry in multispliceReadDict.keys():
+            (readData, count, rdsEntryName) = multispliceReadDict[entry]
+            chrom = samfile.getrname(readData.rname)
+            multispliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize, weight=count))
+            totalReadCounts["multisplice"] += 1
+
+        rds.insertMultisplices(multispliceInsertList)
  
-    print countString.replace("\t", "\n")
+    totalReadCounts["multi"] = len(multireadCounts) - totalReadCounts["multiDiscard"] - totalReadCounts["multisplice"]
+    countStringList = ["\n%d unmapped reads discarded" % totalReadCounts["unmapped"]]
+    countStringList.append("%d unique reads" % totalReadCounts["unique"])
+    countStringList.append("%d multi reads" % totalReadCounts["multi"])
+    countStringList.append("%d multi reads count > %d discarded" % (totalReadCounts["multiDiscard"], maxMultiReadCount))
+    if dataType == "RNA":
+        countStringList.append("%d spliced reads" % totalReadCounts["splice"])
+        countStringList.append("%d spliced multireads" % totalReadCounts["multisplice"])
  
-    writeLog("%s.log" % outDbName, verstring, countString)
+    print string.join(countStringList, "\n")
+    outputCountText = string.join(countStringList, "\t")
+    writeLog("%s.log" % outDbName, verstring, outputCountText)
  
      if doIndex:
          print "building index...."
@@ -275,9 +301,32 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
              rds.buildIndex(defaultCacheSize)
  
  
+def getMultiReadIDCounts(samFileName, fileMode):
+    try:
+        samfile = pysam.Samfile(samFileName, fileMode)
+    except ValueError:
+        print "samfile index not found"
+        sys.exit(1)
+
+    readIDCounts = {}
+    for read in samfile.fetch(until_eof=True):
+        pairReadSuffix = getPairedReadNumberSuffix(read)
+        readName = "%s%s" % (read.qname, pairReadSuffix)
+        try:
+            readIDCounts[readName] += 1
+        except KeyError:
+            readIDCounts[readName] = 1
+
+    for readID in readIDCounts.keys():
+        if readIDCounts[readID] == 1:
+            del readIDCounts[readID]
+
+    return readIDCounts
+
+
  def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
      start = int(alignedRead.pos)
-    stop = int(start+readSize)
+    stop = int(start + readSize)
      sense = getReadSense(alignedRead.is_reverse)
      try:
          mismatchTag = alignedRead.opt("MD")
@@ -288,11 +337,11 @@ def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
      return (readName, chrom, start, stop, sense, 1.0/weight, '', mismatches)
  
  
-def getRDSSpliceEntry(alignedRead, readName, chrom, readSize):
-    (readName, chrom, start, stop, sense, weight, flag, mismatches) = getRDSEntry(alignedRead, readName, chrom, readSize)
+def getRDSSpliceEntry(alignedRead, readName, chrom, readSize, weight=1):
+    (readName, chrom, start, stop, sense, weight, flag, mismatches) = getRDSEntry(alignedRead, readName, chrom, readSize, weight)
      startL, startR, stopL, stopR = getSpliceBounds(start, readSize, alignedRead.cigar)
      
-    return (readName, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches)
+    return (readName, chrom, startL, stopL, startR, stopR, sense, weight, "", mismatches)
  
  
  def getPairedReadNumberSuffix(read):
@@ -352,11 +401,11 @@ def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
                  genomicNucleotide = "N"
  
              if sense == "-":
-                mismatch = getComplementNucleotide(mismatch)
-                genomicNucleotide  = getComplementNucleotide(genomicNucleotide)
+                mismatch = getReverseComplement(mismatch)
+                genomicNucleotide  = getReverseComplement(genomicNucleotide)
  
-            elandCompatiblePosition = int(position + 1)
-            output.append("%s%d%s" % (mismatch, elandCompatiblePosition, genomicNucleotide))
+            erange1BasedElandCompatiblePosition = int(position + 1)
+            output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
              position += 1
          except IndexError:
              if logErrors:
@@ -368,25 +417,14 @@ def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
      return string.join(output, ",")
  
  
-def getComplementNucleotide(nucleotide):
-    complement = {"A": "T",
-                  "T": "A",
-                  "C": "G",
-                  "G": "C",
-                  "N": "N"
-    }
-
-    return complement[nucleotide]
-
-
  def getSpliceBounds(start, readsize, cigarTupleList):
-    stopR = int(start + readsize)
      offset = 0
  
      for operation,length in cigarTupleList:
          if operation == 3:
              stopL = int(start + offset)
              startR = int(stopL + length)
+            stopR = int(startR + readsize - offset)
  
              return start, startR, stopL, stopR
          else: