erange version 4.0a dev release

[erange.git] / MakeBamFromRds.py
diff --git a/MakeBamFromRds.py b/MakeBamFromRds.py

index 935a04e4bfe973ba9250dbca1c90f12f03c17d67..bfb48fde3fc0bebc9bdceaa1dc16f83dd566da59 100644 (file)
--- a/MakeBamFromRds.py
+++ b/MakeBamFromRds.py
@@ -17,37 +17,24 @@ import sys
  import re
  import optparse
  import random
+import string
  import pysam
-from commoncode import readDataset
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
  
  
  def main(argv=None):
      if not argv:
          argv = sys.argv
  
-    verstring = "MakeBamFromRds: version 1.0"
+    verstring = "makeBamFromRds: version 1.0"
      print verstring
  
      doPairs = False
      
      usage = "usage: python %prog rdsFile bamFile [options]"
  
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--nouniq", action="store_false", dest="withUniqs")
-    parser.add_option("--nomulti", action="store_false", dest="withMulti")
-    parser.add_option("--splices", action="store_true", dest="doSplices")
-    parser.add_option("--flag", dest="withFlag")
-    parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
-    parser.add_option("--pairs", action="store_true", dest="doPairs")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
-    parser.add_option("--chrom", action="append", dest="chromList")
-    parser.add_option("--fasta", dest="fastaFileName")
-    parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False,
-                        doPairs=False, withFlag="", useFlagLike=False, enforceChr=False,
-                        doCache=False, cachePages=100000, fastaFileName="",
-                        chromList=[])
-
+    parser = getParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      if len(args) < 2:
@@ -67,6 +54,40 @@ def main(argv=None):
                       options.chromList, options.fastaFileName)
  
  
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--nouniq", action="store_false", dest="withUniqs")
+    parser.add_option("--nomulti", action="store_false", dest="withMulti")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--flag", dest="withFlag")
+    parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
+    parser.add_option("--pairs", action="store_true", dest="doPairs")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
+    parser.add_option("--chrom", action="append", dest="chromList")
+    parser.add_option("--fasta", dest="fastaFileName")
+
+    configParser = getConfigParser()
+    section = "MakeBamFromRds"
+    withUniqs = getConfigBoolOption(configParser, section, "withUniqs", True)
+    withMulti = getConfigBoolOption(configParser, section, "withMulti", True)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    doPairs = getConfigBoolOption(configParser, section, "doPairs", False)
+    withFlag = getConfigOption(configParser, section, "withFlag", "")
+    useFlagLike = getConfigBoolOption(configParser, section, "useFlagLike", False)
+    enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+    fastaFileName = getConfigOption(configParser, section, "fastaFileName", "")
+
+    parser.set_defaults(withUniqs=withUniqs, withMulti=withMulti, doSplices=doSplices,
+                        doPairs=doPairs, withFlag=withFlag, useFlagLike=useFlagLike, enforceChr=enforceChr,
+                        doCache=doCache, cachePages=cachePages, fastaFileName=fastaFileName,
+                        chromList=[])
+
+    return parser
+
+
  def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
                       doSplices=False, doPairs=False, withFlag="",
                       useFlagLike=False, enforceChr=False, allChrom=True,
@@ -77,7 +98,7 @@ def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
          sys.exit(1)
  
      print "\nsample:"
-    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
  
      if cachePages > RDS.getDefaultCacheSize():
          RDS.setDBcache(cachePages)
@@ -120,42 +141,43 @@ def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
      outfile = pysam.Samfile(outfilename, "wb", header=header)
  
      totalWrites = 0
-    noncanonicalSplices = 0
+    noncanonicalSpliceCount = 0
      for chrom in chromList:
          index = 0
          print "chromosome %s" % (chrom)
          if withUniqs or withMulti:
              hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
-                                       withPairID=doPairs, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
-                                       flagLike=useFlagLike, entryDict=True)
+                                       doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
+                                       flagLike=useFlagLike, withMismatch=True)
  
              for read in hitDict[chrom]:
                  writeBAMEntry(outfile, chrom, read, readlength)
                  index += 1
  
          if doSplices:
-            numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict)
+            numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict)
              index += numSpliceReadsWritten
-            noncanonicalSplices += noncanonical
+            noncanonicalSpliceCount += noncanonical
  
          print index
          totalWrites += index
  
      outfile.close()
      print "%d total reads written" % totalWrites
-    print "%d non-canonical splices" % noncanonicalSplices
+    print "%d non-canonical splices" % noncanonicalSpliceCount
  
  
-def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict={}):
+def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict={}):
      index = 0
      noncanonicalSplices = 0
-    spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, entryDict=True, withWeight=True)
+    spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, withWeight=True,
+                                    withMismatch=True)
      if chrom not in spliceDict:
          pass
      else:
          for read in spliceDict[chrom]:
              if fastaSequenceDict.has_key(chrom):
-                read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], chrom, read["startR"], read["stopL"], read["sense"])
+                read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], read["startR"], read["stopL"], read["sense"])
                  noncanonicalSplices += noncanonical
  
              writeBAMEntry(outfile, chrom, read, readlength)
@@ -165,28 +187,37 @@ def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, f
  
  
  def writeBAMEntry(outfile, chrom, outputDict, readlength):
+    """ We need to subtract 1 from the position because rds is 1 based and
+        most of the rest of the entire world is 0 based.
+    """
      tagList = []
      alignedRead = pysam.AlignedRead()
-    alignedRead.qname = outputDict["readID"]
+    try:
+        (readID, pairID) = outputDict["readID"].split("/")
+        paired = True
+    except ValueError:
+        readID = outputDict["readID"]
+        paired = False
+
+    alignedRead.qname = readID
      if outputDict["sense"] == "-":
          alignedRead.is_reverse = True
  
      alignedRead.rname = outfile.references.index(chrom)
  
      if outputDict.has_key("startL"):
-        startL = outputDict["startL"]
-        stopL = outputDict["stopL"]
-        startR = outputDict["startR"]
-        stopR = outputDict["stopR"]
+        startL = outputDict["startL"] - 1
+        stopL = outputDict["stopL"] - 1
+        startR = outputDict["startR"] - 1
+        stopR = outputDict["stopR"] - 1
          alignedRead.pos = startL
-        alignedRead.cigar = [(0,stopL - startL + 1), (3, startR - stopL - 1), (0, stopR - startR + 1)]
-        tagList.append(("XS", outputDict["sense"]))
+        alignedRead.cigar = [(0,stopL - startL), (3, startR - stopL), (0, stopR - startR)]
+        tagList.append(("XS", str(outputDict["sense"])))
      else:
-        alignedRead.pos = outputDict["start"]
+        alignedRead.pos = outputDict["start"] - 1
          alignedRead.cigar = [(0, readlength)]
  
-    if outputDict.has_key("pairID"):
-        pairID = outputDict["pairID"]
+    if paired:
          if pairID == "1":
              alignedRead.is_read1 = True
              alignedRead.is_proper_pair = True
@@ -199,20 +230,22 @@ def writeBAMEntry(outfile, chrom, outputDict, readlength):
      if outputDict.has_key("mismatch"):
          mismatchTag = getMismatches(outputDict["mismatch"])
          if mismatchTag:
-            tagList.append(("MD", mismatchTag))
-    
+            tagList.append(("MD", str(mismatchTag)))
+
      if tagList:
-        alignedRead.tags = tagList
+        alignedRead.tags = tuple(tagList)
  
      outfile.write(alignedRead)
  
  
  def getMismatches(mismatchString):
-    mismatch = ""
+    mismatchList = []
      positions = re.findall("\d+", mismatchString)
      nucleotides = re.findall("([ACGTN])\d+", mismatchString)
      for index in range(0, len(positions)):
-        mismatch = "%s%s%s" % (mismatch, positions[index], nucleotides[index])
+        mismatchList.append("%s%s" % (positions[index], nucleotides[index]))
+
+    mismatch = string.join(mismatchList, "")
  
      return mismatch
  
@@ -250,7 +283,7 @@ def getFastaSequenceDictionary(fastaFileName):
      return fastaSeqDict
  
  
-def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""):
+def fixSpliceSense(fastaSequence, startRight, stopLeft, sense=""):
      spliceSense = {"GTAG": "+",
                     "GCAG": "+",
                     "ATAC": "+",
@@ -264,8 +297,9 @@ def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""):
      intronlen = startRight - stopLeft
      leftJunctionSig =fastaSequence[intronstart:intronstart+2]
      rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen]
-    spliceJunction = leftJunctionSig + rightJunctionSig
+    spliceJunction = string.join([leftJunctionSig, rightJunctionSig], "")
      spliceJunction = spliceJunction.upper()
+    print spliceJunction
      if spliceSense.has_key(spliceJunction):
          sense = spliceSense[spliceJunction]
      else: