erange version 4.0a dev release
[erange.git] / MakeBamFromRds.py
index 935a04e4bfe973ba9250dbca1c90f12f03c17d67..bfb48fde3fc0bebc9bdceaa1dc16f83dd566da59 100644 (file)
@@ -17,37 +17,24 @@ import sys
 import re
 import optparse
 import random
+import string
 import pysam
-from commoncode import readDataset
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
 
 
 def main(argv=None):
     if not argv:
         argv = sys.argv
 
-    verstring = "MakeBamFromRds: version 1.0"
+    verstring = "makeBamFromRds: version 1.0"
     print verstring
 
     doPairs = False
     
     usage = "usage: python %prog rdsFile bamFile [options]"
 
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--nouniq", action="store_false", dest="withUniqs")
-    parser.add_option("--nomulti", action="store_false", dest="withMulti")
-    parser.add_option("--splices", action="store_true", dest="doSplices")
-    parser.add_option("--flag", dest="withFlag")
-    parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
-    parser.add_option("--pairs", action="store_true", dest="doPairs")
-    parser.add_option("--cache", type="int", dest="cachePages")
-    parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
-    parser.add_option("--chrom", action="append", dest="chromList")
-    parser.add_option("--fasta", dest="fastaFileName")
-    parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False,
-                        doPairs=False, withFlag="", useFlagLike=False, enforceChr=False,
-                        doCache=False, cachePages=100000, fastaFileName="",
-                        chromList=[])
-
+    parser = getParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
     if len(args) < 2:
@@ -67,6 +54,40 @@ def main(argv=None):
                      options.chromList, options.fastaFileName)
 
 
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--nouniq", action="store_false", dest="withUniqs")
+    parser.add_option("--nomulti", action="store_false", dest="withMulti")
+    parser.add_option("--splices", action="store_true", dest="doSplices")
+    parser.add_option("--flag", dest="withFlag")
+    parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
+    parser.add_option("--pairs", action="store_true", dest="doPairs")
+    parser.add_option("--cache", type="int", dest="cachePages")
+    parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
+    parser.add_option("--chrom", action="append", dest="chromList")
+    parser.add_option("--fasta", dest="fastaFileName")
+
+    configParser = getConfigParser()
+    section = "MakeBamFromRds"
+    withUniqs = getConfigBoolOption(configParser, section, "withUniqs", True)
+    withMulti = getConfigBoolOption(configParser, section, "withMulti", True)
+    doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+    doPairs = getConfigBoolOption(configParser, section, "doPairs", False)
+    withFlag = getConfigOption(configParser, section, "withFlag", "")
+    useFlagLike = getConfigBoolOption(configParser, section, "useFlagLike", False)
+    enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
+    doCache = getConfigBoolOption(configParser, section, "doCache", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+    fastaFileName = getConfigOption(configParser, section, "fastaFileName", "")
+
+    parser.set_defaults(withUniqs=withUniqs, withMulti=withMulti, doSplices=doSplices,
+                        doPairs=doPairs, withFlag=withFlag, useFlagLike=useFlagLike, enforceChr=enforceChr,
+                        doCache=doCache, cachePages=cachePages, fastaFileName=fastaFileName,
+                        chromList=[])
+
+    return parser
+
+
 def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
                      doSplices=False, doPairs=False, withFlag="",
                      useFlagLike=False, enforceChr=False, allChrom=True,
@@ -77,7 +98,7 @@ def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
         sys.exit(1)
 
     print "\nsample:"
-    RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
 
     if cachePages > RDS.getDefaultCacheSize():
         RDS.setDBcache(cachePages)
@@ -120,42 +141,43 @@ def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
     outfile = pysam.Samfile(outfilename, "wb", header=header)
 
     totalWrites = 0
-    noncanonicalSplices = 0
+    noncanonicalSpliceCount = 0
     for chrom in chromList:
         index = 0
         print "chromosome %s" % (chrom)
         if withUniqs or withMulti:
             hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
-                                       withPairID=doPairs, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
-                                       flagLike=useFlagLike, entryDict=True)
+                                       doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
+                                       flagLike=useFlagLike, withMismatch=True)
 
             for read in hitDict[chrom]:
                 writeBAMEntry(outfile, chrom, read, readlength)
                 index += 1
 
         if doSplices:
-            numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict)
+            numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict)
             index += numSpliceReadsWritten
-            noncanonicalSplices += noncanonical
+            noncanonicalSpliceCount += noncanonical
 
         print index
         totalWrites += index
 
     outfile.close()
     print "%d total reads written" % totalWrites
-    print "%d non-canonical splices" % noncanonicalSplices
+    print "%d non-canonical splices" % noncanonicalSpliceCount
 
 
-def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict={}):
+def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict={}):
     index = 0
     noncanonicalSplices = 0
-    spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, entryDict=True, withWeight=True)
+    spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, withWeight=True,
+                                    withMismatch=True)
     if chrom not in spliceDict:
         pass
     else:
         for read in spliceDict[chrom]:
             if fastaSequenceDict.has_key(chrom):
-                read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], chrom, read["startR"], read["stopL"], read["sense"])
+                read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], read["startR"], read["stopL"], read["sense"])
                 noncanonicalSplices += noncanonical
 
             writeBAMEntry(outfile, chrom, read, readlength)
@@ -165,28 +187,37 @@ def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, f
 
 
 def writeBAMEntry(outfile, chrom, outputDict, readlength):
+    """ We need to subtract 1 from the position because rds is 1 based and
+        most of the rest of the entire world is 0 based.
+    """
     tagList = []
     alignedRead = pysam.AlignedRead()
-    alignedRead.qname = outputDict["readID"]
+    try:
+        (readID, pairID) = outputDict["readID"].split("/")
+        paired = True
+    except ValueError:
+        readID = outputDict["readID"]
+        paired = False
+
+    alignedRead.qname = readID
     if outputDict["sense"] == "-":
         alignedRead.is_reverse = True
 
     alignedRead.rname = outfile.references.index(chrom)
 
     if outputDict.has_key("startL"):
-        startL = outputDict["startL"]
-        stopL = outputDict["stopL"]
-        startR = outputDict["startR"]
-        stopR = outputDict["stopR"]
+        startL = outputDict["startL"] - 1
+        stopL = outputDict["stopL"] - 1
+        startR = outputDict["startR"] - 1
+        stopR = outputDict["stopR"] - 1
         alignedRead.pos = startL
-        alignedRead.cigar = [(0,stopL - startL + 1), (3, startR - stopL - 1), (0, stopR - startR + 1)]
-        tagList.append(("XS", outputDict["sense"]))
+        alignedRead.cigar = [(0,stopL - startL), (3, startR - stopL), (0, stopR - startR)]
+        tagList.append(("XS", str(outputDict["sense"])))
     else:
-        alignedRead.pos = outputDict["start"]
+        alignedRead.pos = outputDict["start"] - 1
         alignedRead.cigar = [(0, readlength)]
 
-    if outputDict.has_key("pairID"):
-        pairID = outputDict["pairID"]
+    if paired:
         if pairID == "1":
             alignedRead.is_read1 = True
             alignedRead.is_proper_pair = True
@@ -199,20 +230,22 @@ def writeBAMEntry(outfile, chrom, outputDict, readlength):
     if outputDict.has_key("mismatch"):
         mismatchTag = getMismatches(outputDict["mismatch"])
         if mismatchTag:
-            tagList.append(("MD", mismatchTag))
-    
+            tagList.append(("MD", str(mismatchTag)))
+
     if tagList:
-        alignedRead.tags = tagList
+        alignedRead.tags = tuple(tagList)
 
     outfile.write(alignedRead)
 
 
 def getMismatches(mismatchString):
-    mismatch = ""
+    mismatchList = []
     positions = re.findall("\d+", mismatchString)
     nucleotides = re.findall("([ACGTN])\d+", mismatchString)
     for index in range(0, len(positions)):
-        mismatch = "%s%s%s" % (mismatch, positions[index], nucleotides[index])
+        mismatchList.append("%s%s" % (positions[index], nucleotides[index]))
+
+    mismatch = string.join(mismatchList, "")
 
     return mismatch
 
@@ -250,7 +283,7 @@ def getFastaSequenceDictionary(fastaFileName):
     return fastaSeqDict
 
 
-def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""):
+def fixSpliceSense(fastaSequence, startRight, stopLeft, sense=""):
     spliceSense = {"GTAG": "+",
                    "GCAG": "+",
                    "ATAC": "+",
@@ -264,8 +297,9 @@ def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""):
     intronlen = startRight - stopLeft
     leftJunctionSig =fastaSequence[intronstart:intronstart+2]
     rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen]
-    spliceJunction = leftJunctionSig + rightJunctionSig
+    spliceJunction = string.join([leftJunctionSig, rightJunctionSig], "")
     spliceJunction = spliceJunction.upper()
+    print spliceJunction
     if spliceSense.has_key(spliceJunction):
         sense = spliceSense[spliceJunction]
     else: