import re
import optparse
import random
+import string
import pysam
-from commoncode import readDataset
+import ReadDataset
+from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption
def main(argv=None):
if not argv:
argv = sys.argv
- verstring = "MakeBamFromRds: version 1.0"
+ verstring = "makeBamFromRds: version 1.0"
print verstring
doPairs = False
usage = "usage: python %prog rdsFile bamFile [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--nouniq", action="store_false", dest="withUniqs")
- parser.add_option("--nomulti", action="store_false", dest="withMulti")
- parser.add_option("--splices", action="store_true", dest="doSplices")
- parser.add_option("--flag", dest="withFlag")
- parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
- parser.add_option("--pairs", action="store_true", dest="doPairs")
- parser.add_option("--cache", type="int", dest="cachePages")
- parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
- parser.add_option("--chrom", action="append", dest="chromList")
- parser.add_option("--fasta", dest="fastaFileName")
- parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False,
- doPairs=False, withFlag="", useFlagLike=False, enforceChr=False,
- doCache=False, cachePages=100000, fastaFileName="",
- chromList=[])
-
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
if len(args) < 2:
options.chromList, options.fastaFileName)
+def getParser(usage):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--nouniq", action="store_false", dest="withUniqs")
+ parser.add_option("--nomulti", action="store_false", dest="withMulti")
+ parser.add_option("--splices", action="store_true", dest="doSplices")
+ parser.add_option("--flag", dest="withFlag")
+ parser.add_option("--flaglike", action="store_true", dest="useFlagLike")
+ parser.add_option("--pairs", action="store_true", dest="doPairs")
+ parser.add_option("--cache", type="int", dest="cachePages")
+ parser.add_option("--enforceChr", action="store_true", dest="enforceChr")
+ parser.add_option("--chrom", action="append", dest="chromList")
+ parser.add_option("--fasta", dest="fastaFileName")
+
+ configParser = getConfigParser()
+ section = "MakeBamFromRds"
+ withUniqs = getConfigBoolOption(configParser, section, "withUniqs", True)
+ withMulti = getConfigBoolOption(configParser, section, "withMulti", True)
+ doSplices = getConfigBoolOption(configParser, section, "doSplices", False)
+ doPairs = getConfigBoolOption(configParser, section, "doPairs", False)
+ withFlag = getConfigOption(configParser, section, "withFlag", "")
+ useFlagLike = getConfigBoolOption(configParser, section, "useFlagLike", False)
+ enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False)
+ doCache = getConfigBoolOption(configParser, section, "doCache", False)
+ cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+ fastaFileName = getConfigOption(configParser, section, "fastaFileName", "")
+
+ parser.set_defaults(withUniqs=withUniqs, withMulti=withMulti, doSplices=doSplices,
+ doPairs=doPairs, withFlag=withFlag, useFlagLike=useFlagLike, enforceChr=enforceChr,
+ doCache=doCache, cachePages=cachePages, fastaFileName=fastaFileName,
+ chromList=[])
+
+ return parser
+
+
def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True,
doSplices=False, doPairs=False, withFlag="",
useFlagLike=False, enforceChr=False, allChrom=True,
sys.exit(1)
print "\nsample:"
- RDS = readDataset(rdsfile, verbose = True, cache=doCache)
+ RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache)
if cachePages > RDS.getDefaultCacheSize():
RDS.setDBcache(cachePages)
outfile = pysam.Samfile(outfilename, "wb", header=header)
totalWrites = 0
- noncanonicalSplices = 0
+ noncanonicalSpliceCount = 0
for chrom in chromList:
index = 0
print "chromosome %s" % (chrom)
if withUniqs or withMulti:
hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True,
- withPairID=doPairs, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
- flagLike=useFlagLike, entryDict=True)
+ doUniqs=withUniqs, doMulti=withMulti, readIDDict=False,
+ flagLike=useFlagLike, withMismatch=True)
for read in hitDict[chrom]:
writeBAMEntry(outfile, chrom, read, readlength)
index += 1
if doSplices:
- numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict)
+ numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict)
index += numSpliceReadsWritten
- noncanonicalSplices += noncanonical
+ noncanonicalSpliceCount += noncanonical
print index
totalWrites += index
outfile.close()
print "%d total reads written" % totalWrites
- print "%d non-canonical splices" % noncanonicalSplices
+ print "%d non-canonical splices" % noncanonicalSpliceCount
-def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict={}):
+def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict={}):
index = 0
noncanonicalSplices = 0
- spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, entryDict=True, withWeight=True)
+ spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, withWeight=True,
+ withMismatch=True)
if chrom not in spliceDict:
pass
else:
for read in spliceDict[chrom]:
if fastaSequenceDict.has_key(chrom):
- read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], chrom, read["startR"], read["stopL"], read["sense"])
+ read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], read["startR"], read["stopL"], read["sense"])
noncanonicalSplices += noncanonical
writeBAMEntry(outfile, chrom, read, readlength)
def writeBAMEntry(outfile, chrom, outputDict, readlength):
+ """ We need to subtract 1 from the position because rds is 1 based and
+ most of the rest of the entire world is 0 based.
+ """
tagList = []
alignedRead = pysam.AlignedRead()
- alignedRead.qname = outputDict["readID"]
+ try:
+ (readID, pairID) = outputDict["readID"].split("/")
+ paired = True
+ except ValueError:
+ readID = outputDict["readID"]
+ paired = False
+
+ alignedRead.qname = readID
if outputDict["sense"] == "-":
alignedRead.is_reverse = True
alignedRead.rname = outfile.references.index(chrom)
if outputDict.has_key("startL"):
- startL = outputDict["startL"]
- stopL = outputDict["stopL"]
- startR = outputDict["startR"]
- stopR = outputDict["stopR"]
+ startL = outputDict["startL"] - 1
+ stopL = outputDict["stopL"] - 1
+ startR = outputDict["startR"] - 1
+ stopR = outputDict["stopR"] - 1
alignedRead.pos = startL
- alignedRead.cigar = [(0,stopL - startL + 1), (3, startR - stopL - 1), (0, stopR - startR + 1)]
- tagList.append(("XS", outputDict["sense"]))
+ alignedRead.cigar = [(0,stopL - startL), (3, startR - stopL), (0, stopR - startR)]
+ tagList.append(("XS", str(outputDict["sense"])))
else:
- alignedRead.pos = outputDict["start"]
+ alignedRead.pos = outputDict["start"] - 1
alignedRead.cigar = [(0, readlength)]
- if outputDict.has_key("pairID"):
- pairID = outputDict["pairID"]
+ if paired:
if pairID == "1":
alignedRead.is_read1 = True
alignedRead.is_proper_pair = True
if outputDict.has_key("mismatch"):
mismatchTag = getMismatches(outputDict["mismatch"])
if mismatchTag:
- tagList.append(("MD", mismatchTag))
-
+ tagList.append(("MD", str(mismatchTag)))
+
if tagList:
- alignedRead.tags = tagList
+ alignedRead.tags = tuple(tagList)
outfile.write(alignedRead)
def getMismatches(mismatchString):
- mismatch = ""
+ mismatchList = []
positions = re.findall("\d+", mismatchString)
nucleotides = re.findall("([ACGTN])\d+", mismatchString)
for index in range(0, len(positions)):
- mismatch = "%s%s%s" % (mismatch, positions[index], nucleotides[index])
+ mismatchList.append("%s%s" % (positions[index], nucleotides[index]))
+
+ mismatch = string.join(mismatchList, "")
return mismatch
return fastaSeqDict
-def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""):
+def fixSpliceSense(fastaSequence, startRight, stopLeft, sense=""):
spliceSense = {"GTAG": "+",
"GCAG": "+",
"ATAC": "+",
intronlen = startRight - stopLeft
leftJunctionSig =fastaSequence[intronstart:intronstart+2]
rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen]
- spliceJunction = leftJunctionSig + rightJunctionSig
+ spliceJunction = string.join([leftJunctionSig, rightJunctionSig], "")
spliceJunction = spliceJunction.upper()
+ print spliceJunction
if spliceSense.has_key(spliceJunction):
sense = spliceSense[spliceJunction]
else: