X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=MakeBamFromRds.py;fp=MakeBamFromRds.py;h=bfb48fde3fc0bebc9bdceaa1dc16f83dd566da59;hp=935a04e4bfe973ba9250dbca1c90f12f03c17d67;hb=0d3e3112fd04c2e6b44a25cacef1d591658ad181;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/MakeBamFromRds.py b/MakeBamFromRds.py index 935a04e..bfb48fd 100644 --- a/MakeBamFromRds.py +++ b/MakeBamFromRds.py @@ -17,37 +17,24 @@ import sys import re import optparse import random +import string import pysam -from commoncode import readDataset +import ReadDataset +from commoncode import getConfigParser, getConfigOption, getConfigBoolOption, getConfigIntOption def main(argv=None): if not argv: argv = sys.argv - verstring = "MakeBamFromRds: version 1.0" + verstring = "makeBamFromRds: version 1.0" print verstring doPairs = False usage = "usage: python %prog rdsFile bamFile [options]" - parser = optparse.OptionParser(usage=usage) - parser.add_option("--nouniq", action="store_false", dest="withUniqs") - parser.add_option("--nomulti", action="store_false", dest="withMulti") - parser.add_option("--splices", action="store_true", dest="doSplices") - parser.add_option("--flag", dest="withFlag") - parser.add_option("--flaglike", action="store_true", dest="useFlagLike") - parser.add_option("--pairs", action="store_true", dest="doPairs") - parser.add_option("--cache", type="int", dest="cachePages") - parser.add_option("--enforceChr", action="store_true", dest="enforceChr") - parser.add_option("--chrom", action="append", dest="chromList") - parser.add_option("--fasta", dest="fastaFileName") - parser.set_defaults(withUniqs=True, withMulti=True, doSplices=False, - doPairs=False, withFlag="", useFlagLike=False, enforceChr=False, - doCache=False, cachePages=100000, fastaFileName="", - chromList=[]) - + parser = getParser(usage) (options, args) = parser.parse_args(argv[1:]) if len(args) < 2: @@ -67,6 +54,40 @@ def main(argv=None): options.chromList, options.fastaFileName) +def getParser(usage): + parser = optparse.OptionParser(usage=usage) + parser.add_option("--nouniq", action="store_false", dest="withUniqs") + parser.add_option("--nomulti", action="store_false", dest="withMulti") + parser.add_option("--splices", action="store_true", dest="doSplices") + parser.add_option("--flag", dest="withFlag") + parser.add_option("--flaglike", action="store_true", dest="useFlagLike") + parser.add_option("--pairs", action="store_true", dest="doPairs") + parser.add_option("--cache", type="int", dest="cachePages") + parser.add_option("--enforceChr", action="store_true", dest="enforceChr") + parser.add_option("--chrom", action="append", dest="chromList") + parser.add_option("--fasta", dest="fastaFileName") + + configParser = getConfigParser() + section = "MakeBamFromRds" + withUniqs = getConfigBoolOption(configParser, section, "withUniqs", True) + withMulti = getConfigBoolOption(configParser, section, "withMulti", True) + doSplices = getConfigBoolOption(configParser, section, "doSplices", False) + doPairs = getConfigBoolOption(configParser, section, "doPairs", False) + withFlag = getConfigOption(configParser, section, "withFlag", "") + useFlagLike = getConfigBoolOption(configParser, section, "useFlagLike", False) + enforceChr = getConfigBoolOption(configParser, section, "enforceChr", False) + doCache = getConfigBoolOption(configParser, section, "doCache", False) + cachePages = getConfigIntOption(configParser, section, "cachePages", 100000) + fastaFileName = getConfigOption(configParser, section, "fastaFileName", "") + + parser.set_defaults(withUniqs=withUniqs, withMulti=withMulti, doSplices=doSplices, + doPairs=doPairs, withFlag=withFlag, useFlagLike=useFlagLike, enforceChr=enforceChr, + doCache=doCache, cachePages=cachePages, fastaFileName=fastaFileName, + chromList=[]) + + return parser + + def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True, doSplices=False, doPairs=False, withFlag="", useFlagLike=False, enforceChr=False, allChrom=True, @@ -77,7 +98,7 @@ def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True, sys.exit(1) print "\nsample:" - RDS = readDataset(rdsfile, verbose = True, cache=doCache) + RDS = ReadDataset.ReadDataset(rdsfile, verbose = True, cache=doCache) if cachePages > RDS.getDefaultCacheSize(): RDS.setDBcache(cachePages) @@ -120,42 +141,43 @@ def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True, outfile = pysam.Samfile(outfilename, "wb", header=header) totalWrites = 0 - noncanonicalSplices = 0 + noncanonicalSpliceCount = 0 for chrom in chromList: index = 0 print "chromosome %s" % (chrom) if withUniqs or withMulti: hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag, withWeight=True, withID=True, - withPairID=doPairs, doUniqs=withUniqs, doMulti=withMulti, readIDDict=False, - flagLike=useFlagLike, entryDict=True) + doUniqs=withUniqs, doMulti=withMulti, readIDDict=False, + flagLike=useFlagLike, withMismatch=True) for read in hitDict[chrom]: writeBAMEntry(outfile, chrom, read, readlength) index += 1 if doSplices: - numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict) + numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict) index += numSpliceReadsWritten - noncanonicalSplices += noncanonical + noncanonicalSpliceCount += noncanonical print index totalWrites += index outfile.close() print "%d total reads written" % totalWrites - print "%d non-canonical splices" % noncanonicalSplices + print "%d non-canonical splices" % noncanonicalSpliceCount -def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, fastaSequenceDict={}): +def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, doPairs, fastaSequenceDict={}): index = 0 noncanonicalSplices = 0 - spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, entryDict=True, withWeight=True) + spliceDict = RDS.getSplicesDict(fullChrom=True, chrom=chrom, flag=withFlag, withID=True, flagLike=useFlagLike, withWeight=True, + withMismatch=True) if chrom not in spliceDict: pass else: for read in spliceDict[chrom]: if fastaSequenceDict.has_key(chrom): - read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], chrom, read["startR"], read["stopL"], read["sense"]) + read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], read["startR"], read["stopL"], read["sense"]) noncanonicalSplices += noncanonical writeBAMEntry(outfile, chrom, read, readlength) @@ -165,28 +187,37 @@ def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, f def writeBAMEntry(outfile, chrom, outputDict, readlength): + """ We need to subtract 1 from the position because rds is 1 based and + most of the rest of the entire world is 0 based. + """ tagList = [] alignedRead = pysam.AlignedRead() - alignedRead.qname = outputDict["readID"] + try: + (readID, pairID) = outputDict["readID"].split("/") + paired = True + except ValueError: + readID = outputDict["readID"] + paired = False + + alignedRead.qname = readID if outputDict["sense"] == "-": alignedRead.is_reverse = True alignedRead.rname = outfile.references.index(chrom) if outputDict.has_key("startL"): - startL = outputDict["startL"] - stopL = outputDict["stopL"] - startR = outputDict["startR"] - stopR = outputDict["stopR"] + startL = outputDict["startL"] - 1 + stopL = outputDict["stopL"] - 1 + startR = outputDict["startR"] - 1 + stopR = outputDict["stopR"] - 1 alignedRead.pos = startL - alignedRead.cigar = [(0,stopL - startL + 1), (3, startR - stopL - 1), (0, stopR - startR + 1)] - tagList.append(("XS", outputDict["sense"])) + alignedRead.cigar = [(0,stopL - startL), (3, startR - stopL), (0, stopR - startR)] + tagList.append(("XS", str(outputDict["sense"]))) else: - alignedRead.pos = outputDict["start"] + alignedRead.pos = outputDict["start"] - 1 alignedRead.cigar = [(0, readlength)] - if outputDict.has_key("pairID"): - pairID = outputDict["pairID"] + if paired: if pairID == "1": alignedRead.is_read1 = True alignedRead.is_proper_pair = True @@ -199,20 +230,22 @@ def writeBAMEntry(outfile, chrom, outputDict, readlength): if outputDict.has_key("mismatch"): mismatchTag = getMismatches(outputDict["mismatch"]) if mismatchTag: - tagList.append(("MD", mismatchTag)) - + tagList.append(("MD", str(mismatchTag))) + if tagList: - alignedRead.tags = tagList + alignedRead.tags = tuple(tagList) outfile.write(alignedRead) def getMismatches(mismatchString): - mismatch = "" + mismatchList = [] positions = re.findall("\d+", mismatchString) nucleotides = re.findall("([ACGTN])\d+", mismatchString) for index in range(0, len(positions)): - mismatch = "%s%s%s" % (mismatch, positions[index], nucleotides[index]) + mismatchList.append("%s%s" % (positions[index], nucleotides[index])) + + mismatch = string.join(mismatchList, "") return mismatch @@ -250,7 +283,7 @@ def getFastaSequenceDictionary(fastaFileName): return fastaSeqDict -def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""): +def fixSpliceSense(fastaSequence, startRight, stopLeft, sense=""): spliceSense = {"GTAG": "+", "GCAG": "+", "ATAC": "+", @@ -264,8 +297,9 @@ def fixSpliceSense(fastaSequence, chrom, startRight, stopLeft, sense=""): intronlen = startRight - stopLeft leftJunctionSig =fastaSequence[intronstart:intronstart+2] rightJunctionSig = fastaSequence[intronstart+intronlen-2:intronstart+intronlen] - spliceJunction = leftJunctionSig + rightJunctionSig + spliceJunction = string.join([leftJunctionSig, rightJunctionSig], "") spliceJunction = spliceJunction.upper() + print spliceJunction if spliceSense.has_key(spliceJunction): sense = spliceSense[spliceJunction] else: