X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=MakeRdsFromBam.py;h=969d4ccf556524200904e6f78f53ddc1cd1018b2;hp=e9df847ab167825f972b670690c953d443eff806;hb=0d3e3112fd04c2e6b44a25cacef1d591658ad181;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4

diff --git a/MakeRdsFromBam.py b/MakeRdsFromBam.py
index e9df847..969d4cc 100644
--- a/MakeRdsFromBam.py
+++ b/MakeRdsFromBam.py
@@ -12,11 +12,16 @@ try:
 except:
     pass
 
-import sys, string, optparse, re
+import sys
+import string
+import optparse
+import re
 import pysam
-from commoncode import readDataset, writeLog
+from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption, getReverseComplement
+import ReadDataset
 
-verstring = "%prog: version 1.0"
+INSERT_SIZE = 100000
+verstring = "makeRdsFromBam: version 1.0"
 
 
 def main(argv=None):
@@ -28,24 +33,7 @@ def main(argv=None):
     usage = "usage: %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
 \ninput reads must be sorted to properly record multireads"
 
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("--append", action="store_false", dest="init",
-                      help="append to existing rds file [default: create new]")
-    parser.add_option("--RNA", action="store_true", dest="rnaDataType",
-                      help="set data type to RNA [default: DNA]")
-    parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
-                      help="input file is in sam format")
-    parser.add_option("--index", action="store_true", dest="doIndex",
-                      help="index the output rds file")
-    parser.add_option("--cache", type="int", dest="cachePages",
-                      help="number of cache pages to use [default: 100000")
-    parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
-                      help="multi counts over this value are discarded [default: 10]")
-    parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
-                      help="use the raw read names")
-    parser.set_defaults(init=True, doIndex=False, useSamFile=False, cachePages=100000,
-                        maxMultiReadCount=10, rnaDataType=False, trimReadID=True)
-
+    parser = getParser(usage)
     (options, args) = parser.parse_args(argv[1:])
 
     try:
@@ -70,6 +58,39 @@ def main(argv=None):
                    options.cachePages, options.maxMultiReadCount, options.rnaDataType,
                    options.trimReadID)
 
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--append", action="store_false", dest="init",
+                      help="append to existing rds file [default: create new]")
+    parser.add_option("--RNA", action="store_true", dest="rnaDataType",
+                      help="set data type to RNA [default: DNA]")
+    parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
+                      help="input file is in sam format")
+    parser.add_option("--index", action="store_true", dest="doIndex",
+                      help="index the output rds file")
+    parser.add_option("--cache", type="int", dest="cachePages",
+                      help="number of cache pages to use [default: 100000")
+    parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
+                      help="multi counts over this value are discarded [default: 10]")
+    parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
+                      help="use the raw read names")
+
+    configParser = getConfigParser()
+    section = "makeRdsFromBam"
+    init = getConfigBoolOption(configParser, section, "init", True)
+    doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
+    useSamFile = getConfigBoolOption(configParser, section, "useSamFile", False)
+    cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+    maxMultiReadCount = getConfigIntOption(configParser, section, "maxMultiReadCount", 10)
+    rnaDataType = getConfigBoolOption(configParser, section, "rnaDataType", False)
+    trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
+
+    parser.set_defaults(init=init, doIndex=doIndex, useSamFile=useSamFile, cachePages=cachePages,
+                        maxMultiReadCount=maxMultiReadCount, rnaDataType=rnaDataType, trimReadID=trimReadID)
+
+    return parser
+
+
 def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
                    cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True):
 
@@ -91,7 +112,7 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
 
     writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))
 
-    rds = readDataset(outDbName, init, dataType, verbose=True)
+    rds = ReadDataset.ReadDataset(outDbName, init, dataType, verbose=True)
     if not init and doIndex:
         try:
             if rds.hasIndex():
@@ -119,16 +140,15 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
     if len(propertyList) > 0:
         rds.insertMetadata(propertyList)
 
-    countReads = {"unmapped": 0,
-                  "total": 0,
-                  "unique": 0,
-                  "multi": 0,
-                  "multiDiscard": 0,
-                  "splice": 0
+    totalReadCounts = {"unmapped": 0,
+                       "total": 0,
+                       "unique": 0,
+                       "multi": 0,
+                       "multiDiscard": 0,
+                       "splice": 0
     }
 
     readsize = 0
-    insertSize = 100000
 
     uniqueInsertList = []
     multiInsertList = []
@@ -143,11 +163,11 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
 
     for read in samFileIterator:
         if read.is_unmapped:
-            countReads["unmapped"] += 1
+            totalReadCounts["unmapped"] += 1
             continue
 
         if readsize == 0:
-            take = (0, 2, 3) # CIGAR operation (M/match, D/del, N/ref_skip)
+            take = (0, 1) # CIGAR operation (M/match, I/insertion)
            readsize = sum([length for op,length in read.cigar if op in take])
             if init:
                 rds.insertMetadata([("readsize", readsize)])
@@ -161,7 +181,7 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
         pairReadSuffix = getPairedReadNumberSuffix(read)
         readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
         if trimReadID:
-            rdsEntryName = "%s:%s:%d%s" % (label, read.qname, countReads["total"], pairReadSuffix)
+            rdsEntryName = "%s:%s:%d%s" % (label, read.qname, totalReadCounts["total"], pairReadSuffix)
         else:
             rdsEntryName = read.qname
 
@@ -186,27 +206,27 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS
         else:
             uniqueReadDict[readName] = (read, rdsEntryName)
 
-        if countReads["total"] % insertSize == 0:
+        if totalReadCounts["total"] % INSERT_SIZE == 0:
             for entry in uniqueReadDict.keys():
                 (readData, rdsEntryName) = uniqueReadDict[entry]
                 chrom = samfile.getrname(readData.rname)
                 uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
-                countReads["unique"] += 1
+                totalReadCounts["unique"] += 1
 
             for entry in spliceReadDict.keys():
                 (readData, rdsEntryName) = spliceReadDict[entry]
                 chrom = samfile.getrname(readData.rname)
                 spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
-                countReads["splice"] += 1
+                totalReadCounts["splice"] += 1
 
             for entry in multiReadDict.keys():
                 (readData, count, rdsEntryName) = multiReadDict[entry]
                 chrom = samfile.getrname(readData.rname)
                 if count > maxMultiReadCount:
-                    countReads["multiDiscard"] += 1
+                    totalReadCounts["multiDiscard"] += 1
                 else:
                     multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
-                    countReads["multi"] += 1
+                    totalReadCounts["multi"] += 1
rds.insertUniqs(uniqueInsertList) rds.insertMulti(multiInsertList) @@ -223,14 +243,14 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS sys.stdout.flush() processedEntryDict = {} - countReads["total"] += 1 + totalReadCounts["total"] += 1 if len(uniqueReadDict.keys()) > 0: for entry in uniqueReadDict.keys(): (readData, rdsEntryName) = uniqueReadDict[entry] chrom = samfile.getrname(readData.rname) uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize)) - countReads["unique"] += 1 + totalReadCounts["unique"] += 1 rds.insertUniqs(uniqueInsertList) @@ -239,32 +259,32 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS (readData, count, rdsEntryName) = multiReadDict[entry] chrom = samfile.getrname(readData.rname) if count > maxMultiReadCount: - countReads["multiDiscard"] += 1 + totalReadCounts["multiDiscard"] += 1 else: multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count)) - countReads["multi"] += 1 + totalReadCounts["multi"] += 1 - countReads["multi"] += len(multiInsertList) + totalReadCounts["multi"] += len(multiInsertList) if len(spliceReadDict.keys()) > 0 and dataType == "RNA": for entry in spliceReadDict.keys(): (readData, rdsEntryName) = spliceReadDict[entry] chrom = samfile.getrname(readData.rname) spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize)) - countReads["splice"] += 1 + totalReadCounts["splice"] += 1 rds.insertSplices(spliceInsertList) - countString = "\n%d unmapped reads discarded" % countReads["unmapped"] - countString += "\t%d unique reads" % countReads["unique"] - countString += "\t%d multi reads" % countReads["multi"] - countString += "\t%d multi reads count > %d discarded" % (countReads["multiDiscard"], maxMultiReadCount) + countStringList = ["\n%d unmapped reads discarded" % totalReadCounts["unmapped"]] + countStringList.append("%d unique reads" % totalReadCounts["unique"]) + countStringList.append("%d multi reads" % totalReadCounts["multi"]) + countStringList.append("%d multi reads count > %d discarded" % (totalReadCounts["multiDiscard"], maxMultiReadCount)) if dataType == "RNA": - countString += "\t%d spliced reads" % countReads["splice"] + countStringList.append("%d spliced reads" % totalReadCounts["splice"]) - print countString.replace("\t", "\n") - - writeLog("%s.log" % outDbName, verstring, countString) + print string.join(countStringList, "\n") + outputCountText = string.join(countStringList, "\t") + writeLog("%s.log" % outDbName, verstring, outputCountText) if doIndex: print "building index...." 
@@ -277,7 +297,7 @@ def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useS def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1): start = int(alignedRead.pos) - stop = int(start+readSize) + stop = int(start + readSize) sense = getReadSense(alignedRead.is_reverse) try: mismatchTag = alignedRead.opt("MD") @@ -352,11 +372,11 @@ def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False): genomicNucleotide = "N" if sense == "-": - mismatch = getComplementNucleotide(mismatch) - genomicNucleotide = getComplementNucleotide(genomicNucleotide) + mismatch = getReverseComplement(mismatch) + genomicNucleotide = getReverseComplement(genomicNucleotide) - elandCompatiblePosition = int(position + 1) - output.append("%s%d%s" % (mismatch, elandCompatiblePosition, genomicNucleotide)) + erange1BasedElandCompatiblePosition = int(position + 1) + output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide)) position += 1 except IndexError: if logErrors: @@ -368,17 +388,6 @@ def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False): return string.join(output, ",") -def getComplementNucleotide(nucleotide): - complement = {"A": "T", - "T": "A", - "C": "G", - "G": "C", - "N": "N" - } - - return complement[nucleotide] - - def getSpliceBounds(start, readsize, cigarTupleList): stopR = int(start + readsize) offset = 0
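Note on the readsize hunk above: the CIGAR operations that are summed to estimate the read size change from (0, 2, 3) to (0, 1). Under pysam's numeric CIGAR codes (0 = M/match, 1 = I/insertion, 2 = D/deletion, 3 = N/ref_skip), the new tuple counts matched plus inserted bases, i.e. the length of the read itself, while the old tuple counted the span covered on the reference, which is longer whenever the first mapped read contains deletions or spliced gaps. A minimal sketch of the difference, assuming a pysam-style list of (operation, length) tuples; the helper name and the example CIGAR are illustrative, not part of the patch:

def readSizeFromCigar(cigarTupleList, take=(0, 1)):
    # sum the lengths of the CIGAR operations whose numeric code is in take
    return sum([length for op, length in cigarTupleList if op in take])

# 30M 2D 10M 3N 10M: 50 query bases aligned across 55 reference bases
exampleCigar = [(0, 30), (2, 2), (0, 10), (3, 3), (0, 10)]
print readSizeFromCigar(exampleCigar)             # 50, new behavior with take=(0, 1)
print readSizeFromCigar(exampleCigar, (0, 2, 3))  # 55, old behavior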
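Note on the new getParser() above: option defaults are now seeded from a configuration file through commoncode's getConfigParser, getConfigBoolOption and getConfigIntOption before parser.set_defaults() is called, so a [makeRdsFromBam] section can override the hard-coded defaults without changing the command line. Those helpers are not shown in this patch; the following is only a rough sketch of the kind of lookup they perform, and the file name "erange.config" and the direct ConfigParser usage are assumptions made for illustration:

import ConfigParser

def getConfigDefaults(configFileName="erange.config", section="makeRdsFromBam"):
    # hypothetical stand-in for the commoncode helpers; the fallback values
    # match the defaults passed to the getConfig*Option calls in getParser()
    defaults = {"init": True, "doIndex": False, "useSamFile": False,
                "cachePages": 100000, "maxMultiReadCount": 10,
                "rnaDataType": False, "trimReadID": True}

    parser = ConfigParser.SafeConfigParser()
    parser.read(configFileName)
    if not parser.has_section(section):
        return defaults

    for option in defaults:
        if not parser.has_option(section, option):
            continue
        # bool must be tested before int: isinstance(True, int) is also True
        if isinstance(defaults[option], bool):
            defaults[option] = parser.getboolean(section, option)
        else:
            defaults[option] = parser.getint(section, option)

    return defaults

Calling parser.set_defaults(**getConfigDefaults()) would then approximate what the patched getParser() does with these values.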