except:
pass
-import sys, string, optparse, re
+import sys
+import string
+import optparse
+import re
import pysam
-from commoncode import readDataset, writeLog
+from commoncode import writeLog, getConfigParser, getConfigBoolOption, getConfigIntOption, getReverseComplement
+import ReadDataset
-verstring = "%prog: version 1.0"
+INSERT_SIZE = 100000
+verstring = "makeRdsFromBam: version 1.1"
def main(argv=None):
usage = "usage: %prog label samfile outrdsfile [propertyName::propertyValue] [options]\
\ninput reads must be sorted to properly record multireads"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("--append", action="store_false", dest="init",
- help="append to existing rds file [default: create new]")
- parser.add_option("--RNA", action="store_true", dest="rnaDataType",
- help="set data type to RNA [default: DNA]")
- parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
- help="input file is in sam format")
- parser.add_option("--index", action="store_true", dest="doIndex",
- help="index the output rds file")
- parser.add_option("--cache", type="int", dest="cachePages",
- help="number of cache pages to use [default: 100000")
- parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
- help="multi counts over this value are discarded [default: 10]")
- parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
- help="use the raw read names")
- parser.set_defaults(init=True, doIndex=False, useSamFile=False, cachePages=100000,
- maxMultiReadCount=10, rnaDataType=False, trimReadID=True)
-
+ parser = getParser(usage)
(options, args) = parser.parse_args(argv[1:])
try:
print "no outrdsfile specified - see --help for usage"
sys.exit(1)
+ if options.rnaDataType:
+ dataType = "RNA"
+ else:
+ dataType = "DNA"
+
makeRdsFromBam(label, samFileName, outDbName, options.init, options.doIndex, options.useSamFile,
- options.cachePages, options.maxMultiReadCount, options.rnaDataType, options.trimReadID)
+ options.cachePages, options.maxMultiReadCount, dataType, options.trimReadID)
+
+
+def getParser(usage):
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--append", action="store_false", dest="init",
+ help="append to existing rds file [default: create new]")
+ parser.add_option("--RNA", action="store_true", dest="rnaDataType",
+ help="set data type to RNA [default: DNA]")
+ parser.add_option("-S", "--sam", action="store_true", dest="useSamFile",
+ help="input file is in sam format")
+ parser.add_option("--index", action="store_true", dest="doIndex",
+ help="index the output rds file")
+ parser.add_option("--cache", type="int", dest="cachePages",
+                      help="number of cache pages to use [default: 100000]")
+ parser.add_option("-m", "--multiCount", type="int", dest="maxMultiReadCount",
+ help="multi counts over this value are discarded [default: 10]")
+ parser.add_option("--rawreadID", action="store_false", dest="trimReadID",
+ help="use the raw read names")
+
+ configParser = getConfigParser()
+ section = "makeRdsFromBam"
+ init = getConfigBoolOption(configParser, section, "init", True)
+ doIndex = getConfigBoolOption(configParser, section, "doIndex", False)
+ useSamFile = getConfigBoolOption(configParser, section, "useSamFile", False)
+ cachePages = getConfigIntOption(configParser, section, "cachePages", 100000)
+ maxMultiReadCount = getConfigIntOption(configParser, section, "maxMultiReadCount", 10)
+ rnaDataType = getConfigBoolOption(configParser, section, "rnaDataType", False)
+ trimReadID = getConfigBoolOption(configParser, section, "trimReadID", True)
+
+ parser.set_defaults(init=init, doIndex=doIndex, useSamFile=useSamFile, cachePages=cachePages,
+ maxMultiReadCount=maxMultiReadCount, rnaDataType=rnaDataType, trimReadID=trimReadID)
+
+ return parser
def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
- cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True):
+ cachePages=100000, maxMultiReadCount=10, dataType="DNA", trimReadID=True):
if useSamFile:
fileMode = "r"
else:
fileMode = "rb"
- try:
- samfile = pysam.Samfile(samFileName, fileMode)
- except ValueError:
- print "samfile index not found"
- sys.exit(1)
-
- if rnaDataType:
- dataType = "RNA"
- else:
- dataType = "DNA"
-
writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))
-
- rds = readDataset(outDbName, init, dataType, verbose=True)
+ rds = ReadDataset.ReadDataset(outDbName, init, dataType, verbose=True)
if not init and doIndex:
try:
if rds.hasIndex():
if len(propertyList) > 0:
rds.insertMetadata(propertyList)
- countReads = {"unmapped": 0,
- "total": 0,
- "unique": 0,
- "multi": 0,
- "multiDiscard": 0,
- "splice": 0
+ totalReadCounts = {"unmapped": 0,
+ "total": 0,
+ "unique": 0,
+ "multi": 0,
+ "multiDiscard": 0,
+ "splice": 0,
+ "multisplice": 0
}
readsize = 0
- insertSize = 100000
uniqueInsertList = []
multiInsertList = []
spliceInsertList = []
+ multispliceInsertList = []
- processedEntryDict = {}
uniqueReadDict = {}
multiReadDict = {}
+ multispliceReadDict = {}
spliceReadDict = {}
+ multireadCounts = getMultiReadIDCounts(samFileName, fileMode)
- samFileIterator = samfile.fetch(until_eof=True)
+ for readID in multireadCounts:
+ if multireadCounts[readID] > maxMultiReadCount:
+ totalReadCounts["multiDiscard"] += 1
+
+ try:
+ samfile = pysam.Samfile(samFileName, fileMode)
+ except ValueError:
+ print "samfile index not found"
+ sys.exit(1)
+ samFileIterator = samfile.fetch(until_eof=True)
for read in samFileIterator:
if read.is_unmapped:
- countReads["unmapped"] += 1
+ totalReadCounts["unmapped"] += 1
continue
if readsize == 0:
- take = (0, 2, 3) # CIGAR operation (M/match, D/del, N/ref_skip)
+ take = (0, 1) # CIGAR operation (M/match, I/insertion)
readsize = sum([length for op,length in read.cigar if op in take])
if init:
rds.insertMetadata([("readsize", readsize)])
- #Build the read dictionaries
- try:
- readSequence = read.seq
- except KeyError:
- readSequence = ""
-
pairReadSuffix = getPairedReadNumberSuffix(read)
- readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
+ readName = "%s%s" % (read.qname, pairReadSuffix)
if trimReadID:
- rdsEntryName = "%s:%s:%d%s" % (label, read.qname, countReads["total"], pairReadSuffix)
+ rdsEntryName = "%s:%s:%d%s" % (label, read.qname, totalReadCounts["total"], pairReadSuffix)
else:
rdsEntryName = read.qname
- if processedEntryDict.has_key(readName):
- if isSpliceEntry(read.cigar):
- if spliceReadDict.has_key(readName):
- del spliceReadDict[readName]
- else:
- if uniqueReadDict.has_key(readName):
- del uniqueReadDict[readName]
-
- if multiReadDict.has_key(readName):
- (read, priorCount, rdsEntryName) = multiReadDict[readName]
- count = priorCount + 1
- multiReadDict[readName] = (read, count, rdsEntryName)
- else:
- multiReadDict[readName] = (read, 1, rdsEntryName)
- else:
- processedEntryDict[readName] = ""
+ try:
+ count = multireadCounts[readName]
+ except KeyError:
+ count = 1
+
+ if count == 1:
if isSpliceEntry(read.cigar):
spliceReadDict[readName] = (read,rdsEntryName)
else:
uniqueReadDict[readName] = (read, rdsEntryName)
+ elif count <= maxMultiReadCount:
+ if isSpliceEntry(read.cigar):
+ multispliceReadDict[readName] = (read, count, rdsEntryName)
+ else:
+ multiReadDict[readName] = (read, count, rdsEntryName)
- if countReads["total"] % insertSize == 0:
+ if totalReadCounts["total"] % INSERT_SIZE == 0:
for entry in uniqueReadDict.keys():
(readData, rdsEntryName) = uniqueReadDict[entry]
chrom = samfile.getrname(readData.rname)
uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
- countReads["unique"] += 1
-
- for entry in spliceReadDict.keys():
- (readData, rdsEntryName) = spliceReadDict[entry]
- chrom = samfile.getrname(readData.rname)
- spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
- countReads["splice"] += 1
+ totalReadCounts["unique"] += 1
for entry in multiReadDict.keys():
(readData, count, rdsEntryName) = multiReadDict[entry]
chrom = samfile.getrname(readData.rname)
- if count > maxMultiReadCount:
- countReads["multiDiscard"] += 1
- else:
- multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
- countReads["multi"] += 1
+ multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
+
+ if dataType == "RNA":
+ for entry in spliceReadDict.keys():
+ (readData, rdsEntryName) = spliceReadDict[entry]
+ chrom = samfile.getrname(readData.rname)
+ spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
+ totalReadCounts["splice"] += 1
+
+ for entry in multispliceReadDict.keys():
+ (readData, count, rdsEntryName) = multispliceReadDict[entry]
+ chrom = samfile.getrname(readData.rname)
+ multispliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize, weight=count))
+ totalReadCounts["multisplice"] += 1
rds.insertUniqs(uniqueInsertList)
rds.insertMulti(multiInsertList)
rds.insertSplices(spliceInsertList)
spliceInsertList = []
spliceReadDict = {}
+ rds.insertMultisplices(multispliceInsertList)
+ multispliceInsertList = []
+ multispliceReadDict = {}
print ".",
sys.stdout.flush()
- processedEntryDict = {}
- countReads["total"] += 1
+ totalReadCounts["total"] += 1
if len(uniqueReadDict.keys()) > 0:
for entry in uniqueReadDict.keys():
(readData, rdsEntryName) = uniqueReadDict[entry]
chrom = samfile.getrname(readData.rname)
uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
- countReads["unique"] += 1
+ totalReadCounts["unique"] += 1
rds.insertUniqs(uniqueInsertList)
for entry in multiReadDict.keys():
(readData, count, rdsEntryName) = multiReadDict[entry]
chrom = samfile.getrname(readData.rname)
- if count > maxMultiReadCount:
- countReads["multiDiscard"] += 1
- else:
- multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
- countReads["multi"] += 1
+ multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
- countReads["multi"] += len(multiInsertList)
+ rds.insertMulti(multiInsertList)
if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
for entry in spliceReadDict.keys():
(readData, rdsEntryName) = spliceReadDict[entry]
chrom = samfile.getrname(readData.rname)
spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
- countReads["splice"] += 1
+ totalReadCounts["splice"] += 1
rds.insertSplices(spliceInsertList)
- countString = "\n%d unmapped reads discarded" % countReads["unmapped"]
- countString += "\t%d unique reads" % countReads["unique"]
- countString += "\t%d multi reads" % countReads["multi"]
- countString += "\t%d multi reads count > %d discarded" % (countReads["multiDiscard"], maxMultiReadCount)
- if dataType == "RNA":
- countString += "\t%d spliced reads" % countReads["splice"]
+ if len(multispliceReadDict.keys()) > 0 and dataType == "RNA":
+ for entry in multispliceReadDict.keys():
+ (readData, count, rdsEntryName) = multispliceReadDict[entry]
+ chrom = samfile.getrname(readData.rname)
+ multispliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize, weight=count))
+ totalReadCounts["multisplice"] += 1
+
+ rds.insertMultisplices(multispliceInsertList)
- print countString.replace("\t", "\n")
+ totalReadCounts["multi"] = len(multireadCounts) - totalReadCounts["multiDiscard"] - totalReadCounts["multisplice"]
+ countStringList = ["\n%d unmapped reads discarded" % totalReadCounts["unmapped"]]
+ countStringList.append("%d unique reads" % totalReadCounts["unique"])
+ countStringList.append("%d multi reads" % totalReadCounts["multi"])
+ countStringList.append("%d multi reads count > %d discarded" % (totalReadCounts["multiDiscard"], maxMultiReadCount))
+ if dataType == "RNA":
+ countStringList.append("%d spliced reads" % totalReadCounts["splice"])
+ countStringList.append("%d spliced multireads" % totalReadCounts["multisplice"])
- writeLog("%s.log" % outDbName, verstring, countString)
+ print string.join(countStringList, "\n")
+ outputCountText = string.join(countStringList, "\t")
+ writeLog("%s.log" % outDbName, verstring, outputCountText)
if doIndex:
print "building index...."
rds.buildIndex(defaultCacheSize)
+def getMultiReadIDCounts(samFileName, fileMode):
+ try:
+ samfile = pysam.Samfile(samFileName, fileMode)
+ except ValueError:
+ print "samfile index not found"
+ sys.exit(1)
+
+ readIDCounts = {}
+ for read in samfile.fetch(until_eof=True):
+ pairReadSuffix = getPairedReadNumberSuffix(read)
+ readName = "%s%s" % (read.qname, pairReadSuffix)
+ try:
+ readIDCounts[readName] += 1
+ except KeyError:
+ readIDCounts[readName] = 1
+
+ for readID in readIDCounts.keys():
+ if readIDCounts[readID] == 1:
+ del readIDCounts[readID]
+
+ return readIDCounts
+
+
def getRDSEntry(alignedRead, readName, chrom, readSize, weight=1):
start = int(alignedRead.pos)
- stop = int(start+readSize)
+ stop = int(start + readSize)
sense = getReadSense(alignedRead.is_reverse)
try:
mismatchTag = alignedRead.opt("MD")
return (readName, chrom, start, stop, sense, 1.0/weight, '', mismatches)
-def getRDSSpliceEntry(alignedRead, readName, chrom, readSize):
- (readName, chrom, start, stop, sense, weight, flag, mismatches) = getRDSEntry(alignedRead, readName, chrom, readSize)
+def getRDSSpliceEntry(alignedRead, readName, chrom, readSize, weight=1):
+ (readName, chrom, start, stop, sense, weight, flag, mismatches) = getRDSEntry(alignedRead, readName, chrom, readSize, weight)
startL, startR, stopL, stopR = getSpliceBounds(start, readSize, alignedRead.cigar)
- return (readName, chrom, startL, stopL, startR, stopR, sense, 1.0, "", mismatches)
+ return (readName, chrom, startL, stopL, startR, stopR, sense, weight, "", mismatches)
def getPairedReadNumberSuffix(read):
genomicNucleotide = "N"
if sense == "-":
- mismatch = getComplementNucleotide(mismatch)
- genomicNucleotide = getComplementNucleotide(genomicNucleotide)
+ mismatch = getReverseComplement(mismatch)
+ genomicNucleotide = getReverseComplement(genomicNucleotide)
- elandCompatiblePosition = int(position + 1)
- output.append("%s%d%s" % (mismatch, elandCompatiblePosition, genomicNucleotide))
+ erange1BasedElandCompatiblePosition = int(position + 1)
+ output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
position += 1
except IndexError:
if logErrors:
return string.join(output, ",")
-def getComplementNucleotide(nucleotide):
- complement = {"A": "T",
- "T": "A",
- "C": "G",
- "G": "C",
- "N": "N"
- }
-
- return complement[nucleotide]
-
-
def getSpliceBounds(start, readsize, cigarTupleList):
- stopR = int(start + readsize)
offset = 0
for operation,length in cigarTupleList:
if operation == 3:
stopL = int(start + offset)
startR = int(stopL + length)
+ stopR = int(startR + readsize - offset)
return start, startR, stopL, stopR
else: