-import sqlite3 as sqlite
import string
import tempfile
import shutil
import os
+import re
+import sys
+import pysam
from array import array
-from commoncode import getReverseComplement, getConfigParser, getConfigOption
-
-currentRDSVersion = "2.1"
+from commoncode import getReverseComplement, isSpliceEntry
+currentRDSVersion = "3.0"
class ReadDatasetError(Exception):
pass
+
+
+class ReadCounter():
+    """ pysam fetch() callback that tallies unique, spliced and (fractional) multiread counts. """
+
+    def __init__(self, multiReadIDCount=None, sense="both"):
+        # default to None rather than {} to avoid sharing a mutable dict between instances
+        if multiReadIDCount is None:
+            multiReadIDCount = {}
+
+        self.multiReadIDCount = multiReadIDCount
+        self.sense = sense
+        self.uniqReadCount = 0
+        self.multiReadCount = 0.0
+        self.spliceReadCount = 0
+
+
+    def __call__(self, read):
+        # honor the optional sense restriction, mirroring the old SQL sense filter
+        if self.sense in ["+", "-"] and getReadSense(read.is_reverse) != self.sense:
+            return
+
+        multiplicity = self.getReadMultiplicity(read)
+        if multiplicity > 1:
+            self.multiReadCount += float(1.0/multiplicity)
+        elif isSpliceEntry(read.cigar):
+            self.spliceReadCount += 1
+        else:
+            self.uniqReadCount += 1
+
+
+ def getReadMultiplicity(self, read):
+ pairReadSuffix = getPairedReadNumberSuffix(read)
+ readID = "%s%s" % (read.qname, pairReadSuffix)
+ try:
+ multiplicity = self.multiReadIDCount[readID]
+ except KeyError:
+ multiplicity = 1
+
+ return multiplicity
+
+
+ def isMulti(self, read):
+ pairReadSuffix = getPairedReadNumberSuffix(read)
+ readID = "%s%s" % (read.qname, pairReadSuffix)
+ return readID in self.multiReadIDCount
+
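+# Usage sketch (illustrative): a ReadCounter is passed as the callback argument to
+# pysam's Samfile.fetch(), which is how getBamReadCounter() below drives it:
+#
+#   counter = ReadCounter(multiReadIDCounts, sense="both")
+#   bamfile.fetch(reference="chr1", callback=counter)
+#   print counter.uniqReadCount, counter.multiReadCount, counter.spliceReadCount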
+
+
+class MaxCoordFinder(ReadCounter):
+ maxUniqueStart = 0
+ maxMultiStart = 0
+ maxSpliceStart = 0
+
+ def __call__(self, read):
+ if self.isMulti(read):
+ self.maxMultiStart = max(self.maxMultiStart, read.pos)
+ elif isSpliceEntry(read.cigar):
+            self.maxSpliceStart = max(self.maxSpliceStart, getSpliceRightStart(read.pos, read.cigar))
+ else:
+            self.maxUniqueStart = max(self.maxUniqueStart, read.pos)
+
+
+
class ReadDataset():
""" Class for storing reads from experiments. Assumes that custom scripts
will translate incoming data into a format that can be inserted into the
splices table.
"""
+
def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
cache=False, reportCount=True):
""" creates an rds datafile if initialize is set to true, otherwise
will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
"""
- self.dbcon = ""
- self.memcon = ""
- self.dataType = ""
- self.rdsVersion = currentRDSVersion
- self.memBacked = False
- self.memChrom = ""
- self.memCursor = ""
- self.cachedDBFile = ""
if initialize and datasetType not in ["DNA", "RNA"]:
raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
- if cache:
- if verbose:
- print "caching ...."
+ self.dataType = ""
+ self.multiReadIDCounts = self.getMultiReadIDCounts(datafile, "rb")
+ self.bamfile = pysam.Samfile(datafile, "rb")
+ self.readsize = None
+        self._totalReadWeight = None  # lazy cache for totalReadWeight()
- self.cacheDB(datafile)
- dbFile = self.cachedDBFile
- else:
- dbFile = datafile
+ self.metadata = {"dataType": datasetType}
- self.dbcon = sqlite.connect(dbFile)
- self.dbcon.row_factory = sqlite.Row
- self.dbcon.execute("PRAGMA temp_store = MEMORY")
if initialize:
self.dataType = datasetType
- self.initializeTables(self.dbcon)
else:
- metadata = self.getMetadata("dataType")
- self.dataType = metadata["dataType"]
+ self.dataType = self.metadata["dataType"]
try:
- metadata = self.getMetadata("rdsVersion")
- self.rdsVersion = metadata["rdsVersion"]
- except:
- try:
- self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
- except IOError:
- print "could not add rdsVersion - read-only ?"
- self.rdsVersion = "pre-1.0"
+ self.rdsVersion = self.metadata["rdsVersion"]
+ except KeyError:
+            self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
+            self.rdsVersion = currentRDSVersion
+ self.fullReadCounts = self.getFullReadCount()
if verbose:
self.printRDSInfo(datafile, reportCount, initialize)
def __len__(self):
- """ return the number of usable reads in the dataset.
+ """ return the number of reads in the bam file
+ This is not the same as the original as the original returned total weight.
+ """
+ count = 0
+ references = self.bamfile.references
+ for reference in references:
+ count += self.bamfile.count(reference)
+
+ return count
+
+
+ def getMultiReadIDCounts(self, samFileName, fileMode):
+ try:
+ samfile = pysam.Samfile(samFileName, fileMode)
+ except ValueError:
+ print "samfile index not found"
+ sys.exit(1)
+
+ readIDCounts = {}
+ for read in samfile.fetch(until_eof=True):
+ pairReadSuffix = getPairedReadNumberSuffix(read)
+ readName = "%s%s" % (read.qname, pairReadSuffix)
+ try:
+ readIDCounts[readName] += 1.0
+ except KeyError:
+ readIDCounts[readName] = 1.0
+
+ for readID in readIDCounts.keys():
+ if int(readIDCounts[readID]) == 1:
+ del readIDCounts[readID]
+
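+        # Example (illustrative): if "readA/1" aligned to 3 locations and "readB"
+        # aligned once, the result is {"readA/1": 3.0}; single-hit reads are dropped,
+        # so membership in this dict is what marks a read as a multiread.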
+ return readIDCounts
+
+
+ def totalReadWeight(self):
+ """ return the total weight of usable reads in the bam file.
"""
- total = self.getUniqsCount()
- total += self.getMultiCount()
+        if self._totalReadWeight is None:
+ total = self.fullReadCounts["uniq"]
+ total += self.fullReadCounts["multi"]
- if self.dataType == "RNA":
- total += self.getSplicesCount()
+ if self.dataType == "RNA":
+ total += self.fullReadCounts["splice"]
- total = int(total)
+            self._totalReadWeight = int(total)
- return total
+        return self._totalReadWeight
def __del__(self):
""" cleanup copy in local cache, if present.
"""
- if self.cachedDBFile != "":
- self.uncacheDB()
+ pass
def printRDSInfo(self, datafile, reportCount, initialize):
else:
print "dataset %s" % datafile
- metadata = self.getMetadata()
print "metadata:"
- pnameList = metadata.keys()
+ pnameList = self.metadata.keys()
pnameList.sort()
for pname in pnameList:
- print "\t" + pname + "\t" + metadata[pname]
+ print "\t" + pname + "\t" + str(self.metadata[pname])
if reportCount and not initialize:
self.printReadCounts()
print "default cache size is %d pages" % self.getDefaultCacheSize()
- if self.hasIndex():
- print "found index"
- else:
- print "not indexed"
def printReadCounts(self):
- ucount = self.getUniqsCount()
- mcount = self.getMultiCount()
+ ucount = self.fullReadCounts["uniq"]
+ mcount = self.fullReadCounts["multi"]
if self.dataType == "DNA":
print "\n%d unique reads and %d multireads" % (ucount, mcount)
elif self.dataType == "RNA":
- scount = self.getSplicesCount()
+ scount = self.fullReadCounts["splice"]
print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount)
def cacheDB(self, filename):
- """ copy geneinfoDB to a local cache.
- """
- configParser = getConfigParser()
- cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
- tempfile.tempdir = cisTemp
- self.cachedDBFile = "%s.db" % tempfile.mktemp()
- shutil.copyfile(filename, self.cachedDBFile)
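+        # no-op: a BAM-backed dataset reads directly from the bam file, so this and
+        # the other SQLite-era cache/attach helpers below are kept only so that
+        # existing callers do not break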
+ pass
def saveCacheDB(self, filename):
- """ copy geneinfoDB to a local cache.
- """
- shutil.copyfile(self.cachedDBFile, filename)
+ pass
def uncacheDB(self):
- """ delete geneinfoDB from local cache.
- """
- global cachedDBFile
- if self.cachedDBFile != "":
- try:
- os.remove(self.cachedDBFile)
- except:
- print "could not delete %s" % self.cachedDBFile
-
- self.cachedDB = ""
+ pass
def attachDB(self, filename, dbName):
- """ attach another database file to the readDataset.
- """
- stmt = "attach '%s' as %s" % (filename, dbName)
- self.execute(stmt)
+ pass
def detachDB(self, dbName):
- """ detach a database file to the readDataset.
- """
- stmt = "detach %s" % (dbName)
- self.execute(stmt)
+ pass
def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
- """ import into current RDS the table (with columns destcolumns,
- with default all columns) from the database file asname,
- using the column specification of ascolumns (default all).
- """
- stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
- if flagged != "":
- stmt += " where flag = '%s' " % flagged
-
- self.executeCommit(stmt)
+ pass
def getTables(self, dbName=""):
- """ get a list of table names in a particular database file.
- """
- resultList = []
- sql = self.getSqlCursor()
-
- if dbName != "":
- dbName = "%s." % dbName
-
- stmt = "select name from %ssqlite_master where type='table'" % dbName
- sql.execute(stmt)
- results = sql.fetchall()
-
- for row in results:
- resultList.append(row["name"])
-
- return resultList
+ pass
def getSqlCursor(self):
- if self.memBacked:
- sql = self.getMemCursor()
- else:
- sql = self.getFileCursor()
-
- return sql
+ pass
def getMemCursor(self):
- """ returns a cursor to memory database for low-level (SQL)
- access to the data.
- """
- return self.memcon.cursor()
+ pass
def getFileCursor(self):
- """ returns a cursor to file database for low-level (SQL)
- access to the data.
- """
- return self.dbcon.cursor()
+ pass
- def hasIndex(self):
- """ return True if the RDS file has at least one index.
- """
- stmt = "select count(*) from sqlite_master where type='index'"
- count = int(self.execute(stmt, returnResults=True)[0][0])
+ def initializeTables(self, dbConnection, cache=100000):
+ pass
- return count > 0
+
+
+    def getMetadata(self, valueName=""):
+        if valueName == "":
+            return self.metadata
+        return {valueName: self.metadata[valueName]}
- def initializeTables(self, dbConnection, cache=100000):
- """ creates table schema in a database connection, which is
- typically a database file or an in-memory database.
- """
- dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
- dbConnection.execute("create table metadata (name varchar, value varchar)")
- dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
- positionSchema = "start int, stop int"
- tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
- dbConnection.execute("create table uniqs %s" % tableSchema)
- dbConnection.execute("create table multi %s" % tableSchema)
- if self.dataType == "RNA":
- positionSchema = "startL int, stopL int, startR int, stopR int"
- tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
- dbConnection.execute("create table splices %s" % tableSchema)
-
- positionSchema = "startL int, stopL int, startR int, stopR int"
- tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
- dbConnection.execute("create table multisplices %s" % tableSchema)
-
- dbConnection.commit()
-
-
- def getMetadata(self, valueName=""):
- """ returns a dictionary of metadata.
- """
- whereClause = ""
- resultsDict = {}
- if valueName != "":
- whereClause = " where name='%s'" % valueName
+ def getReadSize(self):
+ """ This is following the same model as the original where it assumes that all
+ read have the same readsize and that this is reflected by the size of the
+ first read.
+ """
+ if self.readsize is None:
+ bamFileIterator = self.bamfile.fetch(until_eof=True)
+ for read in bamFileIterator:
+ self.calculateReadsizeFromCigar(read.cigar)
+ break
- sql = self.getSqlCursor()
+ return self.readsize
- sql.execute("select name, value from metadata %s" % whereClause)
- results = sql.fetchall()
- for row in results:
- parameterName = row["name"]
- parameterValue = row["value"]
- if parameterName not in resultsDict:
- resultsDict[parameterName] = parameterValue
- else:
- trying = True
- index = 2
- while trying:
- newName = string.join([parameterName, str(index)], ":")
- if newName not in resultsDict:
- resultsDict[newName] = parameterValue
- trying = False
+ def calculateReadsizeFromCigar(self, cigar):
+ take = (0, 1) # CIGAR operation (M/match, I/insertion)
+ self.readsize = sum([length for op,length in cigar if op in take])
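+        # Example (illustrative): a cigar of [(0, 20), (3, 1000), (0, 16)] (20M1000N16M)
+        # gives a readsize of 36, since only M (0) and I (1) operations carry read bases.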
- index += 1
- return resultsDict
+ def getDefaultCacheSize(self):
+ """ returns 0 as cache is going to be deprecated
+ """
+ return 0
- def getReadSize(self):
- """ returns readsize if defined in metadata.
+ def getChromosomes(self, fullChrom=True):
+ """ returns a sorted list of distinct chromosomes in bam file reference list.
"""
- metadata = self.getMetadata()
- if "readsize" not in metadata:
- raise ReadDatasetError("no readsize parameter defined")
+ if fullChrom:
+ results = list(self.getFullReferenceNames())
else:
- readSize = metadata["readsize"]
- if "import" in readSize:
- readSize = readSize.split()[0]
+ results = list(self.getShortReferenceNames())
- readSize = int(readSize)
- if readSize < 0:
- raise ReadDatasetError("readsize is negative")
+ results.sort()
- return readSize
+ return results
- def getDefaultCacheSize(self):
- """ returns the default cache size.
+ def getFullReferenceNames(self):
+ """ returns a set of bam file reference names.
"""
- return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
+ return set(self.bamfile.references)
- def getChromosomes(self, table="uniqs", fullChrom=True):
- """ returns a sorted list of distinct chromosomes in table.
+ def getShortReferenceNames(self):
+ """ returns a set of bam file reference names after removing first 3 characters.
"""
- statement = "select distinct chrom from %s" % table
- sql = self.getSqlCursor()
-
- sql.execute(statement)
- results = []
- for row in sql:
- if fullChrom:
- if row["chrom"] not in results:
- results.append(row["chrom"])
- else:
- shortName = row["chrom"][3:]
- if len(shortName.strip()) > 0 and shortName not in results:
- results.append(shortName)
-
- results.sort()
+ results = set()
+ references = self.bamfile.references
+ for chromosome in references:
+ shortName = chromosome[3:]
+ if len(shortName.strip()) > 0:
+ results.add(shortName)
return results
""" returns the maximum coordinate for reads on a given chromosome.
"""
maxCoord = 0
+ coordFinder = MaxCoordFinder()
+ self.bamfile.fetch(reference=chrom, callback=coordFinder)
if doUniqs:
- maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
+ maxCoord = coordFinder.maxUniqueStart
if doSplices:
- spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
- maxCoord = max(spliceMax, maxCoord)
+ maxCoord = max(coordFinder.maxSpliceStart, maxCoord)
if doMulti:
- multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
- maxCoord = max(multiMax, maxCoord)
-
- return maxCoord
-
-
- def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
- maxCoord = 0
- sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
- sql = self.getSqlCursor()
- try:
- sql.execute(sqlStatement)
- maxCoord = int(sql.fetchall()[0][0])
- except:
- print "couldn't retrieve coordMax for chromosome %s" % chrom
+ maxCoord = max(coordFinder.maxMultiStart, maxCoord)
return maxCoord
def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
- readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
+ readIDDict=False, readLike="", start=None, stop=None, limit=-1, hasMismatch=False,
flagLike=False, strand='', combine5p=False):
- """ returns a dictionary of reads in a variety of formats
- and which can be restricted by chromosome or custom-flag.
- Returns unique reads by default, but can return multireads
- with doMulti set to True.
-
+ """ First pass of rewrite
+ 1) Leave as much in place as possible
+ 2) For now treat multis just like uniqs
+ """
"""
- #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
-
whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
if findallOptimize:
selectQuery = "select start, sense, sum(weight)"
sql = self.getSqlCursor()
sqlQuery = string.join(stmt)
sql.execute(sqlQuery)
+ """
+        # BAM-based replacement starts here; the string literal above intentionally
+        # disables the old SQL-based implementation
+ bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
resultsDict = {}
- if findallOptimize:
- resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
- if self.memBacked:
- self.memcon.row_factory = sqlite.Row
- else:
- self.dbcon.row_factory = sqlite.Row
- else:
- currentChrom = ""
- currentReadID = ""
- pairID = 0
- for row in sql:
- readID = row["readID"]
- if fullChrom:
- chrom = row["chrom"]
- else:
- chrom = row["chrom"][3:]
-
- if not readIDDict and chrom != currentChrom:
- resultsDict[chrom] = []
- currentChrom = chrom
- dictKey = chrom
- elif readIDDict:
- theReadID = readID
- if "::" in readID:
- theReadID = readID.split("::")[0]
-
- if "/" in theReadID and withPairID:
- (theReadID, pairID) = readID.split("/")
-
- if theReadID != currentReadID:
- resultsDict[theReadID] = []
- currentReadID = theReadID
- dictKey = theReadID
-
- newrow = {"start": int(row["start"])}
- if bothEnds:
- newrow["stop"] = int(row["stop"])
-
- if not noSense:
- newrow["sense"] = row["sense"]
-
- if withWeight:
- newrow["weight"] = float(row["weight"])
-
- if withFlag:
- newrow["flag"] = row["flag"]
-
- if withMismatch:
- newrow["mismatch"] = row["mismatch"]
-
- if withID:
- newrow["readID"] = readID
-
- if withChrom:
- newrow["chrom"] = chrom
-
- if withPairID:
- newrow["pairID"] = pairID
-
- resultsDict[dictKey].append(newrow)
-
- return resultsDict
-
-
- def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
- if splice:
- startText = "startL"
- stopText = "stopR"
- else:
- startText = "start"
- stopText = "stop"
+ if findallOptimize and chrom != "":
+ readIDDict = False
+ bothEnds = False
+ noSense = False
+ withWeight = True
+ withFlag = False
+ withMismatch = False
+ withID = False
+ withChrom = False
+ withPairID = False
+ fullChrom = True
+ #resultsDict[chrom] = [{"start": int(read.pos), "sense": getReadSense(read.is_reverse), "weight": 1.0} for read in bamFileIterator if not isSpliceEntry(read.cigar)]
+
+ pairID = 0
+ for read in bamFileIterator:
+ pairReadSuffix = getPairedReadNumberSuffix(read)
+ readID = "%s%s" % (read.qname, pairReadSuffix)
+ if isSpliceEntry(read.cigar) or self.readDoesNotMeetCriteria(read, readID, doMulti=doMulti):
+ continue
- whereClause = []
- if chrom != "" and chrom != self.memChrom:
- whereClause.append("chrom = '%s'" % chrom)
+ sense = getReadSense(read.is_reverse)
+ if fullChrom:
+ chrom = self.bamfile.getrname(read.rname)
+ else:
+ chrom = self.bamfile.getrname(read.rname)[3:]
- if flag != "":
- if flagLike:
- flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
- whereClause.append(flagLikeClause)
+ if readIDDict:
+ dictKey = read.qname
else:
- whereClause.append("flag = '%s'" % flag)
+ dictKey = chrom
- if start > -1:
- whereClause.append("%s > %d" % (startText, start))
+ newrow = {"start": int(read.pos)}
+ if bothEnds:
+ if self.readsize is None:
+ self.calculateReadsizeFromCigar(read.cigar)
- if stop > -1:
- whereClause.append("%s < %d" % (stopText, stop))
+ newrow["stop"] = int(read.pos + self.readsize)
- if len(readLike) > 0:
- readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
- whereClause.append(readIDClause)
+ if not noSense:
+ newrow["sense"] = sense
- if hasMismatch:
- whereClause.append("mismatch != ''")
+ if withWeight:
+ try:
+ newrow["weight"] = 1.0/self.multiReadIDCounts[readID]
+ except KeyError:
+ newrow["weight"] = 1.0
- if strand in ["+", "-"]:
- whereClause.append("sense = '%s'" % strand)
+ if withFlag:
+ #newrow["flag"] = row["flag"]
+ pass
- if len(whereClause) > 0:
- whereStatement = string.join(whereClause, " and ")
- whereQuery = "where %s" % whereStatement
- else:
- whereQuery = ""
+ if withMismatch:
+ try:
+ mismatchTag = read.opt("MD")
+ mismatches = getMismatches(mismatchTag, read.seq, sense)
+ except KeyError:
+ mismatches = ""
+
+ newrow["mismatch"] = mismatches
- return whereQuery
+ if withID:
+ newrow["readID"] = readID
+ if withChrom:
+ newrow["chrom"] = chrom
- def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
+ if withPairID:
+ newrow["pairID"] = pairReadSuffix[1:]
- selectClause = [baseSelect]
- if bothEnds:
- selectClause.append("stop")
+ try:
+ resultsDict[dictKey].append(newrow)
+ except KeyError:
+ resultsDict[dictKey] = [newrow]
- if not noSense:
- selectClause.append("sense")
+ return resultsDict
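+    # Usage sketch (illustrative): per-chromosome read dictionaries in the shape
+    # consumed by the mismatch-gathering code below:
+    #
+    #   hitDict = rds.getReadsDict(fullChrom=True, chrom="chr1", withWeight=True)
+    #   for entry in hitDict.get("chr1", []):
+    #       print entry["start"], entry["sense"], entry["weight"]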
- if withWeight:
- selectClause.append("weight")
- if withFlag:
- selectClause.append("flag")
+ def readDoesNotMeetCriteria(self, read, readID, doMulti=False, flag="", readLike="", hasMismatch=False, strand=""):
+ readDoesNotMeetCriteria = self.rejectMultiReadEntry(readID, doMulti)
- if withMismatch:
- selectClause.append("mismatch")
+ if flag != "":
+ #TODO: need to pick a tag code for the erange flag - using ZF for now
+ # although it looks like the flag is really just a label on a region.
+ # since BAM can retrieve based on location if we store the regions then
+ # we can just retrieve it directly from the BAM and we do not need to
+ # flag the individual reads. The location will be sufficient.
+ try:
+ if flag != read.opt("ZF"):
+ readDoesNotMeetCriteria = True
+ except KeyError:
+ readDoesNotMeetCriteria = True
- selectQuery = string.join(selectClause, ",")
- return selectQuery
+ if hasMismatch:
+ try:
+ mismatchTag = read.opt("MD")
+ except KeyError:
+ readDoesNotMeetCriteria = True
+ if strand in ["+", "-"] and getReadSense(read.is_reverse) != strand:
+ readDoesNotMeetCriteria = True
- def getReadGroupQuery(self, findallOptimize, limit, combine5p):
- groupBy = []
- if findallOptimize:
- groupBy = ["GROUP BY start, sense"]
+ return readDoesNotMeetCriteria
- if limit > 0 and not combine5p:
- groupBy.append("LIMIT %d" % limit)
- groupQuery = string.join(groupBy)
+ def rejectMultiReadEntry(self, readID, doMulti=False, threshold=10):
+ rejectRead = False
+        if readID in self.multiReadIDCounts:
+ if self.multiReadIDCounts[readID] > threshold or not doMulti:
+ rejectRead = True
- return groupQuery
+ return rejectRead
def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
flag="", withWeight=False, withFlag=False, withMismatch=False,
withID=False, withChrom=False, withPairID=False, readIDDict=False,
- splitRead=False, hasMismatch=False, flagLike=False, start=-1,
- stop=-1, strand=""):
- """ returns a dictionary of spliced reads in a variety of
- formats and which can be restricted by chromosome or custom-flag.
- Returns unique spliced reads for now.
+ splitRead=False, hasMismatch=False, flagLike=False, start=None,
+ stop=None, strand=""):
+ """ First pass of rewrite
+ 1) Leave as much in place as possible
+ 2) For now treat multis just like regular splices
+ """
"""
whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
sql.execute(stmt)
- currentReadID = ""
- currentChrom = ""
+ """
+
+        # BAM-based replacement starts here; the string literal above intentionally
+        # disables the old SQL-based implementation
+ bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
resultsDict = {}
- for row in sql:
- pairID = 0
- readID = row["readID"]
+ pairID = 0
+ for read in bamFileIterator:
+ if not isSpliceEntry(read.cigar):
+ continue
+
+ sense = getReadSense(read.is_reverse)
+ pairReadSuffix = getPairedReadNumberSuffix(read)
+ readID = "%s%s" % (read.qname, pairReadSuffix)
if fullChrom:
- chrom = row["chrom"]
+ chrom = self.bamfile.getrname(read.rname)
else:
- chrom = row["chrom"][3:]
+ chrom = self.bamfile.getrname(read.rname)[3:]
- if not readIDDict and chrom != currentChrom:
- resultsDict[chrom] = []
- currentChrom = chrom
+ if readIDDict:
+ dictKey = read.qname
+ else:
dictKey = chrom
- elif readIDDict:
- if "/" in readID:
- (theReadID, pairID) = readID.split("/")
- else:
- theReadID = readID
-
- if theReadID != currentReadID:
- resultsDict[theReadID] = []
- currentReadID = theReadID
- dictKey = theReadID
-
- newrow = {"startL": int(row["startL"])}
- newrow["stopL"] = int(row["stopL"])
- newrow["startR"] = int(row["startR"])
- newrow["stopR"] = int(row["stopR"])
+
+ if self.readsize is None:
+ self.calculateReadsizeFromCigar(read.cigar)
+
+ startL, startR, stopL, stopR = getSpliceBounds(read.pos, self.readsize, read.cigar)
+ newrow = {"startL": int(startL)}
+ newrow["stopL"] = int(stopL)
+ newrow["startR"] = int(startR)
+ newrow["stopR"] = int(stopR)
if not noSense:
- newrow["sense"] = row["sense"]
+ newrow["sense"] = sense
if withWeight:
- newrow["weight"] = float(row["weight"])
+ newrow["weight"] = 1.0
if withFlag:
- newrow["flag"] = row["flag"]
+ #newrow["flag"] = row["flag"]
+ pass
if withMismatch:
- newrow["mismatch"] = row["mismatch"]
+ try:
+ mismatchTag = read.opt("MD")
+ mismatches = getMismatches(mismatchTag, read.seq, sense)
+ except KeyError:
+ mismatches = ""
+
+ newrow["mismatch"] = mismatches
if withID:
newrow["readID"] = readID
rightDict = newrow
del rightDict["startL"]
del rightDict["stopL"]
- resultsDict[dictKey].append(leftDict)
+ try:
+ resultsDict[dictKey].append(leftDict)
+ except KeyError:
+ resultsDict[dictKey] = [leftDict]
+
resultsDict[dictKey].append(rightDict)
else:
- resultsDict[dictKey].append(newrow)
+ try:
+ resultsDict[dictKey].append(newrow)
+ except KeyError:
+ resultsDict[dictKey] = [newrow]
return resultsDict
- def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
+ def getFullReadCount(self):
+ fullReadCount = {"uniq": 0,
+ "multi": 0,
+ "splice": 0
+ }
+
+ for chromosome in self.getChromosomes():
+ uniqCount, multiCount, spliceCount = self.getCounts(chrom=chromosome, reportCombined=False, multi=True, splices=True)
+ fullReadCount["uniq"] += uniqCount
+ fullReadCount["multi"] += multiCount
+ fullReadCount["splice"] += spliceCount
+
+ return fullReadCount
+
+
+ def getCounts(self, chrom=None, rmin=None, rmax=None, uniqs=True, multi=False,
splices=False, reportCombined=True, sense="both"):
""" return read counts for a given region.
"""
ucount = 0
mcount = 0
scount = 0
- restrict = ""
- if sense in ["+", "-"]:
- restrict = " sense ='%s' " % sense
+ readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=sense)
if uniqs:
- try:
- ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
- except:
- ucount = 0
+ ucount = readCounter.uniqReadCount
if multi:
- try:
- mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
- except:
- mcount = 0
+ mcount = readCounter.multiReadCount
if splices:
- try:
- scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
- except:
- scount = 0
+ scount = readCounter.spliceReadCount
if reportCombined:
total = ucount + mcount + scount
return (ucount, mcount, scount)
- def getTotalCounts(self, chrom="", rmin="", rmax=""):
- """ return read counts for a given region.
- """
- return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
-
-
- def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
- """ returns the number of row in the specified table.
- """
- whereClause = []
- count = 0
-
- if chrom !="" and chrom != self.memChrom:
- whereClause = ["chrom='%s'" % chrom]
-
- if rmin != "":
- whereClause.append("%s >= %s" % (startField, str(rmin)))
-
- if rmax != "":
- whereClause.append("%s <= %s" % (startField, str(rmax)))
+ def getSplicesCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
+ readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
- if restrict != "":
- whereClause.append(restrict)
+ return readCounter.spliceReadCount
- if len(whereClause) > 0:
- whereStatement = string.join(whereClause, " and ")
- whereQuery = "where %s" % whereStatement
- else:
- whereQuery = ""
- sql = self.getSqlCursor()
+ def getUniqsCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
+ readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
- if distinct:
- sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
- else:
- sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
+ return readCounter.uniqReadCount
- result = sql.fetchone()
-
- try:
- count = int(result[0])
- except:
- count = 0
-
- return count
+ def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+ readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
- def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
- """ returns the number of row in the splices table.
- """
- # TODO: if the rds type is DNA should this just return zero?
- return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
+ return readCounter.multiReadCount
- def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
- """ returns the number of distinct readIDs in the uniqs table.
- """
- return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
+ def getBamReadCounter(self, chrom="", rmin=None, rmax=None, sense="both"):
+ readCounter = ReadCounter(self.multiReadIDCounts, sense)
+ self.bamfile.fetch(reference=chrom, start=rmin, end=rmax, callback=readCounter)
-
- def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
- """ returns the total weight of readIDs in the multi table.
- """
- return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
+ return readCounter
+ #TODO: Primary
def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
""" get readID's.
"""
hitChromList.sort()
snpDict = {}
- for achrom in hitChromList:
+ for chromosome in hitChromList:
if verbose:
- print "getting mismatches from chromosome %s" % (achrom)
+ print "getting mismatches from chromosome %s" % (chromosome)
- snpDict[achrom] = []
+ snpDict[chromosome] = []
if useSplices and self.dataType == "RNA":
- spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
+ spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withMismatch=True, readIDDict=True, hasMismatch=True)
spliceIDList = spliceDict.keys()
for spliceID in spliceIDList:
spliceEntry = spliceDict[spliceID][0]
secondhalf = change_pos - firsthalf
change_at = rightstart + secondhalf
- snpDict[achrom].append([startpos, change_at, change_base, change_from])
+ snpDict[chromosome].append([startpos, change_at, change_base, change_from])
- hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
- if achrom not in hitDict.keys():
+ hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withMismatch=True, hasMismatch=True)
+ if chromosome not in hitDict.keys():
continue
- for readEntry in hitDict[achrom]:
+ for readEntry in hitDict[chromosome]:
start = readEntry["start"]
sense = readEntry["sense"]
mismatches = readEntry["mismatch"]
change_pos = readlen - int(mismatch[1:change_len-1]) + 1
change_at = start + change_pos - 1
- snpDict[achrom].append([start, change_at, change_base, change_from])
+ snpDict[chromosome].append([start, change_at, change_base, change_from])
return snpDict
spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
else:
spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
-
+
if chromosome in spliceDict:
for spliceEntry in spliceDict[chromosome]:
Lstart = spliceEntry["startL"]
def insertMetadata(self, valuesList):
- """ inserts a list of (pname, pvalue) into the metadata
- table.
- """
- self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
- self.dbcon.commit()
+ for (pname, pvalue) in valuesList:
+ self.metadata[pname] = pvalue
def updateMetadata(self, pname, newValue, originalValue=""):
- """ update a metadata field given the original value and the new value.
- """
- stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
if originalValue != "":
- stmt += " and value='%s' " % str(originalValue)
-
- self.dbcon.execute(stmt)
- self.dbcon.commit()
-
-
- def insertUniqs(self, valuesList):
- """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
- into the uniqs table.
- """
- self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
- self.dbcon.commit()
-
-
- def insertMulti(self, valuesList):
- """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
- into the multi table.
- """
- self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
- self.dbcon.commit()
-
-
- def insertSplices(self, valuesList):
- """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
- into the splices table.
- """
- self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
- self.dbcon.commit()
-
-
- def insertMultisplices(self, valuesList):
- """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
- into the multisplices table.
- """
- self.dbcon.executemany("insert into multisplices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
- self.dbcon.commit()
+            if self.metadata[pname] == originalValue:
+                self.metadata[pname] = newValue
+        else:
+            self.metadata[pname] = newValue
+ #TODO: Primary
def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
""" update reads on file database in a list region of regions for a chromosome to have a new flag.
regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
self.dbcon.commit()
+ #TODO: Primary
def setFlags(self, flag, uniqs=True, multi=True, splices=True):
""" set the flag fields in the entire dataset.
"""
if uniqs:
- self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
+ self.setFlagsInDB("uniqs", flag)
if multi:
- self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
+ self.setFlagsInDB("multi", flag)
if self.dataType == "RNA" and splices:
- self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
+ self.setFlagsInDB("splices", flag)
self.dbcon.commit()
+ #TODO: Secondary to setFlags()
+ def setFlagsInDB(self, table, flag):
+ """ set the flag field for every entry in a table.
+ """
+ self.dbcon.execute("UPDATE %s SET flag = '%s'" % (table, flag))
+
+
+ #TODO: Primary
def resetFlags(self, uniqs=True, multi=True, splices=True):
""" reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
"""
self.setFlags("", uniqs, multi, splices)
+ #TODO: Primary
def reweighMultireads(self, readList):
self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
def setSynchronousPragma(self, value="ON"):
- try:
- self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
- except:
- print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
+ pass
def setDBcache(self, cache, default=False):
- self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
- if default:
- self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
+ pass
def execute(self, statement, returnResults=False):
- sql = self.getSqlCursor()
-
- sql.execute(statement)
- if returnResults:
- result = sql.fetchall()
- return result
+ pass
def executeCommit(self, statement):
- self.execute(statement)
-
- if self.memBacked:
- self.memcon.commit()
- else:
- self.dbcon.commit()
+ pass
def buildIndex(self, cache=100000):
- """ Builds the file indeces for the main tables.
- Cache is the number of 1.5 kb pages to keep in memory.
- 100000 pages translates into 150MB of RAM, which is our default.
- """
- if cache > self.getDefaultCacheSize():
- self.setDBcache(cache)
- self.setSynchronousPragma("OFF")
- self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
- print "built uPosIndex"
- self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
- print "built uChromIndex"
- self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
- print "built mPosIndex"
- self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
- print "built mChromIndex"
-
- if self.dataType == "RNA":
- self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
- print "built sPosIndex"
- self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
- print "built sPosIndex2"
- self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
- print "built sChromIndex"
-
- self.dbcon.commit()
- self.setSynchronousPragma("ON")
+ pass
def dropIndex(self):
- """ drops the file indices for the main tables.
- """
- try:
- self.setSynchronousPragma("OFF")
- self.dbcon.execute("DROP INDEX uPosIndex")
- self.dbcon.execute("DROP INDEX uChromIndex")
- self.dbcon.execute("DROP INDEX mPosIndex")
- self.dbcon.execute("DROP INDEX mChromIndex")
+ pass
- if self.dataType == "RNA":
- self.dbcon.execute("DROP INDEX sPosIndex")
- try:
- self.dbcon.execute("DROP INDEX sPosIndex2")
- except:
- pass
- self.dbcon.execute("DROP INDEX sChromIndex")
+ def memSync(self, chrom="", index=False):
+ pass
+
- self.dbcon.commit()
- except:
- print "problem dropping index"
+ def copyDBEntriesToMemory(self, dbName, whereClause=""):
+ pass
- self.setSynchronousPragma("ON")
+ def copySpliceDBEntriesToMemory(self, whereClause=""):
+ pass
- def memSync(self, chrom="", index=False):
- """ makes a copy of the dataset into memory for faster access.
- Can be restricted to a "full" chromosome. Can also build the
- memory indices.
- """
- self.memcon = ""
- self.memcon = sqlite.connect(":memory:")
- self.initializeTables(self.memcon)
- cursor = self.dbcon.cursor()
- whereclause = ""
- if chrom != "":
- print "memSync %s" % chrom
- whereclause = " where chrom = '%s' " % chrom
- self.memChrom = chrom
+
+def getReadSense(reverse):
+ if reverse:
+ sense = "-"
+ else:
+ sense = "+"
+
+ return sense
+
+
+def getSpliceRightStart(start, cigarTupleList):
+ offset = 0
+
+ for operation,length in cigarTupleList:
+ if operation == 3:
+ stopL = int(start + offset)
+ startR = int(stopL + length)
+
+ return startR
else:
- self.memChrom = ""
-
- self.memcon.execute("PRAGMA temp_store = MEMORY")
- self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
- # copy metadata to memory
- self.memcon.execute("delete from metadata")
- results = cursor.execute("select name, value from metadata")
- results2 = []
- for row in results:
- results2.append((row["name"], row["value"]))
-
- self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
-
- self.copyDBEntriesToMemory("uniqs", whereclause)
- self.copyDBEntriesToMemory("multi", whereclause)
- if self.dataType == "RNA":
- self.copySpliceDBEntriesToMemory(whereclause)
-
- if index:
- if chrom != "":
- self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
- self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
- if self.dataType == "RNA":
- self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
- self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
- else:
- self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
- self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
- if self.dataType == "RNA":
- self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
- self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
+ offset += length
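+
+
+# Example (illustrative): getSpliceRightStart(100, [(0, 20), (3, 500), (0, 16)])
+# (20M500N16M) returns 620: 100 + 20 aligned bases + the 500-base N gap.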
- self.memBacked = True
- self.memcon.row_factory = sqlite.Row
- self.memcon.commit()
+def getSpliceBounds(start, readsize, cigarTupleList):
+ offset = 0
+ #TODO: check operations here - we want to make sure we are adding the correct ones
+ for operation,length in cigarTupleList:
+ if operation == 3:
+ stopL = int(start + offset)
+ startR = int(stopL + length)
+ offset = 0
+ else:
+ offset += length
- def copyDBEntriesToMemory(self, dbName, whereClause=""):
- cursor = self.dbcon.cursor()
- sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
- destinationEntries = []
- for row in sourceEntries:
- destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
+ stopR = startR + offset
- self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
+ return start, startR, stopL, stopR
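+
+
+# Worked example (illustrative): getSpliceBounds(100, 36, [(0, 20), (3, 500), (0, 16)])
+# returns (100, 620, 120, 636): the left piece spans 100-120, the N gap is 500 bases,
+# and the right piece spans 620-636; the return order is (start, startR, stopL, stopR).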
- def copySpliceDBEntriesToMemory(self, whereClause=""):
- cursor = self.dbcon.cursor()
- sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
- destinationEntries = []
- for row in sourceEntries:
- destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
- row["weight"], row["flag"], row["mismatch"]))
+def getPairedReadNumberSuffix(read):
+ readSuffix = ""
+ if not isPairedRead(read):
+ return ""
+
+ if read.is_read1:
+ readSuffix = "/1"
+ elif read.is_read2:
+ readSuffix = "/2"
+
+ return readSuffix
+
+
+def isPairedRead(read):
+ return read.is_proper_pair and (read.is_read1 or read.is_read2)
+
+
+def getMismatches(mismatchTag, querySequence="", sense="+"):
+ output = []
+ deletionMarker = "^"
+ position = 0
+
+ lengths = re.findall("\d+", mismatchTag)
+ mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag)
+
+ for mismatchEntry in range(len(mismatchSequences)):
+ mismatch = mismatchSequences[mismatchEntry]
+ position = position + int(lengths[mismatchEntry])
+ if string.find(mismatch, deletionMarker) == 0:
+ continue
+
+ try:
+ if querySequence:
+ genomicNucleotide = querySequence[position]
+ else:
+ genomicNucleotide = "N"
+
+ if sense == "-":
+ mismatch = getReverseComplement(mismatch)
+ genomicNucleotide = getReverseComplement(genomicNucleotide)
- self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)
+ erange1BasedElandCompatiblePosition = int(position + 1)
+ output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
+ position += 1
+ except IndexError:
+ return ""
+ return string.join(output, ",")
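+
+
+# Example (illustrative): getMismatches("10A5", querySequence="AAAAAAAAAAGAAAAA")
+# returns "A11G": the MD tag reports reference base A at (1-based) read position 11,
+# where this read carries a G.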