-"""
-Created on Jul 1, 2010
-
-@author: sau
-"""
-
import sqlite3 as sqlite
import string
import tempfile
import shutil
import os
-from os import environ
from array import array
-from commoncode import getReverseComplement
+from commoncode import getReverseComplement, getConfigParser, getConfigOption
-if environ.get("CISTEMATIC_TEMP"):
- cisTemp = environ.get("CISTEMATIC_TEMP")
-else:
- cisTemp = "/tmp"
-
-tempfile.tempdir = cisTemp
-currentRDSVersion = "1.1"
+currentRDSVersion = "2.1"
class ReadDatasetError(Exception):
self.memCursor = ""
self.cachedDBFile = ""
+ if initialize and datasetType not in ["DNA", "RNA"]:
+ raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
+
if cache:
if verbose:
print "caching ...."
self.dbcon.row_factory = sqlite.Row
self.dbcon.execute("PRAGMA temp_store = MEMORY")
if initialize:
- if datasetType not in ["DNA", "RNA"]:
- raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
- else:
- self.dataType = datasetType
-
+ self.dataType = datasetType
self.initializeTables(self.dbcon)
else:
metadata = self.getMetadata("dataType")
self.rdsVersion = "pre-1.0"
if verbose:
- if initialize:
- print "INITIALIZED dataset %s" % datafile
- else:
- print "dataset %s" % datafile
-
- metadata = self.getMetadata()
- print "metadata:"
- pnameList = metadata.keys()
- pnameList.sort()
- for pname in pnameList:
- print "\t" + pname + "\t" + metadata[pname]
-
- if reportCount:
- ucount = self.getUniqsCount()
- mcount = self.getMultiCount()
- if self.dataType == "DNA" and not initialize:
- try:
- print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
- except ValueError:
- print "\n%s unique reads and %s multireads" % (ucount, mcount)
- elif self.dataType == "RNA" and not initialize:
- scount = self.getSplicesCount()
- try:
- print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
- except ValueError:
- print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
-
- print "default cache size is %d pages" % self.getDefaultCacheSize()
- if self.hasIndex():
- print "found index"
- else:
- print "not indexed"
+ self.printRDSInfo(datafile, reportCount, initialize)
def __len__(self):
self.uncacheDB()
+ def printRDSInfo(self, datafile, reportCount, initialize):
+ if initialize:
+ print "INITIALIZED dataset %s" % datafile
+ else:
+ print "dataset %s" % datafile
+
+ metadata = self.getMetadata()
+ print "metadata:"
+ pnameList = metadata.keys()
+ pnameList.sort()
+ for pname in pnameList:
+ print "\t" + pname + "\t" + metadata[pname]
+
+ if reportCount and not initialize:
+ self.printReadCounts()
+
+ print "default cache size is %d pages" % self.getDefaultCacheSize()
+ if self.hasIndex():
+ print "found index"
+ else:
+ print "not indexed"
+
+
+ def printReadCounts(self):
+ ucount = self.getUniqsCount()
+ mcount = self.getMultiCount()
+ if self.dataType == "DNA":
+ print "\n%d unique reads and %d multireads" % (ucount, mcount)
+ elif self.dataType == "RNA":
+ scount = self.getSplicesCount()
+ print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount)
+
+
def cacheDB(self, filename):
""" copy geneinfoDB to a local cache.
"""
+ configParser = getConfigParser()
+ cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
+ tempfile.tempdir = cisTemp
self.cachedDBFile = "%s.db" % tempfile.mktemp()
shutil.copyfile(filename, self.cachedDBFile)
self.cachedDB = ""
- def attachDB(self, filename, asname):
+ def attachDB(self, filename, dbName):
""" attach another database file to the readDataset.
"""
- stmt = "attach '%s' as %s" % (filename, asname)
+ stmt = "attach '%s' as %s" % (filename, dbName)
self.execute(stmt)
- def detachDB(self, asname):
+ def detachDB(self, dbName):
""" detach a database file to the readDataset.
"""
- stmt = "detach %s" % (asname)
+ stmt = "detach %s" % (dbName)
self.execute(stmt)
- def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
+ def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
""" import into current RDS the table (with columns destcolumns,
with default all columns) from the database file asname,
using the column specification of ascolumns (default all).
"""
- stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
+ stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
if flagged != "":
stmt += " where flag = '%s' " % flagged
self.executeCommit(stmt)
- def getTables(self, asname=""):
+ def getTables(self, dbName=""):
""" get a list of table names in a particular database file.
"""
resultList = []
sql = self.getSqlCursor()
- if asname != "":
- asname += "."
+ if dbName != "":
+ dbName = "%s." % dbName
- stmt = "select name from %ssqlite_master where type='table'" % asname
+ stmt = "select name from %ssqlite_master where type='table'" % dbName
sql.execute(stmt)
results = sql.fetchall()
return sql
+ def getMemCursor(self):
+ """ returns a cursor to memory database for low-level (SQL)
+ access to the data.
+ """
+ return self.memcon.cursor()
+
+
+ def getFileCursor(self):
+ """ returns a cursor to file database for low-level (SQL)
+ access to the data.
+ """
+ return self.dbcon.cursor()
+
+
def hasIndex(self):
- """ check whether the RDS file has at least one index.
+ """ return True if the RDS file has at least one index.
"""
stmt = "select count(*) from sqlite_master where type='index'"
count = int(self.execute(stmt, returnResults=True)[0][0])
- if count > 0:
- return True
- return False
+ return count > 0
def initializeTables(self, dbConnection, cache=100000):
tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
dbConnection.execute("create table splices %s" % tableSchema)
- dbConnection.commit()
-
-
- def getFileCursor(self):
- """ returns a cursor to file database for low-level (SQL)
- access to the data.
- """
- return self.dbcon.cursor()
-
+ positionSchema = "startL int, stopL int, startR int, stopR int"
+ tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
+ dbConnection.execute("create table multisplices %s" % tableSchema)
- def getMemCursor(self):
- """ returns a cursor to memory database for low-level (SQL)
- access to the data.
- """
- return self.memcon.cursor()
+ dbConnection.commit()
def getMetadata(self, valueName=""):
if "readsize" not in metadata:
raise ReadDatasetError("no readsize parameter defined")
else:
- mysize = metadata["readsize"]
- if "import" in mysize:
- mysize = mysize.split()[0]
+ readSize = metadata["readsize"]
+ if "import" in readSize:
+ readSize = readSize.split()[0]
- return int(mysize)
+ readSize = int(readSize)
+ if readSize < 0:
+ raise ReadDatasetError("readsize is negative")
+
+ return readSize
def getDefaultCacheSize(self):
def getChromosomes(self, table="uniqs", fullChrom=True):
- """ returns a list of distinct chromosomes in table.
+ """ returns a sorted list of distinct chromosomes in table.
"""
statement = "select distinct chrom from %s" % table
sql = self.getSqlCursor()
if row["chrom"] not in results:
results.append(row["chrom"])
else:
- if len(row["chrom"][3:].strip()) < 1:
- continue
-
- if row["chrom"][3:] not in results:
- results.append(row["chrom"][3:])
+ shortName = row["chrom"][3:]
+ if len(shortName.strip()) > 0 and shortName not in results:
+ results.append(shortName)
results.sort()
return results
- def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
+ def getMaxCoordinate(self, chrom, doUniqs=True,
doMulti=False, doSplices=False):
""" returns the maximum coordinate for reads on a given chromosome.
"""
maxCoord = 0
- sql = self.getSqlCursor()
if doUniqs:
- try:
- sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
- maxCoord = int(sql.fetchall()[0][0])
- except:
- print "couldn't retrieve coordMax for chromosome %s" % chrom
+ maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
if doSplices:
- sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
- try:
- spliceMax = int(sql.fetchall()[0][0])
- if spliceMax > maxCoord:
- maxCoord = spliceMax
- except:
- pass
+ spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
+ maxCoord = max(spliceMax, maxCoord)
if doMulti:
- sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
- try:
- multiMax = int(sql.fetchall()[0][0])
- if multiMax > maxCoord:
- maxCoord = multiMax
- except:
- pass
+ multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
+ maxCoord = max(multiMax, maxCoord)
- if verbose:
- print "%s maxCoord: %d" % (chrom, maxCoord)
+ return maxCoord
+
+
+ def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
+ maxCoord = 0
+ sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
+ sql = self.getSqlCursor()
+ try:
+ sql.execute(sqlStatement)
+ maxCoord = int(sql.fetchall()[0][0])
+ except:
+ print "couldn't retrieve coordMax for chromosome %s" % chrom
return maxCoord
and which can be restricted by chromosome or custom-flag.
Returns unique reads by default, but can return multireads
with doMulti set to True.
- """
- whereClause = []
- resultsDict = {}
- if chrom != "" and chrom != self.memChrom:
- whereClause.append("chrom = '%s'" % chrom)
-
- if flag != "":
- if flagLike:
- flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
- whereClause.append(flagLikeClause)
- else:
- whereClause.append("flag = '%s'" % flag)
-
- if start > -1:
- whereClause.append("start > %d" % start)
-
- if stop > -1:
- whereClause.append("stop < %d" % stop)
-
- if len(readLike) > 0:
- readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
- whereClause.append(readIDClause)
-
- if hasMismatch:
- whereClause.append("mismatch != ''")
-
- if strand in ["+", "-"]:
- whereClause.append("sense = '%s'" % strand)
-
- if len(whereClause) > 0:
- whereStatement = string.join(whereClause, " and ")
- whereQuery = "where %s" % whereStatement
- else:
- whereQuery = ""
+ """
+ #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
- groupBy = []
+ whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
if findallOptimize:
- selectClause = ["select start, sense, sum(weight)"]
- groupBy = ["GROUP BY start, sense"]
+ selectQuery = "select start, sense, sum(weight)"
else:
- selectClause = ["select ID, chrom, start, readID"]
- if bothEnds:
- selectClause.append("stop")
+ selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
- if not noSense:
- selectClause.append("sense")
-
- if withWeight:
- selectClause.append("weight")
-
- if withFlag:
- selectClause.append("flag")
-
- if withMismatch:
- selectClause.append("mismatch")
-
- if limit > 0 and not combine5p:
- groupBy.append("LIMIT %d" % limit)
-
- selectQuery = string.join(selectClause, ",")
- groupQuery = string.join(groupBy)
+ groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
if doUniqs:
stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
if doMulti:
if findallOptimize:
if self.memBacked:
self.memcon.row_factory = None
- sql = self.memcon.cursor()
else:
self.dbcon.row_factory = None
- sql = self.dbcon.cursor()
stmt.append("order by start")
elif readIDDict:
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
stmt.append("order by readID, start")
else:
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
stmt.append("order by chrom, start")
+ sql = self.getSqlCursor()
sqlQuery = string.join(stmt)
sql.execute(sqlQuery)
+ resultsDict = {}
if findallOptimize:
resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
if self.memBacked:
return resultsDict
- def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
- flag="", withWeight=False, withFlag=False, withMismatch=False,
- withID=False, withChrom=False, withPairID=False, readIDDict=False,
- splitRead=False, hasMismatch=False, flagLike=False, start=-1,
- stop=-1, strand=""):
- """ returns a dictionary of spliced reads in a variety of
- formats and which can be restricted by chromosome or custom-flag.
- Returns unique spliced reads for now.
- """
- whereClause = []
- resultsDict = {}
+ def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
+ if splice:
+ startText = "startL"
+ stopText = "stopR"
+ else:
+ startText = "start"
+ stopText = "stop"
+ whereClause = []
if chrom != "" and chrom != self.memChrom:
- whereClause = ["chrom = '%s'" % chrom]
+ whereClause.append("chrom = '%s'" % chrom)
if flag != "":
if flagLike:
else:
whereClause.append("flag = '%s'" % flag)
+ if start > -1:
+ whereClause.append("%s > %d" % (startText, start))
+
+ if stop > -1:
+ whereClause.append("%s < %d" % (stopText, stop))
+
+ if len(readLike) > 0:
+ readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
+ whereClause.append(readIDClause)
+
if hasMismatch:
whereClause.append("mismatch != ''")
- if strand != "":
+ if strand in ["+", "-"]:
whereClause.append("sense = '%s'" % strand)
- if start > -1:
- whereClause.append("startL > %d" % start)
-
- if stop > -1:
- whereClause.append("stopR < %d" % stop)
-
if len(whereClause) > 0:
whereStatement = string.join(whereClause, " and ")
whereQuery = "where %s" % whereStatement
else:
whereQuery = ""
- selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
+ return whereQuery
+
+
+ def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
+
+ selectClause = [baseSelect]
+ if bothEnds:
+ selectClause.append("stop")
+
if not noSense:
selectClause.append("sense")
if withMismatch:
selectClause.append("mismatch")
- selectQuery = string.join(selectClause, " ,")
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
+ selectQuery = string.join(selectClause, ",")
+
+ return selectQuery
+
+
+ def getReadGroupQuery(self, findallOptimize, limit, combine5p):
+ groupBy = []
+ if findallOptimize:
+ groupBy = ["GROUP BY start, sense"]
+
+ if limit > 0 and not combine5p:
+ groupBy.append("LIMIT %d" % limit)
+
+ groupQuery = string.join(groupBy)
+
+ return groupQuery
+
+
+ def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
+ flag="", withWeight=False, withFlag=False, withMismatch=False,
+ withID=False, withChrom=False, withPairID=False, readIDDict=False,
+ splitRead=False, hasMismatch=False, flagLike=False, start=-1,
+ stop=-1, strand=""):
+ """ returns a dictionary of spliced reads in a variety of
+ formats and which can be restricted by chromosome or custom-flag.
+ Returns unique spliced reads for now.
+ """
+ whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
+ selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
+ selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
+ sql = self.getSqlCursor()
stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
sql.execute(stmt)
currentReadID = ""
currentChrom = ""
+ resultsDict = {}
for row in sql:
pairID = 0
readID = row["readID"]
def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
- """ returns the number of row in the uniqs table.
+ """ returns the number of row in the specified table.
"""
whereClause = []
count = 0
else:
whereQuery = ""
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
+ sql = self.getSqlCursor()
if distinct:
sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
""" returns the number of row in the splices table.
"""
+ # TODO: if the rds type is DNA should this just return zero?
return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
""" get readID's.
"""
stmt = []
- limitPart = ""
- if limit > 0:
- limitPart = "LIMIT %d" % limit
-
if uniqs:
stmt.append("select readID from uniqs")
else:
selectPart = ""
- sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
+ limitPart = ""
+ if limit > 0:
+ limitPart = "LIMIT %d" % limit
+ sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
+ sql = self.getSqlCursor()
sql.execute(sqlQuery)
result = sql.fetchall()
print "getting mismatches from chromosome %s" % (achrom)
snpDict[achrom] = []
- hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
if useSplices and self.dataType == "RNA":
spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
spliceIDList = spliceDict.keys()
- for k in spliceIDList:
- spliceEntry = spliceDict[k][0]
+ for spliceID in spliceIDList:
+ spliceEntry = spliceDict[spliceID][0]
startpos = spliceEntry["startL"]
lefthalf = spliceEntry["stopL"]
rightstart = spliceEntry["startR"]
snpDict[achrom].append([startpos, change_at, change_base, change_from])
+ hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
if achrom not in hitDict.keys():
continue
def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
- useSplices=False, normalizationFactor = 1.0, trackStrand=False,
+ useSplices=False, normalizationFactor=1.0, trackStrand=False,
keepStrand="both", shiftValue=0):
"""return a profile of the chromosome as an array of per-base read coverage....
keepStrand = 'both', 'plusOnly', or 'minusOnly'.
dataType = metadata["dataType"]
scale = 1. / normalizationFactor
shift = {}
- shift['+'] = int(shiftValue)
- shift['-'] = -1 * int(shiftValue)
+ shift["+"] = int(shiftValue)
+ shift["-"] = -1 * int(shiftValue)
if cstop > 0:
lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
self.dbcon.commit()
+ def insertMultisplices(self, valuesList):
+ """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
+ into the multisplices table.
+ """
+ self.dbcon.executemany("insert into multisplices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
+ self.dbcon.commit()
+
+
def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
""" update reads on file database in a list region of regions for a chromosome to have a new flag.
regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
row["weight"], row["flag"], row["mismatch"]))
- self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries)
+ self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)