from array import array
from commoncode import getReverseComplement, getConfigParser, getConfigOption
-currentRDSVersion = "2.0"
+currentRDSVersion = "2.1"
class ReadDatasetError(Exception):
self.memCursor = ""
self.cachedDBFile = ""
+ if initialize and datasetType not in ["DNA", "RNA"]:
+ raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
+
if cache:
if verbose:
print "caching ...."
self.dbcon.row_factory = sqlite.Row
self.dbcon.execute("PRAGMA temp_store = MEMORY")
if initialize:
- if datasetType not in ["DNA", "RNA"]:
- raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
- else:
- self.dataType = datasetType
-
+ self.dataType = datasetType
self.initializeTables(self.dbcon)
else:
metadata = self.getMetadata("dataType")
self.rdsVersion = "pre-1.0"
if verbose:
- if initialize:
- print "INITIALIZED dataset %s" % datafile
- else:
- print "dataset %s" % datafile
-
- metadata = self.getMetadata()
- print "metadata:"
- pnameList = metadata.keys()
- pnameList.sort()
- for pname in pnameList:
- print "\t" + pname + "\t" + metadata[pname]
-
- if reportCount:
- ucount = self.getUniqsCount()
- mcount = self.getMultiCount()
- if self.dataType == "DNA" and not initialize:
- try:
- print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
- except ValueError:
- print "\n%s unique reads and %s multireads" % (ucount, mcount)
- elif self.dataType == "RNA" and not initialize:
- scount = self.getSplicesCount()
- try:
- print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
- except ValueError:
- print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
-
- print "default cache size is %d pages" % self.getDefaultCacheSize()
- if self.hasIndex():
- print "found index"
- else:
- print "not indexed"
+ self.printRDSInfo(datafile, reportCount, initialize)
def __len__(self):
self.uncacheDB()
+ def printRDSInfo(self, datafile, reportCount, initialize):
+ if initialize:
+ print "INITIALIZED dataset %s" % datafile
+ else:
+ print "dataset %s" % datafile
+
+ metadata = self.getMetadata()
+ print "metadata:"
+ pnameList = metadata.keys()
+ pnameList.sort()
+ for pname in pnameList:
+ print "\t" + pname + "\t" + metadata[pname]
+
+ if reportCount and not initialize:
+ self.printReadCounts()
+
+ print "default cache size is %d pages" % self.getDefaultCacheSize()
+ if self.hasIndex():
+ print "found index"
+ else:
+ print "not indexed"
+
+
+ def printReadCounts(self):
+ ucount = self.getUniqsCount()
+ mcount = self.getMultiCount()
+ if self.dataType == "DNA":
+ print "\n%d unique reads and %d multireads" % (ucount, mcount)
+ elif self.dataType == "RNA":
+ scount = self.getSplicesCount()
+ print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount)
+
+
def cacheDB(self, filename):
""" copy geneinfoDB to a local cache.
"""
return sql
+ def getMemCursor(self):
+ """ returns a cursor to memory database for low-level (SQL)
+ access to the data.
+ """
+ return self.memcon.cursor()
+
+
+ def getFileCursor(self):
+ """ returns a cursor to file database for low-level (SQL)
+ access to the data.
+ """
+ return self.dbcon.cursor()
+
+
def hasIndex(self):
- """ check whether the RDS file has at least one index.
+ """ return True if the RDS file has at least one index.
"""
stmt = "select count(*) from sqlite_master where type='index'"
count = int(self.execute(stmt, returnResults=True)[0][0])
- if count > 0:
- return True
- return False
+ return count > 0
def initializeTables(self, dbConnection, cache=100000):
tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
dbConnection.execute("create table splices %s" % tableSchema)
- dbConnection.commit()
-
-
- def getFileCursor(self):
- """ returns a cursor to file database for low-level (SQL)
- access to the data.
- """
- return self.dbcon.cursor()
-
+ positionSchema = "startL int, stopL int, startR int, stopR int"
+ tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
+ dbConnection.execute("create table multisplices %s" % tableSchema)
- def getMemCursor(self):
- """ returns a cursor to memory database for low-level (SQL)
- access to the data.
- """
- return self.memcon.cursor()
+ dbConnection.commit()
def getMetadata(self, valueName=""):
def getChromosomes(self, table="uniqs", fullChrom=True):
- """ returns a list of distinct chromosomes in table.
+ """ returns a sorted list of distinct chromosomes in table.
"""
statement = "select distinct chrom from %s" % table
sql = self.getSqlCursor()
return results
- def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
+ def getMaxCoordinate(self, chrom, doUniqs=True,
doMulti=False, doSplices=False):
""" returns the maximum coordinate for reads on a given chromosome.
"""
multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
maxCoord = max(multiMax, maxCoord)
- if verbose:
- print "%s maxCoord: %d" % (chrom, maxCoord)
-
return maxCoord
and which can be restricted by chromosome or custom-flag.
Returns unique reads by default, but can return multireads
with doMulti set to True.
-
- Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
+
"""
+ #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
if findallOptimize:
if findallOptimize:
if self.memBacked:
self.memcon.row_factory = None
- sql = self.memcon.cursor()
else:
self.dbcon.row_factory = None
- sql = self.dbcon.cursor()
stmt.append("order by start")
elif readIDDict:
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
stmt.append("order by readID, start")
else:
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
stmt.append("order by chrom, start")
+ sql = self.getSqlCursor()
sqlQuery = string.join(stmt)
sql.execute(sqlQuery)
whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
+ sql = self.getSqlCursor()
stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
sql.execute(stmt)
def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
- """ returns the number of row in the uniqs table.
+ """ returns the number of row in the specified table.
"""
whereClause = []
count = 0
else:
whereQuery = ""
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
+ sql = self.getSqlCursor()
if distinct:
sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
""" returns the number of row in the splices table.
"""
+ # TODO: if the rds type is DNA should this just return zero?
return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
limitPart = "LIMIT %d" % limit
sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
- if self.memBacked:
- sql = self.memcon.cursor()
- else:
- sql = self.dbcon.cursor()
-
+ sql = self.getSqlCursor()
sql.execute(sqlQuery)
result = sql.fetchall()
self.dbcon.commit()
+ def insertMultisplices(self, valuesList):
+ """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
+ into the multisplices table.
+ """
+ self.dbcon.executemany("insert into multisplices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
+ self.dbcon.commit()
+
+
def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
""" update reads on file database in a list region of regions for a chromosome to have a new flag.
regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
row["weight"], row["flag"], row["mismatch"]))
- self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries)
+ self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)