X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=ReadDataset.py;h=71544ca2420d2489fca13452976d930a6c55a9c5;hp=5ff60e2e6954c862888c5807495ef49142e14fa3;hb=47bd897210cb85e042f11d7400b46d94400cc428;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/ReadDataset.py b/ReadDataset.py index 5ff60e2..71544ca 100644 --- a/ReadDataset.py +++ b/ReadDataset.py @@ -6,7 +6,7 @@ import os from array import array from commoncode import getReverseComplement, getConfigParser, getConfigOption -currentRDSVersion = "2.0" +currentRDSVersion = "2.1" class ReadDatasetError(Exception): @@ -35,6 +35,9 @@ class ReadDataset(): self.memCursor = "" self.cachedDBFile = "" + if initialize and datasetType not in ["DNA", "RNA"]: + raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'") + if cache: if verbose: print "caching ...." @@ -48,11 +51,7 @@ class ReadDataset(): self.dbcon.row_factory = sqlite.Row self.dbcon.execute("PRAGMA temp_store = MEMORY") if initialize: - if datasetType not in ["DNA", "RNA"]: - raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'") - else: - self.dataType = datasetType - + self.dataType = datasetType self.initializeTables(self.dbcon) else: metadata = self.getMetadata("dataType") @@ -69,38 +68,7 @@ class ReadDataset(): self.rdsVersion = "pre-1.0" if verbose: - if initialize: - print "INITIALIZED dataset %s" % datafile - else: - print "dataset %s" % datafile - - metadata = self.getMetadata() - print "metadata:" - pnameList = metadata.keys() - pnameList.sort() - for pname in pnameList: - print "\t" + pname + "\t" + metadata[pname] - - if reportCount: - ucount = self.getUniqsCount() - mcount = self.getMultiCount() - if self.dataType == "DNA" and not initialize: - try: - print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount)) - except ValueError: - print "\n%s unique reads and %s multireads" % (ucount, mcount) - elif self.dataType == "RNA" and not initialize: - scount = self.getSplicesCount() - try: - print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount)) - except ValueError: - print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount) - - print "default cache size is %d pages" % self.getDefaultCacheSize() - if self.hasIndex(): - print "found index" - else: - print "not indexed" + self.printRDSInfo(datafile, reportCount, initialize) def __len__(self): @@ -124,6 +92,39 @@ class ReadDataset(): self.uncacheDB() + def printRDSInfo(self, datafile, reportCount, initialize): + if initialize: + print "INITIALIZED dataset %s" % datafile + else: + print "dataset %s" % datafile + + metadata = self.getMetadata() + print "metadata:" + pnameList = metadata.keys() + pnameList.sort() + for pname in pnameList: + print "\t" + pname + "\t" + metadata[pname] + + if reportCount and not initialize: + self.printReadCounts() + + print "default cache size is %d pages" % self.getDefaultCacheSize() + if self.hasIndex(): + print "found index" + else: + print "not indexed" + + + def printReadCounts(self): + ucount = self.getUniqsCount() + mcount = self.getMultiCount() + if self.dataType == "DNA": + print "\n%d unique reads and %d multireads" % (ucount, mcount) + elif self.dataType == "RNA": + scount = self.getSplicesCount() + print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount) + + def cacheDB(self, filename): """ copy geneinfoDB to a local cache. """ @@ -153,42 +154,42 @@ class ReadDataset(): self.cachedDB = "" - def attachDB(self, filename, asname): + def attachDB(self, filename, dbName): """ attach another database file to the readDataset. """ - stmt = "attach '%s' as %s" % (filename, asname) + stmt = "attach '%s' as %s" % (filename, dbName) self.execute(stmt) - def detachDB(self, asname): + def detachDB(self, dbName): """ detach a database file to the readDataset. """ - stmt = "detach %s" % (asname) + stmt = "detach %s" % (dbName) self.execute(stmt) - def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""): + def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""): """ import into current RDS the table (with columns destcolumns, with default all columns) from the database file asname, using the column specification of ascolumns (default all). """ - stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table) + stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table) if flagged != "": stmt += " where flag = '%s' " % flagged self.executeCommit(stmt) - def getTables(self, asname=""): + def getTables(self, dbName=""): """ get a list of table names in a particular database file. """ resultList = [] sql = self.getSqlCursor() - if asname != "": - asname += "." + if dbName != "": + dbName = "%s." % dbName - stmt = "select name from %ssqlite_master where type='table'" % asname + stmt = "select name from %ssqlite_master where type='table'" % dbName sql.execute(stmt) results = sql.fetchall() @@ -207,15 +208,27 @@ class ReadDataset(): return sql + def getMemCursor(self): + """ returns a cursor to memory database for low-level (SQL) + access to the data. + """ + return self.memcon.cursor() + + + def getFileCursor(self): + """ returns a cursor to file database for low-level (SQL) + access to the data. + """ + return self.dbcon.cursor() + + def hasIndex(self): - """ check whether the RDS file has at least one index. + """ return True if the RDS file has at least one index. """ stmt = "select count(*) from sqlite_master where type='index'" count = int(self.execute(stmt, returnResults=True)[0][0]) - if count > 0: - return True - return False + return count > 0 def initializeTables(self, dbConnection, cache=100000): @@ -234,21 +247,11 @@ class ReadDataset(): tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema dbConnection.execute("create table splices %s" % tableSchema) - dbConnection.commit() - - - def getFileCursor(self): - """ returns a cursor to file database for low-level (SQL) - access to the data. - """ - return self.dbcon.cursor() - + positionSchema = "startL int, stopL int, startR int, stopR int" + tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema + dbConnection.execute("create table multisplices %s" % tableSchema) - def getMemCursor(self): - """ returns a cursor to memory database for low-level (SQL) - access to the data. - """ - return self.memcon.cursor() + dbConnection.commit() def getMetadata(self, valueName=""): @@ -291,11 +294,15 @@ class ReadDataset(): if "readsize" not in metadata: raise ReadDatasetError("no readsize parameter defined") else: - mysize = metadata["readsize"] - if "import" in mysize: - mysize = mysize.split()[0] + readSize = metadata["readsize"] + if "import" in readSize: + readSize = readSize.split()[0] + + readSize = int(readSize) + if readSize < 0: + raise ReadDatasetError("readsize is negative") - return int(mysize) + return readSize def getDefaultCacheSize(self): @@ -305,7 +312,7 @@ class ReadDataset(): def getChromosomes(self, table="uniqs", fullChrom=True): - """ returns a list of distinct chromosomes in table. + """ returns a sorted list of distinct chromosomes in table. """ statement = "select distinct chrom from %s" % table sql = self.getSqlCursor() @@ -317,51 +324,44 @@ class ReadDataset(): if row["chrom"] not in results: results.append(row["chrom"]) else: - if len(row["chrom"][3:].strip()) < 1: - continue - - if row["chrom"][3:] not in results: - results.append(row["chrom"][3:]) + shortName = row["chrom"][3:] + if len(shortName.strip()) > 0 and shortName not in results: + results.append(shortName) results.sort() return results - def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True, + def getMaxCoordinate(self, chrom, doUniqs=True, doMulti=False, doSplices=False): """ returns the maximum coordinate for reads on a given chromosome. """ maxCoord = 0 - sql = self.getSqlCursor() if doUniqs: - try: - sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom) - maxCoord = int(sql.fetchall()[0][0]) - except: - print "couldn't retrieve coordMax for chromosome %s" % chrom + maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs") if doSplices: - sql.execute("select max(startR) from splices where chrom = '%s'" % chrom) - try: - spliceMax = int(sql.fetchall()[0][0]) - if spliceMax > maxCoord: - maxCoord = spliceMax - except: - pass + spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR") + maxCoord = max(spliceMax, maxCoord) if doMulti: - sql.execute("select max(start) from multi where chrom = '%s'" % chrom) - try: - multiMax = int(sql.fetchall()[0][0]) - if multiMax > maxCoord: - maxCoord = multiMax - except: - pass + multiMax = self.getMaxStartCoordinateInTable(chrom, "multi") + maxCoord = max(multiMax, maxCoord) - if verbose: - print "%s maxCoord: %d" % (chrom, maxCoord) + return maxCoord + + + def getMaxStartCoordinateInTable(self, chrom, table, startField="start"): + maxCoord = 0 + sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom) + sql = self.getSqlCursor() + try: + sql.execute(sqlStatement) + maxCoord = int(sql.fetchall()[0][0]) + except: + print "couldn't retrieve coordMax for chromosome %s" % chrom return maxCoord @@ -375,70 +375,17 @@ class ReadDataset(): and which can be restricted by chromosome or custom-flag. Returns unique reads by default, but can return multireads with doMulti set to True. - - Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict - """ - whereClause = [] - resultsDict = {} - - if chrom != "" and chrom != self.memChrom: - whereClause.append("chrom = '%s'" % chrom) - - if flag != "": - if flagLike: - flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "") - whereClause.append(flagLikeClause) - else: - whereClause.append("flag = '%s'" % flag) - - if start > -1: - whereClause.append("start > %d" % start) - if stop > -1: - whereClause.append("stop < %d" % stop) - - if len(readLike) > 0: - readIDClause = string.join(["readID LIKE '", readLike, "%'"], "") - whereClause.append(readIDClause) - - if hasMismatch: - whereClause.append("mismatch != ''") - - if strand in ["+", "-"]: - whereClause.append("sense = '%s'" % strand) - - if len(whereClause) > 0: - whereStatement = string.join(whereClause, " and ") - whereQuery = "where %s" % whereStatement - else: - whereQuery = "" + """ + #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict - groupBy = [] + whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike) if findallOptimize: - selectClause = ["select start, sense, sum(weight)"] - groupBy = ["GROUP BY start, sense"] + selectQuery = "select start, sense, sum(weight)" else: - selectClause = ["select ID, chrom, start, readID"] - if bothEnds: - selectClause.append("stop") - - if not noSense: - selectClause.append("sense") + selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds) - if withWeight: - selectClause.append("weight") - - if withFlag: - selectClause.append("flag") - - if withMismatch: - selectClause.append("mismatch") - - if limit > 0 and not combine5p: - groupBy.append("LIMIT %d" % limit) - - selectQuery = string.join(selectClause, ",") - groupQuery = string.join(groupBy) + groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p) if doUniqs: stmt = [selectQuery, "from uniqs", whereQuery, groupQuery] if doMulti: @@ -474,30 +421,20 @@ class ReadDataset(): if findallOptimize: if self.memBacked: self.memcon.row_factory = None - sql = self.memcon.cursor() else: self.dbcon.row_factory = None - sql = self.dbcon.cursor() stmt.append("order by start") elif readIDDict: - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - stmt.append("order by readID, start") else: - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - stmt.append("order by chrom, start") + sql = self.getSqlCursor() sqlQuery = string.join(stmt) sql.execute(sqlQuery) + resultsDict = {} if findallOptimize: resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql] if self.memBacked: @@ -562,20 +499,17 @@ class ReadDataset(): return resultsDict - def getSplicesDict(self, noSense=False, fullChrom=False, chrom="", - flag="", withWeight=False, withFlag=False, withMismatch=False, - withID=False, withChrom=False, withPairID=False, readIDDict=False, - splitRead=False, hasMismatch=False, flagLike=False, start=-1, - stop=-1, strand=""): - """ returns a dictionary of spliced reads in a variety of - formats and which can be restricted by chromosome or custom-flag. - Returns unique spliced reads for now. - """ - whereClause = [] - resultsDict = {} + def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False): + if splice: + startText = "startL" + stopText = "stopR" + else: + startText = "start" + stopText = "stop" + whereClause = [] if chrom != "" and chrom != self.memChrom: - whereClause = ["chrom = '%s'" % chrom] + whereClause.append("chrom = '%s'" % chrom) if flag != "": if flagLike: @@ -584,25 +518,37 @@ class ReadDataset(): else: whereClause.append("flag = '%s'" % flag) + if start > -1: + whereClause.append("%s > %d" % (startText, start)) + + if stop > -1: + whereClause.append("%s < %d" % (stopText, stop)) + + if len(readLike) > 0: + readIDClause = string.join(["readID LIKE '", readLike, "%'"], "") + whereClause.append(readIDClause) + if hasMismatch: whereClause.append("mismatch != ''") - if strand != "": + if strand in ["+", "-"]: whereClause.append("sense = '%s'" % strand) - if start > -1: - whereClause.append("startL > %d" % start) - - if stop > -1: - whereClause.append("stopR < %d" % stop) - if len(whereClause) > 0: whereStatement = string.join(whereClause, " and ") whereQuery = "where %s" % whereStatement else: whereQuery = "" - selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"] + return whereQuery + + + def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False): + + selectClause = [baseSelect] + if bothEnds: + selectClause.append("stop") + if not noSense: selectClause.append("sense") @@ -615,16 +561,43 @@ class ReadDataset(): if withMismatch: selectClause.append("mismatch") - selectQuery = string.join(selectClause, " ,") - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() + selectQuery = string.join(selectClause, ",") + + return selectQuery + + + def getReadGroupQuery(self, findallOptimize, limit, combine5p): + groupBy = [] + if findallOptimize: + groupBy = ["GROUP BY start, sense"] + + if limit > 0 and not combine5p: + groupBy.append("LIMIT %d" % limit) + + groupQuery = string.join(groupBy) + + return groupQuery + + + def getSplicesDict(self, noSense=False, fullChrom=False, chrom="", + flag="", withWeight=False, withFlag=False, withMismatch=False, + withID=False, withChrom=False, withPairID=False, readIDDict=False, + splitRead=False, hasMismatch=False, flagLike=False, start=-1, + stop=-1, strand=""): + """ returns a dictionary of spliced reads in a variety of + formats and which can be restricted by chromosome or custom-flag. + Returns unique spliced reads for now. + """ + whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True) + selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID" + selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch) + sql = self.getSqlCursor() stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery) sql.execute(stmt) currentReadID = "" currentChrom = "" + resultsDict = {} for row in sql: pairID = 0 readID = row["readID"] @@ -731,7 +704,7 @@ class ReadDataset(): def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"): - """ returns the number of row in the uniqs table. + """ returns the number of row in the specified table. """ whereClause = [] count = 0 @@ -754,10 +727,7 @@ class ReadDataset(): else: whereQuery = "" - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() + sql = self.getSqlCursor() if distinct: sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery)) @@ -777,6 +747,7 @@ class ReadDataset(): def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): """ returns the number of row in the splices table. """ + # TODO: if the rds type is DNA should this just return zero? return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL") @@ -796,10 +767,6 @@ class ReadDataset(): """ get readID's. """ stmt = [] - limitPart = "" - if limit > 0: - limitPart = "LIMIT %d" % limit - if uniqs: stmt.append("select readID from uniqs") @@ -814,12 +781,12 @@ class ReadDataset(): else: selectPart = "" - sqlQuery = "%s group by readID %s" % (selectPart, limitPart) - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() + limitPart = "" + if limit > 0: + limitPart = "LIMIT %d" % limit + sqlQuery = "%s group by readID %s" % (selectPart, limitPart) + sql = self.getSqlCursor() sql.execute(sqlQuery) result = sql.fetchall() @@ -845,12 +812,11 @@ class ReadDataset(): print "getting mismatches from chromosome %s" % (achrom) snpDict[achrom] = [] - hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True) if useSplices and self.dataType == "RNA": spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True) spliceIDList = spliceDict.keys() - for k in spliceIDList: - spliceEntry = spliceDict[k][0] + for spliceID in spliceIDList: + spliceEntry = spliceDict[spliceID][0] startpos = spliceEntry["startL"] lefthalf = spliceEntry["stopL"] rightstart = spliceEntry["startR"] @@ -881,6 +847,7 @@ class ReadDataset(): snpDict[achrom].append([startpos, change_at, change_base, change_from]) + hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True) if achrom not in hitDict.keys(): continue @@ -910,7 +877,7 @@ class ReadDataset(): def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True, - useSplices=False, normalizationFactor = 1.0, trackStrand=False, + useSplices=False, normalizationFactor=1.0, trackStrand=False, keepStrand="both", shiftValue=0): """return a profile of the chromosome as an array of per-base read coverage.... keepStrand = 'both', 'plusOnly', or 'minusOnly'. @@ -925,8 +892,8 @@ class ReadDataset(): dataType = metadata["dataType"] scale = 1. / normalizationFactor shift = {} - shift['+'] = int(shiftValue) - shift['-'] = -1 * int(shiftValue) + shift["+"] = int(shiftValue) + shift["-"] = -1 * int(shiftValue) if cstop > 0: lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen @@ -1031,6 +998,14 @@ class ReadDataset(): self.dbcon.commit() + def insertMultisplices(self, valuesList): + """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) + into the multisplices table. + """ + self.dbcon.executemany("insert into multisplices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList) + self.dbcon.commit() + + def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"): """ update reads on file database in a list region of regions for a chromosome to have a new flag. regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with @@ -1234,5 +1209,5 @@ class ReadDataset(): destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"], row["weight"], row["flag"], row["mismatch"])) - self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries) + self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)