1 import sqlite3 as sqlite
6 from array import array
7 from commoncode import getReverseComplement, getConfigParser, getConfigOption
9 currentRDSVersion = "2.0"
12 class ReadDatasetError(Exception):
17 """ Class for storing reads from experiments. Assumes that custom scripts
18 will translate incoming data into a format that can be inserted into the
19 class using the insert* methods. Default class subtype ('DNA') includes
20 tables for unique and multireads, whereas 'RNA' subtype also includes a
24 def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
25 cache=False, reportCount=True):
26 """ creates an rds datafile if initialize is set to true, otherwise
27 will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
32 self.rdsVersion = currentRDSVersion
33 self.memBacked = False
36 self.cachedDBFile = ""
42 self.cacheDB(datafile)
43 dbFile = self.cachedDBFile
47 self.dbcon = sqlite.connect(dbFile)
48 self.dbcon.row_factory = sqlite.Row
49 self.dbcon.execute("PRAGMA temp_store = MEMORY")
51 if datasetType not in ["DNA", "RNA"]:
52 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
54 self.dataType = datasetType
56 self.initializeTables(self.dbcon)
58 metadata = self.getMetadata("dataType")
59 self.dataType = metadata["dataType"]
62 metadata = self.getMetadata("rdsVersion")
63 self.rdsVersion = metadata["rdsVersion"]
66 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
68 print "could not add rdsVersion - read-only ?"
69 self.rdsVersion = "pre-1.0"
73 print "INITIALIZED dataset %s" % datafile
75 print "dataset %s" % datafile
77 metadata = self.getMetadata()
79 pnameList = metadata.keys()
81 for pname in pnameList:
82 print "\t" + pname + "\t" + metadata[pname]
85 ucount = self.getUniqsCount()
86 mcount = self.getMultiCount()
87 if self.dataType == "DNA" and not initialize:
89 print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
91 print "\n%s unique reads and %s multireads" % (ucount, mcount)
92 elif self.dataType == "RNA" and not initialize:
93 scount = self.getSplicesCount()
95 print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
97 print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
99 print "default cache size is %d pages" % self.getDefaultCacheSize()
107 """ return the number of usable reads in the dataset.
109 total = self.getUniqsCount()
110 total += self.getMultiCount()
112 if self.dataType == "RNA":
113 total += self.getSplicesCount()
121 """ cleanup copy in local cache, if present.
123 if self.cachedDBFile != "":
127 def cacheDB(self, filename):
128 """ copy geneinfoDB to a local cache.
130 configParser = getConfigParser()
131 cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
132 tempfile.tempdir = cisTemp
133 self.cachedDBFile = "%s.db" % tempfile.mktemp()
134 shutil.copyfile(filename, self.cachedDBFile)
137 def saveCacheDB(self, filename):
138 """ copy geneinfoDB to a local cache.
140 shutil.copyfile(self.cachedDBFile, filename)
144 """ delete geneinfoDB from local cache.
147 if self.cachedDBFile != "":
149 os.remove(self.cachedDBFile)
151 print "could not delete %s" % self.cachedDBFile
156 def attachDB(self, filename, dbName):
157 """ attach another database file to the readDataset.
159 stmt = "attach '%s' as %s" % (filename, dbName)
163 def detachDB(self, dbName):
164 """ detach a database file to the readDataset.
166 stmt = "detach %s" % (dbName)
170 def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
171 """ import into current RDS the table (with columns destcolumns,
172 with default all columns) from the database file asname,
173 using the column specification of ascolumns (default all).
175 stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
177 stmt += " where flag = '%s' " % flagged
179 self.executeCommit(stmt)
182 def getTables(self, dbName=""):
183 """ get a list of table names in a particular database file.
186 sql = self.getSqlCursor()
189 dbName = "%s." % dbName
191 stmt = "select name from %ssqlite_master where type='table'" % dbName
193 results = sql.fetchall()
196 resultList.append(row["name"])
201 def getSqlCursor(self):
203 sql = self.getMemCursor()
205 sql = self.getFileCursor()
210 def getMemCursor(self):
211 """ returns a cursor to memory database for low-level (SQL)
214 return self.memcon.cursor()
217 def getFileCursor(self):
218 """ returns a cursor to file database for low-level (SQL)
221 return self.dbcon.cursor()
225 """ return True if the RDS file has at least one index.
227 stmt = "select count(*) from sqlite_master where type='index'"
228 count = int(self.execute(stmt, returnResults=True)[0][0])
233 def initializeTables(self, dbConnection, cache=100000):
234 """ creates table schema in a database connection, which is
235 typically a database file or an in-memory database.
237 dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
238 dbConnection.execute("create table metadata (name varchar, value varchar)")
239 dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
240 positionSchema = "start int, stop int"
241 tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
242 dbConnection.execute("create table uniqs %s" % tableSchema)
243 dbConnection.execute("create table multi %s" % tableSchema)
244 if self.dataType == "RNA":
245 positionSchema = "startL int, stopL int, startR int, stopR int"
246 tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
247 dbConnection.execute("create table splices %s" % tableSchema)
249 dbConnection.commit()
252 def getMetadata(self, valueName=""):
253 """ returns a dictionary of metadata.
259 whereClause = " where name='%s'" % valueName
261 sql = self.getSqlCursor()
263 sql.execute("select name, value from metadata %s" % whereClause)
264 results = sql.fetchall()
267 parameterName = row["name"]
268 parameterValue = row["value"]
269 if parameterName not in resultsDict:
270 resultsDict[parameterName] = parameterValue
275 newName = string.join([parameterName, str(index)], ":")
276 if newName not in resultsDict:
277 resultsDict[newName] = parameterValue
285 def getReadSize(self):
286 """ returns readsize if defined in metadata.
288 metadata = self.getMetadata()
289 if "readsize" not in metadata:
290 raise ReadDatasetError("no readsize parameter defined")
292 readSize = metadata["readsize"]
293 if "import" in readSize:
294 readSize = readSize.split()[0]
296 readSize = int(readSize)
298 raise ReadDatasetError("readsize is negative")
303 def getDefaultCacheSize(self):
304 """ returns the default cache size.
306 return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
309 def getChromosomes(self, table="uniqs", fullChrom=True):
310 """ returns a sorted list of distinct chromosomes in table.
312 statement = "select distinct chrom from %s" % table
313 sql = self.getSqlCursor()
315 sql.execute(statement)
319 if row["chrom"] not in results:
320 results.append(row["chrom"])
322 shortName = row["chrom"][3:]
323 if len(shortName.strip()) > 0 and shortName not in results:
324 results.append(shortName)
331 def getMaxCoordinate(self, chrom, doUniqs=True,
332 doMulti=False, doSplices=False):
333 """ returns the maximum coordinate for reads on a given chromosome.
338 maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
341 spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
342 maxCoord = max(spliceMax, maxCoord)
345 multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
346 maxCoord = max(multiMax, maxCoord)
351 def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
353 sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
354 sql = self.getSqlCursor()
356 sql.execute(sqlStatement)
357 maxCoord = int(sql.fetchall()[0][0])
359 print "couldn't retrieve coordMax for chromosome %s" % chrom
364 def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
365 flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
366 withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
367 readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
368 flagLike=False, strand='', combine5p=False):
369 """ returns a dictionary of reads in a variety of formats
370 and which can be restricted by chromosome or custom-flag.
371 Returns unique reads by default, but can return multireads
372 with doMulti set to True.
375 #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
377 whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
379 selectQuery = "select start, sense, sum(weight)"
381 selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
383 groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
385 stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
387 stmt.append("UNION ALL")
388 stmt.append(selectQuery)
389 stmt.append("from multi")
390 stmt.append(whereQuery)
391 stmt.append(groupQuery)
393 stmt = [selectQuery, "from multi", whereQuery]
397 selectQuery = "select start, sense, weight, chrom"
400 subSelect = [selectQuery, "from uniqs", whereQuery]
402 subSelect.append("union all")
403 subSelect.append(selectQuery)
404 subSelect.append("from multi")
405 subSelect.append(whereQuery)
407 subSelect = [selectQuery, "from multi", whereQuery]
409 sqlStmt = string.join(subSelect)
411 selectQuery = "select start, sense, sum(weight)"
413 stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
414 selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
418 self.memcon.row_factory = None
420 self.dbcon.row_factory = None
422 stmt.append("order by start")
424 stmt.append("order by readID, start")
426 stmt.append("order by chrom, start")
428 sql = self.getSqlCursor()
429 sqlQuery = string.join(stmt)
430 sql.execute(sqlQuery)
434 resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
436 self.memcon.row_factory = sqlite.Row
438 self.dbcon.row_factory = sqlite.Row
444 readID = row["readID"]
448 chrom = row["chrom"][3:]
450 if not readIDDict and chrom != currentChrom:
451 resultsDict[chrom] = []
457 theReadID = readID.split("::")[0]
459 if "/" in theReadID and withPairID:
460 (theReadID, pairID) = readID.split("/")
462 if theReadID != currentReadID:
463 resultsDict[theReadID] = []
464 currentReadID = theReadID
467 newrow = {"start": int(row["start"])}
469 newrow["stop"] = int(row["stop"])
472 newrow["sense"] = row["sense"]
475 newrow["weight"] = float(row["weight"])
478 newrow["flag"] = row["flag"]
481 newrow["mismatch"] = row["mismatch"]
484 newrow["readID"] = readID
487 newrow["chrom"] = chrom
490 newrow["pairID"] = pairID
492 resultsDict[dictKey].append(newrow)
497 def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
506 if chrom != "" and chrom != self.memChrom:
507 whereClause.append("chrom = '%s'" % chrom)
511 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
512 whereClause.append(flagLikeClause)
514 whereClause.append("flag = '%s'" % flag)
517 whereClause.append("%s > %d" % (startText, start))
520 whereClause.append("%s < %d" % (stopText, stop))
522 if len(readLike) > 0:
523 readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
524 whereClause.append(readIDClause)
527 whereClause.append("mismatch != ''")
529 if strand in ["+", "-"]:
530 whereClause.append("sense = '%s'" % strand)
532 if len(whereClause) > 0:
533 whereStatement = string.join(whereClause, " and ")
534 whereQuery = "where %s" % whereStatement
541 def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
543 selectClause = [baseSelect]
545 selectClause.append("stop")
548 selectClause.append("sense")
551 selectClause.append("weight")
554 selectClause.append("flag")
557 selectClause.append("mismatch")
559 selectQuery = string.join(selectClause, ",")
564 def getReadGroupQuery(self, findallOptimize, limit, combine5p):
567 groupBy = ["GROUP BY start, sense"]
569 if limit > 0 and not combine5p:
570 groupBy.append("LIMIT %d" % limit)
572 groupQuery = string.join(groupBy)
577 def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
578 flag="", withWeight=False, withFlag=False, withMismatch=False,
579 withID=False, withChrom=False, withPairID=False, readIDDict=False,
580 splitRead=False, hasMismatch=False, flagLike=False, start=-1,
582 """ returns a dictionary of spliced reads in a variety of
583 formats and which can be restricted by chromosome or custom-flag.
584 Returns unique spliced reads for now.
586 whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
587 selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
588 selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
589 sql = self.getSqlCursor()
591 stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
598 readID = row["readID"]
602 chrom = row["chrom"][3:]
604 if not readIDDict and chrom != currentChrom:
605 resultsDict[chrom] = []
610 (theReadID, pairID) = readID.split("/")
614 if theReadID != currentReadID:
615 resultsDict[theReadID] = []
616 currentReadID = theReadID
619 newrow = {"startL": int(row["startL"])}
620 newrow["stopL"] = int(row["stopL"])
621 newrow["startR"] = int(row["startR"])
622 newrow["stopR"] = int(row["stopR"])
624 newrow["sense"] = row["sense"]
627 newrow["weight"] = float(row["weight"])
630 newrow["flag"] = row["flag"]
633 newrow["mismatch"] = row["mismatch"]
636 newrow["readID"] = readID
639 newrow["chrom"] = chrom
642 newrow["pairID"] = pairID
645 leftDict = newrow.copy()
646 del leftDict["startR"]
647 del leftDict["stopR"]
649 del rightDict["startL"]
650 del rightDict["stopL"]
651 resultsDict[dictKey].append(leftDict)
652 resultsDict[dictKey].append(rightDict)
654 resultsDict[dictKey].append(newrow)
659 def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
660 splices=False, reportCombined=True, sense="both"):
661 """ return read counts for a given region.
667 if sense in ["+", "-"]:
668 restrict = " sense ='%s' " % sense
672 ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
678 mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
684 scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
689 total = ucount + mcount + scount
692 return (ucount, mcount, scount)
695 def getTotalCounts(self, chrom="", rmin="", rmax=""):
696 """ return read counts for a given region.
698 return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
701 def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
702 """ returns the number of row in the specified table.
707 if chrom !="" and chrom != self.memChrom:
708 whereClause = ["chrom='%s'" % chrom]
711 whereClause.append("%s >= %s" % (startField, str(rmin)))
714 whereClause.append("%s <= %s" % (startField, str(rmax)))
717 whereClause.append(restrict)
719 if len(whereClause) > 0:
720 whereStatement = string.join(whereClause, " and ")
721 whereQuery = "where %s" % whereStatement
725 sql = self.getSqlCursor()
728 sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
730 sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
732 result = sql.fetchone()
735 count = int(result[0])
742 def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
743 """ returns the number of row in the splices table.
745 # TODO: if the rds type is DNA should this just return zero?
746 return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
749 def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
750 """ returns the number of distinct readIDs in the uniqs table.
752 return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
755 def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
756 """ returns the total weight of readIDs in the multi table.
758 return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
761 def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
766 stmt.append("select readID from uniqs")
769 stmt.append("select readID from multi")
772 stmt.append("select readID from splices")
775 selectPart = string.join(stmt, " union ")
781 limitPart = "LIMIT %d" % limit
783 sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
784 sql = self.getSqlCursor()
785 sql.execute(sqlQuery)
786 result = sql.fetchall()
789 return [x[0].split("/")[0] for x in result]
791 return [x[0] for x in result]
794 def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
795 """ returns the uniq and spliced mismatches in a dictionary.
797 readlen = self.getReadSize()
799 hitChromList = [mischrom]
801 hitChromList = self.getChromosomes()
805 for achrom in hitChromList:
807 print "getting mismatches from chromosome %s" % (achrom)
810 if useSplices and self.dataType == "RNA":
811 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
812 spliceIDList = spliceDict.keys()
813 for spliceID in spliceIDList:
814 spliceEntry = spliceDict[spliceID][0]
815 startpos = spliceEntry["startL"]
816 lefthalf = spliceEntry["stopL"]
817 rightstart = spliceEntry["startR"]
818 sense = spliceEntry["sense"]
819 mismatches = spliceEntry["mismatch"]
820 spMismatchList = mismatches.split(",")
821 for mismatch in spMismatchList:
825 change_len = len(mismatch)
827 change_from = mismatch[0]
828 change_base = mismatch[change_len-1]
829 change_pos = int(mismatch[1:change_len-1])
831 change_from = getReverseComplement([mismatch[0]])
832 change_base = getReverseComplement([mismatch[change_len-1]])
833 change_pos = readlen - int(mismatch[1:change_len-1]) + 1
835 firsthalf = int(lefthalf)-int(startpos)+1
837 if int(change_pos) <= int(firsthalf):
838 change_at = startpos + change_pos - 1
840 secondhalf = change_pos - firsthalf
841 change_at = rightstart + secondhalf
843 snpDict[achrom].append([startpos, change_at, change_base, change_from])
845 hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
846 if achrom not in hitDict.keys():
849 for readEntry in hitDict[achrom]:
850 start = readEntry["start"]
851 sense = readEntry["sense"]
852 mismatches = readEntry["mismatch"]
853 mismatchList = mismatches.split(",")
854 for mismatch in mismatchList:
858 change_len = len(mismatch)
860 change_from = mismatch[0]
861 change_base = mismatch[change_len-1]
862 change_pos = int(mismatch[1:change_len-1])
864 change_from = getReverseComplement([mismatch[0]])
865 change_base = getReverseComplement([mismatch[change_len-1]])
866 change_pos = readlen - int(mismatch[1:change_len-1]) + 1
868 change_at = start + change_pos - 1
869 snpDict[achrom].append([start, change_at, change_base, change_from])
874 def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
875 useSplices=False, normalizationFactor=1.0, trackStrand=False,
876 keepStrand="both", shiftValue=0):
877 """return a profile of the chromosome as an array of per-base read coverage....
878 keepStrand = 'both', 'plusOnly', or 'minusOnly'.
879 Will also shift position of unique and multireads (but not splices) if shift is a natural number
881 metadata = self.getMetadata()
883 readlen = int(metadata["readsize"])
887 dataType = metadata["dataType"]
888 scale = 1. / normalizationFactor
890 shift["+"] = int(shiftValue)
891 shift["-"] = -1 * int(shiftValue)
894 lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
896 lastNT = cstop - cstart + readlen + shift["+"]
898 chromModel = array("f",[0.] * lastNT)
899 hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
903 for readEntry in hitDict[chromosome]:
904 hstart = readEntry["start"]
905 sense = readEntry ["sense"]
906 weight = readEntry["weight"]
907 hstart = hstart - cstart + shift[sense]
908 for currentpos in range(hstart,hstart+readlen):
910 if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
911 chromModel[currentpos] += scale * weight
912 elif sense == "-" and keepStrand != "plusOnly":
913 chromModel[currentpos] -= scale * weight
918 if useSplices and dataType == "RNA":
920 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
922 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
924 if chromosome in spliceDict:
925 for spliceEntry in spliceDict[chromosome]:
926 Lstart = spliceEntry["startL"]
927 Lstop = spliceEntry["stopL"]
928 Rstart = spliceEntry["startR"]
929 Rstop = spliceEntry["stopR"]
930 rsense = spliceEntry["sense"]
931 if (Rstop - cstart) < lastNT:
932 for index in range(abs(Lstop - Lstart)):
933 currentpos = Lstart - cstart + index
934 # we only track unique splices
935 if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
936 chromModel[currentpos] += scale
937 elif rsense == "-" and keepStrand != "plusOnly":
938 chromModel[currentpos] -= scale
940 for index in range(abs(Rstop - Rstart)):
941 currentpos = Rstart - cstart + index
942 # we only track unique splices
943 if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
944 chromModel[currentpos] += scale
945 elif rsense == "-" and keepStrand != "plusOnly":
946 chromModel[currentpos] -= scale
953 def insertMetadata(self, valuesList):
954 """ inserts a list of (pname, pvalue) into the metadata
957 self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
961 def updateMetadata(self, pname, newValue, originalValue=""):
962 """ update a metadata field given the original value and the new value.
964 stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
965 if originalValue != "":
966 stmt += " and value='%s' " % str(originalValue)
968 self.dbcon.execute(stmt)
972 def insertUniqs(self, valuesList):
973 """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
974 into the uniqs table.
976 self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
980 def insertMulti(self, valuesList):
981 """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
982 into the multi table.
984 self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
988 def insertSplices(self, valuesList):
989 """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
990 into the splices table.
992 self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
996 def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
997 """ update reads on file database in a list region of regions for a chromosome to have a new flag.
998 regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
999 sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
1003 restrict = " and sense = ? "
1006 self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1009 self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1011 if self.dataType == "RNA" and splices:
1012 self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
1013 self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
1018 def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1019 """ set the flag fields in the entire dataset.
1022 self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1025 self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1027 if self.dataType == "RNA" and splices:
1028 self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
1033 def resetFlags(self, uniqs=True, multi=True, splices=True):
1034 """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
1036 self.setFlags("", uniqs, multi, splices)
1039 def reweighMultireads(self, readList):
1040 self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
1043 def setSynchronousPragma(self, value="ON"):
1045 self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1047 print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
1050 def setDBcache(self, cache, default=False):
1051 self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
1053 self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
1056 def execute(self, statement, returnResults=False):
1057 sql = self.getSqlCursor()
1059 sql.execute(statement)
1061 result = sql.fetchall()
1065 def executeCommit(self, statement):
1066 self.execute(statement)
1069 self.memcon.commit()
1074 def buildIndex(self, cache=100000):
1075 """ Builds the file indeces for the main tables.
1076 Cache is the number of 1.5 kb pages to keep in memory.
1077 100000 pages translates into 150MB of RAM, which is our default.
1079 if cache > self.getDefaultCacheSize():
1080 self.setDBcache(cache)
1081 self.setSynchronousPragma("OFF")
1082 self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1083 print "built uPosIndex"
1084 self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
1085 print "built uChromIndex"
1086 self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1087 print "built mPosIndex"
1088 self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
1089 print "built mChromIndex"
1091 if self.dataType == "RNA":
1092 self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
1093 print "built sPosIndex"
1094 self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
1095 print "built sPosIndex2"
1096 self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
1097 print "built sChromIndex"
1100 self.setSynchronousPragma("ON")
1103 def dropIndex(self):
1104 """ drops the file indices for the main tables.
1107 self.setSynchronousPragma("OFF")
1108 self.dbcon.execute("DROP INDEX uPosIndex")
1109 self.dbcon.execute("DROP INDEX uChromIndex")
1110 self.dbcon.execute("DROP INDEX mPosIndex")
1111 self.dbcon.execute("DROP INDEX mChromIndex")
1113 if self.dataType == "RNA":
1114 self.dbcon.execute("DROP INDEX sPosIndex")
1116 self.dbcon.execute("DROP INDEX sPosIndex2")
1120 self.dbcon.execute("DROP INDEX sChromIndex")
1124 print "problem dropping index"
1126 self.setSynchronousPragma("ON")
1129 def memSync(self, chrom="", index=False):
1130 """ makes a copy of the dataset into memory for faster access.
1131 Can be restricted to a "full" chromosome. Can also build the
1135 self.memcon = sqlite.connect(":memory:")
1136 self.initializeTables(self.memcon)
1137 cursor = self.dbcon.cursor()
1140 print "memSync %s" % chrom
1141 whereclause = " where chrom = '%s' " % chrom
1142 self.memChrom = chrom
1146 self.memcon.execute("PRAGMA temp_store = MEMORY")
1147 self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
1148 # copy metadata to memory
1149 self.memcon.execute("delete from metadata")
1150 results = cursor.execute("select name, value from metadata")
1153 results2.append((row["name"], row["value"]))
1155 self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
1157 self.copyDBEntriesToMemory("uniqs", whereclause)
1158 self.copyDBEntriesToMemory("multi", whereclause)
1159 if self.dataType == "RNA":
1160 self.copySpliceDBEntriesToMemory(whereclause)
1164 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
1165 self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
1166 if self.dataType == "RNA":
1167 self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
1168 self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
1170 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1171 self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1172 if self.dataType == "RNA":
1173 self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
1174 self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
1176 self.memBacked = True
1177 self.memcon.row_factory = sqlite.Row
1178 self.memcon.commit()
1181 def copyDBEntriesToMemory(self, dbName, whereClause=""):
1182 cursor = self.dbcon.cursor()
1183 sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
1184 destinationEntries = []
1185 for row in sourceEntries:
1186 destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
1188 self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
1191 def copySpliceDBEntriesToMemory(self, whereClause=""):
1192 cursor = self.dbcon.cursor()
1193 sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
1194 destinationEntries = []
1195 for row in sourceEntries:
1196 destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
1197 row["weight"], row["flag"], row["mismatch"]))
1199 self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)