1 import sqlite3 as sqlite
6 from array import array
7 from commoncode import getReverseComplement, getConfigParser, getConfigOption
9 currentRDSVersion = "2.1"
12 class ReadDatasetError(Exception):
17 """ Class for storing reads from experiments. Assumes that custom scripts
18 will translate incoming data into a format that can be inserted into the
19 class using the insert* methods. Default class subtype ('DNA') includes
20 tables for unique and multireads, whereas 'RNA' subtype also includes a
24 def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
25 cache=False, reportCount=True):
26 """ creates an rds datafile if initialize is set to true, otherwise
27 will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
32 self.rdsVersion = currentRDSVersion
33 self.memBacked = False
36 self.cachedDBFile = ""
38 if initialize and datasetType not in ["DNA", "RNA"]:
39 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
45 self.cacheDB(datafile)
46 dbFile = self.cachedDBFile
50 self.dbcon = sqlite.connect(dbFile)
51 self.dbcon.row_factory = sqlite.Row
52 self.dbcon.execute("PRAGMA temp_store = MEMORY")
54 self.dataType = datasetType
55 self.initializeTables(self.dbcon)
57 metadata = self.getMetadata("dataType")
58 self.dataType = metadata["dataType"]
61 metadata = self.getMetadata("rdsVersion")
62 self.rdsVersion = metadata["rdsVersion"]
65 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
67 print "could not add rdsVersion - read-only ?"
68 self.rdsVersion = "pre-1.0"
71 self.printRDSInfo(datafile, reportCount, initialize)
75 """ return the number of usable reads in the dataset.
77 total = self.getUniqsCount()
78 total += self.getMultiCount()
80 if self.dataType == "RNA":
81 total += self.getSplicesCount()
89 """ cleanup copy in local cache, if present.
91 if self.cachedDBFile != "":
95 def printRDSInfo(self, datafile, reportCount, initialize):
97 print "INITIALIZED dataset %s" % datafile
99 print "dataset %s" % datafile
101 metadata = self.getMetadata()
103 pnameList = metadata.keys()
105 for pname in pnameList:
106 print "\t" + pname + "\t" + metadata[pname]
108 if reportCount and not initialize:
109 self.printReadCounts()
111 print "default cache size is %d pages" % self.getDefaultCacheSize()
118 def printReadCounts(self):
119 ucount = self.getUniqsCount()
120 mcount = self.getMultiCount()
121 if self.dataType == "DNA":
122 print "\n%d unique reads and %d multireads" % (ucount, mcount)
123 elif self.dataType == "RNA":
124 scount = self.getSplicesCount()
125 print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount)
128 def cacheDB(self, filename):
129 """ copy geneinfoDB to a local cache.
131 configParser = getConfigParser()
132 cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
133 tempfile.tempdir = cisTemp
134 self.cachedDBFile = "%s.db" % tempfile.mktemp()
135 shutil.copyfile(filename, self.cachedDBFile)
138 def saveCacheDB(self, filename):
139 """ copy geneinfoDB to a local cache.
141 shutil.copyfile(self.cachedDBFile, filename)
145 """ delete geneinfoDB from local cache.
148 if self.cachedDBFile != "":
150 os.remove(self.cachedDBFile)
152 print "could not delete %s" % self.cachedDBFile
157 def attachDB(self, filename, dbName):
158 """ attach another database file to the readDataset.
160 stmt = "attach '%s' as %s" % (filename, dbName)
164 def detachDB(self, dbName):
165 """ detach a database file to the readDataset.
167 stmt = "detach %s" % (dbName)
171 def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
172 """ import into current RDS the table (with columns destcolumns,
173 with default all columns) from the database file asname,
174 using the column specification of ascolumns (default all).
176 stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
178 stmt += " where flag = '%s' " % flagged
180 self.executeCommit(stmt)
183 def getTables(self, dbName=""):
184 """ get a list of table names in a particular database file.
187 sql = self.getSqlCursor()
190 dbName = "%s." % dbName
192 stmt = "select name from %ssqlite_master where type='table'" % dbName
194 results = sql.fetchall()
197 resultList.append(row["name"])
202 def getSqlCursor(self):
204 sql = self.getMemCursor()
206 sql = self.getFileCursor()
211 def getMemCursor(self):
212 """ returns a cursor to memory database for low-level (SQL)
215 return self.memcon.cursor()
218 def getFileCursor(self):
219 """ returns a cursor to file database for low-level (SQL)
222 return self.dbcon.cursor()
226 """ return True if the RDS file has at least one index.
228 stmt = "select count(*) from sqlite_master where type='index'"
229 count = int(self.execute(stmt, returnResults=True)[0][0])
234 def initializeTables(self, dbConnection, cache=100000):
235 """ creates table schema in a database connection, which is
236 typically a database file or an in-memory database.
238 dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
239 dbConnection.execute("create table metadata (name varchar, value varchar)")
240 dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
241 positionSchema = "start int, stop int"
242 tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
243 dbConnection.execute("create table uniqs %s" % tableSchema)
244 dbConnection.execute("create table multi %s" % tableSchema)
245 if self.dataType == "RNA":
246 positionSchema = "startL int, stopL int, startR int, stopR int"
247 tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
248 dbConnection.execute("create table splices %s" % tableSchema)
250 positionSchema = "startL int, stopL int, startR int, stopR int"
251 tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
252 dbConnection.execute("create table multisplices %s" % tableSchema)
254 dbConnection.commit()
257 def getMetadata(self, valueName=""):
258 """ returns a dictionary of metadata.
264 whereClause = " where name='%s'" % valueName
266 sql = self.getSqlCursor()
268 sql.execute("select name, value from metadata %s" % whereClause)
269 results = sql.fetchall()
272 parameterName = row["name"]
273 parameterValue = row["value"]
274 if parameterName not in resultsDict:
275 resultsDict[parameterName] = parameterValue
280 newName = string.join([parameterName, str(index)], ":")
281 if newName not in resultsDict:
282 resultsDict[newName] = parameterValue
290 def getReadSize(self):
291 """ returns readsize if defined in metadata.
293 metadata = self.getMetadata()
294 if "readsize" not in metadata:
295 raise ReadDatasetError("no readsize parameter defined")
297 readSize = metadata["readsize"]
298 if "import" in readSize:
299 readSize = readSize.split()[0]
301 readSize = int(readSize)
303 raise ReadDatasetError("readsize is negative")
308 def getDefaultCacheSize(self):
309 """ returns the default cache size.
311 return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
314 def getChromosomes(self, table="uniqs", fullChrom=True):
315 """ returns a sorted list of distinct chromosomes in table.
317 statement = "select distinct chrom from %s" % table
318 sql = self.getSqlCursor()
320 sql.execute(statement)
324 if row["chrom"] not in results:
325 results.append(row["chrom"])
327 shortName = row["chrom"][3:]
328 if len(shortName.strip()) > 0 and shortName not in results:
329 results.append(shortName)
336 def getMaxCoordinate(self, chrom, doUniqs=True,
337 doMulti=False, doSplices=False):
338 """ returns the maximum coordinate for reads on a given chromosome.
343 maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
346 spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
347 maxCoord = max(spliceMax, maxCoord)
350 multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
351 maxCoord = max(multiMax, maxCoord)
356 def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
358 sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
359 sql = self.getSqlCursor()
361 sql.execute(sqlStatement)
362 maxCoord = int(sql.fetchall()[0][0])
364 print "couldn't retrieve coordMax for chromosome %s" % chrom
369 def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
370 flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
371 withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
372 readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
373 flagLike=False, strand='', combine5p=False):
374 """ returns a dictionary of reads in a variety of formats
375 and which can be restricted by chromosome or custom-flag.
376 Returns unique reads by default, but can return multireads
377 with doMulti set to True.
380 #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
382 whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
384 selectQuery = "select start, sense, sum(weight)"
386 selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
388 groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
390 stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
392 stmt.append("UNION ALL")
393 stmt.append(selectQuery)
394 stmt.append("from multi")
395 stmt.append(whereQuery)
396 stmt.append(groupQuery)
398 stmt = [selectQuery, "from multi", whereQuery]
402 selectQuery = "select start, sense, weight, chrom"
405 subSelect = [selectQuery, "from uniqs", whereQuery]
407 subSelect.append("union all")
408 subSelect.append(selectQuery)
409 subSelect.append("from multi")
410 subSelect.append(whereQuery)
412 subSelect = [selectQuery, "from multi", whereQuery]
414 sqlStmt = string.join(subSelect)
416 selectQuery = "select start, sense, sum(weight)"
418 stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
419 selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
423 self.memcon.row_factory = None
425 self.dbcon.row_factory = None
427 stmt.append("order by start")
429 stmt.append("order by readID, start")
431 stmt.append("order by chrom, start")
433 sql = self.getSqlCursor()
434 sqlQuery = string.join(stmt)
435 sql.execute(sqlQuery)
439 resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
441 self.memcon.row_factory = sqlite.Row
443 self.dbcon.row_factory = sqlite.Row
449 readID = row["readID"]
453 chrom = row["chrom"][3:]
455 if not readIDDict and chrom != currentChrom:
456 resultsDict[chrom] = []
462 theReadID = readID.split("::")[0]
464 if "/" in theReadID and withPairID:
465 (theReadID, pairID) = readID.split("/")
467 if theReadID != currentReadID:
468 resultsDict[theReadID] = []
469 currentReadID = theReadID
472 newrow = {"start": int(row["start"])}
474 newrow["stop"] = int(row["stop"])
477 newrow["sense"] = row["sense"]
480 newrow["weight"] = float(row["weight"])
483 newrow["flag"] = row["flag"]
486 newrow["mismatch"] = row["mismatch"]
489 newrow["readID"] = readID
492 newrow["chrom"] = chrom
495 newrow["pairID"] = pairID
497 resultsDict[dictKey].append(newrow)
502 def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
511 if chrom != "" and chrom != self.memChrom:
512 whereClause.append("chrom = '%s'" % chrom)
516 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
517 whereClause.append(flagLikeClause)
519 whereClause.append("flag = '%s'" % flag)
522 whereClause.append("%s > %d" % (startText, start))
525 whereClause.append("%s < %d" % (stopText, stop))
527 if len(readLike) > 0:
528 readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
529 whereClause.append(readIDClause)
532 whereClause.append("mismatch != ''")
534 if strand in ["+", "-"]:
535 whereClause.append("sense = '%s'" % strand)
537 if len(whereClause) > 0:
538 whereStatement = string.join(whereClause, " and ")
539 whereQuery = "where %s" % whereStatement
546 def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
548 selectClause = [baseSelect]
550 selectClause.append("stop")
553 selectClause.append("sense")
556 selectClause.append("weight")
559 selectClause.append("flag")
562 selectClause.append("mismatch")
564 selectQuery = string.join(selectClause, ",")
569 def getReadGroupQuery(self, findallOptimize, limit, combine5p):
572 groupBy = ["GROUP BY start, sense"]
574 if limit > 0 and not combine5p:
575 groupBy.append("LIMIT %d" % limit)
577 groupQuery = string.join(groupBy)
582 def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
583 flag="", withWeight=False, withFlag=False, withMismatch=False,
584 withID=False, withChrom=False, withPairID=False, readIDDict=False,
585 splitRead=False, hasMismatch=False, flagLike=False, start=-1,
587 """ returns a dictionary of spliced reads in a variety of
588 formats and which can be restricted by chromosome or custom-flag.
589 Returns unique spliced reads for now.
591 whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
592 selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
593 selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
594 sql = self.getSqlCursor()
596 stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
603 readID = row["readID"]
607 chrom = row["chrom"][3:]
609 if not readIDDict and chrom != currentChrom:
610 resultsDict[chrom] = []
615 (theReadID, pairID) = readID.split("/")
619 if theReadID != currentReadID:
620 resultsDict[theReadID] = []
621 currentReadID = theReadID
624 newrow = {"startL": int(row["startL"])}
625 newrow["stopL"] = int(row["stopL"])
626 newrow["startR"] = int(row["startR"])
627 newrow["stopR"] = int(row["stopR"])
629 newrow["sense"] = row["sense"]
632 newrow["weight"] = float(row["weight"])
635 newrow["flag"] = row["flag"]
638 newrow["mismatch"] = row["mismatch"]
641 newrow["readID"] = readID
644 newrow["chrom"] = chrom
647 newrow["pairID"] = pairID
650 leftDict = newrow.copy()
651 del leftDict["startR"]
652 del leftDict["stopR"]
654 del rightDict["startL"]
655 del rightDict["stopL"]
656 resultsDict[dictKey].append(leftDict)
657 resultsDict[dictKey].append(rightDict)
659 resultsDict[dictKey].append(newrow)
664 def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
665 splices=False, reportCombined=True, sense="both"):
666 """ return read counts for a given region.
672 if sense in ["+", "-"]:
673 restrict = " sense ='%s' " % sense
677 ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
683 mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
689 scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
694 total = ucount + mcount + scount
697 return (ucount, mcount, scount)
700 def getTotalCounts(self, chrom="", rmin="", rmax=""):
701 """ return read counts for a given region.
703 return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
706 def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
707 """ returns the number of row in the specified table.
712 if chrom !="" and chrom != self.memChrom:
713 whereClause = ["chrom='%s'" % chrom]
716 whereClause.append("%s >= %s" % (startField, str(rmin)))
719 whereClause.append("%s <= %s" % (startField, str(rmax)))
722 whereClause.append(restrict)
724 if len(whereClause) > 0:
725 whereStatement = string.join(whereClause, " and ")
726 whereQuery = "where %s" % whereStatement
730 sql = self.getSqlCursor()
733 sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
735 sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
737 result = sql.fetchone()
740 count = int(result[0])
747 def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
748 """ returns the number of row in the splices table.
750 # TODO: if the rds type is DNA should this just return zero?
751 return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
754 def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
755 """ returns the number of distinct readIDs in the uniqs table.
757 return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
760 def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
761 """ returns the total weight of readIDs in the multi table.
763 return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
766 def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
771 stmt.append("select readID from uniqs")
774 stmt.append("select readID from multi")
777 stmt.append("select readID from splices")
780 selectPart = string.join(stmt, " union ")
786 limitPart = "LIMIT %d" % limit
788 sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
789 sql = self.getSqlCursor()
790 sql.execute(sqlQuery)
791 result = sql.fetchall()
794 return [x[0].split("/")[0] for x in result]
796 return [x[0] for x in result]
799 def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
800 """ returns the uniq and spliced mismatches in a dictionary.
802 readlen = self.getReadSize()
804 hitChromList = [mischrom]
806 hitChromList = self.getChromosomes()
810 for achrom in hitChromList:
812 print "getting mismatches from chromosome %s" % (achrom)
815 if useSplices and self.dataType == "RNA":
816 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
817 spliceIDList = spliceDict.keys()
818 for spliceID in spliceIDList:
819 spliceEntry = spliceDict[spliceID][0]
820 startpos = spliceEntry["startL"]
821 lefthalf = spliceEntry["stopL"]
822 rightstart = spliceEntry["startR"]
823 sense = spliceEntry["sense"]
824 mismatches = spliceEntry["mismatch"]
825 spMismatchList = mismatches.split(",")
826 for mismatch in spMismatchList:
830 change_len = len(mismatch)
832 change_from = mismatch[0]
833 change_base = mismatch[change_len-1]
834 change_pos = int(mismatch[1:change_len-1])
836 change_from = getReverseComplement(mismatch[0])
837 change_base = getReverseComplement(mismatch[change_len-1])
838 change_pos = readlen - int(mismatch[1:change_len-1]) + 1
840 firsthalf = int(lefthalf)-int(startpos)+1
842 if int(change_pos) <= int(firsthalf):
843 change_at = startpos + change_pos - 1
845 secondhalf = change_pos - firsthalf
846 change_at = rightstart + secondhalf
848 snpDict[achrom].append([startpos, change_at, change_base, change_from])
850 hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
851 if achrom not in hitDict.keys():
854 for readEntry in hitDict[achrom]:
855 start = readEntry["start"]
856 sense = readEntry["sense"]
857 mismatches = readEntry["mismatch"]
858 mismatchList = mismatches.split(",")
859 for mismatch in mismatchList:
863 change_len = len(mismatch)
865 change_from = mismatch[0]
866 change_base = mismatch[change_len-1]
867 change_pos = int(mismatch[1:change_len-1])
869 change_from = getReverseComplement(mismatch[0])
870 change_base = getReverseComplement(mismatch[change_len-1])
871 change_pos = readlen - int(mismatch[1:change_len-1]) + 1
873 change_at = start + change_pos - 1
874 snpDict[achrom].append([start, change_at, change_base, change_from])
879 def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
880 useSplices=False, normalizationFactor=1.0, trackStrand=False,
881 keepStrand="both", shiftValue=0):
882 """return a profile of the chromosome as an array of per-base read coverage....
883 keepStrand = 'both', 'plusOnly', or 'minusOnly'.
884 Will also shift position of unique and multireads (but not splices) if shift is a natural number
886 metadata = self.getMetadata()
888 readlen = int(metadata["readsize"])
892 dataType = metadata["dataType"]
893 scale = 1. / normalizationFactor
895 shift["+"] = int(shiftValue)
896 shift["-"] = -1 * int(shiftValue)
899 lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
901 lastNT = cstop - cstart + readlen + shift["+"]
903 chromModel = array("f",[0.] * lastNT)
904 hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
908 for readEntry in hitDict[chromosome]:
909 hstart = readEntry["start"]
910 sense = readEntry ["sense"]
911 weight = readEntry["weight"]
912 hstart = hstart - cstart + shift[sense]
913 for currentpos in range(hstart,hstart+readlen):
915 if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
916 chromModel[currentpos] += scale * weight
917 elif sense == "-" and keepStrand != "plusOnly":
918 chromModel[currentpos] -= scale * weight
923 if useSplices and dataType == "RNA":
925 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
927 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
929 if chromosome in spliceDict:
930 for spliceEntry in spliceDict[chromosome]:
931 Lstart = spliceEntry["startL"]
932 Lstop = spliceEntry["stopL"]
933 Rstart = spliceEntry["startR"]
934 Rstop = spliceEntry["stopR"]
935 rsense = spliceEntry["sense"]
936 if (Rstop - cstart) < lastNT:
937 for index in range(abs(Lstop - Lstart)):
938 currentpos = Lstart - cstart + index
939 # we only track unique splices
940 if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
941 chromModel[currentpos] += scale
942 elif rsense == "-" and keepStrand != "plusOnly":
943 chromModel[currentpos] -= scale
945 for index in range(abs(Rstop - Rstart)):
946 currentpos = Rstart - cstart + index
947 # we only track unique splices
948 if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
949 chromModel[currentpos] += scale
950 elif rsense == "-" and keepStrand != "plusOnly":
951 chromModel[currentpos] -= scale
958 def insertMetadata(self, valuesList):
959 """ inserts a list of (pname, pvalue) into the metadata
962 self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
966 def updateMetadata(self, pname, newValue, originalValue=""):
967 """ update a metadata field given the original value and the new value.
969 stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
970 if originalValue != "":
971 stmt += " and value='%s' " % str(originalValue)
973 self.dbcon.execute(stmt)
977 def insertUniqs(self, valuesList):
978 """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
979 into the uniqs table.
981 self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
985 def insertMulti(self, valuesList):
986 """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
987 into the multi table.
989 self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
993 def insertSplices(self, valuesList):
994 """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
995 into the splices table.
997 self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
1001 def insertMultisplices(self, valuesList):
1002 """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
1003 into the multisplices table.
1005 self.dbcon.executemany("insert into multisplices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
1009 def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
1010 """ update reads on file database in a list region of regions for a chromosome to have a new flag.
1011 regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
1012 sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
1016 restrict = " and sense = ? "
1019 self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1022 self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1024 if self.dataType == "RNA" and splices:
1025 self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
1026 self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
1031 def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1032 """ set the flag fields in the entire dataset.
1035 self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1038 self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1040 if self.dataType == "RNA" and splices:
1041 self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
1046 def resetFlags(self, uniqs=True, multi=True, splices=True):
1047 """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
1049 self.setFlags("", uniqs, multi, splices)
1052 def reweighMultireads(self, readList):
1053 self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
1056 def setSynchronousPragma(self, value="ON"):
1058 self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1060 print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
1063 def setDBcache(self, cache, default=False):
1064 self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
1066 self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
1069 def execute(self, statement, returnResults=False):
1070 sql = self.getSqlCursor()
1072 sql.execute(statement)
1074 result = sql.fetchall()
1078 def executeCommit(self, statement):
1079 self.execute(statement)
1082 self.memcon.commit()
1087 def buildIndex(self, cache=100000):
1088 """ Builds the file indeces for the main tables.
1089 Cache is the number of 1.5 kb pages to keep in memory.
1090 100000 pages translates into 150MB of RAM, which is our default.
1092 if cache > self.getDefaultCacheSize():
1093 self.setDBcache(cache)
1094 self.setSynchronousPragma("OFF")
1095 self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1096 print "built uPosIndex"
1097 self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
1098 print "built uChromIndex"
1099 self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1100 print "built mPosIndex"
1101 self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
1102 print "built mChromIndex"
1104 if self.dataType == "RNA":
1105 self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
1106 print "built sPosIndex"
1107 self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
1108 print "built sPosIndex2"
1109 self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
1110 print "built sChromIndex"
1113 self.setSynchronousPragma("ON")
1116 def dropIndex(self):
1117 """ drops the file indices for the main tables.
1120 self.setSynchronousPragma("OFF")
1121 self.dbcon.execute("DROP INDEX uPosIndex")
1122 self.dbcon.execute("DROP INDEX uChromIndex")
1123 self.dbcon.execute("DROP INDEX mPosIndex")
1124 self.dbcon.execute("DROP INDEX mChromIndex")
1126 if self.dataType == "RNA":
1127 self.dbcon.execute("DROP INDEX sPosIndex")
1129 self.dbcon.execute("DROP INDEX sPosIndex2")
1133 self.dbcon.execute("DROP INDEX sChromIndex")
1137 print "problem dropping index"
1139 self.setSynchronousPragma("ON")
1142 def memSync(self, chrom="", index=False):
1143 """ makes a copy of the dataset into memory for faster access.
1144 Can be restricted to a "full" chromosome. Can also build the
1148 self.memcon = sqlite.connect(":memory:")
1149 self.initializeTables(self.memcon)
1150 cursor = self.dbcon.cursor()
1153 print "memSync %s" % chrom
1154 whereclause = " where chrom = '%s' " % chrom
1155 self.memChrom = chrom
1159 self.memcon.execute("PRAGMA temp_store = MEMORY")
1160 self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
1161 # copy metadata to memory
1162 self.memcon.execute("delete from metadata")
1163 results = cursor.execute("select name, value from metadata")
1166 results2.append((row["name"], row["value"]))
1168 self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
1170 self.copyDBEntriesToMemory("uniqs", whereclause)
1171 self.copyDBEntriesToMemory("multi", whereclause)
1172 if self.dataType == "RNA":
1173 self.copySpliceDBEntriesToMemory(whereclause)
1177 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
1178 self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
1179 if self.dataType == "RNA":
1180 self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
1181 self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
1183 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1184 self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1185 if self.dataType == "RNA":
1186 self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
1187 self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
1189 self.memBacked = True
1190 self.memcon.row_factory = sqlite.Row
1191 self.memcon.commit()
1194 def copyDBEntriesToMemory(self, dbName, whereClause=""):
1195 cursor = self.dbcon.cursor()
1196 sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
1197 destinationEntries = []
1198 for row in sourceEntries:
1199 destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
1201 self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
1204 def copySpliceDBEntriesToMemory(self, whereClause=""):
1205 cursor = self.dbcon.cursor()
1206 sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
1207 destinationEntries = []
1208 for row in sourceEntries:
1209 destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
1210 row["weight"], row["flag"], row["mismatch"]))
1212 self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)