1 import sqlite3 as sqlite
6 from array import array
7 from commoncode import getReverseComplement, getConfigParser, getConfigOption
9 currentRDSVersion = "2.0"
12 class ReadDatasetError(Exception):
17 """ Class for storing reads from experiments. Assumes that custom scripts
18 will translate incoming data into a format that can be inserted into the
19 class using the insert* methods. Default class subtype ('DNA') includes
20 tables for unique and multireads, whereas 'RNA' subtype also includes a
# Constructor: opens (optionally caches locally, optionally initializes) the
# RDS sqlite file and prints a summary of its contents.
# NOTE(review): this listing is mangled — original line numbers are fused into
# the text, indentation is stripped, and several control-flow lines
# (try/except, if/else headers) are elided.  Code kept byte-identical.
24 def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
25 cache=False, reportCount=True):
26 """ creates an rds datafile if initialize is set to true, otherwise
27 will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
# Default state: file-backed connection, no local cache copy yet.
32 self.rdsVersion = currentRDSVersion
33 self.memBacked = False
36 self.cachedDBFile = ""
# presumably guarded by "if cache:" (elided) — copy to local temp before opening
42 self.cacheDB(datafile)
43 dbFile = self.cachedDBFile
47 self.dbcon = sqlite.connect(dbFile)
48 self.dbcon.row_factory = sqlite.Row
49 self.dbcon.execute("PRAGMA temp_store = MEMORY")
51 if datasetType not in ["DNA", "RNA"]:
52 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
# On initialize: create schema; otherwise read dataType/rdsVersion from metadata.
54 self.dataType = datasetType
56 self.initializeTables(self.dbcon)
58 metadata = self.getMetadata("dataType")
59 self.dataType = metadata["dataType"]
62 metadata = self.getMetadata("rdsVersion")
63 self.rdsVersion = metadata["rdsVersion"]
# Fallback (original wrapped in try/except, elided): stamp current version,
# or mark as pre-1.0 when the file cannot be written.
66 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
68 print "could not add rdsVersion - read-only ?"
69 self.rdsVersion = "pre-1.0"
# Verbose reporting of metadata and read counts (guards elided).
73 print "INITIALIZED dataset %s" % datafile
75 print "dataset %s" % datafile
77 metadata = self.getMetadata()
79 pnameList = metadata.keys()
81 for pname in pnameList:
82 print "\t" + pname + "\t" + metadata[pname]
85 ucount = self.getUniqsCount()
86 mcount = self.getMultiCount()
87 if self.dataType == "DNA" and not initialize:
89 print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
91 print "\n%s unique reads and %s multireads" % (ucount, mcount)
92 elif self.dataType == "RNA" and not initialize:
93 scount = self.getSplicesCount()
95 print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
97 print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
99 print "default cache size is %d pages" % self.getDefaultCacheSize()
107 """ return the number of usable reads in the dataset.
109 total = self.getUniqsCount()
110 total += self.getMultiCount()
112 if self.dataType == "RNA":
113 total += self.getSplicesCount()
121 """ cleanup copy in local cache, if present.
123 if self.cachedDBFile != "":
def cacheDB(self, filename):
    """ Copy the database file to a uniquely named file in the local
    cache directory and remember that path in self.cachedDBFile.

    The cache directory comes from the 'cistematic_temp' config option
    (section 'general', default /tmp).
    """
    configParser = getConfigParser()
    cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
    tempfile.tempdir = cisTemp
    # mkstemp() instead of the deprecated, race-prone mktemp(): the file is
    # created atomically.  Close the fd immediately — shutil.copyfile below
    # reopens the destination by name and overwrites it.
    (tempFD, cachePath) = tempfile.mkstemp(suffix=".db")
    os.close(tempFD)
    self.cachedDBFile = cachePath
    shutil.copyfile(filename, self.cachedDBFile)
def saveCacheDB(self, filename):
    """ Copy the locally cached database file back out to filename.
    """
    shutil.copyfile(self.cachedDBFile, filename)
144 """ delete geneinfoDB from local cache.
147 if self.cachedDBFile != "":
149 os.remove(self.cachedDBFile)
151 print "could not delete %s" % self.cachedDBFile
# Attach another sqlite database file under the alias `asname`.
# NOTE(review): the line that executes `stmt` is elided from this listing —
# presumably self.execute(stmt); confirm against the full source.
156 def attachDB(self, filename, asname):
157 """ attach another database file to the readDataset.
159 stmt = "attach '%s' as %s" % (filename, asname)
# Detach a previously attached database by alias.
# NOTE(review): the execute call is elided from this listing — presumably
# self.execute(stmt); confirm against the full source.
163 def detachDB(self, asname):
164 """ detach a database file to the readDataset.
166 stmt = "detach %s" % (asname)
def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
    """ import into current RDS the table (with columns destcolumns,
    with default all columns) from the database file asname,
    using the column specification of ascolumns (default all).
    With flagged set, only rows whose flag equals that value are copied.
    """
    stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
    if flagged != "":
        stmt += " where flag = '%s' " % flagged

    self.executeCommit(stmt)
def getTables(self, asname=""):
    """ get a list of table names in a particular database file.
    With asname given, looks in that attached database; otherwise
    in the main (or memory) database.
    """
    resultList = []
    sql = self.getSqlCursor()
    if asname != "":
        asname += "."

    stmt = "select name from %ssqlite_master where type='table'" % asname
    sql.execute(stmt)
    results = sql.fetchall()
    for row in results:
        resultList.append(row["name"])

    return resultList
def getSqlCursor(self):
    """ Return a cursor on the active database: the in-memory copy
    when memBacked is set, otherwise the file-backed connection.
    """
    if self.memBacked:
        sql = self.getMemCursor()
    else:
        sql = self.getFileCursor()

    return sql
# Fragment of an index-presence check; the def line is elided from this
# listing.  Counts index entries in sqlite_master.
211 """ check whether the RDS file has at least one index.
213 stmt = "select count(*) from sqlite_master where type='index'"
214 count = int(self.execute(stmt, returnResults=True)[0][0])
def initializeTables(self, dbConnection, cache=100000):
    """ creates table schema in a database connection, which is
    typically a database file or an in-memory database.

    Builds metadata, uniqs and multi tables; RNA datasets also get a
    splices table with left/right exon coordinates.  `cache` is the
    default page-cache size recorded for the database.
    """
    dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
    dbConnection.execute("create table metadata (name varchar, value varchar)")
    dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
    positionSchema = "start int, stop int"
    tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
    dbConnection.execute("create table uniqs %s" % tableSchema)
    dbConnection.execute("create table multi %s" % tableSchema)
    if self.dataType == "RNA":
        # spliced reads carry two aligned segments (left and right exon halves)
        positionSchema = "startL int, stopL int, startR int, stopR int"
        tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
        dbConnection.execute("create table splices %s" % tableSchema)

    dbConnection.commit()
def getFileCursor(self):
    """ returns a cursor to file database for low-level (SQL)
    access to the data.
    """
    return self.dbcon.cursor()
def getMemCursor(self):
    """ returns a cursor to memory database for low-level (SQL)
    access to the data.
    """
    return self.memcon.cursor()
# Read the metadata table into a dict; with valueName given, restrict to that
# one parameter.  Duplicate names get suffixed keys ("name:2", ...).
# NOTE(review): the loop headers, resultsDict initialization, and the
# duplicate-index bookkeeping are elided from this listing.
254 def getMetadata(self, valueName=""):
255 """ returns a dictionary of metadata.
261 whereClause = " where name='%s'" % valueName
263 sql = self.getSqlCursor()
265 sql.execute("select name, value from metadata %s" % whereClause)
266 results = sql.fetchall()
# iterating rows (for-loop header elided): first occurrence keeps the bare name
269 parameterName = row["name"]
270 parameterValue = row["value"]
271 if parameterName not in resultsDict:
272 resultsDict[parameterName] = parameterValue
# duplicates get "name:index" keys; index management elided
277 newName = string.join([parameterName, str(index)], ":")
278 if newName not in resultsDict:
279 resultsDict[newName] = parameterValue
def getReadSize(self):
    """ returns readsize if defined in metadata.

    Raises ReadDatasetError when no readsize entry exists.  Tolerates
    legacy values of the form '<size> import ...' by taking the first
    whitespace-separated token.
    """
    metadata = self.getMetadata()
    if "readsize" not in metadata:
        raise ReadDatasetError("no readsize parameter defined")
    else:
        mysize = metadata["readsize"]
        if "import" in mysize:
            mysize = mysize.split()[0]

        return int(mysize)
def getDefaultCacheSize(self):
    """ returns the default cache size, in pages.
    """
    return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
# List the distinct chromosomes appearing in `table`.  With fullChrom=False,
# the 'chr' prefix is stripped and empty remainders are skipped.
# NOTE(review): results initialization, the row loop header, the
# fullChrom if/else structure, a continue, and the return are elided.
307 def getChromosomes(self, table="uniqs", fullChrom=True):
308 """ returns a list of distinct chromosomes in table.
310 statement = "select distinct chrom from %s" % table
311 sql = self.getSqlCursor()
313 sql.execute(statement)
# fullChrom branch: keep names like 'chr1' verbatim
317 if row["chrom"] not in results:
318 results.append(row["chrom"])
# short-name branch: drop the 3-char 'chr' prefix; skip blanks
320 if len(row["chrom"][3:].strip()) < 1:
323 if row["chrom"][3:] not in results:
324 results.append(row["chrom"][3:])
# Highest read coordinate on a chromosome across the selected tables.
# NOTE(review): try/except wrappers, the doUniqs/doSplices/doMulti guards,
# the maxCoord updates, and the return are elided from this listing.
331 def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
332 doMulti=False, doSplices=False):
333 """ returns the maximum coordinate for reads on a given chromosome.
336 sql = self.getSqlCursor()
# uniqs (original wraps the int() in try/except for empty chromosomes)
340 sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
341 maxCoord = int(sql.fetchall()[0][0])
343 print "couldn't retrieve coordMax for chromosome %s" % chrom
# splices use the rightmost segment start
346 sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
348 spliceMax = int(sql.fetchall()[0][0])
349 if spliceMax > maxCoord:
# multireads
355 sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
357 multiMax = int(sql.fetchall()[0][0])
358 if multiMax > maxCoord:
364 print "%s maxCoord: %d" % (chrom, maxCoord)
# Workhorse query: returns reads keyed by chromosome (or by readID when
# readIDDict=True) with a configurable set of per-read fields, optionally
# unioning uniqs and multi tables.  findallOptimize collapses to
# (start, sense, sum(weight)) tuples.  Too intricate and too heavily elided
# to restyle safely — code kept byte-identical.
369 def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
370 flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
371 withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
372 readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
373 flagLike=False, strand='', combine5p=False):
374 """ returns a dictionary of reads in a variety of formats
375 and which can be restricted by chromosome or custom-flag.
376 Returns unique reads by default, but can return multireads
377 with doMulti set to True.
379 Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
# --- build WHERE clause (several guard lines elided) ---
384 if chrom != "" and chrom != self.memChrom:
385 whereClause.append("chrom = '%s'" % chrom)
389 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
390 whereClause.append(flagLikeClause)
392 whereClause.append("flag = '%s'" % flag)
395 whereClause.append("start > %d" % start)
398 whereClause.append("stop < %d" % stop)
400 if len(readLike) > 0:
401 readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
402 whereClause.append(readIDClause)
405 whereClause.append("mismatch != ''")
407 if strand in ["+", "-"]:
408 whereClause.append("sense = '%s'" % strand)
410 if len(whereClause) > 0:
411 whereStatement = string.join(whereClause, " and ")
412 whereQuery = "where %s" % whereStatement
# --- build SELECT clause; findallOptimize aggregates by (start, sense) ---
418 selectClause = ["select start, sense, sum(weight)"]
419 groupBy = ["GROUP BY start, sense"]
421 selectClause = ["select ID, chrom, start, readID"]
423 selectClause.append("stop")
426 selectClause.append("sense")
429 selectClause.append("weight")
432 selectClause.append("flag")
435 selectClause.append("mismatch")
437 if limit > 0 and not combine5p:
438 groupBy.append("LIMIT %d" % limit)
440 selectQuery = string.join(selectClause, ",")
441 groupQuery = string.join(groupBy)
# uniqs, optionally UNION ALL with multi (doUniqs/doMulti guards elided)
443 stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
445 stmt.append("UNION ALL")
446 stmt.append(selectQuery)
447 stmt.append("from multi")
448 stmt.append(whereQuery)
449 stmt.append(groupQuery)
451 stmt = [selectQuery, "from multi", whereQuery]
# combine5p path: aggregate shared 5' positions across chromosomes
455 selectQuery = "select start, sense, weight, chrom"
458 subSelect = [selectQuery, "from uniqs", whereQuery]
460 subSelect.append("union all")
461 subSelect.append(selectQuery)
462 subSelect.append("from multi")
463 subSelect.append(whereQuery)
465 subSelect = [selectQuery, "from multi", whereQuery]
467 sqlStmt = string.join(subSelect)
469 selectQuery = "select start, sense, sum(weight)"
471 stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
472 selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
# findallOptimize: drop the Row factory for cheap positional tuples
476 self.memcon.row_factory = None
477 sql = self.memcon.cursor()
479 self.dbcon.row_factory = None
480 sql = self.dbcon.cursor()
482 stmt.append("order by start")
# readIDDict ordering vs chromosome ordering (memBacked if/else elided)
485 sql = self.memcon.cursor()
487 sql = self.dbcon.cursor()
489 stmt.append("order by readID, start")
492 sql = self.memcon.cursor()
494 sql = self.dbcon.cursor()
496 stmt.append("order by chrom, start")
498 sqlQuery = string.join(stmt)
499 sql.execute(sqlQuery)
# fast path result: list of {start, sense, weight}; then restore Row factory
502 resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
504 self.memcon.row_factory = sqlite.Row
506 self.dbcon.row_factory = sqlite.Row
# --- general path: materialize rows into per-chrom or per-readID lists ---
512 readID = row["readID"]
516 chrom = row["chrom"][3:]
518 if not readIDDict and chrom != currentChrom:
519 resultsDict[chrom] = []
525 theReadID = readID.split("::")[0]
527 if "/" in theReadID and withPairID:
528 (theReadID, pairID) = readID.split("/")
530 if theReadID != currentReadID:
531 resultsDict[theReadID] = []
532 currentReadID = theReadID
535 newrow = {"start": int(row["start"])}
537 newrow["stop"] = int(row["stop"])
540 newrow["sense"] = row["sense"]
543 newrow["weight"] = float(row["weight"])
546 newrow["flag"] = row["flag"]
549 newrow["mismatch"] = row["mismatch"]
552 newrow["readID"] = readID
555 newrow["chrom"] = chrom
558 newrow["pairID"] = pairID
560 resultsDict[dictKey].append(newrow)
# Splice analogue of getReadsDict: unique spliced reads keyed by chromosome
# or readID; splitRead=True splits each splice into left/right half entries.
# Heavily elided (loop headers, guards, return) — code kept byte-identical.
565 def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
566 flag="", withWeight=False, withFlag=False, withMismatch=False,
567 withID=False, withChrom=False, withPairID=False, readIDDict=False,
568 splitRead=False, hasMismatch=False, flagLike=False, start=-1,
570 """ returns a dictionary of spliced reads in a variety of
571 formats and which can be restricted by chromosome or custom-flag.
572 Returns unique spliced reads for now.
# --- WHERE clause assembly (guards partially elided) ---
577 if chrom != "" and chrom != self.memChrom:
578 whereClause = ["chrom = '%s'" % chrom]
582 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
583 whereClause.append(flagLikeClause)
585 whereClause.append("flag = '%s'" % flag)
588 whereClause.append("mismatch != ''")
591 whereClause.append("sense = '%s'" % strand)
594 whereClause.append("startL > %d" % start)
597 whereClause.append("stopR < %d" % stop)
599 if len(whereClause) > 0:
600 whereStatement = string.join(whereClause, " and ")
601 whereQuery = "where %s" % whereStatement
# --- SELECT clause; optional fields appended on demand ---
605 selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
607 selectClause.append("sense")
610 selectClause.append("weight")
613 selectClause.append("flag")
616 selectClause.append("mismatch")
618 selectQuery = string.join(selectClause, " ,")
620 sql = self.memcon.cursor()
622 sql = self.dbcon.cursor()
624 stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
# --- row materialization (loop header elided) ---
630 readID = row["readID"]
634 chrom = row["chrom"][3:]
636 if not readIDDict and chrom != currentChrom:
637 resultsDict[chrom] = []
642 (theReadID, pairID) = readID.split("/")
646 if theReadID != currentReadID:
647 resultsDict[theReadID] = []
648 currentReadID = theReadID
651 newrow = {"startL": int(row["startL"])}
652 newrow["stopL"] = int(row["stopL"])
653 newrow["startR"] = int(row["startR"])
654 newrow["stopR"] = int(row["stopR"])
656 newrow["sense"] = row["sense"]
659 newrow["weight"] = float(row["weight"])
662 newrow["flag"] = row["flag"]
665 newrow["mismatch"] = row["mismatch"]
668 newrow["readID"] = readID
671 newrow["chrom"] = chrom
674 newrow["pairID"] = pairID
# splitRead: emit the two halves as separate records
677 leftDict = newrow.copy()
678 del leftDict["startR"]
679 del leftDict["stopR"]
681 del rightDict["startL"]
682 del rightDict["stopL"]
683 resultsDict[dictKey].append(leftDict)
684 resultsDict[dictKey].append(rightDict)
686 resultsDict[dictKey].append(newrow)
# Per-table weighted read counts for a region, optionally strand-restricted;
# returns a combined total or a (ucount, mcount, scount) tuple.
# NOTE(review): the uniqs/multi/splices guards and the reportCombined
# if/else around the returns are elided.
691 def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
692 splices=False, reportCombined=True, sense="both"):
693 """ return read counts for a given region.
699 if sense in ["+", "-"]:
700 restrict = " sense ='%s' " % sense
704 ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
710 mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
716 scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
721 total = ucount + mcount + scount
724 return (ucount, mcount, scount)
def getTotalCounts(self, chrom="", rmin="", rmax=""):
    """ return the combined read count (uniqs + multi + splices,
    both strands) for a given region.
    """
    return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
# Generic counter behind getUniqsCount/getMultiCount/getSplicesCount:
# sums weights (or counts distinct positions) over an optional region.
# NOTE(review): the else-branch initializing an empty whereQuery, the
# memBacked if/else, the try/except around int(result[0]) and the return
# are elided from this listing.
733 def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
734 """ returns the number of row in the uniqs table.
739 if chrom !="" and chrom != self.memChrom:
740 whereClause = ["chrom='%s'" % chrom]
743 whereClause.append("%s >= %s" % (startField, str(rmin)))
746 whereClause.append("%s <= %s" % (startField, str(rmax)))
749 whereClause.append(restrict)
751 if len(whereClause) > 0:
752 whereStatement = string.join(whereClause, " and ")
753 whereQuery = "where %s" % whereStatement
758 sql = self.memcon.cursor()
760 sql = self.dbcon.cursor()
# distinct counts positions; default sums read weights
763 sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
765 sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
767 result = sql.fetchone()
770 count = int(result[0])
def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
    """ returns the number of rows in the splices table, keyed on the
    left segment start (startL).
    """
    return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
    """ returns the number of distinct readIDs in the uniqs table.
    """
    return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
    """ returns the total weight of readIDs in the multi table.
    """
    return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
# Union of readIDs across the selected tables, optionally limited; with
# paired=True the trailing "/1"-style pair suffix is stripped.
# NOTE(review): stmt initialization, the per-table and limit guards, the
# memBacked if/else and the paired if/else headers are elided.
795 def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
801 limitPart = "LIMIT %d" % limit
804 stmt.append("select readID from uniqs")
807 stmt.append("select readID from multi")
810 stmt.append("select readID from splices")
813 selectPart = string.join(stmt, " union ")
817 sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
819 sql = self.memcon.cursor()
821 sql = self.dbcon.cursor()
823 sql.execute(sqlQuery)
824 result = sql.fetchall()
# paired: drop the "/pairID" suffix
827 return [x[0].split("/")[0] for x in result]
829 return [x[0] for x in result]
# Collect per-chromosome SNP candidates from the mismatch fields of unique
# (and, for RNA, spliced) reads.  Mismatch tokens look like 'A10G' (from-base,
# 1-based position in the read, to-base); minus-strand reads are
# reverse-complemented and their position mirrored.  Kept byte-identical —
# loop/guard lines and snpDict initialization are elided.
832 def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
833 """ returns the uniq and spliced mismatches in a dictionary.
835 readlen = self.getReadSize()
837 hitChromList = [mischrom]
839 hitChromList = self.getChromosomes()
843 for achrom in hitChromList:
845 print "getting mismatches from chromosome %s" % (achrom)
848 hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
849 if useSplices and self.dataType == "RNA":
850 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
851 spliceIDList = spliceDict.keys()
852 for k in spliceIDList:
853 spliceEntry = spliceDict[k][0]
854 startpos = spliceEntry["startL"]
855 lefthalf = spliceEntry["stopL"]
856 rightstart = spliceEntry["startR"]
857 sense = spliceEntry["sense"]
858 mismatches = spliceEntry["mismatch"]
859 spMismatchList = mismatches.split(",")
860 for mismatch in spMismatchList:
# skip markers / N-calls presumably filtered here (guard elided)
864 change_len = len(mismatch)
866 change_from = mismatch[0]
867 change_base = mismatch[change_len-1]
868 change_pos = int(mismatch[1:change_len-1])
# minus strand: complement bases and mirror the in-read position
870 change_from = getReverseComplement([mismatch[0]])
871 change_base = getReverseComplement([mismatch[change_len-1]])
872 change_pos = readlen - int(mismatch[1:change_len-1]) + 1
# map read-relative position onto genome across the splice junction
874 firsthalf = int(lefthalf)-int(startpos)+1
876 if int(change_pos) <= int(firsthalf):
877 change_at = startpos + change_pos - 1
879 secondhalf = change_pos - firsthalf
880 change_at = rightstart + secondhalf
882 snpDict[achrom].append([startpos, change_at, change_base, change_from])
884 if achrom not in hitDict.keys():
887 for readEntry in hitDict[achrom]:
888 start = readEntry["start"]
889 sense = readEntry["sense"]
890 mismatches = readEntry["mismatch"]
891 mismatchList = mismatches.split(",")
892 for mismatch in mismatchList:
896 change_len = len(mismatch)
898 change_from = mismatch[0]
899 change_base = mismatch[change_len-1]
900 change_pos = int(mismatch[1:change_len-1])
902 change_from = getReverseComplement([mismatch[0]])
903 change_base = getReverseComplement([mismatch[change_len-1]])
904 change_pos = readlen - int(mismatch[1:change_len-1]) + 1
906 change_at = start + change_pos - 1
907 snpDict[achrom].append([start, change_at, change_base, change_from])
# Build a per-base float coverage array for one chromosome, optionally
# shifted per strand and signed by strand when trackStrand=True.
# Kept byte-identical — shift-dict initialization, cstart guards, and
# try/except around array writes are elided from this listing.
912 def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
913 useSplices=False, normalizationFactor = 1.0, trackStrand=False,
914 keepStrand="both", shiftValue=0):
915 """return a profile of the chromosome as an array of per-base read coverage....
916 keepStrand = 'both', 'plusOnly', or 'minusOnly'.
917 Will also shift position of unique and multireads (but not splices) if shift is a natural number
919 metadata = self.getMetadata()
921 readlen = int(metadata["readsize"])
925 dataType = metadata["dataType"]
926 scale = 1. / normalizationFactor
# symmetric per-strand shift (dict creation elided)
928 shift['+'] = int(shiftValue)
929 shift['-'] = -1 * int(shiftValue)
# profile length: whole chromosome or requested window
932 lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
934 lastNT = cstop - cstart + readlen + shift["+"]
936 chromModel = array("f",[0.] * lastNT)
937 hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
941 for readEntry in hitDict[chromosome]:
942 hstart = readEntry["start"]
943 sense = readEntry ["sense"]
944 weight = readEntry["weight"]
945 hstart = hstart - cstart + shift[sense]
946 for currentpos in range(hstart,hstart+readlen):
# minus-strand coverage is subtracted when strand tracking is on
948 if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
949 chromModel[currentpos] += scale * weight
950 elif sense == "-" and keepStrand != "plusOnly":
951 chromModel[currentpos] -= scale * weight
956 if useSplices and dataType == "RNA":
958 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
960 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
962 if chromosome in spliceDict:
963 for spliceEntry in spliceDict[chromosome]:
964 Lstart = spliceEntry["startL"]
965 Lstop = spliceEntry["stopL"]
966 Rstart = spliceEntry["startR"]
967 Rstop = spliceEntry["stopR"]
968 rsense = spliceEntry["sense"]
969 if (Rstop - cstart) < lastNT:
970 for index in range(abs(Lstop - Lstart)):
971 currentpos = Lstart - cstart + index
972 # we only track unique splices
973 if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
974 chromModel[currentpos] += scale
975 elif rsense == "-" and keepStrand != "plusOnly":
976 chromModel[currentpos] -= scale
978 for index in range(abs(Rstop - Rstart)):
979 currentpos = Rstart - cstart + index
980 # we only track unique splices
981 if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
982 chromModel[currentpos] += scale
983 elif rsense == "-" and keepStrand != "plusOnly":
984 chromModel[currentpos] -= scale
def insertMetadata(self, valuesList):
    """ inserts a list of (pname, pvalue) pairs into the metadata
    table and commits.
    """
    self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
    # commit so the metadata survives if the process dies before the next write
    self.dbcon.commit()
def updateMetadata(self, pname, newValue, originalValue=""):
    """ update a metadata field given the original value and the new value.
    With originalValue given, only rows currently holding that value
    are changed.
    """
    stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
    if originalValue != "":
        stmt += " and value='%s' " % str(originalValue)

    self.dbcon.execute(stmt)
    self.dbcon.commit()
def insertUniqs(self, valuesList):
    """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
    into the uniqs table and commits.
    """
    self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
    self.dbcon.commit()
def insertMulti(self, valuesList):
    """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
    into the multi table and commits.
    """
    self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
    self.dbcon.commit()
def insertSplices(self, valuesList):
    """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
    into the splices table and commits.
    """
    self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
    self.dbcon.commit()
# Batch-set the flag field on reads falling in regions; splices are tagged
# on both halves with ' L:'/' R:' markers appended to the existing flag.
# NOTE(review): the restrict initialization / sense guard, the uniqs/multi
# guards, and the final commit are elided from this listing.
1034 def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
1035 """ update reads on file database in a list region of regions for a chromosome to have a new flag.
1036 regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
1037 sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
1041 restrict = " and sense = ? "
1044 self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1047 self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1049 if self.dataType == "RNA" and splices:
1050 self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
1051 self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
# Overwrite the flag column of whole tables with a single value.
# NOTE(review): the "if uniqs:" / "if multi:" guards and the commit are
# elided from this listing.
1056 def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1057 """ set the flag fields in the entire dataset.
1060 self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1063 self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1065 if self.dataType == "RNA" and splices:
1066 self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
def resetFlags(self, uniqs=True, multi=True, splices=True):
    """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
    """
    self.setFlags("", uniqs, multi, splices)
def reweighMultireads(self, readList):
    """ batch-update multiread weights; readList rows are
    (weight, chrom, start, readID).
    """
    self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
# Toggle sqlite's SYNCHRONOUS pragma (OFF speeds bulk loads at crash risk).
# NOTE(review): the try/except wrapping the execute is elided.
1081 def setSynchronousPragma(self, value="ON"):
1083 self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1085 print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
def setDBcache(self, cache, default=False):
    """ set the connection's page-cache size; with default=True also
    record it as the database's default cache size.
    """
    self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
    if default:
        self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
def execute(self, statement, returnResults=False):
    """ run a single SQL statement on the active (memory or file)
    database; with returnResults=True, fetch and return all rows.
    """
    sql = self.getSqlCursor()
    sql.execute(statement)
    if returnResults:
        result = sql.fetchall()
        return result
def executeCommit(self, statement):
    """ run a statement, then commit on whichever connection is active
    (memory copy when memBacked, otherwise the file database).
    """
    self.execute(statement)
    if self.memBacked:
        self.memcon.commit()
    else:
        self.dbcon.commit()
# Create the chrom/position indices on the main tables, with SYNCHRONOUS
# off for speed during the builds.  Kept byte-identical — the try/except
# around index creation and the commit are elided.
1112 def buildIndex(self, cache=100000):
1113 """ Builds the file indeces for the main tables.
1114 Cache is the number of 1.5 kb pages to keep in memory.
1115 100000 pages translates into 150MB of RAM, which is our default.
# only grow the cache, never shrink it
1117 if cache > self.getDefaultCacheSize():
1118 self.setDBcache(cache)
1119 self.setSynchronousPragma("OFF")
1120 self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1121 print "built uPosIndex"
1122 self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
1123 print "built uChromIndex"
1124 self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1125 print "built mPosIndex"
1126 self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
1127 print "built mChromIndex"
1129 if self.dataType == "RNA":
1130 self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
1131 print "built sPosIndex"
1132 self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
1133 print "built sPosIndex2"
1134 self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
1135 print "built sChromIndex"
# restore crash safety after the bulk index build
1138 self.setSynchronousPragma("ON")
# Drop the indices created by buildIndex (e.g. before a bulk reload).
# NOTE(review): the try/except wrapper and the commit are elided.
1141 def dropIndex(self):
1142 """ drops the file indices for the main tables.
1145 self.setSynchronousPragma("OFF")
1146 self.dbcon.execute("DROP INDEX uPosIndex")
1147 self.dbcon.execute("DROP INDEX uChromIndex")
1148 self.dbcon.execute("DROP INDEX mPosIndex")
1149 self.dbcon.execute("DROP INDEX mChromIndex")
1151 if self.dataType == "RNA":
1152 self.dbcon.execute("DROP INDEX sPosIndex")
1154 self.dbcon.execute("DROP INDEX sPosIndex2")
1158 self.dbcon.execute("DROP INDEX sChromIndex")
1162 print "problem dropping index"
1164 self.setSynchronousPragma("ON")
# Clone the dataset (optionally a single chromosome) into an in-memory
# sqlite database for faster access; optionally builds position indices.
# Kept byte-identical — several guards (chrom != "", index branches) and
# the whereclause default are elided from this listing.
1167 def memSync(self, chrom="", index=False):
1168 """ makes a copy of the dataset into memory for faster access.
1169 Can be restricted to a "full" chromosome. Can also build the
1173 self.memcon = sqlite.connect(":memory:")
1174 self.initializeTables(self.memcon)
1175 cursor = self.dbcon.cursor()
1178 print "memSync %s" % chrom
1179 whereclause = " where chrom = '%s' " % chrom
1180 self.memChrom = chrom
1184 self.memcon.execute("PRAGMA temp_store = MEMORY")
1185 self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
1186 # copy metadata to memory
1187 self.memcon.execute("delete from metadata")
1188 results = cursor.execute("select name, value from metadata")
1191 results2.append((row["name"], row["value"]))
1193 self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
1195 self.copyDBEntriesToMemory("uniqs", whereclause)
1196 self.copyDBEntriesToMemory("multi", whereclause)
1197 if self.dataType == "RNA":
1198 self.copySpliceDBEntriesToMemory(whereclause)
# single-chromosome copies only need start-based indices
1202 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
1203 self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
1204 if self.dataType == "RNA":
1205 self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
1206 self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
# full copies index on (chrom, start)
1208 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1209 self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1210 if self.dataType == "RNA":
1211 self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
1212 self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
1214 self.memBacked = True
1215 self.memcon.row_factory = sqlite.Row
1216 self.memcon.commit()
def copyDBEntriesToMemory(self, dbName, whereClause=""):
    """ bulk-copy rows of a read table (uniqs or multi) from the file
    database into the in-memory database, optionally filtered by a
    'where' clause (e.g. restriction to one chromosome).
    """
    cursor = self.dbcon.cursor()
    sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
    destinationEntries = []
    for row in sourceEntries:
        destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))

    # ID is NULL so the memory table assigns fresh primary keys
    self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
def copySpliceDBEntriesToMemory(self, whereClause=""):
    """ bulk-copy splices rows from the file database into the
    in-memory database, optionally filtered by a 'where' clause.
    """
    cursor = self.dbcon.cursor()
    sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
    destinationEntries = []
    for row in sourceEntries:
        destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
                                   row["weight"], row["flag"], row["mismatch"]))

    # BUG FIX: the insert listed 11 columns but supplied only NULL + 8
    # placeholders; 10 '?' are required to match the 10-tuples built above
    # (compare insertSplices, which uses the correct placeholder count).
    self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)