7 import sqlite3 as sqlite
12 from os import environ
13 from array import array
14 from commoncode import getReverseComplement
16 if environ.get("CISTEMATIC_TEMP"):
17 cisTemp = environ.get("CISTEMATIC_TEMP")
21 tempfile.tempdir = cisTemp
22 currentRDSVersion = "1.1"
25 class ReadDatasetError(Exception):
30 """ Class for storing reads from experiments. Assumes that custom scripts
31 will translate incoming data into a format that can be inserted into the
32 class using the insert* methods. Default class subtype ('DNA') includes
33 tables for unique and multireads, whereas 'RNA' subtype also includes a
37 def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
38 cache=False, reportCount=True):
39 """ creates an rds datafile if initialize is set to true, otherwise
40 will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
45 self.rdsVersion = currentRDSVersion
46 self.memBacked = False
49 self.cachedDBFile = ""
55 self.cacheDB(datafile)
56 dbFile = self.cachedDBFile
60 self.dbcon = sqlite.connect(dbFile)
61 self.dbcon.row_factory = sqlite.Row
62 self.dbcon.execute("PRAGMA temp_store = MEMORY")
64 if datasetType not in ["DNA", "RNA"]:
65 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
67 self.dataType = datasetType
69 self.initializeTables(self.dbcon)
71 metadata = self.getMetadata("dataType")
72 self.dataType = metadata["dataType"]
75 metadata = self.getMetadata("rdsVersion")
76 self.rdsVersion = metadata["rdsVersion"]
79 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
81 print "could not add rdsVersion - read-only ?"
82 self.rdsVersion = "pre-1.0"
86 print "INITIALIZED dataset %s" % datafile
88 print "dataset %s" % datafile
90 metadata = self.getMetadata()
92 pnameList = metadata.keys()
94 for pname in pnameList:
95 print "\t" + pname + "\t" + metadata[pname]
98 ucount = self.getUniqsCount()
99 mcount = self.getMultiCount()
100 if self.dataType == "DNA" and not initialize:
102 print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
104 print "\n%s unique reads and %s multireads" % (ucount, mcount)
105 elif self.dataType == "RNA" and not initialize:
106 scount = self.getSplicesCount()
108 print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
110 print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
112 print "default cache size is %d pages" % self.getDefaultCacheSize()
120 """ return the number of usable reads in the dataset.
122 total = self.getUniqsCount()
123 total += self.getMultiCount()
125 if self.dataType == "RNA":
126 total += self.getSplicesCount()
134 """ cleanup copy in local cache, if present.
136 if self.cachedDBFile != "":
140 def cacheDB(self, filename):
141 """ copy geneinfoDB to a local cache.
143 self.cachedDBFile = "%s.db" % tempfile.mktemp()
144 shutil.copyfile(filename, self.cachedDBFile)
147 def saveCacheDB(self, filename):
148 """ copy geneinfoDB to a local cache.
150 shutil.copyfile(self.cachedDBFile, filename)
154 """ delete geneinfoDB from local cache.
157 if self.cachedDBFile != "":
159 os.remove(self.cachedDBFile)
161 print "could not delete %s" % self.cachedDBFile
166 def attachDB(self, filename, asname):
167 """ attach another database file to the readDataset.
169 stmt = "attach '%s' as %s" % (filename, asname)
173 def detachDB(self, asname):
174 """ detach a database file to the readDataset.
176 stmt = "detach %s" % (asname)
180 def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
181 """ import into current RDS the table (with columns destcolumns,
182 with default all columns) from the database file asname,
183 using the column specification of ascolumns (default all).
185 stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
187 stmt += " where flag = '%s' " % flagged
189 self.executeCommit(stmt)
192 def getTables(self, asname=""):
193 """ get a list of table names in a particular database file.
196 sql = self.getSqlCursor()
201 stmt = "select name from %ssqlite_master where type='table'" % asname
203 results = sql.fetchall()
206 resultList.append(row["name"])
211 def getSqlCursor(self):
213 sql = self.getMemCursor()
215 sql = self.getFileCursor()
221 """ check whether the RDS file has at least one index.
223 stmt = "select count(*) from sqlite_master where type='index'"
224 count = int(self.execute(stmt, returnResults=True)[0][0])
231 def initializeTables(self, dbConnection, cache=100000):
232 """ creates table schema in a database connection, which is
233 typically a database file or an in-memory database.
235 dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
236 dbConnection.execute("create table metadata (name varchar, value varchar)")
237 dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
238 positionSchema = "start int, stop int"
239 tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
240 dbConnection.execute("create table uniqs %s" % tableSchema)
241 dbConnection.execute("create table multi %s" % tableSchema)
242 if self.dataType == "RNA":
243 positionSchema = "startL int, stopL int, startR int, stopR int"
244 tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
245 dbConnection.execute("create table splices %s" % tableSchema)
247 dbConnection.commit()
250 def getFileCursor(self):
251 """ returns a cursor to file database for low-level (SQL)
254 return self.dbcon.cursor()
257 def getMemCursor(self):
258 """ returns a cursor to memory database for low-level (SQL)
261 return self.memcon.cursor()
264 def getMetadata(self, valueName=""):
265 """ returns a dictionary of metadata.
271 whereClause = " where name='%s'" % valueName
273 sql = self.getSqlCursor()
275 sql.execute("select name, value from metadata %s" % whereClause)
276 results = sql.fetchall()
279 parameterName = row["name"]
280 parameterValue = row["value"]
281 if parameterName not in resultsDict:
282 resultsDict[parameterName] = parameterValue
287 newName = string.join([parameterName, str(index)], ":")
288 if newName not in resultsDict:
289 resultsDict[newName] = parameterValue
297 def getReadSize(self):
298 """ returns readsize if defined in metadata.
300 metadata = self.getMetadata()
301 if "readsize" not in metadata:
302 raise ReadDatasetError("no readsize parameter defined")
304 mysize = metadata["readsize"]
305 if "import" in mysize:
306 mysize = mysize.split()[0]
311 def getDefaultCacheSize(self):
312 """ returns the default cache size.
314 return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
317 def getChromosomes(self, table="uniqs", fullChrom=True):
318 """ returns a list of distinct chromosomes in table.
320 statement = "select distinct chrom from %s" % table
321 sql = self.getSqlCursor()
323 sql.execute(statement)
327 if row["chrom"] not in results:
328 results.append(row["chrom"])
330 if len(row["chrom"][3:].strip()) < 1:
333 if row["chrom"][3:] not in results:
334 results.append(row["chrom"][3:])
341 def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
342 doMulti=False, doSplices=False):
343 """ returns the maximum coordinate for reads on a given chromosome.
346 sql = self.getSqlCursor()
350 sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
351 maxCoord = int(sql.fetchall()[0][0])
353 print "couldn't retrieve coordMax for chromosome %s" % chrom
356 sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
358 spliceMax = int(sql.fetchall()[0][0])
359 if spliceMax > maxCoord:
365 sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
367 multiMax = int(sql.fetchall()[0][0])
368 if multiMax > maxCoord:
374 print "%s maxCoord: %d" % (chrom, maxCoord)
379 def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
380 flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
381 withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
382 readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
383 flagLike=False, strand='', combine5p=False):
384 """ returns a dictionary of reads in a variety of formats
385 and which can be restricted by chromosome or custom-flag.
386 Returns unique reads by default, but can return multireads
387 with doMulti set to True.
392 if chrom != "" and chrom != self.memChrom:
393 whereClause.append("chrom = '%s'" % chrom)
397 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
398 whereClause.append(flagLikeClause)
400 whereClause.append("flag = '%s'" % flag)
403 whereClause.append("start > %d" % start)
406 whereClause.append("stop < %d" % stop)
408 if len(readLike) > 0:
409 readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
410 whereClause.append(readIDClause)
413 whereClause.append("mismatch != ''")
415 if strand in ["+", "-"]:
416 whereClause.append("sense = '%s'" % strand)
418 if len(whereClause) > 0:
419 whereStatement = string.join(whereClause, " and ")
420 whereQuery = "where %s" % whereStatement
426 selectClause = ["select start, sense, sum(weight)"]
427 groupBy = ["GROUP BY start, sense"]
429 selectClause = ["select ID, chrom, start, readID"]
431 selectClause.append("stop")
434 selectClause.append("sense")
437 selectClause.append("weight")
440 selectClause.append("flag")
443 selectClause.append("mismatch")
445 if limit > 0 and not combine5p:
446 groupBy.append("LIMIT %d" % limit)
448 selectQuery = string.join(selectClause, ",")
449 groupQuery = string.join(groupBy)
451 stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
453 stmt.append("UNION ALL")
454 stmt.append(selectQuery)
455 stmt.append("from multi")
456 stmt.append(whereQuery)
457 stmt.append(groupQuery)
459 stmt = [selectQuery, "from multi", whereQuery]
463 selectQuery = "select start, sense, weight, chrom"
466 subSelect = [selectQuery, "from uniqs", whereQuery]
468 subSelect.append("union all")
469 subSelect.append(selectQuery)
470 subSelect.append("from multi")
471 subSelect.append(whereQuery)
473 subSelect = [selectQuery, "from multi", whereQuery]
475 sqlStmt = string.join(subSelect)
477 selectQuery = "select start, sense, sum(weight)"
479 stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
480 selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
484 self.memcon.row_factory = None
485 sql = self.memcon.cursor()
487 self.dbcon.row_factory = None
488 sql = self.dbcon.cursor()
490 stmt.append("order by start")
493 sql = self.memcon.cursor()
495 sql = self.dbcon.cursor()
497 stmt.append("order by readID, start")
500 sql = self.memcon.cursor()
502 sql = self.dbcon.cursor()
504 stmt.append("order by chrom, start")
506 sqlQuery = string.join(stmt)
507 sql.execute(sqlQuery)
510 resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
512 self.memcon.row_factory = sqlite.Row
514 self.dbcon.row_factory = sqlite.Row
520 readID = row["readID"]
524 chrom = row["chrom"][3:]
526 if not readIDDict and chrom != currentChrom:
527 resultsDict[chrom] = []
533 theReadID = readID.split("::")[0]
535 if "/" in theReadID and withPairID:
536 (theReadID, pairID) = readID.split("/")
538 if theReadID != currentReadID:
539 resultsDict[theReadID] = []
540 currentReadID = theReadID
543 newrow = {"start": int(row["start"])}
545 newrow["stop"] = int(row["stop"])
548 newrow["sense"] = row["sense"]
551 newrow["weight"] = float(row["weight"])
554 newrow["flag"] = row["flag"]
557 newrow["mismatch"] = row["mismatch"]
560 newrow["readID"] = readID
563 newrow["chrom"] = chrom
566 newrow["pairID"] = pairID
568 resultsDict[dictKey].append(newrow)
573 def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
574 flag="", withWeight=False, withFlag=False, withMismatch=False,
575 withID=False, withChrom=False, withPairID=False, readIDDict=False,
576 splitRead=False, hasMismatch=False, flagLike=False, start=-1,
578 """ returns a dictionary of spliced reads in a variety of
579 formats and which can be restricted by chromosome or custom-flag.
580 Returns unique spliced reads for now.
585 if chrom != "" and chrom != self.memChrom:
586 whereClause = ["chrom = '%s'" % chrom]
590 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
591 whereClause.append(flagLikeClause)
593 whereClause.append("flag = '%s'" % flag)
596 whereClause.append("mismatch != ''")
599 whereClause.append("sense = '%s'" % strand)
602 whereClause.append("startL > %d" % start)
605 whereClause.append("stopR < %d" % stop)
607 if len(whereClause) > 0:
608 whereStatement = string.join(whereClause, " and ")
609 whereQuery = "where %s" % whereStatement
613 selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
615 selectClause.append("sense")
618 selectClause.append("weight")
621 selectClause.append("flag")
624 selectClause.append("mismatch")
626 selectQuery = string.join(selectClause, " ,")
628 sql = self.memcon.cursor()
630 sql = self.dbcon.cursor()
632 stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
638 readID = row["readID"]
642 chrom = row["chrom"][3:]
644 if not readIDDict and chrom != currentChrom:
645 resultsDict[chrom] = []
650 (theReadID, pairID) = readID.split("/")
654 if theReadID != currentReadID:
655 resultsDict[theReadID] = []
656 currentReadID = theReadID
659 newrow = {"startL": int(row["startL"])}
660 newrow["stopL"] = int(row["stopL"])
661 newrow["startR"] = int(row["startR"])
662 newrow["stopR"] = int(row["stopR"])
664 newrow["sense"] = row["sense"]
667 newrow["weight"] = float(row["weight"])
670 newrow["flag"] = row["flag"]
673 newrow["mismatch"] = row["mismatch"]
676 newrow["readID"] = readID
679 newrow["chrom"] = chrom
682 newrow["pairID"] = pairID
685 leftDict = newrow.copy()
686 del leftDict["startR"]
687 del leftDict["stopR"]
689 del rightDict["startL"]
690 del rightDict["stopL"]
691 resultsDict[dictKey].append(leftDict)
692 resultsDict[dictKey].append(rightDict)
694 resultsDict[dictKey].append(newrow)
699 def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
700 splices=False, reportCombined=True, sense="both"):
701 """ return read counts for a given region.
707 if sense in ["+", "-"]:
708 restrict = " sense ='%s' " % sense
712 ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
718 mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
724 scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
729 total = ucount + mcount + scount
732 return (ucount, mcount, scount)
735 def getTotalCounts(self, chrom="", rmin="", rmax=""):
736 """ return read counts for a given region.
738 return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
741 def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
742 """ returns the number of row in the uniqs table.
747 if chrom !="" and chrom != self.memChrom:
748 whereClause = ["chrom='%s'" % chrom]
751 whereClause.append("%s >= %s" % (startField, str(rmin)))
754 whereClause.append("%s <= %s" % (startField, str(rmax)))
757 whereClause.append(restrict)
759 if len(whereClause) > 0:
760 whereStatement = string.join(whereClause, " and ")
761 whereQuery = "where %s" % whereStatement
766 sql = self.memcon.cursor()
768 sql = self.dbcon.cursor()
771 sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
773 sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
775 result = sql.fetchone()
778 count = int(result[0])
785 def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
786 """ returns the number of row in the splices table.
788 return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
791 def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
792 """ returns the number of distinct readIDs in the uniqs table.
794 return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
797 def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
798 """ returns the total weight of readIDs in the multi table.
800 return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
803 def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
809 limitPart = "LIMIT %d" % limit
812 stmt.append("select readID from uniqs")
815 stmt.append("select readID from multi")
818 stmt.append("select readID from splices")
821 selectPart = string.join(stmt, " union ")
825 sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
827 sql = self.memcon.cursor()
829 sql = self.dbcon.cursor()
831 sql.execute(sqlQuery)
832 result = sql.fetchall()
835 return [x[0].split("/")[0] for x in result]
837 return [x[0] for x in result]
840 def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
841 """ returns the uniq and spliced mismatches in a dictionary.
843 readlen = self.getReadSize()
845 hitChromList = [mischrom]
847 hitChromList = self.getChromosomes()
851 for achrom in hitChromList:
853 print "getting mismatches from chromosome %s" % (achrom)
856 hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
857 if useSplices and self.dataType == "RNA":
858 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
859 spliceIDList = spliceDict.keys()
860 for k in spliceIDList:
861 spliceEntry = spliceDict[k][0]
862 startpos = spliceEntry["startL"]
863 lefthalf = spliceEntry["stopL"]
864 rightstart = spliceEntry["startR"]
865 sense = spliceEntry["sense"]
866 mismatches = spliceEntry["mismatch"]
867 spMismatchList = mismatches.split(",")
868 for mismatch in spMismatchList:
872 change_len = len(mismatch)
874 change_from = mismatch[0]
875 change_base = mismatch[change_len-1]
876 change_pos = int(mismatch[1:change_len-1])
878 change_from = getReverseComplement([mismatch[0]])
879 change_base = getReverseComplement([mismatch[change_len-1]])
880 change_pos = readlen - int(mismatch[1:change_len-1]) + 1
882 firsthalf = int(lefthalf)-int(startpos)+1
884 if int(change_pos) <= int(firsthalf):
885 change_at = startpos + change_pos - 1
887 secondhalf = change_pos - firsthalf
888 change_at = rightstart + secondhalf
890 snpDict[achrom].append([startpos, change_at, change_base, change_from])
892 if achrom not in hitDict.keys():
895 for readEntry in hitDict[achrom]:
896 start = readEntry["start"]
897 sense = readEntry["sense"]
898 mismatches = readEntry["mismatch"]
899 mismatchList = mismatches.split(",")
900 for mismatch in mismatchList:
904 change_len = len(mismatch)
906 change_from = mismatch[0]
907 change_base = mismatch[change_len-1]
908 change_pos = int(mismatch[1:change_len-1])
910 change_from = getReverseComplement([mismatch[0]])
911 change_base = getReverseComplement([mismatch[change_len-1]])
912 change_pos = readlen - int(mismatch[1:change_len-1]) + 1
914 change_at = start + change_pos - 1
915 snpDict[achrom].append([start, change_at, change_base, change_from])
920 def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
921 useSplices=False, normalizationFactor = 1.0, trackStrand=False,
922 keepStrand="both", shiftValue=0):
923 """return a profile of the chromosome as an array of per-base read coverage....
924 keepStrand = 'both', 'plusOnly', or 'minusOnly'.
925 Will also shift position of unique and multireads (but not splices) if shift is a natural number
927 metadata = self.getMetadata()
929 readlen = int(metadata["readsize"])
933 dataType = metadata["dataType"]
934 scale = 1. / normalizationFactor
936 shift['+'] = int(shiftValue)
937 shift['-'] = -1 * int(shiftValue)
940 lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
942 lastNT = cstop - cstart + readlen + shift["+"]
944 chromModel = array("f",[0.] * lastNT)
945 hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
949 for readEntry in hitDict[chromosome]:
950 hstart = readEntry["start"]
951 sense = readEntry ["sense"]
952 weight = readEntry["weight"]
953 hstart = hstart - cstart + shift[sense]
954 for currentpos in range(hstart,hstart+readlen):
956 if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
957 chromModel[currentpos] += scale * weight
958 elif sense == "-" and keepStrand != "plusOnly":
959 chromModel[currentpos] -= scale * weight
964 if useSplices and dataType == "RNA":
966 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
968 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
970 if chromosome in spliceDict:
971 for spliceEntry in spliceDict[chromosome]:
972 Lstart = spliceEntry["startL"]
973 Lstop = spliceEntry["stopL"]
974 Rstart = spliceEntry["startR"]
975 Rstop = spliceEntry["stopR"]
976 rsense = spliceEntry["sense"]
977 if (Rstop - cstart) < lastNT:
978 for index in range(abs(Lstop - Lstart)):
979 currentpos = Lstart - cstart + index
980 # we only track unique splices
981 if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
982 chromModel[currentpos] += scale
983 elif rsense == "-" and keepStrand != "plusOnly":
984 chromModel[currentpos] -= scale
986 for index in range(abs(Rstop - Rstart)):
987 currentpos = Rstart - cstart + index
988 # we only track unique splices
989 if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
990 chromModel[currentpos] += scale
991 elif rsense == "-" and keepStrand != "plusOnly":
992 chromModel[currentpos] -= scale
999 def insertMetadata(self, valuesList):
1000 """ inserts a list of (pname, pvalue) into the metadata
1003 self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
1007 def updateMetadata(self, pname, newValue, originalValue=""):
1008 """ update a metadata field given the original value and the new value.
1010 stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
1011 if originalValue != "":
1012 stmt += " and value='%s' " % str(originalValue)
1014 self.dbcon.execute(stmt)
1018 def insertUniqs(self, valuesList):
1019 """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1020 into the uniqs table.
1022 self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1026 def insertMulti(self, valuesList):
1027 """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1028 into the multi table.
1030 self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1034 def insertSplices(self, valuesList):
1035 """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
1036 into the splices table.
1038 self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
1042 def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
1043 """ update reads on file database in a list region of regions for a chromosome to have a new flag.
1044 regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
1045 sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
1049 restrict = " and sense = ? "
1052 self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1055 self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1057 if self.dataType == "RNA" and splices:
1058 self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
1059 self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
1064 def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1065 """ set the flag fields in the entire dataset.
1068 self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1071 self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1073 if self.dataType == "RNA" and splices:
1074 self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
1079 def resetFlags(self, uniqs=True, multi=True, splices=True):
1080 """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
1082 self.setFlags("", uniqs, multi, splices)
1085 def reweighMultireads(self, readList):
1086 self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
1089 def setSynchronousPragma(self, value="ON"):
1091 self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1093 print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
1096 def setDBcache(self, cache, default=False):
1097 self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
1099 self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
1102 def execute(self, statement, returnResults=False):
1103 sql = self.getSqlCursor()
1105 sql.execute(statement)
1107 result = sql.fetchall()
1111 def executeCommit(self, statement):
1112 self.execute(statement)
1115 self.memcon.commit()
1120 def buildIndex(self, cache=100000):
1121 """ Builds the file indeces for the main tables.
1122 Cache is the number of 1.5 kb pages to keep in memory.
1123 100000 pages translates into 150MB of RAM, which is our default.
1125 if cache > self.getDefaultCacheSize():
1126 self.setDBcache(cache)
1127 self.setSynchronousPragma("OFF")
1128 self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1129 print "built uPosIndex"
1130 self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
1131 print "built uChromIndex"
1132 self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1133 print "built mPosIndex"
1134 self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
1135 print "built mChromIndex"
1137 if self.dataType == "RNA":
1138 self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
1139 print "built sPosIndex"
1140 self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
1141 print "built sPosIndex2"
1142 self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
1143 print "built sChromIndex"
1146 self.setSynchronousPragma("ON")
1149 def dropIndex(self):
1150 """ drops the file indices for the main tables.
1153 self.setSynchronousPragma("OFF")
1154 self.dbcon.execute("DROP INDEX uPosIndex")
1155 self.dbcon.execute("DROP INDEX uChromIndex")
1156 self.dbcon.execute("DROP INDEX mPosIndex")
1157 self.dbcon.execute("DROP INDEX mChromIndex")
1159 if self.dataType == "RNA":
1160 self.dbcon.execute("DROP INDEX sPosIndex")
1162 self.dbcon.execute("DROP INDEX sPosIndex2")
1166 self.dbcon.execute("DROP INDEX sChromIndex")
1170 print "problem dropping index"
1172 self.setSynchronousPragma("ON")
1175 def memSync(self, chrom="", index=False):
1176 """ makes a copy of the dataset into memory for faster access.
1177 Can be restricted to a "full" chromosome. Can also build the
1181 self.memcon = sqlite.connect(":memory:")
1182 self.initializeTables(self.memcon)
1183 cursor = self.dbcon.cursor()
1186 print "memSync %s" % chrom
1187 whereclause = " where chrom = '%s' " % chrom
1188 self.memChrom = chrom
1192 self.memcon.execute("PRAGMA temp_store = MEMORY")
1193 self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
1194 # copy metadata to memory
1195 self.memcon.execute("delete from metadata")
1196 results = cursor.execute("select name, value from metadata")
1199 results2.append((row["name"], row["value"]))
1201 self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
1203 self.copyDBEntriesToMemory("uniqs", whereclause)
1204 self.copyDBEntriesToMemory("multi", whereclause)
1205 if self.dataType == "RNA":
1206 self.copySpliceDBEntriesToMemory(whereclause)
1210 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
1211 self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
1212 if self.dataType == "RNA":
1213 self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
1214 self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
1216 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1217 self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1218 if self.dataType == "RNA":
1219 self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
1220 self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
1222 self.memBacked = True
1223 self.memcon.row_factory = sqlite.Row
1224 self.memcon.commit()
1227 def copyDBEntriesToMemory(self, dbName, whereClause=""):
1228 cursor = self.dbcon.cursor()
1229 sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
1230 destinationEntries = []
1231 for row in sourceEntries:
1232 destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
1234 self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
1237 def copySpliceDBEntriesToMemory(self, whereClause=""):
1238 cursor = self.dbcon.cursor()
1239 sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
1240 destinationEntries = []
1241 for row in sourceEntries:
1242 destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
1243 row["weight"], row["flag"], row["mismatch"]))
1245 self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries)