1 import sqlite3 as sqlite
6 from array import array
7 from commoncode import getReverseComplement, getConfigParser, getConfigOption
# Version stamp written into each RDS file's metadata table (see __init__);
# compared/updated when an existing file is opened.
currentRDSVersion = "2.0"
# Module-specific error raised for invalid configuration (e.g. an unknown
# datasetType) or bad metadata values. (Class body elided in this excerpt.)
class ReadDatasetError(Exception):
17 """ Class for storing reads from experiments. Assumes that custom scripts
18 will translate incoming data into a format that can be inserted into the
19 class using the insert* methods. Default class subtype ('DNA') includes
20 tables for unique and multireads, whereas 'RNA' subtype also includes a
def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
             cache=False, reportCount=True):
    """ creates an rds datafile if initialize is set to true, otherwise
    will append to existing tables. datasetType can be either 'DNA' or 'RNA'.

    NOTE(review): several scaffold lines (try/except, if/else) are elided
    in this excerpt; the statements below are kept verbatim in order.
    """
    self.rdsVersion = currentRDSVersion
    # memBacked flips to True only after memSync() copies the DB to memory
    self.memBacked = False
    self.cachedDBFile = ""
    # when caching is requested (guard elided), work on a local copy
    self.cacheDB(datafile)
    dbFile = self.cachedDBFile
    self.dbcon = sqlite.connect(dbFile)
    # sqlite.Row gives name-based access to result columns throughout
    self.dbcon.row_factory = sqlite.Row
    self.dbcon.execute("PRAGMA temp_store = MEMORY")
    if datasetType not in ["DNA", "RNA"]:
        raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
    self.dataType = datasetType
    self.initializeTables(self.dbcon)
    # existing file path: read dataType/version back from metadata
    metadata = self.getMetadata("dataType")
    self.dataType = metadata["dataType"]
    metadata = self.getMetadata("rdsVersion")
    self.rdsVersion = metadata["rdsVersion"]
    # missing version entry: stamp the file with the current version
    self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
    # fallback when the insert fails (except branch elided)
    print "could not add rdsVersion - read-only ?"
    self.rdsVersion = "pre-1.0"
    # verbose reporting (guards elided below)
    print "INITIALIZED dataset %s" % datafile
    print "dataset %s" % datafile
    metadata = self.getMetadata()
    pnameList = metadata.keys()
    for pname in pnameList:
        print "\t" + pname + "\t" + metadata[pname]
    ucount = self.getUniqsCount()
    mcount = self.getMultiCount()
    if self.dataType == "DNA" and not initialize:
        # int formatting first; %s fallback (try/except elided)
        print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
        print "\n%s unique reads and %s multireads" % (ucount, mcount)
    elif self.dataType == "RNA" and not initialize:
        scount = self.getSplicesCount()
        print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
        print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
    print "default cache size is %d pages" % self.getDefaultCacheSize()
107 """ return the number of usable reads in the dataset.
109 total = self.getUniqsCount()
110 total += self.getMultiCount()
112 if self.dataType == "RNA":
113 total += self.getSplicesCount()
121 """ cleanup copy in local cache, if present.
123 if self.cachedDBFile != "":
def cacheDB(self, filename):
    """ copy geneinfoDB to a local cache.
    """
    configParser = getConfigParser()
    # cistematic_temp points at local scratch space; defaults to /tmp
    cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
    tempfile.tempdir = cisTemp
    # NOTE(review): tempfile.mktemp() is name-only and race-prone;
    # confirm whether switching to mkstemp() is safe for consumers.
    self.cachedDBFile = "%s.db" % tempfile.mktemp()
    shutil.copyfile(filename, self.cachedDBFile)
def saveCacheDB(self, filename):
    """Write the locally cached database copy back out to *filename*."""
    shutil.copyfile(self.cachedDBFile, filename)
144 """ delete geneinfoDB from local cache.
147 if self.cachedDBFile != "":
149 os.remove(self.cachedDBFile)
151 print "could not delete %s" % self.cachedDBFile
def attachDB(self, filename, dbName):
    """ attach another database file to the readDataset.
    """
    # filename/dbName are interpolated directly into SQL; callers must pass
    # trusted values. (The execute call is elided in this excerpt.)
    stmt = "attach '%s' as %s" % (filename, dbName)
def detachDB(self, dbName):
    """ detach a database file to the readDataset.
    """
    # (The execute call is elided in this excerpt.)
    stmt = "detach %s" % (dbName)
def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
    """ import into current RDS the table (with columns destcolumns,
    with default all columns) from the database file asname,
    using the column specification of ascolumns (default all).
    """
    stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
    # restrict to flagged rows when a flag value was given
    # (the guarding 'if flagged' line appears elided in this excerpt)
    stmt += " where flag = '%s' " % flagged
    self.executeCommit(stmt)
def getTables(self, dbName=""):
    """ get a list of table names in a particular database file.

    NOTE(review): the execute call, result loop and return statement are
    elided in this excerpt; visible lines kept verbatim.
    """
    sql = self.getSqlCursor()
    # qualify sqlite_master with the attached-DB prefix when one is given
    dbName = "%s." % dbName
    stmt = "select name from %ssqlite_master where type='table'" % dbName
    results = sql.fetchall()
    resultList.append(row["name"])
def getSqlCursor(self):
    # Return a cursor on the in-memory DB when memBacked, else on the
    # file-backed DB (branch headers and return elided in this excerpt).
    sql = self.getMemCursor()
    sql = self.getFileCursor()
211 """ check whether the RDS file has at least one index.
213 stmt = "select count(*) from sqlite_master where type='index'"
214 count = int(self.execute(stmt, returnResults=True)[0][0])
def initializeTables(self, dbConnection, cache=100000):
    """Create the RDS schema on *dbConnection* (a file or :memory: DB).

    Builds the metadata table (stamped with this dataset's dataType) and the
    uniqs/multi read tables; RNA datasets additionally get a splices table
    with left/right exon coordinates. Commits before returning.
    """
    dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
    dbConnection.execute("create table metadata (name varchar, value varchar)")
    dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
    # uniqs and multi share one single-span schema
    readSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % "start int, stop int"
    for tableName in ("uniqs", "multi"):
        dbConnection.execute("create table %s %s" % (tableName, readSchema))
    if self.dataType == "RNA":
        # spliced reads carry two spans: left and right exon halves
        spliceSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % "startL int, stopL int, startR int, stopR int"
        dbConnection.execute("create table splices %s" % spliceSchema)
    dbConnection.commit()
def getFileCursor(self):
    """Return a low-level SQL cursor on the file-backed database."""
    return self.dbcon.cursor()
def getMemCursor(self):
    """Return a low-level SQL cursor on the in-memory database copy."""
    return self.memcon.cursor()
def getMetadata(self, valueName=""):
    """ returns a dictionary of metadata.

    NOTE(review): the resultsDict initialisation, the row loop and the
    duplicate-name index handling scaffolding are elided in this excerpt.
    """
    # restrict to one entry when valueName was given (guard elided)
    whereClause = " where name='%s'" % valueName
    sql = self.getSqlCursor()
    sql.execute("select name, value from metadata %s" % whereClause)
    results = sql.fetchall()
    # per-row handling (loop header elided):
    parameterName = row["name"]
    parameterValue = row["value"]
    if parameterName not in resultsDict:
        resultsDict[parameterName] = parameterValue
    # duplicate names get a ':N' suffix so no value is silently dropped
    newName = string.join([parameterName, str(index)], ":")
    if newName not in resultsDict:
        resultsDict[newName] = parameterValue
def getReadSize(self):
    """ returns readsize if defined in metadata.

    Raises ReadDatasetError when absent or negative.
    """
    metadata = self.getMetadata()
    if "readsize" not in metadata:
        raise ReadDatasetError("no readsize parameter defined")
    readSize = metadata["readsize"]
    # older files stored values like '36 import'; keep the leading number
    if "import" in readSize:
        readSize = readSize.split()[0]
    readSize = int(readSize)
    # (the guarding 'if readSize < 0:' and the return are elided here)
    raise ReadDatasetError("readsize is negative")
def getDefaultCacheSize(self):
    """Return the connection's default page-cache size as an int."""
    rows = self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)
    return int(rows[0][0])
def getChromosomes(self, table="uniqs", fullChrom=True):
    """ returns a list of distinct chromosomes in table.

    NOTE(review): the results-list setup, the row loop and the
    fullChrom branch headers are elided in this excerpt.
    """
    statement = "select distinct chrom from %s" % table
    sql = self.getSqlCursor()
    sql.execute(statement)
    # fullChrom: keep names as stored
    if row["chrom"] not in results:
        results.append(row["chrom"])
    # short form strips the leading 'chr' prefix
    shortName = row["chrom"][3:]
    if len(shortName.strip()) > 0 and shortName not in results:
        results.append(shortName)
def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
                     doMulti=False, doSplices=False):
    """ returns the maximum coordinate for reads on a given chromosome.

    NOTE(review): the doSplices/doMulti/verbose guards and the return
    are elided in this excerpt.
    """
    maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
    # splices are keyed on the right-half start for the max position
    spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
    maxCoord = max(spliceMax, maxCoord)
    multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
    maxCoord = max(multiMax, maxCoord)
    print "%s maxCoord: %d" % (chrom, maxCoord)
def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
    # Largest startField value on chrom in table; a failed query is reported
    # rather than raised (try/except scaffolding elided in this excerpt).
    sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
    sql = self.getSqlCursor()
    sql.execute(sqlStatement)
    maxCoord = int(sql.fetchall()[0][0])
    print "couldn't retrieve coordMax for chromosome %s" % chrom
def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
                 flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
                 withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
                 readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
                 flagLike=False, strand='', combine5p=False):
    """ returns a dictionary of reads in a variety of formats
    and which can be restricted by chromosome or custom-flag.
    Returns unique reads by default, but can return multireads
    with doMulti set to True.

    Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict

    NOTE(review): most branch/loop headers are elided in this excerpt;
    statements below are kept verbatim in their original order.
    """
    whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
    # findallOptimize path: collapse identical (start, sense) rows via SUM
    selectQuery = "select start, sense, sum(weight)"
    selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
    groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
    stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
    # doMulti: union the multi table into the same statement
    stmt.append("UNION ALL")
    stmt.append(selectQuery)
    stmt.append("from multi")
    stmt.append(whereQuery)
    stmt.append(groupQuery)
    stmt = [selectQuery, "from multi", whereQuery]
    # combine5p variant: merge uniq/multi rows before aggregating
    selectQuery = "select start, sense, weight, chrom"
    subSelect = [selectQuery, "from uniqs", whereQuery]
    subSelect.append("union all")
    subSelect.append(selectQuery)
    subSelect.append("from multi")
    subSelect.append(whereQuery)
    subSelect = [selectQuery, "from multi", whereQuery]
    sqlStmt = string.join(subSelect)
    selectQuery = "select start, sense, sum(weight)"
    stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
            selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
    # tuple path drops the Row factory for speed; restored further below
    self.memcon.row_factory = None
    sql = self.memcon.cursor()
    self.dbcon.row_factory = None
    sql = self.dbcon.cursor()
    stmt.append("order by start")
    sql = self.memcon.cursor()
    sql = self.dbcon.cursor()
    # readIDDict ordering groups rows per read
    stmt.append("order by readID, start")
    sql = self.memcon.cursor()
    sql = self.dbcon.cursor()
    stmt.append("order by chrom, start")
    sqlQuery = string.join(stmt)
    sql.execute(sqlQuery)
    # fast path result: per-chrom list of {start, sense, weight}
    resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
    self.memcon.row_factory = sqlite.Row
    self.dbcon.row_factory = sqlite.Row
    # general path (row loop elided): build per-chrom or per-readID entries
    readID = row["readID"]
    chrom = row["chrom"][3:]
    if not readIDDict and chrom != currentChrom:
        resultsDict[chrom] = []
    theReadID = readID.split("::")[0]
    # paired reads are encoded as 'readID/pairID'
    if "/" in theReadID and withPairID:
        (theReadID, pairID) = readID.split("/")
    if theReadID != currentReadID:
        resultsDict[theReadID] = []
        currentReadID = theReadID
    newrow = {"start": int(row["start"])}
    newrow["stop"] = int(row["stop"])
    newrow["sense"] = row["sense"]
    newrow["weight"] = float(row["weight"])
    newrow["flag"] = row["flag"]
    newrow["mismatch"] = row["mismatch"]
    newrow["readID"] = readID
    newrow["chrom"] = chrom
    newrow["pairID"] = pairID
    resultsDict[dictKey].append(newrow)
def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
    """Assemble the WHERE clause shared by the read/splice fetchers.

    NOTE(review): the whereClause initialisation and the start/stop field
    selection (start vs startL for splices) are elided in this excerpt.
    """
    # skip the chrom restriction when the memory copy is already chrom-scoped
    if chrom != "" and chrom != self.memChrom:
        whereClause.append("chrom = '%s'" % chrom)
    # flagLike does a substring match; otherwise exact flag equality
    flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
    whereClause.append(flagLikeClause)
    whereClause.append("flag = '%s'" % flag)
    # coordinate window (guards elided); note strict inequalities
    whereClause.append("%s > %d" % (startText, start))
    whereClause.append("%s < %d" % (stopText, stop))
    if len(readLike) > 0:
        readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
        whereClause.append(readIDClause)
    # hasMismatch guard elided
    whereClause.append("mismatch != ''")
    if strand in ["+", "-"]:
        whereClause.append("sense = '%s'" % strand)
    if len(whereClause) > 0:
        whereStatement = string.join(whereClause, " and ")
        whereQuery = "where %s" % whereStatement
def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
    # Extend baseSelect with optional columns; each append is gated by
    # a flag whose 'if' line is elided in this excerpt.
    selectClause = [baseSelect]
    selectClause.append("stop")      # when bothEnds
    selectClause.append("sense")     # unless noSense
    selectClause.append("weight")    # when withWeight
    selectClause.append("flag")      # when withFlag
    selectClause.append("mismatch")  # when withMismatch
    selectQuery = string.join(selectClause, ",")
def getReadGroupQuery(self, findallOptimize, limit, combine5p):
    # GROUP BY / LIMIT tail used by getReadsDict; the findallOptimize
    # guard and the return are elided in this excerpt.
    groupBy = ["GROUP BY start, sense"]
    if limit > 0 and not combine5p:
        groupBy.append("LIMIT %d" % limit)
    groupQuery = string.join(groupBy)
def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
                   flag="", withWeight=False, withFlag=False, withMismatch=False,
                   withID=False, withChrom=False, withPairID=False, readIDDict=False,
                   splitRead=False, hasMismatch=False, flagLike=False, start=-1,
    """ returns a dictionary of spliced reads in a variety of
    formats and which can be restricted by chromosome or custom-flag.
    Returns unique spliced reads for now.

    NOTE(review): the final signature line (stop/strand) and most
    branch/loop headers are elided in this excerpt.
    """
    whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
    selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
    selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
    # memBacked branch elided: choose memory vs file cursor
    sql = self.memcon.cursor()
    sql = self.dbcon.cursor()
    stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
    # per-row handling (execute call and row loop elided):
    readID = row["readID"]
    chrom = row["chrom"][3:]
    if not readIDDict and chrom != currentChrom:
        resultsDict[chrom] = []
    # paired IDs encoded as 'readID/pairID'
    (theReadID, pairID) = readID.split("/")
    if theReadID != currentReadID:
        resultsDict[theReadID] = []
        currentReadID = theReadID
    newrow = {"startL": int(row["startL"])}
    newrow["stopL"] = int(row["stopL"])
    newrow["startR"] = int(row["startR"])
    newrow["stopR"] = int(row["stopR"])
    newrow["sense"] = row["sense"]
    newrow["weight"] = float(row["weight"])
    newrow["flag"] = row["flag"]
    newrow["mismatch"] = row["mismatch"]
    newrow["readID"] = readID
    newrow["chrom"] = chrom
    newrow["pairID"] = pairID
    # splitRead: emit left and right exon halves as separate entries
    leftDict = newrow.copy()
    del leftDict["startR"]
    del leftDict["stopR"]
    del rightDict["startL"]
    del rightDict["stopL"]
    resultsDict[dictKey].append(leftDict)
    resultsDict[dictKey].append(rightDict)
    resultsDict[dictKey].append(newrow)
def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
              splices=False, reportCombined=True, sense="both"):
    """ return read counts for a given region.

    NOTE(review): per-table guards and the reportCombined branching are
    elided in this excerpt; visible lines kept verbatim.
    """
    if sense in ["+", "-"]:
        restrict = " sense ='%s' " % sense
    ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
    mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
    scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
    # reportCombined returns one total; otherwise the per-table triple
    total = ucount + mcount + scount
    return (ucount, mcount, scount)
def getTotalCounts(self, chrom="", rmin="", rmax=""):
    """Combined uniq+multi+splice read count for a region."""
    options = {"uniqs": True, "multi": True, "splices": True,
               "reportCombined": True, "sense": "both"}
    return self.getCounts(chrom, rmin, rmax, **options)
def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
    """ returns the number of row in the uniqs table.

    NOTE(review): whereClause initialisation, several guards, the memBacked
    branch and the final return are elided in this excerpt.
    """
    # skip chrom restriction when memory copy is already chrom-scoped
    if chrom !="" and chrom != self.memChrom:
        whereClause = ["chrom='%s'" % chrom]
    whereClause.append("%s >= %s" % (startField, str(rmin)))
    whereClause.append("%s <= %s" % (startField, str(rmax)))
    whereClause.append(restrict)
    if len(whereClause) > 0:
        whereStatement = string.join(whereClause, " and ")
        whereQuery = "where %s" % whereStatement
    sql = self.memcon.cursor()
    sql = self.dbcon.cursor()
    # distinct counts unique (chrom,pos,sense) combos; otherwise sum weights
    sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
    sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
    result = sql.fetchone()
    count = int(result[0])
def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
    """Entry count for the splices table (positions keyed on startL)."""
    return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict,
                                   distinct, startField="startL")
def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
    """Entry count for the uniqs table (see getTableEntryCount)."""
    return self.getTableEntryCount("uniqs", chrom, rmin,
                                   rmax, restrict, distinct)
def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
    """Entry count (summed weights) for the multi table."""
    return self.getTableEntryCount("multi", chrom, rmin,
                                   rmax, restrict, distinct)
def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
    """Return readIDs drawn from the selected tables.

    NOTE(review): the stmt list setup, per-table guards, the limit guard
    and the paired branch header are elided in this excerpt.
    """
    stmt.append("select readID from uniqs")
    stmt.append("select readID from multi")
    stmt.append("select readID from splices")
    selectPart = string.join(stmt, " union ")
    limitPart = "LIMIT %d" % limit
    sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
    sql = self.memcon.cursor()
    sql = self.dbcon.cursor()
    sql.execute(sqlQuery)
    result = sql.fetchall()
    # paired IDs are stored as 'readID/pairID'; strip the suffix when paired
    return [x[0].split("/")[0] for x in result]
    return [x[0] for x in result]
def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
    """ returns the uniq and spliced mismatches in a dictionary.

    NOTE(review): the chromosome-selection branch, snpDict initialisation,
    and guards skipping 'N' mismatches are elided in this excerpt.
    """
    readlen = self.getReadSize()
    hitChromList = [mischrom]
    hitChromList = self.getChromosomes()
    for achrom in hitChromList:
        # verbose guard elided
        print "getting mismatches from chromosome %s" % (achrom)
        if useSplices and self.dataType == "RNA":
            spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
            spliceIDList = spliceDict.keys()
            for spliceID in spliceIDList:
                spliceEntry = spliceDict[spliceID][0]
                startpos = spliceEntry["startL"]
                lefthalf = spliceEntry["stopL"]
                rightstart = spliceEntry["startR"]
                sense = spliceEntry["sense"]
                mismatches = spliceEntry["mismatch"]
                spMismatchList = mismatches.split(",")
                for mismatch in spMismatchList:
                    # mismatch tokens look like 'A10G' (from, offset, to)
                    change_len = len(mismatch)
                    # '+' sense: offsets run left-to-right within the read
                    change_from = mismatch[0]
                    change_base = mismatch[change_len-1]
                    change_pos = int(mismatch[1:change_len-1])
                    # '-' sense (branch elided): complement and flip offset
                    change_from = getReverseComplement([mismatch[0]])
                    change_base = getReverseComplement([mismatch[change_len-1]])
                    change_pos = readlen - int(mismatch[1:change_len-1]) + 1
                    # map the read offset across the splice junction
                    firsthalf = int(lefthalf)-int(startpos)+1
                    if int(change_pos) <= int(firsthalf):
                        change_at = startpos + change_pos - 1
                    # else: mismatch falls in the right exon half
                    secondhalf = change_pos - firsthalf
                    change_at = rightstart + secondhalf
                    snpDict[achrom].append([startpos, change_at, change_base, change_from])
        hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
        if achrom not in hitDict.keys():
            # ('continue' statement elided in this excerpt)
        for readEntry in hitDict[achrom]:
            start = readEntry["start"]
            sense = readEntry["sense"]
            mismatches = readEntry["mismatch"]
            mismatchList = mismatches.split(",")
            for mismatch in mismatchList:
                change_len = len(mismatch)
                # '+' sense branch (guard elided)
                change_from = mismatch[0]
                change_base = mismatch[change_len-1]
                change_pos = int(mismatch[1:change_len-1])
                # '-' sense branch (guard elided)
                change_from = getReverseComplement([mismatch[0]])
                change_base = getReverseComplement([mismatch[change_len-1]])
                change_pos = readlen - int(mismatch[1:change_len-1]) + 1
                change_at = start + change_pos - 1
                snpDict[achrom].append([start, change_at, change_base, change_from])
def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
                    useSplices=False, normalizationFactor=1.0, trackStrand=False,
                    keepStrand="both", shiftValue=0):
    """return a profile of the chromosome as an array of per-base read coverage....
    keepStrand = 'both', 'plusOnly', or 'minusOnly'.
    Will also shift position of unique and multireads (but not splices) if shift is a natural number

    NOTE(review): shift-dict initialisation, window guards and bounds checks
    are elided in this excerpt; visible lines kept verbatim.
    """
    metadata = self.getMetadata()
    readlen = int(metadata["readsize"])
    dataType = metadata["dataType"]
    scale = 1. / normalizationFactor
    # strand-dependent shift: '+' reads move right, '-' reads move left
    shift["+"] = int(shiftValue)
    shift["-"] = -1 * int(shiftValue)
    # whole-chromosome profile when no explicit window is given
    lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
    lastNT = cstop - cstart + readlen + shift["+"]
    # float32 array, one slot per base of the profiled window
    chromModel = array("f",[0.] * lastNT)
    hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
    for readEntry in hitDict[chromosome]:
        hstart = readEntry["start"]
        sense = readEntry ["sense"]
        weight = readEntry["weight"]
        hstart = hstart - cstart + shift[sense]
        for currentpos in range(hstart,hstart+readlen):
            # minus-strand coverage is subtracted when strand-tracked
            if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
                chromModel[currentpos] += scale * weight
            elif sense == "-" and keepStrand != "plusOnly":
                chromModel[currentpos] -= scale * weight
    if useSplices and dataType == "RNA":
        # windowed vs whole-chromosome splice query (guard elided)
        spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
        spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
        if chromosome in spliceDict:
            for spliceEntry in spliceDict[chromosome]:
                Lstart = spliceEntry["startL"]
                Lstop = spliceEntry["stopL"]
                Rstart = spliceEntry["startR"]
                Rstop = spliceEntry["stopR"]
                rsense = spliceEntry["sense"]
                if (Rstop - cstart) < lastNT:
                    for index in range(abs(Lstop - Lstart)):
                        currentpos = Lstart - cstart + index
                        # we only track unique splices
                        if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
                            chromModel[currentpos] += scale
                        elif rsense == "-" and keepStrand != "plusOnly":
                            chromModel[currentpos] -= scale
                    for index in range(abs(Rstop - Rstart)):
                        currentpos = Rstart - cstart + index
                        # we only track unique splices
                        if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
                            chromModel[currentpos] += scale
                        elif rsense == "-" and keepStrand != "plusOnly":
                            chromModel[currentpos] -= scale
def insertMetadata(self, valuesList):
    """ inserts a list of (pname, pvalue) into the metadata
    table.
    """
    self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
    # (a commit appears elided in this excerpt)
def updateMetadata(self, pname, newValue, originalValue=""):
    """ update a metadata field given the original value and the new value.
    """
    stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
    # only replace when the stored value matches originalValue, if given
    if originalValue != "":
        stmt += " and value='%s' " % str(originalValue)
    self.dbcon.execute(stmt)
    # (a commit appears elided in this excerpt)
def insertUniqs(self, valuesList):
    """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
    into the uniqs table.
    """
    # ID is auto-assigned via the NULL primary-key slot
    self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
    # (a commit appears elided in this excerpt)
def insertMulti(self, valuesList):
    """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
    into the multi table.
    """
    # ID is auto-assigned via the NULL primary-key slot
    self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
    # (a commit appears elided in this excerpt)
def insertSplices(self, valuesList):
    """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
    into the splices table.
    """
    # ID is auto-assigned via the NULL primary-key slot; 10 placeholders
    # match the 10-element value tuples
    self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
    # (a commit appears elided in this excerpt)
def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
    """ update reads on file database in a list region of regions for a chromosome to have a new flag.
    regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
    sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).

    NOTE(review): restrict initialisation, per-table guards and the commit
    are elided in this excerpt.
    """
    restrict = " and sense = ? "
    self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
    self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
    if self.dataType == "RNA" and splices:
        # splices are flagged on both halves, tagged ' L:' / ' R:'
        self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
        self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
def setFlags(self, flag, uniqs=True, multi=True, splices=True):
    """ set the flag fields in the entire dataset.

    NOTE(review): per-table guards and the commit are elided here.
    """
    self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
    self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
    if self.dataType == "RNA" and splices:
        self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
def resetFlags(self, uniqs=True, multi=True, splices=True):
    """Blank every flag field so an analysis can be rerun from scratch."""
    self.setFlags("", uniqs=uniqs, multi=multi, splices=splices)
def reweighMultireads(self, readList):
    # Bulk-update multiread weights; each tuple is
    # (weight, chrom, start, readID). (A commit may be elided here.)
    self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
def setSynchronousPragma(self, value="ON"):
    # Toggle sqlite synchronous mode; failure is reported, not raised
    # (the try/except scaffolding is elided in this excerpt).
    self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
    print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
def setDBcache(self, cache, default=False):
    # Set the page-cache size; with default=True (guard elided) the value
    # is also persisted as the file's default cache size.
    self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
    self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
def execute(self, statement, returnResults=False):
    # Run a statement on the active (memory or file) connection; the
    # returnResults guard and the final return are elided in this excerpt.
    sql = self.getSqlCursor()
    sql.execute(statement)
    result = sql.fetchall()
def executeCommit(self, statement):
    # Execute then commit on whichever connection is active
    # (the memBacked branching is elided in this excerpt).
    self.execute(statement)
    self.memcon.commit()
def buildIndex(self, cache=100000):
    """ Builds the file indeces for the main tables.
    Cache is the number of 1.5 kb pages to keep in memory.
    100000 pages translates into 150MB of RAM, which is our default.
    """
    # enlarge the page cache only when it beats the current default
    if cache > self.getDefaultCacheSize():
        self.setDBcache(cache)
    # index construction is much faster with synchronous writes off
    self.setSynchronousPragma("OFF")
    self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
    print "built uPosIndex"
    self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
    print "built uChromIndex"
    self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
    print "built mPosIndex"
    self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
    print "built mChromIndex"
    if self.dataType == "RNA":
        self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
        print "built sPosIndex"
        self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
        print "built sPosIndex2"
        self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
        print "built sChromIndex"
    # (a commit may be elided here) then restore synchronous writes
    self.setSynchronousPragma("ON")
def dropIndex(self):
    """ drops the file indices for the main tables.

    NOTE(review): try/except scaffolding around the drops is elided in
    this excerpt; the final print is the except-branch message.
    """
    self.setSynchronousPragma("OFF")
    self.dbcon.execute("DROP INDEX uPosIndex")
    self.dbcon.execute("DROP INDEX uChromIndex")
    self.dbcon.execute("DROP INDEX mPosIndex")
    self.dbcon.execute("DROP INDEX mChromIndex")
    if self.dataType == "RNA":
        self.dbcon.execute("DROP INDEX sPosIndex")
        self.dbcon.execute("DROP INDEX sPosIndex2")
        self.dbcon.execute("DROP INDEX sChromIndex")
    print "problem dropping index"
    self.setSynchronousPragma("ON")
def memSync(self, chrom="", index=False):
    """ makes a copy of the dataset into memory for faster access.
    Can be restricted to a "full" chromosome. Can also build the

    NOTE(review): the rest of the docstring, the chrom guard and the
    metadata-row loop header are elided in this excerpt.
    """
    self.memcon = sqlite.connect(":memory:")
    self.initializeTables(self.memcon)
    cursor = self.dbcon.cursor()
    # restrict the copy to one chromosome when requested (guard elided)
    print "memSync %s" % chrom
    whereclause = " where chrom = '%s' " % chrom
    # memChrom lets later queries skip redundant chrom restrictions
    self.memChrom = chrom
    self.memcon.execute("PRAGMA temp_store = MEMORY")
    self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
    # copy metadata to memory
    self.memcon.execute("delete from metadata")
    results = cursor.execute("select name, value from metadata")
    results2.append((row["name"], row["value"]))
    self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
    self.copyDBEntriesToMemory("uniqs", whereclause)
    self.copyDBEntriesToMemory("multi", whereclause)
    if self.dataType == "RNA":
        self.copySpliceDBEntriesToMemory(whereclause)
    # index=True with a chrom restriction: start-only indices suffice
    self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
    self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
    if self.dataType == "RNA":
        self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
        self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
    # full copies index on (chrom, start)
    self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
    self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
    if self.dataType == "RNA":
        self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
        self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
    self.memBacked = True
    self.memcon.row_factory = sqlite.Row
    self.memcon.commit()
def copyDBEntriesToMemory(self, dbName, whereClause=""):
    """Copy rows of table *dbName* from the file DB into the memory DB.

    whereClause optionally restricts the copy (e.g. to one chromosome).
    """
    fileCursor = self.dbcon.cursor()
    selected = fileCursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
    # re-pack rows into the column order the insert statement expects,
    # coercing coordinates to int
    payload = [(entry["readID"], entry["chrom"], int(entry["start"]), int(entry["stop"]),
                entry["sense"], entry["weight"], entry["flag"], entry["mismatch"])
               for entry in selected]
    self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, payload)
def copySpliceDBEntriesToMemory(self, whereClause=""):
    """Copy splice rows from the file DB into the in-memory DB.

    whereClause optionally restricts the copy (e.g. to one chromosome);
    coordinates are coerced to int on the way through.
    """
    cursor = self.dbcon.cursor()
    sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
    destinationEntries = []
    for row in sourceEntries:
        destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
                                   row["weight"], row["flag"], row["mismatch"]))

    # BUG FIX: the insert names 11 columns (ID + 10 values) but previously
    # supplied only 8 '?' placeholders, so executemany failed with
    # "Incorrect number of bindings" for every 10-element tuple. Use 10
    # placeholders to match (cf. insertSplices, which already does).
    self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)