import sqlite3 as sqlite
import os
import shutil
import string
import tempfile
from array import array

from commoncode import getReverseComplement, getConfigParser, getConfigOption
# schema version stamped into each new RDS file's metadata (compared as a float)
currentRDSVersion = "2.0"
class ReadDatasetError(Exception):
    """ Raised for ReadDataset-specific failures: bad dataset type,
    missing or invalid metadata (e.g. readsize), and similar errors.
    """
    pass
17 """ Class for storing reads from experiments. Assumes that custom scripts
18 will translate incoming data into a format that can be inserted into the
19 class using the insert* methods. Default class subtype ('DNA') includes
20 tables for unique and multireads, whereas 'RNA' subtype also includes a splices table.
    def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
                 cache=False, reportCount=True):
        """ creates an rds datafile if initialize is set to true, otherwise
        will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
        """
        # NOTE(review): several control-flow lines (if/else on `cache` and
        # `initialize`, try/except around the version lookup, verbose guards)
        # are elided in this listing; statement order below is the original's.
        self.rdsVersion = currentRDSVersion
        self.memBacked = False      # becomes True only after memSync() copies data to :memory:
        self.cachedDBFile = ""      # path of local cached copy; "" when not cached
        # presumably executed only when cache=True -- copy DB to temp dir first
        self.cacheDB(datafile)
        dbFile = self.cachedDBFile
        self.dbcon = sqlite.connect(dbFile)
        self.dbcon.row_factory = sqlite.Row     # rows addressable by column name
        self.dbcon.execute("PRAGMA temp_store = MEMORY")
        # initialize path: validate requested type and create the schema
        if datasetType not in ["DNA", "RNA"]:
            raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
        self.dataType = datasetType
        self.initializeTables(self.dbcon)
        # existing-file path: dataType is read back from stored metadata
        metadata = self.getMetadata("dataType")
        self.dataType = metadata["dataType"]
        metadata = self.getMetadata("rdsVersion")
        self.rdsVersion = metadata["rdsVersion"]
        # fallback when rdsVersion metadata is missing (pre-1.0 files);
        # the insert fails on read-only files
        self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
        print "could not add rdsVersion - read-only ?"
        self.rdsVersion = "pre-1.0"
        # verbose reporting (guards elided below)
        print "INITIALIZED dataset %s" % datafile
        print "dataset %s" % datafile
        metadata = self.getMetadata()
        pnameList = metadata.keys()
        for pname in pnameList:
            print "\t" + pname + "\t" + metadata[pname]
        # read-count report; the %d branch is used when the counts convert
        # cleanly to int, the %s branch is presumably the fallback
        ucount = self.getUniqsCount()
        mcount = self.getMultiCount()
        if self.dataType == "DNA" and not initialize:
            print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
            print "\n%s unique reads and %s multireads" % (ucount, mcount)
        elif self.dataType == "RNA" and not initialize:
            scount = self.getSplicesCount()
            print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
            print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
        print "default cache size is %d pages" % self.getDefaultCacheSize()
        # NOTE(review): the enclosing "def" line is elided in this listing --
        # this is the public total-read-count accessor; verify name in full file.
        """ return the number of usable reads in the dataset.
        """
        total = self.getUniqsCount()
        total += self.getMultiCount()
        if self.dataType == "RNA":
            total += self.getSplicesCount()
        # (return statement elided; presumably "return total")
        # NOTE(review): enclosing "def" line elided (likely the destructor /
        # close path) -- removes the local cached copy if one was made.
        """ cleanup copy in local cache, if present.
        """
        if self.cachedDBFile != "":
127 def cacheDB(self, filename):
128 """ copy geneinfoDB to a local cache.
130 configParser = getConfigParser()
131 cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
132 tempfile.tempdir = cisTemp
133 self.cachedDBFile = "%s.db" % tempfile.mktemp()
134 shutil.copyfile(filename, self.cachedDBFile)
137 def saveCacheDB(self, filename):
138 """ copy geneinfoDB to a local cache.
140 shutil.copyfile(self.cachedDBFile, filename)
        # NOTE(review): enclosing "def" line elided (likely uncacheDB) --
        # best-effort removal of the cached copy created by cacheDB; the
        # try/except around os.remove is elided in this listing.
        """ delete geneinfoDB from local cache.
        """
        if self.cachedDBFile != "":
            os.remove(self.cachedDBFile)
            # printed on failure to delete (except branch elided)
            print "could not delete %s" % self.cachedDBFile
    def attachDB(self, filename, dbName):
        """ attach another database file to the readDataset.
        """
        # NOTE(review): the line that executes this statement is elided in
        # this listing. filename/dbName are interpolated directly into SQL --
        # not safe for untrusted input.
        stmt = "attach '%s' as %s" % (filename, dbName)
    def detachDB(self, dbName):
        """ detach a database file to the readDataset.
        """
        # NOTE(review): the line that executes this statement is elided in
        # this listing; dbName is interpolated directly into SQL.
        stmt = "detach %s" % (dbName)
170 def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
171 """ import into current RDS the table (with columns destcolumns,
172 with default all columns) from the database file asname,
173 using the column specification of ascolumns (default all).
175 stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
177 stmt += " where flag = '%s' " % flagged
179 self.executeCommit(stmt)
182 def getTables(self, dbName=""):
183 """ get a list of table names in a particular database file.
186 sql = self.getSqlCursor()
189 dbName = "%s." % dbName
191 stmt = "select name from %ssqlite_master where type='table'" % dbName
193 results = sql.fetchall()
196 resultList.append(row["name"])
201 def getSqlCursor(self):
203 sql = self.getMemCursor()
205 sql = self.getFileCursor()
        # NOTE(review): enclosing "def" line elided (likely hasIndex).
        """ check whether the RDS file has at least one index.
        """
        stmt = "select count(*) from sqlite_master where type='index'"
        count = int(self.execute(stmt, returnResults=True)[0][0])
        # (the boolean comparison/return, e.g. "return count > 0", is elided)
    def initializeTables(self, dbConnection, cache=100000):
        """ creates table schema in a database connection, which is
        typically a database file or an in-memory database.
        cache is the SQLite page-cache size applied to the connection.
        """
        dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
        dbConnection.execute("create table metadata (name varchar, value varchar)")
        # record the dataset type so later opens can recover it
        dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
        positionSchema = "start int, stop int"
        tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
        dbConnection.execute("create table uniqs %s" % tableSchema)
        dbConnection.execute("create table multi %s" % tableSchema)
        if self.dataType == "RNA":
            # spliced reads carry two aligned segments: left and right halves
            positionSchema = "startL int, stopL int, startR int, stopR int"
            tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
            dbConnection.execute("create table splices %s" % tableSchema)

        dbConnection.commit()
    def getFileCursor(self):
        """ returns a cursor to file database for low-level (SQL)
        access.
        """
        return self.dbcon.cursor()
    def getMemCursor(self):
        """ returns a cursor to memory database for low-level (SQL)
        access.
        """
        return self.memcon.cursor()
    def getMetadata(self, valueName=""):
        """ returns a dictionary of metadata.
        When valueName is non-empty, only that parameter is fetched.
        """
        # NOTE(review): resultsDict/whereClause initialization, the else
        # branch, the duplicate-name index loop, and the return are elided
        # in this listing.
        whereClause = " where name='%s'" % valueName
        sql = self.getSqlCursor()
        sql.execute("select name, value from metadata %s" % whereClause)
        results = sql.fetchall()
        for row in results:
            parameterName = row["name"]
            parameterValue = row["value"]
            if parameterName not in resultsDict:
                resultsDict[parameterName] = parameterValue
            # duplicate parameter names are kept under "name:N" suffixed keys
            newName = string.join([parameterName, str(index)], ":")
            if newName not in resultsDict:
                resultsDict[newName] = parameterValue
        # (return resultsDict elided)
287 def getReadSize(self):
288 """ returns readsize if defined in metadata.
290 metadata = self.getMetadata()
291 if "readsize" not in metadata:
292 raise ReadDatasetError("no readsize parameter defined")
294 readSize = metadata["readsize"]
295 if "import" in readSize:
296 readSize = readSize.split()[0]
298 readSize = int(readSize)
300 raise ReadDatasetError("readsize is negative")
305 def getDefaultCacheSize(self):
306 """ returns the default cache size.
308 return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
311 def getChromosomes(self, table="uniqs", fullChrom=True):
312 """ returns a list of distinct chromosomes in table.
314 statement = "select distinct chrom from %s" % table
315 sql = self.getSqlCursor()
317 sql.execute(statement)
321 if row["chrom"] not in results:
322 results.append(row["chrom"])
324 shortName = row["chrom"][3:]
325 if len(shortName.strip()) > 0 and shortName not in results:
326 results.append(shortName)
333 def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
334 doMulti=False, doSplices=False):
335 """ returns the maximum coordinate for reads on a given chromosome.
340 maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
343 spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
344 maxCoord = max(spliceMax, maxCoord)
347 multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
348 maxCoord = max(multiMax, maxCoord)
351 print "%s maxCoord: %d" % (chrom, maxCoord)
356 def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
358 sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
359 sql = self.getSqlCursor()
361 sql.execute(sqlStatement)
362 maxCoord = int(sql.fetchall()[0][0])
364 print "couldn't retrieve coordMax for chromosome %s" % chrom
    def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
                     flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
                     withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
                     readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
                     flagLike=False, strand='', combine5p=False):
        """ returns a dictionary of reads in a variety of formats
        and which can be restricted by chromosome or custom-flag.
        Returns unique reads by default, but can return multireads
        with doMulti set to True.
        Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
        """
        # NOTE(review): many guard lines (if findallOptimize / doUniqs /
        # doMulti / combine5p, the row loop header, dict initialization and
        # the return) are elided in this listing; the statements below keep
        # the original order only.
        whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
        # findallOptimize collapses rows to (start, sense, summed weight)
        selectQuery = "select start, sense, sum(weight)"
        selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
        groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
        # uniqs query, optionally UNION ALL'ed with the multi table
        stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
        stmt.append("UNION ALL")
        stmt.append(selectQuery)
        stmt.append("from multi")
        stmt.append(whereQuery)
        stmt.append(groupQuery)
        stmt = [selectQuery, "from multi", whereQuery]
        # combine5p path: merge per-chromosome 5' positions across tables,
        # grouping shared and unique (chrom,start) combinations separately
        selectQuery = "select start, sense, weight, chrom"
        subSelect = [selectQuery, "from uniqs", whereQuery]
        subSelect.append("union all")
        subSelect.append(selectQuery)
        subSelect.append("from multi")
        subSelect.append(whereQuery)
        subSelect = [selectQuery, "from multi", whereQuery]
        sqlStmt = string.join(subSelect)
        selectQuery = "select start, sense, sum(weight)"
        stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
                selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
        # fast path drops sqlite.Row (plain tuples are cheaper) and restores
        # the row factory after fetching
        self.memcon.row_factory = None
        sql = self.memcon.cursor()
        self.dbcon.row_factory = None
        sql = self.dbcon.cursor()
        stmt.append("order by start")
        sql = self.memcon.cursor()
        sql = self.dbcon.cursor()
        stmt.append("order by readID, start")
        sql = self.memcon.cursor()
        sql = self.dbcon.cursor()
        stmt.append("order by chrom, start")
        sqlQuery = string.join(stmt)
        sql.execute(sqlQuery)
        resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
        self.memcon.row_factory = sqlite.Row
        self.dbcon.row_factory = sqlite.Row
        # full (non-optimized) path: build per-chrom or per-readID lists
        readID = row["readID"]
        # short chromosome name drops the leading 3-char "chr" prefix
        chrom = row["chrom"][3:]
        if not readIDDict and chrom != currentChrom:
            resultsDict[chrom] = []
        theReadID = readID.split("::")[0]
        if "/" in theReadID and withPairID:
            (theReadID, pairID) = readID.split("/")
        if theReadID != currentReadID:
            resultsDict[theReadID] = []
            currentReadID = theReadID
        newrow = {"start": int(row["start"])}
        newrow["stop"] = int(row["stop"])
        newrow["sense"] = row["sense"]
        newrow["weight"] = float(row["weight"])
        newrow["flag"] = row["flag"]
        newrow["mismatch"] = row["mismatch"]
        newrow["readID"] = readID
        newrow["chrom"] = chrom
        newrow["pairID"] = pairID
        resultsDict[dictKey].append(newrow)
        # (return resultsDict elided)
513 def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
522 if chrom != "" and chrom != self.memChrom:
523 whereClause.append("chrom = '%s'" % chrom)
527 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
528 whereClause.append(flagLikeClause)
530 whereClause.append("flag = '%s'" % flag)
533 whereClause.append("%s > %d" % (startText, start))
536 whereClause.append("%s < %d" % (stopText, stop))
538 if len(readLike) > 0:
539 readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
540 whereClause.append(readIDClause)
543 whereClause.append("mismatch != ''")
545 if strand in ["+", "-"]:
546 whereClause.append("sense = '%s'" % strand)
548 if len(whereClause) > 0:
549 whereStatement = string.join(whereClause, " and ")
550 whereQuery = "where %s" % whereStatement
557 def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
559 selectClause = [baseSelect]
561 selectClause.append("stop")
564 selectClause.append("sense")
567 selectClause.append("weight")
570 selectClause.append("flag")
573 selectClause.append("mismatch")
575 selectQuery = string.join(selectClause, ",")
580 def getReadGroupQuery(self, findallOptimize, limit, combine5p):
583 groupBy = ["GROUP BY start, sense"]
585 if limit > 0 and not combine5p:
586 groupBy.append("LIMIT %d" % limit)
588 groupQuery = string.join(groupBy)
    def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
                       flag="", withWeight=False, withFlag=False, withMismatch=False,
                       withID=False, withChrom=False, withPairID=False, readIDDict=False,
                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
        """ returns a dictionary of spliced reads in a variety of
        formats and which can be restricted by chromosome or custom-flag.
        Returns unique spliced reads for now.
        """
        # NOTE(review): the signature's final line (stop/strand parameters),
        # the row loop header, dict initialization, several guards and the
        # return are elided in this listing.
        whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
        selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
        selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
        # memory-backed vs file cursor (guard elided)
        sql = self.memcon.cursor()
        sql = self.dbcon.cursor()
        stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
        # per-row processing: key results by chromosome or by readID
        readID = row["readID"]
        # short chromosome name drops the leading 3-char "chr" prefix
        chrom = row["chrom"][3:]
        if not readIDDict and chrom != currentChrom:
            resultsDict[chrom] = []
        (theReadID, pairID) = readID.split("/")
        if theReadID != currentReadID:
            resultsDict[theReadID] = []
            currentReadID = theReadID
        newrow = {"startL": int(row["startL"])}
        newrow["stopL"] = int(row["stopL"])
        newrow["startR"] = int(row["startR"])
        newrow["stopR"] = int(row["stopR"])
        newrow["sense"] = row["sense"]
        newrow["weight"] = float(row["weight"])
        newrow["flag"] = row["flag"]
        newrow["mismatch"] = row["mismatch"]
        newrow["readID"] = readID
        newrow["chrom"] = chrom
        newrow["pairID"] = pairID
        # splitRead: emit the left and right halves as separate entries
        leftDict = newrow.copy()
        del leftDict["startR"]
        del leftDict["stopR"]
        del rightDict["startL"]
        del rightDict["stopL"]
        resultsDict[dictKey].append(leftDict)
        resultsDict[dictKey].append(rightDict)
        resultsDict[dictKey].append(newrow)
    def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
                  splices=False, reportCombined=True, sense="both"):
        """ return read counts for a given region.
        Returns the summed total when reportCombined is set, otherwise the
        (ucount, mcount, scount) tuple.
        """
        # NOTE(review): the per-table guards (if uniqs / multi / splices),
        # their zeroing else-branches, restrict initialization and the
        # reportCombined branch are elided in this listing.
        if sense in ["+", "-"]:
            restrict = " sense ='%s' " % sense
        ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
        mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
        scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
        total = ucount + mcount + scount
        return (ucount, mcount, scount)
714 def getTotalCounts(self, chrom="", rmin="", rmax=""):
715 """ return read counts for a given region.
717 return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
    def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
        """ returns the number of rows in the given table matching the
        chrom/rmin/rmax/restrict filters. (Docstring previously said "uniqs
        table"; the table name is a parameter.)
        """
        # NOTE(review): whereClause initialization, the rmin/rmax/restrict
        # guards, the distinct/else branch pairing, the None-result handling
        # and the return are elided in this listing.
        if chrom !="" and chrom != self.memChrom:
            whereClause = ["chrom='%s'" % chrom]
        whereClause.append("%s >= %s" % (startField, str(rmin)))
        whereClause.append("%s <= %s" % (startField, str(rmax)))
        whereClause.append(restrict)
        if len(whereClause) > 0:
            whereStatement = string.join(whereClause, " and ")
            whereQuery = "where %s" % whereStatement
        # memory-backed vs file cursor (guard elided)
        sql = self.memcon.cursor()
        sql = self.dbcon.cursor()
        # distinct: count unique chrom+pos+sense combinations;
        # otherwise the weights are summed
        sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
        sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
        result = sql.fetchone()
        count = int(result[0])
        # (return count elided)
764 def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
765 """ returns the number of row in the splices table.
767 # TODO: if the rds type is DNA should this just return zero?
768 return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
771 def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
772 """ returns the number of distinct readIDs in the uniqs table.
774 return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
777 def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
778 """ returns the total weight of readIDs in the multi table.
780 return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
783 def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
788 stmt.append("select readID from uniqs")
791 stmt.append("select readID from multi")
794 stmt.append("select readID from splices")
797 selectPart = string.join(stmt, " union ")
803 limitPart = "LIMIT %d" % limit
805 sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
807 sql = self.memcon.cursor()
809 sql = self.dbcon.cursor()
811 sql.execute(sqlQuery)
812 result = sql.fetchall()
815 return [x[0].split("/")[0] for x in result]
817 return [x[0] for x in result]
    def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
        """ returns the uniq and spliced mismatches in a dictionary keyed by
        chromosome; entries are [startpos, change_at, change_base, change_from].
        """
        # NOTE(review): snpDict initialization, the mischrom else-branch, the
        # sense +/- branch pairs, skip conditions and the return are elided
        # in this listing.
        readlen = self.getReadSize()
        # restrict to one chromosome when mischrom is given, else all
        hitChromList = [mischrom]
        hitChromList = self.getChromosomes()
        for achrom in hitChromList:
            # progress message (verbose guard elided)
            print "getting mismatches from chromosome %s" % (achrom)
            if useSplices and self.dataType == "RNA":
                spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
                spliceIDList = spliceDict.keys()
                for spliceID in spliceIDList:
                    # only the first entry per read carries the mismatch info
                    spliceEntry = spliceDict[spliceID][0]
                    startpos = spliceEntry["startL"]
                    lefthalf = spliceEntry["stopL"]
                    rightstart = spliceEntry["startR"]
                    sense = spliceEntry["sense"]
                    mismatches = spliceEntry["mismatch"]
                    spMismatchList = mismatches.split(",")
                    for mismatch in spMismatchList:
                        # mismatch strings look like "A10G": from-base,
                        # 1-based read position, to-base -- TODO confirm
                        change_len = len(mismatch)
                        # plus-strand decoding (sense branch lines elided):
                        change_from = mismatch[0]
                        change_base = mismatch[change_len-1]
                        change_pos = int(mismatch[1:change_len-1])
                        # minus-strand decoding: complement bases, mirror position
                        change_from = getReverseComplement([mismatch[0]])
                        change_base = getReverseComplement([mismatch[change_len-1]])
                        change_pos = readlen - int(mismatch[1:change_len-1]) + 1
                        # map read-relative position onto the genome across the junction
                        firsthalf = int(lefthalf)-int(startpos)+1
                        if int(change_pos) <= int(firsthalf):
                            change_at = startpos + change_pos - 1
                        # else-branch (elided): position falls in the right half
                        secondhalf = change_pos - firsthalf
                        change_at = rightstart + secondhalf
                        snpDict[achrom].append([startpos, change_at, change_base, change_from])
            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
            if achrom not in hitDict.keys():
                # (continue elided)
            for readEntry in hitDict[achrom]:
                start = readEntry["start"]
                sense = readEntry["sense"]
                mismatches = readEntry["mismatch"]
                mismatchList = mismatches.split(",")
                for mismatch in mismatchList:
                    change_len = len(mismatch)
                    # plus-strand decoding (sense branch lines elided, as above)
                    change_from = mismatch[0]
                    change_base = mismatch[change_len-1]
                    change_pos = int(mismatch[1:change_len-1])
                    change_from = getReverseComplement([mismatch[0]])
                    change_base = getReverseComplement([mismatch[change_len-1]])
                    change_pos = readlen - int(mismatch[1:change_len-1]) + 1
                    change_at = start + change_pos - 1
                    snpDict[achrom].append([start, change_at, change_base, change_from])
        # (return snpDict elided)
    def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
                        useSplices=False, normalizationFactor=1.0, trackStrand=False,
                        keepStrand="both", shiftValue=0):
        """return a profile of the chromosome as an array of per-base read coverage....
        keepStrand = 'both', 'plusOnly', or 'minusOnly'.
        Will also shift position of unique and multireads (but not splices) if shift is a natural number
        """
        # NOTE(review): the shift-dict initialization, try/except around the
        # readsize lookup, window guards and the return are elided in this
        # listing.
        metadata = self.getMetadata()
        readlen = int(metadata["readsize"])
        dataType = metadata["dataType"]
        scale = 1. / normalizationFactor
        shift["+"] = int(shiftValue)
        shift["-"] = -1 * int(shiftValue)
        # whole-chromosome profile vs the requested window (guard elided)
        lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
        lastNT = cstop - cstart + readlen + shift["+"]
        # float array, one slot per base
        chromModel = array("f",[0.] * lastNT)
        hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
        for readEntry in hitDict[chromosome]:
            hstart = readEntry["start"]
            sense = readEntry ["sense"]
            weight = readEntry["weight"]
            # shift by strand, then cover readlen bases
            hstart = hstart - cstart + shift[sense]
            for currentpos in range(hstart,hstart+readlen):
                # (array-bounds guard elided)
                if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
                    chromModel[currentpos] += scale * weight
                elif sense == "-" and keepStrand != "plusOnly":
                    chromModel[currentpos] -= scale * weight
        if useSplices and dataType == "RNA":
            # windowed vs whole-chromosome splice fetch (guard elided)
            spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
            spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
            if chromosome in spliceDict:
                for spliceEntry in spliceDict[chromosome]:
                    Lstart = spliceEntry["startL"]
                    Lstop = spliceEntry["stopL"]
                    Rstart = spliceEntry["startR"]
                    Rstop = spliceEntry["stopR"]
                    rsense = spliceEntry["sense"]
                    if (Rstop - cstart) < lastNT:
                        for index in range(abs(Lstop - Lstart)):
                            currentpos = Lstart - cstart + index
                            # we only track unique splices
                            if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
                                chromModel[currentpos] += scale
                            elif rsense == "-" and keepStrand != "plusOnly":
                                chromModel[currentpos] -= scale
                        for index in range(abs(Rstop - Rstart)):
                            currentpos = Rstart - cstart + index
                            # we only track unique splices
                            if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
                                chromModel[currentpos] += scale
                            elif rsense == "-" and keepStrand != "plusOnly":
                                chromModel[currentpos] -= scale
        # (return chromModel elided)
979 def insertMetadata(self, valuesList):
980 """ inserts a list of (pname, pvalue) into the metadata
983 self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
987 def updateMetadata(self, pname, newValue, originalValue=""):
988 """ update a metadata field given the original value and the new value.
990 stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
991 if originalValue != "":
992 stmt += " and value='%s' " % str(originalValue)
994 self.dbcon.execute(stmt)
998 def insertUniqs(self, valuesList):
999 """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1000 into the uniqs table.
1002 self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1006 def insertMulti(self, valuesList):
1007 """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1008 into the multi table.
1010 self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1014 def insertSplices(self, valuesList):
1015 """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
1016 into the splices table.
1018 self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
    def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
        """ update reads on file database in a list region of regions for a chromosome to have a new flag.
        regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
        sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
        """
        # NOTE(review): the sense guard around restrict, the if uniqs /
        # if multi guards, and the final commit are elided in this listing.
        restrict = " and sense = ? "
        self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
        self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
        if self.dataType == "RNA" and splices:
            # splices accumulate " L:flag" / " R:flag" markers instead of overwriting
            self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
            self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
1044 def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1045 """ set the flag fields in the entire dataset.
1048 self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1051 self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1053 if self.dataType == "RNA" and splices:
1054 self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
1059 def resetFlags(self, uniqs=True, multi=True, splices=True):
1060 """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
1062 self.setFlags("", uniqs, multi, splices)
1065 def reweighMultireads(self, readList):
1066 self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
1069 def setSynchronousPragma(self, value="ON"):
1071 self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1073 print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
1076 def setDBcache(self, cache, default=False):
1077 self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
1079 self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
1082 def execute(self, statement, returnResults=False):
1083 sql = self.getSqlCursor()
1085 sql.execute(statement)
1087 result = sql.fetchall()
1091 def executeCommit(self, statement):
1092 self.execute(statement)
1095 self.memcon.commit()
    def buildIndex(self, cache=100000):
        """ Builds the file indeces for the main tables.
        Cache is the number of 1.5 kb pages to keep in memory.
        100000 pages translates into 150MB of RAM, which is our default.
        """
        if cache > self.getDefaultCacheSize():
            self.setDBcache(cache)

        # disable synchronous writes while indexing for speed;
        # restored at the end
        self.setSynchronousPragma("OFF")
        self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
        print "built uPosIndex"
        self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
        print "built uChromIndex"
        self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
        print "built mPosIndex"
        self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
        print "built mChromIndex"
        if self.dataType == "RNA":
            self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
            print "built sPosIndex"
            self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
            print "built sPosIndex2"
            self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
            print "built sChromIndex"

        # (commit elided in this listing)
        self.setSynchronousPragma("ON")
    def dropIndex(self):
        """ drops the file indices for the main tables.
        """
        # NOTE(review): the try/except wrapper lines are elided in this
        # listing; failures are reported rather than raised.
        self.setSynchronousPragma("OFF")
        self.dbcon.execute("DROP INDEX uPosIndex")
        self.dbcon.execute("DROP INDEX uChromIndex")
        self.dbcon.execute("DROP INDEX mPosIndex")
        self.dbcon.execute("DROP INDEX mChromIndex")
        if self.dataType == "RNA":
            self.dbcon.execute("DROP INDEX sPosIndex")
            self.dbcon.execute("DROP INDEX sPosIndex2")
            self.dbcon.execute("DROP INDEX sChromIndex")
        # printed from the elided except branch
        print "problem dropping index"
        self.setSynchronousPragma("ON")
    def memSync(self, chrom="", index=False):
        """ makes a copy of the dataset into memory for faster access.
        Can be restricted to a "full" chromosome. Can also build the
        in-memory indexes when index=True.
        """
        # NOTE(review): whereclause default, verbose/chrom/index guard lines
        # and the metadata-copy loop header are elided in this listing.
        self.memcon = sqlite.connect(":memory:")
        self.initializeTables(self.memcon)
        cursor = self.dbcon.cursor()
        # chromosome restriction (guard elided)
        print "memSync %s" % chrom
        whereclause = " where chrom = '%s' " % chrom
        self.memChrom = chrom
        self.memcon.execute("PRAGMA temp_store = MEMORY")
        self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
        # copy metadata to memory
        self.memcon.execute("delete from metadata")
        results = cursor.execute("select name, value from metadata")
        results2.append((row["name"], row["value"]))
        self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
        self.copyDBEntriesToMemory("uniqs", whereclause)
        self.copyDBEntriesToMemory("multi", whereclause)
        if self.dataType == "RNA":
            self.copySpliceDBEntriesToMemory(whereclause)
        # index=True: per-chromosome copies index on start only,
        # full copies index on (chrom, start)
        self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
        self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
        if self.dataType == "RNA":
            self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
            self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
        self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
        self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
        if self.dataType == "RNA":
            self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
            self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
        self.memBacked = True
        self.memcon.row_factory = sqlite.Row
        self.memcon.commit()
1207 def copyDBEntriesToMemory(self, dbName, whereClause=""):
1208 cursor = self.dbcon.cursor()
1209 sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
1210 destinationEntries = []
1211 for row in sourceEntries:
1212 destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
1214 self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
1217 def copySpliceDBEntriesToMemory(self, whereClause=""):
1218 cursor = self.dbcon.cursor()
1219 sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
1220 destinationEntries = []
1221 for row in sourceEntries:
1222 destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
1223 row["weight"], row["flag"], row["mismatch"]))
1225 self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)