development checkpoint
[erange.git] / ReadDataset.py
1 import sqlite3 as sqlite
2 import string
3 import tempfile
4 import shutil
5 import os
6 from array import array
7 from commoncode import getReverseComplement, getConfigParser, getConfigOption
8
9 currentRDSVersion = "2.0"
10
11
12 class ReadDatasetError(Exception):
13     pass
14
15
16 class ReadDataset():
17     """ Class for storing reads from experiments. Assumes that custom scripts
18     will translate incoming data into a format that can be inserted into the
19     class using the insert* methods. Default class subtype ('DNA') includes
20     tables for unique and multireads, whereas 'RNA' subtype also includes a
21     splices table.
22     """
23
24     def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False, 
25                  cache=False, reportCount=True):
26         """ creates an rds datafile if initialize is set to true, otherwise
27         will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
28         """
29         self.dbcon = ""
30         self.memcon = ""
31         self.dataType = ""
32         self.rdsVersion = currentRDSVersion
33         self.memBacked = False
34         self.memChrom = ""
35         self.memCursor = ""
36         self.cachedDBFile = ""
37
38         if cache:
39             if verbose:
40                 print "caching ...."
41
42             self.cacheDB(datafile)
43             dbFile = self.cachedDBFile
44         else:
45             dbFile = datafile
46
47         self.dbcon = sqlite.connect(dbFile)
48         self.dbcon.row_factory = sqlite.Row
49         self.dbcon.execute("PRAGMA temp_store = MEMORY")
50         if initialize:
51             if datasetType not in ["DNA", "RNA"]:
52                 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
53             else:
54                 self.dataType = datasetType
55
56             self.initializeTables(self.dbcon)
57         else:
58             metadata = self.getMetadata("dataType")
59             self.dataType = metadata["dataType"]
60
61         try:
62             metadata = self.getMetadata("rdsVersion")
63             self.rdsVersion = metadata["rdsVersion"]
64         except:
65             try:
66                 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
67             except IOError:
68                 print "could not add rdsVersion - read-only ?"
69                 self.rdsVersion = "pre-1.0"
70
71         if verbose:
72             if initialize:
73                 print "INITIALIZED dataset %s" % datafile
74             else:
75                 print "dataset %s" % datafile
76
77             metadata = self.getMetadata()
78             print "metadata:"
79             pnameList = metadata.keys()
80             pnameList.sort()
81             for pname in pnameList:
82                 print "\t" + pname + "\t" + metadata[pname]
83
84             if reportCount:
85                 ucount = self.getUniqsCount()
86                 mcount = self.getMultiCount()
87                 if self.dataType == "DNA" and not initialize:
88                     try:
89                         print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
90                     except ValueError:
91                         print "\n%s unique reads and %s multireads" % (ucount, mcount)
92                 elif self.dataType == "RNA" and not initialize:
93                     scount = self.getSplicesCount()
94                     try:
95                         print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
96                     except ValueError:
97                         print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
98
99             print "default cache size is %d pages" % self.getDefaultCacheSize()
100             if self.hasIndex():
101                 print "found index"
102             else:
103                 print "not indexed"
104
105
106     def __len__(self):
107         """ return the number of usable reads in the dataset.
108         """
109         total = self.getUniqsCount()
110         total += self.getMultiCount()
111
112         if self.dataType == "RNA":
113             total += self.getSplicesCount()
114
115         total = int(total)
116
117         return total
118
119
120     def __del__(self):
121         """ cleanup copy in local cache, if present.
122         """
123         if self.cachedDBFile != "":
124             self.uncacheDB()
125
126
127     def cacheDB(self, filename):
128         """ copy geneinfoDB to a local cache.
129         """
130         configParser = getConfigParser()
131         cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
132         tempfile.tempdir = cisTemp
133         self.cachedDBFile =  "%s.db" % tempfile.mktemp()
134         shutil.copyfile(filename, self.cachedDBFile)
135
136
137     def saveCacheDB(self, filename):
138         """ copy geneinfoDB to a local cache.
139         """
140         shutil.copyfile(self.cachedDBFile, filename)
141
142
143     def uncacheDB(self):
144         """ delete geneinfoDB from local cache.
145         """
146         global cachedDBFile
147         if self.cachedDBFile != "":
148             try:
149                 os.remove(self.cachedDBFile)
150             except:
151                 print "could not delete %s" % self.cachedDBFile
152
153             self.cachedDB = ""
154
155
156     def attachDB(self, filename, dbName):
157         """ attach another database file to the readDataset.
158         """
159         stmt = "attach '%s' as %s" % (filename, dbName)
160         self.execute(stmt)
161
162
163     def detachDB(self, dbName):
164         """ detach a database file to the readDataset.
165         """
166         stmt = "detach %s" % (dbName)
167         self.execute(stmt)
168
169
170     def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
171         """ import into current RDS the table (with columns destcolumns,
172             with default all columns) from the database file asname,
173             using the column specification of ascolumns (default all).
174         """
175         stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
176         if flagged != "":
177             stmt += " where flag = '%s' " % flagged
178
179         self.executeCommit(stmt)
180
181
182     def getTables(self, dbName=""):
183         """ get a list of table names in a particular database file.
184         """
185         resultList = []
186         sql = self.getSqlCursor()
187
188         if dbName != "":
189             dbName = "%s." % dbName
190
191         stmt = "select name from %ssqlite_master where type='table'" % dbName
192         sql.execute(stmt)
193         results = sql.fetchall()
194
195         for row in results:
196             resultList.append(row["name"])
197
198         return resultList
199
200
201     def getSqlCursor(self):
202         if self.memBacked:
203             sql = self.getMemCursor()
204         else:
205             sql = self.getFileCursor()
206
207         return sql
208
209
210     def hasIndex(self):
211         """ check whether the RDS file has at least one index.
212         """
213         stmt = "select count(*) from sqlite_master where type='index'"
214         count = int(self.execute(stmt, returnResults=True)[0][0])
215         if count > 0:
216             return True
217
218         return False
219
220
221     def initializeTables(self, dbConnection, cache=100000):
222         """ creates table schema in a database connection, which is
223         typically a database file or an in-memory database.
224         """
225         dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
226         dbConnection.execute("create table metadata (name varchar, value varchar)")
227         dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
228         positionSchema = "start int, stop int"
229         tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
230         dbConnection.execute("create table uniqs %s" % tableSchema)
231         dbConnection.execute("create table multi %s" % tableSchema)
232         if self.dataType == "RNA":
233             positionSchema = "startL int, stopL int, startR int, stopR int"
234             tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
235             dbConnection.execute("create table splices %s" % tableSchema)
236
237         dbConnection.commit()
238
239
240     def getFileCursor(self):
241         """ returns a cursor to file database for low-level (SQL)
242         access to the data.
243         """
244         return self.dbcon.cursor()
245
246
247     def getMemCursor(self):
248         """ returns a cursor to memory database for low-level (SQL)
249         access to the data.
250         """
251         return self.memcon.cursor()
252
253
254     def getMetadata(self, valueName=""):
255         """ returns a dictionary of metadata.
256         """
257         whereClause = ""
258         resultsDict = {}
259
260         if valueName != "":
261             whereClause = " where name='%s'" % valueName
262
263         sql = self.getSqlCursor()
264
265         sql.execute("select name, value from metadata %s" % whereClause)
266         results = sql.fetchall()
267
268         for row in results:
269             parameterName = row["name"]
270             parameterValue = row["value"]
271             if parameterName not in resultsDict:
272                 resultsDict[parameterName] = parameterValue
273             else:
274                 trying = True
275                 index = 2
276                 while trying:
277                     newName = string.join([parameterName, str(index)], ":")
278                     if newName not in resultsDict:
279                         resultsDict[newName] = parameterValue
280                         trying = False
281
282                     index += 1
283
284         return resultsDict
285
286
287     def getReadSize(self):
288         """ returns readsize if defined in metadata.
289         """
290         metadata = self.getMetadata()
291         if "readsize" not in metadata:
292             raise ReadDatasetError("no readsize parameter defined")
293         else:
294             readSize = metadata["readsize"]
295             if "import" in readSize:
296                 readSize = readSize.split()[0]
297
298             readSize = int(readSize)
299             if readSize < 0:
300                 raise ReadDatasetError("readsize is negative")
301
302             return readSize
303
304
305     def getDefaultCacheSize(self):
306         """ returns the default cache size.
307         """
308         return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
309
310
311     def getChromosomes(self, table="uniqs", fullChrom=True):
312         """ returns a list of distinct chromosomes in table.
313         """
314         statement = "select distinct chrom from %s" % table
315         sql = self.getSqlCursor()
316
317         sql.execute(statement)
318         results = []
319         for row in sql:
320             if fullChrom:
321                 if row["chrom"] not in results:
322                     results.append(row["chrom"])
323             else:
324                 shortName = row["chrom"][3:]
325                 if  len(shortName.strip()) > 0 and shortName not in results:
326                     results.append(shortName)
327
328         results.sort()
329
330         return results
331
332
333     def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
334                          doMulti=False, doSplices=False):
335         """ returns the maximum coordinate for reads on a given chromosome.
336         """
337         maxCoord = 0
338
339         if doUniqs:
340             maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
341
342         if doSplices:
343             spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
344             maxCoord = max(spliceMax, maxCoord)
345
346         if doMulti:
347             multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
348             maxCoord = max(multiMax, maxCoord)
349
350         if verbose:
351             print "%s maxCoord: %d" % (chrom, maxCoord)
352
353         return maxCoord
354
355
356     def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
357         maxCoord = 0
358         sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
359         sql = self.getSqlCursor()
360         try:
361             sql.execute(sqlStatement)
362             maxCoord = int(sql.fetchall()[0][0])
363         except:
364             print "couldn't retrieve coordMax for chromosome %s" % chrom
365
366         return maxCoord
367
368
369     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
370                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
371                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
372                      readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
373                      flagLike=False, strand='', combine5p=False):
374         """ returns a dictionary of reads in a variety of formats
375         and which can be restricted by chromosome or custom-flag.
376         Returns unique reads by default, but can return multireads
377         with doMulti set to True.
378         
379         Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
380         """
381
382         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
383         if findallOptimize:
384             selectQuery = "select start, sense, sum(weight)"
385         else:
386             selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
387
388         groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
389         if doUniqs:
390             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
391             if doMulti:
392                 stmt.append("UNION ALL")
393                 stmt.append(selectQuery)
394                 stmt.append("from multi")
395                 stmt.append(whereQuery)
396                 stmt.append(groupQuery)
397         else:
398             stmt = [selectQuery, "from multi", whereQuery]
399
400         if combine5p:
401             if findallOptimize:
402                 selectQuery = "select start, sense, weight, chrom"
403
404             if doUniqs:
405                 subSelect = [selectQuery, "from uniqs", whereQuery]
406                 if doMulti:
407                     subSelect.append("union all")
408                     subSelect.append(selectQuery)
409                     subSelect.append("from multi")
410                     subSelect.append(whereQuery)
411             else:
412                 subSelect = [selectQuery, "from multi", whereQuery]
413
414             sqlStmt = string.join(subSelect)
415             if findallOptimize:
416                 selectQuery = "select start, sense, sum(weight)"
417
418             stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
419                     selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
420
421         if findallOptimize:
422             if self.memBacked:
423                 self.memcon.row_factory = None
424                 sql = self.memcon.cursor()
425             else:
426                 self.dbcon.row_factory = None
427                 sql = self.dbcon.cursor()
428
429             stmt.append("order by start")
430         elif readIDDict:
431             if self.memBacked:
432                 sql = self.memcon.cursor()
433             else:
434                 sql = self.dbcon.cursor()
435
436             stmt.append("order by readID, start")
437         else:
438             if self.memBacked:
439                 sql = self.memcon.cursor()
440             else:
441                 sql = self.dbcon.cursor()
442
443             stmt.append("order by chrom, start")
444
445         sqlQuery = string.join(stmt)
446         sql.execute(sqlQuery)
447
448         resultsDict = {}
449         if findallOptimize:
450             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
451             if self.memBacked:
452                 self.memcon.row_factory = sqlite.Row
453             else:
454                 self.dbcon.row_factory = sqlite.Row
455         else:
456             currentChrom = ""
457             currentReadID = ""
458             pairID = 0
459             for row in sql:
460                 readID = row["readID"]
461                 if fullChrom:
462                     chrom = row["chrom"]
463                 else:
464                     chrom = row["chrom"][3:]
465
466                 if not readIDDict and chrom != currentChrom:
467                     resultsDict[chrom] = []
468                     currentChrom = chrom
469                     dictKey = chrom
470                 elif readIDDict:
471                     theReadID = readID
472                     if "::" in readID:
473                         theReadID = readID.split("::")[0]
474
475                     if "/" in theReadID and withPairID:
476                         (theReadID, pairID) = readID.split("/")
477
478                     if theReadID != currentReadID:
479                         resultsDict[theReadID] = []
480                         currentReadID = theReadID
481                         dictKey = theReadID
482
483                 newrow = {"start": int(row["start"])}
484                 if bothEnds:
485                     newrow["stop"] = int(row["stop"])
486
487                 if not noSense:
488                     newrow["sense"] = row["sense"]
489
490                 if withWeight:
491                     newrow["weight"] = float(row["weight"])
492
493                 if withFlag:
494                     newrow["flag"] = row["flag"]
495
496                 if withMismatch:
497                     newrow["mismatch"] = row["mismatch"]
498
499                 if withID:
500                     newrow["readID"] = readID
501
502                 if withChrom:
503                     newrow["chrom"] = chrom
504
505                 if withPairID:
506                     newrow["pairID"] = pairID
507
508                 resultsDict[dictKey].append(newrow)
509
510         return resultsDict
511
512
513     def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
514         if splice:
515             startText = "startL"
516             stopText = "stopR"
517         else:
518             startText = "start"
519             stopText = "stop"
520
521         whereClause = []
522         if chrom != "" and chrom != self.memChrom:
523             whereClause.append("chrom = '%s'" % chrom)
524
525         if flag != "":
526             if flagLike:
527                 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
528                 whereClause.append(flagLikeClause)
529             else:
530                 whereClause.append("flag = '%s'" % flag)
531
532         if start > -1:
533             whereClause.append("%s > %d" % (startText, start))
534
535         if stop > -1:
536             whereClause.append("%s < %d" % (stopText, stop))
537
538         if len(readLike) > 0:
539             readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
540             whereClause.append(readIDClause)
541
542         if hasMismatch:
543             whereClause.append("mismatch != ''")
544
545         if strand in ["+", "-"]:
546             whereClause.append("sense = '%s'" % strand)
547
548         if len(whereClause) > 0:
549             whereStatement = string.join(whereClause, " and ")
550             whereQuery = "where %s" % whereStatement
551         else:
552             whereQuery = ""
553
554         return whereQuery
555
556
557     def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
558
559         selectClause = [baseSelect]
560         if bothEnds:
561             selectClause.append("stop")
562
563         if not noSense:
564             selectClause.append("sense")
565
566         if withWeight:
567             selectClause.append("weight")
568
569         if withFlag:
570             selectClause.append("flag")
571
572         if withMismatch:
573             selectClause.append("mismatch")
574
575         selectQuery = string.join(selectClause, ",")
576
577         return selectQuery
578
579
580     def getReadGroupQuery(self, findallOptimize, limit, combine5p):
581         groupBy = []
582         if findallOptimize:
583             groupBy = ["GROUP BY start, sense"]
584
585         if limit > 0 and not combine5p:
586             groupBy.append("LIMIT %d" % limit)
587
588         groupQuery = string.join(groupBy)
589
590         return groupQuery
591
592
593     def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
594                        flag="", withWeight=False, withFlag=False, withMismatch=False,
595                        withID=False, withChrom=False, withPairID=False, readIDDict=False,
596                        splitRead=False, hasMismatch=False, flagLike=False, start=-1,
597                        stop=-1, strand=""):
598         """ returns a dictionary of spliced reads in a variety of
599         formats and which can be restricted by chromosome or custom-flag.
600         Returns unique spliced reads for now.
601         """
602         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
603         selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
604         selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
605         if self.memBacked:
606             sql = self.memcon.cursor()
607         else:
608             sql = self.dbcon.cursor()
609
610         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
611         sql.execute(stmt)
612         currentReadID = ""
613         currentChrom = ""
614         resultsDict = {}
615         for row in sql:
616             pairID = 0
617             readID = row["readID"]
618             if fullChrom:
619                 chrom = row["chrom"]
620             else:
621                 chrom = row["chrom"][3:]
622
623             if not readIDDict and chrom != currentChrom:
624                 resultsDict[chrom] = []
625                 currentChrom = chrom
626                 dictKey = chrom
627             elif readIDDict:
628                 if "/" in readID:
629                     (theReadID, pairID) = readID.split("/")
630                 else:
631                     theReadID = readID
632
633                 if theReadID != currentReadID:
634                     resultsDict[theReadID] = []
635                     currentReadID = theReadID
636                     dictKey = theReadID
637
638             newrow = {"startL": int(row["startL"])}
639             newrow["stopL"] = int(row["stopL"])
640             newrow["startR"] = int(row["startR"])
641             newrow["stopR"] = int(row["stopR"])
642             if not noSense:
643                 newrow["sense"] = row["sense"]
644
645             if withWeight:
646                 newrow["weight"] = float(row["weight"])
647
648             if withFlag:
649                 newrow["flag"] = row["flag"]
650
651             if withMismatch:
652                 newrow["mismatch"] = row["mismatch"]
653
654             if withID:
655                 newrow["readID"] = readID
656
657             if withChrom:
658                 newrow["chrom"] = chrom
659
660             if withPairID:
661                 newrow["pairID"] = pairID
662
663             if splitRead:
664                 leftDict = newrow.copy()
665                 del leftDict["startR"]
666                 del leftDict["stopR"]
667                 rightDict = newrow
668                 del rightDict["startL"]
669                 del rightDict["stopL"]
670                 resultsDict[dictKey].append(leftDict)
671                 resultsDict[dictKey].append(rightDict)
672             else:
673                 resultsDict[dictKey].append(newrow)
674
675         return resultsDict
676
677
678     def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
679                   splices=False, reportCombined=True, sense="both"):
680         """ return read counts for a given region.
681         """
682         ucount = 0
683         mcount = 0
684         scount = 0
685         restrict = ""
686         if sense in ["+", "-"]:
687             restrict = " sense ='%s' " % sense
688
689         if uniqs:
690             try:
691                 ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
692             except:
693                 ucount = 0
694
695         if multi:
696             try:
697                 mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
698             except:
699                 mcount = 0
700
701         if splices:
702             try:
703                 scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
704             except:
705                 scount = 0
706
707         if reportCombined:
708             total = ucount + mcount + scount
709             return total
710         else:
711             return (ucount, mcount, scount)
712
713
714     def getTotalCounts(self, chrom="", rmin="", rmax=""):
715         """ return read counts for a given region.
716         """
717         return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
718
719
720     def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
721         """ returns the number of row in the uniqs table.
722         """
723         whereClause = []
724         count = 0
725
726         if chrom !=""  and chrom != self.memChrom:
727             whereClause = ["chrom='%s'" % chrom]
728
729         if rmin != "":
730             whereClause.append("%s >= %s" % (startField, str(rmin)))
731
732         if rmax != "":
733             whereClause.append("%s <= %s" % (startField, str(rmax)))
734
735         if restrict != "":
736             whereClause.append(restrict)
737
738         if len(whereClause) > 0:
739             whereStatement = string.join(whereClause, " and ")
740             whereQuery = "where %s" % whereStatement
741         else:
742             whereQuery = ""
743
744         if self.memBacked:
745             sql = self.memcon.cursor()
746         else:
747             sql = self.dbcon.cursor()
748
749         if distinct:
750             sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
751         else:
752             sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
753
754         result = sql.fetchone()
755
756         try:
757             count = int(result[0])
758         except:
759             count = 0
760
761         return count
762
763
764     def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
765         """ returns the number of row in the splices table.
766         """
767         # TODO: if the rds type is DNA should this just return zero?
768         return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
769
770
771     def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
772         """ returns the number of distinct readIDs in the uniqs table.
773         """
774         return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
775
776
777     def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
778         """ returns the total weight of readIDs in the multi table.
779         """
780         return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
781
782
783     def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
784         """ get readID's.
785         """
786         stmt = []
787         if uniqs:
788             stmt.append("select readID from uniqs")
789
790         if multi:
791             stmt.append("select readID from multi")
792
793         if splices:
794             stmt.append("select readID from splices")
795
796         if len(stmt) > 0:
797             selectPart = string.join(stmt, " union ")
798         else:
799             selectPart = ""
800
801         limitPart = ""
802         if limit > 0:
803             limitPart = "LIMIT %d" % limit
804
805         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
806         if self.memBacked:
807             sql = self.memcon.cursor()
808         else:
809             sql = self.dbcon.cursor()
810
811         sql.execute(sqlQuery)
812         result = sql.fetchall()
813
814         if paired:
815             return [x[0].split("/")[0] for x in result]
816         else:
817             return [x[0] for x in result]
818
819
820     def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
821         """ returns the uniq and spliced mismatches in a dictionary.
822         """
823         readlen = self.getReadSize()
824         if mischrom:
825             hitChromList = [mischrom]
826         else:
827             hitChromList = self.getChromosomes()
828             hitChromList.sort()
829
830         snpDict = {}
831         for achrom in hitChromList:
832             if verbose:
833                 print "getting mismatches from chromosome %s" % (achrom)
834
835             snpDict[achrom] = []
836             if useSplices and self.dataType == "RNA":
837                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
838                 spliceIDList = spliceDict.keys()
839                 for spliceID in spliceIDList:
840                     spliceEntry = spliceDict[spliceID][0]
841                     startpos = spliceEntry["startL"]
842                     lefthalf = spliceEntry["stopL"]
843                     rightstart = spliceEntry["startR"]
844                     sense = spliceEntry["sense"]
845                     mismatches = spliceEntry["mismatch"]
846                     spMismatchList = mismatches.split(",")
847                     for mismatch in spMismatchList:
848                         if "N" in mismatch:
849                             continue
850
851                         change_len = len(mismatch)
852                         if sense == "+":
853                             change_from = mismatch[0]
854                             change_base = mismatch[change_len-1]
855                             change_pos = int(mismatch[1:change_len-1])
856                         elif sense == "-":
857                             change_from = getReverseComplement([mismatch[0]])
858                             change_base = getReverseComplement([mismatch[change_len-1]])
859                             change_pos = readlen - int(mismatch[1:change_len-1]) + 1
860
861                         firsthalf = int(lefthalf)-int(startpos)+1
862                         secondhalf = 0
863                         if int(change_pos) <= int(firsthalf):
864                             change_at = startpos + change_pos - 1
865                         else:
866                             secondhalf = change_pos - firsthalf
867                             change_at = rightstart + secondhalf
868
869                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
870
871             hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
872             if achrom not in hitDict.keys():
873                 continue
874
875             for readEntry in hitDict[achrom]:
876                 start = readEntry["start"]
877                 sense = readEntry["sense"]
878                 mismatches = readEntry["mismatch"]
879                 mismatchList = mismatches.split(",")
880                 for mismatch in mismatchList:
881                     if "N" in mismatch:
882                         continue
883
884                     change_len = len(mismatch)
885                     if sense == "+":
886                         change_from = mismatch[0]
887                         change_base = mismatch[change_len-1]
888                         change_pos = int(mismatch[1:change_len-1])
889                     elif sense == "-":
890                         change_from = getReverseComplement([mismatch[0]])
891                         change_base = getReverseComplement([mismatch[change_len-1]])
892                         change_pos = readlen - int(mismatch[1:change_len-1]) + 1
893
894                     change_at = start + change_pos - 1
895                     snpDict[achrom].append([start, change_at, change_base, change_from])
896
897         return snpDict
898
899
900     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
901                         useSplices=False, normalizationFactor=1.0, trackStrand=False,
902                         keepStrand="both", shiftValue=0):
903         """return a profile of the chromosome as an array of per-base read coverage....
904             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
905             Will also shift position of unique and multireads (but not splices) if shift is a natural number
906         """
907         metadata = self.getMetadata()
908         try:
909             readlen = int(metadata["readsize"])
910         except KeyError:
911             readlen = 0
912
913         dataType = metadata["dataType"]
914         scale = 1. / normalizationFactor
915         shift = {}
916         shift["+"] = int(shiftValue)
917         shift["-"] = -1 * int(shiftValue)
918
919         if cstop > 0:
920             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
921         else:
922             lastNT = cstop - cstart + readlen + shift["+"]
923
924         chromModel = array("f",[0.] * lastNT)
925         hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
926         if cstart < 0:
927             cstart = 0
928
929         for readEntry in hitDict[chromosome]:
930             hstart = readEntry["start"]
931             sense =  readEntry ["sense"]
932             weight = readEntry["weight"]
933             hstart = hstart - cstart + shift[sense]
934             for currentpos in range(hstart,hstart+readlen):
935                 try:
936                     if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
937                         chromModel[currentpos] += scale * weight
938                     elif sense == "-" and keepStrand != "plusOnly":
939                         chromModel[currentpos] -= scale * weight
940                 except:
941                     continue
942
943         del hitDict
944         if useSplices and dataType == "RNA":
945             if cstop > 0:
946                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
947             else:
948                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
949    
950             if chromosome in spliceDict:
951                 for spliceEntry in spliceDict[chromosome]:
952                     Lstart = spliceEntry["startL"]
953                     Lstop = spliceEntry["stopL"]
954                     Rstart = spliceEntry["startR"]
955                     Rstop = spliceEntry["stopR"]
956                     rsense = spliceEntry["sense"]
957                     if (Rstop - cstart) < lastNT:
958                         for index in range(abs(Lstop - Lstart)):
959                             currentpos = Lstart - cstart + index
960                             # we only track unique splices
961                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
962                                 chromModel[currentpos] += scale
963                             elif rsense == "-" and keepStrand != "plusOnly":
964                                 chromModel[currentpos] -= scale
965
966                         for index in range(abs(Rstop - Rstart)):
967                             currentpos = Rstart - cstart + index
968                             # we only track unique splices
969                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
970                                 chromModel[currentpos] += scale
971                             elif rsense == "-" and keepStrand != "plusOnly":
972                                 chromModel[currentpos] -= scale
973
974             del spliceDict
975
976         return chromModel
977
978
979     def insertMetadata(self, valuesList):
980         """ inserts a list of (pname, pvalue) into the metadata
981         table.
982         """
983         self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
984         self.dbcon.commit()
985
986
987     def updateMetadata(self, pname, newValue, originalValue=""):
988         """ update a metadata field given the original value and the new value.
989         """
990         stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
991         if originalValue != "":
992             stmt += " and value='%s' " % str(originalValue)
993
994         self.dbcon.execute(stmt)
995         self.dbcon.commit()
996
997
998     def insertUniqs(self, valuesList):
999         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1000         into the uniqs table.
1001         """
1002         self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1003         self.dbcon.commit()
1004
1005
1006     def insertMulti(self, valuesList):
1007         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1008         into the multi table.
1009         """
1010         self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1011         self.dbcon.commit()
1012
1013
1014     def insertSplices(self, valuesList):
1015         """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
1016         into the splices table.
1017         """
1018         self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
1019         self.dbcon.commit()
1020
1021
1022     def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
1023         """ update reads on file database in a list region of regions for a chromosome to have a new flag.
1024             regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
1025             sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
1026         """
1027         restrict = ""
1028         if sense != "both":
1029             restrict = " and sense = ? "
1030
1031         if uniqs:
1032             self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1033
1034         if multi:
1035             self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1036
1037         if self.dataType == "RNA" and splices:
1038             self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
1039             self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
1040
1041         self.dbcon.commit()
1042
1043
1044     def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1045         """ set the flag fields in the entire dataset.
1046         """
1047         if uniqs:
1048             self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1049
1050         if multi:
1051             self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1052
1053         if self.dataType == "RNA" and splices:
1054             self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
1055
1056         self.dbcon.commit()
1057
1058
1059     def resetFlags(self, uniqs=True, multi=True, splices=True):
1060         """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
1061         """
1062         self.setFlags("", uniqs, multi, splices)
1063
1064
1065     def reweighMultireads(self, readList):
1066         self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
1067
1068
1069     def setSynchronousPragma(self, value="ON"):
1070         try:
1071             self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1072         except:
1073             print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
1074
1075
1076     def setDBcache(self, cache, default=False):
1077         self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
1078         if default:
1079             self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
1080
1081
1082     def execute(self, statement, returnResults=False):
1083         sql = self.getSqlCursor()
1084
1085         sql.execute(statement)
1086         if returnResults:
1087             result = sql.fetchall()
1088             return result
1089
1090
1091     def executeCommit(self, statement):
1092         self.execute(statement)
1093
1094         if self.memBacked:
1095             self.memcon.commit()
1096         else:
1097             self.dbcon.commit()
1098
1099
1100     def buildIndex(self, cache=100000):
1101         """ Builds the file indeces for the main tables.
1102             Cache is the number of 1.5 kb pages to keep in memory.
1103             100000 pages translates into 150MB of RAM, which is our default.
1104         """
1105         if cache > self.getDefaultCacheSize():
1106             self.setDBcache(cache)
1107         self.setSynchronousPragma("OFF")
1108         self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1109         print "built uPosIndex"
1110         self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
1111         print "built uChromIndex"
1112         self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1113         print "built mPosIndex"
1114         self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
1115         print "built mChromIndex"
1116
1117         if self.dataType == "RNA":
1118             self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
1119             print "built sPosIndex"
1120             self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
1121             print "built sPosIndex2"
1122             self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
1123             print "built sChromIndex"
1124
1125         self.dbcon.commit()
1126         self.setSynchronousPragma("ON")
1127
1128
1129     def dropIndex(self):
1130         """ drops the file indices for the main tables.
1131         """
1132         try:
1133             self.setSynchronousPragma("OFF")
1134             self.dbcon.execute("DROP INDEX uPosIndex")
1135             self.dbcon.execute("DROP INDEX uChromIndex")
1136             self.dbcon.execute("DROP INDEX mPosIndex")
1137             self.dbcon.execute("DROP INDEX mChromIndex")
1138
1139             if self.dataType == "RNA":
1140                 self.dbcon.execute("DROP INDEX sPosIndex")
1141                 try:
1142                     self.dbcon.execute("DROP INDEX sPosIndex2")
1143                 except:
1144                     pass
1145
1146                 self.dbcon.execute("DROP INDEX sChromIndex")
1147
1148             self.dbcon.commit()
1149         except:
1150             print "problem dropping index"
1151
1152         self.setSynchronousPragma("ON")
1153
1154
1155     def memSync(self, chrom="", index=False):
1156         """ makes a copy of the dataset into memory for faster access.
1157         Can be restricted to a "full" chromosome. Can also build the
1158         memory indices.
1159         """
1160         self.memcon = ""
1161         self.memcon = sqlite.connect(":memory:")
1162         self.initializeTables(self.memcon)
1163         cursor = self.dbcon.cursor()
1164         whereclause = ""
1165         if chrom != "":
1166             print "memSync %s" % chrom
1167             whereclause = " where chrom = '%s' " % chrom
1168             self.memChrom = chrom
1169         else:
1170             self.memChrom = ""
1171
1172         self.memcon.execute("PRAGMA temp_store = MEMORY")
1173         self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
1174         # copy metadata to memory
1175         self.memcon.execute("delete from metadata")
1176         results = cursor.execute("select name, value from metadata")
1177         results2 = []
1178         for row in results:
1179             results2.append((row["name"], row["value"]))
1180
1181         self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
1182
1183         self.copyDBEntriesToMemory("uniqs", whereclause)
1184         self.copyDBEntriesToMemory("multi", whereclause)
1185         if self.dataType == "RNA":
1186             self.copySpliceDBEntriesToMemory(whereclause)
1187
1188         if index:
1189             if chrom != "":
1190                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
1191                 self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
1192                 if self.dataType == "RNA":
1193                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
1194                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
1195             else:
1196                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1197                 self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1198                 if self.dataType == "RNA":
1199                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
1200                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
1201
1202         self.memBacked = True
1203         self.memcon.row_factory = sqlite.Row
1204         self.memcon.commit()
1205
1206
1207     def copyDBEntriesToMemory(self, dbName, whereClause=""):
1208         cursor = self.dbcon.cursor()
1209         sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
1210         destinationEntries = []
1211         for row in sourceEntries:
1212             destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
1213
1214         self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
1215
1216
1217     def copySpliceDBEntriesToMemory(self, whereClause=""):
1218         cursor = self.dbcon.cursor()
1219         sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
1220         destinationEntries = []
1221         for row in sourceEntries:
1222             destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
1223                                        row["weight"], row["flag"], row["mismatch"]))
1224
1225         self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)
1226