Release version for Erange 4.0a
[erange.git] / ReadDataset.py
1 import sqlite3 as sqlite
2 import string
3 import tempfile
4 import shutil
5 import os
6 from array import array
7 from commoncode import getReverseComplement, getConfigParser, getConfigOption
8
9 currentRDSVersion = "2.0"
10
11
12 class ReadDatasetError(Exception):
13     pass
14
15
16 class ReadDataset():
17     """ Class for storing reads from experiments. Assumes that custom scripts
18     will translate incoming data into a format that can be inserted into the
19     class using the insert* methods. Default class subtype ('DNA') includes
20     tables for unique and multireads, whereas 'RNA' subtype also includes a
21     splices table.
22     """
23
24     def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False, 
25                  cache=False, reportCount=True):
26         """ creates an rds datafile if initialize is set to true, otherwise
27         will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
28         """
29         self.dbcon = ""
30         self.memcon = ""
31         self.dataType = ""
32         self.rdsVersion = currentRDSVersion
33         self.memBacked = False
34         self.memChrom = ""
35         self.memCursor = ""
36         self.cachedDBFile = ""
37
38         if cache:
39             if verbose:
40                 print "caching ...."
41
42             self.cacheDB(datafile)
43             dbFile = self.cachedDBFile
44         else:
45             dbFile = datafile
46
47         self.dbcon = sqlite.connect(dbFile)
48         self.dbcon.row_factory = sqlite.Row
49         self.dbcon.execute("PRAGMA temp_store = MEMORY")
50         if initialize:
51             if datasetType not in ["DNA", "RNA"]:
52                 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
53             else:
54                 self.dataType = datasetType
55
56             self.initializeTables(self.dbcon)
57         else:
58             metadata = self.getMetadata("dataType")
59             self.dataType = metadata["dataType"]
60
61         try:
62             metadata = self.getMetadata("rdsVersion")
63             self.rdsVersion = metadata["rdsVersion"]
64         except:
65             try:
66                 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
67             except IOError:
68                 print "could not add rdsVersion - read-only ?"
69                 self.rdsVersion = "pre-1.0"
70
71         if verbose:
72             if initialize:
73                 print "INITIALIZED dataset %s" % datafile
74             else:
75                 print "dataset %s" % datafile
76
77             metadata = self.getMetadata()
78             print "metadata:"
79             pnameList = metadata.keys()
80             pnameList.sort()
81             for pname in pnameList:
82                 print "\t" + pname + "\t" + metadata[pname]
83
84             if reportCount:
85                 ucount = self.getUniqsCount()
86                 mcount = self.getMultiCount()
87                 if self.dataType == "DNA" and not initialize:
88                     try:
89                         print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
90                     except ValueError:
91                         print "\n%s unique reads and %s multireads" % (ucount, mcount)
92                 elif self.dataType == "RNA" and not initialize:
93                     scount = self.getSplicesCount()
94                     try:
95                         print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
96                     except ValueError:
97                         print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
98
99             print "default cache size is %d pages" % self.getDefaultCacheSize()
100             if self.hasIndex():
101                 print "found index"
102             else:
103                 print "not indexed"
104
105
106     def __len__(self):
107         """ return the number of usable reads in the dataset.
108         """
109         total = self.getUniqsCount()
110         total += self.getMultiCount()
111
112         if self.dataType == "RNA":
113             total += self.getSplicesCount()
114
115         total = int(total)
116
117         return total
118
119
120     def __del__(self):
121         """ cleanup copy in local cache, if present.
122         """
123         if self.cachedDBFile != "":
124             self.uncacheDB()
125
126
127     def cacheDB(self, filename):
128         """ copy geneinfoDB to a local cache.
129         """
130         configParser = getConfigParser()
131         cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
132         tempfile.tempdir = cisTemp
133         self.cachedDBFile =  "%s.db" % tempfile.mktemp()
134         shutil.copyfile(filename, self.cachedDBFile)
135
136
137     def saveCacheDB(self, filename):
138         """ copy geneinfoDB to a local cache.
139         """
140         shutil.copyfile(self.cachedDBFile, filename)
141
142
143     def uncacheDB(self):
144         """ delete geneinfoDB from local cache.
145         """
146         global cachedDBFile
147         if self.cachedDBFile != "":
148             try:
149                 os.remove(self.cachedDBFile)
150             except:
151                 print "could not delete %s" % self.cachedDBFile
152
153             self.cachedDB = ""
154
155
156     def attachDB(self, filename, dbName):
157         """ attach another database file to the readDataset.
158         """
159         stmt = "attach '%s' as %s" % (filename, dbName)
160         self.execute(stmt)
161
162
163     def detachDB(self, dbName):
164         """ detach a database file to the readDataset.
165         """
166         stmt = "detach %s" % (dbName)
167         self.execute(stmt)
168
169
170     def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
171         """ import into current RDS the table (with columns destcolumns,
172             with default all columns) from the database file asname,
173             using the column specification of ascolumns (default all).
174         """
175         stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
176         if flagged != "":
177             stmt += " where flag = '%s' " % flagged
178
179         self.executeCommit(stmt)
180
181
182     def getTables(self, dbName=""):
183         """ get a list of table names in a particular database file.
184         """
185         resultList = []
186         sql = self.getSqlCursor()
187
188         if dbName != "":
189             dbName = "%s." % dbName
190
191         stmt = "select name from %ssqlite_master where type='table'" % dbName
192         sql.execute(stmt)
193         results = sql.fetchall()
194
195         for row in results:
196             resultList.append(row["name"])
197
198         return resultList
199
200
201     def getSqlCursor(self):
202         if self.memBacked:
203             sql = self.getMemCursor()
204         else:
205             sql = self.getFileCursor()
206
207         return sql
208
209
210     def hasIndex(self):
211         """ check whether the RDS file has at least one index.
212         """
213         stmt = "select count(*) from sqlite_master where type='index'"
214         count = int(self.execute(stmt, returnResults=True)[0][0])
215         if count > 0:
216             return True
217
218         return False
219
220
221     def initializeTables(self, dbConnection, cache=100000):
222         """ creates table schema in a database connection, which is
223         typically a database file or an in-memory database.
224         """
225         dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
226         dbConnection.execute("create table metadata (name varchar, value varchar)")
227         dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
228         positionSchema = "start int, stop int"
229         tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
230         dbConnection.execute("create table uniqs %s" % tableSchema)
231         dbConnection.execute("create table multi %s" % tableSchema)
232         if self.dataType == "RNA":
233             positionSchema = "startL int, stopL int, startR int, stopR int"
234             tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
235             dbConnection.execute("create table splices %s" % tableSchema)
236
237         dbConnection.commit()
238
239
240     def getFileCursor(self):
241         """ returns a cursor to file database for low-level (SQL)
242         access to the data.
243         """
244         return self.dbcon.cursor()
245
246
247     def getMemCursor(self):
248         """ returns a cursor to memory database for low-level (SQL)
249         access to the data.
250         """
251         return self.memcon.cursor()
252
253
254     def getMetadata(self, valueName=""):
255         """ returns a dictionary of metadata.
256         """
257         whereClause = ""
258         resultsDict = {}
259
260         if valueName != "":
261             whereClause = " where name='%s'" % valueName
262
263         sql = self.getSqlCursor()
264
265         sql.execute("select name, value from metadata %s" % whereClause)
266         results = sql.fetchall()
267
268         for row in results:
269             parameterName = row["name"]
270             parameterValue = row["value"]
271             if parameterName not in resultsDict:
272                 resultsDict[parameterName] = parameterValue
273             else:
274                 trying = True
275                 index = 2
276                 while trying:
277                     newName = string.join([parameterName, str(index)], ":")
278                     if newName not in resultsDict:
279                         resultsDict[newName] = parameterValue
280                         trying = False
281
282                     index += 1
283
284         return resultsDict
285
286
287     def getReadSize(self):
288         """ returns readsize if defined in metadata.
289         """
290         metadata = self.getMetadata()
291         if "readsize" not in metadata:
292             raise ReadDatasetError("no readsize parameter defined")
293         else:
294             readSize = metadata["readsize"]
295             if "import" in readSize:
296                 readSize = readSize.split()[0]
297
298             readSize = int(readSize)
299             if readSize < 0:
300                 raise ReadDatasetError("readsize is negative")
301
302             return readSize
303
304
305     def getDefaultCacheSize(self):
306         """ returns the default cache size.
307         """
308         return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
309
310
311     def getChromosomes(self, table="uniqs", fullChrom=True):
312         """ returns a list of distinct chromosomes in table.
313         """
314         statement = "select distinct chrom from %s" % table
315         sql = self.getSqlCursor()
316
317         sql.execute(statement)
318         results = []
319         for row in sql:
320             if fullChrom:
321                 if row["chrom"] not in results:
322                     results.append(row["chrom"])
323             else:
324                 shortName = row["chrom"][3:]
325                 if  len(shortName.strip()) > 0 and shortName not in results:
326                     results.append(shortName)
327
328         results.sort()
329
330         return results
331
332
333     def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
334                          doMulti=False, doSplices=False):
335         """ returns the maximum coordinate for reads on a given chromosome.
336         """
337         maxCoord = 0
338
339         if doUniqs:
340             maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
341
342         if doSplices:
343             spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
344             maxCoord = max(spliceMax, maxCoord)
345
346         if doMulti:
347             multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
348             maxCoord = max(multiMax, maxCoord)
349
350         if verbose:
351             print "%s maxCoord: %d" % (chrom, maxCoord)
352
353         return maxCoord
354
355
356     def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
357         maxCoord = 0
358         sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
359         sql = self.getSqlCursor()
360         try:
361             sql.execute(sqlStatement)
362             maxCoord = int(sql.fetchall()[0][0])
363         except:
364             print "couldn't retrieve coordMax for chromosome %s" % chrom
365
366         return maxCoord
367
368
369     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
370                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
371                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
372                      readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
373                      flagLike=False, strand='', combine5p=False):
374         """ returns a dictionary of reads in a variety of formats
375         and which can be restricted by chromosome or custom-flag.
376         Returns unique reads by default, but can return multireads
377         with doMulti set to True.
378         
379         Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
380         """
381
382         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
383         if findallOptimize:
384             selectQuery = "select start, sense, sum(weight)"
385         else:
386             selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
387
388         groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
389         if doUniqs:
390             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
391             if doMulti:
392                 stmt.append("UNION ALL")
393                 stmt.append(selectQuery)
394                 stmt.append("from multi")
395                 stmt.append(whereQuery)
396                 stmt.append(groupQuery)
397         else:
398             stmt = [selectQuery, "from multi", whereQuery]
399
400         if combine5p:
401             if findallOptimize:
402                 selectQuery = "select start, sense, weight, chrom"
403
404             if doUniqs:
405                 subSelect = [selectQuery, "from uniqs", whereQuery]
406                 if doMulti:
407                     subSelect.append("union all")
408                     subSelect.append(selectQuery)
409                     subSelect.append("from multi")
410                     subSelect.append(whereQuery)
411             else:
412                 subSelect = [selectQuery, "from multi", whereQuery]
413
414             sqlStmt = string.join(subSelect)
415             if findallOptimize:
416                 selectQuery = "select start, sense, sum(weight)"
417
418             stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
419                     selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
420
421         if findallOptimize:
422             if self.memBacked:
423                 self.memcon.row_factory = None
424                 sql = self.memcon.cursor()
425             else:
426                 self.dbcon.row_factory = None
427                 sql = self.dbcon.cursor()
428
429             stmt.append("order by start")
430         elif readIDDict:
431             if self.memBacked:
432                 sql = self.memcon.cursor()
433             else:
434                 sql = self.dbcon.cursor()
435
436             stmt.append("order by readID, start")
437         else:
438             if self.memBacked:
439                 sql = self.memcon.cursor()
440             else:
441                 sql = self.dbcon.cursor()
442
443             stmt.append("order by chrom, start")
444
445         sqlQuery = string.join(stmt)
446         sql.execute(sqlQuery)
447
448         resultsDict = {}
449         if findallOptimize:
450             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
451             if self.memBacked:
452                 self.memcon.row_factory = sqlite.Row
453             else:
454                 self.dbcon.row_factory = sqlite.Row
455         else:
456             currentChrom = ""
457             currentReadID = ""
458             pairID = 0
459             for row in sql:
460                 readID = row["readID"]
461                 if fullChrom:
462                     chrom = row["chrom"]
463                 else:
464                     chrom = row["chrom"][3:]
465
466                 if not readIDDict and chrom != currentChrom:
467                     resultsDict[chrom] = []
468                     currentChrom = chrom
469                     dictKey = chrom
470                 elif readIDDict:
471                     theReadID = readID
472                     if "::" in readID:
473                         theReadID = readID.split("::")[0]
474
475                     if "/" in theReadID and withPairID:
476                         (theReadID, pairID) = readID.split("/")
477
478                     if theReadID != currentReadID:
479                         resultsDict[theReadID] = []
480                         currentReadID = theReadID
481                         dictKey = theReadID
482
483                 newrow = {"start": int(row["start"])}
484                 if bothEnds:
485                     newrow["stop"] = int(row["stop"])
486
487                 if not noSense:
488                     newrow["sense"] = row["sense"]
489
490                 if withWeight:
491                     newrow["weight"] = float(row["weight"])
492
493                 if withFlag:
494                     newrow["flag"] = row["flag"]
495
496                 if withMismatch:
497                     newrow["mismatch"] = row["mismatch"]
498
499                 if withID:
500                     newrow["readID"] = readID
501
502                 if withChrom:
503                     newrow["chrom"] = chrom
504
505                 if withPairID:
506                     newrow["pairID"] = pairID
507
508                 resultsDict[dictKey].append(newrow)
509
510         return resultsDict
511
512
513     def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
514         if splice:
515             startText = "startL"
516             stopText = "stopR"
517         else:
518             startText = "start"
519             stopText = "stop"
520
521         whereClause = []
522         if chrom != "" and chrom != self.memChrom:
523             whereClause.append("chrom = '%s'" % chrom)
524
525         if flag != "":
526             if flagLike:
527                 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
528                 whereClause.append(flagLikeClause)
529             else:
530                 whereClause.append("flag = '%s'" % flag)
531
532         if start > -1:
533             whereClause.append("%s > %d" % (startText, start))
534
535         if stop > -1:
536             whereClause.append("%s < %d" % (stopText, stop))
537
538         if len(readLike) > 0:
539             readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
540             whereClause.append(readIDClause)
541
542         if hasMismatch:
543             whereClause.append("mismatch != ''")
544
545         if strand in ["+", "-"]:
546             whereClause.append("sense = '%s'" % strand)
547
548         if len(whereClause) > 0:
549             whereStatement = string.join(whereClause, " and ")
550             whereQuery = "where %s" % whereStatement
551         else:
552             whereQuery = ""
553
554         return whereQuery
555
556
557     def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
558
559         selectClause = [baseSelect]
560         if bothEnds:
561             selectClause.append("stop")
562
563         if not noSense:
564             selectClause.append("sense")
565
566         if withWeight:
567             selectClause.append("weight")
568
569         if withFlag:
570             selectClause.append("flag")
571
572         if withMismatch:
573             selectClause.append("mismatch")
574
575         selectQuery = string.join(selectClause, ",")
576
577         return selectQuery
578
579
580     def getReadGroupQuery(self, findallOptimize, limit, combine5p):
581         groupBy = []
582         if findallOptimize:
583             groupBy = ["GROUP BY start, sense"]
584
585         if limit > 0 and not combine5p:
586             groupBy.append("LIMIT %d" % limit)
587
588         groupQuery = string.join(groupBy)
589
590         return groupQuery
591
592
593     def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
594                        flag="", withWeight=False, withFlag=False, withMismatch=False,
595                        withID=False, withChrom=False, withPairID=False, readIDDict=False,
596                        splitRead=False, hasMismatch=False, flagLike=False, start=-1,
597                        stop=-1, strand=""):
598         """ returns a dictionary of spliced reads in a variety of
599         formats and which can be restricted by chromosome or custom-flag.
600         Returns unique spliced reads for now.
601         """
602         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
603         selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
604         selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
605         if self.memBacked:
606             sql = self.memcon.cursor()
607         else:
608             sql = self.dbcon.cursor()
609
610         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
611         sql.execute(stmt)
612         currentReadID = ""
613         currentChrom = ""
614         resultsDict = {}
615         for row in sql:
616             pairID = 0
617             readID = row["readID"]
618             if fullChrom:
619                 chrom = row["chrom"]
620             else:
621                 chrom = row["chrom"][3:]
622
623             if not readIDDict and chrom != currentChrom:
624                 resultsDict[chrom] = []
625                 currentChrom = chrom
626                 dictKey = chrom
627             elif readIDDict:
628                 if "/" in readID:
629                     (theReadID, pairID) = readID.split("/")
630                 else:
631                     theReadID = readID
632
633                 if theReadID != currentReadID:
634                     resultsDict[theReadID] = []
635                     currentReadID = theReadID
636                     dictKey = theReadID
637
638             newrow = {"startL": int(row["startL"])}
639             newrow["stopL"] = int(row["stopL"])
640             newrow["startR"] = int(row["startR"])
641             newrow["stopR"] = int(row["stopR"])
642             if not noSense:
643                 newrow["sense"] = row["sense"]
644
645             if withWeight:
646                 newrow["weight"] = float(row["weight"])
647
648             if withFlag:
649                 newrow["flag"] = row["flag"]
650
651             if withMismatch:
652                 newrow["mismatch"] = row["mismatch"]
653
654             if withID:
655                 newrow["readID"] = readID
656
657             if withChrom:
658                 newrow["chrom"] = chrom
659
660             if withPairID:
661                 newrow["pairID"] = pairID
662
663             if splitRead:
664                 leftDict = newrow.copy()
665                 del leftDict["startR"]
666                 del leftDict["stopR"]
667                 rightDict = newrow
668                 del rightDict["startL"]
669                 del rightDict["stopL"]
670                 resultsDict[dictKey].append(leftDict)
671                 resultsDict[dictKey].append(rightDict)
672             else:
673                 resultsDict[dictKey].append(newrow)
674
675         return resultsDict
676
677
678     def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
679                   splices=False, reportCombined=True, sense="both"):
680         """ return read counts for a given region.
681         """
682         ucount = 0
683         mcount = 0
684         scount = 0
685         restrict = ""
686         if sense in ["+", "-"]:
687             restrict = " sense ='%s' " % sense
688
689         if uniqs:
690             try:
691                 ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
692             except:
693                 ucount = 0
694
695         if multi:
696             try:
697                 mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
698             except:
699                 mcount = 0
700
701         if splices:
702             try:
703                 scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
704             except:
705                 scount = 0
706
707         if reportCombined:
708             total = ucount + mcount + scount
709             return total
710         else:
711             return (ucount, mcount, scount)
712
713
714     def getTotalCounts(self, chrom="", rmin="", rmax=""):
715         """ return read counts for a given region.
716         """
717         return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
718
719
720     def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
721         """ returns the number of row in the uniqs table.
722         """
723         whereClause = []
724         count = 0
725
726         if chrom !=""  and chrom != self.memChrom:
727             whereClause = ["chrom='%s'" % chrom]
728
729         if rmin != "":
730             whereClause.append("%s >= %s" % (startField, str(rmin)))
731
732         if rmax != "":
733             whereClause.append("%s <= %s" % (startField, str(rmax)))
734
735         if restrict != "":
736             whereClause.append(restrict)
737
738         if len(whereClause) > 0:
739             whereStatement = string.join(whereClause, " and ")
740             whereQuery = "where %s" % whereStatement
741         else:
742             whereQuery = ""
743
744         if self.memBacked:
745             sql = self.memcon.cursor()
746         else:
747             sql = self.dbcon.cursor()
748
749         if distinct:
750             sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
751         else:
752             sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
753
754         result = sql.fetchone()
755
756         try:
757             count = int(result[0])
758         except:
759             count = 0
760
761         return count
762
763
764     def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
765         """ returns the number of row in the splices table.
766         """
767         return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
768
769
770     def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
771         """ returns the number of distinct readIDs in the uniqs table.
772         """
773         return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
774
775
776     def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
777         """ returns the total weight of readIDs in the multi table.
778         """
779         return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
780
781
782     def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
783         """ get readID's.
784         """
785         stmt = []
786         if uniqs:
787             stmt.append("select readID from uniqs")
788
789         if multi:
790             stmt.append("select readID from multi")
791
792         if splices:
793             stmt.append("select readID from splices")
794
795         if len(stmt) > 0:
796             selectPart = string.join(stmt, " union ")
797         else:
798             selectPart = ""
799
800         limitPart = ""
801         if limit > 0:
802             limitPart = "LIMIT %d" % limit
803
804         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
805         if self.memBacked:
806             sql = self.memcon.cursor()
807         else:
808             sql = self.dbcon.cursor()
809
810         sql.execute(sqlQuery)
811         result = sql.fetchall()
812
813         if paired:
814             return [x[0].split("/")[0] for x in result]
815         else:
816             return [x[0] for x in result]
817
818
819     def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
820         """ returns the uniq and spliced mismatches in a dictionary.
821         """
822         readlen = self.getReadSize()
823         if mischrom:
824             hitChromList = [mischrom]
825         else:
826             hitChromList = self.getChromosomes()
827             hitChromList.sort()
828
829         snpDict = {}
830         for achrom in hitChromList:
831             if verbose:
832                 print "getting mismatches from chromosome %s" % (achrom)
833
834             snpDict[achrom] = []
835             if useSplices and self.dataType == "RNA":
836                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
837                 spliceIDList = spliceDict.keys()
838                 for spliceID in spliceIDList:
839                     spliceEntry = spliceDict[spliceID][0]
840                     startpos = spliceEntry["startL"]
841                     lefthalf = spliceEntry["stopL"]
842                     rightstart = spliceEntry["startR"]
843                     sense = spliceEntry["sense"]
844                     mismatches = spliceEntry["mismatch"]
845                     spMismatchList = mismatches.split(",")
846                     for mismatch in spMismatchList:
847                         if "N" in mismatch:
848                             continue
849
850                         change_len = len(mismatch)
851                         if sense == "+":
852                             change_from = mismatch[0]
853                             change_base = mismatch[change_len-1]
854                             change_pos = int(mismatch[1:change_len-1])
855                         elif sense == "-":
856                             change_from = getReverseComplement([mismatch[0]])
857                             change_base = getReverseComplement([mismatch[change_len-1]])
858                             change_pos = readlen - int(mismatch[1:change_len-1]) + 1
859
860                         firsthalf = int(lefthalf)-int(startpos)+1
861                         secondhalf = 0
862                         if int(change_pos) <= int(firsthalf):
863                             change_at = startpos + change_pos - 1
864                         else:
865                             secondhalf = change_pos - firsthalf
866                             change_at = rightstart + secondhalf
867
868                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
869
870             hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
871             if achrom not in hitDict.keys():
872                 continue
873
874             for readEntry in hitDict[achrom]:
875                 start = readEntry["start"]
876                 sense = readEntry["sense"]
877                 mismatches = readEntry["mismatch"]
878                 mismatchList = mismatches.split(",")
879                 for mismatch in mismatchList:
880                     if "N" in mismatch:
881                         continue
882
883                     change_len = len(mismatch)
884                     if sense == "+":
885                         change_from = mismatch[0]
886                         change_base = mismatch[change_len-1]
887                         change_pos = int(mismatch[1:change_len-1])
888                     elif sense == "-":
889                         change_from = getReverseComplement([mismatch[0]])
890                         change_base = getReverseComplement([mismatch[change_len-1]])
891                         change_pos = readlen - int(mismatch[1:change_len-1]) + 1
892
893                     change_at = start + change_pos - 1
894                     snpDict[achrom].append([start, change_at, change_base, change_from])
895
896         return snpDict
897
898
899     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
900                         useSplices=False, normalizationFactor=1.0, trackStrand=False,
901                         keepStrand="both", shiftValue=0):
902         """return a profile of the chromosome as an array of per-base read coverage....
903             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
904             Will also shift position of unique and multireads (but not splices) if shift is a natural number
905         """
906         metadata = self.getMetadata()
907         try:
908             readlen = int(metadata["readsize"])
909         except KeyError:
910             readlen = 0
911
912         dataType = metadata["dataType"]
913         scale = 1. / normalizationFactor
914         shift = {}
915         shift["+"] = int(shiftValue)
916         shift["-"] = -1 * int(shiftValue)
917
918         if cstop > 0:
919             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
920         else:
921             lastNT = cstop - cstart + readlen + shift["+"]
922
923         chromModel = array("f",[0.] * lastNT)
924         hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
925         if cstart < 0:
926             cstart = 0
927
928         for readEntry in hitDict[chromosome]:
929             hstart = readEntry["start"]
930             sense =  readEntry ["sense"]
931             weight = readEntry["weight"]
932             hstart = hstart - cstart + shift[sense]
933             for currentpos in range(hstart,hstart+readlen):
934                 try:
935                     if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
936                         chromModel[currentpos] += scale * weight
937                     elif sense == "-" and keepStrand != "plusOnly":
938                         chromModel[currentpos] -= scale * weight
939                 except:
940                     continue
941
942         del hitDict
943         if useSplices and dataType == "RNA":
944             if cstop > 0:
945                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
946             else:
947                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
948    
949             if chromosome in spliceDict:
950                 for spliceEntry in spliceDict[chromosome]:
951                     Lstart = spliceEntry["startL"]
952                     Lstop = spliceEntry["stopL"]
953                     Rstart = spliceEntry["startR"]
954                     Rstop = spliceEntry["stopR"]
955                     rsense = spliceEntry["sense"]
956                     if (Rstop - cstart) < lastNT:
957                         for index in range(abs(Lstop - Lstart)):
958                             currentpos = Lstart - cstart + index
959                             # we only track unique splices
960                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
961                                 chromModel[currentpos] += scale
962                             elif rsense == "-" and keepStrand != "plusOnly":
963                                 chromModel[currentpos] -= scale
964
965                         for index in range(abs(Rstop - Rstart)):
966                             currentpos = Rstart - cstart + index
967                             # we only track unique splices
968                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
969                                 chromModel[currentpos] += scale
970                             elif rsense == "-" and keepStrand != "plusOnly":
971                                 chromModel[currentpos] -= scale
972
973             del spliceDict
974
975         return chromModel
976
977
978     def insertMetadata(self, valuesList):
979         """ inserts a list of (pname, pvalue) into the metadata
980         table.
981         """
982         self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
983         self.dbcon.commit()
984
985
986     def updateMetadata(self, pname, newValue, originalValue=""):
987         """ update a metadata field given the original value and the new value.
988         """
989         stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
990         if originalValue != "":
991             stmt += " and value='%s' " % str(originalValue)
992
993         self.dbcon.execute(stmt)
994         self.dbcon.commit()
995
996
997     def insertUniqs(self, valuesList):
998         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
999         into the uniqs table.
1000         """
1001         self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1002         self.dbcon.commit()
1003
1004
1005     def insertMulti(self, valuesList):
1006         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1007         into the multi table.
1008         """
1009         self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1010         self.dbcon.commit()
1011
1012
1013     def insertSplices(self, valuesList):
1014         """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
1015         into the splices table.
1016         """
1017         self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
1018         self.dbcon.commit()
1019
1020
1021     def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
1022         """ update reads on file database in a list region of regions for a chromosome to have a new flag.
1023             regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
1024             sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
1025         """
1026         restrict = ""
1027         if sense != "both":
1028             restrict = " and sense = ? "
1029
1030         if uniqs:
1031             self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1032
1033         if multi:
1034             self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1035
1036         if self.dataType == "RNA" and splices:
1037             self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
1038             self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
1039
1040         self.dbcon.commit()
1041
1042
1043     def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1044         """ set the flag fields in the entire dataset.
1045         """
1046         if uniqs:
1047             self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1048
1049         if multi:
1050             self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1051
1052         if self.dataType == "RNA" and splices:
1053             self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
1054
1055         self.dbcon.commit()
1056
1057
1058     def resetFlags(self, uniqs=True, multi=True, splices=True):
1059         """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
1060         """
1061         self.setFlags("", uniqs, multi, splices)
1062
1063
1064     def reweighMultireads(self, readList):
1065         self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
1066
1067
1068     def setSynchronousPragma(self, value="ON"):
1069         try:
1070             self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1071         except:
1072             print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
1073
1074
1075     def setDBcache(self, cache, default=False):
1076         self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
1077         if default:
1078             self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
1079
1080
1081     def execute(self, statement, returnResults=False):
1082         sql = self.getSqlCursor()
1083
1084         sql.execute(statement)
1085         if returnResults:
1086             result = sql.fetchall()
1087             return result
1088
1089
1090     def executeCommit(self, statement):
1091         self.execute(statement)
1092
1093         if self.memBacked:
1094             self.memcon.commit()
1095         else:
1096             self.dbcon.commit()
1097
1098
1099     def buildIndex(self, cache=100000):
1100         """ Builds the file indeces for the main tables.
1101             Cache is the number of 1.5 kb pages to keep in memory.
1102             100000 pages translates into 150MB of RAM, which is our default.
1103         """
1104         if cache > self.getDefaultCacheSize():
1105             self.setDBcache(cache)
1106         self.setSynchronousPragma("OFF")
1107         self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1108         print "built uPosIndex"
1109         self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
1110         print "built uChromIndex"
1111         self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1112         print "built mPosIndex"
1113         self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
1114         print "built mChromIndex"
1115
1116         if self.dataType == "RNA":
1117             self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
1118             print "built sPosIndex"
1119             self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
1120             print "built sPosIndex2"
1121             self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
1122             print "built sChromIndex"
1123
1124         self.dbcon.commit()
1125         self.setSynchronousPragma("ON")
1126
1127
1128     def dropIndex(self):
1129         """ drops the file indices for the main tables.
1130         """
1131         try:
1132             self.setSynchronousPragma("OFF")
1133             self.dbcon.execute("DROP INDEX uPosIndex")
1134             self.dbcon.execute("DROP INDEX uChromIndex")
1135             self.dbcon.execute("DROP INDEX mPosIndex")
1136             self.dbcon.execute("DROP INDEX mChromIndex")
1137
1138             if self.dataType == "RNA":
1139                 self.dbcon.execute("DROP INDEX sPosIndex")
1140                 try:
1141                     self.dbcon.execute("DROP INDEX sPosIndex2")
1142                 except:
1143                     pass
1144
1145                 self.dbcon.execute("DROP INDEX sChromIndex")
1146
1147             self.dbcon.commit()
1148         except:
1149             print "problem dropping index"
1150
1151         self.setSynchronousPragma("ON")
1152
1153
1154     def memSync(self, chrom="", index=False):
1155         """ makes a copy of the dataset into memory for faster access.
1156         Can be restricted to a "full" chromosome. Can also build the
1157         memory indices.
1158         """
1159         self.memcon = ""
1160         self.memcon = sqlite.connect(":memory:")
1161         self.initializeTables(self.memcon)
1162         cursor = self.dbcon.cursor()
1163         whereclause = ""
1164         if chrom != "":
1165             print "memSync %s" % chrom
1166             whereclause = " where chrom = '%s' " % chrom
1167             self.memChrom = chrom
1168         else:
1169             self.memChrom = ""
1170
1171         self.memcon.execute("PRAGMA temp_store = MEMORY")
1172         self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
1173         # copy metadata to memory
1174         self.memcon.execute("delete from metadata")
1175         results = cursor.execute("select name, value from metadata")
1176         results2 = []
1177         for row in results:
1178             results2.append((row["name"], row["value"]))
1179
1180         self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
1181
1182         self.copyDBEntriesToMemory("uniqs", whereclause)
1183         self.copyDBEntriesToMemory("multi", whereclause)
1184         if self.dataType == "RNA":
1185             self.copySpliceDBEntriesToMemory(whereclause)
1186
1187         if index:
1188             if chrom != "":
1189                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
1190                 self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
1191                 if self.dataType == "RNA":
1192                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
1193                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
1194             else:
1195                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1196                 self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1197                 if self.dataType == "RNA":
1198                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
1199                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
1200
1201         self.memBacked = True
1202         self.memcon.row_factory = sqlite.Row
1203         self.memcon.commit()
1204
1205
1206     def copyDBEntriesToMemory(self, dbName, whereClause=""):
1207         cursor = self.dbcon.cursor()
1208         sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
1209         destinationEntries = []
1210         for row in sourceEntries:
1211             destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
1212
1213         self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
1214
1215
1216     def copySpliceDBEntriesToMemory(self, whereClause=""):
1217         cursor = self.dbcon.cursor()
1218         sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
1219         destinationEntries = []
1220         for row in sourceEntries:
1221             destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
1222                                        row["weight"], row["flag"], row["mismatch"]))
1223
1224         self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries)
1225