erange version 4.0a dev release
[erange.git] / ReadDataset.py
1 import sqlite3 as sqlite
2 import string
3 import tempfile
4 import shutil
5 import os
6 from array import array
7 from commoncode import getReverseComplement, getConfigParser, getConfigOption
8
9 currentRDSVersion = "2.0"
10
11
12 class ReadDatasetError(Exception):
13     pass
14
15
16 class ReadDataset():
17     """ Class for storing reads from experiments. Assumes that custom scripts
18     will translate incoming data into a format that can be inserted into the
19     class using the insert* methods. Default class subtype ('DNA') includes
20     tables for unique and multireads, whereas 'RNA' subtype also includes a
21     splices table.
22     """
23
24     def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False, 
25                  cache=False, reportCount=True):
26         """ creates an rds datafile if initialize is set to true, otherwise
27         will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
28         """
29         self.dbcon = ""
30         self.memcon = ""
31         self.dataType = ""
32         self.rdsVersion = currentRDSVersion
33         self.memBacked = False
34         self.memChrom = ""
35         self.memCursor = ""
36         self.cachedDBFile = ""
37
38         if cache:
39             if verbose:
40                 print "caching ...."
41
42             self.cacheDB(datafile)
43             dbFile = self.cachedDBFile
44         else:
45             dbFile = datafile
46
47         self.dbcon = sqlite.connect(dbFile)
48         self.dbcon.row_factory = sqlite.Row
49         self.dbcon.execute("PRAGMA temp_store = MEMORY")
50         if initialize:
51             if datasetType not in ["DNA", "RNA"]:
52                 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
53             else:
54                 self.dataType = datasetType
55
56             self.initializeTables(self.dbcon)
57         else:
58             metadata = self.getMetadata("dataType")
59             self.dataType = metadata["dataType"]
60
61         try:
62             metadata = self.getMetadata("rdsVersion")
63             self.rdsVersion = metadata["rdsVersion"]
64         except:
65             try:
66                 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
67             except IOError:
68                 print "could not add rdsVersion - read-only ?"
69                 self.rdsVersion = "pre-1.0"
70
71         if verbose:
72             if initialize:
73                 print "INITIALIZED dataset %s" % datafile
74             else:
75                 print "dataset %s" % datafile
76
77             metadata = self.getMetadata()
78             print "metadata:"
79             pnameList = metadata.keys()
80             pnameList.sort()
81             for pname in pnameList:
82                 print "\t" + pname + "\t" + metadata[pname]
83
84             if reportCount:
85                 ucount = self.getUniqsCount()
86                 mcount = self.getMultiCount()
87                 if self.dataType == "DNA" and not initialize:
88                     try:
89                         print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
90                     except ValueError:
91                         print "\n%s unique reads and %s multireads" % (ucount, mcount)
92                 elif self.dataType == "RNA" and not initialize:
93                     scount = self.getSplicesCount()
94                     try:
95                         print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
96                     except ValueError:
97                         print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
98
99             print "default cache size is %d pages" % self.getDefaultCacheSize()
100             if self.hasIndex():
101                 print "found index"
102             else:
103                 print "not indexed"
104
105
106     def __len__(self):
107         """ return the number of usable reads in the dataset.
108         """
109         total = self.getUniqsCount()
110         total += self.getMultiCount()
111
112         if self.dataType == "RNA":
113             total += self.getSplicesCount()
114
115         total = int(total)
116
117         return total
118
119
120     def __del__(self):
121         """ cleanup copy in local cache, if present.
122         """
123         if self.cachedDBFile != "":
124             self.uncacheDB()
125
126
127     def cacheDB(self, filename):
128         """ copy geneinfoDB to a local cache.
129         """
130         configParser = getConfigParser()
131         cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
132         tempfile.tempdir = cisTemp
133         self.cachedDBFile =  "%s.db" % tempfile.mktemp()
134         shutil.copyfile(filename, self.cachedDBFile)
135
136
137     def saveCacheDB(self, filename):
138         """ copy geneinfoDB to a local cache.
139         """
140         shutil.copyfile(self.cachedDBFile, filename)
141
142
143     def uncacheDB(self):
144         """ delete geneinfoDB from local cache.
145         """
146         global cachedDBFile
147         if self.cachedDBFile != "":
148             try:
149                 os.remove(self.cachedDBFile)
150             except:
151                 print "could not delete %s" % self.cachedDBFile
152
153             self.cachedDB = ""
154
155
156     def attachDB(self, filename, asname):
157         """ attach another database file to the readDataset.
158         """
159         stmt = "attach '%s' as %s" % (filename, asname)
160         self.execute(stmt)
161
162
163     def detachDB(self, asname):
164         """ detach a database file to the readDataset.
165         """
166         stmt = "detach %s" % (asname)
167         self.execute(stmt)
168
169
170     def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
171         """ import into current RDS the table (with columns destcolumns,
172             with default all columns) from the database file asname,
173             using the column specification of ascolumns (default all).
174         """
175         stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
176         if flagged != "":
177             stmt += " where flag = '%s' " % flagged
178
179         self.executeCommit(stmt)
180
181
182     def getTables(self, asname=""):
183         """ get a list of table names in a particular database file.
184         """
185         resultList = []
186         sql = self.getSqlCursor()
187
188         if asname != "":
189             asname += "."
190
191         stmt = "select name from %ssqlite_master where type='table'" % asname
192         sql.execute(stmt)
193         results = sql.fetchall()
194
195         for row in results:
196             resultList.append(row["name"])
197
198         return resultList
199
200
201     def getSqlCursor(self):
202         if self.memBacked:
203             sql = self.getMemCursor()
204         else:
205             sql = self.getFileCursor()
206
207         return sql
208
209
210     def hasIndex(self):
211         """ check whether the RDS file has at least one index.
212         """
213         stmt = "select count(*) from sqlite_master where type='index'"
214         count = int(self.execute(stmt, returnResults=True)[0][0])
215         if count > 0:
216             return True
217
218         return False
219
220
221     def initializeTables(self, dbConnection, cache=100000):
222         """ creates table schema in a database connection, which is
223         typically a database file or an in-memory database.
224         """
225         dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
226         dbConnection.execute("create table metadata (name varchar, value varchar)")
227         dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
228         positionSchema = "start int, stop int"
229         tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
230         dbConnection.execute("create table uniqs %s" % tableSchema)
231         dbConnection.execute("create table multi %s" % tableSchema)
232         if self.dataType == "RNA":
233             positionSchema = "startL int, stopL int, startR int, stopR int"
234             tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
235             dbConnection.execute("create table splices %s" % tableSchema)
236
237         dbConnection.commit()
238
239
240     def getFileCursor(self):
241         """ returns a cursor to file database for low-level (SQL)
242         access to the data.
243         """
244         return self.dbcon.cursor()
245
246
247     def getMemCursor(self):
248         """ returns a cursor to memory database for low-level (SQL)
249         access to the data.
250         """
251         return self.memcon.cursor()
252
253
254     def getMetadata(self, valueName=""):
255         """ returns a dictionary of metadata.
256         """
257         whereClause = ""
258         resultsDict = {}
259
260         if valueName != "":
261             whereClause = " where name='%s'" % valueName
262
263         sql = self.getSqlCursor()
264
265         sql.execute("select name, value from metadata %s" % whereClause)
266         results = sql.fetchall()
267
268         for row in results:
269             parameterName = row["name"]
270             parameterValue = row["value"]
271             if parameterName not in resultsDict:
272                 resultsDict[parameterName] = parameterValue
273             else:
274                 trying = True
275                 index = 2
276                 while trying:
277                     newName = string.join([parameterName, str(index)], ":")
278                     if newName not in resultsDict:
279                         resultsDict[newName] = parameterValue
280                         trying = False
281
282                     index += 1
283
284         return resultsDict
285
286
287     def getReadSize(self):
288         """ returns readsize if defined in metadata.
289         """
290         metadata = self.getMetadata()
291         if "readsize" not in metadata:
292             raise ReadDatasetError("no readsize parameter defined")
293         else:
294             mysize = metadata["readsize"]
295             if "import" in mysize:
296                 mysize = mysize.split()[0]
297
298             return int(mysize)
299
300
301     def getDefaultCacheSize(self):
302         """ returns the default cache size.
303         """
304         return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
305
306
307     def getChromosomes(self, table="uniqs", fullChrom=True):
308         """ returns a list of distinct chromosomes in table.
309         """
310         statement = "select distinct chrom from %s" % table
311         sql = self.getSqlCursor()
312
313         sql.execute(statement)
314         results = []
315         for row in sql:
316             if fullChrom:
317                 if row["chrom"] not in results:
318                     results.append(row["chrom"])
319             else:
320                 if  len(row["chrom"][3:].strip()) < 1:
321                     continue
322
323                 if row["chrom"][3:] not in results:
324                     results.append(row["chrom"][3:])
325
326         results.sort()
327
328         return results
329
330
331     def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
332                          doMulti=False, doSplices=False):
333         """ returns the maximum coordinate for reads on a given chromosome.
334         """
335         maxCoord = 0
336         sql = self.getSqlCursor()
337
338         if doUniqs:
339             try:
340                 sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
341                 maxCoord = int(sql.fetchall()[0][0])
342             except:
343                 print "couldn't retrieve coordMax for chromosome %s" % chrom
344
345         if doSplices:
346             sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
347             try:
348                 spliceMax = int(sql.fetchall()[0][0])
349                 if spliceMax > maxCoord:
350                     maxCoord = spliceMax
351             except:
352                 pass
353
354         if doMulti:
355             sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
356             try:
357                 multiMax = int(sql.fetchall()[0][0])
358                 if multiMax > maxCoord:
359                     maxCoord = multiMax
360             except:
361                 pass
362
363         if verbose:
364             print "%s maxCoord: %d" % (chrom, maxCoord)
365
366         return maxCoord
367
368
369     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
370                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
371                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
372                      readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
373                      flagLike=False, strand='', combine5p=False):
374         """ returns a dictionary of reads in a variety of formats
375         and which can be restricted by chromosome or custom-flag.
376         Returns unique reads by default, but can return multireads
377         with doMulti set to True.
378         
379         Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
380         """
381         whereClause = []
382         resultsDict = {}
383
384         if chrom != "" and chrom != self.memChrom:
385             whereClause.append("chrom = '%s'" % chrom)
386
387         if flag != "":
388             if flagLike:
389                 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
390                 whereClause.append(flagLikeClause)
391             else:
392                 whereClause.append("flag = '%s'" % flag)
393
394         if start > -1:
395             whereClause.append("start > %d" % start)
396
397         if stop > -1:
398             whereClause.append("stop < %d" % stop)
399
400         if len(readLike) > 0:
401             readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
402             whereClause.append(readIDClause)
403
404         if hasMismatch:
405             whereClause.append("mismatch != ''")
406
407         if strand in ["+", "-"]:
408             whereClause.append("sense = '%s'" % strand)
409
410         if len(whereClause) > 0:
411             whereStatement = string.join(whereClause, " and ")
412             whereQuery = "where %s" % whereStatement
413         else:
414             whereQuery = ""
415
416         groupBy = []
417         if findallOptimize:
418             selectClause = ["select start, sense, sum(weight)"]
419             groupBy = ["GROUP BY start, sense"]
420         else:
421             selectClause = ["select ID, chrom, start, readID"]
422             if bothEnds:
423                 selectClause.append("stop")
424
425             if not noSense:
426                 selectClause.append("sense")
427
428             if withWeight:
429                 selectClause.append("weight")
430
431             if withFlag:
432                 selectClause.append("flag")
433
434             if withMismatch:
435                 selectClause.append("mismatch")
436
437         if limit > 0 and not combine5p:
438             groupBy.append("LIMIT %d" % limit)
439
440         selectQuery = string.join(selectClause, ",")
441         groupQuery = string.join(groupBy)
442         if doUniqs:
443             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
444             if doMulti:
445                 stmt.append("UNION ALL")
446                 stmt.append(selectQuery)
447                 stmt.append("from multi")
448                 stmt.append(whereQuery)
449                 stmt.append(groupQuery)
450         else:
451             stmt = [selectQuery, "from multi", whereQuery]
452
453         if combine5p:
454             if findallOptimize:
455                 selectQuery = "select start, sense, weight, chrom"
456
457             if doUniqs:
458                 subSelect = [selectQuery, "from uniqs", whereQuery]
459                 if doMulti:
460                     subSelect.append("union all")
461                     subSelect.append(selectQuery)
462                     subSelect.append("from multi")
463                     subSelect.append(whereQuery)
464             else:
465                 subSelect = [selectQuery, "from multi", whereQuery]
466
467             sqlStmt = string.join(subSelect)
468             if findallOptimize:
469                 selectQuery = "select start, sense, sum(weight)"
470
471             stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
472                     selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
473
474         if findallOptimize:
475             if self.memBacked:
476                 self.memcon.row_factory = None
477                 sql = self.memcon.cursor()
478             else:
479                 self.dbcon.row_factory = None
480                 sql = self.dbcon.cursor()
481
482             stmt.append("order by start")
483         elif readIDDict:
484             if self.memBacked:
485                 sql = self.memcon.cursor()
486             else:
487                 sql = self.dbcon.cursor()
488
489             stmt.append("order by readID, start")
490         else:
491             if self.memBacked:
492                 sql = self.memcon.cursor()
493             else:
494                 sql = self.dbcon.cursor()
495
496             stmt.append("order by chrom, start")
497
498         sqlQuery = string.join(stmt)
499         sql.execute(sqlQuery)
500
501         if findallOptimize:
502             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
503             if self.memBacked:
504                 self.memcon.row_factory = sqlite.Row
505             else:
506                 self.dbcon.row_factory = sqlite.Row
507         else:
508             currentChrom = ""
509             currentReadID = ""
510             pairID = 0
511             for row in sql:
512                 readID = row["readID"]
513                 if fullChrom:
514                     chrom = row["chrom"]
515                 else:
516                     chrom = row["chrom"][3:]
517
518                 if not readIDDict and chrom != currentChrom:
519                     resultsDict[chrom] = []
520                     currentChrom = chrom
521                     dictKey = chrom
522                 elif readIDDict:
523                     theReadID = readID
524                     if "::" in readID:
525                         theReadID = readID.split("::")[0]
526
527                     if "/" in theReadID and withPairID:
528                         (theReadID, pairID) = readID.split("/")
529
530                     if theReadID != currentReadID:
531                         resultsDict[theReadID] = []
532                         currentReadID = theReadID
533                         dictKey = theReadID
534
535                 newrow = {"start": int(row["start"])}
536                 if bothEnds:
537                     newrow["stop"] = int(row["stop"])
538
539                 if not noSense:
540                     newrow["sense"] = row["sense"]
541
542                 if withWeight:
543                     newrow["weight"] = float(row["weight"])
544
545                 if withFlag:
546                     newrow["flag"] = row["flag"]
547
548                 if withMismatch:
549                     newrow["mismatch"] = row["mismatch"]
550
551                 if withID:
552                     newrow["readID"] = readID
553
554                 if withChrom:
555                     newrow["chrom"] = chrom
556
557                 if withPairID:
558                     newrow["pairID"] = pairID
559
560                 resultsDict[dictKey].append(newrow)
561
562         return resultsDict
563
564
565     def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
566                        flag="", withWeight=False, withFlag=False, withMismatch=False,
567                        withID=False, withChrom=False, withPairID=False, readIDDict=False,
568                        splitRead=False, hasMismatch=False, flagLike=False, start=-1,
569                        stop=-1, strand=""):
570         """ returns a dictionary of spliced reads in a variety of
571         formats and which can be restricted by chromosome or custom-flag.
572         Returns unique spliced reads for now.
573         """
574         whereClause = []
575         resultsDict = {}
576
577         if chrom != "" and chrom != self.memChrom:
578             whereClause = ["chrom = '%s'" % chrom]
579
580         if flag != "":
581             if flagLike:
582                 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
583                 whereClause.append(flagLikeClause)
584             else:
585                 whereClause.append("flag = '%s'" % flag)
586
587         if hasMismatch:
588             whereClause.append("mismatch != ''")
589
590         if strand != "":
591             whereClause.append("sense = '%s'" % strand)
592
593         if start > -1:
594             whereClause.append("startL > %d" % start)
595
596         if stop > -1:
597             whereClause.append("stopR < %d" % stop)
598
599         if len(whereClause) > 0:
600             whereStatement = string.join(whereClause, " and ")
601             whereQuery = "where %s" % whereStatement
602         else:
603             whereQuery = ""
604
605         selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
606         if not noSense:
607             selectClause.append("sense")
608
609         if withWeight:
610             selectClause.append("weight")
611
612         if withFlag:
613             selectClause.append("flag")
614
615         if withMismatch:
616             selectClause.append("mismatch")
617
618         selectQuery = string.join(selectClause, " ,")
619         if self.memBacked:
620             sql = self.memcon.cursor()
621         else:
622             sql = self.dbcon.cursor()
623
624         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
625         sql.execute(stmt)
626         currentReadID = ""
627         currentChrom = ""
628         for row in sql:
629             pairID = 0
630             readID = row["readID"]
631             if fullChrom:
632                 chrom = row["chrom"]
633             else:
634                 chrom = row["chrom"][3:]
635
636             if not readIDDict and chrom != currentChrom:
637                 resultsDict[chrom] = []
638                 currentChrom = chrom
639                 dictKey = chrom
640             elif readIDDict:
641                 if "/" in readID:
642                     (theReadID, pairID) = readID.split("/")
643                 else:
644                     theReadID = readID
645
646                 if theReadID != currentReadID:
647                     resultsDict[theReadID] = []
648                     currentReadID = theReadID
649                     dictKey = theReadID
650
651             newrow = {"startL": int(row["startL"])}
652             newrow["stopL"] = int(row["stopL"])
653             newrow["startR"] = int(row["startR"])
654             newrow["stopR"] = int(row["stopR"])
655             if not noSense:
656                 newrow["sense"] = row["sense"]
657
658             if withWeight:
659                 newrow["weight"] = float(row["weight"])
660
661             if withFlag:
662                 newrow["flag"] = row["flag"]
663
664             if withMismatch:
665                 newrow["mismatch"] = row["mismatch"]
666
667             if withID:
668                 newrow["readID"] = readID
669
670             if withChrom:
671                 newrow["chrom"] = chrom
672
673             if withPairID:
674                 newrow["pairID"] = pairID
675
676             if splitRead:
677                 leftDict = newrow.copy()
678                 del leftDict["startR"]
679                 del leftDict["stopR"]
680                 rightDict = newrow
681                 del rightDict["startL"]
682                 del rightDict["stopL"]
683                 resultsDict[dictKey].append(leftDict)
684                 resultsDict[dictKey].append(rightDict)
685             else:
686                 resultsDict[dictKey].append(newrow)
687
688         return resultsDict
689
690
691     def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
692                   splices=False, reportCombined=True, sense="both"):
693         """ return read counts for a given region.
694         """
695         ucount = 0
696         mcount = 0
697         scount = 0
698         restrict = ""
699         if sense in ["+", "-"]:
700             restrict = " sense ='%s' " % sense
701
702         if uniqs:
703             try:
704                 ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
705             except:
706                 ucount = 0
707
708         if multi:
709             try:
710                 mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
711             except:
712                 mcount = 0
713
714         if splices:
715             try:
716                 scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
717             except:
718                 scount = 0
719
720         if reportCombined:
721             total = ucount + mcount + scount
722             return total
723         else:
724             return (ucount, mcount, scount)
725
726
727     def getTotalCounts(self, chrom="", rmin="", rmax=""):
728         """ return read counts for a given region.
729         """
730         return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
731
732
733     def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
734         """ returns the number of row in the uniqs table.
735         """
736         whereClause = []
737         count = 0
738
739         if chrom !=""  and chrom != self.memChrom:
740             whereClause = ["chrom='%s'" % chrom]
741
742         if rmin != "":
743             whereClause.append("%s >= %s" % (startField, str(rmin)))
744
745         if rmax != "":
746             whereClause.append("%s <= %s" % (startField, str(rmax)))
747
748         if restrict != "":
749             whereClause.append(restrict)
750
751         if len(whereClause) > 0:
752             whereStatement = string.join(whereClause, " and ")
753             whereQuery = "where %s" % whereStatement
754         else:
755             whereQuery = ""
756
757         if self.memBacked:
758             sql = self.memcon.cursor()
759         else:
760             sql = self.dbcon.cursor()
761
762         if distinct:
763             sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
764         else:
765             sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
766
767         result = sql.fetchone()
768
769         try:
770             count = int(result[0])
771         except:
772             count = 0
773
774         return count
775
776
777     def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
778         """ returns the number of row in the splices table.
779         """
780         return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
781
782
783     def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
784         """ returns the number of distinct readIDs in the uniqs table.
785         """
786         return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
787
788
789     def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
790         """ returns the total weight of readIDs in the multi table.
791         """
792         return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
793
794
795     def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
796         """ get readID's.
797         """
798         stmt = []
799         limitPart = ""
800         if limit > 0:
801             limitPart = "LIMIT %d" % limit
802
803         if uniqs:
804             stmt.append("select readID from uniqs")
805
806         if multi:
807             stmt.append("select readID from multi")
808
809         if splices:
810             stmt.append("select readID from splices")
811
812         if len(stmt) > 0:
813             selectPart = string.join(stmt, " union ")
814         else:
815             selectPart = ""
816
817         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
818         if self.memBacked:
819             sql = self.memcon.cursor()
820         else:
821             sql = self.dbcon.cursor()
822
823         sql.execute(sqlQuery)
824         result = sql.fetchall()
825
826         if paired:
827             return [x[0].split("/")[0] for x in result]
828         else:
829             return [x[0] for x in result]
830
831
832     def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
833         """ returns the uniq and spliced mismatches in a dictionary.
834         """
835         readlen = self.getReadSize()
836         if mischrom:
837             hitChromList = [mischrom]
838         else:
839             hitChromList = self.getChromosomes()
840             hitChromList.sort()
841
842         snpDict = {}
843         for achrom in hitChromList:
844             if verbose:
845                 print "getting mismatches from chromosome %s" % (achrom)
846
847             snpDict[achrom] = []
848             hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
849             if useSplices and self.dataType == "RNA":
850                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
851                 spliceIDList = spliceDict.keys()
852                 for k in spliceIDList:
853                     spliceEntry = spliceDict[k][0]
854                     startpos = spliceEntry["startL"]
855                     lefthalf = spliceEntry["stopL"]
856                     rightstart = spliceEntry["startR"]
857                     sense = spliceEntry["sense"]
858                     mismatches = spliceEntry["mismatch"]
859                     spMismatchList = mismatches.split(",")
860                     for mismatch in spMismatchList:
861                         if "N" in mismatch:
862                             continue
863
864                         change_len = len(mismatch)
865                         if sense == "+":
866                             change_from = mismatch[0]
867                             change_base = mismatch[change_len-1]
868                             change_pos = int(mismatch[1:change_len-1])
869                         elif sense == "-":
870                             change_from = getReverseComplement([mismatch[0]])
871                             change_base = getReverseComplement([mismatch[change_len-1]])
872                             change_pos = readlen - int(mismatch[1:change_len-1]) + 1
873
874                         firsthalf = int(lefthalf)-int(startpos)+1
875                         secondhalf = 0
876                         if int(change_pos) <= int(firsthalf):
877                             change_at = startpos + change_pos - 1
878                         else:
879                             secondhalf = change_pos - firsthalf
880                             change_at = rightstart + secondhalf
881
882                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
883
884             if achrom not in hitDict.keys():
885                 continue
886
887             for readEntry in hitDict[achrom]:
888                 start = readEntry["start"]
889                 sense = readEntry["sense"]
890                 mismatches = readEntry["mismatch"]
891                 mismatchList = mismatches.split(",")
892                 for mismatch in mismatchList:
893                     if "N" in mismatch:
894                         continue
895
896                     change_len = len(mismatch)
897                     if sense == "+":
898                         change_from = mismatch[0]
899                         change_base = mismatch[change_len-1]
900                         change_pos = int(mismatch[1:change_len-1])
901                     elif sense == "-":
902                         change_from = getReverseComplement([mismatch[0]])
903                         change_base = getReverseComplement([mismatch[change_len-1]])
904                         change_pos = readlen - int(mismatch[1:change_len-1]) + 1
905
906                     change_at = start + change_pos - 1
907                     snpDict[achrom].append([start, change_at, change_base, change_from])
908
909         return snpDict
910
911
912     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
913                         useSplices=False, normalizationFactor = 1.0, trackStrand=False,
914                         keepStrand="both", shiftValue=0):
915         """return a profile of the chromosome as an array of per-base read coverage....
916             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
917             Will also shift position of unique and multireads (but not splices) if shift is a natural number
918         """
919         metadata = self.getMetadata()
920         try:
921             readlen = int(metadata["readsize"])
922         except KeyError:
923             readlen = 0
924
925         dataType = metadata["dataType"]
926         scale = 1. / normalizationFactor
927         shift = {}
928         shift['+'] = int(shiftValue)
929         shift['-'] = -1 * int(shiftValue)
930
931         if cstop > 0:
932             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
933         else:
934             lastNT = cstop - cstart + readlen + shift["+"]
935
936         chromModel = array("f",[0.] * lastNT)
937         hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
938         if cstart < 0:
939             cstart = 0
940
941         for readEntry in hitDict[chromosome]:
942             hstart = readEntry["start"]
943             sense =  readEntry ["sense"]
944             weight = readEntry["weight"]
945             hstart = hstart - cstart + shift[sense]
946             for currentpos in range(hstart,hstart+readlen):
947                 try:
948                     if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
949                         chromModel[currentpos] += scale * weight
950                     elif sense == "-" and keepStrand != "plusOnly":
951                         chromModel[currentpos] -= scale * weight
952                 except:
953                     continue
954
955         del hitDict
956         if useSplices and dataType == "RNA":
957             if cstop > 0:
958                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
959             else:
960                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
961    
962             if chromosome in spliceDict:
963                 for spliceEntry in spliceDict[chromosome]:
964                     Lstart = spliceEntry["startL"]
965                     Lstop = spliceEntry["stopL"]
966                     Rstart = spliceEntry["startR"]
967                     Rstop = spliceEntry["stopR"]
968                     rsense = spliceEntry["sense"]
969                     if (Rstop - cstart) < lastNT:
970                         for index in range(abs(Lstop - Lstart)):
971                             currentpos = Lstart - cstart + index
972                             # we only track unique splices
973                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
974                                 chromModel[currentpos] += scale
975                             elif rsense == "-" and keepStrand != "plusOnly":
976                                 chromModel[currentpos] -= scale
977
978                         for index in range(abs(Rstop - Rstart)):
979                             currentpos = Rstart - cstart + index
980                             # we only track unique splices
981                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
982                                 chromModel[currentpos] += scale
983                             elif rsense == "-" and keepStrand != "plusOnly":
984                                 chromModel[currentpos] -= scale
985
986             del spliceDict
987
988         return chromModel
989
990
991     def insertMetadata(self, valuesList):
992         """ inserts a list of (pname, pvalue) into the metadata
993         table.
994         """
995         self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
996         self.dbcon.commit()
997
998
999     def updateMetadata(self, pname, newValue, originalValue=""):
1000         """ update a metadata field given the original value and the new value.
1001         """
1002         stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
1003         if originalValue != "":
1004             stmt += " and value='%s' " % str(originalValue)
1005
1006         self.dbcon.execute(stmt)
1007         self.dbcon.commit()
1008
1009
1010     def insertUniqs(self, valuesList):
1011         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1012         into the uniqs table.
1013         """
1014         self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1015         self.dbcon.commit()
1016
1017
1018     def insertMulti(self, valuesList):
1019         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1020         into the multi table.
1021         """
1022         self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1023         self.dbcon.commit()
1024
1025
1026     def insertSplices(self, valuesList):
1027         """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
1028         into the splices table.
1029         """
1030         self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
1031         self.dbcon.commit()
1032
1033
1034     def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
1035         """ update reads on file database in a list region of regions for a chromosome to have a new flag.
1036             regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
1037             sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
1038         """
1039         restrict = ""
1040         if sense != "both":
1041             restrict = " and sense = ? "
1042
1043         if uniqs:
1044             self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1045
1046         if multi:
1047             self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1048
1049         if self.dataType == "RNA" and splices:
1050             self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
1051             self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
1052
1053         self.dbcon.commit()
1054
1055
1056     def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1057         """ set the flag fields in the entire dataset.
1058         """
1059         if uniqs:
1060             self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1061
1062         if multi:
1063             self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1064
1065         if self.dataType == "RNA" and splices:
1066             self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
1067
1068         self.dbcon.commit()
1069
1070
1071     def resetFlags(self, uniqs=True, multi=True, splices=True):
1072         """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
1073         """
1074         self.setFlags("", uniqs, multi, splices)
1075
1076
1077     def reweighMultireads(self, readList):
1078         self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
1079
1080
1081     def setSynchronousPragma(self, value="ON"):
1082         try:
1083             self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1084         except:
1085             print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
1086
1087
1088     def setDBcache(self, cache, default=False):
1089         self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
1090         if default:
1091             self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
1092
1093
1094     def execute(self, statement, returnResults=False):
1095         sql = self.getSqlCursor()
1096
1097         sql.execute(statement)
1098         if returnResults:
1099             result = sql.fetchall()
1100             return result
1101
1102
1103     def executeCommit(self, statement):
1104         self.execute(statement)
1105
1106         if self.memBacked:
1107             self.memcon.commit()
1108         else:
1109             self.dbcon.commit()
1110
1111
1112     def buildIndex(self, cache=100000):
1113         """ Builds the file indeces for the main tables.
1114             Cache is the number of 1.5 kb pages to keep in memory.
1115             100000 pages translates into 150MB of RAM, which is our default.
1116         """
1117         if cache > self.getDefaultCacheSize():
1118             self.setDBcache(cache)
1119         self.setSynchronousPragma("OFF")
1120         self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1121         print "built uPosIndex"
1122         self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
1123         print "built uChromIndex"
1124         self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1125         print "built mPosIndex"
1126         self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
1127         print "built mChromIndex"
1128
1129         if self.dataType == "RNA":
1130             self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
1131             print "built sPosIndex"
1132             self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
1133             print "built sPosIndex2"
1134             self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
1135             print "built sChromIndex"
1136
1137         self.dbcon.commit()
1138         self.setSynchronousPragma("ON")
1139
1140
1141     def dropIndex(self):
1142         """ drops the file indices for the main tables.
1143         """
1144         try:
1145             self.setSynchronousPragma("OFF")
1146             self.dbcon.execute("DROP INDEX uPosIndex")
1147             self.dbcon.execute("DROP INDEX uChromIndex")
1148             self.dbcon.execute("DROP INDEX mPosIndex")
1149             self.dbcon.execute("DROP INDEX mChromIndex")
1150
1151             if self.dataType == "RNA":
1152                 self.dbcon.execute("DROP INDEX sPosIndex")
1153                 try:
1154                     self.dbcon.execute("DROP INDEX sPosIndex2")
1155                 except:
1156                     pass
1157
1158                 self.dbcon.execute("DROP INDEX sChromIndex")
1159
1160             self.dbcon.commit()
1161         except:
1162             print "problem dropping index"
1163
1164         self.setSynchronousPragma("ON")
1165
1166
1167     def memSync(self, chrom="", index=False):
1168         """ makes a copy of the dataset into memory for faster access.
1169         Can be restricted to a "full" chromosome. Can also build the
1170         memory indices.
1171         """
1172         self.memcon = ""
1173         self.memcon = sqlite.connect(":memory:")
1174         self.initializeTables(self.memcon)
1175         cursor = self.dbcon.cursor()
1176         whereclause = ""
1177         if chrom != "":
1178             print "memSync %s" % chrom
1179             whereclause = " where chrom = '%s' " % chrom
1180             self.memChrom = chrom
1181         else:
1182             self.memChrom = ""
1183
1184         self.memcon.execute("PRAGMA temp_store = MEMORY")
1185         self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
1186         # copy metadata to memory
1187         self.memcon.execute("delete from metadata")
1188         results = cursor.execute("select name, value from metadata")
1189         results2 = []
1190         for row in results:
1191             results2.append((row["name"], row["value"]))
1192
1193         self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
1194
1195         self.copyDBEntriesToMemory("uniqs", whereclause)
1196         self.copyDBEntriesToMemory("multi", whereclause)
1197         if self.dataType == "RNA":
1198             self.copySpliceDBEntriesToMemory(whereclause)
1199
1200         if index:
1201             if chrom != "":
1202                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
1203                 self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
1204                 if self.dataType == "RNA":
1205                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
1206                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
1207             else:
1208                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1209                 self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1210                 if self.dataType == "RNA":
1211                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
1212                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
1213
1214         self.memBacked = True
1215         self.memcon.row_factory = sqlite.Row
1216         self.memcon.commit()
1217
1218
1219     def copyDBEntriesToMemory(self, dbName, whereClause=""):
1220         cursor = self.dbcon.cursor()
1221         sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
1222         destinationEntries = []
1223         for row in sourceEntries:
1224             destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
1225
1226         self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
1227
1228
1229     def copySpliceDBEntriesToMemory(self, whereClause=""):
1230         cursor = self.dbcon.cursor()
1231         sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
1232         destinationEntries = []
1233         for row in sourceEntries:
1234             destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
1235                                        row["weight"], row["flag"], row["mismatch"]))
1236
1237         self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries)
1238