c9d2a0bf2b4eb56e0fa906be2319fcac866c30b3
[erange.git] / ReadDataset.py
1 import sqlite3 as sqlite
2 import string
3 import tempfile
4 import shutil
5 import os
6 from array import array
7 from commoncode import getReverseComplement, getConfigParser, getConfigOption
8
9 currentRDSVersion = "2.0"
10
11
12 class ReadDatasetError(Exception):
13     pass
14
15
16 class ReadDataset():
17     """ Class for storing reads from experiments. Assumes that custom scripts
18     will translate incoming data into a format that can be inserted into the
19     class using the insert* methods. Default class subtype ('DNA') includes
20     tables for unique and multireads, whereas 'RNA' subtype also includes a
21     splices table.
22     """
23
24     def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False, 
25                  cache=False, reportCount=True):
26         """ creates an rds datafile if initialize is set to true, otherwise
27         will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
28         """
29         self.dbcon = ""
30         self.memcon = ""
31         self.dataType = ""
32         self.rdsVersion = currentRDSVersion
33         self.memBacked = False
34         self.memChrom = ""
35         self.memCursor = ""
36         self.cachedDBFile = ""
37
38         if cache:
39             if verbose:
40                 print "caching ...."
41
42             self.cacheDB(datafile)
43             dbFile = self.cachedDBFile
44         else:
45             dbFile = datafile
46
47         self.dbcon = sqlite.connect(dbFile)
48         self.dbcon.row_factory = sqlite.Row
49         self.dbcon.execute("PRAGMA temp_store = MEMORY")
50         if initialize:
51             if datasetType not in ["DNA", "RNA"]:
52                 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
53             else:
54                 self.dataType = datasetType
55
56             self.initializeTables(self.dbcon)
57         else:
58             metadata = self.getMetadata("dataType")
59             self.dataType = metadata["dataType"]
60
61         try:
62             metadata = self.getMetadata("rdsVersion")
63             self.rdsVersion = metadata["rdsVersion"]
64         except:
65             try:
66                 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
67             except IOError:
68                 print "could not add rdsVersion - read-only ?"
69                 self.rdsVersion = "pre-1.0"
70
71         if verbose:
72             if initialize:
73                 print "INITIALIZED dataset %s" % datafile
74             else:
75                 print "dataset %s" % datafile
76
77             metadata = self.getMetadata()
78             print "metadata:"
79             pnameList = metadata.keys()
80             pnameList.sort()
81             for pname in pnameList:
82                 print "\t" + pname + "\t" + metadata[pname]
83
84             if reportCount:
85                 ucount = self.getUniqsCount()
86                 mcount = self.getMultiCount()
87                 if self.dataType == "DNA" and not initialize:
88                     try:
89                         print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
90                     except ValueError:
91                         print "\n%s unique reads and %s multireads" % (ucount, mcount)
92                 elif self.dataType == "RNA" and not initialize:
93                     scount = self.getSplicesCount()
94                     try:
95                         print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
96                     except ValueError:
97                         print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
98
99             print "default cache size is %d pages" % self.getDefaultCacheSize()
100             if self.hasIndex():
101                 print "found index"
102             else:
103                 print "not indexed"
104
105
106     def __len__(self):
107         """ return the number of usable reads in the dataset.
108         """
109         total = self.getUniqsCount()
110         total += self.getMultiCount()
111
112         if self.dataType == "RNA":
113             total += self.getSplicesCount()
114
115         total = int(total)
116
117         return total
118
119
120     def __del__(self):
121         """ cleanup copy in local cache, if present.
122         """
123         if self.cachedDBFile != "":
124             self.uncacheDB()
125
126
127     def cacheDB(self, filename):
128         """ copy geneinfoDB to a local cache.
129         """
130         configParser = getConfigParser()
131         cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
132         tempfile.tempdir = cisTemp
133         self.cachedDBFile =  "%s.db" % tempfile.mktemp()
134         shutil.copyfile(filename, self.cachedDBFile)
135
136
137     def saveCacheDB(self, filename):
138         """ copy geneinfoDB to a local cache.
139         """
140         shutil.copyfile(self.cachedDBFile, filename)
141
142
143     def uncacheDB(self):
144         """ delete geneinfoDB from local cache.
145         """
146         global cachedDBFile
147         if self.cachedDBFile != "":
148             try:
149                 os.remove(self.cachedDBFile)
150             except:
151                 print "could not delete %s" % self.cachedDBFile
152
153             self.cachedDB = ""
154
155
156     def attachDB(self, filename, dbName):
157         """ attach another database file to the readDataset.
158         """
159         stmt = "attach '%s' as %s" % (filename, dbName)
160         self.execute(stmt)
161
162
163     def detachDB(self, dbName):
164         """ detach a database file to the readDataset.
165         """
166         stmt = "detach %s" % (dbName)
167         self.execute(stmt)
168
169
170     def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
171         """ import into current RDS the table (with columns destcolumns,
172             with default all columns) from the database file asname,
173             using the column specification of ascolumns (default all).
174         """
175         stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
176         if flagged != "":
177             stmt += " where flag = '%s' " % flagged
178
179         self.executeCommit(stmt)
180
181
182     def getTables(self, dbName=""):
183         """ get a list of table names in a particular database file.
184         """
185         resultList = []
186         sql = self.getSqlCursor()
187
188         if dbName != "":
189             dbName = "%s." % dbName
190
191         stmt = "select name from %ssqlite_master where type='table'" % dbName
192         sql.execute(stmt)
193         results = sql.fetchall()
194
195         for row in results:
196             resultList.append(row["name"])
197
198         return resultList
199
200
201     def getSqlCursor(self):
202         if self.memBacked:
203             sql = self.getMemCursor()
204         else:
205             sql = self.getFileCursor()
206
207         return sql
208
209
210     def getMemCursor(self):
211         """ returns a cursor to memory database for low-level (SQL)
212         access to the data.
213         """
214         return self.memcon.cursor()
215
216
217     def getFileCursor(self):
218         """ returns a cursor to file database for low-level (SQL)
219         access to the data.
220         """
221         return self.dbcon.cursor()
222
223
224     def hasIndex(self):
225         """ return True if the RDS file has at least one index.
226         """
227         stmt = "select count(*) from sqlite_master where type='index'"
228         count = int(self.execute(stmt, returnResults=True)[0][0])
229
230         return count > 0
231
232
233     def initializeTables(self, dbConnection, cache=100000):
234         """ creates table schema in a database connection, which is
235         typically a database file or an in-memory database.
236         """
237         dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
238         dbConnection.execute("create table metadata (name varchar, value varchar)")
239         dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
240         positionSchema = "start int, stop int"
241         tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
242         dbConnection.execute("create table uniqs %s" % tableSchema)
243         dbConnection.execute("create table multi %s" % tableSchema)
244         if self.dataType == "RNA":
245             positionSchema = "startL int, stopL int, startR int, stopR int"
246             tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
247             dbConnection.execute("create table splices %s" % tableSchema)
248
249         dbConnection.commit()
250
251
252     def getMetadata(self, valueName=""):
253         """ returns a dictionary of metadata.
254         """
255         whereClause = ""
256         resultsDict = {}
257
258         if valueName != "":
259             whereClause = " where name='%s'" % valueName
260
261         sql = self.getSqlCursor()
262
263         sql.execute("select name, value from metadata %s" % whereClause)
264         results = sql.fetchall()
265
266         for row in results:
267             parameterName = row["name"]
268             parameterValue = row["value"]
269             if parameterName not in resultsDict:
270                 resultsDict[parameterName] = parameterValue
271             else:
272                 trying = True
273                 index = 2
274                 while trying:
275                     newName = string.join([parameterName, str(index)], ":")
276                     if newName not in resultsDict:
277                         resultsDict[newName] = parameterValue
278                         trying = False
279
280                     index += 1
281
282         return resultsDict
283
284
285     def getReadSize(self):
286         """ returns readsize if defined in metadata.
287         """
288         metadata = self.getMetadata()
289         if "readsize" not in metadata:
290             raise ReadDatasetError("no readsize parameter defined")
291         else:
292             readSize = metadata["readsize"]
293             if "import" in readSize:
294                 readSize = readSize.split()[0]
295
296             readSize = int(readSize)
297             if readSize < 0:
298                 raise ReadDatasetError("readsize is negative")
299
300             return readSize
301
302
303     def getDefaultCacheSize(self):
304         """ returns the default cache size.
305         """
306         return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
307
308
309     def getChromosomes(self, table="uniqs", fullChrom=True):
310         """ returns a sorted list of distinct chromosomes in table.
311         """
312         statement = "select distinct chrom from %s" % table
313         sql = self.getSqlCursor()
314
315         sql.execute(statement)
316         results = []
317         for row in sql:
318             if fullChrom:
319                 if row["chrom"] not in results:
320                     results.append(row["chrom"])
321             else:
322                 shortName = row["chrom"][3:]
323                 if  len(shortName.strip()) > 0 and shortName not in results:
324                     results.append(shortName)
325
326         results.sort()
327
328         return results
329
330
331     def getMaxCoordinate(self, chrom, doUniqs=True,
332                          doMulti=False, doSplices=False):
333         """ returns the maximum coordinate for reads on a given chromosome.
334         """
335         maxCoord = 0
336
337         if doUniqs:
338             maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
339
340         if doSplices:
341             spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
342             maxCoord = max(spliceMax, maxCoord)
343
344         if doMulti:
345             multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
346             maxCoord = max(multiMax, maxCoord)
347
348         return maxCoord
349
350
351     def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
352         maxCoord = 0
353         sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
354         sql = self.getSqlCursor()
355         try:
356             sql.execute(sqlStatement)
357             maxCoord = int(sql.fetchall()[0][0])
358         except:
359             print "couldn't retrieve coordMax for chromosome %s" % chrom
360
361         return maxCoord
362
363
364     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
365                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
366                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
367                      readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
368                      flagLike=False, strand='', combine5p=False):
369         """ returns a dictionary of reads in a variety of formats
370         and which can be restricted by chromosome or custom-flag.
371         Returns unique reads by default, but can return multireads
372         with doMulti set to True.
373
374         """
375         #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
376
377         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
378         if findallOptimize:
379             selectQuery = "select start, sense, sum(weight)"
380         else:
381             selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
382
383         groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
384         if doUniqs:
385             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
386             if doMulti:
387                 stmt.append("UNION ALL")
388                 stmt.append(selectQuery)
389                 stmt.append("from multi")
390                 stmt.append(whereQuery)
391                 stmt.append(groupQuery)
392         else:
393             stmt = [selectQuery, "from multi", whereQuery]
394
395         if combine5p:
396             if findallOptimize:
397                 selectQuery = "select start, sense, weight, chrom"
398
399             if doUniqs:
400                 subSelect = [selectQuery, "from uniqs", whereQuery]
401                 if doMulti:
402                     subSelect.append("union all")
403                     subSelect.append(selectQuery)
404                     subSelect.append("from multi")
405                     subSelect.append(whereQuery)
406             else:
407                 subSelect = [selectQuery, "from multi", whereQuery]
408
409             sqlStmt = string.join(subSelect)
410             if findallOptimize:
411                 selectQuery = "select start, sense, sum(weight)"
412
413             stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
414                     selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
415
416         if findallOptimize:
417             if self.memBacked:
418                 self.memcon.row_factory = None
419             else:
420                 self.dbcon.row_factory = None
421
422             stmt.append("order by start")
423         elif readIDDict:
424             stmt.append("order by readID, start")
425         else:
426             stmt.append("order by chrom, start")
427
428         sql = self.getSqlCursor()
429         sqlQuery = string.join(stmt)
430         sql.execute(sqlQuery)
431
432         resultsDict = {}
433         if findallOptimize:
434             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
435             if self.memBacked:
436                 self.memcon.row_factory = sqlite.Row
437             else:
438                 self.dbcon.row_factory = sqlite.Row
439         else:
440             currentChrom = ""
441             currentReadID = ""
442             pairID = 0
443             for row in sql:
444                 readID = row["readID"]
445                 if fullChrom:
446                     chrom = row["chrom"]
447                 else:
448                     chrom = row["chrom"][3:]
449
450                 if not readIDDict and chrom != currentChrom:
451                     resultsDict[chrom] = []
452                     currentChrom = chrom
453                     dictKey = chrom
454                 elif readIDDict:
455                     theReadID = readID
456                     if "::" in readID:
457                         theReadID = readID.split("::")[0]
458
459                     if "/" in theReadID and withPairID:
460                         (theReadID, pairID) = readID.split("/")
461
462                     if theReadID != currentReadID:
463                         resultsDict[theReadID] = []
464                         currentReadID = theReadID
465                         dictKey = theReadID
466
467                 newrow = {"start": int(row["start"])}
468                 if bothEnds:
469                     newrow["stop"] = int(row["stop"])
470
471                 if not noSense:
472                     newrow["sense"] = row["sense"]
473
474                 if withWeight:
475                     newrow["weight"] = float(row["weight"])
476
477                 if withFlag:
478                     newrow["flag"] = row["flag"]
479
480                 if withMismatch:
481                     newrow["mismatch"] = row["mismatch"]
482
483                 if withID:
484                     newrow["readID"] = readID
485
486                 if withChrom:
487                     newrow["chrom"] = chrom
488
489                 if withPairID:
490                     newrow["pairID"] = pairID
491
492                 resultsDict[dictKey].append(newrow)
493
494         return resultsDict
495
496
497     def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
498         if splice:
499             startText = "startL"
500             stopText = "stopR"
501         else:
502             startText = "start"
503             stopText = "stop"
504
505         whereClause = []
506         if chrom != "" and chrom != self.memChrom:
507             whereClause.append("chrom = '%s'" % chrom)
508
509         if flag != "":
510             if flagLike:
511                 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
512                 whereClause.append(flagLikeClause)
513             else:
514                 whereClause.append("flag = '%s'" % flag)
515
516         if start > -1:
517             whereClause.append("%s > %d" % (startText, start))
518
519         if stop > -1:
520             whereClause.append("%s < %d" % (stopText, stop))
521
522         if len(readLike) > 0:
523             readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
524             whereClause.append(readIDClause)
525
526         if hasMismatch:
527             whereClause.append("mismatch != ''")
528
529         if strand in ["+", "-"]:
530             whereClause.append("sense = '%s'" % strand)
531
532         if len(whereClause) > 0:
533             whereStatement = string.join(whereClause, " and ")
534             whereQuery = "where %s" % whereStatement
535         else:
536             whereQuery = ""
537
538         return whereQuery
539
540
541     def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
542
543         selectClause = [baseSelect]
544         if bothEnds:
545             selectClause.append("stop")
546
547         if not noSense:
548             selectClause.append("sense")
549
550         if withWeight:
551             selectClause.append("weight")
552
553         if withFlag:
554             selectClause.append("flag")
555
556         if withMismatch:
557             selectClause.append("mismatch")
558
559         selectQuery = string.join(selectClause, ",")
560
561         return selectQuery
562
563
564     def getReadGroupQuery(self, findallOptimize, limit, combine5p):
565         groupBy = []
566         if findallOptimize:
567             groupBy = ["GROUP BY start, sense"]
568
569         if limit > 0 and not combine5p:
570             groupBy.append("LIMIT %d" % limit)
571
572         groupQuery = string.join(groupBy)
573
574         return groupQuery
575
576
577     def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
578                        flag="", withWeight=False, withFlag=False, withMismatch=False,
579                        withID=False, withChrom=False, withPairID=False, readIDDict=False,
580                        splitRead=False, hasMismatch=False, flagLike=False, start=-1,
581                        stop=-1, strand=""):
582         """ returns a dictionary of spliced reads in a variety of
583         formats and which can be restricted by chromosome or custom-flag.
584         Returns unique spliced reads for now.
585         """
586         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
587         selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
588         selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
589         sql = self.getSqlCursor()
590
591         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
592         sql.execute(stmt)
593         currentReadID = ""
594         currentChrom = ""
595         resultsDict = {}
596         for row in sql:
597             pairID = 0
598             readID = row["readID"]
599             if fullChrom:
600                 chrom = row["chrom"]
601             else:
602                 chrom = row["chrom"][3:]
603
604             if not readIDDict and chrom != currentChrom:
605                 resultsDict[chrom] = []
606                 currentChrom = chrom
607                 dictKey = chrom
608             elif readIDDict:
609                 if "/" in readID:
610                     (theReadID, pairID) = readID.split("/")
611                 else:
612                     theReadID = readID
613
614                 if theReadID != currentReadID:
615                     resultsDict[theReadID] = []
616                     currentReadID = theReadID
617                     dictKey = theReadID
618
619             newrow = {"startL": int(row["startL"])}
620             newrow["stopL"] = int(row["stopL"])
621             newrow["startR"] = int(row["startR"])
622             newrow["stopR"] = int(row["stopR"])
623             if not noSense:
624                 newrow["sense"] = row["sense"]
625
626             if withWeight:
627                 newrow["weight"] = float(row["weight"])
628
629             if withFlag:
630                 newrow["flag"] = row["flag"]
631
632             if withMismatch:
633                 newrow["mismatch"] = row["mismatch"]
634
635             if withID:
636                 newrow["readID"] = readID
637
638             if withChrom:
639                 newrow["chrom"] = chrom
640
641             if withPairID:
642                 newrow["pairID"] = pairID
643
644             if splitRead:
645                 leftDict = newrow.copy()
646                 del leftDict["startR"]
647                 del leftDict["stopR"]
648                 rightDict = newrow
649                 del rightDict["startL"]
650                 del rightDict["stopL"]
651                 resultsDict[dictKey].append(leftDict)
652                 resultsDict[dictKey].append(rightDict)
653             else:
654                 resultsDict[dictKey].append(newrow)
655
656         return resultsDict
657
658
659     def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
660                   splices=False, reportCombined=True, sense="both"):
661         """ return read counts for a given region.
662         """
663         ucount = 0
664         mcount = 0
665         scount = 0
666         restrict = ""
667         if sense in ["+", "-"]:
668             restrict = " sense ='%s' " % sense
669
670         if uniqs:
671             try:
672                 ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
673             except:
674                 ucount = 0
675
676         if multi:
677             try:
678                 mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
679             except:
680                 mcount = 0
681
682         if splices:
683             try:
684                 scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
685             except:
686                 scount = 0
687
688         if reportCombined:
689             total = ucount + mcount + scount
690             return total
691         else:
692             return (ucount, mcount, scount)
693
694
695     def getTotalCounts(self, chrom="", rmin="", rmax=""):
696         """ return read counts for a given region.
697         """
698         return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
699
700
701     def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
702         """ returns the number of row in the specified table.
703         """
704         whereClause = []
705         count = 0
706
707         if chrom !=""  and chrom != self.memChrom:
708             whereClause = ["chrom='%s'" % chrom]
709
710         if rmin != "":
711             whereClause.append("%s >= %s" % (startField, str(rmin)))
712
713         if rmax != "":
714             whereClause.append("%s <= %s" % (startField, str(rmax)))
715
716         if restrict != "":
717             whereClause.append(restrict)
718
719         if len(whereClause) > 0:
720             whereStatement = string.join(whereClause, " and ")
721             whereQuery = "where %s" % whereStatement
722         else:
723             whereQuery = ""
724
725         sql = self.getSqlCursor()
726
727         if distinct:
728             sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
729         else:
730             sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
731
732         result = sql.fetchone()
733
734         try:
735             count = int(result[0])
736         except:
737             count = 0
738
739         return count
740
741
742     def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
743         """ returns the number of row in the splices table.
744         """
745         # TODO: if the rds type is DNA should this just return zero?
746         return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
747
748
749     def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
750         """ returns the number of distinct readIDs in the uniqs table.
751         """
752         return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
753
754
755     def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
756         """ returns the total weight of readIDs in the multi table.
757         """
758         return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
759
760
761     def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
762         """ get readID's.
763         """
764         stmt = []
765         if uniqs:
766             stmt.append("select readID from uniqs")
767
768         if multi:
769             stmt.append("select readID from multi")
770
771         if splices:
772             stmt.append("select readID from splices")
773
774         if len(stmt) > 0:
775             selectPart = string.join(stmt, " union ")
776         else:
777             selectPart = ""
778
779         limitPart = ""
780         if limit > 0:
781             limitPart = "LIMIT %d" % limit
782
783         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
784         sql = self.getSqlCursor()
785         sql.execute(sqlQuery)
786         result = sql.fetchall()
787
788         if paired:
789             return [x[0].split("/")[0] for x in result]
790         else:
791             return [x[0] for x in result]
792
793
794     def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
795         """ returns the uniq and spliced mismatches in a dictionary.
796         """
797         readlen = self.getReadSize()
798         if mischrom:
799             hitChromList = [mischrom]
800         else:
801             hitChromList = self.getChromosomes()
802             hitChromList.sort()
803
804         snpDict = {}
805         for achrom in hitChromList:
806             if verbose:
807                 print "getting mismatches from chromosome %s" % (achrom)
808
809             snpDict[achrom] = []
810             if useSplices and self.dataType == "RNA":
811                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
812                 spliceIDList = spliceDict.keys()
813                 for spliceID in spliceIDList:
814                     spliceEntry = spliceDict[spliceID][0]
815                     startpos = spliceEntry["startL"]
816                     lefthalf = spliceEntry["stopL"]
817                     rightstart = spliceEntry["startR"]
818                     sense = spliceEntry["sense"]
819                     mismatches = spliceEntry["mismatch"]
820                     spMismatchList = mismatches.split(",")
821                     for mismatch in spMismatchList:
822                         if "N" in mismatch:
823                             continue
824
825                         change_len = len(mismatch)
826                         if sense == "+":
827                             change_from = mismatch[0]
828                             change_base = mismatch[change_len-1]
829                             change_pos = int(mismatch[1:change_len-1])
830                         elif sense == "-":
831                             change_from = getReverseComplement([mismatch[0]])
832                             change_base = getReverseComplement([mismatch[change_len-1]])
833                             change_pos = readlen - int(mismatch[1:change_len-1]) + 1
834
835                         firsthalf = int(lefthalf)-int(startpos)+1
836                         secondhalf = 0
837                         if int(change_pos) <= int(firsthalf):
838                             change_at = startpos + change_pos - 1
839                         else:
840                             secondhalf = change_pos - firsthalf
841                             change_at = rightstart + secondhalf
842
843                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
844
845             hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
846             if achrom not in hitDict.keys():
847                 continue
848
849             for readEntry in hitDict[achrom]:
850                 start = readEntry["start"]
851                 sense = readEntry["sense"]
852                 mismatches = readEntry["mismatch"]
853                 mismatchList = mismatches.split(",")
854                 for mismatch in mismatchList:
855                     if "N" in mismatch:
856                         continue
857
858                     change_len = len(mismatch)
859                     if sense == "+":
860                         change_from = mismatch[0]
861                         change_base = mismatch[change_len-1]
862                         change_pos = int(mismatch[1:change_len-1])
863                     elif sense == "-":
864                         change_from = getReverseComplement([mismatch[0]])
865                         change_base = getReverseComplement([mismatch[change_len-1]])
866                         change_pos = readlen - int(mismatch[1:change_len-1]) + 1
867
868                     change_at = start + change_pos - 1
869                     snpDict[achrom].append([start, change_at, change_base, change_from])
870
871         return snpDict
872
873
874     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
875                         useSplices=False, normalizationFactor=1.0, trackStrand=False,
876                         keepStrand="both", shiftValue=0):
877         """return a profile of the chromosome as an array of per-base read coverage....
878             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
879             Will also shift position of unique and multireads (but not splices) if shift is a natural number
880         """
881         metadata = self.getMetadata()
882         try:
883             readlen = int(metadata["readsize"])
884         except KeyError:
885             readlen = 0
886
887         dataType = metadata["dataType"]
888         scale = 1. / normalizationFactor
889         shift = {}
890         shift["+"] = int(shiftValue)
891         shift["-"] = -1 * int(shiftValue)
892
893         if cstop > 0:
894             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
895         else:
896             lastNT = cstop - cstart + readlen + shift["+"]
897
898         chromModel = array("f",[0.] * lastNT)
899         hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
900         if cstart < 0:
901             cstart = 0
902
903         for readEntry in hitDict[chromosome]:
904             hstart = readEntry["start"]
905             sense =  readEntry ["sense"]
906             weight = readEntry["weight"]
907             hstart = hstart - cstart + shift[sense]
908             for currentpos in range(hstart,hstart+readlen):
909                 try:
910                     if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
911                         chromModel[currentpos] += scale * weight
912                     elif sense == "-" and keepStrand != "plusOnly":
913                         chromModel[currentpos] -= scale * weight
914                 except:
915                     continue
916
917         del hitDict
918         if useSplices and dataType == "RNA":
919             if cstop > 0:
920                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
921             else:
922                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
923    
924             if chromosome in spliceDict:
925                 for spliceEntry in spliceDict[chromosome]:
926                     Lstart = spliceEntry["startL"]
927                     Lstop = spliceEntry["stopL"]
928                     Rstart = spliceEntry["startR"]
929                     Rstop = spliceEntry["stopR"]
930                     rsense = spliceEntry["sense"]
931                     if (Rstop - cstart) < lastNT:
932                         for index in range(abs(Lstop - Lstart)):
933                             currentpos = Lstart - cstart + index
934                             # we only track unique splices
935                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
936                                 chromModel[currentpos] += scale
937                             elif rsense == "-" and keepStrand != "plusOnly":
938                                 chromModel[currentpos] -= scale
939
940                         for index in range(abs(Rstop - Rstart)):
941                             currentpos = Rstart - cstart + index
942                             # we only track unique splices
943                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
944                                 chromModel[currentpos] += scale
945                             elif rsense == "-" and keepStrand != "plusOnly":
946                                 chromModel[currentpos] -= scale
947
948             del spliceDict
949
950         return chromModel
951
952
953     def insertMetadata(self, valuesList):
954         """ inserts a list of (pname, pvalue) into the metadata
955         table.
956         """
957         self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
958         self.dbcon.commit()
959
960
961     def updateMetadata(self, pname, newValue, originalValue=""):
962         """ update a metadata field given the original value and the new value.
963         """
964         stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
965         if originalValue != "":
966             stmt += " and value='%s' " % str(originalValue)
967
968         self.dbcon.execute(stmt)
969         self.dbcon.commit()
970
971
972     def insertUniqs(self, valuesList):
973         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
974         into the uniqs table.
975         """
976         self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
977         self.dbcon.commit()
978
979
980     def insertMulti(self, valuesList):
981         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
982         into the multi table.
983         """
984         self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
985         self.dbcon.commit()
986
987
988     def insertSplices(self, valuesList):
989         """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
990         into the splices table.
991         """
992         self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
993         self.dbcon.commit()
994
995
996     def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
997         """ update reads on file database in a list region of regions for a chromosome to have a new flag.
998             regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
999             sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
1000         """
1001         restrict = ""
1002         if sense != "both":
1003             restrict = " and sense = ? "
1004
1005         if uniqs:
1006             self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1007
1008         if multi:
1009             self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1010
1011         if self.dataType == "RNA" and splices:
1012             self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
1013             self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
1014
1015         self.dbcon.commit()
1016
1017
1018     def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1019         """ set the flag fields in the entire dataset.
1020         """
1021         if uniqs:
1022             self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1023
1024         if multi:
1025             self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1026
1027         if self.dataType == "RNA" and splices:
1028             self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
1029
1030         self.dbcon.commit()
1031
1032
1033     def resetFlags(self, uniqs=True, multi=True, splices=True):
1034         """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
1035         """
1036         self.setFlags("", uniqs, multi, splices)
1037
1038
1039     def reweighMultireads(self, readList):
1040         self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
1041
1042
1043     def setSynchronousPragma(self, value="ON"):
1044         try:
1045             self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1046         except:
1047             print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
1048
1049
1050     def setDBcache(self, cache, default=False):
1051         self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
1052         if default:
1053             self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
1054
1055
1056     def execute(self, statement, returnResults=False):
1057         sql = self.getSqlCursor()
1058
1059         sql.execute(statement)
1060         if returnResults:
1061             result = sql.fetchall()
1062             return result
1063
1064
1065     def executeCommit(self, statement):
1066         self.execute(statement)
1067
1068         if self.memBacked:
1069             self.memcon.commit()
1070         else:
1071             self.dbcon.commit()
1072
1073
1074     def buildIndex(self, cache=100000):
1075         """ Builds the file indeces for the main tables.
1076             Cache is the number of 1.5 kb pages to keep in memory.
1077             100000 pages translates into 150MB of RAM, which is our default.
1078         """
1079         if cache > self.getDefaultCacheSize():
1080             self.setDBcache(cache)
1081         self.setSynchronousPragma("OFF")
1082         self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1083         print "built uPosIndex"
1084         self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
1085         print "built uChromIndex"
1086         self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1087         print "built mPosIndex"
1088         self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
1089         print "built mChromIndex"
1090
1091         if self.dataType == "RNA":
1092             self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
1093             print "built sPosIndex"
1094             self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
1095             print "built sPosIndex2"
1096             self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
1097             print "built sChromIndex"
1098
1099         self.dbcon.commit()
1100         self.setSynchronousPragma("ON")
1101
1102
1103     def dropIndex(self):
1104         """ drops the file indices for the main tables.
1105         """
1106         try:
1107             self.setSynchronousPragma("OFF")
1108             self.dbcon.execute("DROP INDEX uPosIndex")
1109             self.dbcon.execute("DROP INDEX uChromIndex")
1110             self.dbcon.execute("DROP INDEX mPosIndex")
1111             self.dbcon.execute("DROP INDEX mChromIndex")
1112
1113             if self.dataType == "RNA":
1114                 self.dbcon.execute("DROP INDEX sPosIndex")
1115                 try:
1116                     self.dbcon.execute("DROP INDEX sPosIndex2")
1117                 except:
1118                     pass
1119
1120                 self.dbcon.execute("DROP INDEX sChromIndex")
1121
1122             self.dbcon.commit()
1123         except:
1124             print "problem dropping index"
1125
1126         self.setSynchronousPragma("ON")
1127
1128
1129     def memSync(self, chrom="", index=False):
1130         """ makes a copy of the dataset into memory for faster access.
1131         Can be restricted to a "full" chromosome. Can also build the
1132         memory indices.
1133         """
1134         self.memcon = ""
1135         self.memcon = sqlite.connect(":memory:")
1136         self.initializeTables(self.memcon)
1137         cursor = self.dbcon.cursor()
1138         whereclause = ""
1139         if chrom != "":
1140             print "memSync %s" % chrom
1141             whereclause = " where chrom = '%s' " % chrom
1142             self.memChrom = chrom
1143         else:
1144             self.memChrom = ""
1145
1146         self.memcon.execute("PRAGMA temp_store = MEMORY")
1147         self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
1148         # copy metadata to memory
1149         self.memcon.execute("delete from metadata")
1150         results = cursor.execute("select name, value from metadata")
1151         results2 = []
1152         for row in results:
1153             results2.append((row["name"], row["value"]))
1154
1155         self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
1156
1157         self.copyDBEntriesToMemory("uniqs", whereclause)
1158         self.copyDBEntriesToMemory("multi", whereclause)
1159         if self.dataType == "RNA":
1160             self.copySpliceDBEntriesToMemory(whereclause)
1161
1162         if index:
1163             if chrom != "":
1164                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
1165                 self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
1166                 if self.dataType == "RNA":
1167                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
1168                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
1169             else:
1170                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1171                 self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1172                 if self.dataType == "RNA":
1173                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
1174                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
1175
1176         self.memBacked = True
1177         self.memcon.row_factory = sqlite.Row
1178         self.memcon.commit()
1179
1180
1181     def copyDBEntriesToMemory(self, dbName, whereClause=""):
1182         cursor = self.dbcon.cursor()
1183         sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
1184         destinationEntries = []
1185         for row in sourceEntries:
1186             destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
1187
1188         self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
1189
1190
1191     def copySpliceDBEntriesToMemory(self, whereClause=""):
1192         cursor = self.dbcon.cursor()
1193         sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
1194         destinationEntries = []
1195         for row in sourceEntries:
1196             destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
1197                                        row["weight"], row["flag"], row["mismatch"]))
1198
1199         self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)
1200