snapshot of 4.0a development. initial git repo commit
[erange.git] / ReadDataset.py
1 """
2 Created on Jul 1, 2010
3
4 @author: sau
5 """
6
7 import sqlite3 as sqlite
8 import string
9 import tempfile
10 import shutil
11 import os
12 from os import environ
13 from array import array
14 from commoncode import getReverseComplement
15
16 if environ.get("CISTEMATIC_TEMP"):
17     cisTemp = environ.get("CISTEMATIC_TEMP")
18 else:
19     cisTemp = "/tmp"
20
21 tempfile.tempdir = cisTemp
22 currentRDSVersion = "1.1"
23
24
25 class ReadDatasetError(Exception):
26     pass
27
28
29 class ReadDataset():
30     """ Class for storing reads from experiments. Assumes that custom scripts
31     will translate incoming data into a format that can be inserted into the
32     class using the insert* methods. Default class subtype ('DNA') includes
33     tables for unique and multireads, whereas 'RNA' subtype also includes a
34     splices table.
35     """
36
37     def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False, 
38                  cache=False, reportCount=True):
39         """ creates an rds datafile if initialize is set to true, otherwise
40         will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
41         """
42         self.dbcon = ""
43         self.memcon = ""
44         self.dataType = ""
45         self.rdsVersion = currentRDSVersion
46         self.memBacked = False
47         self.memChrom = ""
48         self.memCursor = ""
49         self.cachedDBFile = ""
50
51         if cache:
52             if verbose:
53                 print "caching ...."
54
55             self.cacheDB(datafile)
56             dbFile = self.cachedDBFile
57         else:
58             dbFile = datafile
59
60         self.dbcon = sqlite.connect(dbFile)
61         self.dbcon.row_factory = sqlite.Row
62         self.dbcon.execute("PRAGMA temp_store = MEMORY")
63         if initialize:
64             if datasetType not in ["DNA", "RNA"]:
65                 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
66             else:
67                 self.dataType = datasetType
68
69             self.initializeTables(self.dbcon)
70         else:
71             metadata = self.getMetadata("dataType")
72             self.dataType = metadata["dataType"]
73
74         try:
75             metadata = self.getMetadata("rdsVersion")
76             self.rdsVersion = metadata["rdsVersion"]
77         except:
78             try:
79                 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
80             except IOError:
81                 print "could not add rdsVersion - read-only ?"
82                 self.rdsVersion = "pre-1.0"
83
84         if verbose:
85             if initialize:
86                 print "INITIALIZED dataset %s" % datafile
87             else:
88                 print "dataset %s" % datafile
89
90             metadata = self.getMetadata()
91             print "metadata:"
92             pnameList = metadata.keys()
93             pnameList.sort()
94             for pname in pnameList:
95                 print "\t" + pname + "\t" + metadata[pname]
96
97             if reportCount:
98                 ucount = self.getUniqsCount()
99                 mcount = self.getMultiCount()
100                 if self.dataType == "DNA" and not initialize:
101                     try:
102                         print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
103                     except ValueError:
104                         print "\n%s unique reads and %s multireads" % (ucount, mcount)
105                 elif self.dataType == "RNA" and not initialize:
106                     scount = self.getSplicesCount()
107                     try:
108                         print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
109                     except ValueError:
110                         print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
111
112             print "default cache size is %d pages" % self.getDefaultCacheSize()
113             if self.hasIndex():
114                 print "found index"
115             else:
116                 print "not indexed"
117
118
119     def __len__(self):
120         """ return the number of usable reads in the dataset.
121         """
122         total = self.getUniqsCount()
123         total += self.getMultiCount()
124
125         if self.dataType == "RNA":
126             total += self.getSplicesCount()
127
128         total = int(total)
129
130         return total
131
132
133     def __del__(self):
134         """ cleanup copy in local cache, if present.
135         """
136         if self.cachedDBFile != "":
137             self.uncacheDB()
138
139
140     def cacheDB(self, filename):
141         """ copy geneinfoDB to a local cache.
142         """
143         self.cachedDBFile =  "%s.db" % tempfile.mktemp()
144         shutil.copyfile(filename, self.cachedDBFile)
145
146
147     def saveCacheDB(self, filename):
148         """ copy geneinfoDB to a local cache.
149         """
150         shutil.copyfile(self.cachedDBFile, filename)
151
152
153     def uncacheDB(self):
154         """ delete geneinfoDB from local cache.
155         """
156         global cachedDBFile
157         if self.cachedDBFile != "":
158             try:
159                 os.remove(self.cachedDBFile)
160             except:
161                 print "could not delete %s" % self.cachedDBFile
162
163             self.cachedDB = ""
164
165
166     def attachDB(self, filename, asname):
167         """ attach another database file to the readDataset.
168         """
169         stmt = "attach '%s' as %s" % (filename, asname)
170         self.execute(stmt)
171
172
173     def detachDB(self, asname):
174         """ detach a database file to the readDataset.
175         """
176         stmt = "detach %s" % (asname)
177         self.execute(stmt)
178
179
180     def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
181         """ import into current RDS the table (with columns destcolumns,
182             with default all columns) from the database file asname,
183             using the column specification of ascolumns (default all).
184         """
185         stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
186         if flagged != "":
187             stmt += " where flag = '%s' " % flagged
188
189         self.executeCommit(stmt)
190
191
192     def getTables(self, asname=""):
193         """ get a list of table names in a particular database file.
194         """
195         resultList = []
196         sql = self.getSqlCursor()
197
198         if asname != "":
199             asname += "."
200
201         stmt = "select name from %ssqlite_master where type='table'" % asname
202         sql.execute(stmt)
203         results = sql.fetchall()
204
205         for row in results:
206             resultList.append(row["name"])
207
208         return resultList
209
210
211     def getSqlCursor(self):
212         if self.memBacked:
213             sql = self.getMemCursor()
214         else:
215             sql = self.getFileCursor()
216
217         return sql
218
219
220     def hasIndex(self):
221         """ check whether the RDS file has at least one index.
222         """
223         stmt = "select count(*) from sqlite_master where type='index'"
224         count = int(self.execute(stmt, returnResults=True)[0][0])
225         if count > 0:
226             return True
227
228         return False
229
230
231     def initializeTables(self, dbConnection, cache=100000):
232         """ creates table schema in a database connection, which is
233         typically a database file or an in-memory database.
234         """
235         dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
236         dbConnection.execute("create table metadata (name varchar, value varchar)")
237         dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
238         positionSchema = "start int, stop int"
239         tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
240         dbConnection.execute("create table uniqs %s" % tableSchema)
241         dbConnection.execute("create table multi %s" % tableSchema)
242         if self.dataType == "RNA":
243             positionSchema = "startL int, stopL int, startR int, stopR int"
244             tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
245             dbConnection.execute("create table splices %s" % tableSchema)
246
247         dbConnection.commit()
248
249
250     def getFileCursor(self):
251         """ returns a cursor to file database for low-level (SQL)
252         access to the data.
253         """
254         return self.dbcon.cursor()
255
256
257     def getMemCursor(self):
258         """ returns a cursor to memory database for low-level (SQL)
259         access to the data.
260         """
261         return self.memcon.cursor()
262
263
264     def getMetadata(self, valueName=""):
265         """ returns a dictionary of metadata.
266         """
267         whereClause = ""
268         resultsDict = {}
269
270         if valueName != "":
271             whereClause = " where name='%s'" % valueName
272
273         sql = self.getSqlCursor()
274
275         sql.execute("select name, value from metadata %s" % whereClause)
276         results = sql.fetchall()
277
278         for row in results:
279             parameterName = row["name"]
280             parameterValue = row["value"]
281             if parameterName not in resultsDict:
282                 resultsDict[parameterName] = parameterValue
283             else:
284                 trying = True
285                 index = 2
286                 while trying:
287                     newName = string.join([parameterName, str(index)], ":")
288                     if newName not in resultsDict:
289                         resultsDict[newName] = parameterValue
290                         trying = False
291
292                     index += 1
293
294         return resultsDict
295
296
297     def getReadSize(self):
298         """ returns readsize if defined in metadata.
299         """
300         metadata = self.getMetadata()
301         if "readsize" not in metadata:
302             raise ReadDatasetError("no readsize parameter defined")
303         else:
304             mysize = metadata["readsize"]
305             if "import" in mysize:
306                 mysize = mysize.split()[0]
307
308             return int(mysize)
309
310
311     def getDefaultCacheSize(self):
312         """ returns the default cache size.
313         """
314         return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
315
316
317     def getChromosomes(self, table="uniqs", fullChrom=True):
318         """ returns a list of distinct chromosomes in table.
319         """
320         statement = "select distinct chrom from %s" % table
321         sql = self.getSqlCursor()
322
323         sql.execute(statement)
324         results = []
325         for row in sql:
326             if fullChrom:
327                 if row["chrom"] not in results:
328                     results.append(row["chrom"])
329             else:
330                 if  len(row["chrom"][3:].strip()) < 1:
331                     continue
332
333                 if row["chrom"][3:] not in results:
334                     results.append(row["chrom"][3:])
335
336         results.sort()
337
338         return results
339
340
341     def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
342                          doMulti=False, doSplices=False):
343         """ returns the maximum coordinate for reads on a given chromosome.
344         """
345         maxCoord = 0
346         sql = self.getSqlCursor()
347
348         if doUniqs:
349             try:
350                 sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
351                 maxCoord = int(sql.fetchall()[0][0])
352             except:
353                 print "couldn't retrieve coordMax for chromosome %s" % chrom
354
355         if doSplices:
356             sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
357             try:
358                 spliceMax = int(sql.fetchall()[0][0])
359                 if spliceMax > maxCoord:
360                     maxCoord = spliceMax
361             except:
362                 pass
363
364         if doMulti:
365             sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
366             try:
367                 multiMax = int(sql.fetchall()[0][0])
368                 if multiMax > maxCoord:
369                     maxCoord = multiMax
370             except:
371                 pass
372
373         if verbose:
374             print "%s maxCoord: %d" % (chrom, maxCoord)
375
376         return maxCoord
377
378
379     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
380                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
381                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
382                      readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
383                      flagLike=False, strand='', combine5p=False):
384         """ returns a dictionary of reads in a variety of formats
385         and which can be restricted by chromosome or custom-flag.
386         Returns unique reads by default, but can return multireads
387         with doMulti set to True.
388         """
389         whereClause = []
390         resultsDict = {}
391
392         if chrom != "" and chrom != self.memChrom:
393             whereClause.append("chrom = '%s'" % chrom)
394
395         if flag != "":
396             if flagLike:
397                 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
398                 whereClause.append(flagLikeClause)
399             else:
400                 whereClause.append("flag = '%s'" % flag)
401
402         if start > -1:
403             whereClause.append("start > %d" % start)
404
405         if stop > -1:
406             whereClause.append("stop < %d" % stop)
407
408         if len(readLike) > 0:
409             readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
410             whereClause.append(readIDClause)
411
412         if hasMismatch:
413             whereClause.append("mismatch != ''")
414
415         if strand in ["+", "-"]:
416             whereClause.append("sense = '%s'" % strand)
417
418         if len(whereClause) > 0:
419             whereStatement = string.join(whereClause, " and ")
420             whereQuery = "where %s" % whereStatement
421         else:
422             whereQuery = ""
423
424         groupBy = []
425         if findallOptimize:
426             selectClause = ["select start, sense, sum(weight)"]
427             groupBy = ["GROUP BY start, sense"]
428         else:
429             selectClause = ["select ID, chrom, start, readID"]
430             if bothEnds:
431                 selectClause.append("stop")
432
433             if not noSense:
434                 selectClause.append("sense")
435
436             if withWeight:
437                 selectClause.append("weight")
438
439             if withFlag:
440                 selectClause.append("flag")
441
442             if withMismatch:
443                 selectClause.append("mismatch")
444
445         if limit > 0 and not combine5p:
446             groupBy.append("LIMIT %d" % limit)
447
448         selectQuery = string.join(selectClause, ",")
449         groupQuery = string.join(groupBy)
450         if doUniqs:
451             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
452             if doMulti:
453                 stmt.append("UNION ALL")
454                 stmt.append(selectQuery)
455                 stmt.append("from multi")
456                 stmt.append(whereQuery)
457                 stmt.append(groupQuery)
458         else:
459             stmt = [selectQuery, "from multi", whereQuery]
460
461         if combine5p:
462             if findallOptimize:
463                 selectQuery = "select start, sense, weight, chrom"
464
465             if doUniqs:
466                 subSelect = [selectQuery, "from uniqs", whereQuery]
467                 if doMulti:
468                     subSelect.append("union all")
469                     subSelect.append(selectQuery)
470                     subSelect.append("from multi")
471                     subSelect.append(whereQuery)
472             else:
473                 subSelect = [selectQuery, "from multi", whereQuery]
474
475             sqlStmt = string.join(subSelect)
476             if findallOptimize:
477                 selectQuery = "select start, sense, sum(weight)"
478
479             stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
480                     selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
481
482         if findallOptimize:
483             if self.memBacked:
484                 self.memcon.row_factory = None
485                 sql = self.memcon.cursor()
486             else:
487                 self.dbcon.row_factory = None
488                 sql = self.dbcon.cursor()
489
490             stmt.append("order by start")
491         elif readIDDict:
492             if self.memBacked:
493                 sql = self.memcon.cursor()
494             else:
495                 sql = self.dbcon.cursor()
496
497             stmt.append("order by readID, start")
498         else:
499             if self.memBacked:
500                 sql = self.memcon.cursor()
501             else:
502                 sql = self.dbcon.cursor()
503
504             stmt.append("order by chrom, start")
505
506         sqlQuery = string.join(stmt)
507         sql.execute(sqlQuery)
508
509         if findallOptimize:
510             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
511             if self.memBacked:
512                 self.memcon.row_factory = sqlite.Row
513             else:
514                 self.dbcon.row_factory = sqlite.Row
515         else:
516             currentChrom = ""
517             currentReadID = ""
518             pairID = 0
519             for row in sql:
520                 readID = row["readID"]
521                 if fullChrom:
522                     chrom = row["chrom"]
523                 else:
524                     chrom = row["chrom"][3:]
525
526                 if not readIDDict and chrom != currentChrom:
527                     resultsDict[chrom] = []
528                     currentChrom = chrom
529                     dictKey = chrom
530                 elif readIDDict:
531                     theReadID = readID
532                     if "::" in readID:
533                         theReadID = readID.split("::")[0]
534
535                     if "/" in theReadID and withPairID:
536                         (theReadID, pairID) = readID.split("/")
537
538                     if theReadID != currentReadID:
539                         resultsDict[theReadID] = []
540                         currentReadID = theReadID
541                         dictKey = theReadID
542
543                 newrow = {"start": int(row["start"])}
544                 if bothEnds:
545                     newrow["stop"] = int(row["stop"])
546
547                 if not noSense:
548                     newrow["sense"] = row["sense"]
549
550                 if withWeight:
551                     newrow["weight"] = float(row["weight"])
552
553                 if withFlag:
554                     newrow["flag"] = row["flag"]
555
556                 if withMismatch:
557                     newrow["mismatch"] = row["mismatch"]
558
559                 if withID:
560                     newrow["readID"] = readID
561
562                 if withChrom:
563                     newrow["chrom"] = chrom
564
565                 if withPairID:
566                     newrow["pairID"] = pairID
567
568                 resultsDict[dictKey].append(newrow)
569
570         return resultsDict
571
572
573     def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
574                        flag="", withWeight=False, withFlag=False, withMismatch=False,
575                        withID=False, withChrom=False, withPairID=False, readIDDict=False,
576                        splitRead=False, hasMismatch=False, flagLike=False, start=-1,
577                        stop=-1, strand=""):
578         """ returns a dictionary of spliced reads in a variety of
579         formats and which can be restricted by chromosome or custom-flag.
580         Returns unique spliced reads for now.
581         """
582         whereClause = []
583         resultsDict = {}
584
585         if chrom != "" and chrom != self.memChrom:
586             whereClause = ["chrom = '%s'" % chrom]
587
588         if flag != "":
589             if flagLike:
590                 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
591                 whereClause.append(flagLikeClause)
592             else:
593                 whereClause.append("flag = '%s'" % flag)
594
595         if hasMismatch:
596             whereClause.append("mismatch != ''")
597
598         if strand != "":
599             whereClause.append("sense = '%s'" % strand)
600
601         if start > -1:
602             whereClause.append("startL > %d" % start)
603
604         if stop > -1:
605             whereClause.append("stopR < %d" % stop)
606
607         if len(whereClause) > 0:
608             whereStatement = string.join(whereClause, " and ")
609             whereQuery = "where %s" % whereStatement
610         else:
611             whereQuery = ""
612
613         selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
614         if not noSense:
615             selectClause.append("sense")
616
617         if withWeight:
618             selectClause.append("weight")
619
620         if withFlag:
621             selectClause.append("flag")
622
623         if withMismatch:
624             selectClause.append("mismatch")
625
626         selectQuery = string.join(selectClause, " ,")
627         if self.memBacked:
628             sql = self.memcon.cursor()
629         else:
630             sql = self.dbcon.cursor()
631
632         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
633         sql.execute(stmt)
634         currentReadID = ""
635         currentChrom = ""
636         for row in sql:
637             pairID = 0
638             readID = row["readID"]
639             if fullChrom:
640                 chrom = row["chrom"]
641             else:
642                 chrom = row["chrom"][3:]
643
644             if not readIDDict and chrom != currentChrom:
645                 resultsDict[chrom] = []
646                 currentChrom = chrom
647                 dictKey = chrom
648             elif readIDDict:
649                 if "/" in readID:
650                     (theReadID, pairID) = readID.split("/")
651                 else:
652                     theReadID = readID
653
654                 if theReadID != currentReadID:
655                     resultsDict[theReadID] = []
656                     currentReadID = theReadID
657                     dictKey = theReadID
658
659             newrow = {"startL": int(row["startL"])}
660             newrow["stopL"] = int(row["stopL"])
661             newrow["startR"] = int(row["startR"])
662             newrow["stopR"] = int(row["stopR"])
663             if not noSense:
664                 newrow["sense"] = row["sense"]
665
666             if withWeight:
667                 newrow["weight"] = float(row["weight"])
668
669             if withFlag:
670                 newrow["flag"] = row["flag"]
671
672             if withMismatch:
673                 newrow["mismatch"] = row["mismatch"]
674
675             if withID:
676                 newrow["readID"] = readID
677
678             if withChrom:
679                 newrow["chrom"] = chrom
680
681             if withPairID:
682                 newrow["pairID"] = pairID
683
684             if splitRead:
685                 leftDict = newrow.copy()
686                 del leftDict["startR"]
687                 del leftDict["stopR"]
688                 rightDict = newrow
689                 del rightDict["startL"]
690                 del rightDict["stopL"]
691                 resultsDict[dictKey].append(leftDict)
692                 resultsDict[dictKey].append(rightDict)
693             else:
694                 resultsDict[dictKey].append(newrow)
695
696         return resultsDict
697
698
699     def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
700                   splices=False, reportCombined=True, sense="both"):
701         """ return read counts for a given region.
702         """
703         ucount = 0
704         mcount = 0
705         scount = 0
706         restrict = ""
707         if sense in ["+", "-"]:
708             restrict = " sense ='%s' " % sense
709
710         if uniqs:
711             try:
712                 ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
713             except:
714                 ucount = 0
715
716         if multi:
717             try:
718                 mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
719             except:
720                 mcount = 0
721
722         if splices:
723             try:
724                 scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
725             except:
726                 scount = 0
727
728         if reportCombined:
729             total = ucount + mcount + scount
730             return total
731         else:
732             return (ucount, mcount, scount)
733
734
735     def getTotalCounts(self, chrom="", rmin="", rmax=""):
736         """ return read counts for a given region.
737         """
738         return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
739
740
741     def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
742         """ returns the number of row in the uniqs table.
743         """
744         whereClause = []
745         count = 0
746
747         if chrom !=""  and chrom != self.memChrom:
748             whereClause = ["chrom='%s'" % chrom]
749
750         if rmin != "":
751             whereClause.append("%s >= %s" % (startField, str(rmin)))
752
753         if rmax != "":
754             whereClause.append("%s <= %s" % (startField, str(rmax)))
755
756         if restrict != "":
757             whereClause.append(restrict)
758
759         if len(whereClause) > 0:
760             whereStatement = string.join(whereClause, " and ")
761             whereQuery = "where %s" % whereStatement
762         else:
763             whereQuery = ""
764
765         if self.memBacked:
766             sql = self.memcon.cursor()
767         else:
768             sql = self.dbcon.cursor()
769
770         if distinct:
771             sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
772         else:
773             sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
774
775         result = sql.fetchone()
776
777         try:
778             count = int(result[0])
779         except:
780             count = 0
781
782         return count
783
784
785     def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
786         """ returns the number of row in the splices table.
787         """
788         return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
789
790
791     def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
792         """ returns the number of distinct readIDs in the uniqs table.
793         """
794         return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
795
796
797     def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
798         """ returns the total weight of readIDs in the multi table.
799         """
800         return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
801
802
803     def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
804         """ get readID's.
805         """
806         stmt = []
807         limitPart = ""
808         if limit > 0:
809             limitPart = "LIMIT %d" % limit
810
811         if uniqs:
812             stmt.append("select readID from uniqs")
813
814         if multi:
815             stmt.append("select readID from multi")
816
817         if splices:
818             stmt.append("select readID from splices")
819
820         if len(stmt) > 0:
821             selectPart = string.join(stmt, " union ")
822         else:
823             selectPart = ""
824
825         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
826         if self.memBacked:
827             sql = self.memcon.cursor()
828         else:
829             sql = self.dbcon.cursor()
830
831         sql.execute(sqlQuery)
832         result = sql.fetchall()
833
834         if paired:
835             return [x[0].split("/")[0] for x in result]
836         else:
837             return [x[0] for x in result]
838
839
840     def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
841         """ returns the uniq and spliced mismatches in a dictionary.
842         """
843         readlen = self.getReadSize()
844         if mischrom:
845             hitChromList = [mischrom]
846         else:
847             hitChromList = self.getChromosomes()
848             hitChromList.sort()
849
850         snpDict = {}
851         for achrom in hitChromList:
852             if verbose:
853                 print "getting mismatches from chromosome %s" % (achrom)
854
855             snpDict[achrom] = []
856             hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
857             if useSplices and self.dataType == "RNA":
858                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
859                 spliceIDList = spliceDict.keys()
860                 for k in spliceIDList:
861                     spliceEntry = spliceDict[k][0]
862                     startpos = spliceEntry["startL"]
863                     lefthalf = spliceEntry["stopL"]
864                     rightstart = spliceEntry["startR"]
865                     sense = spliceEntry["sense"]
866                     mismatches = spliceEntry["mismatch"]
867                     spMismatchList = mismatches.split(",")
868                     for mismatch in spMismatchList:
869                         if "N" in mismatch:
870                             continue
871
872                         change_len = len(mismatch)
873                         if sense == "+":
874                             change_from = mismatch[0]
875                             change_base = mismatch[change_len-1]
876                             change_pos = int(mismatch[1:change_len-1])
877                         elif sense == "-":
878                             change_from = getReverseComplement([mismatch[0]])
879                             change_base = getReverseComplement([mismatch[change_len-1]])
880                             change_pos = readlen - int(mismatch[1:change_len-1]) + 1
881
882                         firsthalf = int(lefthalf)-int(startpos)+1
883                         secondhalf = 0
884                         if int(change_pos) <= int(firsthalf):
885                             change_at = startpos + change_pos - 1
886                         else:
887                             secondhalf = change_pos - firsthalf
888                             change_at = rightstart + secondhalf
889
890                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
891
892             if achrom not in hitDict.keys():
893                 continue
894
895             for readEntry in hitDict[achrom]:
896                 start = readEntry["start"]
897                 sense = readEntry["sense"]
898                 mismatches = readEntry["mismatch"]
899                 mismatchList = mismatches.split(",")
900                 for mismatch in mismatchList:
901                     if "N" in mismatch:
902                         continue
903
904                     change_len = len(mismatch)
905                     if sense == "+":
906                         change_from = mismatch[0]
907                         change_base = mismatch[change_len-1]
908                         change_pos = int(mismatch[1:change_len-1])
909                     elif sense == "-":
910                         change_from = getReverseComplement([mismatch[0]])
911                         change_base = getReverseComplement([mismatch[change_len-1]])
912                         change_pos = readlen - int(mismatch[1:change_len-1]) + 1
913
914                     change_at = start + change_pos - 1
915                     snpDict[achrom].append([start, change_at, change_base, change_from])
916
917         return snpDict
918
919
920     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
921                         useSplices=False, normalizationFactor = 1.0, trackStrand=False,
922                         keepStrand="both", shiftValue=0):
923         """return a profile of the chromosome as an array of per-base read coverage....
924             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
925             Will also shift position of unique and multireads (but not splices) if shift is a natural number
926         """
927         metadata = self.getMetadata()
928         try:
929             readlen = int(metadata["readsize"])
930         except KeyError:
931             readlen = 0
932
933         dataType = metadata["dataType"]
934         scale = 1. / normalizationFactor
935         shift = {}
936         shift['+'] = int(shiftValue)
937         shift['-'] = -1 * int(shiftValue)
938
939         if cstop > 0:
940             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
941         else:
942             lastNT = cstop - cstart + readlen + shift["+"]
943
944         chromModel = array("f",[0.] * lastNT)
945         hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
946         if cstart < 0:
947             cstart = 0
948
949         for readEntry in hitDict[chromosome]:
950             hstart = readEntry["start"]
951             sense =  readEntry ["sense"]
952             weight = readEntry["weight"]
953             hstart = hstart - cstart + shift[sense]
954             for currentpos in range(hstart,hstart+readlen):
955                 try:
956                     if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
957                         chromModel[currentpos] += scale * weight
958                     elif sense == "-" and keepStrand != "plusOnly":
959                         chromModel[currentpos] -= scale * weight
960                 except:
961                     continue
962
963         del hitDict
964         if useSplices and dataType == "RNA":
965             if cstop > 0:
966                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
967             else:
968                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
969    
970             if chromosome in spliceDict:
971                 for spliceEntry in spliceDict[chromosome]:
972                     Lstart = spliceEntry["startL"]
973                     Lstop = spliceEntry["stopL"]
974                     Rstart = spliceEntry["startR"]
975                     Rstop = spliceEntry["stopR"]
976                     rsense = spliceEntry["sense"]
977                     if (Rstop - cstart) < lastNT:
978                         for index in range(abs(Lstop - Lstart)):
979                             currentpos = Lstart - cstart + index
980                             # we only track unique splices
981                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
982                                 chromModel[currentpos] += scale
983                             elif rsense == "-" and keepStrand != "plusOnly":
984                                 chromModel[currentpos] -= scale
985
986                         for index in range(abs(Rstop - Rstart)):
987                             currentpos = Rstart - cstart + index
988                             # we only track unique splices
989                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
990                                 chromModel[currentpos] += scale
991                             elif rsense == "-" and keepStrand != "plusOnly":
992                                 chromModel[currentpos] -= scale
993
994             del spliceDict
995
996         return chromModel
997
998
999     def insertMetadata(self, valuesList):
1000         """ inserts a list of (pname, pvalue) into the metadata
1001         table.
1002         """
1003         self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
1004         self.dbcon.commit()
1005
1006
1007     def updateMetadata(self, pname, newValue, originalValue=""):
1008         """ update a metadata field given the original value and the new value.
1009         """
1010         stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
1011         if originalValue != "":
1012             stmt += " and value='%s' " % str(originalValue)
1013
1014         self.dbcon.execute(stmt)
1015         self.dbcon.commit()
1016
1017
1018     def insertUniqs(self, valuesList):
1019         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1020         into the uniqs table.
1021         """
1022         self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1023         self.dbcon.commit()
1024
1025
1026     def insertMulti(self, valuesList):
1027         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1028         into the multi table.
1029         """
1030         self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1031         self.dbcon.commit()
1032
1033
1034     def insertSplices(self, valuesList):
1035         """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
1036         into the splices table.
1037         """
1038         self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
1039         self.dbcon.commit()
1040
1041
1042     def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
1043         """ update reads on file database in a list region of regions for a chromosome to have a new flag.
1044             regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
1045             sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
1046         """
1047         restrict = ""
1048         if sense != "both":
1049             restrict = " and sense = ? "
1050
1051         if uniqs:
1052             self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1053
1054         if multi:
1055             self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1056
1057         if self.dataType == "RNA" and splices:
1058             self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
1059             self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
1060
1061         self.dbcon.commit()
1062
1063
1064     def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1065         """ set the flag fields in the entire dataset.
1066         """
1067         if uniqs:
1068             self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1069
1070         if multi:
1071             self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1072
1073         if self.dataType == "RNA" and splices:
1074             self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
1075
1076         self.dbcon.commit()
1077
1078
1079     def resetFlags(self, uniqs=True, multi=True, splices=True):
1080         """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
1081         """
1082         self.setFlags("", uniqs, multi, splices)
1083
1084
1085     def reweighMultireads(self, readList):
1086         self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
1087
1088
1089     def setSynchronousPragma(self, value="ON"):
1090         try:
1091             self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1092         except:
1093             print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
1094
1095
1096     def setDBcache(self, cache, default=False):
1097         self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
1098         if default:
1099             self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
1100
1101
1102     def execute(self, statement, returnResults=False):
1103         sql = self.getSqlCursor()
1104
1105         sql.execute(statement)
1106         if returnResults:
1107             result = sql.fetchall()
1108             return result
1109
1110
1111     def executeCommit(self, statement):
1112         self.execute(statement)
1113
1114         if self.memBacked:
1115             self.memcon.commit()
1116         else:
1117             self.dbcon.commit()
1118
1119
1120     def buildIndex(self, cache=100000):
1121         """ Builds the file indeces for the main tables.
1122             Cache is the number of 1.5 kb pages to keep in memory.
1123             100000 pages translates into 150MB of RAM, which is our default.
1124         """
1125         if cache > self.getDefaultCacheSize():
1126             self.setDBcache(cache)
1127         self.setSynchronousPragma("OFF")
1128         self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1129         print "built uPosIndex"
1130         self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
1131         print "built uChromIndex"
1132         self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1133         print "built mPosIndex"
1134         self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
1135         print "built mChromIndex"
1136
1137         if self.dataType == "RNA":
1138             self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
1139             print "built sPosIndex"
1140             self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
1141             print "built sPosIndex2"
1142             self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
1143             print "built sChromIndex"
1144
1145         self.dbcon.commit()
1146         self.setSynchronousPragma("ON")
1147
1148
1149     def dropIndex(self):
1150         """ drops the file indices for the main tables.
1151         """
1152         try:
1153             self.setSynchronousPragma("OFF")
1154             self.dbcon.execute("DROP INDEX uPosIndex")
1155             self.dbcon.execute("DROP INDEX uChromIndex")
1156             self.dbcon.execute("DROP INDEX mPosIndex")
1157             self.dbcon.execute("DROP INDEX mChromIndex")
1158
1159             if self.dataType == "RNA":
1160                 self.dbcon.execute("DROP INDEX sPosIndex")
1161                 try:
1162                     self.dbcon.execute("DROP INDEX sPosIndex2")
1163                 except:
1164                     pass
1165
1166                 self.dbcon.execute("DROP INDEX sChromIndex")
1167
1168             self.dbcon.commit()
1169         except:
1170             print "problem dropping index"
1171
1172         self.setSynchronousPragma("ON")
1173
1174
1175     def memSync(self, chrom="", index=False):
1176         """ makes a copy of the dataset into memory for faster access.
1177         Can be restricted to a "full" chromosome. Can also build the
1178         memory indices.
1179         """
1180         self.memcon = ""
1181         self.memcon = sqlite.connect(":memory:")
1182         self.initializeTables(self.memcon)
1183         cursor = self.dbcon.cursor()
1184         whereclause = ""
1185         if chrom != "":
1186             print "memSync %s" % chrom
1187             whereclause = " where chrom = '%s' " % chrom
1188             self.memChrom = chrom
1189         else:
1190             self.memChrom = ""
1191
1192         self.memcon.execute("PRAGMA temp_store = MEMORY")
1193         self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
1194         # copy metadata to memory
1195         self.memcon.execute("delete from metadata")
1196         results = cursor.execute("select name, value from metadata")
1197         results2 = []
1198         for row in results:
1199             results2.append((row["name"], row["value"]))
1200
1201         self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
1202
1203         self.copyDBEntriesToMemory("uniqs", whereclause)
1204         self.copyDBEntriesToMemory("multi", whereclause)
1205         if self.dataType == "RNA":
1206             self.copySpliceDBEntriesToMemory(whereclause)
1207
1208         if index:
1209             if chrom != "":
1210                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
1211                 self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
1212                 if self.dataType == "RNA":
1213                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
1214                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
1215             else:
1216                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1217                 self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1218                 if self.dataType == "RNA":
1219                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
1220                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
1221
1222         self.memBacked = True
1223         self.memcon.row_factory = sqlite.Row
1224         self.memcon.commit()
1225
1226
1227     def copyDBEntriesToMemory(self, dbName, whereClause=""):
1228         cursor = self.dbcon.cursor()
1229         sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
1230         destinationEntries = []
1231         for row in sourceEntries:
1232             destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
1233
1234         self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
1235
1236
1237     def copySpliceDBEntriesToMemory(self, whereClause=""):
1238         cursor = self.dbcon.cursor()
1239         sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
1240         destinationEntries = []
1241         for row in sourceEntries:
1242             destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
1243                                        row["weight"], row["flag"], row["mismatch"]))
1244
1245         self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries)
1246