ReadDataset.py

   1 import sqlite3 as sqlite
   2 import string
   3 import tempfile
   4 import shutil
   5 import os
   6 from array import array
   7 from commoncode import getReverseComplement, getConfigParser, getConfigOption
   8
   9 currentRDSVersion = "2.0"
  10
  11
  12 class ReadDatasetError(Exception):
  13     pass
  14
  15
  16 class ReadDataset():
  17     """ Class for storing reads from experiments. Assumes that custom scripts
  18     will translate incoming data into a format that can be inserted into the
  19     class using the insert* methods. Default class subtype ('DNA') includes
  20     tables for unique and multireads, whereas 'RNA' subtype also includes a
  21     splices table.
  22     """
  23
  24     def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
  25                  cache=False, reportCount=True):
  26         """ creates an rds datafile if initialize is set to true, otherwise
  27         will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
  28         """
  29         self.dbcon = ""
  30         self.memcon = ""
  31         self.dataType = ""
  32         self.rdsVersion = currentRDSVersion
  33         self.memBacked = False
  34         self.memChrom = ""
  35         self.memCursor = ""
  36         self.cachedDBFile = ""
  37
  38         if cache:
  39             if verbose:
  40                 print "caching ...."
  41
  42             self.cacheDB(datafile)
  43             dbFile = self.cachedDBFile
  44         else:
  45             dbFile = datafile
  46
  47         self.dbcon = sqlite.connect(dbFile)
  48         self.dbcon.row_factory = sqlite.Row
  49         self.dbcon.execute("PRAGMA temp_store = MEMORY")
  50         if initialize:
  51             if datasetType not in ["DNA", "RNA"]:
  52                 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
  53             else:
  54                 self.dataType = datasetType
  55
  56             self.initializeTables(self.dbcon)
  57         else:
  58             metadata = self.getMetadata("dataType")
  59             self.dataType = metadata["dataType"]
  60
  61         try:
  62             metadata = self.getMetadata("rdsVersion")
  63             self.rdsVersion = metadata["rdsVersion"]
  64         except:
  65             try:
  66                 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
  67             except IOError:
  68                 print "could not add rdsVersion - read-only ?"
  69                 self.rdsVersion = "pre-1.0"
  70
  71         if verbose:
  72             if initialize:
  73                 print "INITIALIZED dataset %s" % datafile
  74             else:
  75                 print "dataset %s" % datafile
  76
  77             metadata = self.getMetadata()
  78             print "metadata:"
  79             pnameList = metadata.keys()
  80             pnameList.sort()
  81             for pname in pnameList:
  82                 print "\t" + pname + "\t" + metadata[pname]
  83
  84             if reportCount:
  85                 ucount = self.getUniqsCount()
  86                 mcount = self.getMultiCount()
  87                 if self.dataType == "DNA" and not initialize:
  88                     try:
  89                         print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
  90                     except ValueError:
  91                         print "\n%s unique reads and %s multireads" % (ucount, mcount)
  92                 elif self.dataType == "RNA" and not initialize:
  93                     scount = self.getSplicesCount()
  94                     try:
  95                         print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
  96                     except ValueError:
  97                         print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
  98
  99             print "default cache size is %d pages" % self.getDefaultCacheSize()
 100             if self.hasIndex():
 101                 print "found index"
 102             else:
 103                 print "not indexed"
 104
 105
 106     def __len__(self):
 107         """ return the number of usable reads in the dataset.
 108         """
 109         total = self.getUniqsCount()
 110         total += self.getMultiCount()
 111
 112         if self.dataType == "RNA":
 113             total += self.getSplicesCount()
 114
 115         total = int(total)
 116
 117         return total
 118
 119
 120     def __del__(self):
 121         """ cleanup copy in local cache, if present.
 122         """
 123         if self.cachedDBFile != "":
 124             self.uncacheDB()
 125
 126
 127     def cacheDB(self, filename):
 128         """ copy geneinfoDB to a local cache.
 129         """
 130         configParser = getConfigParser()
 131         cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
 132         tempfile.tempdir = cisTemp
 133         self.cachedDBFile =  "%s.db" % tempfile.mktemp()
 134         shutil.copyfile(filename, self.cachedDBFile)
 135
 136
 137     def saveCacheDB(self, filename):
 138         """ copy geneinfoDB to a local cache.
 139         """
 140         shutil.copyfile(self.cachedDBFile, filename)
 141
 142
 143     def uncacheDB(self):
 144         """ delete geneinfoDB from local cache.
 145         """
 146         global cachedDBFile
 147         if self.cachedDBFile != "":
 148             try:
 149                 os.remove(self.cachedDBFile)
 150             except:
 151                 print "could not delete %s" % self.cachedDBFile
 152
 153             self.cachedDB = ""
 154
 155
 156     def attachDB(self, filename, dbName):
 157         """ attach another database file to the readDataset.
 158         """
 159         stmt = "attach '%s' as %s" % (filename, dbName)
 160         self.execute(stmt)
 161
 162
 163     def detachDB(self, dbName):
 164         """ detach a database file to the readDataset.
 165         """
 166         stmt = "detach %s" % (dbName)
 167         self.execute(stmt)
 168
 169
 170     def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
 171         """ import into current RDS the table (with columns destcolumns,
 172             with default all columns) from the database file asname,
 173             using the column specification of ascolumns (default all).
 174         """
 175         stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
 176         if flagged != "":
 177             stmt += " where flag = '%s' " % flagged
 178
 179         self.executeCommit(stmt)
 180
 181
 182     def getTables(self, dbName=""):
 183         """ get a list of table names in a particular database file.
 184         """
 185         resultList = []
 186         sql = self.getSqlCursor()
 187
 188         if dbName != "":
 189             dbName = "%s." % dbName
 190
 191         stmt = "select name from %ssqlite_master where type='table'" % dbName
 192         sql.execute(stmt)
 193         results = sql.fetchall()
 194
 195         for row in results:
 196             resultList.append(row["name"])
 197
 198         return resultList
 199
 200
 201     def getSqlCursor(self):
 202         if self.memBacked:
 203             sql = self.getMemCursor()
 204         else:
 205             sql = self.getFileCursor()
 206
 207         return sql
 208
 209
 210     def getMemCursor(self):
 211         """ returns a cursor to memory database for low-level (SQL)
 212         access to the data.
 213         """
 214         return self.memcon.cursor()
 215
 216
 217     def getFileCursor(self):
 218         """ returns a cursor to file database for low-level (SQL)
 219         access to the data.
 220         """
 221         return self.dbcon.cursor()
 222
 223
 224     def hasIndex(self):
 225         """ return True if the RDS file has at least one index.
 226         """
 227         stmt = "select count(*) from sqlite_master where type='index'"
 228         count = int(self.execute(stmt, returnResults=True)[0][0])
 229
 230         return count > 0
 231
 232
 233     def initializeTables(self, dbConnection, cache=100000):
 234         """ creates table schema in a database connection, which is
 235         typically a database file or an in-memory database.
 236         """
 237         dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
 238         dbConnection.execute("create table metadata (name varchar, value varchar)")
 239         dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
 240         positionSchema = "start int, stop int"
 241         tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
 242         dbConnection.execute("create table uniqs %s" % tableSchema)
 243         dbConnection.execute("create table multi %s" % tableSchema)
 244         if self.dataType == "RNA":
 245             positionSchema = "startL int, stopL int, startR int, stopR int"
 246             tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
 247             dbConnection.execute("create table splices %s" % tableSchema)
 248
 249         dbConnection.commit()
 250
 251
 252     def getMetadata(self, valueName=""):
 253         """ returns a dictionary of metadata.
 254         """
 255         whereClause = ""
 256         resultsDict = {}
 257
 258         if valueName != "":
 259             whereClause = " where name='%s'" % valueName
 260
 261         sql = self.getSqlCursor()
 262
 263         sql.execute("select name, value from metadata %s" % whereClause)
 264         results = sql.fetchall()
 265
 266         for row in results:
 267             parameterName = row["name"]
 268             parameterValue = row["value"]
 269             if parameterName not in resultsDict:
 270                 resultsDict[parameterName] = parameterValue
 271             else:
 272                 trying = True
 273                 index = 2
 274                 while trying:
 275                     newName = string.join([parameterName, str(index)], ":")
 276                     if newName not in resultsDict:
 277                         resultsDict[newName] = parameterValue
 278                         trying = False
 279
 280                     index += 1
 281
 282         return resultsDict
 283
 284
 285     def getReadSize(self):
 286         """ returns readsize if defined in metadata.
 287         """
 288         metadata = self.getMetadata()
 289         if "readsize" not in metadata:
 290             raise ReadDatasetError("no readsize parameter defined")
 291         else:
 292             readSize = metadata["readsize"]
 293             if "import" in readSize:
 294                 readSize = readSize.split()[0]
 295
 296             readSize = int(readSize)
 297             if readSize < 0:
 298                 raise ReadDatasetError("readsize is negative")
 299
 300             return readSize
 301
 302
 303     def getDefaultCacheSize(self):
 304         """ returns the default cache size.
 305         """
 306         return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
 307
 308
 309     def getChromosomes(self, table="uniqs", fullChrom=True):
 310         """ returns a sorted list of distinct chromosomes in table.
 311         """
 312         statement = "select distinct chrom from %s" % table
 313         sql = self.getSqlCursor()
 314
 315         sql.execute(statement)
 316         results = []
 317         for row in sql:
 318             if fullChrom:
 319                 if row["chrom"] not in results:
 320                     results.append(row["chrom"])
 321             else:
 322                 shortName = row["chrom"][3:]
 323                 if  len(shortName.strip()) > 0 and shortName not in results:
 324                     results.append(shortName)
 325
 326         results.sort()
 327
 328         return results
 329
 330
 331     def getMaxCoordinate(self, chrom, doUniqs=True,
 332                          doMulti=False, doSplices=False):
 333         """ returns the maximum coordinate for reads on a given chromosome.
 334         """
 335         maxCoord = 0
 336
 337         if doUniqs:
 338             maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
 339
 340         if doSplices:
 341             spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
 342             maxCoord = max(spliceMax, maxCoord)
 343
 344         if doMulti:
 345             multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
 346             maxCoord = max(multiMax, maxCoord)
 347
 348         return maxCoord
 349
 350
 351     def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
 352         maxCoord = 0
 353         sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
 354         sql = self.getSqlCursor()
 355         try:
 356             sql.execute(sqlStatement)
 357             maxCoord = int(sql.fetchall()[0][0])
 358         except:
 359             print "couldn't retrieve coordMax for chromosome %s" % chrom
 360
 361         return maxCoord
 362
 363
 364     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
 365                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
 366                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
 367                      readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
 368                      flagLike=False, strand='', combine5p=False):
 369         """ returns a dictionary of reads in a variety of formats
 370         and which can be restricted by chromosome or custom-flag.
 371         Returns unique reads by default, but can return multireads
 372         with doMulti set to True.
 373
 374         """
 375         #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
 376
 377         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
 378         if findallOptimize:
 379             selectQuery = "select start, sense, sum(weight)"
 380         else:
 381             selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
 382
 383         groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
 384         if doUniqs:
 385             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
 386             if doMulti:
 387                 stmt.append("UNION ALL")
 388                 stmt.append(selectQuery)
 389                 stmt.append("from multi")
 390                 stmt.append(whereQuery)
 391                 stmt.append(groupQuery)
 392         else:
 393             stmt = [selectQuery, "from multi", whereQuery]
 394
 395         if combine5p:
 396             if findallOptimize:
 397                 selectQuery = "select start, sense, weight, chrom"
 398
 399             if doUniqs:
 400                 subSelect = [selectQuery, "from uniqs", whereQuery]
 401                 if doMulti:
 402                     subSelect.append("union all")
 403                     subSelect.append(selectQuery)
 404                     subSelect.append("from multi")
 405                     subSelect.append(whereQuery)
 406             else:
 407                 subSelect = [selectQuery, "from multi", whereQuery]
 408
 409             sqlStmt = string.join(subSelect)
 410             if findallOptimize:
 411                 selectQuery = "select start, sense, sum(weight)"
 412
 413             stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
 414                     selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
 415
 416         if findallOptimize:
 417             if self.memBacked:
 418                 self.memcon.row_factory = None
 419             else:
 420                 self.dbcon.row_factory = None
 421
 422             stmt.append("order by start")
 423         elif readIDDict:
 424             stmt.append("order by readID, start")
 425         else:
 426             stmt.append("order by chrom, start")
 427
 428         sql = self.getSqlCursor()
 429         sqlQuery = string.join(stmt)
 430         sql.execute(sqlQuery)
 431
 432         resultsDict = {}
 433         if findallOptimize:
 434             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
 435             if self.memBacked:
 436                 self.memcon.row_factory = sqlite.Row
 437             else:
 438                 self.dbcon.row_factory = sqlite.Row
 439         else:
 440             currentChrom = ""
 441             currentReadID = ""
 442             pairID = 0
 443             for row in sql:
 444                 readID = row["readID"]
 445                 if fullChrom:
 446                     chrom = row["chrom"]
 447                 else:
 448                     chrom = row["chrom"][3:]
 449
 450                 if not readIDDict and chrom != currentChrom:
 451                     resultsDict[chrom] = []
 452                     currentChrom = chrom
 453                     dictKey = chrom
 454                 elif readIDDict:
 455                     theReadID = readID
 456                     if "::" in readID:
 457                         theReadID = readID.split("::")[0]
 458
 459                     if "/" in theReadID and withPairID:
 460                         (theReadID, pairID) = readID.split("/")
 461
 462                     if theReadID != currentReadID:
 463                         resultsDict[theReadID] = []
 464                         currentReadID = theReadID
 465                         dictKey = theReadID
 466
 467                 newrow = {"start": int(row["start"])}
 468                 if bothEnds:
 469                     newrow["stop"] = int(row["stop"])
 470
 471                 if not noSense:
 472                     newrow["sense"] = row["sense"]
 473
 474                 if withWeight:
 475                     newrow["weight"] = float(row["weight"])
 476
 477                 if withFlag:
 478                     newrow["flag"] = row["flag"]
 479
 480                 if withMismatch:
 481                     newrow["mismatch"] = row["mismatch"]
 482
 483                 if withID:
 484                     newrow["readID"] = readID
 485
 486                 if withChrom:
 487                     newrow["chrom"] = chrom
 488
 489                 if withPairID:
 490                     newrow["pairID"] = pairID
 491
 492                 resultsDict[dictKey].append(newrow)
 493
 494         return resultsDict
 495
 496
 497     def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
 498         if splice:
 499             startText = "startL"
 500             stopText = "stopR"
 501         else:
 502             startText = "start"
 503             stopText = "stop"
 504
 505         whereClause = []
 506         if chrom != "" and chrom != self.memChrom:
 507             whereClause.append("chrom = '%s'" % chrom)
 508
 509         if flag != "":
 510             if flagLike:
 511                 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
 512                 whereClause.append(flagLikeClause)
 513             else:
 514                 whereClause.append("flag = '%s'" % flag)
 515
 516         if start > -1:
 517             whereClause.append("%s > %d" % (startText, start))
 518
 519         if stop > -1:
 520             whereClause.append("%s < %d" % (stopText, stop))
 521
 522         if len(readLike) > 0:
 523             readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
 524             whereClause.append(readIDClause)
 525
 526         if hasMismatch:
 527             whereClause.append("mismatch != ''")
 528
 529         if strand in ["+", "-"]:
 530             whereClause.append("sense = '%s'" % strand)
 531
 532         if len(whereClause) > 0:
 533             whereStatement = string.join(whereClause, " and ")
 534             whereQuery = "where %s" % whereStatement
 535         else:
 536             whereQuery = ""
 537
 538         return whereQuery
 539
 540
 541     def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
 542
 543         selectClause = [baseSelect]
 544         if bothEnds:
 545             selectClause.append("stop")
 546
 547         if not noSense:
 548             selectClause.append("sense")
 549
 550         if withWeight:
 551             selectClause.append("weight")
 552
 553         if withFlag:
 554             selectClause.append("flag")
 555
 556         if withMismatch:
 557             selectClause.append("mismatch")
 558
 559         selectQuery = string.join(selectClause, ",")
 560
 561         return selectQuery
 562
 563
 564     def getReadGroupQuery(self, findallOptimize, limit, combine5p):
 565         groupBy = []
 566         if findallOptimize:
 567             groupBy = ["GROUP BY start, sense"]
 568
 569         if limit > 0 and not combine5p:
 570             groupBy.append("LIMIT %d" % limit)
 571
 572         groupQuery = string.join(groupBy)
 573
 574         return groupQuery
 575
 576
 577     def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
 578                        flag="", withWeight=False, withFlag=False, withMismatch=False,
 579                        withID=False, withChrom=False, withPairID=False, readIDDict=False,
 580                        splitRead=False, hasMismatch=False, flagLike=False, start=-1,
 581                        stop=-1, strand=""):
 582         """ returns a dictionary of spliced reads in a variety of
 583         formats and which can be restricted by chromosome or custom-flag.
 584         Returns unique spliced reads for now.
 585         """
 586         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
 587         selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
 588         selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
 589         sql = self.getSqlCursor()
 590
 591         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
 592         sql.execute(stmt)
 593         currentReadID = ""
 594         currentChrom = ""
 595         resultsDict = {}
 596         for row in sql:
 597             pairID = 0
 598             readID = row["readID"]
 599             if fullChrom:
 600                 chrom = row["chrom"]
 601             else:
 602                 chrom = row["chrom"][3:]
 603
 604             if not readIDDict and chrom != currentChrom:
 605                 resultsDict[chrom] = []
 606                 currentChrom = chrom
 607                 dictKey = chrom
 608             elif readIDDict:
 609                 if "/" in readID:
 610                     (theReadID, pairID) = readID.split("/")
 611                 else:
 612                     theReadID = readID
 613
 614                 if theReadID != currentReadID:
 615                     resultsDict[theReadID] = []
 616                     currentReadID = theReadID
 617                     dictKey = theReadID
 618
 619             newrow = {"startL": int(row["startL"])}
 620             newrow["stopL"] = int(row["stopL"])
 621             newrow["startR"] = int(row["startR"])
 622             newrow["stopR"] = int(row["stopR"])
 623             if not noSense:
 624                 newrow["sense"] = row["sense"]
 625
 626             if withWeight:
 627                 newrow["weight"] = float(row["weight"])
 628
 629             if withFlag:
 630                 newrow["flag"] = row["flag"]
 631
 632             if withMismatch:
 633                 newrow["mismatch"] = row["mismatch"]
 634
 635             if withID:
 636                 newrow["readID"] = readID
 637
 638             if withChrom:
 639                 newrow["chrom"] = chrom
 640
 641             if withPairID:
 642                 newrow["pairID"] = pairID
 643
 644             if splitRead:
 645                 leftDict = newrow.copy()
 646                 del leftDict["startR"]
 647                 del leftDict["stopR"]
 648                 rightDict = newrow
 649                 del rightDict["startL"]
 650                 del rightDict["stopL"]
 651                 resultsDict[dictKey].append(leftDict)
 652                 resultsDict[dictKey].append(rightDict)
 653             else:
 654                 resultsDict[dictKey].append(newrow)
 655
 656         return resultsDict
 657
 658
 659     def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
 660                   splices=False, reportCombined=True, sense="both"):
 661         """ return read counts for a given region.
 662         """
 663         ucount = 0
 664         mcount = 0
 665         scount = 0
 666         restrict = ""
 667         if sense in ["+", "-"]:
 668             restrict = " sense ='%s' " % sense
 669
 670         if uniqs:
 671             try:
 672                 ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
 673             except:
 674                 ucount = 0
 675
 676         if multi:
 677             try:
 678                 mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
 679             except:
 680                 mcount = 0
 681
 682         if splices:
 683             try:
 684                 scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
 685             except:
 686                 scount = 0
 687
 688         if reportCombined:
 689             total = ucount + mcount + scount
 690             return total
 691         else:
 692             return (ucount, mcount, scount)
 693
 694
 695     def getTotalCounts(self, chrom="", rmin="", rmax=""):
 696         """ return read counts for a given region.
 697         """
 698         return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
 699
 700
 701     def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
 702         """ returns the number of row in the specified table.
 703         """
 704         whereClause = []
 705         count = 0
 706
 707         if chrom !=""  and chrom != self.memChrom:
 708             whereClause = ["chrom='%s'" % chrom]
 709
 710         if rmin != "":
 711             whereClause.append("%s >= %s" % (startField, str(rmin)))
 712
 713         if rmax != "":
 714             whereClause.append("%s <= %s" % (startField, str(rmax)))
 715
 716         if restrict != "":
 717             whereClause.append(restrict)
 718
 719         if len(whereClause) > 0:
 720             whereStatement = string.join(whereClause, " and ")
 721             whereQuery = "where %s" % whereStatement
 722         else:
 723             whereQuery = ""
 724
 725         sql = self.getSqlCursor()
 726
 727         if distinct:
 728             sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
 729         else:
 730             sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
 731
 732         result = sql.fetchone()
 733
 734         try:
 735             count = int(result[0])
 736         except:
 737             count = 0
 738
 739         return count
 740
 741
 742     def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 743         """ returns the number of row in the splices table.
 744         """
 745         # TODO: if the rds type is DNA should this just return zero?
 746         return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
 747
 748
 749     def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 750         """ returns the number of distinct readIDs in the uniqs table.
 751         """
 752         return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
 753
 754
 755     def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 756         """ returns the total weight of readIDs in the multi table.
 757         """
 758         return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
 759
 760
 761     def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
 762         """ get readID's.
 763         """
 764         stmt = []
 765         if uniqs:
 766             stmt.append("select readID from uniqs")
 767
 768         if multi:
 769             stmt.append("select readID from multi")
 770
 771         if splices:
 772             stmt.append("select readID from splices")
 773
 774         if len(stmt) > 0:
 775             selectPart = string.join(stmt, " union ")
 776         else:
 777             selectPart = ""
 778
 779         limitPart = ""
 780         if limit > 0:
 781             limitPart = "LIMIT %d" % limit
 782
 783         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
 784         sql = self.getSqlCursor()
 785         sql.execute(sqlQuery)
 786         result = sql.fetchall()
 787
 788         if paired:
 789             return [x[0].split("/")[0] for x in result]
 790         else:
 791             return [x[0] for x in result]
 792
 793
 794     def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
 795         """ returns the uniq and spliced mismatches in a dictionary.
 796         """
 797         readlen = self.getReadSize()
 798         if mischrom:
 799             hitChromList = [mischrom]
 800         else:
 801             hitChromList = self.getChromosomes()
 802             hitChromList.sort()
 803
 804         snpDict = {}
 805         for achrom in hitChromList:
 806             if verbose:
 807                 print "getting mismatches from chromosome %s" % (achrom)
 808
 809             snpDict[achrom] = []
 810             if useSplices and self.dataType == "RNA":
 811                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
 812                 spliceIDList = spliceDict.keys()
 813                 for spliceID in spliceIDList:
 814                     spliceEntry = spliceDict[spliceID][0]
 815                     startpos = spliceEntry["startL"]
 816                     lefthalf = spliceEntry["stopL"]
 817                     rightstart = spliceEntry["startR"]
 818                     sense = spliceEntry["sense"]
 819                     mismatches = spliceEntry["mismatch"]
 820                     spMismatchList = mismatches.split(",")
 821                     for mismatch in spMismatchList:
 822                         if "N" in mismatch:
 823                             continue
 824
 825                         change_len = len(mismatch)
 826                         if sense == "+":
 827                             change_from = mismatch[0]
 828                             change_base = mismatch[change_len-1]
 829                             change_pos = int(mismatch[1:change_len-1])
 830                         elif sense == "-":
 831                             change_from = getReverseComplement([mismatch[0]])
 832                             change_base = getReverseComplement([mismatch[change_len-1]])
 833                             change_pos = readlen - int(mismatch[1:change_len-1]) + 1
 834
 835                         firsthalf = int(lefthalf)-int(startpos)+1
 836                         secondhalf = 0
 837                         if int(change_pos) <= int(firsthalf):
 838                             change_at = startpos + change_pos - 1
 839                         else:
 840                             secondhalf = change_pos - firsthalf
 841                             change_at = rightstart + secondhalf
 842
 843                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
 844
 845             hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
 846             if achrom not in hitDict.keys():
 847                 continue
 848
 849             for readEntry in hitDict[achrom]:
 850                 start = readEntry["start"]
 851                 sense = readEntry["sense"]
 852                 mismatches = readEntry["mismatch"]
 853                 mismatchList = mismatches.split(",")
 854                 for mismatch in mismatchList:
 855                     if "N" in mismatch:
 856                         continue
 857
 858                     change_len = len(mismatch)
 859                     if sense == "+":
 860                         change_from = mismatch[0]
 861                         change_base = mismatch[change_len-1]
 862                         change_pos = int(mismatch[1:change_len-1])
 863                     elif sense == "-":
 864                         change_from = getReverseComplement([mismatch[0]])
 865                         change_base = getReverseComplement([mismatch[change_len-1]])
 866                         change_pos = readlen - int(mismatch[1:change_len-1]) + 1
 867
 868                     change_at = start + change_pos - 1
 869                     snpDict[achrom].append([start, change_at, change_base, change_from])
 870
 871         return snpDict
 872
 873
 874     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
 875                         useSplices=False, normalizationFactor=1.0, trackStrand=False,
 876                         keepStrand="both", shiftValue=0):
 877         """return a profile of the chromosome as an array of per-base read coverage....
 878             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
 879             Will also shift position of unique and multireads (but not splices) if shift is a natural number
 880         """
 881         metadata = self.getMetadata()
 882         try:
 883             readlen = int(metadata["readsize"])
 884         except KeyError:
 885             readlen = 0
 886
 887         dataType = metadata["dataType"]
 888         scale = 1. / normalizationFactor
 889         shift = {}
 890         shift["+"] = int(shiftValue)
 891         shift["-"] = -1 * int(shiftValue)
 892
 893         if cstop > 0:
 894             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
 895         else:
 896             lastNT = cstop - cstart + readlen + shift["+"]
 897
 898         chromModel = array("f",[0.] * lastNT)
 899         hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
 900         if cstart < 0:
 901             cstart = 0
 902
 903         for readEntry in hitDict[chromosome]:
 904             hstart = readEntry["start"]
 905             sense =  readEntry ["sense"]
 906             weight = readEntry["weight"]
 907             hstart = hstart - cstart + shift[sense]
 908             for currentpos in range(hstart,hstart+readlen):
 909                 try:
 910                     if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
 911                         chromModel[currentpos] += scale * weight
 912                     elif sense == "-" and keepStrand != "plusOnly":
 913                         chromModel[currentpos] -= scale * weight
 914                 except:
 915                     continue
 916
 917         del hitDict
 918         if useSplices and dataType == "RNA":
 919             if cstop > 0:
 920                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
 921             else:
 922                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
 923
 924             if chromosome in spliceDict:
 925                 for spliceEntry in spliceDict[chromosome]:
 926                     Lstart = spliceEntry["startL"]
 927                     Lstop = spliceEntry["stopL"]
 928                     Rstart = spliceEntry["startR"]
 929                     Rstop = spliceEntry["stopR"]
 930                     rsense = spliceEntry["sense"]
 931                     if (Rstop - cstart) < lastNT:
 932                         for index in range(abs(Lstop - Lstart)):
 933                             currentpos = Lstart - cstart + index
 934                             # we only track unique splices
 935                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
 936                                 chromModel[currentpos] += scale
 937                             elif rsense == "-" and keepStrand != "plusOnly":
 938                                 chromModel[currentpos] -= scale
 939
 940                         for index in range(abs(Rstop - Rstart)):
 941                             currentpos = Rstart - cstart + index
 942                             # we only track unique splices
 943                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
 944                                 chromModel[currentpos] += scale
 945                             elif rsense == "-" and keepStrand != "plusOnly":
 946                                 chromModel[currentpos] -= scale
 947
 948             del spliceDict
 949
 950         return chromModel
 951
 952
 953     def insertMetadata(self, valuesList):
 954         """ inserts a list of (pname, pvalue) into the metadata
 955         table.
 956         """
 957         self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
 958         self.dbcon.commit()
 959
 960
 961     def updateMetadata(self, pname, newValue, originalValue=""):
 962         """ update a metadata field given the original value and the new value.
 963         """
 964         stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
 965         if originalValue != "":
 966             stmt += " and value='%s' " % str(originalValue)
 967
 968         self.dbcon.execute(stmt)
 969         self.dbcon.commit()
 970
 971
 972     def insertUniqs(self, valuesList):
 973         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
 974         into the uniqs table.
 975         """
 976         self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
 977         self.dbcon.commit()
 978
 979
 980     def insertMulti(self, valuesList):
 981         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
 982         into the multi table.
 983         """
 984         self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
 985         self.dbcon.commit()
 986
 987
 988     def insertSplices(self, valuesList):
 989         """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
 990         into the splices table.
 991         """
 992         self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
 993         self.dbcon.commit()
 994
 995
 996     def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
 997         """ update reads on file database in a list region of regions for a chromosome to have a new flag.
 998             regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
 999             sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
1000         """
1001         restrict = ""
1002         if sense != "both":
1003             restrict = " and sense = ? "
1004
1005         if uniqs:
1006             self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1007
1008         if multi:
1009             self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1010
1011         if self.dataType == "RNA" and splices:
1012             self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
1013             self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
1014
1015         self.dbcon.commit()
1016
1017
1018     def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1019         """ set the flag fields in the entire dataset.
1020         """
1021         if uniqs:
1022             self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1023
1024         if multi:
1025             self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1026
1027         if self.dataType == "RNA" and splices:
1028             self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
1029
1030         self.dbcon.commit()
1031
1032
1033     def resetFlags(self, uniqs=True, multi=True, splices=True):
1034         """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
1035         """
1036         self.setFlags("", uniqs, multi, splices)
1037
1038
1039     def reweighMultireads(self, readList):
1040         self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
1041
1042
1043     def setSynchronousPragma(self, value="ON"):
1044         try:
1045             self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1046         except:
1047             print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
1048
1049
1050     def setDBcache(self, cache, default=False):
1051         self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
1052         if default:
1053             self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
1054
1055
1056     def execute(self, statement, returnResults=False):
1057         sql = self.getSqlCursor()
1058
1059         sql.execute(statement)
1060         if returnResults:
1061             result = sql.fetchall()
1062             return result
1063
1064
1065     def executeCommit(self, statement):
1066         self.execute(statement)
1067
1068         if self.memBacked:
1069             self.memcon.commit()
1070         else:
1071             self.dbcon.commit()
1072
1073
1074     def buildIndex(self, cache=100000):
1075         """ Builds the file indeces for the main tables.
1076             Cache is the number of 1.5 kb pages to keep in memory.
1077             100000 pages translates into 150MB of RAM, which is our default.
1078         """
1079         if cache > self.getDefaultCacheSize():
1080             self.setDBcache(cache)
1081         self.setSynchronousPragma("OFF")
1082         self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1083         print "built uPosIndex"
1084         self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
1085         print "built uChromIndex"
1086         self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1087         print "built mPosIndex"
1088         self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
1089         print "built mChromIndex"
1090
1091         if self.dataType == "RNA":
1092             self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
1093             print "built sPosIndex"
1094             self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
1095             print "built sPosIndex2"
1096             self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
1097             print "built sChromIndex"
1098
1099         self.dbcon.commit()
1100         self.setSynchronousPragma("ON")
1101
1102
1103     def dropIndex(self):
1104         """ drops the file indices for the main tables.
1105         """
1106         try:
1107             self.setSynchronousPragma("OFF")
1108             self.dbcon.execute("DROP INDEX uPosIndex")
1109             self.dbcon.execute("DROP INDEX uChromIndex")
1110             self.dbcon.execute("DROP INDEX mPosIndex")
1111             self.dbcon.execute("DROP INDEX mChromIndex")
1112
1113             if self.dataType == "RNA":
1114                 self.dbcon.execute("DROP INDEX sPosIndex")
1115                 try:
1116                     self.dbcon.execute("DROP INDEX sPosIndex2")
1117                 except:
1118                     pass
1119
1120                 self.dbcon.execute("DROP INDEX sChromIndex")
1121
1122             self.dbcon.commit()
1123         except:
1124             print "problem dropping index"
1125
1126         self.setSynchronousPragma("ON")
1127
1128
1129     def memSync(self, chrom="", index=False):
1130         """ makes a copy of the dataset into memory for faster access.
1131         Can be restricted to a "full" chromosome. Can also build the
1132         memory indices.
1133         """
1134         self.memcon = ""
1135         self.memcon = sqlite.connect(":memory:")
1136         self.initializeTables(self.memcon)
1137         cursor = self.dbcon.cursor()
1138         whereclause = ""
1139         if chrom != "":
1140             print "memSync %s" % chrom
1141             whereclause = " where chrom = '%s' " % chrom
1142             self.memChrom = chrom
1143         else:
1144             self.memChrom = ""
1145
1146         self.memcon.execute("PRAGMA temp_store = MEMORY")
1147         self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
1148         # copy metadata to memory
1149         self.memcon.execute("delete from metadata")
1150         results = cursor.execute("select name, value from metadata")
1151         results2 = []
1152         for row in results:
1153             results2.append((row["name"], row["value"]))
1154
1155         self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
1156
1157         self.copyDBEntriesToMemory("uniqs", whereclause)
1158         self.copyDBEntriesToMemory("multi", whereclause)
1159         if self.dataType == "RNA":
1160             self.copySpliceDBEntriesToMemory(whereclause)
1161
1162         if index:
1163             if chrom != "":
1164                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
1165                 self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
1166                 if self.dataType == "RNA":
1167                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
1168                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
1169             else:
1170                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1171                 self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1172                 if self.dataType == "RNA":
1173                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
1174                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
1175
1176         self.memBacked = True
1177         self.memcon.row_factory = sqlite.Row
1178         self.memcon.commit()
1179
1180
1181     def copyDBEntriesToMemory(self, dbName, whereClause=""):
1182         cursor = self.dbcon.cursor()
1183         sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
1184         destinationEntries = []
1185         for row in sourceEntries:
1186             destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
1187
1188         self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
1189
1190
1191     def copySpliceDBEntriesToMemory(self, whereClause=""):
1192         cursor = self.dbcon.cursor()
1193         sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
1194         destinationEntries = []
1195         for row in sourceEntries:
1196             destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
1197                                        row["weight"], row["flag"], row["mismatch"]))
1198
1199         self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)
1200