ReadDataset.py

   1 import sqlite3 as sqlite
   2 import string
   3 import tempfile
   4 import shutil
   5 import os
   6 from array import array
   7 from commoncode import getReverseComplement, getConfigParser, getConfigOption
   8
   9 currentRDSVersion = "2.0"
  10
  11
  12 class ReadDatasetError(Exception):
  13     pass
  14
  15
  16 class ReadDataset():
  17     """ Class for storing reads from experiments. Assumes that custom scripts
  18     will translate incoming data into a format that can be inserted into the
  19     class using the insert* methods. Default class subtype ('DNA') includes
  20     tables for unique and multireads, whereas 'RNA' subtype also includes a
  21     splices table.
  22     """
  23
  24     def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
  25                  cache=False, reportCount=True):
  26         """ creates an rds datafile if initialize is set to true, otherwise
  27         will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
  28         """
  29         self.dbcon = ""
  30         self.memcon = ""
  31         self.dataType = ""
  32         self.rdsVersion = currentRDSVersion
  33         self.memBacked = False
  34         self.memChrom = ""
  35         self.memCursor = ""
  36         self.cachedDBFile = ""
  37
  38         if cache:
  39             if verbose:
  40                 print "caching ...."
  41
  42             self.cacheDB(datafile)
  43             dbFile = self.cachedDBFile
  44         else:
  45             dbFile = datafile
  46
  47         self.dbcon = sqlite.connect(dbFile)
  48         self.dbcon.row_factory = sqlite.Row
  49         self.dbcon.execute("PRAGMA temp_store = MEMORY")
  50         if initialize:
  51             if datasetType not in ["DNA", "RNA"]:
  52                 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
  53             else:
  54                 self.dataType = datasetType
  55
  56             self.initializeTables(self.dbcon)
  57         else:
  58             metadata = self.getMetadata("dataType")
  59             self.dataType = metadata["dataType"]
  60
  61         try:
  62             metadata = self.getMetadata("rdsVersion")
  63             self.rdsVersion = metadata["rdsVersion"]
  64         except:
  65             try:
  66                 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
  67             except IOError:
  68                 print "could not add rdsVersion - read-only ?"
  69                 self.rdsVersion = "pre-1.0"
  70
  71         if verbose:
  72             if initialize:
  73                 print "INITIALIZED dataset %s" % datafile
  74             else:
  75                 print "dataset %s" % datafile
  76
  77             metadata = self.getMetadata()
  78             print "metadata:"
  79             pnameList = metadata.keys()
  80             pnameList.sort()
  81             for pname in pnameList:
  82                 print "\t" + pname + "\t" + metadata[pname]
  83
  84             if reportCount:
  85                 ucount = self.getUniqsCount()
  86                 mcount = self.getMultiCount()
  87                 if self.dataType == "DNA" and not initialize:
  88                     try:
  89                         print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
  90                     except ValueError:
  91                         print "\n%s unique reads and %s multireads" % (ucount, mcount)
  92                 elif self.dataType == "RNA" and not initialize:
  93                     scount = self.getSplicesCount()
  94                     try:
  95                         print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
  96                     except ValueError:
  97                         print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
  98
  99             print "default cache size is %d pages" % self.getDefaultCacheSize()
 100             if self.hasIndex():
 101                 print "found index"
 102             else:
 103                 print "not indexed"
 104
 105
 106     def __len__(self):
 107         """ return the number of usable reads in the dataset.
 108         """
 109         total = self.getUniqsCount()
 110         total += self.getMultiCount()
 111
 112         if self.dataType == "RNA":
 113             total += self.getSplicesCount()
 114
 115         total = int(total)
 116
 117         return total
 118
 119
 120     def __del__(self):
 121         """ cleanup copy in local cache, if present.
 122         """
 123         if self.cachedDBFile != "":
 124             self.uncacheDB()
 125
 126
 127     def cacheDB(self, filename):
 128         """ copy geneinfoDB to a local cache.
 129         """
 130         configParser = getConfigParser()
 131         cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
 132         tempfile.tempdir = cisTemp
 133         self.cachedDBFile =  "%s.db" % tempfile.mktemp()
 134         shutil.copyfile(filename, self.cachedDBFile)
 135
 136
 137     def saveCacheDB(self, filename):
 138         """ copy geneinfoDB to a local cache.
 139         """
 140         shutil.copyfile(self.cachedDBFile, filename)
 141
 142
 143     def uncacheDB(self):
 144         """ delete geneinfoDB from local cache.
 145         """
 146         global cachedDBFile
 147         if self.cachedDBFile != "":
 148             try:
 149                 os.remove(self.cachedDBFile)
 150             except:
 151                 print "could not delete %s" % self.cachedDBFile
 152
 153             self.cachedDB = ""
 154
 155
 156     def attachDB(self, filename, dbName):
 157         """ attach another database file to the readDataset.
 158         """
 159         stmt = "attach '%s' as %s" % (filename, dbName)
 160         self.execute(stmt)
 161
 162
 163     def detachDB(self, dbName):
 164         """ detach a database file to the readDataset.
 165         """
 166         stmt = "detach %s" % (dbName)
 167         self.execute(stmt)
 168
 169
 170     def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
 171         """ import into current RDS the table (with columns destcolumns,
 172             with default all columns) from the database file asname,
 173             using the column specification of ascolumns (default all).
 174         """
 175         stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
 176         if flagged != "":
 177             stmt += " where flag = '%s' " % flagged
 178
 179         self.executeCommit(stmt)
 180
 181
 182     def getTables(self, dbName=""):
 183         """ get a list of table names in a particular database file.
 184         """
 185         resultList = []
 186         sql = self.getSqlCursor()
 187
 188         if dbName != "":
 189             dbName = "%s." % dbName
 190
 191         stmt = "select name from %ssqlite_master where type='table'" % dbName
 192         sql.execute(stmt)
 193         results = sql.fetchall()
 194
 195         for row in results:
 196             resultList.append(row["name"])
 197
 198         return resultList
 199
 200
 201     def getSqlCursor(self):
 202         if self.memBacked:
 203             sql = self.getMemCursor()
 204         else:
 205             sql = self.getFileCursor()
 206
 207         return sql
 208
 209
 210     def hasIndex(self):
 211         """ check whether the RDS file has at least one index.
 212         """
 213         stmt = "select count(*) from sqlite_master where type='index'"
 214         count = int(self.execute(stmt, returnResults=True)[0][0])
 215         if count > 0:
 216             return True
 217
 218         return False
 219
 220
 221     def initializeTables(self, dbConnection, cache=100000):
 222         """ creates table schema in a database connection, which is
 223         typically a database file or an in-memory database.
 224         """
 225         dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
 226         dbConnection.execute("create table metadata (name varchar, value varchar)")
 227         dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
 228         positionSchema = "start int, stop int"
 229         tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
 230         dbConnection.execute("create table uniqs %s" % tableSchema)
 231         dbConnection.execute("create table multi %s" % tableSchema)
 232         if self.dataType == "RNA":
 233             positionSchema = "startL int, stopL int, startR int, stopR int"
 234             tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
 235             dbConnection.execute("create table splices %s" % tableSchema)
 236
 237         dbConnection.commit()
 238
 239
 240     def getFileCursor(self):
 241         """ returns a cursor to file database for low-level (SQL)
 242         access to the data.
 243         """
 244         return self.dbcon.cursor()
 245
 246
 247     def getMemCursor(self):
 248         """ returns a cursor to memory database for low-level (SQL)
 249         access to the data.
 250         """
 251         return self.memcon.cursor()
 252
 253
 254     def getMetadata(self, valueName=""):
 255         """ returns a dictionary of metadata.
 256         """
 257         whereClause = ""
 258         resultsDict = {}
 259
 260         if valueName != "":
 261             whereClause = " where name='%s'" % valueName
 262
 263         sql = self.getSqlCursor()
 264
 265         sql.execute("select name, value from metadata %s" % whereClause)
 266         results = sql.fetchall()
 267
 268         for row in results:
 269             parameterName = row["name"]
 270             parameterValue = row["value"]
 271             if parameterName not in resultsDict:
 272                 resultsDict[parameterName] = parameterValue
 273             else:
 274                 trying = True
 275                 index = 2
 276                 while trying:
 277                     newName = string.join([parameterName, str(index)], ":")
 278                     if newName not in resultsDict:
 279                         resultsDict[newName] = parameterValue
 280                         trying = False
 281
 282                     index += 1
 283
 284         return resultsDict
 285
 286
 287     def getReadSize(self):
 288         """ returns readsize if defined in metadata.
 289         """
 290         metadata = self.getMetadata()
 291         if "readsize" not in metadata:
 292             raise ReadDatasetError("no readsize parameter defined")
 293         else:
 294             readSize = metadata["readsize"]
 295             if "import" in readSize:
 296                 readSize = readSize.split()[0]
 297
 298             readSize = int(readSize)
 299             if readSize < 0:
 300                 raise ReadDatasetError("readsize is negative")
 301
 302             return readSize
 303
 304
 305     def getDefaultCacheSize(self):
 306         """ returns the default cache size.
 307         """
 308         return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
 309
 310
 311     def getChromosomes(self, table="uniqs", fullChrom=True):
 312         """ returns a list of distinct chromosomes in table.
 313         """
 314         statement = "select distinct chrom from %s" % table
 315         sql = self.getSqlCursor()
 316
 317         sql.execute(statement)
 318         results = []
 319         for row in sql:
 320             if fullChrom:
 321                 if row["chrom"] not in results:
 322                     results.append(row["chrom"])
 323             else:
 324                 shortName = row["chrom"][3:]
 325                 if  len(shortName.strip()) > 0 and shortName not in results:
 326                     results.append(shortName)
 327
 328         results.sort()
 329
 330         return results
 331
 332
 333     def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
 334                          doMulti=False, doSplices=False):
 335         """ returns the maximum coordinate for reads on a given chromosome.
 336         """
 337         maxCoord = 0
 338
 339         if doUniqs:
 340             maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
 341
 342         if doSplices:
 343             spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
 344             maxCoord = max(spliceMax, maxCoord)
 345
 346         if doMulti:
 347             multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
 348             maxCoord = max(multiMax, maxCoord)
 349
 350         if verbose:
 351             print "%s maxCoord: %d" % (chrom, maxCoord)
 352
 353         return maxCoord
 354
 355
 356     def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
 357         maxCoord = 0
 358         sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
 359         sql = self.getSqlCursor()
 360         try:
 361             sql.execute(sqlStatement)
 362             maxCoord = int(sql.fetchall()[0][0])
 363         except:
 364             print "couldn't retrieve coordMax for chromosome %s" % chrom
 365
 366         return maxCoord
 367
 368
 369     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
 370                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
 371                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
 372                      readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
 373                      flagLike=False, strand='', combine5p=False):
 374         """ returns a dictionary of reads in a variety of formats
 375         and which can be restricted by chromosome or custom-flag.
 376         Returns unique reads by default, but can return multireads
 377         with doMulti set to True.
 378
 379         Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
 380         """
 381
 382         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
 383         if findallOptimize:
 384             selectQuery = "select start, sense, sum(weight)"
 385         else:
 386             selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
 387
 388         groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
 389         if doUniqs:
 390             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
 391             if doMulti:
 392                 stmt.append("UNION ALL")
 393                 stmt.append(selectQuery)
 394                 stmt.append("from multi")
 395                 stmt.append(whereQuery)
 396                 stmt.append(groupQuery)
 397         else:
 398             stmt = [selectQuery, "from multi", whereQuery]
 399
 400         if combine5p:
 401             if findallOptimize:
 402                 selectQuery = "select start, sense, weight, chrom"
 403
 404             if doUniqs:
 405                 subSelect = [selectQuery, "from uniqs", whereQuery]
 406                 if doMulti:
 407                     subSelect.append("union all")
 408                     subSelect.append(selectQuery)
 409                     subSelect.append("from multi")
 410                     subSelect.append(whereQuery)
 411             else:
 412                 subSelect = [selectQuery, "from multi", whereQuery]
 413
 414             sqlStmt = string.join(subSelect)
 415             if findallOptimize:
 416                 selectQuery = "select start, sense, sum(weight)"
 417
 418             stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
 419                     selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
 420
 421         if findallOptimize:
 422             if self.memBacked:
 423                 self.memcon.row_factory = None
 424                 sql = self.memcon.cursor()
 425             else:
 426                 self.dbcon.row_factory = None
 427                 sql = self.dbcon.cursor()
 428
 429             stmt.append("order by start")
 430         elif readIDDict:
 431             if self.memBacked:
 432                 sql = self.memcon.cursor()
 433             else:
 434                 sql = self.dbcon.cursor()
 435
 436             stmt.append("order by readID, start")
 437         else:
 438             if self.memBacked:
 439                 sql = self.memcon.cursor()
 440             else:
 441                 sql = self.dbcon.cursor()
 442
 443             stmt.append("order by chrom, start")
 444
 445         sqlQuery = string.join(stmt)
 446         sql.execute(sqlQuery)
 447
 448         resultsDict = {}
 449         if findallOptimize:
 450             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
 451             if self.memBacked:
 452                 self.memcon.row_factory = sqlite.Row
 453             else:
 454                 self.dbcon.row_factory = sqlite.Row
 455         else:
 456             currentChrom = ""
 457             currentReadID = ""
 458             pairID = 0
 459             for row in sql:
 460                 readID = row["readID"]
 461                 if fullChrom:
 462                     chrom = row["chrom"]
 463                 else:
 464                     chrom = row["chrom"][3:]
 465
 466                 if not readIDDict and chrom != currentChrom:
 467                     resultsDict[chrom] = []
 468                     currentChrom = chrom
 469                     dictKey = chrom
 470                 elif readIDDict:
 471                     theReadID = readID
 472                     if "::" in readID:
 473                         theReadID = readID.split("::")[0]
 474
 475                     if "/" in theReadID and withPairID:
 476                         (theReadID, pairID) = readID.split("/")
 477
 478                     if theReadID != currentReadID:
 479                         resultsDict[theReadID] = []
 480                         currentReadID = theReadID
 481                         dictKey = theReadID
 482
 483                 newrow = {"start": int(row["start"])}
 484                 if bothEnds:
 485                     newrow["stop"] = int(row["stop"])
 486
 487                 if not noSense:
 488                     newrow["sense"] = row["sense"]
 489
 490                 if withWeight:
 491                     newrow["weight"] = float(row["weight"])
 492
 493                 if withFlag:
 494                     newrow["flag"] = row["flag"]
 495
 496                 if withMismatch:
 497                     newrow["mismatch"] = row["mismatch"]
 498
 499                 if withID:
 500                     newrow["readID"] = readID
 501
 502                 if withChrom:
 503                     newrow["chrom"] = chrom
 504
 505                 if withPairID:
 506                     newrow["pairID"] = pairID
 507
 508                 resultsDict[dictKey].append(newrow)
 509
 510         return resultsDict
 511
 512
 513     def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
 514         if splice:
 515             startText = "startL"
 516             stopText = "stopR"
 517         else:
 518             startText = "start"
 519             stopText = "stop"
 520
 521         whereClause = []
 522         if chrom != "" and chrom != self.memChrom:
 523             whereClause.append("chrom = '%s'" % chrom)
 524
 525         if flag != "":
 526             if flagLike:
 527                 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
 528                 whereClause.append(flagLikeClause)
 529             else:
 530                 whereClause.append("flag = '%s'" % flag)
 531
 532         if start > -1:
 533             whereClause.append("%s > %d" % (startText, start))
 534
 535         if stop > -1:
 536             whereClause.append("%s < %d" % (stopText, stop))
 537
 538         if len(readLike) > 0:
 539             readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
 540             whereClause.append(readIDClause)
 541
 542         if hasMismatch:
 543             whereClause.append("mismatch != ''")
 544
 545         if strand in ["+", "-"]:
 546             whereClause.append("sense = '%s'" % strand)
 547
 548         if len(whereClause) > 0:
 549             whereStatement = string.join(whereClause, " and ")
 550             whereQuery = "where %s" % whereStatement
 551         else:
 552             whereQuery = ""
 553
 554         return whereQuery
 555
 556
 557     def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
 558
 559         selectClause = [baseSelect]
 560         if bothEnds:
 561             selectClause.append("stop")
 562
 563         if not noSense:
 564             selectClause.append("sense")
 565
 566         if withWeight:
 567             selectClause.append("weight")
 568
 569         if withFlag:
 570             selectClause.append("flag")
 571
 572         if withMismatch:
 573             selectClause.append("mismatch")
 574
 575         selectQuery = string.join(selectClause, ",")
 576
 577         return selectQuery
 578
 579
 580     def getReadGroupQuery(self, findallOptimize, limit, combine5p):
 581         groupBy = []
 582         if findallOptimize:
 583             groupBy = ["GROUP BY start, sense"]
 584
 585         if limit > 0 and not combine5p:
 586             groupBy.append("LIMIT %d" % limit)
 587
 588         groupQuery = string.join(groupBy)
 589
 590         return groupQuery
 591
 592
 593     def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
 594                        flag="", withWeight=False, withFlag=False, withMismatch=False,
 595                        withID=False, withChrom=False, withPairID=False, readIDDict=False,
 596                        splitRead=False, hasMismatch=False, flagLike=False, start=-1,
 597                        stop=-1, strand=""):
 598         """ returns a dictionary of spliced reads in a variety of
 599         formats and which can be restricted by chromosome or custom-flag.
 600         Returns unique spliced reads for now.
 601         """
 602         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
 603         selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
 604         selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
 605         if self.memBacked:
 606             sql = self.memcon.cursor()
 607         else:
 608             sql = self.dbcon.cursor()
 609
 610         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
 611         sql.execute(stmt)
 612         currentReadID = ""
 613         currentChrom = ""
 614         resultsDict = {}
 615         for row in sql:
 616             pairID = 0
 617             readID = row["readID"]
 618             if fullChrom:
 619                 chrom = row["chrom"]
 620             else:
 621                 chrom = row["chrom"][3:]
 622
 623             if not readIDDict and chrom != currentChrom:
 624                 resultsDict[chrom] = []
 625                 currentChrom = chrom
 626                 dictKey = chrom
 627             elif readIDDict:
 628                 if "/" in readID:
 629                     (theReadID, pairID) = readID.split("/")
 630                 else:
 631                     theReadID = readID
 632
 633                 if theReadID != currentReadID:
 634                     resultsDict[theReadID] = []
 635                     currentReadID = theReadID
 636                     dictKey = theReadID
 637
 638             newrow = {"startL": int(row["startL"])}
 639             newrow["stopL"] = int(row["stopL"])
 640             newrow["startR"] = int(row["startR"])
 641             newrow["stopR"] = int(row["stopR"])
 642             if not noSense:
 643                 newrow["sense"] = row["sense"]
 644
 645             if withWeight:
 646                 newrow["weight"] = float(row["weight"])
 647
 648             if withFlag:
 649                 newrow["flag"] = row["flag"]
 650
 651             if withMismatch:
 652                 newrow["mismatch"] = row["mismatch"]
 653
 654             if withID:
 655                 newrow["readID"] = readID
 656
 657             if withChrom:
 658                 newrow["chrom"] = chrom
 659
 660             if withPairID:
 661                 newrow["pairID"] = pairID
 662
 663             if splitRead:
 664                 leftDict = newrow.copy()
 665                 del leftDict["startR"]
 666                 del leftDict["stopR"]
 667                 rightDict = newrow
 668                 del rightDict["startL"]
 669                 del rightDict["stopL"]
 670                 resultsDict[dictKey].append(leftDict)
 671                 resultsDict[dictKey].append(rightDict)
 672             else:
 673                 resultsDict[dictKey].append(newrow)
 674
 675         return resultsDict
 676
 677
 678     def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
 679                   splices=False, reportCombined=True, sense="both"):
 680         """ return read counts for a given region.
 681         """
 682         ucount = 0
 683         mcount = 0
 684         scount = 0
 685         restrict = ""
 686         if sense in ["+", "-"]:
 687             restrict = " sense ='%s' " % sense
 688
 689         if uniqs:
 690             try:
 691                 ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
 692             except:
 693                 ucount = 0
 694
 695         if multi:
 696             try:
 697                 mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
 698             except:
 699                 mcount = 0
 700
 701         if splices:
 702             try:
 703                 scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
 704             except:
 705                 scount = 0
 706
 707         if reportCombined:
 708             total = ucount + mcount + scount
 709             return total
 710         else:
 711             return (ucount, mcount, scount)
 712
 713
 714     def getTotalCounts(self, chrom="", rmin="", rmax=""):
 715         """ return read counts for a given region.
 716         """
 717         return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
 718
 719
 720     def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
 721         """ returns the number of row in the uniqs table.
 722         """
 723         whereClause = []
 724         count = 0
 725
 726         if chrom !=""  and chrom != self.memChrom:
 727             whereClause = ["chrom='%s'" % chrom]
 728
 729         if rmin != "":
 730             whereClause.append("%s >= %s" % (startField, str(rmin)))
 731
 732         if rmax != "":
 733             whereClause.append("%s <= %s" % (startField, str(rmax)))
 734
 735         if restrict != "":
 736             whereClause.append(restrict)
 737
 738         if len(whereClause) > 0:
 739             whereStatement = string.join(whereClause, " and ")
 740             whereQuery = "where %s" % whereStatement
 741         else:
 742             whereQuery = ""
 743
 744         if self.memBacked:
 745             sql = self.memcon.cursor()
 746         else:
 747             sql = self.dbcon.cursor()
 748
 749         if distinct:
 750             sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
 751         else:
 752             sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
 753
 754         result = sql.fetchone()
 755
 756         try:
 757             count = int(result[0])
 758         except:
 759             count = 0
 760
 761         return count
 762
 763
 764     def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 765         """ returns the number of row in the splices table.
 766         """
 767         # TODO: if the rds type is DNA should this just return zero?
 768         return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
 769
 770
 771     def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 772         """ returns the number of distinct readIDs in the uniqs table.
 773         """
 774         return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
 775
 776
 777     def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 778         """ returns the total weight of readIDs in the multi table.
 779         """
 780         return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
 781
 782
 783     def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
 784         """ get readID's.
 785         """
 786         stmt = []
 787         if uniqs:
 788             stmt.append("select readID from uniqs")
 789
 790         if multi:
 791             stmt.append("select readID from multi")
 792
 793         if splices:
 794             stmt.append("select readID from splices")
 795
 796         if len(stmt) > 0:
 797             selectPart = string.join(stmt, " union ")
 798         else:
 799             selectPart = ""
 800
 801         limitPart = ""
 802         if limit > 0:
 803             limitPart = "LIMIT %d" % limit
 804
 805         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
 806         if self.memBacked:
 807             sql = self.memcon.cursor()
 808         else:
 809             sql = self.dbcon.cursor()
 810
 811         sql.execute(sqlQuery)
 812         result = sql.fetchall()
 813
 814         if paired:
 815             return [x[0].split("/")[0] for x in result]
 816         else:
 817             return [x[0] for x in result]
 818
 819
 820     def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
 821         """ returns the uniq and spliced mismatches in a dictionary.
 822         """
 823         readlen = self.getReadSize()
 824         if mischrom:
 825             hitChromList = [mischrom]
 826         else:
 827             hitChromList = self.getChromosomes()
 828             hitChromList.sort()
 829
 830         snpDict = {}
 831         for achrom in hitChromList:
 832             if verbose:
 833                 print "getting mismatches from chromosome %s" % (achrom)
 834
 835             snpDict[achrom] = []
 836             if useSplices and self.dataType == "RNA":
 837                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
 838                 spliceIDList = spliceDict.keys()
 839                 for spliceID in spliceIDList:
 840                     spliceEntry = spliceDict[spliceID][0]
 841                     startpos = spliceEntry["startL"]
 842                     lefthalf = spliceEntry["stopL"]
 843                     rightstart = spliceEntry["startR"]
 844                     sense = spliceEntry["sense"]
 845                     mismatches = spliceEntry["mismatch"]
 846                     spMismatchList = mismatches.split(",")
 847                     for mismatch in spMismatchList:
 848                         if "N" in mismatch:
 849                             continue
 850
 851                         change_len = len(mismatch)
 852                         if sense == "+":
 853                             change_from = mismatch[0]
 854                             change_base = mismatch[change_len-1]
 855                             change_pos = int(mismatch[1:change_len-1])
 856                         elif sense == "-":
 857                             change_from = getReverseComplement([mismatch[0]])
 858                             change_base = getReverseComplement([mismatch[change_len-1]])
 859                             change_pos = readlen - int(mismatch[1:change_len-1]) + 1
 860
 861                         firsthalf = int(lefthalf)-int(startpos)+1
 862                         secondhalf = 0
 863                         if int(change_pos) <= int(firsthalf):
 864                             change_at = startpos + change_pos - 1
 865                         else:
 866                             secondhalf = change_pos - firsthalf
 867                             change_at = rightstart + secondhalf
 868
 869                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
 870
 871             hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
 872             if achrom not in hitDict.keys():
 873                 continue
 874
 875             for readEntry in hitDict[achrom]:
 876                 start = readEntry["start"]
 877                 sense = readEntry["sense"]
 878                 mismatches = readEntry["mismatch"]
 879                 mismatchList = mismatches.split(",")
 880                 for mismatch in mismatchList:
 881                     if "N" in mismatch:
 882                         continue
 883
 884                     change_len = len(mismatch)
 885                     if sense == "+":
 886                         change_from = mismatch[0]
 887                         change_base = mismatch[change_len-1]
 888                         change_pos = int(mismatch[1:change_len-1])
 889                     elif sense == "-":
 890                         change_from = getReverseComplement([mismatch[0]])
 891                         change_base = getReverseComplement([mismatch[change_len-1]])
 892                         change_pos = readlen - int(mismatch[1:change_len-1]) + 1
 893
 894                     change_at = start + change_pos - 1
 895                     snpDict[achrom].append([start, change_at, change_base, change_from])
 896
 897         return snpDict
 898
 899
 900     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
 901                         useSplices=False, normalizationFactor=1.0, trackStrand=False,
 902                         keepStrand="both", shiftValue=0):
 903         """return a profile of the chromosome as an array of per-base read coverage....
 904             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
 905             Will also shift position of unique and multireads (but not splices) if shift is a natural number
 906         """
 907         metadata = self.getMetadata()
 908         try:
 909             readlen = int(metadata["readsize"])
 910         except KeyError:
 911             readlen = 0
 912
 913         dataType = metadata["dataType"]
 914         scale = 1. / normalizationFactor
 915         shift = {}
 916         shift["+"] = int(shiftValue)
 917         shift["-"] = -1 * int(shiftValue)
 918
 919         if cstop > 0:
 920             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
 921         else:
 922             lastNT = cstop - cstart + readlen + shift["+"]
 923
 924         chromModel = array("f",[0.] * lastNT)
 925         hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
 926         if cstart < 0:
 927             cstart = 0
 928
 929         for readEntry in hitDict[chromosome]:
 930             hstart = readEntry["start"]
 931             sense =  readEntry ["sense"]
 932             weight = readEntry["weight"]
 933             hstart = hstart - cstart + shift[sense]
 934             for currentpos in range(hstart,hstart+readlen):
 935                 try:
 936                     if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
 937                         chromModel[currentpos] += scale * weight
 938                     elif sense == "-" and keepStrand != "plusOnly":
 939                         chromModel[currentpos] -= scale * weight
 940                 except:
 941                     continue
 942
 943         del hitDict
 944         if useSplices and dataType == "RNA":
 945             if cstop > 0:
 946                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
 947             else:
 948                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
 949
 950             if chromosome in spliceDict:
 951                 for spliceEntry in spliceDict[chromosome]:
 952                     Lstart = spliceEntry["startL"]
 953                     Lstop = spliceEntry["stopL"]
 954                     Rstart = spliceEntry["startR"]
 955                     Rstop = spliceEntry["stopR"]
 956                     rsense = spliceEntry["sense"]
 957                     if (Rstop - cstart) < lastNT:
 958                         for index in range(abs(Lstop - Lstart)):
 959                             currentpos = Lstart - cstart + index
 960                             # we only track unique splices
 961                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
 962                                 chromModel[currentpos] += scale
 963                             elif rsense == "-" and keepStrand != "plusOnly":
 964                                 chromModel[currentpos] -= scale
 965
 966                         for index in range(abs(Rstop - Rstart)):
 967                             currentpos = Rstart - cstart + index
 968                             # we only track unique splices
 969                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
 970                                 chromModel[currentpos] += scale
 971                             elif rsense == "-" and keepStrand != "plusOnly":
 972                                 chromModel[currentpos] -= scale
 973
 974             del spliceDict
 975
 976         return chromModel
 977
 978
 979     def insertMetadata(self, valuesList):
 980         """ inserts a list of (pname, pvalue) into the metadata
 981         table.
 982         """
 983         self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
 984         self.dbcon.commit()
 985
 986
 987     def updateMetadata(self, pname, newValue, originalValue=""):
 988         """ update a metadata field given the original value and the new value.
 989         """
 990         stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
 991         if originalValue != "":
 992             stmt += " and value='%s' " % str(originalValue)
 993
 994         self.dbcon.execute(stmt)
 995         self.dbcon.commit()
 996
 997
 998     def insertUniqs(self, valuesList):
 999         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1000         into the uniqs table.
1001         """
1002         self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1003         self.dbcon.commit()
1004
1005
1006     def insertMulti(self, valuesList):
1007         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1008         into the multi table.
1009         """
1010         self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1011         self.dbcon.commit()
1012
1013
1014     def insertSplices(self, valuesList):
1015         """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
1016         into the splices table.
1017         """
1018         self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
1019         self.dbcon.commit()
1020
1021
1022     def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
1023         """ update reads on file database in a list region of regions for a chromosome to have a new flag.
1024             regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
1025             sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
1026         """
1027         restrict = ""
1028         if sense != "both":
1029             restrict = " and sense = ? "
1030
1031         if uniqs:
1032             self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1033
1034         if multi:
1035             self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1036
1037         if self.dataType == "RNA" and splices:
1038             self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
1039             self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
1040
1041         self.dbcon.commit()
1042
1043
1044     def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1045         """ set the flag fields in the entire dataset.
1046         """
1047         if uniqs:
1048             self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1049
1050         if multi:
1051             self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1052
1053         if self.dataType == "RNA" and splices:
1054             self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
1055
1056         self.dbcon.commit()
1057
1058
1059     def resetFlags(self, uniqs=True, multi=True, splices=True):
1060         """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
1061         """
1062         self.setFlags("", uniqs, multi, splices)
1063
1064
1065     def reweighMultireads(self, readList):
1066         self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
1067
1068
1069     def setSynchronousPragma(self, value="ON"):
1070         try:
1071             self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1072         except:
1073             print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
1074
1075
1076     def setDBcache(self, cache, default=False):
1077         self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
1078         if default:
1079             self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
1080
1081
1082     def execute(self, statement, returnResults=False):
1083         sql = self.getSqlCursor()
1084
1085         sql.execute(statement)
1086         if returnResults:
1087             result = sql.fetchall()
1088             return result
1089
1090
1091     def executeCommit(self, statement):
1092         self.execute(statement)
1093
1094         if self.memBacked:
1095             self.memcon.commit()
1096         else:
1097             self.dbcon.commit()
1098
1099
1100     def buildIndex(self, cache=100000):
1101         """ Builds the file indeces for the main tables.
1102             Cache is the number of 1.5 kb pages to keep in memory.
1103             100000 pages translates into 150MB of RAM, which is our default.
1104         """
1105         if cache > self.getDefaultCacheSize():
1106             self.setDBcache(cache)
1107         self.setSynchronousPragma("OFF")
1108         self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1109         print "built uPosIndex"
1110         self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
1111         print "built uChromIndex"
1112         self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1113         print "built mPosIndex"
1114         self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
1115         print "built mChromIndex"
1116
1117         if self.dataType == "RNA":
1118             self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
1119             print "built sPosIndex"
1120             self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
1121             print "built sPosIndex2"
1122             self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
1123             print "built sChromIndex"
1124
1125         self.dbcon.commit()
1126         self.setSynchronousPragma("ON")
1127
1128
1129     def dropIndex(self):
1130         """ drops the file indices for the main tables.
1131         """
1132         try:
1133             self.setSynchronousPragma("OFF")
1134             self.dbcon.execute("DROP INDEX uPosIndex")
1135             self.dbcon.execute("DROP INDEX uChromIndex")
1136             self.dbcon.execute("DROP INDEX mPosIndex")
1137             self.dbcon.execute("DROP INDEX mChromIndex")
1138
1139             if self.dataType == "RNA":
1140                 self.dbcon.execute("DROP INDEX sPosIndex")
1141                 try:
1142                     self.dbcon.execute("DROP INDEX sPosIndex2")
1143                 except:
1144                     pass
1145
1146                 self.dbcon.execute("DROP INDEX sChromIndex")
1147
1148             self.dbcon.commit()
1149         except:
1150             print "problem dropping index"
1151
1152         self.setSynchronousPragma("ON")
1153
1154
1155     def memSync(self, chrom="", index=False):
1156         """ makes a copy of the dataset into memory for faster access.
1157         Can be restricted to a "full" chromosome. Can also build the
1158         memory indices.
1159         """
1160         self.memcon = ""
1161         self.memcon = sqlite.connect(":memory:")
1162         self.initializeTables(self.memcon)
1163         cursor = self.dbcon.cursor()
1164         whereclause = ""
1165         if chrom != "":
1166             print "memSync %s" % chrom
1167             whereclause = " where chrom = '%s' " % chrom
1168             self.memChrom = chrom
1169         else:
1170             self.memChrom = ""
1171
1172         self.memcon.execute("PRAGMA temp_store = MEMORY")
1173         self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
1174         # copy metadata to memory
1175         self.memcon.execute("delete from metadata")
1176         results = cursor.execute("select name, value from metadata")
1177         results2 = []
1178         for row in results:
1179             results2.append((row["name"], row["value"]))
1180
1181         self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
1182
1183         self.copyDBEntriesToMemory("uniqs", whereclause)
1184         self.copyDBEntriesToMemory("multi", whereclause)
1185         if self.dataType == "RNA":
1186             self.copySpliceDBEntriesToMemory(whereclause)
1187
1188         if index:
1189             if chrom != "":
1190                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
1191                 self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
1192                 if self.dataType == "RNA":
1193                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
1194                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
1195             else:
1196                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1197                 self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1198                 if self.dataType == "RNA":
1199                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
1200                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
1201
1202         self.memBacked = True
1203         self.memcon.row_factory = sqlite.Row
1204         self.memcon.commit()
1205
1206
1207     def copyDBEntriesToMemory(self, dbName, whereClause=""):
1208         cursor = self.dbcon.cursor()
1209         sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
1210         destinationEntries = []
1211         for row in sourceEntries:
1212             destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
1213
1214         self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
1215
1216
1217     def copySpliceDBEntriesToMemory(self, whereClause=""):
1218         cursor = self.dbcon.cursor()
1219         sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
1220         destinationEntries = []
1221         for row in sourceEntries:
1222             destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
1223                                        row["weight"], row["flag"], row["mismatch"]))
1224
1225         self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)
1226