ReadDataset.py

   1 import sqlite3 as sqlite
   2 import string
   3 import tempfile
   4 import shutil
   5 import os
   6 from array import array
   7 from commoncode import getReverseComplement, getConfigParser, getConfigOption
   8
   9 currentRDSVersion = "2.0"
  10
  11
  12 class ReadDatasetError(Exception):
  13     pass
  14
  15
  16 class ReadDataset():
  17     """ Class for storing reads from experiments. Assumes that custom scripts
  18     will translate incoming data into a format that can be inserted into the
  19     class using the insert* methods. Default class subtype ('DNA') includes
  20     tables for unique and multireads, whereas 'RNA' subtype also includes a
  21     splices table.
  22     """
  23
  24     def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
  25                  cache=False, reportCount=True):
  26         """ creates an rds datafile if initialize is set to true, otherwise
  27         will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
  28         """
  29         self.dbcon = ""
  30         self.memcon = ""
  31         self.dataType = ""
  32         self.rdsVersion = currentRDSVersion
  33         self.memBacked = False
  34         self.memChrom = ""
  35         self.memCursor = ""
  36         self.cachedDBFile = ""
  37
  38         if cache:
  39             if verbose:
  40                 print "caching ...."
  41
  42             self.cacheDB(datafile)
  43             dbFile = self.cachedDBFile
  44         else:
  45             dbFile = datafile
  46
  47         self.dbcon = sqlite.connect(dbFile)
  48         self.dbcon.row_factory = sqlite.Row
  49         self.dbcon.execute("PRAGMA temp_store = MEMORY")
  50         if initialize:
  51             if datasetType not in ["DNA", "RNA"]:
  52                 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
  53             else:
  54                 self.dataType = datasetType
  55
  56             self.initializeTables(self.dbcon)
  57         else:
  58             metadata = self.getMetadata("dataType")
  59             self.dataType = metadata["dataType"]
  60
  61         try:
  62             metadata = self.getMetadata("rdsVersion")
  63             self.rdsVersion = metadata["rdsVersion"]
  64         except:
  65             try:
  66                 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
  67             except IOError:
  68                 print "could not add rdsVersion - read-only ?"
  69                 self.rdsVersion = "pre-1.0"
  70
  71         if verbose:
  72             if initialize:
  73                 print "INITIALIZED dataset %s" % datafile
  74             else:
  75                 print "dataset %s" % datafile
  76
  77             metadata = self.getMetadata()
  78             print "metadata:"
  79             pnameList = metadata.keys()
  80             pnameList.sort()
  81             for pname in pnameList:
  82                 print "\t" + pname + "\t" + metadata[pname]
  83
  84             if reportCount:
  85                 ucount = self.getUniqsCount()
  86                 mcount = self.getMultiCount()
  87                 if self.dataType == "DNA" and not initialize:
  88                     try:
  89                         print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
  90                     except ValueError:
  91                         print "\n%s unique reads and %s multireads" % (ucount, mcount)
  92                 elif self.dataType == "RNA" and not initialize:
  93                     scount = self.getSplicesCount()
  94                     try:
  95                         print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
  96                     except ValueError:
  97                         print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
  98
  99             print "default cache size is %d pages" % self.getDefaultCacheSize()
 100             if self.hasIndex():
 101                 print "found index"
 102             else:
 103                 print "not indexed"
 104
 105
 106     def __len__(self):
 107         """ return the number of usable reads in the dataset.
 108         """
 109         total = self.getUniqsCount()
 110         total += self.getMultiCount()
 111
 112         if self.dataType == "RNA":
 113             total += self.getSplicesCount()
 114
 115         total = int(total)
 116
 117         return total
 118
 119
 120     def __del__(self):
 121         """ cleanup copy in local cache, if present.
 122         """
 123         if self.cachedDBFile != "":
 124             self.uncacheDB()
 125
 126
 127     def cacheDB(self, filename):
 128         """ copy geneinfoDB to a local cache.
 129         """
 130         configParser = getConfigParser()
 131         cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
 132         tempfile.tempdir = cisTemp
 133         self.cachedDBFile =  "%s.db" % tempfile.mktemp()
 134         shutil.copyfile(filename, self.cachedDBFile)
 135
 136
 137     def saveCacheDB(self, filename):
 138         """ copy geneinfoDB to a local cache.
 139         """
 140         shutil.copyfile(self.cachedDBFile, filename)
 141
 142
 143     def uncacheDB(self):
 144         """ delete geneinfoDB from local cache.
 145         """
 146         global cachedDBFile
 147         if self.cachedDBFile != "":
 148             try:
 149                 os.remove(self.cachedDBFile)
 150             except:
 151                 print "could not delete %s" % self.cachedDBFile
 152
 153             self.cachedDB = ""
 154
 155
 156     def attachDB(self, filename, dbName):
 157         """ attach another database file to the readDataset.
 158         """
 159         stmt = "attach '%s' as %s" % (filename, dbName)
 160         self.execute(stmt)
 161
 162
 163     def detachDB(self, dbName):
 164         """ detach a database file to the readDataset.
 165         """
 166         stmt = "detach %s" % (dbName)
 167         self.execute(stmt)
 168
 169
 170     def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
 171         """ import into current RDS the table (with columns destcolumns,
 172             with default all columns) from the database file asname,
 173             using the column specification of ascolumns (default all).
 174         """
 175         stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
 176         if flagged != "":
 177             stmt += " where flag = '%s' " % flagged
 178
 179         self.executeCommit(stmt)
 180
 181
 182     def getTables(self, dbName=""):
 183         """ get a list of table names in a particular database file.
 184         """
 185         resultList = []
 186         sql = self.getSqlCursor()
 187
 188         if dbName != "":
 189             dbName = "%s." % dbName
 190
 191         stmt = "select name from %ssqlite_master where type='table'" % dbName
 192         sql.execute(stmt)
 193         results = sql.fetchall()
 194
 195         for row in results:
 196             resultList.append(row["name"])
 197
 198         return resultList
 199
 200
 201     def getSqlCursor(self):
 202         if self.memBacked:
 203             sql = self.getMemCursor()
 204         else:
 205             sql = self.getFileCursor()
 206
 207         return sql
 208
 209
 210     def hasIndex(self):
 211         """ check whether the RDS file has at least one index.
 212         """
 213         stmt = "select count(*) from sqlite_master where type='index'"
 214         count = int(self.execute(stmt, returnResults=True)[0][0])
 215         if count > 0:
 216             return True
 217
 218         return False
 219
 220
 221     def initializeTables(self, dbConnection, cache=100000):
 222         """ creates table schema in a database connection, which is
 223         typically a database file or an in-memory database.
 224         """
 225         dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
 226         dbConnection.execute("create table metadata (name varchar, value varchar)")
 227         dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
 228         positionSchema = "start int, stop int"
 229         tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
 230         dbConnection.execute("create table uniqs %s" % tableSchema)
 231         dbConnection.execute("create table multi %s" % tableSchema)
 232         if self.dataType == "RNA":
 233             positionSchema = "startL int, stopL int, startR int, stopR int"
 234             tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
 235             dbConnection.execute("create table splices %s" % tableSchema)
 236
 237         dbConnection.commit()
 238
 239
 240     def getFileCursor(self):
 241         """ returns a cursor to file database for low-level (SQL)
 242         access to the data.
 243         """
 244         return self.dbcon.cursor()
 245
 246
 247     def getMemCursor(self):
 248         """ returns a cursor to memory database for low-level (SQL)
 249         access to the data.
 250         """
 251         return self.memcon.cursor()
 252
 253
 254     def getMetadata(self, valueName=""):
 255         """ returns a dictionary of metadata.
 256         """
 257         whereClause = ""
 258         resultsDict = {}
 259
 260         if valueName != "":
 261             whereClause = " where name='%s'" % valueName
 262
 263         sql = self.getSqlCursor()
 264
 265         sql.execute("select name, value from metadata %s" % whereClause)
 266         results = sql.fetchall()
 267
 268         for row in results:
 269             parameterName = row["name"]
 270             parameterValue = row["value"]
 271             if parameterName not in resultsDict:
 272                 resultsDict[parameterName] = parameterValue
 273             else:
 274                 trying = True
 275                 index = 2
 276                 while trying:
 277                     newName = string.join([parameterName, str(index)], ":")
 278                     if newName not in resultsDict:
 279                         resultsDict[newName] = parameterValue
 280                         trying = False
 281
 282                     index += 1
 283
 284         return resultsDict
 285
 286
 287     def getReadSize(self):
 288         """ returns readsize if defined in metadata.
 289         """
 290         metadata = self.getMetadata()
 291         if "readsize" not in metadata:
 292             raise ReadDatasetError("no readsize parameter defined")
 293         else:
 294             readSize = metadata["readsize"]
 295             if "import" in readSize:
 296                 readSize = readSize.split()[0]
 297
 298             readSize = int(readSize)
 299             if readSize < 0:
 300                 raise ReadDatasetError("readsize is negative")
 301
 302             return readSize
 303
 304
 305     def getDefaultCacheSize(self):
 306         """ returns the default cache size.
 307         """
 308         return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
 309
 310
 311     def getChromosomes(self, table="uniqs", fullChrom=True):
 312         """ returns a list of distinct chromosomes in table.
 313         """
 314         statement = "select distinct chrom from %s" % table
 315         sql = self.getSqlCursor()
 316
 317         sql.execute(statement)
 318         results = []
 319         for row in sql:
 320             if fullChrom:
 321                 if row["chrom"] not in results:
 322                     results.append(row["chrom"])
 323             else:
 324                 shortName = row["chrom"][3:]
 325                 if  len(shortName.strip()) > 0 and shortName not in results:
 326                     results.append(shortName)
 327
 328         results.sort()
 329
 330         return results
 331
 332
 333     def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
 334                          doMulti=False, doSplices=False):
 335         """ returns the maximum coordinate for reads on a given chromosome.
 336         """
 337         maxCoord = 0
 338
 339         if doUniqs:
 340             maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
 341
 342         if doSplices:
 343             spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
 344             maxCoord = max(spliceMax, maxCoord)
 345
 346         if doMulti:
 347             multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
 348             maxCoord = max(multiMax, maxCoord)
 349
 350         if verbose:
 351             print "%s maxCoord: %d" % (chrom, maxCoord)
 352
 353         return maxCoord
 354
 355
 356     def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
 357         maxCoord = 0
 358         sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
 359         sql = self.getSqlCursor()
 360         try:
 361             sql.execute(sqlStatement)
 362             maxCoord = int(sql.fetchall()[0][0])
 363         except:
 364             print "couldn't retrieve coordMax for chromosome %s" % chrom
 365
 366         return maxCoord
 367
 368
 369     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
 370                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
 371                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
 372                      readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
 373                      flagLike=False, strand='', combine5p=False):
 374         """ returns a dictionary of reads in a variety of formats
 375         and which can be restricted by chromosome or custom-flag.
 376         Returns unique reads by default, but can return multireads
 377         with doMulti set to True.
 378
 379         Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
 380         """
 381
 382         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
 383         if findallOptimize:
 384             selectQuery = "select start, sense, sum(weight)"
 385         else:
 386             selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
 387
 388         groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
 389         if doUniqs:
 390             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
 391             if doMulti:
 392                 stmt.append("UNION ALL")
 393                 stmt.append(selectQuery)
 394                 stmt.append("from multi")
 395                 stmt.append(whereQuery)
 396                 stmt.append(groupQuery)
 397         else:
 398             stmt = [selectQuery, "from multi", whereQuery]
 399
 400         if combine5p:
 401             if findallOptimize:
 402                 selectQuery = "select start, sense, weight, chrom"
 403
 404             if doUniqs:
 405                 subSelect = [selectQuery, "from uniqs", whereQuery]
 406                 if doMulti:
 407                     subSelect.append("union all")
 408                     subSelect.append(selectQuery)
 409                     subSelect.append("from multi")
 410                     subSelect.append(whereQuery)
 411             else:
 412                 subSelect = [selectQuery, "from multi", whereQuery]
 413
 414             sqlStmt = string.join(subSelect)
 415             if findallOptimize:
 416                 selectQuery = "select start, sense, sum(weight)"
 417
 418             stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
 419                     selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
 420
 421         if findallOptimize:
 422             if self.memBacked:
 423                 self.memcon.row_factory = None
 424                 sql = self.memcon.cursor()
 425             else:
 426                 self.dbcon.row_factory = None
 427                 sql = self.dbcon.cursor()
 428
 429             stmt.append("order by start")
 430         elif readIDDict:
 431             if self.memBacked:
 432                 sql = self.memcon.cursor()
 433             else:
 434                 sql = self.dbcon.cursor()
 435
 436             stmt.append("order by readID, start")
 437         else:
 438             if self.memBacked:
 439                 sql = self.memcon.cursor()
 440             else:
 441                 sql = self.dbcon.cursor()
 442
 443             stmt.append("order by chrom, start")
 444
 445         sqlQuery = string.join(stmt)
 446         sql.execute(sqlQuery)
 447
 448         resultsDict = {}
 449         if findallOptimize:
 450             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
 451             if self.memBacked:
 452                 self.memcon.row_factory = sqlite.Row
 453             else:
 454                 self.dbcon.row_factory = sqlite.Row
 455         else:
 456             currentChrom = ""
 457             currentReadID = ""
 458             pairID = 0
 459             for row in sql:
 460                 readID = row["readID"]
 461                 if fullChrom:
 462                     chrom = row["chrom"]
 463                 else:
 464                     chrom = row["chrom"][3:]
 465
 466                 if not readIDDict and chrom != currentChrom:
 467                     resultsDict[chrom] = []
 468                     currentChrom = chrom
 469                     dictKey = chrom
 470                 elif readIDDict:
 471                     theReadID = readID
 472                     if "::" in readID:
 473                         theReadID = readID.split("::")[0]
 474
 475                     if "/" in theReadID and withPairID:
 476                         (theReadID, pairID) = readID.split("/")
 477
 478                     if theReadID != currentReadID:
 479                         resultsDict[theReadID] = []
 480                         currentReadID = theReadID
 481                         dictKey = theReadID
 482
 483                 newrow = {"start": int(row["start"])}
 484                 if bothEnds:
 485                     newrow["stop"] = int(row["stop"])
 486
 487                 if not noSense:
 488                     newrow["sense"] = row["sense"]
 489
 490                 if withWeight:
 491                     newrow["weight"] = float(row["weight"])
 492
 493                 if withFlag:
 494                     newrow["flag"] = row["flag"]
 495
 496                 if withMismatch:
 497                     newrow["mismatch"] = row["mismatch"]
 498
 499                 if withID:
 500                     newrow["readID"] = readID
 501
 502                 if withChrom:
 503                     newrow["chrom"] = chrom
 504
 505                 if withPairID:
 506                     newrow["pairID"] = pairID
 507
 508                 resultsDict[dictKey].append(newrow)
 509
 510         return resultsDict
 511
 512
 513     def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
 514         if splice:
 515             startText = "startL"
 516             stopText = "stopR"
 517         else:
 518             startText = "start"
 519             stopText = "stop"
 520
 521         whereClause = []
 522         if chrom != "" and chrom != self.memChrom:
 523             whereClause.append("chrom = '%s'" % chrom)
 524
 525         if flag != "":
 526             if flagLike:
 527                 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
 528                 whereClause.append(flagLikeClause)
 529             else:
 530                 whereClause.append("flag = '%s'" % flag)
 531
 532         if start > -1:
 533             whereClause.append("%s > %d" % (startText, start))
 534
 535         if stop > -1:
 536             whereClause.append("%s < %d" % (stopText, stop))
 537
 538         if len(readLike) > 0:
 539             readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
 540             whereClause.append(readIDClause)
 541
 542         if hasMismatch:
 543             whereClause.append("mismatch != ''")
 544
 545         if strand in ["+", "-"]:
 546             whereClause.append("sense = '%s'" % strand)
 547
 548         if len(whereClause) > 0:
 549             whereStatement = string.join(whereClause, " and ")
 550             whereQuery = "where %s" % whereStatement
 551         else:
 552             whereQuery = ""
 553
 554         return whereQuery
 555
 556
 557     def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
 558
 559         selectClause = [baseSelect]
 560         if bothEnds:
 561             selectClause.append("stop")
 562
 563         if not noSense:
 564             selectClause.append("sense")
 565
 566         if withWeight:
 567             selectClause.append("weight")
 568
 569         if withFlag:
 570             selectClause.append("flag")
 571
 572         if withMismatch:
 573             selectClause.append("mismatch")
 574
 575         selectQuery = string.join(selectClause, ",")
 576
 577         return selectQuery
 578
 579
 580     def getReadGroupQuery(self, findallOptimize, limit, combine5p):
 581         groupBy = []
 582         if findallOptimize:
 583             groupBy = ["GROUP BY start, sense"]
 584
 585         if limit > 0 and not combine5p:
 586             groupBy.append("LIMIT %d" % limit)
 587
 588         groupQuery = string.join(groupBy)
 589
 590         return groupQuery
 591
 592
 593     def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
 594                        flag="", withWeight=False, withFlag=False, withMismatch=False,
 595                        withID=False, withChrom=False, withPairID=False, readIDDict=False,
 596                        splitRead=False, hasMismatch=False, flagLike=False, start=-1,
 597                        stop=-1, strand=""):
 598         """ returns a dictionary of spliced reads in a variety of
 599         formats and which can be restricted by chromosome or custom-flag.
 600         Returns unique spliced reads for now.
 601         """
 602         whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
 603         selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
 604         selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
 605         if self.memBacked:
 606             sql = self.memcon.cursor()
 607         else:
 608             sql = self.dbcon.cursor()
 609
 610         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
 611         sql.execute(stmt)
 612         currentReadID = ""
 613         currentChrom = ""
 614         resultsDict = {}
 615         for row in sql:
 616             pairID = 0
 617             readID = row["readID"]
 618             if fullChrom:
 619                 chrom = row["chrom"]
 620             else:
 621                 chrom = row["chrom"][3:]
 622
 623             if not readIDDict and chrom != currentChrom:
 624                 resultsDict[chrom] = []
 625                 currentChrom = chrom
 626                 dictKey = chrom
 627             elif readIDDict:
 628                 if "/" in readID:
 629                     (theReadID, pairID) = readID.split("/")
 630                 else:
 631                     theReadID = readID
 632
 633                 if theReadID != currentReadID:
 634                     resultsDict[theReadID] = []
 635                     currentReadID = theReadID
 636                     dictKey = theReadID
 637
 638             newrow = {"startL": int(row["startL"])}
 639             newrow["stopL"] = int(row["stopL"])
 640             newrow["startR"] = int(row["startR"])
 641             newrow["stopR"] = int(row["stopR"])
 642             if not noSense:
 643                 newrow["sense"] = row["sense"]
 644
 645             if withWeight:
 646                 newrow["weight"] = float(row["weight"])
 647
 648             if withFlag:
 649                 newrow["flag"] = row["flag"]
 650
 651             if withMismatch:
 652                 newrow["mismatch"] = row["mismatch"]
 653
 654             if withID:
 655                 newrow["readID"] = readID
 656
 657             if withChrom:
 658                 newrow["chrom"] = chrom
 659
 660             if withPairID:
 661                 newrow["pairID"] = pairID
 662
 663             if splitRead:
 664                 leftDict = newrow.copy()
 665                 del leftDict["startR"]
 666                 del leftDict["stopR"]
 667                 rightDict = newrow
 668                 del rightDict["startL"]
 669                 del rightDict["stopL"]
 670                 resultsDict[dictKey].append(leftDict)
 671                 resultsDict[dictKey].append(rightDict)
 672             else:
 673                 resultsDict[dictKey].append(newrow)
 674
 675         return resultsDict
 676
 677
 678     def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
 679                   splices=False, reportCombined=True, sense="both"):
 680         """ return read counts for a given region.
 681         """
 682         ucount = 0
 683         mcount = 0
 684         scount = 0
 685         restrict = ""
 686         if sense in ["+", "-"]:
 687             restrict = " sense ='%s' " % sense
 688
 689         if uniqs:
 690             try:
 691                 ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
 692             except:
 693                 ucount = 0
 694
 695         if multi:
 696             try:
 697                 mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
 698             except:
 699                 mcount = 0
 700
 701         if splices:
 702             try:
 703                 scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
 704             except:
 705                 scount = 0
 706
 707         if reportCombined:
 708             total = ucount + mcount + scount
 709             return total
 710         else:
 711             return (ucount, mcount, scount)
 712
 713
 714     def getTotalCounts(self, chrom="", rmin="", rmax=""):
 715         """ return read counts for a given region.
 716         """
 717         return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
 718
 719
 720     def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
 721         """ returns the number of row in the uniqs table.
 722         """
 723         whereClause = []
 724         count = 0
 725
 726         if chrom !=""  and chrom != self.memChrom:
 727             whereClause = ["chrom='%s'" % chrom]
 728
 729         if rmin != "":
 730             whereClause.append("%s >= %s" % (startField, str(rmin)))
 731
 732         if rmax != "":
 733             whereClause.append("%s <= %s" % (startField, str(rmax)))
 734
 735         if restrict != "":
 736             whereClause.append(restrict)
 737
 738         if len(whereClause) > 0:
 739             whereStatement = string.join(whereClause, " and ")
 740             whereQuery = "where %s" % whereStatement
 741         else:
 742             whereQuery = ""
 743
 744         if self.memBacked:
 745             sql = self.memcon.cursor()
 746         else:
 747             sql = self.dbcon.cursor()
 748
 749         if distinct:
 750             sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
 751         else:
 752             sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
 753
 754         result = sql.fetchone()
 755
 756         try:
 757             count = int(result[0])
 758         except:
 759             count = 0
 760
 761         return count
 762
 763
 764     def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 765         """ returns the number of row in the splices table.
 766         """
 767         return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
 768
 769
 770     def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 771         """ returns the number of distinct readIDs in the uniqs table.
 772         """
 773         return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
 774
 775
 776     def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 777         """ returns the total weight of readIDs in the multi table.
 778         """
 779         return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
 780
 781
 782     def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
 783         """ get readID's.
 784         """
 785         stmt = []
 786         if uniqs:
 787             stmt.append("select readID from uniqs")
 788
 789         if multi:
 790             stmt.append("select readID from multi")
 791
 792         if splices:
 793             stmt.append("select readID from splices")
 794
 795         if len(stmt) > 0:
 796             selectPart = string.join(stmt, " union ")
 797         else:
 798             selectPart = ""
 799
 800         limitPart = ""
 801         if limit > 0:
 802             limitPart = "LIMIT %d" % limit
 803
 804         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
 805         if self.memBacked:
 806             sql = self.memcon.cursor()
 807         else:
 808             sql = self.dbcon.cursor()
 809
 810         sql.execute(sqlQuery)
 811         result = sql.fetchall()
 812
 813         if paired:
 814             return [x[0].split("/")[0] for x in result]
 815         else:
 816             return [x[0] for x in result]
 817
 818
 819     def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
 820         """ returns the uniq and spliced mismatches in a dictionary.
 821         """
 822         readlen = self.getReadSize()
 823         if mischrom:
 824             hitChromList = [mischrom]
 825         else:
 826             hitChromList = self.getChromosomes()
 827             hitChromList.sort()
 828
 829         snpDict = {}
 830         for achrom in hitChromList:
 831             if verbose:
 832                 print "getting mismatches from chromosome %s" % (achrom)
 833
 834             snpDict[achrom] = []
 835             if useSplices and self.dataType == "RNA":
 836                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
 837                 spliceIDList = spliceDict.keys()
 838                 for spliceID in spliceIDList:
 839                     spliceEntry = spliceDict[spliceID][0]
 840                     startpos = spliceEntry["startL"]
 841                     lefthalf = spliceEntry["stopL"]
 842                     rightstart = spliceEntry["startR"]
 843                     sense = spliceEntry["sense"]
 844                     mismatches = spliceEntry["mismatch"]
 845                     spMismatchList = mismatches.split(",")
 846                     for mismatch in spMismatchList:
 847                         if "N" in mismatch:
 848                             continue
 849
 850                         change_len = len(mismatch)
 851                         if sense == "+":
 852                             change_from = mismatch[0]
 853                             change_base = mismatch[change_len-1]
 854                             change_pos = int(mismatch[1:change_len-1])
 855                         elif sense == "-":
 856                             change_from = getReverseComplement([mismatch[0]])
 857                             change_base = getReverseComplement([mismatch[change_len-1]])
 858                             change_pos = readlen - int(mismatch[1:change_len-1]) + 1
 859
 860                         firsthalf = int(lefthalf)-int(startpos)+1
 861                         secondhalf = 0
 862                         if int(change_pos) <= int(firsthalf):
 863                             change_at = startpos + change_pos - 1
 864                         else:
 865                             secondhalf = change_pos - firsthalf
 866                             change_at = rightstart + secondhalf
 867
 868                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
 869
 870             hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
 871             if achrom not in hitDict.keys():
 872                 continue
 873
 874             for readEntry in hitDict[achrom]:
 875                 start = readEntry["start"]
 876                 sense = readEntry["sense"]
 877                 mismatches = readEntry["mismatch"]
 878                 mismatchList = mismatches.split(",")
 879                 for mismatch in mismatchList:
 880                     if "N" in mismatch:
 881                         continue
 882
 883                     change_len = len(mismatch)
 884                     if sense == "+":
 885                         change_from = mismatch[0]
 886                         change_base = mismatch[change_len-1]
 887                         change_pos = int(mismatch[1:change_len-1])
 888                     elif sense == "-":
 889                         change_from = getReverseComplement([mismatch[0]])
 890                         change_base = getReverseComplement([mismatch[change_len-1]])
 891                         change_pos = readlen - int(mismatch[1:change_len-1]) + 1
 892
 893                     change_at = start + change_pos - 1
 894                     snpDict[achrom].append([start, change_at, change_base, change_from])
 895
 896         return snpDict
 897
 898
 899     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
 900                         useSplices=False, normalizationFactor=1.0, trackStrand=False,
 901                         keepStrand="both", shiftValue=0):
 902         """return a profile of the chromosome as an array of per-base read coverage....
 903             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
 904             Will also shift position of unique and multireads (but not splices) if shift is a natural number
 905         """
 906         metadata = self.getMetadata()
 907         try:
 908             readlen = int(metadata["readsize"])
 909         except KeyError:
 910             readlen = 0
 911
 912         dataType = metadata["dataType"]
 913         scale = 1. / normalizationFactor
 914         shift = {}
 915         shift["+"] = int(shiftValue)
 916         shift["-"] = -1 * int(shiftValue)
 917
 918         if cstop > 0:
 919             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
 920         else:
 921             lastNT = cstop - cstart + readlen + shift["+"]
 922
 923         chromModel = array("f",[0.] * lastNT)
 924         hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
 925         if cstart < 0:
 926             cstart = 0
 927
 928         for readEntry in hitDict[chromosome]:
 929             hstart = readEntry["start"]
 930             sense =  readEntry ["sense"]
 931             weight = readEntry["weight"]
 932             hstart = hstart - cstart + shift[sense]
 933             for currentpos in range(hstart,hstart+readlen):
 934                 try:
 935                     if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
 936                         chromModel[currentpos] += scale * weight
 937                     elif sense == "-" and keepStrand != "plusOnly":
 938                         chromModel[currentpos] -= scale * weight
 939                 except:
 940                     continue
 941
 942         del hitDict
 943         if useSplices and dataType == "RNA":
 944             if cstop > 0:
 945                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
 946             else:
 947                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
 948
 949             if chromosome in spliceDict:
 950                 for spliceEntry in spliceDict[chromosome]:
 951                     Lstart = spliceEntry["startL"]
 952                     Lstop = spliceEntry["stopL"]
 953                     Rstart = spliceEntry["startR"]
 954                     Rstop = spliceEntry["stopR"]
 955                     rsense = spliceEntry["sense"]
 956                     if (Rstop - cstart) < lastNT:
 957                         for index in range(abs(Lstop - Lstart)):
 958                             currentpos = Lstart - cstart + index
 959                             # we only track unique splices
 960                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
 961                                 chromModel[currentpos] += scale
 962                             elif rsense == "-" and keepStrand != "plusOnly":
 963                                 chromModel[currentpos] -= scale
 964
 965                         for index in range(abs(Rstop - Rstart)):
 966                             currentpos = Rstart - cstart + index
 967                             # we only track unique splices
 968                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
 969                                 chromModel[currentpos] += scale
 970                             elif rsense == "-" and keepStrand != "plusOnly":
 971                                 chromModel[currentpos] -= scale
 972
 973             del spliceDict
 974
 975         return chromModel
 976
 977
 978     def insertMetadata(self, valuesList):
 979         """ inserts a list of (pname, pvalue) into the metadata
 980         table.
 981         """
 982         self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
 983         self.dbcon.commit()
 984
 985
 986     def updateMetadata(self, pname, newValue, originalValue=""):
 987         """ update a metadata field given the original value and the new value.
 988         """
 989         stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
 990         if originalValue != "":
 991             stmt += " and value='%s' " % str(originalValue)
 992
 993         self.dbcon.execute(stmt)
 994         self.dbcon.commit()
 995
 996
 997     def insertUniqs(self, valuesList):
 998         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
 999         into the uniqs table.
1000         """
1001         self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1002         self.dbcon.commit()
1003
1004
1005     def insertMulti(self, valuesList):
1006         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1007         into the multi table.
1008         """
1009         self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1010         self.dbcon.commit()
1011
1012
1013     def insertSplices(self, valuesList):
1014         """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
1015         into the splices table.
1016         """
1017         self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
1018         self.dbcon.commit()
1019
1020
1021     def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
1022         """ update reads on file database in a list region of regions for a chromosome to have a new flag.
1023             regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
1024             sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
1025         """
1026         restrict = ""
1027         if sense != "both":
1028             restrict = " and sense = ? "
1029
1030         if uniqs:
1031             self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1032
1033         if multi:
1034             self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1035
1036         if self.dataType == "RNA" and splices:
1037             self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
1038             self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
1039
1040         self.dbcon.commit()
1041
1042
1043     def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1044         """ set the flag fields in the entire dataset.
1045         """
1046         if uniqs:
1047             self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1048
1049         if multi:
1050             self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1051
1052         if self.dataType == "RNA" and splices:
1053             self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
1054
1055         self.dbcon.commit()
1056
1057
1058     def resetFlags(self, uniqs=True, multi=True, splices=True):
1059         """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
1060         """
1061         self.setFlags("", uniqs, multi, splices)
1062
1063
1064     def reweighMultireads(self, readList):
1065         self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
1066
1067
1068     def setSynchronousPragma(self, value="ON"):
1069         try:
1070             self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1071         except:
1072             print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
1073
1074
1075     def setDBcache(self, cache, default=False):
1076         self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
1077         if default:
1078             self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
1079
1080
1081     def execute(self, statement, returnResults=False):
1082         sql = self.getSqlCursor()
1083
1084         sql.execute(statement)
1085         if returnResults:
1086             result = sql.fetchall()
1087             return result
1088
1089
1090     def executeCommit(self, statement):
1091         self.execute(statement)
1092
1093         if self.memBacked:
1094             self.memcon.commit()
1095         else:
1096             self.dbcon.commit()
1097
1098
1099     def buildIndex(self, cache=100000):
1100         """ Builds the file indeces for the main tables.
1101             Cache is the number of 1.5 kb pages to keep in memory.
1102             100000 pages translates into 150MB of RAM, which is our default.
1103         """
1104         if cache > self.getDefaultCacheSize():
1105             self.setDBcache(cache)
1106         self.setSynchronousPragma("OFF")
1107         self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1108         print "built uPosIndex"
1109         self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
1110         print "built uChromIndex"
1111         self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1112         print "built mPosIndex"
1113         self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
1114         print "built mChromIndex"
1115
1116         if self.dataType == "RNA":
1117             self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
1118             print "built sPosIndex"
1119             self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
1120             print "built sPosIndex2"
1121             self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
1122             print "built sChromIndex"
1123
1124         self.dbcon.commit()
1125         self.setSynchronousPragma("ON")
1126
1127
1128     def dropIndex(self):
1129         """ drops the file indices for the main tables.
1130         """
1131         try:
1132             self.setSynchronousPragma("OFF")
1133             self.dbcon.execute("DROP INDEX uPosIndex")
1134             self.dbcon.execute("DROP INDEX uChromIndex")
1135             self.dbcon.execute("DROP INDEX mPosIndex")
1136             self.dbcon.execute("DROP INDEX mChromIndex")
1137
1138             if self.dataType == "RNA":
1139                 self.dbcon.execute("DROP INDEX sPosIndex")
1140                 try:
1141                     self.dbcon.execute("DROP INDEX sPosIndex2")
1142                 except:
1143                     pass
1144
1145                 self.dbcon.execute("DROP INDEX sChromIndex")
1146
1147             self.dbcon.commit()
1148         except:
1149             print "problem dropping index"
1150
1151         self.setSynchronousPragma("ON")
1152
1153
1154     def memSync(self, chrom="", index=False):
1155         """ makes a copy of the dataset into memory for faster access.
1156         Can be restricted to a "full" chromosome. Can also build the
1157         memory indices.
1158         """
1159         self.memcon = ""
1160         self.memcon = sqlite.connect(":memory:")
1161         self.initializeTables(self.memcon)
1162         cursor = self.dbcon.cursor()
1163         whereclause = ""
1164         if chrom != "":
1165             print "memSync %s" % chrom
1166             whereclause = " where chrom = '%s' " % chrom
1167             self.memChrom = chrom
1168         else:
1169             self.memChrom = ""
1170
1171         self.memcon.execute("PRAGMA temp_store = MEMORY")
1172         self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
1173         # copy metadata to memory
1174         self.memcon.execute("delete from metadata")
1175         results = cursor.execute("select name, value from metadata")
1176         results2 = []
1177         for row in results:
1178             results2.append((row["name"], row["value"]))
1179
1180         self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
1181
1182         self.copyDBEntriesToMemory("uniqs", whereclause)
1183         self.copyDBEntriesToMemory("multi", whereclause)
1184         if self.dataType == "RNA":
1185             self.copySpliceDBEntriesToMemory(whereclause)
1186
1187         if index:
1188             if chrom != "":
1189                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
1190                 self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
1191                 if self.dataType == "RNA":
1192                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
1193                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
1194             else:
1195                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1196                 self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1197                 if self.dataType == "RNA":
1198                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
1199                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
1200
1201         self.memBacked = True
1202         self.memcon.row_factory = sqlite.Row
1203         self.memcon.commit()
1204
1205
1206     def copyDBEntriesToMemory(self, dbName, whereClause=""):
1207         cursor = self.dbcon.cursor()
1208         sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
1209         destinationEntries = []
1210         for row in sourceEntries:
1211             destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
1212
1213         self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
1214
1215
1216     def copySpliceDBEntriesToMemory(self, whereClause=""):
1217         cursor = self.dbcon.cursor()
1218         sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
1219         destinationEntries = []
1220         for row in sourceEntries:
1221             destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
1222                                        row["weight"], row["flag"], row["mismatch"]))
1223
1224         self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries)
1225