ReadDataset.py

   1 import sqlite3 as sqlite
   2 import string
   3 import tempfile
   4 import shutil
   5 import os
   6 from array import array
   7 from commoncode import getReverseComplement, getConfigParser, getConfigOption
   8
   9 currentRDSVersion = "2.0"
  10
  11
  12 class ReadDatasetError(Exception):
  13     pass
  14
  15
  16 class ReadDataset():
  17     """ Class for storing reads from experiments. Assumes that custom scripts
  18     will translate incoming data into a format that can be inserted into the
  19     class using the insert* methods. Default class subtype ('DNA') includes
  20     tables for unique and multireads, whereas 'RNA' subtype also includes a
  21     splices table.
  22     """
  23
  24     def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
  25                  cache=False, reportCount=True):
  26         """ creates an rds datafile if initialize is set to true, otherwise
  27         will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
  28         """
  29         self.dbcon = ""
  30         self.memcon = ""
  31         self.dataType = ""
  32         self.rdsVersion = currentRDSVersion
  33         self.memBacked = False
  34         self.memChrom = ""
  35         self.memCursor = ""
  36         self.cachedDBFile = ""
  37
  38         if cache:
  39             if verbose:
  40                 print "caching ...."
  41
  42             self.cacheDB(datafile)
  43             dbFile = self.cachedDBFile
  44         else:
  45             dbFile = datafile
  46
  47         self.dbcon = sqlite.connect(dbFile)
  48         self.dbcon.row_factory = sqlite.Row
  49         self.dbcon.execute("PRAGMA temp_store = MEMORY")
  50         if initialize:
  51             if datasetType not in ["DNA", "RNA"]:
  52                 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
  53             else:
  54                 self.dataType = datasetType
  55
  56             self.initializeTables(self.dbcon)
  57         else:
  58             metadata = self.getMetadata("dataType")
  59             self.dataType = metadata["dataType"]
  60
  61         try:
  62             metadata = self.getMetadata("rdsVersion")
  63             self.rdsVersion = metadata["rdsVersion"]
  64         except:
  65             try:
  66                 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
  67             except IOError:
  68                 print "could not add rdsVersion - read-only ?"
  69                 self.rdsVersion = "pre-1.0"
  70
  71         if verbose:
  72             if initialize:
  73                 print "INITIALIZED dataset %s" % datafile
  74             else:
  75                 print "dataset %s" % datafile
  76
  77             metadata = self.getMetadata()
  78             print "metadata:"
  79             pnameList = metadata.keys()
  80             pnameList.sort()
  81             for pname in pnameList:
  82                 print "\t" + pname + "\t" + metadata[pname]
  83
  84             if reportCount:
  85                 ucount = self.getUniqsCount()
  86                 mcount = self.getMultiCount()
  87                 if self.dataType == "DNA" and not initialize:
  88                     try:
  89                         print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
  90                     except ValueError:
  91                         print "\n%s unique reads and %s multireads" % (ucount, mcount)
  92                 elif self.dataType == "RNA" and not initialize:
  93                     scount = self.getSplicesCount()
  94                     try:
  95                         print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
  96                     except ValueError:
  97                         print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
  98
  99             print "default cache size is %d pages" % self.getDefaultCacheSize()
 100             if self.hasIndex():
 101                 print "found index"
 102             else:
 103                 print "not indexed"
 104
 105
 106     def __len__(self):
 107         """ return the number of usable reads in the dataset.
 108         """
 109         total = self.getUniqsCount()
 110         total += self.getMultiCount()
 111
 112         if self.dataType == "RNA":
 113             total += self.getSplicesCount()
 114
 115         total = int(total)
 116
 117         return total
 118
 119
 120     def __del__(self):
 121         """ cleanup copy in local cache, if present.
 122         """
 123         if self.cachedDBFile != "":
 124             self.uncacheDB()
 125
 126
 127     def cacheDB(self, filename):
 128         """ copy geneinfoDB to a local cache.
 129         """
 130         configParser = getConfigParser()
 131         cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
 132         tempfile.tempdir = cisTemp
 133         self.cachedDBFile =  "%s.db" % tempfile.mktemp()
 134         shutil.copyfile(filename, self.cachedDBFile)
 135
 136
 137     def saveCacheDB(self, filename):
 138         """ copy geneinfoDB to a local cache.
 139         """
 140         shutil.copyfile(self.cachedDBFile, filename)
 141
 142
 143     def uncacheDB(self):
 144         """ delete geneinfoDB from local cache.
 145         """
 146         global cachedDBFile
 147         if self.cachedDBFile != "":
 148             try:
 149                 os.remove(self.cachedDBFile)
 150             except:
 151                 print "could not delete %s" % self.cachedDBFile
 152
 153             self.cachedDB = ""
 154
 155
 156     def attachDB(self, filename, asname):
 157         """ attach another database file to the readDataset.
 158         """
 159         stmt = "attach '%s' as %s" % (filename, asname)
 160         self.execute(stmt)
 161
 162
 163     def detachDB(self, asname):
 164         """ detach a database file to the readDataset.
 165         """
 166         stmt = "detach %s" % (asname)
 167         self.execute(stmt)
 168
 169
 170     def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
 171         """ import into current RDS the table (with columns destcolumns,
 172             with default all columns) from the database file asname,
 173             using the column specification of ascolumns (default all).
 174         """
 175         stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
 176         if flagged != "":
 177             stmt += " where flag = '%s' " % flagged
 178
 179         self.executeCommit(stmt)
 180
 181
 182     def getTables(self, asname=""):
 183         """ get a list of table names in a particular database file.
 184         """
 185         resultList = []
 186         sql = self.getSqlCursor()
 187
 188         if asname != "":
 189             asname += "."
 190
 191         stmt = "select name from %ssqlite_master where type='table'" % asname
 192         sql.execute(stmt)
 193         results = sql.fetchall()
 194
 195         for row in results:
 196             resultList.append(row["name"])
 197
 198         return resultList
 199
 200
 201     def getSqlCursor(self):
 202         if self.memBacked:
 203             sql = self.getMemCursor()
 204         else:
 205             sql = self.getFileCursor()
 206
 207         return sql
 208
 209
 210     def hasIndex(self):
 211         """ check whether the RDS file has at least one index.
 212         """
 213         stmt = "select count(*) from sqlite_master where type='index'"
 214         count = int(self.execute(stmt, returnResults=True)[0][0])
 215         if count > 0:
 216             return True
 217
 218         return False
 219
 220
 221     def initializeTables(self, dbConnection, cache=100000):
 222         """ creates table schema in a database connection, which is
 223         typically a database file or an in-memory database.
 224         """
 225         dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
 226         dbConnection.execute("create table metadata (name varchar, value varchar)")
 227         dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
 228         positionSchema = "start int, stop int"
 229         tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
 230         dbConnection.execute("create table uniqs %s" % tableSchema)
 231         dbConnection.execute("create table multi %s" % tableSchema)
 232         if self.dataType == "RNA":
 233             positionSchema = "startL int, stopL int, startR int, stopR int"
 234             tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
 235             dbConnection.execute("create table splices %s" % tableSchema)
 236
 237         dbConnection.commit()
 238
 239
 240     def getFileCursor(self):
 241         """ returns a cursor to file database for low-level (SQL)
 242         access to the data.
 243         """
 244         return self.dbcon.cursor()
 245
 246
 247     def getMemCursor(self):
 248         """ returns a cursor to memory database for low-level (SQL)
 249         access to the data.
 250         """
 251         return self.memcon.cursor()
 252
 253
 254     def getMetadata(self, valueName=""):
 255         """ returns a dictionary of metadata.
 256         """
 257         whereClause = ""
 258         resultsDict = {}
 259
 260         if valueName != "":
 261             whereClause = " where name='%s'" % valueName
 262
 263         sql = self.getSqlCursor()
 264
 265         sql.execute("select name, value from metadata %s" % whereClause)
 266         results = sql.fetchall()
 267
 268         for row in results:
 269             parameterName = row["name"]
 270             parameterValue = row["value"]
 271             if parameterName not in resultsDict:
 272                 resultsDict[parameterName] = parameterValue
 273             else:
 274                 trying = True
 275                 index = 2
 276                 while trying:
 277                     newName = string.join([parameterName, str(index)], ":")
 278                     if newName not in resultsDict:
 279                         resultsDict[newName] = parameterValue
 280                         trying = False
 281
 282                     index += 1
 283
 284         return resultsDict
 285
 286
 287     def getReadSize(self):
 288         """ returns readsize if defined in metadata.
 289         """
 290         metadata = self.getMetadata()
 291         if "readsize" not in metadata:
 292             raise ReadDatasetError("no readsize parameter defined")
 293         else:
 294             mysize = metadata["readsize"]
 295             if "import" in mysize:
 296                 mysize = mysize.split()[0]
 297
 298             return int(mysize)
 299
 300
 301     def getDefaultCacheSize(self):
 302         """ returns the default cache size.
 303         """
 304         return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
 305
 306
 307     def getChromosomes(self, table="uniqs", fullChrom=True):
 308         """ returns a list of distinct chromosomes in table.
 309         """
 310         statement = "select distinct chrom from %s" % table
 311         sql = self.getSqlCursor()
 312
 313         sql.execute(statement)
 314         results = []
 315         for row in sql:
 316             if fullChrom:
 317                 if row["chrom"] not in results:
 318                     results.append(row["chrom"])
 319             else:
 320                 if  len(row["chrom"][3:].strip()) < 1:
 321                     continue
 322
 323                 if row["chrom"][3:] not in results:
 324                     results.append(row["chrom"][3:])
 325
 326         results.sort()
 327
 328         return results
 329
 330
 331     def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
 332                          doMulti=False, doSplices=False):
 333         """ returns the maximum coordinate for reads on a given chromosome.
 334         """
 335         maxCoord = 0
 336         sql = self.getSqlCursor()
 337
 338         if doUniqs:
 339             try:
 340                 sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
 341                 maxCoord = int(sql.fetchall()[0][0])
 342             except:
 343                 print "couldn't retrieve coordMax for chromosome %s" % chrom
 344
 345         if doSplices:
 346             sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
 347             try:
 348                 spliceMax = int(sql.fetchall()[0][0])
 349                 if spliceMax > maxCoord:
 350                     maxCoord = spliceMax
 351             except:
 352                 pass
 353
 354         if doMulti:
 355             sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
 356             try:
 357                 multiMax = int(sql.fetchall()[0][0])
 358                 if multiMax > maxCoord:
 359                     maxCoord = multiMax
 360             except:
 361                 pass
 362
 363         if verbose:
 364             print "%s maxCoord: %d" % (chrom, maxCoord)
 365
 366         return maxCoord
 367
 368
 369     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
 370                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
 371                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
 372                      readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
 373                      flagLike=False, strand='', combine5p=False):
 374         """ returns a dictionary of reads in a variety of formats
 375         and which can be restricted by chromosome or custom-flag.
 376         Returns unique reads by default, but can return multireads
 377         with doMulti set to True.
 378
 379         Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
 380         """
 381         whereClause = []
 382         resultsDict = {}
 383
 384         if chrom != "" and chrom != self.memChrom:
 385             whereClause.append("chrom = '%s'" % chrom)
 386
 387         if flag != "":
 388             if flagLike:
 389                 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
 390                 whereClause.append(flagLikeClause)
 391             else:
 392                 whereClause.append("flag = '%s'" % flag)
 393
 394         if start > -1:
 395             whereClause.append("start > %d" % start)
 396
 397         if stop > -1:
 398             whereClause.append("stop < %d" % stop)
 399
 400         if len(readLike) > 0:
 401             readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
 402             whereClause.append(readIDClause)
 403
 404         if hasMismatch:
 405             whereClause.append("mismatch != ''")
 406
 407         if strand in ["+", "-"]:
 408             whereClause.append("sense = '%s'" % strand)
 409
 410         if len(whereClause) > 0:
 411             whereStatement = string.join(whereClause, " and ")
 412             whereQuery = "where %s" % whereStatement
 413         else:
 414             whereQuery = ""
 415
 416         groupBy = []
 417         if findallOptimize:
 418             selectClause = ["select start, sense, sum(weight)"]
 419             groupBy = ["GROUP BY start, sense"]
 420         else:
 421             selectClause = ["select ID, chrom, start, readID"]
 422             if bothEnds:
 423                 selectClause.append("stop")
 424
 425             if not noSense:
 426                 selectClause.append("sense")
 427
 428             if withWeight:
 429                 selectClause.append("weight")
 430
 431             if withFlag:
 432                 selectClause.append("flag")
 433
 434             if withMismatch:
 435                 selectClause.append("mismatch")
 436
 437         if limit > 0 and not combine5p:
 438             groupBy.append("LIMIT %d" % limit)
 439
 440         selectQuery = string.join(selectClause, ",")
 441         groupQuery = string.join(groupBy)
 442         if doUniqs:
 443             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
 444             if doMulti:
 445                 stmt.append("UNION ALL")
 446                 stmt.append(selectQuery)
 447                 stmt.append("from multi")
 448                 stmt.append(whereQuery)
 449                 stmt.append(groupQuery)
 450         else:
 451             stmt = [selectQuery, "from multi", whereQuery]
 452
 453         if combine5p:
 454             if findallOptimize:
 455                 selectQuery = "select start, sense, weight, chrom"
 456
 457             if doUniqs:
 458                 subSelect = [selectQuery, "from uniqs", whereQuery]
 459                 if doMulti:
 460                     subSelect.append("union all")
 461                     subSelect.append(selectQuery)
 462                     subSelect.append("from multi")
 463                     subSelect.append(whereQuery)
 464             else:
 465                 subSelect = [selectQuery, "from multi", whereQuery]
 466
 467             sqlStmt = string.join(subSelect)
 468             if findallOptimize:
 469                 selectQuery = "select start, sense, sum(weight)"
 470
 471             stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
 472                     selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
 473
 474         if findallOptimize:
 475             if self.memBacked:
 476                 self.memcon.row_factory = None
 477                 sql = self.memcon.cursor()
 478             else:
 479                 self.dbcon.row_factory = None
 480                 sql = self.dbcon.cursor()
 481
 482             stmt.append("order by start")
 483         elif readIDDict:
 484             if self.memBacked:
 485                 sql = self.memcon.cursor()
 486             else:
 487                 sql = self.dbcon.cursor()
 488
 489             stmt.append("order by readID, start")
 490         else:
 491             if self.memBacked:
 492                 sql = self.memcon.cursor()
 493             else:
 494                 sql = self.dbcon.cursor()
 495
 496             stmt.append("order by chrom, start")
 497
 498         sqlQuery = string.join(stmt)
 499         sql.execute(sqlQuery)
 500
 501         if findallOptimize:
 502             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
 503             if self.memBacked:
 504                 self.memcon.row_factory = sqlite.Row
 505             else:
 506                 self.dbcon.row_factory = sqlite.Row
 507         else:
 508             currentChrom = ""
 509             currentReadID = ""
 510             pairID = 0
 511             for row in sql:
 512                 readID = row["readID"]
 513                 if fullChrom:
 514                     chrom = row["chrom"]
 515                 else:
 516                     chrom = row["chrom"][3:]
 517
 518                 if not readIDDict and chrom != currentChrom:
 519                     resultsDict[chrom] = []
 520                     currentChrom = chrom
 521                     dictKey = chrom
 522                 elif readIDDict:
 523                     theReadID = readID
 524                     if "::" in readID:
 525                         theReadID = readID.split("::")[0]
 526
 527                     if "/" in theReadID and withPairID:
 528                         (theReadID, pairID) = readID.split("/")
 529
 530                     if theReadID != currentReadID:
 531                         resultsDict[theReadID] = []
 532                         currentReadID = theReadID
 533                         dictKey = theReadID
 534
 535                 newrow = {"start": int(row["start"])}
 536                 if bothEnds:
 537                     newrow["stop"] = int(row["stop"])
 538
 539                 if not noSense:
 540                     newrow["sense"] = row["sense"]
 541
 542                 if withWeight:
 543                     newrow["weight"] = float(row["weight"])
 544
 545                 if withFlag:
 546                     newrow["flag"] = row["flag"]
 547
 548                 if withMismatch:
 549                     newrow["mismatch"] = row["mismatch"]
 550
 551                 if withID:
 552                     newrow["readID"] = readID
 553
 554                 if withChrom:
 555                     newrow["chrom"] = chrom
 556
 557                 if withPairID:
 558                     newrow["pairID"] = pairID
 559
 560                 resultsDict[dictKey].append(newrow)
 561
 562         return resultsDict
 563
 564
 565     def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
 566                        flag="", withWeight=False, withFlag=False, withMismatch=False,
 567                        withID=False, withChrom=False, withPairID=False, readIDDict=False,
 568                        splitRead=False, hasMismatch=False, flagLike=False, start=-1,
 569                        stop=-1, strand=""):
 570         """ returns a dictionary of spliced reads in a variety of
 571         formats and which can be restricted by chromosome or custom-flag.
 572         Returns unique spliced reads for now.
 573         """
 574         whereClause = []
 575         resultsDict = {}
 576
 577         if chrom != "" and chrom != self.memChrom:
 578             whereClause = ["chrom = '%s'" % chrom]
 579
 580         if flag != "":
 581             if flagLike:
 582                 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
 583                 whereClause.append(flagLikeClause)
 584             else:
 585                 whereClause.append("flag = '%s'" % flag)
 586
 587         if hasMismatch:
 588             whereClause.append("mismatch != ''")
 589
 590         if strand != "":
 591             whereClause.append("sense = '%s'" % strand)
 592
 593         if start > -1:
 594             whereClause.append("startL > %d" % start)
 595
 596         if stop > -1:
 597             whereClause.append("stopR < %d" % stop)
 598
 599         if len(whereClause) > 0:
 600             whereStatement = string.join(whereClause, " and ")
 601             whereQuery = "where %s" % whereStatement
 602         else:
 603             whereQuery = ""
 604
 605         selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
 606         if not noSense:
 607             selectClause.append("sense")
 608
 609         if withWeight:
 610             selectClause.append("weight")
 611
 612         if withFlag:
 613             selectClause.append("flag")
 614
 615         if withMismatch:
 616             selectClause.append("mismatch")
 617
 618         selectQuery = string.join(selectClause, " ,")
 619         if self.memBacked:
 620             sql = self.memcon.cursor()
 621         else:
 622             sql = self.dbcon.cursor()
 623
 624         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
 625         sql.execute(stmt)
 626         currentReadID = ""
 627         currentChrom = ""
 628         for row in sql:
 629             pairID = 0
 630             readID = row["readID"]
 631             if fullChrom:
 632                 chrom = row["chrom"]
 633             else:
 634                 chrom = row["chrom"][3:]
 635
 636             if not readIDDict and chrom != currentChrom:
 637                 resultsDict[chrom] = []
 638                 currentChrom = chrom
 639                 dictKey = chrom
 640             elif readIDDict:
 641                 if "/" in readID:
 642                     (theReadID, pairID) = readID.split("/")
 643                 else:
 644                     theReadID = readID
 645
 646                 if theReadID != currentReadID:
 647                     resultsDict[theReadID] = []
 648                     currentReadID = theReadID
 649                     dictKey = theReadID
 650
 651             newrow = {"startL": int(row["startL"])}
 652             newrow["stopL"] = int(row["stopL"])
 653             newrow["startR"] = int(row["startR"])
 654             newrow["stopR"] = int(row["stopR"])
 655             if not noSense:
 656                 newrow["sense"] = row["sense"]
 657
 658             if withWeight:
 659                 newrow["weight"] = float(row["weight"])
 660
 661             if withFlag:
 662                 newrow["flag"] = row["flag"]
 663
 664             if withMismatch:
 665                 newrow["mismatch"] = row["mismatch"]
 666
 667             if withID:
 668                 newrow["readID"] = readID
 669
 670             if withChrom:
 671                 newrow["chrom"] = chrom
 672
 673             if withPairID:
 674                 newrow["pairID"] = pairID
 675
 676             if splitRead:
 677                 leftDict = newrow.copy()
 678                 del leftDict["startR"]
 679                 del leftDict["stopR"]
 680                 rightDict = newrow
 681                 del rightDict["startL"]
 682                 del rightDict["stopL"]
 683                 resultsDict[dictKey].append(leftDict)
 684                 resultsDict[dictKey].append(rightDict)
 685             else:
 686                 resultsDict[dictKey].append(newrow)
 687
 688         return resultsDict
 689
 690
 691     def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
 692                   splices=False, reportCombined=True, sense="both"):
 693         """ return read counts for a given region.
 694         """
 695         ucount = 0
 696         mcount = 0
 697         scount = 0
 698         restrict = ""
 699         if sense in ["+", "-"]:
 700             restrict = " sense ='%s' " % sense
 701
 702         if uniqs:
 703             try:
 704                 ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
 705             except:
 706                 ucount = 0
 707
 708         if multi:
 709             try:
 710                 mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
 711             except:
 712                 mcount = 0
 713
 714         if splices:
 715             try:
 716                 scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
 717             except:
 718                 scount = 0
 719
 720         if reportCombined:
 721             total = ucount + mcount + scount
 722             return total
 723         else:
 724             return (ucount, mcount, scount)
 725
 726
 727     def getTotalCounts(self, chrom="", rmin="", rmax=""):
 728         """ return read counts for a given region.
 729         """
 730         return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
 731
 732
 733     def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
 734         """ returns the number of row in the uniqs table.
 735         """
 736         whereClause = []
 737         count = 0
 738
 739         if chrom !=""  and chrom != self.memChrom:
 740             whereClause = ["chrom='%s'" % chrom]
 741
 742         if rmin != "":
 743             whereClause.append("%s >= %s" % (startField, str(rmin)))
 744
 745         if rmax != "":
 746             whereClause.append("%s <= %s" % (startField, str(rmax)))
 747
 748         if restrict != "":
 749             whereClause.append(restrict)
 750
 751         if len(whereClause) > 0:
 752             whereStatement = string.join(whereClause, " and ")
 753             whereQuery = "where %s" % whereStatement
 754         else:
 755             whereQuery = ""
 756
 757         if self.memBacked:
 758             sql = self.memcon.cursor()
 759         else:
 760             sql = self.dbcon.cursor()
 761
 762         if distinct:
 763             sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
 764         else:
 765             sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
 766
 767         result = sql.fetchone()
 768
 769         try:
 770             count = int(result[0])
 771         except:
 772             count = 0
 773
 774         return count
 775
 776
 777     def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 778         """ returns the number of row in the splices table.
 779         """
 780         return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
 781
 782
 783     def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 784         """ returns the number of distinct readIDs in the uniqs table.
 785         """
 786         return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
 787
 788
 789     def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 790         """ returns the total weight of readIDs in the multi table.
 791         """
 792         return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
 793
 794
 795     def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
 796         """ get readID's.
 797         """
 798         stmt = []
 799         limitPart = ""
 800         if limit > 0:
 801             limitPart = "LIMIT %d" % limit
 802
 803         if uniqs:
 804             stmt.append("select readID from uniqs")
 805
 806         if multi:
 807             stmt.append("select readID from multi")
 808
 809         if splices:
 810             stmt.append("select readID from splices")
 811
 812         if len(stmt) > 0:
 813             selectPart = string.join(stmt, " union ")
 814         else:
 815             selectPart = ""
 816
 817         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
 818         if self.memBacked:
 819             sql = self.memcon.cursor()
 820         else:
 821             sql = self.dbcon.cursor()
 822
 823         sql.execute(sqlQuery)
 824         result = sql.fetchall()
 825
 826         if paired:
 827             return [x[0].split("/")[0] for x in result]
 828         else:
 829             return [x[0] for x in result]
 830
 831
 832     def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
 833         """ returns the uniq and spliced mismatches in a dictionary.
 834         """
 835         readlen = self.getReadSize()
 836         if mischrom:
 837             hitChromList = [mischrom]
 838         else:
 839             hitChromList = self.getChromosomes()
 840             hitChromList.sort()
 841
 842         snpDict = {}
 843         for achrom in hitChromList:
 844             if verbose:
 845                 print "getting mismatches from chromosome %s" % (achrom)
 846
 847             snpDict[achrom] = []
 848             hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
 849             if useSplices and self.dataType == "RNA":
 850                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
 851                 spliceIDList = spliceDict.keys()
 852                 for k in spliceIDList:
 853                     spliceEntry = spliceDict[k][0]
 854                     startpos = spliceEntry["startL"]
 855                     lefthalf = spliceEntry["stopL"]
 856                     rightstart = spliceEntry["startR"]
 857                     sense = spliceEntry["sense"]
 858                     mismatches = spliceEntry["mismatch"]
 859                     spMismatchList = mismatches.split(",")
 860                     for mismatch in spMismatchList:
 861                         if "N" in mismatch:
 862                             continue
 863
 864                         change_len = len(mismatch)
 865                         if sense == "+":
 866                             change_from = mismatch[0]
 867                             change_base = mismatch[change_len-1]
 868                             change_pos = int(mismatch[1:change_len-1])
 869                         elif sense == "-":
 870                             change_from = getReverseComplement([mismatch[0]])
 871                             change_base = getReverseComplement([mismatch[change_len-1]])
 872                             change_pos = readlen - int(mismatch[1:change_len-1]) + 1
 873
 874                         firsthalf = int(lefthalf)-int(startpos)+1
 875                         secondhalf = 0
 876                         if int(change_pos) <= int(firsthalf):
 877                             change_at = startpos + change_pos - 1
 878                         else:
 879                             secondhalf = change_pos - firsthalf
 880                             change_at = rightstart + secondhalf
 881
 882                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
 883
 884             if achrom not in hitDict.keys():
 885                 continue
 886
 887             for readEntry in hitDict[achrom]:
 888                 start = readEntry["start"]
 889                 sense = readEntry["sense"]
 890                 mismatches = readEntry["mismatch"]
 891                 mismatchList = mismatches.split(",")
 892                 for mismatch in mismatchList:
 893                     if "N" in mismatch:
 894                         continue
 895
 896                     change_len = len(mismatch)
 897                     if sense == "+":
 898                         change_from = mismatch[0]
 899                         change_base = mismatch[change_len-1]
 900                         change_pos = int(mismatch[1:change_len-1])
 901                     elif sense == "-":
 902                         change_from = getReverseComplement([mismatch[0]])
 903                         change_base = getReverseComplement([mismatch[change_len-1]])
 904                         change_pos = readlen - int(mismatch[1:change_len-1]) + 1
 905
 906                     change_at = start + change_pos - 1
 907                     snpDict[achrom].append([start, change_at, change_base, change_from])
 908
 909         return snpDict
 910
 911
 912     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
 913                         useSplices=False, normalizationFactor = 1.0, trackStrand=False,
 914                         keepStrand="both", shiftValue=0):
 915         """return a profile of the chromosome as an array of per-base read coverage....
 916             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
 917             Will also shift position of unique and multireads (but not splices) if shift is a natural number
 918         """
 919         metadata = self.getMetadata()
 920         try:
 921             readlen = int(metadata["readsize"])
 922         except KeyError:
 923             readlen = 0
 924
 925         dataType = metadata["dataType"]
 926         scale = 1. / normalizationFactor
 927         shift = {}
 928         shift['+'] = int(shiftValue)
 929         shift['-'] = -1 * int(shiftValue)
 930
 931         if cstop > 0:
 932             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
 933         else:
 934             lastNT = cstop - cstart + readlen + shift["+"]
 935
 936         chromModel = array("f",[0.] * lastNT)
 937         hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
 938         if cstart < 0:
 939             cstart = 0
 940
 941         for readEntry in hitDict[chromosome]:
 942             hstart = readEntry["start"]
 943             sense =  readEntry ["sense"]
 944             weight = readEntry["weight"]
 945             hstart = hstart - cstart + shift[sense]
 946             for currentpos in range(hstart,hstart+readlen):
 947                 try:
 948                     if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
 949                         chromModel[currentpos] += scale * weight
 950                     elif sense == "-" and keepStrand != "plusOnly":
 951                         chromModel[currentpos] -= scale * weight
 952                 except:
 953                     continue
 954
 955         del hitDict
 956         if useSplices and dataType == "RNA":
 957             if cstop > 0:
 958                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
 959             else:
 960                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
 961
 962             if chromosome in spliceDict:
 963                 for spliceEntry in spliceDict[chromosome]:
 964                     Lstart = spliceEntry["startL"]
 965                     Lstop = spliceEntry["stopL"]
 966                     Rstart = spliceEntry["startR"]
 967                     Rstop = spliceEntry["stopR"]
 968                     rsense = spliceEntry["sense"]
 969                     if (Rstop - cstart) < lastNT:
 970                         for index in range(abs(Lstop - Lstart)):
 971                             currentpos = Lstart - cstart + index
 972                             # we only track unique splices
 973                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
 974                                 chromModel[currentpos] += scale
 975                             elif rsense == "-" and keepStrand != "plusOnly":
 976                                 chromModel[currentpos] -= scale
 977
 978                         for index in range(abs(Rstop - Rstart)):
 979                             currentpos = Rstart - cstart + index
 980                             # we only track unique splices
 981                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
 982                                 chromModel[currentpos] += scale
 983                             elif rsense == "-" and keepStrand != "plusOnly":
 984                                 chromModel[currentpos] -= scale
 985
 986             del spliceDict
 987
 988         return chromModel
 989
 990
 991     def insertMetadata(self, valuesList):
 992         """ inserts a list of (pname, pvalue) into the metadata
 993         table.
 994         """
 995         self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
 996         self.dbcon.commit()
 997
 998
 999     def updateMetadata(self, pname, newValue, originalValue=""):
1000         """ update a metadata field given the original value and the new value.
1001         """
1002         stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
1003         if originalValue != "":
1004             stmt += " and value='%s' " % str(originalValue)
1005
1006         self.dbcon.execute(stmt)
1007         self.dbcon.commit()
1008
1009
1010     def insertUniqs(self, valuesList):
1011         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1012         into the uniqs table.
1013         """
1014         self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1015         self.dbcon.commit()
1016
1017
1018     def insertMulti(self, valuesList):
1019         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1020         into the multi table.
1021         """
1022         self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1023         self.dbcon.commit()
1024
1025
1026     def insertSplices(self, valuesList):
1027         """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
1028         into the splices table.
1029         """
1030         self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
1031         self.dbcon.commit()
1032
1033
1034     def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
1035         """ update reads on file database in a list region of regions for a chromosome to have a new flag.
1036             regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
1037             sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
1038         """
1039         restrict = ""
1040         if sense != "both":
1041             restrict = " and sense = ? "
1042
1043         if uniqs:
1044             self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1045
1046         if multi:
1047             self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1048
1049         if self.dataType == "RNA" and splices:
1050             self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
1051             self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
1052
1053         self.dbcon.commit()
1054
1055
1056     def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1057         """ set the flag fields in the entire dataset.
1058         """
1059         if uniqs:
1060             self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1061
1062         if multi:
1063             self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1064
1065         if self.dataType == "RNA" and splices:
1066             self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
1067
1068         self.dbcon.commit()
1069
1070
1071     def resetFlags(self, uniqs=True, multi=True, splices=True):
1072         """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
1073         """
1074         self.setFlags("", uniqs, multi, splices)
1075
1076
1077     def reweighMultireads(self, readList):
1078         self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
1079
1080
1081     def setSynchronousPragma(self, value="ON"):
1082         try:
1083             self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1084         except:
1085             print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
1086
1087
1088     def setDBcache(self, cache, default=False):
1089         self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
1090         if default:
1091             self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
1092
1093
1094     def execute(self, statement, returnResults=False):
1095         sql = self.getSqlCursor()
1096
1097         sql.execute(statement)
1098         if returnResults:
1099             result = sql.fetchall()
1100             return result
1101
1102
1103     def executeCommit(self, statement):
1104         self.execute(statement)
1105
1106         if self.memBacked:
1107             self.memcon.commit()
1108         else:
1109             self.dbcon.commit()
1110
1111
1112     def buildIndex(self, cache=100000):
1113         """ Builds the file indeces for the main tables.
1114             Cache is the number of 1.5 kb pages to keep in memory.
1115             100000 pages translates into 150MB of RAM, which is our default.
1116         """
1117         if cache > self.getDefaultCacheSize():
1118             self.setDBcache(cache)
1119         self.setSynchronousPragma("OFF")
1120         self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1121         print "built uPosIndex"
1122         self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
1123         print "built uChromIndex"
1124         self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1125         print "built mPosIndex"
1126         self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
1127         print "built mChromIndex"
1128
1129         if self.dataType == "RNA":
1130             self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
1131             print "built sPosIndex"
1132             self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
1133             print "built sPosIndex2"
1134             self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
1135             print "built sChromIndex"
1136
1137         self.dbcon.commit()
1138         self.setSynchronousPragma("ON")
1139
1140
1141     def dropIndex(self):
1142         """ drops the file indices for the main tables.
1143         """
1144         try:
1145             self.setSynchronousPragma("OFF")
1146             self.dbcon.execute("DROP INDEX uPosIndex")
1147             self.dbcon.execute("DROP INDEX uChromIndex")
1148             self.dbcon.execute("DROP INDEX mPosIndex")
1149             self.dbcon.execute("DROP INDEX mChromIndex")
1150
1151             if self.dataType == "RNA":
1152                 self.dbcon.execute("DROP INDEX sPosIndex")
1153                 try:
1154                     self.dbcon.execute("DROP INDEX sPosIndex2")
1155                 except:
1156                     pass
1157
1158                 self.dbcon.execute("DROP INDEX sChromIndex")
1159
1160             self.dbcon.commit()
1161         except:
1162             print "problem dropping index"
1163
1164         self.setSynchronousPragma("ON")
1165
1166
1167     def memSync(self, chrom="", index=False):
1168         """ makes a copy of the dataset into memory for faster access.
1169         Can be restricted to a "full" chromosome. Can also build the
1170         memory indices.
1171         """
1172         self.memcon = ""
1173         self.memcon = sqlite.connect(":memory:")
1174         self.initializeTables(self.memcon)
1175         cursor = self.dbcon.cursor()
1176         whereclause = ""
1177         if chrom != "":
1178             print "memSync %s" % chrom
1179             whereclause = " where chrom = '%s' " % chrom
1180             self.memChrom = chrom
1181         else:
1182             self.memChrom = ""
1183
1184         self.memcon.execute("PRAGMA temp_store = MEMORY")
1185         self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
1186         # copy metadata to memory
1187         self.memcon.execute("delete from metadata")
1188         results = cursor.execute("select name, value from metadata")
1189         results2 = []
1190         for row in results:
1191             results2.append((row["name"], row["value"]))
1192
1193         self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
1194
1195         self.copyDBEntriesToMemory("uniqs", whereclause)
1196         self.copyDBEntriesToMemory("multi", whereclause)
1197         if self.dataType == "RNA":
1198             self.copySpliceDBEntriesToMemory(whereclause)
1199
1200         if index:
1201             if chrom != "":
1202                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
1203                 self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
1204                 if self.dataType == "RNA":
1205                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
1206                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
1207             else:
1208                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1209                 self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1210                 if self.dataType == "RNA":
1211                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
1212                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
1213
1214         self.memBacked = True
1215         self.memcon.row_factory = sqlite.Row
1216         self.memcon.commit()
1217
1218
1219     def copyDBEntriesToMemory(self, dbName, whereClause=""):
1220         cursor = self.dbcon.cursor()
1221         sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
1222         destinationEntries = []
1223         for row in sourceEntries:
1224             destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
1225
1226         self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
1227
1228
1229     def copySpliceDBEntriesToMemory(self, whereClause=""):
1230         cursor = self.dbcon.cursor()
1231         sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
1232         destinationEntries = []
1233         for row in sourceEntries:
1234             destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
1235                                        row["weight"], row["flag"], row["mismatch"]))
1236
1237         self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries)
1238