ReadDataset.py

   1 """
   2 Created on Jul 1, 2010
   3
   4 @author: sau
   5 """
   6
   7 import sqlite3 as sqlite
   8 import string
   9 import tempfile
  10 import shutil
  11 import os
  12 from os import environ
  13 from array import array
  14 from commoncode import getReverseComplement
  15
  16 if environ.get("CISTEMATIC_TEMP"):
  17     cisTemp = environ.get("CISTEMATIC_TEMP")
  18 else:
  19     cisTemp = "/tmp"
  20
  21 tempfile.tempdir = cisTemp
  22 currentRDSVersion = "1.1"
  23
  24
  25 class ReadDatasetError(Exception):
  26     pass
  27
  28
  29 class ReadDataset():
  30     """ Class for storing reads from experiments. Assumes that custom scripts
  31     will translate incoming data into a format that can be inserted into the
  32     class using the insert* methods. Default class subtype ('DNA') includes
  33     tables for unique and multireads, whereas 'RNA' subtype also includes a
  34     splices table.
  35     """
  36
  37     def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
  38                  cache=False, reportCount=True):
  39         """ creates an rds datafile if initialize is set to true, otherwise
  40         will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
  41         """
  42         self.dbcon = ""
  43         self.memcon = ""
  44         self.dataType = ""
  45         self.rdsVersion = currentRDSVersion
  46         self.memBacked = False
  47         self.memChrom = ""
  48         self.memCursor = ""
  49         self.cachedDBFile = ""
  50
  51         if cache:
  52             if verbose:
  53                 print "caching ...."
  54
  55             self.cacheDB(datafile)
  56             dbFile = self.cachedDBFile
  57         else:
  58             dbFile = datafile
  59
  60         self.dbcon = sqlite.connect(dbFile)
  61         self.dbcon.row_factory = sqlite.Row
  62         self.dbcon.execute("PRAGMA temp_store = MEMORY")
  63         if initialize:
  64             if datasetType not in ["DNA", "RNA"]:
  65                 raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
  66             else:
  67                 self.dataType = datasetType
  68
  69             self.initializeTables(self.dbcon)
  70         else:
  71             metadata = self.getMetadata("dataType")
  72             self.dataType = metadata["dataType"]
  73
  74         try:
  75             metadata = self.getMetadata("rdsVersion")
  76             self.rdsVersion = metadata["rdsVersion"]
  77         except:
  78             try:
  79                 self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
  80             except IOError:
  81                 print "could not add rdsVersion - read-only ?"
  82                 self.rdsVersion = "pre-1.0"
  83
  84         if verbose:
  85             if initialize:
  86                 print "INITIALIZED dataset %s" % datafile
  87             else:
  88                 print "dataset %s" % datafile
  89
  90             metadata = self.getMetadata()
  91             print "metadata:"
  92             pnameList = metadata.keys()
  93             pnameList.sort()
  94             for pname in pnameList:
  95                 print "\t" + pname + "\t" + metadata[pname]
  96
  97             if reportCount:
  98                 ucount = self.getUniqsCount()
  99                 mcount = self.getMultiCount()
 100                 if self.dataType == "DNA" and not initialize:
 101                     try:
 102                         print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
 103                     except ValueError:
 104                         print "\n%s unique reads and %s multireads" % (ucount, mcount)
 105                 elif self.dataType == "RNA" and not initialize:
 106                     scount = self.getSplicesCount()
 107                     try:
 108                         print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
 109                     except ValueError:
 110                         print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
 111
 112             print "default cache size is %d pages" % self.getDefaultCacheSize()
 113             if self.hasIndex():
 114                 print "found index"
 115             else:
 116                 print "not indexed"
 117
 118
 119     def __len__(self):
 120         """ return the number of usable reads in the dataset.
 121         """
 122         total = self.getUniqsCount()
 123         total += self.getMultiCount()
 124
 125         if self.dataType == "RNA":
 126             total += self.getSplicesCount()
 127
 128         total = int(total)
 129
 130         return total
 131
 132
 133     def __del__(self):
 134         """ cleanup copy in local cache, if present.
 135         """
 136         if self.cachedDBFile != "":
 137             self.uncacheDB()
 138
 139
 140     def cacheDB(self, filename):
 141         """ copy geneinfoDB to a local cache.
 142         """
 143         self.cachedDBFile =  "%s.db" % tempfile.mktemp()
 144         shutil.copyfile(filename, self.cachedDBFile)
 145
 146
 147     def saveCacheDB(self, filename):
 148         """ copy geneinfoDB to a local cache.
 149         """
 150         shutil.copyfile(self.cachedDBFile, filename)
 151
 152
 153     def uncacheDB(self):
 154         """ delete geneinfoDB from local cache.
 155         """
 156         global cachedDBFile
 157         if self.cachedDBFile != "":
 158             try:
 159                 os.remove(self.cachedDBFile)
 160             except:
 161                 print "could not delete %s" % self.cachedDBFile
 162
 163             self.cachedDB = ""
 164
 165
 166     def attachDB(self, filename, asname):
 167         """ attach another database file to the readDataset.
 168         """
 169         stmt = "attach '%s' as %s" % (filename, asname)
 170         self.execute(stmt)
 171
 172
 173     def detachDB(self, asname):
 174         """ detach a database file to the readDataset.
 175         """
 176         stmt = "detach %s" % (asname)
 177         self.execute(stmt)
 178
 179
 180     def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
 181         """ import into current RDS the table (with columns destcolumns,
 182             with default all columns) from the database file asname,
 183             using the column specification of ascolumns (default all).
 184         """
 185         stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
 186         if flagged != "":
 187             stmt += " where flag = '%s' " % flagged
 188
 189         self.executeCommit(stmt)
 190
 191
 192     def getTables(self, asname=""):
 193         """ get a list of table names in a particular database file.
 194         """
 195         resultList = []
 196         sql = self.getSqlCursor()
 197
 198         if asname != "":
 199             asname += "."
 200
 201         stmt = "select name from %ssqlite_master where type='table'" % asname
 202         sql.execute(stmt)
 203         results = sql.fetchall()
 204
 205         for row in results:
 206             resultList.append(row["name"])
 207
 208         return resultList
 209
 210
 211     def getSqlCursor(self):
 212         if self.memBacked:
 213             sql = self.getMemCursor()
 214         else:
 215             sql = self.getFileCursor()
 216
 217         return sql
 218
 219
 220     def hasIndex(self):
 221         """ check whether the RDS file has at least one index.
 222         """
 223         stmt = "select count(*) from sqlite_master where type='index'"
 224         count = int(self.execute(stmt, returnResults=True)[0][0])
 225         if count > 0:
 226             return True
 227
 228         return False
 229
 230
 231     def initializeTables(self, dbConnection, cache=100000):
 232         """ creates table schema in a database connection, which is
 233         typically a database file or an in-memory database.
 234         """
 235         dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
 236         dbConnection.execute("create table metadata (name varchar, value varchar)")
 237         dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
 238         positionSchema = "start int, stop int"
 239         tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
 240         dbConnection.execute("create table uniqs %s" % tableSchema)
 241         dbConnection.execute("create table multi %s" % tableSchema)
 242         if self.dataType == "RNA":
 243             positionSchema = "startL int, stopL int, startR int, stopR int"
 244             tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
 245             dbConnection.execute("create table splices %s" % tableSchema)
 246
 247         dbConnection.commit()
 248
 249
 250     def getFileCursor(self):
 251         """ returns a cursor to file database for low-level (SQL)
 252         access to the data.
 253         """
 254         return self.dbcon.cursor()
 255
 256
 257     def getMemCursor(self):
 258         """ returns a cursor to memory database for low-level (SQL)
 259         access to the data.
 260         """
 261         return self.memcon.cursor()
 262
 263
 264     def getMetadata(self, valueName=""):
 265         """ returns a dictionary of metadata.
 266         """
 267         whereClause = ""
 268         resultsDict = {}
 269
 270         if valueName != "":
 271             whereClause = " where name='%s'" % valueName
 272
 273         sql = self.getSqlCursor()
 274
 275         sql.execute("select name, value from metadata %s" % whereClause)
 276         results = sql.fetchall()
 277
 278         for row in results:
 279             parameterName = row["name"]
 280             parameterValue = row["value"]
 281             if parameterName not in resultsDict:
 282                 resultsDict[parameterName] = parameterValue
 283             else:
 284                 trying = True
 285                 index = 2
 286                 while trying:
 287                     newName = string.join([parameterName, str(index)], ":")
 288                     if newName not in resultsDict:
 289                         resultsDict[newName] = parameterValue
 290                         trying = False
 291
 292                     index += 1
 293
 294         return resultsDict
 295
 296
 297     def getReadSize(self):
 298         """ returns readsize if defined in metadata.
 299         """
 300         metadata = self.getMetadata()
 301         if "readsize" not in metadata:
 302             raise ReadDatasetError("no readsize parameter defined")
 303         else:
 304             mysize = metadata["readsize"]
 305             if "import" in mysize:
 306                 mysize = mysize.split()[0]
 307
 308             return int(mysize)
 309
 310
 311     def getDefaultCacheSize(self):
 312         """ returns the default cache size.
 313         """
 314         return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
 315
 316
 317     def getChromosomes(self, table="uniqs", fullChrom=True):
 318         """ returns a list of distinct chromosomes in table.
 319         """
 320         statement = "select distinct chrom from %s" % table
 321         sql = self.getSqlCursor()
 322
 323         sql.execute(statement)
 324         results = []
 325         for row in sql:
 326             if fullChrom:
 327                 if row["chrom"] not in results:
 328                     results.append(row["chrom"])
 329             else:
 330                 if  len(row["chrom"][3:].strip()) < 1:
 331                     continue
 332
 333                 if row["chrom"][3:] not in results:
 334                     results.append(row["chrom"][3:])
 335
 336         results.sort()
 337
 338         return results
 339
 340
 341     def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True,
 342                          doMulti=False, doSplices=False):
 343         """ returns the maximum coordinate for reads on a given chromosome.
 344         """
 345         maxCoord = 0
 346         sql = self.getSqlCursor()
 347
 348         if doUniqs:
 349             try:
 350                 sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
 351                 maxCoord = int(sql.fetchall()[0][0])
 352             except:
 353                 print "couldn't retrieve coordMax for chromosome %s" % chrom
 354
 355         if doSplices:
 356             sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
 357             try:
 358                 spliceMax = int(sql.fetchall()[0][0])
 359                 if spliceMax > maxCoord:
 360                     maxCoord = spliceMax
 361             except:
 362                 pass
 363
 364         if doMulti:
 365             sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
 366             try:
 367                 multiMax = int(sql.fetchall()[0][0])
 368                 if multiMax > maxCoord:
 369                     maxCoord = multiMax
 370             except:
 371                 pass
 372
 373         if verbose:
 374             print "%s maxCoord: %d" % (chrom, maxCoord)
 375
 376         return maxCoord
 377
 378
 379     def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
 380                      flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
 381                      withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
 382                      readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
 383                      flagLike=False, strand='', combine5p=False):
 384         """ returns a dictionary of reads in a variety of formats
 385         and which can be restricted by chromosome or custom-flag.
 386         Returns unique reads by default, but can return multireads
 387         with doMulti set to True.
 388         """
 389         whereClause = []
 390         resultsDict = {}
 391
 392         if chrom != "" and chrom != self.memChrom:
 393             whereClause.append("chrom = '%s'" % chrom)
 394
 395         if flag != "":
 396             if flagLike:
 397                 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
 398                 whereClause.append(flagLikeClause)
 399             else:
 400                 whereClause.append("flag = '%s'" % flag)
 401
 402         if start > -1:
 403             whereClause.append("start > %d" % start)
 404
 405         if stop > -1:
 406             whereClause.append("stop < %d" % stop)
 407
 408         if len(readLike) > 0:
 409             readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
 410             whereClause.append(readIDClause)
 411
 412         if hasMismatch:
 413             whereClause.append("mismatch != ''")
 414
 415         if strand in ["+", "-"]:
 416             whereClause.append("sense = '%s'" % strand)
 417
 418         if len(whereClause) > 0:
 419             whereStatement = string.join(whereClause, " and ")
 420             whereQuery = "where %s" % whereStatement
 421         else:
 422             whereQuery = ""
 423
 424         groupBy = []
 425         if findallOptimize:
 426             selectClause = ["select start, sense, sum(weight)"]
 427             groupBy = ["GROUP BY start, sense"]
 428         else:
 429             selectClause = ["select ID, chrom, start, readID"]
 430             if bothEnds:
 431                 selectClause.append("stop")
 432
 433             if not noSense:
 434                 selectClause.append("sense")
 435
 436             if withWeight:
 437                 selectClause.append("weight")
 438
 439             if withFlag:
 440                 selectClause.append("flag")
 441
 442             if withMismatch:
 443                 selectClause.append("mismatch")
 444
 445         if limit > 0 and not combine5p:
 446             groupBy.append("LIMIT %d" % limit)
 447
 448         selectQuery = string.join(selectClause, ",")
 449         groupQuery = string.join(groupBy)
 450         if doUniqs:
 451             stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
 452             if doMulti:
 453                 stmt.append("UNION ALL")
 454                 stmt.append(selectQuery)
 455                 stmt.append("from multi")
 456                 stmt.append(whereQuery)
 457                 stmt.append(groupQuery)
 458         else:
 459             stmt = [selectQuery, "from multi", whereQuery]
 460
 461         if combine5p:
 462             if findallOptimize:
 463                 selectQuery = "select start, sense, weight, chrom"
 464
 465             if doUniqs:
 466                 subSelect = [selectQuery, "from uniqs", whereQuery]
 467                 if doMulti:
 468                     subSelect.append("union all")
 469                     subSelect.append(selectQuery)
 470                     subSelect.append("from multi")
 471                     subSelect.append(whereQuery)
 472             else:
 473                 subSelect = [selectQuery, "from multi", whereQuery]
 474
 475             sqlStmt = string.join(subSelect)
 476             if findallOptimize:
 477                 selectQuery = "select start, sense, sum(weight)"
 478
 479             stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union",
 480                     selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"]
 481
 482         if findallOptimize:
 483             if self.memBacked:
 484                 self.memcon.row_factory = None
 485                 sql = self.memcon.cursor()
 486             else:
 487                 self.dbcon.row_factory = None
 488                 sql = self.dbcon.cursor()
 489
 490             stmt.append("order by start")
 491         elif readIDDict:
 492             if self.memBacked:
 493                 sql = self.memcon.cursor()
 494             else:
 495                 sql = self.dbcon.cursor()
 496
 497             stmt.append("order by readID, start")
 498         else:
 499             if self.memBacked:
 500                 sql = self.memcon.cursor()
 501             else:
 502                 sql = self.dbcon.cursor()
 503
 504             stmt.append("order by chrom, start")
 505
 506         sqlQuery = string.join(stmt)
 507         sql.execute(sqlQuery)
 508
 509         if findallOptimize:
 510             resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
 511             if self.memBacked:
 512                 self.memcon.row_factory = sqlite.Row
 513             else:
 514                 self.dbcon.row_factory = sqlite.Row
 515         else:
 516             currentChrom = ""
 517             currentReadID = ""
 518             pairID = 0
 519             for row in sql:
 520                 readID = row["readID"]
 521                 if fullChrom:
 522                     chrom = row["chrom"]
 523                 else:
 524                     chrom = row["chrom"][3:]
 525
 526                 if not readIDDict and chrom != currentChrom:
 527                     resultsDict[chrom] = []
 528                     currentChrom = chrom
 529                     dictKey = chrom
 530                 elif readIDDict:
 531                     theReadID = readID
 532                     if "::" in readID:
 533                         theReadID = readID.split("::")[0]
 534
 535                     if "/" in theReadID and withPairID:
 536                         (theReadID, pairID) = readID.split("/")
 537
 538                     if theReadID != currentReadID:
 539                         resultsDict[theReadID] = []
 540                         currentReadID = theReadID
 541                         dictKey = theReadID
 542
 543                 newrow = {"start": int(row["start"])}
 544                 if bothEnds:
 545                     newrow["stop"] = int(row["stop"])
 546
 547                 if not noSense:
 548                     newrow["sense"] = row["sense"]
 549
 550                 if withWeight:
 551                     newrow["weight"] = float(row["weight"])
 552
 553                 if withFlag:
 554                     newrow["flag"] = row["flag"]
 555
 556                 if withMismatch:
 557                     newrow["mismatch"] = row["mismatch"]
 558
 559                 if withID:
 560                     newrow["readID"] = readID
 561
 562                 if withChrom:
 563                     newrow["chrom"] = chrom
 564
 565                 if withPairID:
 566                     newrow["pairID"] = pairID
 567
 568                 resultsDict[dictKey].append(newrow)
 569
 570         return resultsDict
 571
 572
 573     def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
 574                        flag="", withWeight=False, withFlag=False, withMismatch=False,
 575                        withID=False, withChrom=False, withPairID=False, readIDDict=False,
 576                        splitRead=False, hasMismatch=False, flagLike=False, start=-1,
 577                        stop=-1, strand=""):
 578         """ returns a dictionary of spliced reads in a variety of
 579         formats and which can be restricted by chromosome or custom-flag.
 580         Returns unique spliced reads for now.
 581         """
 582         whereClause = []
 583         resultsDict = {}
 584
 585         if chrom != "" and chrom != self.memChrom:
 586             whereClause = ["chrom = '%s'" % chrom]
 587
 588         if flag != "":
 589             if flagLike:
 590                 flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
 591                 whereClause.append(flagLikeClause)
 592             else:
 593                 whereClause.append("flag = '%s'" % flag)
 594
 595         if hasMismatch:
 596             whereClause.append("mismatch != ''")
 597
 598         if strand != "":
 599             whereClause.append("sense = '%s'" % strand)
 600
 601         if start > -1:
 602             whereClause.append("startL > %d" % start)
 603
 604         if stop > -1:
 605             whereClause.append("stopR < %d" % stop)
 606
 607         if len(whereClause) > 0:
 608             whereStatement = string.join(whereClause, " and ")
 609             whereQuery = "where %s" % whereStatement
 610         else:
 611             whereQuery = ""
 612
 613         selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
 614         if not noSense:
 615             selectClause.append("sense")
 616
 617         if withWeight:
 618             selectClause.append("weight")
 619
 620         if withFlag:
 621             selectClause.append("flag")
 622
 623         if withMismatch:
 624             selectClause.append("mismatch")
 625
 626         selectQuery = string.join(selectClause, " ,")
 627         if self.memBacked:
 628             sql = self.memcon.cursor()
 629         else:
 630             sql = self.dbcon.cursor()
 631
 632         stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
 633         sql.execute(stmt)
 634         currentReadID = ""
 635         currentChrom = ""
 636         for row in sql:
 637             pairID = 0
 638             readID = row["readID"]
 639             if fullChrom:
 640                 chrom = row["chrom"]
 641             else:
 642                 chrom = row["chrom"][3:]
 643
 644             if not readIDDict and chrom != currentChrom:
 645                 resultsDict[chrom] = []
 646                 currentChrom = chrom
 647                 dictKey = chrom
 648             elif readIDDict:
 649                 if "/" in readID:
 650                     (theReadID, pairID) = readID.split("/")
 651                 else:
 652                     theReadID = readID
 653
 654                 if theReadID != currentReadID:
 655                     resultsDict[theReadID] = []
 656                     currentReadID = theReadID
 657                     dictKey = theReadID
 658
 659             newrow = {"startL": int(row["startL"])}
 660             newrow["stopL"] = int(row["stopL"])
 661             newrow["startR"] = int(row["startR"])
 662             newrow["stopR"] = int(row["stopR"])
 663             if not noSense:
 664                 newrow["sense"] = row["sense"]
 665
 666             if withWeight:
 667                 newrow["weight"] = float(row["weight"])
 668
 669             if withFlag:
 670                 newrow["flag"] = row["flag"]
 671
 672             if withMismatch:
 673                 newrow["mismatch"] = row["mismatch"]
 674
 675             if withID:
 676                 newrow["readID"] = readID
 677
 678             if withChrom:
 679                 newrow["chrom"] = chrom
 680
 681             if withPairID:
 682                 newrow["pairID"] = pairID
 683
 684             if splitRead:
 685                 leftDict = newrow.copy()
 686                 del leftDict["startR"]
 687                 del leftDict["stopR"]
 688                 rightDict = newrow
 689                 del rightDict["startL"]
 690                 del rightDict["stopL"]
 691                 resultsDict[dictKey].append(leftDict)
 692                 resultsDict[dictKey].append(rightDict)
 693             else:
 694                 resultsDict[dictKey].append(newrow)
 695
 696         return resultsDict
 697
 698
 699     def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
 700                   splices=False, reportCombined=True, sense="both"):
 701         """ return read counts for a given region.
 702         """
 703         ucount = 0
 704         mcount = 0
 705         scount = 0
 706         restrict = ""
 707         if sense in ["+", "-"]:
 708             restrict = " sense ='%s' " % sense
 709
 710         if uniqs:
 711             try:
 712                 ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
 713             except:
 714                 ucount = 0
 715
 716         if multi:
 717             try:
 718                 mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
 719             except:
 720                 mcount = 0
 721
 722         if splices:
 723             try:
 724                 scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
 725             except:
 726                 scount = 0
 727
 728         if reportCombined:
 729             total = ucount + mcount + scount
 730             return total
 731         else:
 732             return (ucount, mcount, scount)
 733
 734
 735     def getTotalCounts(self, chrom="", rmin="", rmax=""):
 736         """ return read counts for a given region.
 737         """
 738         return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
 739
 740
 741     def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
 742         """ returns the number of row in the uniqs table.
 743         """
 744         whereClause = []
 745         count = 0
 746
 747         if chrom !=""  and chrom != self.memChrom:
 748             whereClause = ["chrom='%s'" % chrom]
 749
 750         if rmin != "":
 751             whereClause.append("%s >= %s" % (startField, str(rmin)))
 752
 753         if rmax != "":
 754             whereClause.append("%s <= %s" % (startField, str(rmax)))
 755
 756         if restrict != "":
 757             whereClause.append(restrict)
 758
 759         if len(whereClause) > 0:
 760             whereStatement = string.join(whereClause, " and ")
 761             whereQuery = "where %s" % whereStatement
 762         else:
 763             whereQuery = ""
 764
 765         if self.memBacked:
 766             sql = self.memcon.cursor()
 767         else:
 768             sql = self.dbcon.cursor()
 769
 770         if distinct:
 771             sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
 772         else:
 773             sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
 774
 775         result = sql.fetchone()
 776
 777         try:
 778             count = int(result[0])
 779         except:
 780             count = 0
 781
 782         return count
 783
 784
 785     def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 786         """ returns the number of row in the splices table.
 787         """
 788         return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
 789
 790
 791     def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 792         """ returns the number of distinct readIDs in the uniqs table.
 793         """
 794         return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
 795
 796
 797     def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
 798         """ returns the total weight of readIDs in the multi table.
 799         """
 800         return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
 801
 802
 803     def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
 804         """ get readID's.
 805         """
 806         stmt = []
 807         limitPart = ""
 808         if limit > 0:
 809             limitPart = "LIMIT %d" % limit
 810
 811         if uniqs:
 812             stmt.append("select readID from uniqs")
 813
 814         if multi:
 815             stmt.append("select readID from multi")
 816
 817         if splices:
 818             stmt.append("select readID from splices")
 819
 820         if len(stmt) > 0:
 821             selectPart = string.join(stmt, " union ")
 822         else:
 823             selectPart = ""
 824
 825         sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
 826         if self.memBacked:
 827             sql = self.memcon.cursor()
 828         else:
 829             sql = self.dbcon.cursor()
 830
 831         sql.execute(sqlQuery)
 832         result = sql.fetchall()
 833
 834         if paired:
 835             return [x[0].split("/")[0] for x in result]
 836         else:
 837             return [x[0] for x in result]
 838
 839
 840     def getMismatches(self, mischrom=None, verbose=False, useSplices=True):
 841         """ returns the uniq and spliced mismatches in a dictionary.
 842         """
 843         readlen = self.getReadSize()
 844         if mischrom:
 845             hitChromList = [mischrom]
 846         else:
 847             hitChromList = self.getChromosomes()
 848             hitChromList.sort()
 849
 850         snpDict = {}
 851         for achrom in hitChromList:
 852             if verbose:
 853                 print "getting mismatches from chromosome %s" % (achrom)
 854
 855             snpDict[achrom] = []
 856             hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
 857             if useSplices and self.dataType == "RNA":
 858                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
 859                 spliceIDList = spliceDict.keys()
 860                 for k in spliceIDList:
 861                     spliceEntry = spliceDict[k][0]
 862                     startpos = spliceEntry["startL"]
 863                     lefthalf = spliceEntry["stopL"]
 864                     rightstart = spliceEntry["startR"]
 865                     sense = spliceEntry["sense"]
 866                     mismatches = spliceEntry["mismatch"]
 867                     spMismatchList = mismatches.split(",")
 868                     for mismatch in spMismatchList:
 869                         if "N" in mismatch:
 870                             continue
 871
 872                         change_len = len(mismatch)
 873                         if sense == "+":
 874                             change_from = mismatch[0]
 875                             change_base = mismatch[change_len-1]
 876                             change_pos = int(mismatch[1:change_len-1])
 877                         elif sense == "-":
 878                             change_from = getReverseComplement([mismatch[0]])
 879                             change_base = getReverseComplement([mismatch[change_len-1]])
 880                             change_pos = readlen - int(mismatch[1:change_len-1]) + 1
 881
 882                         firsthalf = int(lefthalf)-int(startpos)+1
 883                         secondhalf = 0
 884                         if int(change_pos) <= int(firsthalf):
 885                             change_at = startpos + change_pos - 1
 886                         else:
 887                             secondhalf = change_pos - firsthalf
 888                             change_at = rightstart + secondhalf
 889
 890                         snpDict[achrom].append([startpos, change_at, change_base, change_from])
 891
 892             if achrom not in hitDict.keys():
 893                 continue
 894
 895             for readEntry in hitDict[achrom]:
 896                 start = readEntry["start"]
 897                 sense = readEntry["sense"]
 898                 mismatches = readEntry["mismatch"]
 899                 mismatchList = mismatches.split(",")
 900                 for mismatch in mismatchList:
 901                     if "N" in mismatch:
 902                         continue
 903
 904                     change_len = len(mismatch)
 905                     if sense == "+":
 906                         change_from = mismatch[0]
 907                         change_base = mismatch[change_len-1]
 908                         change_pos = int(mismatch[1:change_len-1])
 909                     elif sense == "-":
 910                         change_from = getReverseComplement([mismatch[0]])
 911                         change_base = getReverseComplement([mismatch[change_len-1]])
 912                         change_pos = readlen - int(mismatch[1:change_len-1]) + 1
 913
 914                     change_at = start + change_pos - 1
 915                     snpDict[achrom].append([start, change_at, change_base, change_from])
 916
 917         return snpDict
 918
 919
 920     def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
 921                         useSplices=False, normalizationFactor = 1.0, trackStrand=False,
 922                         keepStrand="both", shiftValue=0):
 923         """return a profile of the chromosome as an array of per-base read coverage....
 924             keepStrand = 'both', 'plusOnly', or 'minusOnly'.
 925             Will also shift position of unique and multireads (but not splices) if shift is a natural number
 926         """
 927         metadata = self.getMetadata()
 928         try:
 929             readlen = int(metadata["readsize"])
 930         except KeyError:
 931             readlen = 0
 932
 933         dataType = metadata["dataType"]
 934         scale = 1. / normalizationFactor
 935         shift = {}
 936         shift['+'] = int(shiftValue)
 937         shift['-'] = -1 * int(shiftValue)
 938
 939         if cstop > 0:
 940             lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen
 941         else:
 942             lastNT = cstop - cstart + readlen + shift["+"]
 943
 944         chromModel = array("f",[0.] * lastNT)
 945         hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True)
 946         if cstart < 0:
 947             cstart = 0
 948
 949         for readEntry in hitDict[chromosome]:
 950             hstart = readEntry["start"]
 951             sense =  readEntry ["sense"]
 952             weight = readEntry["weight"]
 953             hstart = hstart - cstart + shift[sense]
 954             for currentpos in range(hstart,hstart+readlen):
 955                 try:
 956                     if not trackStrand or (sense == "+" and keepStrand != "minusOnly"):
 957                         chromModel[currentpos] += scale * weight
 958                     elif sense == "-" and keepStrand != "plusOnly":
 959                         chromModel[currentpos] -= scale * weight
 960                 except:
 961                     continue
 962
 963         del hitDict
 964         if useSplices and dataType == "RNA":
 965             if cstop > 0:
 966                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
 967             else:
 968                 spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
 969
 970             if chromosome in spliceDict:
 971                 for spliceEntry in spliceDict[chromosome]:
 972                     Lstart = spliceEntry["startL"]
 973                     Lstop = spliceEntry["stopL"]
 974                     Rstart = spliceEntry["startR"]
 975                     Rstop = spliceEntry["stopR"]
 976                     rsense = spliceEntry["sense"]
 977                     if (Rstop - cstart) < lastNT:
 978                         for index in range(abs(Lstop - Lstart)):
 979                             currentpos = Lstart - cstart + index
 980                             # we only track unique splices
 981                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
 982                                 chromModel[currentpos] += scale
 983                             elif rsense == "-" and keepStrand != "plusOnly":
 984                                 chromModel[currentpos] -= scale
 985
 986                         for index in range(abs(Rstop - Rstart)):
 987                             currentpos = Rstart - cstart + index
 988                             # we only track unique splices
 989                             if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"):
 990                                 chromModel[currentpos] += scale
 991                             elif rsense == "-" and keepStrand != "plusOnly":
 992                                 chromModel[currentpos] -= scale
 993
 994             del spliceDict
 995
 996         return chromModel
 997
 998
 999     def insertMetadata(self, valuesList):
1000         """ inserts a list of (pname, pvalue) into the metadata
1001         table.
1002         """
1003         self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
1004         self.dbcon.commit()
1005
1006
1007     def updateMetadata(self, pname, newValue, originalValue=""):
1008         """ update a metadata field given the original value and the new value.
1009         """
1010         stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
1011         if originalValue != "":
1012             stmt += " and value='%s' " % str(originalValue)
1013
1014         self.dbcon.execute(stmt)
1015         self.dbcon.commit()
1016
1017
1018     def insertUniqs(self, valuesList):
1019         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1020         into the uniqs table.
1021         """
1022         self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1023         self.dbcon.commit()
1024
1025
1026     def insertMulti(self, valuesList):
1027         """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
1028         into the multi table.
1029         """
1030         self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
1031         self.dbcon.commit()
1032
1033
1034     def insertSplices(self, valuesList):
1035         """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
1036         into the splices table.
1037         """
1038         self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
1039         self.dbcon.commit()
1040
1041
1042     def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
1043         """ update reads on file database in a list region of regions for a chromosome to have a new flag.
1044             regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
1045             sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense).
1046         """
1047         restrict = ""
1048         if sense != "both":
1049             restrict = " and sense = ? "
1050
1051         if uniqs:
1052             self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1053
1054         if multi:
1055             self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList)
1056
1057         if self.dataType == "RNA" and splices:
1058             self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList)
1059             self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList)
1060
1061         self.dbcon.commit()
1062
1063
1064     def setFlags(self, flag, uniqs=True, multi=True, splices=True):
1065         """ set the flag fields in the entire dataset.
1066         """
1067         if uniqs:
1068             self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
1069
1070         if multi:
1071             self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
1072
1073         if self.dataType == "RNA" and splices:
1074             self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
1075
1076         self.dbcon.commit()
1077
1078
1079     def resetFlags(self, uniqs=True, multi=True, splices=True):
1080         """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
1081         """
1082         self.setFlags("", uniqs, multi, splices)
1083
1084
1085     def reweighMultireads(self, readList):
1086         self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
1087
1088
1089     def setSynchronousPragma(self, value="ON"):
1090         try:
1091             self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
1092         except:
1093             print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
1094
1095
1096     def setDBcache(self, cache, default=False):
1097         self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
1098         if default:
1099             self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
1100
1101
1102     def execute(self, statement, returnResults=False):
1103         sql = self.getSqlCursor()
1104
1105         sql.execute(statement)
1106         if returnResults:
1107             result = sql.fetchall()
1108             return result
1109
1110
1111     def executeCommit(self, statement):
1112         self.execute(statement)
1113
1114         if self.memBacked:
1115             self.memcon.commit()
1116         else:
1117             self.dbcon.commit()
1118
1119
1120     def buildIndex(self, cache=100000):
1121         """ Builds the file indeces for the main tables.
1122             Cache is the number of 1.5 kb pages to keep in memory.
1123             100000 pages translates into 150MB of RAM, which is our default.
1124         """
1125         if cache > self.getDefaultCacheSize():
1126             self.setDBcache(cache)
1127         self.setSynchronousPragma("OFF")
1128         self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1129         print "built uPosIndex"
1130         self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
1131         print "built uChromIndex"
1132         self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1133         print "built mPosIndex"
1134         self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
1135         print "built mChromIndex"
1136
1137         if self.dataType == "RNA":
1138             self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
1139             print "built sPosIndex"
1140             self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
1141             print "built sPosIndex2"
1142             self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
1143             print "built sChromIndex"
1144
1145         self.dbcon.commit()
1146         self.setSynchronousPragma("ON")
1147
1148
1149     def dropIndex(self):
1150         """ drops the file indices for the main tables.
1151         """
1152         try:
1153             self.setSynchronousPragma("OFF")
1154             self.dbcon.execute("DROP INDEX uPosIndex")
1155             self.dbcon.execute("DROP INDEX uChromIndex")
1156             self.dbcon.execute("DROP INDEX mPosIndex")
1157             self.dbcon.execute("DROP INDEX mChromIndex")
1158
1159             if self.dataType == "RNA":
1160                 self.dbcon.execute("DROP INDEX sPosIndex")
1161                 try:
1162                     self.dbcon.execute("DROP INDEX sPosIndex2")
1163                 except:
1164                     pass
1165
1166                 self.dbcon.execute("DROP INDEX sChromIndex")
1167
1168             self.dbcon.commit()
1169         except:
1170             print "problem dropping index"
1171
1172         self.setSynchronousPragma("ON")
1173
1174
1175     def memSync(self, chrom="", index=False):
1176         """ makes a copy of the dataset into memory for faster access.
1177         Can be restricted to a "full" chromosome. Can also build the
1178         memory indices.
1179         """
1180         self.memcon = ""
1181         self.memcon = sqlite.connect(":memory:")
1182         self.initializeTables(self.memcon)
1183         cursor = self.dbcon.cursor()
1184         whereclause = ""
1185         if chrom != "":
1186             print "memSync %s" % chrom
1187             whereclause = " where chrom = '%s' " % chrom
1188             self.memChrom = chrom
1189         else:
1190             self.memChrom = ""
1191
1192         self.memcon.execute("PRAGMA temp_store = MEMORY")
1193         self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
1194         # copy metadata to memory
1195         self.memcon.execute("delete from metadata")
1196         results = cursor.execute("select name, value from metadata")
1197         results2 = []
1198         for row in results:
1199             results2.append((row["name"], row["value"]))
1200
1201         self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
1202
1203         self.copyDBEntriesToMemory("uniqs", whereclause)
1204         self.copyDBEntriesToMemory("multi", whereclause)
1205         if self.dataType == "RNA":
1206             self.copySpliceDBEntriesToMemory(whereclause)
1207
1208         if index:
1209             if chrom != "":
1210                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
1211                 self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
1212                 if self.dataType == "RNA":
1213                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
1214                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
1215             else:
1216                 self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
1217                 self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
1218                 if self.dataType == "RNA":
1219                     self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
1220                     self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
1221
1222         self.memBacked = True
1223         self.memcon.row_factory = sqlite.Row
1224         self.memcon.commit()
1225
1226
1227     def copyDBEntriesToMemory(self, dbName, whereClause=""):
1228         cursor = self.dbcon.cursor()
1229         sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
1230         destinationEntries = []
1231         for row in sourceEntries:
1232             destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
1233
1234         self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
1235
1236
1237     def copySpliceDBEntriesToMemory(self, whereClause=""):
1238         cursor = self.dbcon.cursor()
1239         sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
1240         destinationEntries = []
1241         for row in sourceEntries:
1242             destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
1243                                        row["weight"], row["flag"], row["mismatch"]))
1244
1245         self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", destinationEntries)
1246