X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=commoncode.py;h=821936a30bc2524f0aa1664de1a66ecb8ae47fd6;hp=9d864738fef6828ad7bc8e5a4c45079eab84edbc;hb=0d3e3112fd04c2e6b44a25cacef1d591658ad181;hpb=5e4ae21098dba3d1edcf11e7279da0d84c3422e4 diff --git a/commoncode.py b/commoncode.py index 9d86473..821936a 100755 --- a/commoncode.py +++ b/commoncode.py @@ -3,25 +3,19 @@ # ENRAGE # -import tempfile -import shutil +import ConfigParser import os -from os import environ import string -import sqlite3 as sqlite from time import strftime from array import array from collections import defaultdict +import Peak +from cistematic.core.geneinfo import geneinfoDB +from cistematic.genomes import Genome +import Region -commoncodeVersion = 5.5 -currentRDSversion = 1.1 - -if environ.get("CISTEMATIC_TEMP"): - cisTemp = environ.get("CISTEMATIC_TEMP") -else: - cisTemp = "/tmp" - -tempfile.tempdir = cisTemp +commoncodeVersion = 5.6 +currentRDSversion = 2.0 def getReverseComplement(base): @@ -57,11 +51,91 @@ def writeLog(logFile, messenger, message): logfile.close() +def getGeneInfoDict(genome, cache=False): + idb = geneinfoDB(cache=cache) + if genome == "dmelanogaster": + geneinfoDict = idb.getallGeneInfo(genome, infoKey="locus") + else: + geneinfoDict = idb.getallGeneInfo(genome) + + return geneinfoDict + + +def getGeneAnnotDict(genome, inRAM=False): + return getExtendedGeneAnnotDict(genome, "", inRAM=inRAM) + + +def getExtendedGeneAnnotDict(genome, extendGenome, replaceModels=False, inRAM=False): + hg = Genome(genome, inRAM=inRAM) + if extendGenome != "": + hg.extendFeatures(extendGenome, replace=replaceModels) + + geneannotDict = hg.allAnnotInfo() + + return geneannotDict + + +def getConfigParser(fileList=[]): + configFiles = ["erange.config", os.path.expanduser("~/.erange.config")] + for filename in fileList: + configFiles.append(filename) + + config = ConfigParser.SafeConfigParser() + config.read(configFiles) + + return config + + +def getConfigOption(parser, section, option, default=None): + try: + setting = parser.get(section, option) + except (ConfigParser.NoSectionError, ConfigParser.NoOptionError): + setting = default + + return setting + + +def getConfigIntOption(parser, section, option, default=None): + try: + setting = parser.getint(section, option) + except (ConfigParser.NoSectionError, ConfigParser.NoOptionError): + setting = default + + return setting + + +def getConfigFloatOption(parser, section, option, default=None): + try: + setting = parser.getfloat(section, option) + except (ConfigParser.NoSectionError, ConfigParser.NoOptionError): + setting = default + + return setting + + +def getConfigBoolOption(parser, section, option, default=None): + try: + setting = parser.getboolean(section, option) + except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ValueError): + setting = default + + return setting + + +def getAllConfigSectionOptions(parser, section): + try: + setting = parser.items(section) + except ConfigParser.NoSectionError: + setting = [] + + return setting + + def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, keepLabel=False, - fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False, + fullChrom=False, chromField=1, scoreField=4, pad=0, compact=False, doMerge=True, keepPeak=False, returnTop=0): - """ returns a list of merged overlapping regions; + """ returns a dictionary containing a list of merged overlapping regions by chromosome; can optionally filter regions that have a scoreField fewer than minHits. Can also optionally return the label of each region, as well as the peak, if supplied (peakPos and peakHeight should be the last 2 fields). @@ -81,7 +155,7 @@ def getMergedRegions(regionfilename, maxDist=1000, minHits=0, verbose=False, kee def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, keepLabel=False, fullChrom = False, chromField=1, scoreField=4, pad=0, compact=False, doMerge=True, keepPeak=False, returnTop=0): - """ returns a list of merged overlapping regions; + """ returns a dictionary containing a list of merged overlapping regions by chromosome; can optionally filter regions that have a scoreField fewer than minHits. Can also optionally return the label of each region, as well as the peak, if supplied (peakPos and peakHeight should be the last 2 fields). @@ -159,7 +233,6 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, if not fullChrom: chrom = chrom[3:] - length = abs(stop - start) if keepPeak: peakPos = int(fields[-2 - hasPvalue - hasShift]) peakHeight = float(fields[-1 - hasPvalue - hasShift]) @@ -171,15 +244,9 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, if doMerge and len(regions[chrom]) > 0: for index in range(len(regions[chrom])): - if keepLabel and keepPeak: - (rlabel, rstart, rstop, rlen, rpeakPos, rpeakHeight) = regions[chrom][index] - elif keepLabel: - (rlabel, rstart, rstop, rlen) = regions[chrom][index] - elif keepPeak: - (rstart, rstop, rlen, rpeakPos, rpeakHeight) = regions[chrom][index] - else: - (rstart, rstop, rlen) = regions[chrom][index] - + region = regions[chrom][index] + rstart = region.start + rstop = region.stop if regionsOverlap(start, stop, rstart, rstop) or regionsAreWithinDistance(start, stop, rstart, rstop, maxDist): if start < rstart: rstart = start @@ -187,35 +254,38 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, if rstop < stop: rstop = stop - rlen = abs(rstop - rstart) if keepPeak: + rpeakPos = region.peakPos + rpeakHeight = region.peakHeight if peakHeight > rpeakHeight: rpeakHeight = peakHeight rpeakPos = peakPos - if keepLabel and keepPeak: - regions[chrom][index] = (label, rstart, rstop, rlen, rpeakPos, rpeakHeight) - elif keepLabel: - regions[chrom][index] = (label, rstart, rstop, rlen) - elif keepPeak: - regions[chrom][index] = (rstart, rstop, rlen, rpeakPos, rpeakHeight) - else: - regions[chrom][index] = (rstart, rstop, rlen) + regions[chrom][index].start = rstart + regions[chrom][index].stop = rstop + regions[chrom][index].length = abs(rstop - rstart) + if keepLabel: + regions[chrom][index].label = label + + if keepPeak: + regions[chrom][index].peakPos = rpeakPos + regions[chrom][index].peakHeight = rpeakHeight + mergeCount += 1 merged = True break if not merged: - if keepLabel and keepPeak: - regions[chrom].append((label, start, stop, length, peakPos, peakHeight)) - elif keepLabel: - regions[chrom].append((label, start, stop, length)) - elif keepPeak: - regions[chrom].append((start, stop, length, peakPos, peakHeight)) - else: - regions[chrom].append((start, stop, length)) + region = Region.Region(start, stop) + if keepLabel: + region.label = label + if keepPeak: + region.peakPos = peakPos + region.peakHeight = peakHeight + + regions[chrom].append(region) count += 1 if verbose and (count % 100000 == 0): @@ -224,10 +294,7 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False, regionCount = 0 for chrom in regions: regionCount += len(regions[chrom]) - if keepLabel: - regions[chrom].sort(cmp=lambda x,y:cmp(x[1], y[1])) - else: - regions[chrom].sort() + regions[chrom].sort(cmp=lambda x,y:cmp(x.start, y.start)) if verbose: print "merged %d times" % mergeCount @@ -257,7 +324,7 @@ def regionsAreWithinDistance(start, stop, rstart, rstop, maxDist): def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, - shift=0, returnShift=False, maxshift=75): + shift=0, maxshift=75): """ find the peak in a list of reads (hitlist) in a region of a given length and absolute start point. returns a list of peaks, the number of hits, a triangular-smoothed @@ -268,82 +335,35 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False, the peak, taken to be the first TopPos position. """ - seqArray = array("f", [0.] * length) - smoothArray = array("f", [0.] * length) - numHits = 0. - numPlus = 0. - regionArray = [] if shift == "auto": shift = getBestShiftForRegion(hitList, start, length, doWeight, maxshift) - # once we have the best shift, compute seqArray - for read in hitList: - currentpos = read[0] - start - if read[1] == "+": - currentpos += shift - else: - currentpos -= shift - - if (currentpos < 1 - readlen) or (currentpos >= length): - continue - - hitIndex = 0 - if doWeight: - weight = read[2] - else: - weight = 1.0 - - numHits += weight - if leftPlus: - regionArray.append(read) - - while currentpos < 0: - hitIndex += 1 - currentpos += 1 - - while hitIndex < readlen and currentpos < length: - seqArray[currentpos] += weight - hitIndex += 1 - currentpos += 1 - - if read[1] == "+": - numPlus += weight + seqArray, regionArray, numHits, numPlus = findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus) # implementing a triangular smooth + smoothArray = array("f", [0.] * length) for pos in range(2,length -2): smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0 - topNucleotide = 0 - topPos = [] - for currentpos in xrange(length): - if topNucleotide < smoothArray[currentpos]: - topNucleotide = smoothArray[currentpos] - topPos = [currentpos] - elif topNucleotide == smoothArray[currentpos]: - topPos.append(currentpos) + topPos = getPeakPositionList(smoothArray, length) + peak = Peak(topPos, numHits, smoothArray, numPlus, shift=shift) if leftPlus: numLeftPlus = 0 maxPos = topPos[0] for read in regionArray: if doWeight: - weight = read[2] + weight = read["weight"] else: weight = 1.0 - currentPos = read[0] - start - if currentPos <= maxPos and read[1] == "+": + currentPos = read["start"] - start + if currentPos <= maxPos and read["sense"] == "+": numLeftPlus += weight - if returnShift: - return (topPos, numHits, smoothArray, numPlus, numLeftPlus, shift) - else: - return (topPos, numHits, smoothArray, numPlus, numLeftPlus) - else: - if returnShift: - return (topPos, numHits, smoothArray, numPlus, shift) - else: - return (topPos, numHits, smoothArray, numPlus) + peak.numLeftPlus = numLeftPlus + + return peak def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75): @@ -352,8 +372,8 @@ def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75): for testShift in xrange(maxShift + 1): shiftArray = array("f", [0.] * length) for read in hitList: - currentpos = read[0] - start - if read[1] == "+": + currentpos = read["start"] - start + if read["sense"] == "+": currentpos += testShift else: currentpos -= testShift @@ -362,11 +382,11 @@ def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75): continue if doWeight: - weight = read[2] + weight = read["weight"] else: weight = 1.0 - if read[1] == "+": + if read["sense"] == "+": shiftArray[currentpos] += weight else: shiftArray[currentpos] -= weight @@ -383,6 +403,59 @@ def getBestShiftForRegion(hitList, start, length, doWeight=False, maxShift=75): return bestShift +def findPeakSequenceArray(hitList, start, shift, length, readlen, doWeight, leftPlus): + seqArray = array("f", [0.] * length) + numHits = 0. + numPlus = 0. + regionArray = [] + for read in hitList: + currentpos = read["start"] - start + if read["sense"] == "+": + currentpos += shift + else: + currentpos -= shift + + if (currentpos < 1 - readlen) or (currentpos >= length): + continue + + if doWeight: + weight = read["weight"] + else: + weight = 1.0 + + numHits += weight + if leftPlus: + regionArray.append(read) + + hitIndex = 0 + while currentpos < 0: + hitIndex += 1 + currentpos += 1 + + while hitIndex < readlen and currentpos < length: + seqArray[currentpos] += weight + hitIndex += 1 + currentpos += 1 + + if read["sense"] == "+": + numPlus += weight + + return seqArray, regionArray, numHits, numPlus + + +def getPeakPositionList(smoothArray, length): + topNucleotide = 0 + peakList = [] + for currentpos in xrange(length): + if topNucleotide < smoothArray[currentpos]: + topNucleotide = smoothArray[currentpos] + peakList = [currentpos] + elif topNucleotide == smoothArray[currentpos]: + peakList.append(currentpos) + + return peakList + + def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo=False, restrictList=[], regionComplement=False, maxStop=250000000): """ return a dictionary of cistematic gene features. Requires @@ -402,7 +475,8 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo= if len(additionalRegionsDict) > 0: sortList = [] for chrom in additionalRegionsDict: - for (label, start, stop, length) in additionalRegionsDict[chrom]: + for region in additionalRegionsDict[chrom]: + label = region.label if label not in sortList: sortList.append(label) @@ -412,7 +486,7 @@ def getFeaturesByChromDict(genomeObject, additionalRegionsDict={}, ignorePseudo= else: sense = featuresDict[label][0][-1] - featuresDict[label].append(("custom", chrom, start, stop, sense)) + featuresDict[label].append(("custom", chrom, region.start, region.stop, sense)) for gid in sortList: featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2])) @@ -525,7 +599,8 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, if len(additionalRegionsDict) > 0: sortList = [] for chrom in additionalRegionsDict: - for (label, start, stop, length) in additionalRegionsDict[chrom]: + for region in additionalRegionsDict[chrom]: + label = region.label if label not in sortList: sortList.append(label) @@ -535,7 +610,7 @@ def getLocusByChromDict(genomeObject, upstream=0, downstream=0, useCDS=True, else: sense = featuresDict[label][0][-1] - featuresDict[label].append(("custom", chrom, start, stop, sense)) + featuresDict[label].append(("custom", chrom, region.start, region.stop, sense)) for gid in sortList: featuresDict[gid].sort(cmp=lambda x,y:cmp(x[2], y[2])) @@ -705,7 +780,9 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[], print "%s\n" % chrom startRegion = 0 - for (tagStart, sense, weight) in hitDict[chrom]: + for read in hitDict[chrom]: + tagStart = read["start"] + weight = read["weight"] index += 1 if index % 100000 == 0: print "read %d " % index, @@ -773,1296 +850,3 @@ def computeRegionBins(regionsByChromDict, hitDict, bins, readlen, regionList=[], stopPoint = stop return (regionsBins, regionsLen) - - -# TODO: The readDataset class is going to be replaced by Erange.ReadDataset but this will -# require going through all the code to make the changes needed. Major project for another -# day, but it really needs to be done -class readDataset: - """ Class for storing reads from experiments. Assumes that custom scripts - will translate incoming data into a format that can be inserted into the - class using the insert* methods. Default class subtype ('DNA') includes - tables for unique and multireads, whereas 'RNA' subtype also includes a - splices table. - """ - - def __init__(self, datafile, initialize=False, datasetType='', verbose=False, - cache=False, reportCount=True): - """ creates an rds datafile if initialize is set to true, otherwise - will append to existing tables. datasetType can be either 'DNA' or 'RNA'. - """ - self.dbcon = "" - self.memcon = "" - self.dataType = "" - self.rdsVersion = "1.1" - self.memBacked = False - self.memChrom = "" - self.memCursor = "" - self.cachedDBFile = "" - - if cache: - if verbose: - print "caching ...." - - self.cacheDB(datafile) - dbfile = self.cachedDBFile - else: - dbfile = datafile - - self.dbcon = sqlite.connect(dbfile) - self.dbcon.row_factory = sqlite.Row - self.dbcon.execute("PRAGMA temp_store = MEMORY") - if initialize: - if datasetType == "": - self.dataType = "DNA" - else: - self.dataType = datasetType - - self.initializeTables(self.dbcon) - else: - metadata = self.getMetadata("dataType") - self.dataType = metadata["dataType"] - - try: - metadata = self.getMetadata("rdsVersion") - self.rdsVersion = metadata["rdsVersion"] - except: - try: - self.insertMetadata([("rdsVersion", currentRDSversion)]) - except: - print "could not add rdsVersion - read-only ?" - self.rdsVersion = "pre-1.0" - - if verbose: - if initialize: - print "INITIALIZED dataset %s" % datafile - else: - print "dataset %s" % datafile - - metadata = self.getMetadata() - print "metadata:" - pnameList = metadata.keys() - pnameList.sort() - for pname in pnameList: - print "\t" + pname + "\t" + metadata[pname] - - if reportCount: - ucount = self.getUniqsCount() - mcount = self.getMultiCount() - if self.dataType == "DNA" and not initialize: - try: - print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount)) - except: - print "\n%s unique reads and %s multireads" % (ucount, mcount) - elif self.dataType == 'RNA' and not initialize: - scount = self.getSplicesCount() - try: - print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount)) - except: - print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount) - - print "default cache size is %d pages" % self.getDefaultCacheSize() - if self.hasIndex(): - print "found index" - else: - print "not indexed" - - - def __len__(self): - """ return the number of usable reads in the dataset. - """ - try: - total = self.getUniqsCount() - except: - total = 0 - - try: - total += self.getMultiCount() - except: - pass - - if self.dataType == "RNA": - try: - total += self.getSplicesCount() - except: - pass - - try: - total = int(total) - except: - total = 0 - - return total - - - def __del__(self): - """ cleanup copy in local cache, if present. - """ - if self.cachedDBFile != "": - self.uncacheDB() - - - def cacheDB(self, filename): - """ copy geneinfoDB to a local cache. - """ - self.cachedDBFile = tempfile.mktemp() + ".db" - shutil.copyfile(filename, self.cachedDBFile) - - - def saveCacheDB(self, filename): - """ copy geneinfoDB to a local cache. - """ - shutil.copyfile(self.cachedDBFile, filename) - - - def uncacheDB(self): - """ delete geneinfoDB from local cache. - """ - global cachedDBFile - if self.cachedDBFile != "": - try: - os.remove(self.cachedDBFile) - except: - print "could not delete %s" % self.cachedDBFile - - self.cachedDB = "" - - - def attachDB(self, filename, asname): - """ attach another database file to the readDataset. - """ - stmt = "attach '%s' as %s" % (filename, asname) - self.execute(stmt) - - - def detachDB(self, asname): - """ detach a database file to the readDataset. - """ - stmt = "detach %s" % (asname) - self.execute(stmt) - - - def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""): - """ import into current RDS the table (with columns destcolumns, - with default all columns) from the database file asname, - using the column specification of ascolumns (default all). - """ - stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table) - if flagged != "": - stmt += " where flag = '%s' " % flagged - - self.execute(stmt, forceCommit=True) - - - def getTables(self, asname=""): - """ get a list of table names in a particular database file. - """ - resultList = [] - - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - - if asname != "": - asname += "." - - stmt = "select name from %ssqlite_master where type='table'" % asname - sql.execute(stmt) - results = sql.fetchall() - - for row in results: - resultList.append(row["name"]) - - return resultList - - - def hasIndex(self): - """ check whether the RDS file has at least one index. - """ - stmt = "select count(*) from sqlite_master where type='index'" - count = int(self.execute(stmt, returnResults=True)[0][0]) - if count > 0: - return True - - return False - - - def initializeTables(self, acon, cache=100000): - """ creates table schema in database connection acon, which is - typically a database file or an in-memory database. - """ - acon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache) - acon.execute("create table metadata (name varchar, value varchar)") - acon.execute("insert into metadata values('dataType','%s')" % self.dataType) - acon.execute("create table uniqs (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, start int, stop int, sense varchar, weight real, flag varchar, mismatch varchar)") - acon.execute("create table multi (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, start int, stop int, sense varchar, weight real, flag varchar, mismatch varchar)") - if self.dataType == "RNA": - acon.execute("create table splices (ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, startL int, stopL int, startR int, stopR int, sense varchar, weight real, flag varchar, mismatch varchar)") - - acon.commit() - - - def getFileCursor(self): - """ returns a cursor to file database for low-level (SQL) - access to the data. - """ - return self.dbcon.cursor() - - - def getMemCursor(self): - """ returns a cursor to memory database for low-level (SQL) - access to the data. - """ - return self.memcon.cursor() - - - def getMetadata(self, valueName=""): - """ returns a dictionary of metadata. - """ - whereClause = "" - resultsDict = {} - - if valueName != "": - whereClause = " where name = '%s' " % valueName - - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - - sql.execute("select name, value from metadata" + whereClause) - results = sql.fetchall() - - for row in results: - pname = row["name"] - pvalue = row["value"] - if pname not in resultsDict: - resultsDict[pname] = pvalue - else: - trying = True - index = 2 - while trying: - newName = pname + ":" + str(index) - if newName not in resultsDict: - resultsDict[newName] = pvalue - trying = False - - index += 1 - - return resultsDict - - - def getReadSize(self): - """ returns readsize if defined in metadata. - """ - metadata = self.getMetadata() - if "readsize" not in metadata: - print "no readsize parameter defined - returning 0" - return 0 - else: - mysize = metadata["readsize"] - if "import" in mysize: - mysize = mysize.split()[0] - - return int(mysize) - - - def getDefaultCacheSize(self): - """ returns the default cache size. - """ - return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0]) - - - def getChromosomes(self, table="uniqs", fullChrom=True): - """ returns a list of distinct chromosomes in table. - """ - statement = "select distinct chrom from %s" % table - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - - sql.execute(statement) - results = [] - for row in sql: - if fullChrom: - if row["chrom"] not in results: - results.append(row["chrom"]) - else: - if len(row["chrom"][3:].strip()) < 1: - continue - - if row["chrom"][3:] not in results: - results.append(row["chrom"][3:]) - - results.sort() - - return results - - - def getMaxCoordinate(self, chrom, verbose=False, doUniqs=True, - doMulti=False, doSplices=False): - """ returns the maximum coordinate for reads on a given chromosome. - """ - maxCoord = 0 - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - - if doUniqs: - try: - sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom) - maxCoord = int(sql.fetchall()[0][0]) - except: - print "couldn't retrieve coordMax for chromosome %s" % chrom - - if doSplices: - sql.execute("select max(startR) from splices where chrom = '%s'" % chrom) - try: - spliceMax = int(sql.fetchall()[0][0]) - if spliceMax > maxCoord: - maxCoord = spliceMax - except: - pass - - if doMulti: - sql.execute("select max(start) from multi where chrom = '%s'" % chrom) - try: - multiMax = int(sql.fetchall()[0][0]) - if multiMax > maxCoord: - maxCoord = multiMax - except: - pass - - if verbose: - print "%s maxCoord: %d" % (chrom, maxCoord) - - return maxCoord - - - def getReadsDict(self, verbose=False, bothEnds=False, noSense=False, fullChrom=False, chrom="", - flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False, - withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False, - readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False, - flagLike=False, strand="", entryDict=False, combine5p=False): - """ returns a dictionary of reads in a variety of formats - and which can be restricted by chromosome or custom-flag. - Returns unique reads by default, but can return multireads - with doMulti set to True. - """ - whereClause = [] - resultsDict = {} - - if chrom != "" and chrom != self.memChrom: - whereClause.append("chrom = '%s'" % chrom) - - if flag != "": - if flagLike: - flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "") - whereClause.append(flagLikeClause) - else: - whereClause.append("flag = '%s'" % flag) - - if start > -1: - whereClause.append("start > %d" % start) - - if stop > -1: - whereClause.append("stop < %d" % stop) - - if len(readLike) > 0: - readIDClause = string.join(["readID LIKE '", readLike, "%'"], "") - whereClause.append(readIDClause) - - if hasMismatch: - whereClause.append("mismatch != ''") - - if strand in ["+", "-"]: - whereClause.append("sense = '%s'" % strand) - - if len(whereClause) > 0: - whereStatement = string.join(whereClause, " and ") - whereQuery = "where %s" % whereStatement - else: - whereQuery = "" - - groupBy = [] - if findallOptimize: - selectClause = ["select start, sense, sum(weight)"] - groupBy = ["GROUP BY start, sense"] - else: - selectClause = ["select ID, chrom, start, readID"] - if bothEnds: - selectClause.append("stop") - - if not noSense: - selectClause.append("sense") - - if withWeight: - selectClause.append("weight") - - if withFlag: - selectClause.append("flag") - - if withMismatch: - selectClause.append("mismatch") - - if limit > 0 and not combine5p: - groupBy.append("LIMIT %d" % limit) - - selectQuery = string.join(selectClause, ",") - groupQuery = string.join(groupBy) - if doUniqs: - stmt = [selectQuery, "from uniqs", whereQuery, groupQuery] - if doMulti: - stmt.append("UNION ALL") - stmt.append(selectQuery) - stmt.append("from multi") - stmt.append(whereQuery) - stmt.append(groupQuery) - else: - stmt = [selectQuery, "from multi", whereQuery] - - if combine5p: - if findallOptimize: - selectQuery = "select start, sense, weight, chrom" - - if doUniqs: - subSelect = [selectQuery, "from uniqs", whereQuery] - if doMulti: - subSelect.append("union all") - subSelect.append(selectQuery) - subSelect.append("from multi") - subSelect.append(whereQuery) - else: - subSelect = [selectQuery, "from multi", whereQuery] - - sqlStmt = string.join(subSelect) - if findallOptimize: - selectQuery = "select start, sense, sum(weight)" - - stmt = [selectQuery, "from (", sqlStmt, ") group by chrom,start having ( count(start) > 1 and count(chrom) > 1) union", - selectQuery, "from(", sqlStmt, ") group by chrom, start having ( count(start) = 1 and count(chrom) = 1)"] - - if findallOptimize: - if self.memBacked: - self.memcon.row_factory = None - sql = self.memcon.cursor() - else: - self.dbcon.row_factory = None - sql = self.dbcon.cursor() - - stmt.append("order by start") - elif readIDDict: - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - - stmt.append("order by readID, start") - else: - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - - stmt.append("order by chrom, start") - - sqlQuery = string.join(stmt) - sql.execute(sqlQuery) - - if findallOptimize: - resultsDict[chrom] = [[int(row[0]), row[1], float(row[2])] for row in sql] - if self.memBacked: - self.memcon.row_factory = sqlite.Row - else: - self.dbcon.row_factory = sqlite.Row - else: - currentChrom = "" - currentReadID = "" - pairID = 0 - for row in sql: - readID = row["readID"] - if fullChrom: - chrom = row["chrom"] - else: - chrom = row["chrom"][3:] - - if not readIDDict and chrom != currentChrom: - resultsDict[chrom] = [] - currentChrom = chrom - dictKey = chrom - elif readIDDict: - theReadID = readID - if "::" in readID: - (theReadID, multiplicity) = readID.split("::") - - if "/" in theReadID and withPairID: - (theReadID, pairID) = readID.split("/") - - if theReadID != currentReadID: - resultsDict[theReadID] = [] - currentReadID = theReadID - dictKey = theReadID - - if entryDict: - newrow = {"start": int(row["start"])} - if bothEnds: - newrow["stop"] = int(row["stop"]) - - if not noSense: - newrow["sense"] = row["sense"] - - if withWeight: - newrow["weight"] = float(row["weight"]) - - if withFlag: - newrow["flag"] = row["flag"] - - if withMismatch: - newrow["mismatch"] = row["mismatch"] - - if withID: - newrow["readID"] = readID - - if withChrom: - newrow["chrom"] = chrom - - if withPairID: - newrow["pairID"] = pairID - else: - newrow = [int(row["start"])] - if bothEnds: - newrow.append(int(row["stop"])) - - if not noSense: - newrow.append(row["sense"]) - - if withWeight: - newrow.append(float(row["weight"])) - - if withFlag: - newrow.append(row["flag"]) - - if withMismatch: - newrow.append(row["mismatch"]) - - if withID: - newrow.append(readID) - - if withChrom: - newrow.append(chrom) - - if withPairID: - newrow.append(pairID) - - resultsDict[dictKey].append(newrow) - - return resultsDict - - - def getSplicesDict(self, verbose=False, noSense=False, fullChrom=False, chrom="", - flag="", withWeight=False, withFlag=False, withMismatch=False, - withID=False, withChrom=False, withPairID=False, readIDDict=False, - splitRead=False, hasMismatch=False, flagLike=False, start=-1, - stop=-1, strand="", entryDict=False): - """ returns a dictionary of spliced reads in a variety of - formats and which can be restricted by chromosome or custom-flag. - Returns unique spliced reads for now. - """ - whereClause = [] - resultsDict = {} - - if chrom != "" and chrom != self.memChrom: - whereClause = ["chrom = '%s'" % chrom] - - if flag != "": - if flagLike: - flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "") - whereClause.append(flagLikeClause) - else: - whereClause.append("flag = '%s'" % flag) - - if hasMismatch: - whereClause.append("mismatch != ''") - - if strand != "": - whereClause.append("sense = '%s'" % strand) - - if start > -1: - whereClause.append("startL > %d" % start) - - if stop > -1: - whereClause.append("stopR < %d" % stop) - - if len(whereClause) > 0: - whereStatement = string.join(whereClause, " and ") - whereQuery = "where %s" % whereStatement - else: - whereQuery = "" - - selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"] - if not noSense: - selectClause.append("sense") - - if withWeight: - selectClause.append("weight") - - if withFlag: - selectClause.append("flag") - - if withMismatch: - selectClause.append("mismatch") - - selectQuery = string.join(selectClause, " ,") - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - - if chrom == "" and not readIDDict: - stmt = "select distinct chrom from splices %s" % whereQuery - sql.execute(stmt) - for row in sql: - if fullChrom: - chrom = row["chrom"] - else: - chrom = row["chrom"][3:] - - resultsDict[chrom] = [] - elif chrom != "" and not readIDDict: - resultsDict[chrom] = [] - - stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery) - sql.execute(stmt) - currentReadID = "" - for row in sql: - pairID = 0 - readID = row["readID"] - if fullChrom: - chrom = row["chrom"] - else: - chrom = row["chrom"][3:] - - if readIDDict: - if "/" in readID: - (theReadID, pairID) = readID.split("/") - else: - theReadID = readID - - if theReadID != currentReadID: - resultsDict[theReadID] = [] - currentReadID = theReadID - dictKey = theReadID - else: - dictKey = chrom - - if entryDict: - newrow = {"startL": int(row["startL"])} - newrow["stopL"] = int(row["stopL"]) - newrow["startR"] = int(row["startR"]) - newrow["stopR"] = int(row["stopR"]) - if not noSense: - newrow["sense"] = row["sense"] - - if withWeight: - newrow["weight"] = float(row["weight"]) - - if withFlag: - newrow["flag"] = row["flag"] - - if withMismatch: - newrow["mismatch"] = row["mismatch"] - - if withID: - newrow["readID"] = readID - - if withChrom: - newrow["chrom"] = chrom - - if withPairID: - newrow["pairID"] = pairID - - if splitRead: - leftDict = newrow - del leftDict["startR"] - del leftDict["stopR"] - rightDict = newrow - del rightDict["start"] - del rightDict["stopL"] - resultsDict[dictKey].append(leftDict) - resultsDict[dictKey].append(rightDict) - else: - resultsDict[dictKey].append(newrow) - else: - newrow = [int(row["startL"])] - newrow.append(int(row["stopL"])) - newrow.append(int(row["startR"])) - newrow.append(int(row["stopR"])) - if not noSense: - newrow.append(row["sense"]) - - if withWeight: - newrow.append(float(row["weight"])) - - if withFlag: - newrow.append(row["flag"]) - - if withMismatch: - newrow.append(row["mismatch"]) - - if withID: - newrow.append(readID) - - if withChrom: - newrow.append(chrom) - - if withPairID: - newrow.append(pairID) - - if splitRead: - resultsDict[dictKey].append(newrow[:2] + newrow[4:]) - resultsDict[dictKey].append(newrow[2:]) - else: - resultsDict[dictKey].append(newrow) - - return resultsDict - - - def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False, - splices=False, reportCombined=True, sense="both"): - """ return read counts for a given region. - """ - ucount = 0 - mcount = 0 - scount = 0 - restrict = "" - if sense in ["+", "-"]: - restrict = " sense ='%s' " % sense - - if uniqs: - try: - ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict)) - except: - ucount = 0 - - if multi: - try: - mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict)) - except: - mcount = 0 - - if splices: - try: - scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict)) - except: - scount = 0 - - if reportCombined: - total = ucount + mcount + scount - return total - else: - return (ucount, mcount, scount) - - - def getTotalCounts(self, chrom="", rmin="", rmax=""): - return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both") - - - def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"): - """ returns the number of row in the uniqs table. - """ - whereClause = [] - count = 0 - - if chrom !="" and chrom != self.memChrom: - whereClause = ["chrom='%s'" % chrom] - - if rmin != "": - whereClause.append("%s >= %s" % (startField, str(rmin))) - - if rmax != "": - whereClause.append("%s <= %s" % (startField, str(rmax))) - - if restrict != "": - whereClause.append(restrict) - - if len(whereClause) > 0: - whereStatement = string.join(whereClause, " and ") - whereQuery = "where %s" % whereStatement - else: - whereQuery = "" - - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - - if distinct: - sql.execute("select count(distinct chrom+start+sense) from %s %s" % (table, whereQuery)) - else: - sql.execute("select sum(weight) from %s %s" % (table, whereQuery)) - - result = sql.fetchone() - - try: - count = int(result[0]) - except: - count = 0 - - return count - - - def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): - """ returns the number of row in the splices table. - """ - return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL") - - - def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): - """ returns the number of distinct readIDs in the uniqs table. - """ - return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct) - - - def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False): - """ returns the total weight of readIDs in the multi table. - """ - return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct) - - - def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1): - """ get readID's. - """ - stmt = [] - limitPart = "" - if limit > 0: - limitPart = "LIMIT %d" % limit - - if uniqs: - stmt.append("select readID from uniqs") - - if multi: - stmt.append("select readID from multi") - - if splices: - stmt.append("select readID from splices") - - if len(stmt) > 0: - selectPart = string.join(stmt, " union ") - else: - selectPart = "" - - sqlQuery = "%s group by readID %s" (selectPart, limitPart) - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - - sql.execute(sqlQuery) - result = sql.fetchall() - - if paired: - return [x.split("/")[0][0] for x in result] - else: - return [x[0] for x in result] - - - def getMismatches(self, mischrom = None, verbose=False, useSplices=True): - """ returns the uniq and spliced mismatches in a dictionary. - """ - revcomp = {"A": "T", - "T": "A", - "G": "C", - "C": "G", - "N": "N" - } - - readlen = self.getReadSize() - if mischrom: - hitChromList = [mischrom] - else: - hitChromList = self.getChromosomes() - hitChromList.sort() - - snpDict = {} - for achrom in hitChromList: - if verbose: - print "getting mismatches from chromosome %s" % (achrom) - - snpDict[achrom] = [] - hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, findallOptimize=False, hasMismatch=True) - if useSplices and self.dataType == "RNA": - spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True) - spliceIDList = spliceDict.keys() - for k in spliceIDList: - (startpos, lefthalf, rightstart, endspos, sense, mismatches) = spliceDict[k][0] - spMismatchList = mismatches.split(",") - for mismatch in spMismatchList: - if "N" in mismatch: - continue - - change_len = len(mismatch) - if sense == "+": - change_from = mismatch[0] - change_base = mismatch[change_len-1] - change_pos = int(mismatch[1:change_len-1]) - elif sense == "-": - change_from = revcomp[mismatch[0]] - change_base = revcomp[mismatch[change_len-1]] - change_pos = readlen - int(mismatch[1:change_len-1]) + 1 - - firsthalf = int(lefthalf)-int(startpos)+1 - secondhalf = 0 - if int(change_pos) <= int(firsthalf): - change_at = startpos + change_pos - 1 - else: - secondhalf = change_pos - firsthalf - change_at = rightstart + secondhalf - - snpDict[achrom].append([startpos, change_at, change_base, change_from]) - - if achrom not in hitDict: - continue - - for (start, sense, mismatches) in hitDict[achrom]: - mismatchList = mismatches.split(",") - for mismatch in mismatchList: - if "N" in mismatch: - continue - - change_len = len(mismatch) - if sense == "+": - change_from = mismatch[0] - change_base = mismatch[change_len-1] - change_pos = int(mismatch[1:change_len-1]) - elif sense == "-": - change_from = revcomp[mismatch[0]] - change_base = revcomp[mismatch[change_len-1]] - change_pos = readlen - int(mismatch[1:change_len-1]) + 1 - - change_at = start + change_pos - 1 - snpDict[achrom].append([start, change_at, change_base, change_from]) - - return snpDict - - - def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True, - useSplices=False, normalizationFactor = 1.0, trackStrand=False, - keepStrand="both", shiftValue=0): - """return a profile of the chromosome as an array of per-base read coverage.... - keepStrand = 'both', 'plusOnly', or 'minusOnly'. - Will also shift position of unique and multireads (but not splices) if shift is a natural number - """ - metadata = self.getMetadata() - readlen = int(metadata["readsize"]) - dataType = metadata["dataType"] - scale = 1. / normalizationFactor - shift = {} - shift["+"] = int(shiftValue) - shift["-"] = -1 * int(shiftValue) - - if cstop > 0: - lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen - else: - lastNT = cstop - cstart + readlen + shift["+"] - - chromModel = array("f", [0.] * lastNT) - hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withWeight=True, doMulti=useMulti, start=cstart, stop=cstop, findallOptimize=True) - if cstart < 0: - cstart = 0 - - for (hstart, sense, weight) in hitDict[chromosome]: - hstart = hstart - cstart + shift[sense] - for currentpos in range(hstart,hstart+readlen): - try: - if not trackStrand or (sense == "+" and keepStrand != "minusOnly"): - chromModel[currentpos] += scale * weight - elif sense == '-' and keepStrand != "plusOnly": - chromModel[currentpos] -= scale * weight - except: - continue - - del hitDict - if useSplices and dataType == "RNA": - if cstop > 0: - spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop) - else: - spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True) - - if chromosome in spliceDict: - for (Lstart, Lstop, Rstart, Rstop, rsense, readName) in spliceDict[chromosome]: - if (Rstop - cstart) < lastNT: - for index in range(abs(Lstop - Lstart)): - currentpos = Lstart - cstart + index - # we only track unique splices - if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"): - chromModel[currentpos] += scale - elif rsense == "-" and keepStrand != "plusOnly": - chromModel[currentpos] -= scale - - for index in range(abs(Rstop - Rstart)): - currentpos = Rstart - cstart + index - # we only track unique splices - if not trackStrand or (rsense == "+" and keepStrand != "minusOnly"): - chromModel[currentpos] += scale - elif rsense == "-" and keepStrand != "plusOnly": - chromModel[currentpos] -= scale - - del spliceDict - - return chromModel - - - def insertMetadata(self, valuesList): - """ inserts a list of (pname, pvalue) into the metadata - table. - """ - self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList) - self.dbcon.commit() - - - def updateMetadata(self, pname, newValue, originalValue=""): - """ update a metadata field given the original value and the new value. - """ - stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname) - if originalValue != "": - stmt += " and value='%s' " % str(originalValue) - - self.dbcon.execute(stmt) - self.dbcon.commit() - - - def insertUniqs(self, valuesList): - """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch) - into the uniqs table. - """ - self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList) - self.dbcon.commit() - - - def insertMulti(self, valuesList): - """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch) - into the multi table. - """ - self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList) - self.dbcon.commit() - - - def insertSplices(self, valuesList): - """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) - into the splices table. - """ - self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList) - self.dbcon.commit() - - - def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"): - """ update reads on file database in a list region of regions for a chromosome to have a new flag. - regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with - sense set to '+' or '-', 5 fields per region of the form (flag, chrom, start, stop, sense). - """ - restrict = "" - if sense != "both": - restrict = " and sense = ? " - - if uniqs: - self.dbcon.executemany("UPDATE uniqs SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList) - - if multi: - self.dbcon.executemany("UPDATE multi SET flag = ? where chrom = ? and start >= ? and start < ? " + restrict, regionsList) - - if self.dataType == "RNA" and splices: - self.dbcon.executemany("UPDATE splices SET flag = flag || ' L:' || ? where chrom = ? and startL >= ? and startL < ? " + restrict, regionsList) - self.dbcon.executemany("UPDATE splices SET flag = flag || ' R:' || ? where chrom = ? and startR >= ? and startR < ? " + restrict, regionsList) - - self.dbcon.commit() - - - def setFlags(self, flag, uniqs=True, multi=True, splices=True): - """ set the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch. - """ - if uniqs: - self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag) - - if multi: - self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag) - - if self.dataType == 'RNA' and splices: - self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag) - - self.dbcon.commit() - - - def resetFlags(self, uniqs=True, multi=True, splices=True): - """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch. - """ - if uniqs: - self.dbcon.execute("UPDATE uniqs SET flag = ''") - - if multi: - self.dbcon.execute("UPDATE multi SET flag = ''") - - if self.dataType == "RNA" and splices: - self.dbcon.execute("UPDATE splices SET flag = ''") - - self.dbcon.commit() - - - def reweighMultireads(self, readList): - self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList) - - - def setSynchronousPragma(self, value="ON"): - try: - self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value) - except: - print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value - - - def setDBcache(self, cache, default=False): - self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache) - if default: - self.dbcon.execute('PRAGMA DEFAULT_CACHE_SIZE = %d' % cache) - - - def execute(self, statement, returnResults=False, forceCommit=False): - if self.memBacked: - sql = self.memcon.cursor() - else: - sql = self.dbcon.cursor() - - sql.execute(statement) - if returnResults: - result = sql.fetchall() - return result - - if forceCommit: - if self.memBacked: - self.memcon.commit() - else: - self.dbcon.commit() - - - def buildIndex(self, cache=100000): - """ Builds the file indeces for the main tables. - Cache is the number of 1.5 kb pages to keep in memory. - 100000 pages translates into 150MB of RAM, which is our default. - """ - if cache > self.getDefaultCacheSize(): - self.setDBcache(cache) - self.setSynchronousPragma("OFF") - self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)") - print "built uPosIndex" - self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)") - print "built uChromIndex" - self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)") - print "built mPosIndex" - self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)") - print "built mChromIndex" - - if self.dataType == "RNA": - self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)") - print "built sPosIndex" - self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)") - print "built sPosIndex2" - self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)") - print "built sChromIndex" - - self.dbcon.commit() - self.setSynchronousPragma("ON") - - - def dropIndex(self): - """ drops the file indices for the main tables. - """ - try: - self.setSynchronousPragma("OFF") - self.dbcon.execute("DROP INDEX uPosIndex") - self.dbcon.execute("DROP INDEX uChromIndex") - self.dbcon.execute("DROP INDEX mPosIndex") - self.dbcon.execute("DROP INDEX mChromIndex") - - if self.dataType == "RNA": - self.dbcon.execute("DROP INDEX sPosIndex") - try: - self.dbcon.execute("DROP INDEX sPosIndex2") - except: - pass - - self.dbcon.execute("DROP INDEX sChromIndex") - - self.dbcon.commit() - except: - print "problem dropping index" - - self.setSynchronousPragma("ON") - - - def memSync(self, chrom="", index=False): - """ makes a copy of the dataset into memory for faster access. - Can be restricted to a "full" chromosome. Can also build the - memory indices. - """ - self.memcon = "" - self.memcon = sqlite.connect(":memory:") - self.initializeTables(self.memcon) - cursor = self.dbcon.cursor() - whereclause = "" - if chrom != "": - print "memSync %s" % chrom - whereclause = " where chrom = '%s' " % chrom - self.memChrom = chrom - else: - self.memChrom = "" - - self.memcon.execute("PRAGMA temp_store = MEMORY") - self.memcon.execute("PRAGMA CACHE_SIZE = 1000000") - # copy metadata to memory - self.memcon.execute("delete from metadata") - results = cursor.execute("select name, value from metadata") - results2 = [] - for row in results: - results2.append((row["name"], row["value"])) - - self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2) - # copy uniqs to memory - results = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from uniqs" + whereclause) - results2 = [] - for row in results: - results2.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"])) - - self.memcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", results2) - # copy multi to memory - results = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from multi" + whereclause) - results2 = [] - for row in results: - results2.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"])) - - self.memcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", results2) - # copy splices to memory - if self.dataType == "RNA": - results = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices" + whereclause) - results2 = [] - for row in results: - results2.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"], row["weight"], row["flag"], row["mismatch"])) - - self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, weight, sense, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", results2) - if index: - if chrom != "": - self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)") - self.memcon.execute("CREATE INDEX mPosIndex on multi(start)") - if self.dataType == "RNA": - self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)") - self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)") - else: - self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)") - self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)") - if self.dataType == "RNA": - self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)") - self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)") - - self.memBacked = True - self.memcon.row_factory = sqlite.Row - self.memcon.commit()