X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=ReadDataset.py;fp=ReadDataset.py;h=71544ca2420d2489fca13452976d930a6c55a9c5;hp=c9d2a0bf2b4eb56e0fa906be2319fcac866c30b3;hb=47bd897210cb85e042f11d7400b46d94400cc428;hpb=03f1e0b3bab22d517ad75b9af4d54e8fcb8540fb diff --git a/ReadDataset.py b/ReadDataset.py index c9d2a0b..71544ca 100644 --- a/ReadDataset.py +++ b/ReadDataset.py @@ -6,7 +6,7 @@ import os from array import array from commoncode import getReverseComplement, getConfigParser, getConfigOption -currentRDSVersion = "2.0" +currentRDSVersion = "2.1" class ReadDatasetError(Exception): @@ -35,6 +35,9 @@ class ReadDataset(): self.memCursor = "" self.cachedDBFile = "" + if initialize and datasetType not in ["DNA", "RNA"]: + raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'") + if cache: if verbose: print "caching ...." @@ -48,11 +51,7 @@ class ReadDataset(): self.dbcon.row_factory = sqlite.Row self.dbcon.execute("PRAGMA temp_store = MEMORY") if initialize: - if datasetType not in ["DNA", "RNA"]: - raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'") - else: - self.dataType = datasetType - + self.dataType = datasetType self.initializeTables(self.dbcon) else: metadata = self.getMetadata("dataType") @@ -69,38 +68,7 @@ class ReadDataset(): self.rdsVersion = "pre-1.0" if verbose: - if initialize: - print "INITIALIZED dataset %s" % datafile - else: - print "dataset %s" % datafile - - metadata = self.getMetadata() - print "metadata:" - pnameList = metadata.keys() - pnameList.sort() - for pname in pnameList: - print "\t" + pname + "\t" + metadata[pname] - - if reportCount: - ucount = self.getUniqsCount() - mcount = self.getMultiCount() - if self.dataType == "DNA" and not initialize: - try: - print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount)) - except ValueError: - print "\n%s unique reads and %s multireads" % (ucount, mcount) - elif self.dataType == "RNA" and not initialize: - scount = self.getSplicesCount() - try: - print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount)) - except ValueError: - print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount) - - print "default cache size is %d pages" % self.getDefaultCacheSize() - if self.hasIndex(): - print "found index" - else: - print "not indexed" + self.printRDSInfo(datafile, reportCount, initialize) def __len__(self): @@ -124,6 +92,39 @@ class ReadDataset(): self.uncacheDB() + def printRDSInfo(self, datafile, reportCount, initialize): + if initialize: + print "INITIALIZED dataset %s" % datafile + else: + print "dataset %s" % datafile + + metadata = self.getMetadata() + print "metadata:" + pnameList = metadata.keys() + pnameList.sort() + for pname in pnameList: + print "\t" + pname + "\t" + metadata[pname] + + if reportCount and not initialize: + self.printReadCounts() + + print "default cache size is %d pages" % self.getDefaultCacheSize() + if self.hasIndex(): + print "found index" + else: + print "not indexed" + + + def printReadCounts(self): + ucount = self.getUniqsCount() + mcount = self.getMultiCount() + if self.dataType == "DNA": + print "\n%d unique reads and %d multireads" % (ucount, mcount) + elif self.dataType == "RNA": + scount = self.getSplicesCount() + print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount) + + def cacheDB(self, filename): """ copy geneinfoDB to a local cache. """ @@ -246,6 +247,10 @@ class ReadDataset(): tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema dbConnection.execute("create table splices %s" % tableSchema) + positionSchema = "startL int, stopL int, startR int, stopR int" + tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema + dbConnection.execute("create table multisplices %s" % tableSchema) + dbConnection.commit() @@ -993,6 +998,14 @@ class ReadDataset(): self.dbcon.commit() + def insertMultisplices(self, valuesList): + """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) + into the multisplices table. + """ + self.dbcon.executemany("insert into multisplices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList) + self.dbcon.commit() + + def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"): """ update reads on file database in a list region of regions for a chromosome to have a new flag. regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with