rewrite of findall.py and MakeRdsFromBam to fix bugs resulting from poor initial...

[erange.git] / ReadDataset.py
diff --git a/ReadDataset.py b/ReadDataset.py

index c9d2a0bf2b4eb56e0fa906be2319fcac866c30b3..71544ca2420d2489fca13452976d930a6c55a9c5 100644 (file)
--- a/ReadDataset.py
+++ b/ReadDataset.py
@@ -6,7 +6,7 @@ import os
  from array import array
  from commoncode import getReverseComplement, getConfigParser, getConfigOption
  
-currentRDSVersion = "2.0"
+currentRDSVersion = "2.1"
  
  
  class ReadDatasetError(Exception):
@@ -35,6 +35,9 @@ class ReadDataset():
          self.memCursor = ""
          self.cachedDBFile = ""
  
+        if initialize and datasetType not in ["DNA", "RNA"]:
+            raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
+
          if cache:
              if verbose:
                  print "caching ...."
@@ -48,11 +51,7 @@ class ReadDataset():
          self.dbcon.row_factory = sqlite.Row
          self.dbcon.execute("PRAGMA temp_store = MEMORY")
          if initialize:
-            if datasetType not in ["DNA", "RNA"]:
-                raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
-            else:
-                self.dataType = datasetType
-
+            self.dataType = datasetType
              self.initializeTables(self.dbcon)
          else:
              metadata = self.getMetadata("dataType")
@@ -69,38 +68,7 @@ class ReadDataset():
                  self.rdsVersion = "pre-1.0"
  
          if verbose:
-            if initialize:
-                print "INITIALIZED dataset %s" % datafile
-            else:
-                print "dataset %s" % datafile
-
-            metadata = self.getMetadata()
-            print "metadata:"
-            pnameList = metadata.keys()
-            pnameList.sort()
-            for pname in pnameList:
-                print "\t" + pname + "\t" + metadata[pname]
-
-            if reportCount:
-                ucount = self.getUniqsCount()
-                mcount = self.getMultiCount()
-                if self.dataType == "DNA" and not initialize:
-                    try:
-                        print "\n%d unique reads and %d multireads" % (int(ucount), int(mcount))
-                    except ValueError:
-                        print "\n%s unique reads and %s multireads" % (ucount, mcount)
-                elif self.dataType == "RNA" and not initialize:
-                    scount = self.getSplicesCount()
-                    try:
-                        print "\n%d unique reads, %d spliced reads and %d multireads" % (int(ucount), int(scount), int(mcount))
-                    except ValueError:
-                        print "\n%s unique reads, %s spliced reads and %s multireads" % (ucount, scount, mcount)
-
-            print "default cache size is %d pages" % self.getDefaultCacheSize()
-            if self.hasIndex():
-                print "found index"
-            else:
-                print "not indexed"
+            self.printRDSInfo(datafile, reportCount, initialize)
  
  
      def __len__(self):
@@ -124,6 +92,39 @@ class ReadDataset():
              self.uncacheDB()
  
  
+    def printRDSInfo(self, datafile, reportCount, initialize):
+        if initialize:
+            print "INITIALIZED dataset %s" % datafile
+        else:
+            print "dataset %s" % datafile
+
+        metadata = self.getMetadata()
+        print "metadata:"
+        pnameList = metadata.keys()
+        pnameList.sort()
+        for pname in pnameList:
+            print "\t" + pname + "\t" + metadata[pname]
+
+        if reportCount and not initialize:
+            self.printReadCounts()
+
+        print "default cache size is %d pages" % self.getDefaultCacheSize()
+        if self.hasIndex():
+            print "found index"
+        else:
+            print "not indexed"
+
+
+    def printReadCounts(self):
+        ucount = self.getUniqsCount()
+        mcount = self.getMultiCount()
+        if self.dataType == "DNA":
+            print "\n%d unique reads and %d multireads" % (ucount, mcount)
+        elif self.dataType == "RNA":
+            scount = self.getSplicesCount()
+            print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount)
+
+
      def cacheDB(self, filename):
          """ copy geneinfoDB to a local cache.
          """
@@ -246,6 +247,10 @@ class ReadDataset():
              tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
              dbConnection.execute("create table splices %s" % tableSchema)
  
+            positionSchema = "startL int, stopL int, startR int, stopR int"
+            tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
+            dbConnection.execute("create table multisplices %s" % tableSchema)
+
          dbConnection.commit()
  
  
@@ -993,6 +998,14 @@ class ReadDataset():
          self.dbcon.commit()
  
  
+    def insertMultisplices(self, valuesList):
+        """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
+        into the multisplices table.
+        """
+        self.dbcon.executemany("insert into multisplices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
+        self.dbcon.commit()
+
+
      def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
          """ update reads on file database in a list region of regions for a chromosome to have a new flag.
              regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with