Release version for Erange 4.0a

[erange.git] / ReadDataset.py
diff --git a/ReadDataset.py b/ReadDataset.py

index 5ff60e2e6954c862888c5807495ef49142e14fa3..56e28a8995b734735ace06d1d990ac9b46753580 100644 (file)
--- a/ReadDataset.py
+++ b/ReadDataset.py
@@ -153,42 +153,42 @@ class ReadDataset():
              self.cachedDB = ""
  
  
-    def attachDB(self, filename, asname):
+    def attachDB(self, filename, dbName):
          """ attach another database file to the readDataset.
          """
-        stmt = "attach '%s' as %s" % (filename, asname)
+        stmt = "attach '%s' as %s" % (filename, dbName)
          self.execute(stmt)
  
  
-    def detachDB(self, asname):
+    def detachDB(self, dbName):
          """ detach a database file to the readDataset.
          """
-        stmt = "detach %s" % (asname)
+        stmt = "detach %s" % (dbName)
          self.execute(stmt)
  
  
-    def importFromDB(self, asname, table, ascolumns="*", destcolumns="", flagged=""):
+    def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
          """ import into current RDS the table (with columns destcolumns,
              with default all columns) from the database file asname,
              using the column specification of ascolumns (default all).
          """
-        stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, asname, table)
+        stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
          if flagged != "":
              stmt += " where flag = '%s' " % flagged
  
          self.executeCommit(stmt)
  
  
-    def getTables(self, asname=""):
+    def getTables(self, dbName=""):
          """ get a list of table names in a particular database file.
          """
          resultList = []
          sql = self.getSqlCursor()
  
-        if asname != "":
-            asname += "."
+        if dbName != "":
+            dbName = "%s." % dbName
  
-        stmt = "select name from %ssqlite_master where type='table'" % asname
+        stmt = "select name from %ssqlite_master where type='table'" % dbName
          sql.execute(stmt)
          results = sql.fetchall()
  
@@ -291,11 +291,15 @@ class ReadDataset():
          if "readsize" not in metadata:
              raise ReadDatasetError("no readsize parameter defined")
          else:
-            mysize = metadata["readsize"]
-            if "import" in mysize:
-                mysize = mysize.split()[0]
+            readSize = metadata["readsize"]
+            if "import" in readSize:
+                readSize = readSize.split()[0]
  
-            return int(mysize)
+            readSize = int(readSize)
+            if readSize < 0:
+                raise ReadDatasetError("readsize is negative")
+
+            return readSize
  
  
      def getDefaultCacheSize(self):
@@ -317,11 +321,9 @@ class ReadDataset():
                  if row["chrom"] not in results:
                      results.append(row["chrom"])
              else:
-                if  len(row["chrom"][3:].strip()) < 1:
-                    continue
-
-                if row["chrom"][3:] not in results:
-                    results.append(row["chrom"][3:])
+                shortName = row["chrom"][3:]
+                if  len(shortName.strip()) > 0 and shortName not in results:
+                    results.append(shortName)
  
          results.sort()
  
@@ -333,32 +335,17 @@ class ReadDataset():
          """ returns the maximum coordinate for reads on a given chromosome.
          """
          maxCoord = 0
-        sql = self.getSqlCursor()
  
          if doUniqs:
-            try:
-                sql.execute("select max(start) from uniqs where chrom = '%s'" % chrom)
-                maxCoord = int(sql.fetchall()[0][0])
-            except:
-                print "couldn't retrieve coordMax for chromosome %s" % chrom
+            maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
  
          if doSplices:
-            sql.execute("select max(startR) from splices where chrom = '%s'" % chrom)
-            try:
-                spliceMax = int(sql.fetchall()[0][0])
-                if spliceMax > maxCoord:
-                    maxCoord = spliceMax
-            except:
-                pass
+            spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
+            maxCoord = max(spliceMax, maxCoord)
  
          if doMulti:
-            sql.execute("select max(start) from multi where chrom = '%s'" % chrom)
-            try:
-                multiMax = int(sql.fetchall()[0][0])
-                if multiMax > maxCoord:
-                    maxCoord = multiMax
-            except:
-                pass
+            multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
+            maxCoord = max(multiMax, maxCoord)
  
          if verbose:
              print "%s maxCoord: %d" % (chrom, maxCoord)
@@ -366,6 +353,19 @@ class ReadDataset():
          return maxCoord
  
  
+    def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
+        maxCoord = 0
+        sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
+        sql = self.getSqlCursor()
+        try:
+            sql.execute(sqlStatement)
+            maxCoord = int(sql.fetchall()[0][0])
+        except:
+            print "couldn't retrieve coordMax for chromosome %s" % chrom
+
+        return maxCoord
+
+
      def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
                       flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
                       withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
@@ -378,67 +378,14 @@ class ReadDataset():
          
          Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
          """
-        whereClause = []
-        resultsDict = {}
-
-        if chrom != "" and chrom != self.memChrom:
-            whereClause.append("chrom = '%s'" % chrom)
-
-        if flag != "":
-            if flagLike:
-                flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
-                whereClause.append(flagLikeClause)
-            else:
-                whereClause.append("flag = '%s'" % flag)
-
-        if start > -1:
-            whereClause.append("start > %d" % start)
-
-        if stop > -1:
-            whereClause.append("stop < %d" % stop)
-
-        if len(readLike) > 0:
-            readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
-            whereClause.append(readIDClause)
-
-        if hasMismatch:
-            whereClause.append("mismatch != ''")
-
-        if strand in ["+", "-"]:
-            whereClause.append("sense = '%s'" % strand)
-
-        if len(whereClause) > 0:
-            whereStatement = string.join(whereClause, " and ")
-            whereQuery = "where %s" % whereStatement
-        else:
-            whereQuery = ""
  
-        groupBy = []
+        whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
          if findallOptimize:
-            selectClause = ["select start, sense, sum(weight)"]
-            groupBy = ["GROUP BY start, sense"]
+            selectQuery = "select start, sense, sum(weight)"
          else:
-            selectClause = ["select ID, chrom, start, readID"]
-            if bothEnds:
-                selectClause.append("stop")
-
-            if not noSense:
-                selectClause.append("sense")
-
-            if withWeight:
-                selectClause.append("weight")
-
-            if withFlag:
-                selectClause.append("flag")
+            selectQuery = self.getReadSelectQuery("select ID, chrom, start, readID", noSense, withWeight, withFlag, withMismatch, bothEnds)
  
-            if withMismatch:
-                selectClause.append("mismatch")
-
-        if limit > 0 and not combine5p:
-            groupBy.append("LIMIT %d" % limit)
-
-        selectQuery = string.join(selectClause, ",")
-        groupQuery = string.join(groupBy)
+        groupQuery = self.getReadGroupQuery(findallOptimize, limit, combine5p)
          if doUniqs:
              stmt = [selectQuery, "from uniqs", whereQuery, groupQuery]
              if doMulti:
@@ -498,6 +445,7 @@ class ReadDataset():
          sqlQuery = string.join(stmt)
          sql.execute(sqlQuery)
  
+        resultsDict = {}
          if findallOptimize:
              resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
              if self.memBacked:
@@ -562,20 +510,17 @@ class ReadDataset():
          return resultsDict
  
  
-    def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
-                       flag="", withWeight=False, withFlag=False, withMismatch=False,
-                       withID=False, withChrom=False, withPairID=False, readIDDict=False,
-                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
-                       stop=-1, strand=""):
-        """ returns a dictionary of spliced reads in a variety of
-        formats and which can be restricted by chromosome or custom-flag.
-        Returns unique spliced reads for now.
-        """
-        whereClause = []
-        resultsDict = {}
+    def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
+        if splice:
+            startText = "startL"
+            stopText = "stopR"
+        else:
+            startText = "start"
+            stopText = "stop"
  
+        whereClause = []
          if chrom != "" and chrom != self.memChrom:
-            whereClause = ["chrom = '%s'" % chrom]
+            whereClause.append("chrom = '%s'" % chrom)
  
          if flag != "":
              if flagLike:
@@ -584,25 +529,37 @@ class ReadDataset():
              else:
                  whereClause.append("flag = '%s'" % flag)
  
+        if start > -1:
+            whereClause.append("%s > %d" % (startText, start))
+
+        if stop > -1:
+            whereClause.append("%s < %d" % (stopText, stop))
+
+        if len(readLike) > 0:
+            readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
+            whereClause.append(readIDClause)
+
          if hasMismatch:
              whereClause.append("mismatch != ''")
  
-        if strand != "":
+        if strand in ["+", "-"]:
              whereClause.append("sense = '%s'" % strand)
  
-        if start > -1:
-            whereClause.append("startL > %d" % start)
-
-        if stop > -1:
-            whereClause.append("stopR < %d" % stop)
-
          if len(whereClause) > 0:
              whereStatement = string.join(whereClause, " and ")
              whereQuery = "where %s" % whereStatement
          else:
              whereQuery = ""
  
-        selectClause = ["select ID, chrom, startL, stopL, startR, stopR, readID"]
+        return whereQuery
+
+
+    def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
+
+        selectClause = [baseSelect]
+        if bothEnds:
+            selectClause.append("stop")
+
          if not noSense:
              selectClause.append("sense")
  
@@ -615,7 +572,36 @@ class ReadDataset():
          if withMismatch:
              selectClause.append("mismatch")
  
-        selectQuery = string.join(selectClause, " ,")
+        selectQuery = string.join(selectClause, ",")
+
+        return selectQuery
+
+
+    def getReadGroupQuery(self, findallOptimize, limit, combine5p):
+        groupBy = []
+        if findallOptimize:
+            groupBy = ["GROUP BY start, sense"]
+
+        if limit > 0 and not combine5p:
+            groupBy.append("LIMIT %d" % limit)
+
+        groupQuery = string.join(groupBy)
+
+        return groupQuery
+
+
+    def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
+                       flag="", withWeight=False, withFlag=False, withMismatch=False,
+                       withID=False, withChrom=False, withPairID=False, readIDDict=False,
+                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
+                       stop=-1, strand=""):
+        """ returns a dictionary of spliced reads in a variety of
+        formats and which can be restricted by chromosome or custom-flag.
+        Returns unique spliced reads for now.
+        """
+        whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
+        selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
+        selectQuery = self.getReadSelectQuery(selectClause, noSense, withWeight, withFlag, withMismatch)
          if self.memBacked:
              sql = self.memcon.cursor()
          else:
@@ -625,6 +611,7 @@ class ReadDataset():
          sql.execute(stmt)
          currentReadID = ""
          currentChrom = ""
+        resultsDict = {}
          for row in sql:
              pairID = 0
              readID = row["readID"]
@@ -796,10 +783,6 @@ class ReadDataset():
          """ get readID's.
          """
          stmt = []
-        limitPart = ""
-        if limit > 0:
-            limitPart = "LIMIT %d" % limit
-
          if uniqs:
              stmt.append("select readID from uniqs")
  
@@ -814,6 +797,10 @@ class ReadDataset():
          else:
              selectPart = ""
  
+        limitPart = ""
+        if limit > 0:
+            limitPart = "LIMIT %d" % limit
+
          sqlQuery = "%s group by readID %s" % (selectPart, limitPart)
          if self.memBacked:
              sql = self.memcon.cursor()
@@ -845,12 +832,11 @@ class ReadDataset():
                  print "getting mismatches from chromosome %s" % (achrom)
  
              snpDict[achrom] = []
-            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
              if useSplices and self.dataType == "RNA":
                  spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
                  spliceIDList = spliceDict.keys()
-                for k in spliceIDList:
-                    spliceEntry = spliceDict[k][0]
+                for spliceID in spliceIDList:
+                    spliceEntry = spliceDict[spliceID][0]
                      startpos = spliceEntry["startL"]
                      lefthalf = spliceEntry["stopL"]
                      rightstart = spliceEntry["startR"]
@@ -881,6 +867,7 @@ class ReadDataset():
  
                          snpDict[achrom].append([startpos, change_at, change_base, change_from])
  
+            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
              if achrom not in hitDict.keys():
                  continue
  
@@ -910,7 +897,7 @@ class ReadDataset():
  
  
      def getChromProfile(self, chromosome, cstart=-1, cstop=-1, useMulti=True,
-                        useSplices=False, normalizationFactor = 1.0, trackStrand=False,
+                        useSplices=False, normalizationFactor=1.0, trackStrand=False,
                          keepStrand="both", shiftValue=0):
          """return a profile of the chromosome as an array of per-base read coverage....
              keepStrand = 'both', 'plusOnly', or 'minusOnly'.
@@ -925,8 +912,8 @@ class ReadDataset():
          dataType = metadata["dataType"]
          scale = 1. / normalizationFactor
          shift = {}
-        shift['+'] = int(shiftValue)
-        shift['-'] = -1 * int(shiftValue)
+        shift["+"] = int(shiftValue)
+        shift["-"] = -1 * int(shiftValue)
  
          if cstop > 0:
              lastNT = self.getMaxCoordinate(chromosome, doMulti=useMulti, doSplices=useSplices) + readlen