-import sqlite3 as sqlite
import string
import tempfile
import shutil
import os
+import re
+import sys
+import pysam
from array import array
-from commoncode import getReverseComplement, getConfigParser, getConfigOption
-
-currentRDSVersion = "2.1"
+from commoncode import getReverseComplement, isSpliceEntry
+currentRDSVersion = "3.0"
class ReadDatasetError(Exception):
pass
+
+
+class ReadCounter():
+    """ pysam fetch() callback that tallies unique, spliced and (fractional) multiread counts. """
+
+    def __init__(self, multiReadIDCount=None, sense="both"):
+        # default to None rather than {} to avoid sharing a mutable dict between instances
+        if multiReadIDCount is None:
+            multiReadIDCount = {}
+
+        self.multiReadIDCount = multiReadIDCount
+        self.sense = sense
+        self.uniqReadCount = 0
+        self.multiReadCount = 0.0
+        self.spliceReadCount = 0
+
+
+    def __call__(self, read):
+        # honor the optional sense restriction, mirroring the old SQL sense filter
+        if self.sense in ["+", "-"] and getReadSense(read.is_reverse) != self.sense:
+            return
+
+        multiplicity = self.getReadMultiplicity(read)
+        if multiplicity > 1:
+            self.multiReadCount += float(1.0/multiplicity)
+        elif isSpliceEntry(read.cigar):
+            self.spliceReadCount += 1
+        else:
+            self.uniqReadCount += 1
+
+
+ def getReadMultiplicity(self, read):
+ pairReadSuffix = getPairedReadNumberSuffix(read)
+ readID = "%s%s" % (read.qname, pairReadSuffix)
+ try:
+ multiplicity = self.multiReadIDCount[readID]
+ except KeyError:
+ multiplicity = 1
+
+ return multiplicity
+
+
+ def isMulti(self, read):
+ pairReadSuffix = getPairedReadNumberSuffix(read)
+ readID = "%s%s" % (read.qname, pairReadSuffix)
+ return readID in self.multiReadIDCount
+
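+# Usage sketch (illustrative): a ReadCounter is passed as the callback argument to
+# pysam's Samfile.fetch(), which is how getBamReadCounter() below drives it:
+#
+#   counter = ReadCounter(multiReadIDCounts, sense="both")
+#   bamfile.fetch(reference="chr1", callback=counter)
+#   print counter.uniqReadCount, counter.multiReadCount, counter.spliceReadCount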
+
+
+class MaxCoordFinder(ReadCounter):
+ maxUniqueStart = 0
+ maxMultiStart = 0
+ maxSpliceStart = 0
+
+ def __call__(self, read):
+ if self.isMulti(read):
+ self.maxMultiStart = max(self.maxMultiStart, read.pos)
+ elif isSpliceEntry(read.cigar):
+            self.maxSpliceStart = max(self.maxSpliceStart, getSpliceRightStart(read.pos, read.cigar))
+ else:
+            self.maxUniqueStart = max(self.maxUniqueStart, read.pos)
+
+
+
class ReadDataset():
""" Class for storing reads from experiments. Assumes that custom scripts
will translate incoming data into a format that can be inserted into the
splices table.
"""
+
def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False,
cache=False, reportCount=True):
""" creates an rds datafile if initialize is set to true, otherwise
will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
"""
- self.dbcon = ""
- self.memcon = ""
- self.dataType = ""
- self.rdsVersion = currentRDSVersion
- self.memBacked = False
- self.memChrom = ""
- self.memCursor = ""
- self.cachedDBFile = ""
if initialize and datasetType not in ["DNA", "RNA"]:
raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
- if cache:
- if verbose:
- print "caching ...."
+ self.dataType = ""
+ self.multiReadIDCounts = self.getMultiReadIDCounts(datafile, "rb")
+ self.bamfile = pysam.Samfile(datafile, "rb")
+ self.readsize = None
+        self._totalReadWeight = None  # lazy cache for totalReadWeight()
- self.cacheDB(datafile)
- dbFile = self.cachedDBFile
- else:
- dbFile = datafile
+ self.metadata = {"dataType": datasetType}
- self.dbcon = sqlite.connect(dbFile)
- self.dbcon.row_factory = sqlite.Row
- self.dbcon.execute("PRAGMA temp_store = MEMORY")
if initialize:
self.dataType = datasetType
- self.initializeTables(self.dbcon)
else:
- metadata = self.getMetadata("dataType")
- self.dataType = metadata["dataType"]
+ self.dataType = self.metadata["dataType"]
try:
- metadata = self.getMetadata("rdsVersion")
- self.rdsVersion = metadata["rdsVersion"]
- except:
- try:
- self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
- except IOError:
- print "could not add rdsVersion - read-only ?"
- self.rdsVersion = "pre-1.0"
+ self.rdsVersion = self.metadata["rdsVersion"]
+ except KeyError:
+            self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
+            self.rdsVersion = currentRDSVersion
+ self.fullReadCounts = self.getFullReadCount()
if verbose:
self.printRDSInfo(datafile, reportCount, initialize)
def __len__(self):
- """ return the number of usable reads in the dataset.
+ """ return the number of reads in the bam file
+ This is not the same as the original as the original returned total weight.
+ """
+ count = 0
+ references = self.bamfile.references
+ for reference in references:
+ count += self.bamfile.count(reference)
+
+ return count
+
+
+ def getMultiReadIDCounts(self, samFileName, fileMode):
+ try:
+ samfile = pysam.Samfile(samFileName, fileMode)
+ except ValueError:
+ print "samfile index not found"
+ sys.exit(1)
+
+ readIDCounts = {}
+ for read in samfile.fetch(until_eof=True):
+ pairReadSuffix = getPairedReadNumberSuffix(read)
+ readName = "%s%s" % (read.qname, pairReadSuffix)
+ try:
+ readIDCounts[readName] += 1.0
+ except KeyError:
+ readIDCounts[readName] = 1.0
+
+ for readID in readIDCounts.keys():
+ if int(readIDCounts[readID]) == 1:
+ del readIDCounts[readID]
+
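+        # Example (illustrative): if "readA/1" aligned to 3 locations and "readB"
+        # aligned once, the result is {"readA/1": 3.0}; single-hit reads are dropped,
+        # so membership in this dict is what marks a read as a multiread.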
+ return readIDCounts
+
+
+ def totalReadWeight(self):
+ """ return the total weight of usable reads in the bam file.
"""
- total = self.getUniqsCount()
- total += self.getMultiCount()
+        if self._totalReadWeight is None:
+ total = self.fullReadCounts["uniq"]
+ total += self.fullReadCounts["multi"]
- if self.dataType == "RNA":
- total += self.getSplicesCount()
+ if self.dataType == "RNA":
+ total += self.fullReadCounts["splice"]
- total = int(total)
+            self._totalReadWeight = int(total)
- return total
+        return self._totalReadWeight
def __del__(self):
""" cleanup copy in local cache, if present.
"""
- if self.cachedDBFile != "":
- self.uncacheDB()
+ pass
def printRDSInfo(self, datafile, reportCount, initialize):
else:
print "dataset %s" % datafile
- metadata = self.getMetadata()
print "metadata:"
- pnameList = metadata.keys()
+ pnameList = self.metadata.keys()
pnameList.sort()
for pname in pnameList:
- print "\t" + pname + "\t" + metadata[pname]
+ print "\t" + pname + "\t" + str(self.metadata[pname])
if reportCount and not initialize:
self.printReadCounts()
print "default cache size is %d pages" % self.getDefaultCacheSize()
- if self.hasIndex():
- print "found index"
- else:
- print "not indexed"
def printReadCounts(self):
- ucount = self.getUniqsCount()
- mcount = self.getMultiCount()
+ ucount = self.fullReadCounts["uniq"]
+ mcount = self.fullReadCounts["multi"]
if self.dataType == "DNA":
print "\n%d unique reads and %d multireads" % (ucount, mcount)
elif self.dataType == "RNA":
- scount = self.getSplicesCount()
+ scount = self.fullReadCounts["splice"]
print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount)
def cacheDB(self, filename):
- """ copy geneinfoDB to a local cache.
- """
- configParser = getConfigParser()
- cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
- tempfile.tempdir = cisTemp
- self.cachedDBFile = "%s.db" % tempfile.mktemp()
- shutil.copyfile(filename, self.cachedDBFile)
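+        # no-op: a BAM-backed dataset reads directly from the bam file, so this and
+        # the other SQLite-era cache/attach helpers below are kept only so that
+        # existing callers do not break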
+ pass
def saveCacheDB(self, filename):
- """ copy geneinfoDB to a local cache.
- """
- shutil.copyfile(self.cachedDBFile, filename)
+ pass
def uncacheDB(self):
- """ delete geneinfoDB from local cache.
- """
- global cachedDBFile
- if self.cachedDBFile != "":
- try:
- os.remove(self.cachedDBFile)
- except:
- print "could not delete %s" % self.cachedDBFile
-
- self.cachedDB = ""
+ pass
def attachDB(self, filename, dbName):
- """ attach another database file to the readDataset.
- """
- stmt = "attach '%s' as %s" % (filename, dbName)
- self.execute(stmt)
+ pass
def detachDB(self, dbName):
- """ detach a database file to the readDataset.
- """
- stmt = "detach %s" % (dbName)
- self.execute(stmt)
+ pass
def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
- """ import into current RDS the table (with columns destcolumns,
- with default all columns) from the database file asname,
- using the column specification of ascolumns (default all).
- """
- stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
- if flagged != "":
- stmt += " where flag = '%s' " % flagged
-
- self.executeCommit(stmt)
+ pass
def getTables(self, dbName=""):
- """ get a list of table names in a particular database file.
- """
- resultList = []
- sql = self.getSqlCursor()
-
- if dbName != "":
- dbName = "%s." % dbName
-
- stmt = "select name from %ssqlite_master where type='table'" % dbName
- sql.execute(stmt)
- results = sql.fetchall()
-
- for row in results:
- resultList.append(row["name"])
-
- return resultList
+ pass
def getSqlCursor(self):
- if self.memBacked:
- sql = self.getMemCursor()
- else:
- sql = self.getFileCursor()
-
- return sql
+ pass
def getMemCursor(self):
- """ returns a cursor to memory database for low-level (SQL)
- access to the data.
- """
- return self.memcon.cursor()
+ pass
def getFileCursor(self):
- """ returns a cursor to file database for low-level (SQL)
- access to the data.
- """
- return self.dbcon.cursor()
+ pass
- def hasIndex(self):
- """ return True if the RDS file has at least one index.
- """
- stmt = "select count(*) from sqlite_master where type='index'"
- count = int(self.execute(stmt, returnResults=True)[0][0])
+ def initializeTables(self, dbConnection, cache=100000):
+ pass
- return count > 0
+
+
+    def getMetadata(self, valueName=""):
+        if valueName == "":
+            return self.metadata
+        return {valueName: self.metadata[valueName]}
- def initializeTables(self, dbConnection, cache=100000):
- """ creates table schema in a database connection, which is
- typically a database file or an in-memory database.
- """
- dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
- dbConnection.execute("create table metadata (name varchar, value varchar)")
- dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
- positionSchema = "start int, stop int"
- tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
- dbConnection.execute("create table uniqs %s" % tableSchema)
- dbConnection.execute("create table multi %s" % tableSchema)
- if self.dataType == "RNA":
- positionSchema = "startL int, stopL int, startR int, stopR int"
- tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
- dbConnection.execute("create table splices %s" % tableSchema)
-
- positionSchema = "startL int, stopL int, startR int, stopR int"
- tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
- dbConnection.execute("create table multisplices %s" % tableSchema)
-
- dbConnection.commit()
-
-
- def getMetadata(self, valueName=""):
- """ returns a dictionary of metadata.
- """
- whereClause = ""
- resultsDict = {}
- if valueName != "":
- whereClause = " where name='%s'" % valueName
+ def getReadSize(self):
+ """ This is following the same model as the original where it assumes that all
+ read have the same readsize and that this is reflected by the size of the
+ first read.
+ """
+ if self.readsize is None:
+ bamFileIterator = self.bamfile.fetch(until_eof=True)
+ for read in bamFileIterator:
+ self.calculateReadsizeFromCigar(read.cigar)
+ break
- sql = self.getSqlCursor()
+ return self.readsize
- sql.execute("select name, value from metadata %s" % whereClause)
- results = sql.fetchall()
- for row in results:
- parameterName = row["name"]
- parameterValue = row["value"]
- if parameterName not in resultsDict:
- resultsDict[parameterName] = parameterValue
- else:
- trying = True
- index = 2
- while trying:
- newName = string.join([parameterName, str(index)], ":")
- if newName not in resultsDict:
- resultsDict[newName] = parameterValue
- trying = False
+ def calculateReadsizeFromCigar(self, cigar):
+ take = (0, 1) # CIGAR operation (M/match, I/insertion)
+ self.readsize = sum([length for op,length in cigar if op in take])
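+        # Example (illustrative): a cigar of [(0, 20), (3, 1000), (0, 16)] (20M1000N16M)
+        # gives a readsize of 36, since only M (0) and I (1) operations carry read bases.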
- index += 1
- return resultsDict
+ def getDefaultCacheSize(self):
+ """ returns 0 as cache is going to be deprecated
+ """
+ return 0
- def getReadSize(self):
- """ returns readsize if defined in metadata.
+ def getChromosomes(self, fullChrom=True):
+ """ returns a sorted list of distinct chromosomes in bam file reference list.
"""
- metadata = self.getMetadata()
- if "readsize" not in metadata:
- raise ReadDatasetError("no readsize parameter defined")
+ if fullChrom:
+ results = list(self.getFullReferenceNames())
else:
- readSize = metadata["readsize"]
- if "import" in readSize:
- readSize = readSize.split()[0]
+ results = list(self.getShortReferenceNames())
- readSize = int(readSize)
- if readSize < 0:
- raise ReadDatasetError("readsize is negative")
+ results.sort()
- return readSize
+ return results
- def getDefaultCacheSize(self):
- """ returns the default cache size.
+ def getFullReferenceNames(self):
+ """ returns a set of bam file reference names.
"""
- return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
+ return set(self.bamfile.references)
- def getChromosomes(self, table="uniqs", fullChrom=True):
- """ returns a sorted list of distinct chromosomes in table.
+ def getShortReferenceNames(self):
+ """ returns a set of bam file reference names after removing first 3 characters.
"""
- statement = "select distinct chrom from %s" % table
- sql = self.getSqlCursor()
-
- sql.execute(statement)
- results = []
- for row in sql:
- if fullChrom:
- if row["chrom"] not in results:
- results.append(row["chrom"])
- else:
- shortName = row["chrom"][3:]
- if len(shortName.strip()) > 0 and shortName not in results:
- results.append(shortName)
-
- results.sort()
+ results = set()
+ references = self.bamfile.references
+ for chromosome in references:
+ shortName = chromosome[3:]
+ if len(shortName.strip()) > 0:
+ results.add(shortName)
return results
""" returns the maximum coordinate for reads on a given chromosome.
"""
maxCoord = 0
+ coordFinder = MaxCoordFinder()
+ self.bamfile.fetch(reference=chrom, callback=coordFinder)
if doUniqs:
- maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
+ maxCoord = coordFinder.maxUniqueStart
if doSplices:
- spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
- maxCoord = max(spliceMax, maxCoord)
+ maxCoord = max(coordFinder.maxSpliceStart, maxCoord)
if doMulti:
- multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
- maxCoord = max(multiMax, maxCoord)
-
- return maxCoord
-
-
- def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
- maxCoord = 0
- sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
- sql = self.getSqlCursor()
- try:
- sql.execute(sqlStatement)
- maxCoord = int(sql.fetchall()[0][0])
- except:
- print "couldn't retrieve coordMax for chromosome %s" % chrom
+ maxCoord = max(coordFinder.maxMultiStart, maxCoord)
return maxCoord
def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
- readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
+ readIDDict=False, readLike="", start=None, stop=None, limit=-1, hasMismatch=False,
flagLike=False, strand='', combine5p=False):
- """ returns a dictionary of reads in a variety of formats
- and which can be restricted by chromosome or custom-flag.
- Returns unique reads by default, but can return multireads
- with doMulti set to True.
-
+ """ First pass of rewrite
+ 1) Leave as much in place as possible
+ 2) For now treat multis just like uniqs
+ """
"""
- #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
-
whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
if findallOptimize:
selectQuery = "select start, sense, sum(weight)"
sql = self.getSqlCursor()
sqlQuery = string.join(stmt)
sql.execute(sqlQuery)
+ """
+        # BAM-based replacement starts here; the string literal above intentionally
+        # disables the old SQL-based implementation
+ bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
resultsDict = {}
- if findallOptimize:
- resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
- if self.memBacked:
- self.memcon.row_factory = sqlite.Row
- else:
- self.dbcon.row_factory = sqlite.Row
- else:
- currentChrom = ""
- currentReadID = ""
- pairID = 0
- for row in sql:
- readID = row["readID"]
- if fullChrom:
- chrom = row["chrom"]
- else:
- chrom = row["chrom"][3:]
-
- if not readIDDict and chrom != currentChrom:
- resultsDict[chrom] = []
- currentChrom = chrom
- dictKey = chrom
- elif readIDDict:
- theReadID = readID
- if "::" in readID:
- theReadID = readID.split("::")[0]
-
- if "/" in theReadID and withPairID:
- (theReadID, pairID) = readID.split("/")
-
- if theReadID != currentReadID:
- resultsDict[theReadID] = []
- currentReadID = theReadID
- dictKey = theReadID
-
- newrow = {"start": int(row["start"])}
- if bothEnds:
- newrow["stop"] = int(row["stop"])
-
- if not noSense:
- newrow["sense"] = row["sense"]
-
- if withWeight:
- newrow["weight"] = float(row["weight"])
-
- if withFlag:
- newrow["flag"] = row["flag"]
-
- if withMismatch:
- newrow["mismatch"] = row["mismatch"]
-
- if withID:
- newrow["readID"] = readID
-
- if withChrom:
- newrow["chrom"] = chrom
-
- if withPairID:
- newrow["pairID"] = pairID
-
- resultsDict[dictKey].append(newrow)
-
- return resultsDict
-
-
- def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
- if splice:
- startText = "startL"
- stopText = "stopR"
- else:
- startText = "start"
- stopText = "stop"
+ if findallOptimize and chrom != "":
+ readIDDict = False
+ bothEnds = False
+ noSense = False
+ withWeight = True
+ withFlag = False
+ withMismatch = False
+ withID = False
+ withChrom = False
+ withPairID = False
+ fullChrom = True
+ #resultsDict[chrom] = [{"start": int(read.pos), "sense": getReadSense(read.is_reverse), "weight": 1.0} for read in bamFileIterator if not isSpliceEntry(read.cigar)]
+
+ pairID = 0
+ for read in bamFileIterator:
+ pairReadSuffix = getPairedReadNumberSuffix(read)
+ readID = "%s%s" % (read.qname, pairReadSuffix)
+ if isSpliceEntry(read.cigar) or self.readDoesNotMeetCriteria(read, readID, doMulti=doMulti):
+ continue
- whereClause = []
- if chrom != "" and chrom != self.memChrom:
- whereClause.append("chrom = '%s'" % chrom)
+ sense = getReadSense(read.is_reverse)
+ if fullChrom:
+ chrom = self.bamfile.getrname(read.rname)
+ else:
+ chrom = self.bamfile.getrname(read.rname)[3:]
- if flag != "":
- if flagLike:
- flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
- whereClause.append(flagLikeClause)
+ if readIDDict:
+ dictKey = read.qname
else:
- whereClause.append("flag = '%s'" % flag)
+ dictKey = chrom
- if start > -1:
- whereClause.append("%s > %d" % (startText, start))
+ newrow = {"start": int(read.pos)}
+ if bothEnds:
+ if self.readsize is None:
+ self.calculateReadsizeFromCigar(read.cigar)
- if stop > -1:
- whereClause.append("%s < %d" % (stopText, stop))
+ newrow["stop"] = int(read.pos + self.readsize)
- if len(readLike) > 0:
- readIDClause = string.join(["readID LIKE '", readLike, "%'"], "")
- whereClause.append(readIDClause)
+ if not noSense:
+ newrow["sense"] = sense
- if hasMismatch:
- whereClause.append("mismatch != ''")
+ if withWeight:
+ try:
+ newrow["weight"] = 1.0/self.multiReadIDCounts[readID]
+ except KeyError:
+ newrow["weight"] = 1.0
- if strand in ["+", "-"]:
- whereClause.append("sense = '%s'" % strand)
+ if withFlag:
+ #newrow["flag"] = row["flag"]
+ pass
- if len(whereClause) > 0:
- whereStatement = string.join(whereClause, " and ")
- whereQuery = "where %s" % whereStatement
- else:
- whereQuery = ""
+ if withMismatch:
+ try:
+ mismatchTag = read.opt("MD")
+ mismatches = getMismatches(mismatchTag, read.seq, sense)
+ except KeyError:
+ mismatches = ""
+
+ newrow["mismatch"] = mismatches
- return whereQuery
+ if withID:
+ newrow["readID"] = readID
+ if withChrom:
+ newrow["chrom"] = chrom
- def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
+ if withPairID:
+ newrow["pairID"] = pairReadSuffix[1:]
- selectClause = [baseSelect]
- if bothEnds:
- selectClause.append("stop")
+ try:
+ resultsDict[dictKey].append(newrow)
+ except KeyError:
+ resultsDict[dictKey] = [newrow]
- if not noSense:
- selectClause.append("sense")
+ return resultsDict
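+    # Usage sketch (illustrative): per-chromosome read dictionaries in the shape
+    # consumed by the mismatch-gathering code below:
+    #
+    #   hitDict = rds.getReadsDict(fullChrom=True, chrom="chr1", withWeight=True)
+    #   for entry in hitDict.get("chr1", []):
+    #       print entry["start"], entry["sense"], entry["weight"]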
- if withWeight:
- selectClause.append("weight")
- if withFlag:
- selectClause.append("flag")
+ def readDoesNotMeetCriteria(self, read, readID, doMulti=False, flag="", readLike="", hasMismatch=False, strand=""):
+ readDoesNotMeetCriteria = self.rejectMultiReadEntry(readID, doMulti)
- if withMismatch:
- selectClause.append("mismatch")
+ if flag != "":
+ #TODO: need to pick a tag code for the erange flag - using ZF for now
+ # although it looks like the flag is really just a label on a region.
+ # since BAM can retrieve based on location if we store the regions then
+ # we can just retrieve it directly from the BAM and we do not need to
+ # flag the individual reads. The location will be sufficient.
+ try:
+ if flag != read.opt("ZF"):
+ readDoesNotMeetCriteria = True
+ except KeyError:
+ readDoesNotMeetCriteria = True
- selectQuery = string.join(selectClause, ",")
- return selectQuery
+ if hasMismatch:
+ try:
+ mismatchTag = read.opt("MD")
+ except KeyError:
+ readDoesNotMeetCriteria = True
+ if strand in ["+", "-"] and getReadSense(read.is_reverse) != strand:
+ readDoesNotMeetCriteria = True
- def getReadGroupQuery(self, findallOptimize, limit, combine5p):
- groupBy = []
- if findallOptimize:
- groupBy = ["GROUP BY start, sense"]
+ return readDoesNotMeetCriteria
- if limit > 0 and not combine5p:
- groupBy.append("LIMIT %d" % limit)
- groupQuery = string.join(groupBy)
+ def rejectMultiReadEntry(self, readID, doMulti=False, threshold=10):
+ rejectRead = False
+        if readID in self.multiReadIDCounts:
+ if self.multiReadIDCounts[readID] > threshold or not doMulti:
+ rejectRead = True
- return groupQuery
+ return rejectRead
def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
flag="", withWeight=False, withFlag=False, withMismatch=False,
withID=False, withChrom=False, withPairID=False, readIDDict=False,
- splitRead=False, hasMismatch=False, flagLike=False, start=-1,
- stop=-1, strand=""):
- """ returns a dictionary of spliced reads in a variety of
- formats and which can be restricted by chromosome or custom-flag.
- Returns unique spliced reads for now.
+ splitRead=False, hasMismatch=False, flagLike=False, start=None,
+ stop=None, strand=""):
+ """ First pass of rewrite
+ 1) Leave as much in place as possible
+ 2) For now treat multis just like regular splices
+ """
"""
whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
sql.execute(stmt)
- currentReadID = ""
- currentChrom = ""
+ """
+
+        # BAM-based replacement starts here; the string literal above intentionally
+        # disables the old SQL-based implementation
+ bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
resultsDict = {}
- for row in sql:
- pairID = 0
- readID = row["readID"]
+ pairID = 0
+ for read in bamFileIterator:
+ if not isSpliceEntry(read.cigar):
+ continue
+
+ sense = getReadSense(read.is_reverse)
+ pairReadSuffix = getPairedReadNumberSuffix(read)
+ readID = "%s%s" % (read.qname, pairReadSuffix)
if fullChrom:
- chrom = row["chrom"]
+ chrom = self.bamfile.getrname(read.rname)
else:
- chrom = row["chrom"][3:]
+ chrom = self.bamfile.getrname(read.rname)[3:]
- if not readIDDict and chrom != currentChrom:
- resultsDict[chrom] = []
- currentChrom = chrom
+ if readIDDict:
+ dictKey = read.qname
+ else:
dictKey = chrom
- elif readIDDict:
- if "/" in readID:
- (theReadID, pairID) = readID.split("/")
- else:
- theReadID = readID
-
- if theReadID != currentReadID:
- resultsDict[theReadID] = []
- currentReadID = theReadID
- dictKey = theReadID
-
- newrow = {"startL": int(row["startL"])}
- newrow["stopL"] = int(row["stopL"])
- newrow["startR"] = int(row["startR"])
- newrow["stopR"] = int(row["stopR"])
+
+ if self.readsize is None:
+ self.calculateReadsizeFromCigar(read.cigar)
+
+ startL, startR, stopL, stopR = getSpliceBounds(read.pos, self.readsize, read.cigar)
+ newrow = {"startL": int(startL)}
+ newrow["stopL"] = int(stopL)
+ newrow["startR"] = int(startR)
+ newrow["stopR"] = int(stopR)
if not noSense:
- newrow["sense"] = row["sense"]
+ newrow["sense"] = sense
if withWeight:
- newrow["weight"] = float(row["weight"])
+ newrow["weight"] = 1.0
if withFlag:
- newrow["flag"] = row["flag"]
+ #newrow["flag"] = row["flag"]
+ pass
if withMismatch:
- newrow["mismatch"] = row["mismatch"]
+ try:
+ mismatchTag = read.opt("MD")
+ mismatches = getMismatches(mismatchTag, read.seq, sense)
+ except KeyError:
+ mismatches = ""
+
+ newrow["mismatch"] = mismatches
if withID:
newrow["readID"] = readID
rightDict = newrow
del rightDict["startL"]
del rightDict["stopL"]
- resultsDict[dictKey].append(leftDict)
+ try:
+ resultsDict[dictKey].append(leftDict)
+ except KeyError:
+ resultsDict[dictKey] = [leftDict]
+
resultsDict[dictKey].append(rightDict)
else:
- resultsDict[dictKey].append(newrow)
+ try:
+ resultsDict[dictKey].append(newrow)
+ except KeyError:
+ resultsDict[dictKey] = [newrow]
return resultsDict
- def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
+ def getFullReadCount(self):
+ fullReadCount = {"uniq": 0,
+ "multi": 0,
+ "splice": 0
+ }
+
+ for chromosome in self.getChromosomes():
+ uniqCount, multiCount, spliceCount = self.getCounts(chrom=chromosome, reportCombined=False, multi=True, splices=True)
+ fullReadCount["uniq"] += uniqCount
+ fullReadCount["multi"] += multiCount
+ fullReadCount["splice"] += spliceCount
+
+ return fullReadCount
+
+
+ def getCounts(self, chrom=None, rmin=None, rmax=None, uniqs=True, multi=False,
splices=False, reportCombined=True, sense="both"):
""" return read counts for a given region.
"""
ucount = 0
mcount = 0
scount = 0
- restrict = ""
- if sense in ["+", "-"]:
- restrict = " sense ='%s' " % sense
+ readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=sense)
if uniqs:
- try:
- ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
- except:
- ucount = 0
+ ucount = readCounter.uniqReadCount
if multi:
- try:
- mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
- except:
- mcount = 0
+ mcount = readCounter.multiReadCount
if splices:
- try:
- scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
- except:
- scount = 0
+ scount = readCounter.spliceReadCount
if reportCombined:
total = ucount + mcount + scount
return (ucount, mcount, scount)
- def getTotalCounts(self, chrom="", rmin="", rmax=""):
- """ return read counts for a given region.
- """
- return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
-
-
- def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
- """ returns the number of row in the specified table.
- """
- whereClause = []
- count = 0
-
- if chrom !="" and chrom != self.memChrom:
- whereClause = ["chrom='%s'" % chrom]
-
- if rmin != "":
- whereClause.append("%s >= %s" % (startField, str(rmin)))
-
- if rmax != "":
- whereClause.append("%s <= %s" % (startField, str(rmax)))
+ def getSplicesCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
+ readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
- if restrict != "":
- whereClause.append(restrict)
+ return readCounter.spliceReadCount
- if len(whereClause) > 0:
- whereStatement = string.join(whereClause, " and ")
- whereQuery = "where %s" % whereStatement
- else:
- whereQuery = ""
- sql = self.getSqlCursor()
+ def getUniqsCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
+ readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
- if distinct:
- sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
- else:
- sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
+ return readCounter.uniqReadCount
- result = sql.fetchone()
-
- try:
- count = int(result[0])
- except:
- count = 0
-
- return count
+ def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+ readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
- def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
- """ returns the number of row in the splices table.
- """
- # TODO: if the rds type is DNA should this just return zero?
- return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
+ return readCounter.multiReadCount
- def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
- """ returns the number of distinct readIDs in the uniqs table.
- """
- return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
+ def getBamReadCounter(self, chrom="", rmin=None, rmax=None, sense="both"):
+ readCounter = ReadCounter(self.multiReadIDCounts, sense)
+ self.bamfile.fetch(reference=chrom, start=rmin, end=rmax, callback=readCounter)
-
- def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
- """ returns the total weight of readIDs in the multi table.
- """
- return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
+ return readCounter
+ #TODO: Primary
def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
""" get readID's.
"""
hitChromList.sort()
snpDict = {}
- for achrom in hitChromList:
+ for chromosome in hitChromList:
if verbose:
- print "getting mismatches from chromosome %s" % (achrom)
+ print "getting mismatches from chromosome %s" % (chromosome)
- snpDict[achrom] = []
+ snpDict[chromosome] = []
if useSplices and self.dataType == "RNA":
- spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
+ spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withMismatch=True, readIDDict=True, hasMismatch=True)
spliceIDList = spliceDict.keys()
for spliceID in spliceIDList:
spliceEntry = spliceDict[spliceID][0]
secondhalf = change_pos - firsthalf
change_at = rightstart + secondhalf
- snpDict[achrom].append([startpos, change_at, change_base, change_from])
+ snpDict[chromosome].append([startpos, change_at, change_base, change_from])
- hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
- if achrom not in hitDict.keys():
+ hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withMismatch=True, hasMismatch=True)
+ if chromosome not in hitDict.keys():
continue
- for readEntry in hitDict[achrom]:
+ for readEntry in hitDict[chromosome]:
start = readEntry["start"]
sense = readEntry["sense"]
mismatches = readEntry["mismatch"]
change_pos = readlen - int(mismatch[1:change_len-1]) + 1
change_at = start + change_pos - 1
- snpDict[achrom].append([start, change_at, change_base, change_from])
+ snpDict[chromosome].append([start, change_at, change_base, change_from])
return snpDict
spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
else:
spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
-
+
if chromosome in spliceDict:
for spliceEntry in spliceDict[chromosome]:
Lstart = spliceEntry["startL"]
def insertMetadata(self, valuesList):
- """ inserts a list of (pname, pvalue) into the metadata
- table.
- """
- self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
- self.dbcon.commit()
+ for (pname, pvalue) in valuesList:
+ self.metadata[pname] = pvalue
def updateMetadata(self, pname, newValue, originalValue=""):
- """ update a metadata field given the original value and the new value.
- """
- stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
if originalValue != "":
- stmt += " and value='%s' " % str(originalValue)
-
- self.dbcon.execute(stmt)
- self.dbcon.commit()
-
-
- def insertUniqs(self, valuesList):
- """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
- into the uniqs table.
- """
- self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
- self.dbcon.commit()
-
-
- def insertMulti(self, valuesList):
- """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
- into the multi table.
- """
- self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
- self.dbcon.commit()
-
-
- def insertSplices(self, valuesList):
- """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
- into the splices table.
- """
- self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
- self.dbcon.commit()
-
-
- def insertMultisplices(self, valuesList):
- """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
- into the multisplices table.
- """
- self.dbcon.executemany("insert into multisplices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
- self.dbcon.commit()
+            if self.metadata[pname] == originalValue:
+                self.metadata[pname] = newValue
+        else:
+            self.metadata[pname] = newValue
+ #TODO: Primary
def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
""" update reads on file database in a list region of regions for a chromosome to have a new flag.
regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
self.dbcon.commit()
+ #TODO: Primary
def setFlags(self, flag, uniqs=True, multi=True, splices=True):
""" set the flag fields in the entire dataset.
"""
if uniqs:
- self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
+ self.setFlagsInDB("uniqs", flag)
if multi:
- self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
+ self.setFlagsInDB("multi", flag)
if self.dataType == "RNA" and splices:
- self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
+ self.setFlagsInDB("splices", flag)
self.dbcon.commit()
+ #TODO: Secondary to setFlags()
+ def setFlagsInDB(self, table, flag):
+ """ set the flag field for every entry in a table.
+ """
+ self.dbcon.execute("UPDATE %s SET flag = '%s'" % (table, flag))
+
+
+ #TODO: Primary
def resetFlags(self, uniqs=True, multi=True, splices=True):
""" reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
"""
self.setFlags("", uniqs, multi, splices)
+ #TODO: Primary
def reweighMultireads(self, readList):
self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
def setSynchronousPragma(self, value="ON"):
- try:
- self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
- except:
- print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
+ pass
def setDBcache(self, cache, default=False):
- self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
- if default:
- self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
+ pass
def execute(self, statement, returnResults=False):
- sql = self.getSqlCursor()
-
- sql.execute(statement)
- if returnResults:
- result = sql.fetchall()
- return result
+ pass
def executeCommit(self, statement):
- self.execute(statement)
-
- if self.memBacked:
- self.memcon.commit()
- else:
- self.dbcon.commit()
+ pass
def buildIndex(self, cache=100000):
- """ Builds the file indeces for the main tables.
- Cache is the number of 1.5 kb pages to keep in memory.
- 100000 pages translates into 150MB of RAM, which is our default.
- """
- if cache > self.getDefaultCacheSize():
- self.setDBcache(cache)
- self.setSynchronousPragma("OFF")
- self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
- print "built uPosIndex"
- self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
- print "built uChromIndex"
- self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
- print "built mPosIndex"
- self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
- print "built mChromIndex"
-
- if self.dataType == "RNA":
- self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
- print "built sPosIndex"
- self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
- print "built sPosIndex2"
- self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
- print "built sChromIndex"
-
- self.dbcon.commit()
- self.setSynchronousPragma("ON")
+ pass
def dropIndex(self):
- """ drops the file indices for the main tables.
- """
- try:
- self.setSynchronousPragma("OFF")
- self.dbcon.execute("DROP INDEX uPosIndex")
- self.dbcon.execute("DROP INDEX uChromIndex")
- self.dbcon.execute("DROP INDEX mPosIndex")
- self.dbcon.execute("DROP INDEX mChromIndex")
+ pass
- if self.dataType == "RNA":
- self.dbcon.execute("DROP INDEX sPosIndex")
- try:
- self.dbcon.execute("DROP INDEX sPosIndex2")
- except:
- pass
- self.dbcon.execute("DROP INDEX sChromIndex")
+ def memSync(self, chrom="", index=False):
+ pass
+
- self.dbcon.commit()
- except:
- print "problem dropping index"
+ def copyDBEntriesToMemory(self, dbName, whereClause=""):
+ pass
- self.setSynchronousPragma("ON")
+ def copySpliceDBEntriesToMemory(self, whereClause=""):
+ pass
- def memSync(self, chrom="", index=False):
- """ makes a copy of the dataset into memory for faster access.
- Can be restricted to a "full" chromosome. Can also build the
- memory indices.
- """
- self.memcon = ""
- self.memcon = sqlite.connect(":memory:")
- self.initializeTables(self.memcon)
- cursor = self.dbcon.cursor()
- whereclause = ""
- if chrom != "":
- print "memSync %s" % chrom
- whereclause = " where chrom = '%s' " % chrom
- self.memChrom = chrom
+
+def getReadSense(reverse):
+ if reverse:
+ sense = "-"
+ else:
+ sense = "+"
+
+ return sense
+
+
+def getSpliceRightStart(start, cigarTupleList):
+ offset = 0
+
+ for operation,length in cigarTupleList:
+ if operation == 3:
+ stopL = int(start + offset)
+ startR = int(stopL + length)
+
+ return startR
else:
- self.memChrom = ""
-
- self.memcon.execute("PRAGMA temp_store = MEMORY")
- self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
- # copy metadata to memory
- self.memcon.execute("delete from metadata")
- results = cursor.execute("select name, value from metadata")
- results2 = []
- for row in results:
- results2.append((row["name"], row["value"]))
-
- self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
-
- self.copyDBEntriesToMemory("uniqs", whereclause)
- self.copyDBEntriesToMemory("multi", whereclause)
- if self.dataType == "RNA":
- self.copySpliceDBEntriesToMemory(whereclause)
-
- if index:
- if chrom != "":
- self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
- self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
- if self.dataType == "RNA":
- self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
- self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
- else:
- self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
- self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
- if self.dataType == "RNA":
- self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
- self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
+ offset += length
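+
+
+# Example (illustrative): getSpliceRightStart(100, [(0, 20), (3, 500), (0, 16)])
+# (20M500N16M) returns 620: 100 + 20 aligned bases + the 500-base N gap.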
- self.memBacked = True
- self.memcon.row_factory = sqlite.Row
- self.memcon.commit()
+def getSpliceBounds(start, readsize, cigarTupleList):
+ offset = 0
+ #TODO: check operations here - we want to make sure we are adding the correct ones
+ for operation,length in cigarTupleList:
+ if operation == 3:
+ stopL = int(start + offset)
+ startR = int(stopL + length)
+ offset = 0
+ else:
+ offset += length
- def copyDBEntriesToMemory(self, dbName, whereClause=""):
- cursor = self.dbcon.cursor()
- sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
- destinationEntries = []
- for row in sourceEntries:
- destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
+ stopR = startR + offset
- self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
+ return start, startR, stopL, stopR
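+
+
+# Worked example (illustrative): getSpliceBounds(100, 36, [(0, 20), (3, 500), (0, 16)])
+# returns (100, 620, 120, 636): the left piece spans 100-120, the N gap is 500 bases,
+# and the right piece spans 620-636; the return order is (start, startR, stopL, stopR).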
- def copySpliceDBEntriesToMemory(self, whereClause=""):
- cursor = self.dbcon.cursor()
- sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
- destinationEntries = []
- for row in sourceEntries:
- destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
- row["weight"], row["flag"], row["mismatch"]))
+def getPairedReadNumberSuffix(read):
+ readSuffix = ""
+ if not isPairedRead(read):
+ return ""
+
+ if read.is_read1:
+ readSuffix = "/1"
+ elif read.is_read2:
+ readSuffix = "/2"
+
+ return readSuffix
+
+
+def isPairedRead(read):
+ return read.is_proper_pair and (read.is_read1 or read.is_read2)
+
+
+def getMismatches(mismatchTag, querySequence="", sense="+"):
+ output = []
+ deletionMarker = "^"
+ position = 0
+
+ lengths = re.findall("\d+", mismatchTag)
+ mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag)
+
+ for mismatchEntry in range(len(mismatchSequences)):
+ mismatch = mismatchSequences[mismatchEntry]
+ position = position + int(lengths[mismatchEntry])
+ if string.find(mismatch, deletionMarker) == 0:
+ continue
+
+ try:
+ if querySequence:
+ genomicNucleotide = querySequence[position]
+ else:
+ genomicNucleotide = "N"
+
+ if sense == "-":
+ mismatch = getReverseComplement(mismatch)
+ genomicNucleotide = getReverseComplement(genomicNucleotide)
- self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)
+ erange1BasedElandCompatiblePosition = int(position + 1)
+ output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
+ position += 1
+ except IndexError:
+ return ""
+ return string.join(output, ",")
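+
+
+# Example (illustrative): getMismatches("10A5", querySequence="AAAAAAAAAAGAAAAA")
+# returns "A11G": the MD tag reports reference base A at (1-based) read position 11,
+# where this read carries a G.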