development release: conversion of ReadDataset to use BAM files

author Sean Upchurch <sau@caltech.edu>

Mon, 2 Jul 2012 17:17:00 +0000 (10:17 -0700)

committer Sean Upchurch <sau@caltech.edu>

Mon, 2 Jul 2012 17:17:00 +0000 (10:17 -0700)
author Sean Upchurch <sau@caltech.edu>
Mon, 2 Jul 2012 17:17:00 +0000 (10:17 -0700)
committer Sean Upchurch <sau@caltech.edu>
Mon, 2 Jul 2012 17:17:00 +0000 (10:17 -0700)
diff --git a/MakeBamFromRds.py b/MakeBamFromRds.py

index 730c20be6fb147f84e58b7f80485199e0975ad3a..0b4946b1c3e9d4338da5e4a57b77791316ebad9b 100644 (file)
--- a/MakeBamFromRds.py
+++ b/MakeBamFromRds.py
@@ -176,7 +176,7 @@ def processSpliceReads(RDS, outfile, chrom, withFlag, useFlagLike, readlength, d
          pass
      else:
          for read in spliceDict[chrom]:
          pass
      else:
          for read in spliceDict[chrom]:
-            if fastaSequenceDict.has_key(chrom):
+            if chrom in fastaSequenceDict:
                  read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], read["startR"], read["stopL"], read["sense"])
                  noncanonicalSplices += noncanonical
  
                  read["sense"], noncanonical = fixSpliceSense(fastaSequenceDict[chrom], read["startR"], read["stopL"], read["sense"])
                  noncanonicalSplices += noncanonical
  
@@ -205,7 +205,7 @@ def writeBAMEntry(outfile, chrom, outputDict, readlength):
  
      alignedRead.rname = outfile.references.index(chrom)
  
  
      alignedRead.rname = outfile.references.index(chrom)
  
-    if outputDict.has_key("startL"):
+    if "startL" in outputDict:
          startL = outputDict["startL"] - 1
          stopL = outputDict["stopL"] - 1
          startR = outputDict["startR"] - 1
          startL = outputDict["startL"] - 1
          stopL = outputDict["stopL"] - 1
          startR = outputDict["startR"] - 1
@@ -217,6 +217,13 @@ def writeBAMEntry(outfile, chrom, outputDict, readlength):
          alignedRead.pos = outputDict["start"] - 1
          alignedRead.cigar = [(0, readlength)]
  
          alignedRead.pos = outputDict["start"] - 1
          alignedRead.cigar = [(0, readlength)]
  
+    #TODO: add sequence info
+    #cistematic.core.retrieveSequence(genome, chrom, start, stop, sense="F", db="")
+    #ooooo. let's do this with PyGr
+
+    #TODO: add quality - max for each base, but default is to not include just like current
+    # rdsToFastq() sets everything to max
+
      if paired:
          if pairID == "1":
              alignedRead.is_read1 = True
      if paired:
          if pairID == "1":
              alignedRead.is_read1 = True
@@ -227,7 +234,7 @@ def writeBAMEntry(outfile, chrom, outputDict, readlength):
          else:
              pass
  
          else:
              pass
  
-    if outputDict.has_key("mismatch"):
+    if "mismatch" in outputDict:
          mismatchTag = getMismatches(outputDict["mismatch"])
          if mismatchTag:
              tagList.append(("MD", str(mismatchTag)))
          mismatchTag = getMismatches(outputDict["mismatch"])
          if mismatchTag:
              tagList.append(("MD", str(mismatchTag)))
@@ -300,7 +307,7 @@ def fixSpliceSense(fastaSequence, startRight, stopLeft, sense=""):
      spliceJunction = string.join([leftJunctionSig, rightJunctionSig], "")
      spliceJunction = spliceJunction.upper()
      print spliceJunction
      spliceJunction = string.join([leftJunctionSig, rightJunctionSig], "")
      spliceJunction = spliceJunction.upper()
      print spliceJunction
-    if spliceSense.has_key(spliceJunction):
+    if spliceJunction in spliceSense:
          sense = spliceSense[spliceJunction]
      else:
          noncanonical += 1
          sense = spliceSense[spliceJunction]
      else:
          noncanonical += 1
diff --git a/MakeRdsFromBam.py b/MakeRdsFromBam.py

index 13c901332b2c54dcf6b4033a6633ec0e074da782..b16bccd2da0fae8b7e6ed85c20237286e9623823 100644 (file)
--- a/MakeRdsFromBam.py
+++ b/MakeRdsFromBam.py
@@ -418,13 +418,13 @@ def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
  
  
  def getSpliceBounds(start, readsize, cigarTupleList):
  
  
  def getSpliceBounds(start, readsize, cigarTupleList):
-    stopR = int(start + readsize)
      offset = 0
  
      for operation,length in cigarTupleList:
          if operation == 3:
              stopL = int(start + offset)
              startR = int(stopL + length)
      offset = 0
  
      for operation,length in cigarTupleList:
          if operation == 3:
              stopL = int(start + offset)
              startR = int(stopL + length)
+            stopR = int(startR + readsize - offset)
  
              return start, startR, stopL, stopR
          else:
  
              return start, startR, stopL, stopR
          else:
diff --git a/ReadDataset.py b/ReadDataset.py

index 4b51471184777a1cfe0d650a51ead592873ce3a1..9a795c8cde5c5ba6d257da963ebf4c32f7d6e42d 100644 (file)
--- a/ReadDataset.py
+++ b/ReadDataset.py
@@ -1,19 +1,74 @@
-import sqlite3 as sqlite
  import string
  import tempfile
  import shutil
  import os
  import string
  import tempfile
  import shutil
  import os
+import re
+import sys
+import pysam
  from array import array
  from array import array
-from commoncode import getReverseComplement, getConfigParser, getConfigOption
+from commoncode import getReverseComplement
  
  
-currentRDSVersion = "2.1"
+currentRDSVersion = "3.0"
  
  
  class ReadDatasetError(Exception):
      pass
  
  
  
  
  class ReadDatasetError(Exception):
      pass
  
  
-class ReadDataset():
+class ReadCounter():
+    uniqReadCount = 0
+    multiReadCount = 0.0
+    spliceReadCount = 0
+
+    def __init__(self, multiReadIDCount={}, sense="both"):
+        self.multiReadIDCount = multiReadIDCount
+        self.sense = sense
+
+
+    def __call__(self, read):
+        multiplicity = self.getReadMultiplicity(read)
+        if multiplicity > 1:
+            self.multiReadCount += float(1.0/multiplicity)
+        elif isSpliceEntry(read.cigar):
+            self.spliceReadCount += 1
+        else:
+            self.uniqReadCount += 1
+
+
+    def getReadMultiplicity(self, read):
+        pairReadSuffix = getPairedReadNumberSuffix(read)
+        readID = "%s%s" % (read.qname, pairReadSuffix)
+        try:
+            multiplicity = self.multiReadIDCount[readID]
+        except KeyError:
+            multiplicity = 1
+
+        return multiplicity
+
+
+    def isMulti(self, read):
+        pairReadSuffix = getPairedReadNumberSuffix(read)
+        readID = "%s%s" % (read.qname, pairReadSuffix)
+        return readID in self.multiReadIDCount
+
+
+
+class MaxCoordFinder(ReadCounter):
+    maxUniqueStart = 0
+    maxMultiStart = 0
+    maxSpliceStart = 0
+
+    def __call__(self, read):
+        if self.isMulti(read):
+            self.maxMultiStart = max(self.maxMultiStart, read.pos)
+        elif isSpliceEntry(read.cigar):
+            self.maxSpliceStart = max(self.maxMultiStart, getSpliceRightStart(read.pos, read.cigar))
+        else:
+            self.maxUniqueStart = max(self.maxMultiStart, read.pos)
+
+
+
+class BamReadDataset():
      """ Class for storing reads from experiments. Assumes that custom scripts
      will translate incoming data into a format that can be inserted into the
      class using the insert* methods. Default class subtype ('DNA') includes
      """ Class for storing reads from experiments. Assumes that custom scripts
      will translate incoming data into a format that can be inserted into the
      class using the insert* methods. Default class subtype ('DNA') includes
@@ -21,75 +76,93 @@ class ReadDataset():
      splices table.
      """
  
      splices table.
      """
  
+
      def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False, 
                   cache=False, reportCount=True):
          """ creates an rds datafile if initialize is set to true, otherwise
          will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
          """
      def __init__(self, datafile, initialize=False, datasetType="DNA", verbose=False, 
                   cache=False, reportCount=True):
          """ creates an rds datafile if initialize is set to true, otherwise
          will append to existing tables. datasetType can be either 'DNA' or 'RNA'.
          """
-        self.dbcon = ""
-        self.memcon = ""
-        self.dataType = ""
-        self.rdsVersion = currentRDSVersion
-        self.memBacked = False
-        self.memChrom = ""
-        self.memCursor = ""
-        self.cachedDBFile = ""
  
          if initialize and datasetType not in ["DNA", "RNA"]:
              raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
  
  
          if initialize and datasetType not in ["DNA", "RNA"]:
              raise ReadDatasetError("failed to initialize: datasetType must be 'DNA' or 'RNA'")
  
-        if cache:
-            if verbose:
-                print "caching ...."
+        self.dataType = ""
+        self.multiReadIDCounts = self.getMultiReadIDCounts(datafile, "rb")
+        self.bamfile = pysam.Samfile(datafile, "rb")
+        self.readsize = None
+        self.totalReadWeight = None
  
  
-            self.cacheDB(datafile)
-            dbFile = self.cachedDBFile
-        else:
-            dbFile = datafile
+        self.metadata = {"dataType": datasetType}
  
  
-        self.dbcon = sqlite.connect(dbFile)
-        self.dbcon.row_factory = sqlite.Row
-        self.dbcon.execute("PRAGMA temp_store = MEMORY")
          if initialize:
              self.dataType = datasetType
          if initialize:
              self.dataType = datasetType
-            self.initializeTables(self.dbcon)
          else:
          else:
-            metadata = self.getMetadata("dataType")
-            self.dataType = metadata["dataType"]
+            self.dataType = self.metadata["dataType"]
  
          try:
  
          try:
-            metadata = self.getMetadata("rdsVersion")
-            self.rdsVersion = metadata["rdsVersion"]
-        except:
-            try:
-                self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
-            except IOError:
-                print "could not add rdsVersion - read-only ?"
-                self.rdsVersion = "pre-1.0"
+            self.rdsVersion = self.metadata["rdsVersion"]
+        except KeyError:
+            self.insertMetadata([("rdsVersion", float(currentRDSVersion))])
  
  
+        self.fullReadCounts = self.getFullReadCount()
          if verbose:
              self.printRDSInfo(datafile, reportCount, initialize)
  
  
      def __len__(self):
          if verbose:
              self.printRDSInfo(datafile, reportCount, initialize)
  
  
      def __len__(self):
-        """ return the number of usable reads in the dataset.
+        """ return the number of reads in the bam file
+            This is not the same as the original as the original returned total weight.
          """
          """
-        total = self.getUniqsCount()
-        total += self.getMultiCount()
+        count = 0
+        references = self.bamfile.references
+        for reference in references:
+            count += self.bamfile.count(reference)
+
+        return count
+
  
  
-        if self.dataType == "RNA":
-            total += self.getSplicesCount()
+    def getMultiReadIDCounts(self, samFileName, fileMode):
+        try:
+            samfile = pysam.Samfile(samFileName, fileMode)
+        except ValueError:
+            print "samfile index not found"
+            sys.exit(1)
+
+        readIDCounts = {}
+        for read in samfile.fetch(until_eof=True):
+            pairReadSuffix = getPairedReadNumberSuffix(read)
+            readName = "%s%s" % (read.qname, pairReadSuffix)
+            try:
+                readIDCounts[readName] += 1.0
+            except KeyError:
+                readIDCounts[readName] = 1.0
  
  
-        total = int(total)
+        for readID in readIDCounts.keys():
+            if int(readIDCounts[readID]) == 1:
+                del readIDCounts[readID]
  
  
-        return total
+        return readIDCounts
+
+
+    def totalReadWeight(self):
+        """ return the total weight of usable reads in the bam file.
+        """
+        if self.totalReadWeight is None:
+            total = self.fullReadCounts["uniq"]
+            total += self.fullReadCounts["multi"]
+
+            if self.dataType == "RNA":
+                total += self.fullReadCounts["splice"]
+
+            self.totalReadWeight = int(total)
+
+        return self.totalReadWeight
  
  
      def __del__(self):
          """ cleanup copy in local cache, if present.
          """
  
  
      def __del__(self):
          """ cleanup copy in local cache, if present.
          """
-        if self.cachedDBFile != "":
-            self.uncacheDB()
+        pass
  
  
      def printRDSInfo(self, datafile, reportCount, initialize):
  
  
      def printRDSInfo(self, datafile, reportCount, initialize):
@@ -98,237 +171,129 @@ class ReadDataset():
          else:
              print "dataset %s" % datafile
  
          else:
              print "dataset %s" % datafile
  
-        metadata = self.getMetadata()
          print "metadata:"
          print "metadata:"
-        pnameList = metadata.keys()
+        pnameList = self.metadata.keys()
          pnameList.sort()
          for pname in pnameList:
          pnameList.sort()
          for pname in pnameList:
-            print "\t" + pname + "\t" + metadata[pname]
+            print "\t" + pname + "\t" + str(self.metadata[pname])
  
          if reportCount and not initialize:
              self.printReadCounts()
  
          print "default cache size is %d pages" % self.getDefaultCacheSize()
  
          if reportCount and not initialize:
              self.printReadCounts()
  
          print "default cache size is %d pages" % self.getDefaultCacheSize()
-        if self.hasIndex():
-            print "found index"
-        else:
-            print "not indexed"
  
  
      def printReadCounts(self):
  
  
      def printReadCounts(self):
-        ucount = self.getUniqsCount()
-        mcount = self.getMultiCount()
+        ucount = self.fullReadCounts["uniq"]
+        mcount = self.fullReadCounts["multi"]
          if self.dataType == "DNA":
              print "\n%d unique reads and %d multireads" % (ucount, mcount)
          elif self.dataType == "RNA":
          if self.dataType == "DNA":
              print "\n%d unique reads and %d multireads" % (ucount, mcount)
          elif self.dataType == "RNA":
-            scount = self.getSplicesCount()
+            scount = self.fullReadCounts["splice"]
              print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount)
  
  
      def cacheDB(self, filename):
              print "\n%d unique reads, %d spliced reads and %d multireads" % (ucount, scount, mcount)
  
  
      def cacheDB(self, filename):
-        """ copy geneinfoDB to a local cache.
-        """
-        configParser = getConfigParser()
-        cisTemp = getConfigOption(configParser, "general", "cistematic_temp", default="/tmp")
-        tempfile.tempdir = cisTemp
-        self.cachedDBFile =  "%s.db" % tempfile.mktemp()
-        shutil.copyfile(filename, self.cachedDBFile)
+        pass
  
  
      def saveCacheDB(self, filename):
  
  
      def saveCacheDB(self, filename):
-        """ copy geneinfoDB to a local cache.
-        """
-        shutil.copyfile(self.cachedDBFile, filename)
+        pass
  
  
      def uncacheDB(self):
  
  
      def uncacheDB(self):
-        """ delete geneinfoDB from local cache.
-        """
-        global cachedDBFile
-        if self.cachedDBFile != "":
-            try:
-                os.remove(self.cachedDBFile)
-            except:
-                print "could not delete %s" % self.cachedDBFile
-
-            self.cachedDB = ""
+        pass
  
  
      def attachDB(self, filename, dbName):
  
  
      def attachDB(self, filename, dbName):
-        """ attach another database file to the readDataset.
-        """
-        stmt = "attach '%s' as %s" % (filename, dbName)
-        self.execute(stmt)
+        pass
  
  
      def detachDB(self, dbName):
  
  
      def detachDB(self, dbName):
-        """ detach a database file to the readDataset.
-        """
-        stmt = "detach %s" % (dbName)
-        self.execute(stmt)
+        pass
  
  
      def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
  
  
      def importFromDB(self, dbName, table, ascolumns="*", destcolumns="", flagged=""):
-        """ import into current RDS the table (with columns destcolumns,
-            with default all columns) from the database file asname,
-            using the column specification of ascolumns (default all).
-        """
-        stmt = "insert into %s %s select %s from %s.%s" % (table, destcolumns, ascolumns, dbName, table)
-        if flagged != "":
-            stmt += " where flag = '%s' " % flagged
-
-        self.executeCommit(stmt)
+        pass
  
  
      def getTables(self, dbName=""):
  
  
      def getTables(self, dbName=""):
-        """ get a list of table names in a particular database file.
-        """
-        resultList = []
-        sql = self.getSqlCursor()
-
-        if dbName != "":
-            dbName = "%s." % dbName
-
-        stmt = "select name from %ssqlite_master where type='table'" % dbName
-        sql.execute(stmt)
-        results = sql.fetchall()
-
-        for row in results:
-            resultList.append(row["name"])
-
-        return resultList
+        pass
  
  
      def getSqlCursor(self):
  
  
      def getSqlCursor(self):
-        if self.memBacked:
-            sql = self.getMemCursor()
-        else:
-            sql = self.getFileCursor()
-
-        return sql
+        pass
  
  
      def getMemCursor(self):
  
  
      def getMemCursor(self):
-        """ returns a cursor to memory database for low-level (SQL)
-        access to the data.
-        """
-        return self.memcon.cursor()
+        pass
  
  
      def getFileCursor(self):
  
  
      def getFileCursor(self):
-        """ returns a cursor to file database for low-level (SQL)
-        access to the data.
-        """
-        return self.dbcon.cursor()
+        pass
  
  
  
  
-    def hasIndex(self):
-        """ return True if the RDS file has at least one index.
-        """
-        stmt = "select count(*) from sqlite_master where type='index'"
-        count = int(self.execute(stmt, returnResults=True)[0][0])
+    def initializeTables(self, dbConnection, cache=100000):
+        pass
  
  
-        return count > 0
  
  
+    def getMetadata(self, valueName=""):        
+        return self.metadata
  
  
-    def initializeTables(self, dbConnection, cache=100000):
-        """ creates table schema in a database connection, which is
-        typically a database file or an in-memory database.
-        """
-        dbConnection.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
-        dbConnection.execute("create table metadata (name varchar, value varchar)")
-        dbConnection.execute("insert into metadata values('dataType','%s')" % self.dataType)
-        positionSchema = "start int, stop int"
-        tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
-        dbConnection.execute("create table uniqs %s" % tableSchema)
-        dbConnection.execute("create table multi %s" % tableSchema)
-        if self.dataType == "RNA":
-            positionSchema = "startL int, stopL int, startR int, stopR int"
-            tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
-            dbConnection.execute("create table splices %s" % tableSchema)
-
-            positionSchema = "startL int, stopL int, startR int, stopR int"
-            tableSchema = "(ID INTEGER PRIMARY KEY, readID varchar, chrom varchar, %s, sense varchar, weight real, flag varchar, mismatch varchar)" % positionSchema
-            dbConnection.execute("create table multisplices %s" % tableSchema)
-
-        dbConnection.commit()
-
-
-    def getMetadata(self, valueName=""):
-        """ returns a dictionary of metadata.
-        """
-        whereClause = ""
-        resultsDict = {}
  
  
-        if valueName != "":
-            whereClause = " where name='%s'" % valueName
+    def getReadSize(self):
+        """ This is following the same model as the original where it assumes that all
+            read have the same readsize and that this is reflected by the size of the
+            first read.
+        """
+        if self.readsize is None:
+            bamFileIterator = self.bamfile.fetch(until_eof=True)
+            for read in bamFileIterator:
+                self.calculateReadsizeFromCigar(read.cigar)
+                break
  
  
-        sql = self.getSqlCursor()
+        return self.readsize
  
  
-        sql.execute("select name, value from metadata %s" % whereClause)
-        results = sql.fetchall()
  
  
-        for row in results:
-            parameterName = row["name"]
-            parameterValue = row["value"]
-            if parameterName not in resultsDict:
-                resultsDict[parameterName] = parameterValue
-            else:
-                trying = True
-                index = 2
-                while trying:
-                    newName = string.join([parameterName, str(index)], ":")
-                    if newName not in resultsDict:
-                        resultsDict[newName] = parameterValue
-                        trying = False
+    def calculateReadsizeFromCigar(self, cigar):
+        take = (0, 1) # CIGAR operation (M/match, I/insertion)
+        self.readsize = sum([length for op,length in cigar if op in take])
  
  
-                    index += 1
  
  
-        return resultsDict
+    def getDefaultCacheSize(self):
+        """ returns 0 as cache is going to be deprecated
+        """
+        return 0
  
  
  
  
-    def getReadSize(self):
-        """ returns readsize if defined in metadata.
+    def getChromosomes(self, fullChrom=True):
+        """ returns a sorted list of distinct chromosomes in bam file reference list.
          """
          """
-        metadata = self.getMetadata()
-        if "readsize" not in metadata:
-            raise ReadDatasetError("no readsize parameter defined")
+        if fullChrom:
+            results = list(self.getFullReferenceNames())
          else:
          else:
-            readSize = metadata["readsize"]
-            if "import" in readSize:
-                readSize = readSize.split()[0]
+            results = list(self.getShortReferenceNames())
  
  
-            readSize = int(readSize)
-            if readSize < 0:
-                raise ReadDatasetError("readsize is negative")
+        results.sort()
  
  
-            return readSize
+        return results
  
  
  
  
-    def getDefaultCacheSize(self):
-        """ returns the default cache size.
+    def getFullReferenceNames(self):
+        """ returns a set of bam file reference names.
          """
          """
-        return int(self.execute("PRAGMA DEFAULT_CACHE_SIZE", returnResults=True)[0][0])
+        return set(self.bamfile.references)
  
  
  
  
-    def getChromosomes(self, table="uniqs", fullChrom=True):
-        """ returns a sorted list of distinct chromosomes in table.
+    def getShortReferenceNames(self):
+        """ returns a set of bam file reference names after removing first 3 characters.
          """
          """
-        statement = "select distinct chrom from %s" % table
-        sql = self.getSqlCursor()
-
-        sql.execute(statement)
-        results = []
-        for row in sql:
-            if fullChrom:
-                if row["chrom"] not in results:
-                    results.append(row["chrom"])
-            else:
-                shortName = row["chrom"][3:]
-                if  len(shortName.strip()) > 0 and shortName not in results:
-                    results.append(shortName)
-
-        results.sort()
+        results = set()
+        references = self.bamfile.references
+        for chromosome in references:
+            shortName = chromosome[3:]
+            if len(shortName.strip()) > 0:
+                results.add(shortName)
  
          return results
  
  
          return results
  
@@ -338,30 +303,17 @@ class ReadDataset():
          """ returns the maximum coordinate for reads on a given chromosome.
          """
          maxCoord = 0
          """ returns the maximum coordinate for reads on a given chromosome.
          """
          maxCoord = 0
+        coordFinder = MaxCoordFinder()
+        self.bamfile.fetch(reference=chrom, callback=coordFinder)
  
          if doUniqs:
  
          if doUniqs:
-            maxCoord = self.getMaxStartCoordinateInTable(chrom, "uniqs")
+            maxCoord = coordFinder.maxUniqueStart
  
          if doSplices:
  
          if doSplices:
-            spliceMax = self.getMaxStartCoordinateInTable(chrom, "splices", startField="startR")
-            maxCoord = max(spliceMax, maxCoord)
+            maxCoord = max(coordFinder.maxSpliceStart, maxCoord)
  
          if doMulti:
  
          if doMulti:
-            multiMax = self.getMaxStartCoordinateInTable(chrom, "multi")
-            maxCoord = max(multiMax, maxCoord)
-
-        return maxCoord
-
-
-    def getMaxStartCoordinateInTable(self, chrom, table, startField="start"):
-        maxCoord = 0
-        sqlStatement = "select max(%s) from %s where chrom = '%s'" % (startField, table, chrom)
-        sql = self.getSqlCursor()
-        try:
-            sql.execute(sqlStatement)
-            maxCoord = int(sql.fetchall()[0][0])
-        except:
-            print "couldn't retrieve coordMax for chromosome %s" % chrom
+            maxCoord = max(coordFinder.maxMultiStart, maxCoord)
  
          return maxCoord
  
  
          return maxCoord
  
@@ -369,16 +321,13 @@ class ReadDataset():
      def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
                       flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
                       withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
      def getReadsDict(self, bothEnds=False, noSense=False, fullChrom=False, chrom="",
                       flag="", withWeight=False, withFlag=False, withMismatch=False, withID=False,
                       withChrom=False, withPairID=False, doUniqs=True, doMulti=False, findallOptimize=False,
-                     readIDDict=False, readLike="", start=-1, stop=-1, limit=-1, hasMismatch=False,
+                     readIDDict=False, readLike="", start=None, stop=None, limit=-1, hasMismatch=False,
                       flagLike=False, strand='', combine5p=False):
                       flagLike=False, strand='', combine5p=False):
-        """ returns a dictionary of reads in a variety of formats
-        and which can be restricted by chromosome or custom-flag.
-        Returns unique reads by default, but can return multireads
-        with doMulti set to True.
-
+        """ First pass of rewrite
+            1) Leave as much in place as possible
+            2) For now treat multis just like uniqs
+        """
          """
          """
-        #TODO: Need to rethink original design 1: Cannot have pairID without exporting as a readIDDict
-
          whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
          if findallOptimize:
              selectQuery = "select start, sense, sum(weight)"
          whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike)
          if findallOptimize:
              selectQuery = "select start, sense, sum(weight)"
@@ -433,160 +382,134 @@ class ReadDataset():
          sql = self.getSqlCursor()
          sqlQuery = string.join(stmt)
          sql.execute(sqlQuery)
          sql = self.getSqlCursor()
          sqlQuery = string.join(stmt)
          sql.execute(sqlQuery)
+        """
  
  
+        # Edits are starting here - trying to ignore the entire sql construct
+        bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
          resultsDict = {}
          resultsDict = {}
-        if findallOptimize:
-            resultsDict[chrom] = [{"start": int(row[0]), "sense": row[1], "weight": float(row[2])} for row in sql]
-            if self.memBacked:
-                self.memcon.row_factory = sqlite.Row
-            else:
-                self.dbcon.row_factory = sqlite.Row
-        else:
-            currentChrom = ""
-            currentReadID = ""
-            pairID = 0
-            for row in sql:
-                readID = row["readID"]
-                if fullChrom:
-                    chrom = row["chrom"]
-                else:
-                    chrom = row["chrom"][3:]
-
-                if not readIDDict and chrom != currentChrom:
-                    resultsDict[chrom] = []
-                    currentChrom = chrom
-                    dictKey = chrom
-                elif readIDDict:
-                    theReadID = readID
-                    if "::" in readID:
-                        theReadID = readID.split("::")[0]
-
-                    if "/" in theReadID and withPairID:
-                        (theReadID, pairID) = readID.split("/")
-
-                    if theReadID != currentReadID:
-                        resultsDict[theReadID] = []
-                        currentReadID = theReadID
-                        dictKey = theReadID
-
-                newrow = {"start": int(row["start"])}
-                if bothEnds:
-                    newrow["stop"] = int(row["stop"])
-
-                if not noSense:
-                    newrow["sense"] = row["sense"]
-
-                if withWeight:
-                    newrow["weight"] = float(row["weight"])
-
-                if withFlag:
-                    newrow["flag"] = row["flag"]
-
-                if withMismatch:
-                    newrow["mismatch"] = row["mismatch"]
-
-                if withID:
-                    newrow["readID"] = readID
-
-                if withChrom:
-                    newrow["chrom"] = chrom
-
-                if withPairID:
-                    newrow["pairID"] = pairID
-
-                resultsDict[dictKey].append(newrow)
-
-        return resultsDict
-
-
-    def getReadWhereQuery(self, chrom, flag, flagLike, start, stop, hasMismatch, strand, readLike="", splice=False):
-        if splice:
-            startText = "startL"
-            stopText = "stopR"
-        else:
-            startText = "start"
-            stopText = "stop"
+        if findallOptimize and chrom != "":
+            readIDDict = False
+            bothEnds = False
+            noSense = False
+            withWeight = True
+            withFlag = False
+            withMismatch = False
+            withID = False
+            withChrom = False
+            withPairID = False
+            fullChrom = True
+            #resultsDict[chrom] = [{"start": int(read.pos), "sense": getReadSense(read.is_reverse), "weight": 1.0} for read in bamFileIterator if not isSpliceEntry(read.cigar)]
+
+        pairID = 0
+        for read in bamFileIterator:
+            pairReadSuffix = getPairedReadNumberSuffix(read)
+            readID = "%s%s" % (read.qname, pairReadSuffix)
+            if isSpliceEntry(read.cigar) or self.readDoesNotMeetCriteria(read, readID, doMulti=doMulti):
+                continue
  
  
-        whereClause = []
-        if chrom != "" and chrom != self.memChrom:
-            whereClause.append("chrom = '%s'" % chrom)
+            sense = getReadSense(read.is_reverse)
+            if fullChrom:
+                chrom = self.bamfile.getrname(read.rname)
+            else:
+                chrom = self.bamfile.getrname(read.rname)[3:]
  
  
-        if flag != "":
-            if flagLike:
-                flagLikeClause = string.join(['flag LIKE "%', flag, '%"'], "")
-                whereClause.append(flagLikeClause)
+            if readIDDict:
+                dictKey = read.qname
              else:
              else:
-                whereClause.append("flag = '%s'" % flag)
+                dictKey = chrom
  
  
-        if start > -1:
-            whereClause.append("%s > %d" % (startText, start))
+            newrow = {"start": int(read.pos)}
+            if bothEnds:
+                if self.readsize is None:
+                    self.calculateReadsizeFromCigar(read.cigar)
  
  
-        if stop > -1:
-            whereClause.append("%s < %d" % (stopText, stop))
+                newrow["stop"] = int(read.pos + self.readsize)
  
  
-        if len(readLike) > 0:
-            readIDClause = string.join(["readID LIKE  '", readLike, "%'"], "")
-            whereClause.append(readIDClause)
+            if not noSense:
+                newrow["sense"] = sense
  
  
-        if hasMismatch:
-            whereClause.append("mismatch != ''")
+            if withWeight:
+                try:
+                    newrow["weight"] = 1.0/self.multiReadIDCounts[readID]
+                except KeyError:
+                    newrow["weight"] = 1.0
  
  
-        if strand in ["+", "-"]:
-            whereClause.append("sense = '%s'" % strand)
+            if withFlag:
+                #newrow["flag"] = row["flag"]
+                pass
  
  
-        if len(whereClause) > 0:
-            whereStatement = string.join(whereClause, " and ")
-            whereQuery = "where %s" % whereStatement
-        else:
-            whereQuery = ""
+            if withMismatch:
+                try:
+                    mismatchTag = read.opt("MD")
+                    mismatches = getMismatches(mismatchTag, read.seq, sense)
+                except KeyError:
+                    mismatches = ""
  
  
-        return whereQuery
+                newrow["mismatch"] = mismatches
  
  
+            if withID:
+                newrow["readID"] = readID
  
  
-    def getReadSelectQuery(self, baseSelect, noSense, withWeight, withFlag, withMismatch, bothEnds=False):
+            if withChrom:
+                newrow["chrom"] = chrom
  
  
-        selectClause = [baseSelect]
-        if bothEnds:
-            selectClause.append("stop")
+            if withPairID:
+                newrow["pairID"] = pairID
+
+            try:
+                resultsDict[dictKey].append(newrow)
+            except KeyError:
+                resultsDict[dictKey] = [newrow]
  
  
-        if not noSense:
-            selectClause.append("sense")
+        return resultsDict
  
  
-        if withWeight:
-            selectClause.append("weight")
  
  
-        if withFlag:
-            selectClause.append("flag")
+    def readDoesNotMeetCriteria(self, read, readID, doMulti=False, flag="", readLike="", hasMismatch=False, strand=""):
+        readDoesNotMeetCriteria = self.rejectMultiReadEntry(readID, doMulti)
  
  
-        if withMismatch:
-            selectClause.append("mismatch")
+        if flag != "":
+            #TODO: need to pick a tag code for the erange flag - using ZF for now
+            # although it looks like the flag is really just a label on a region.
+            # since BAM can retrieve based on location if we store the regions then
+            # we can just retrieve it directly from the BAM and we do not need to
+            # flag the individual reads.  The location will be sufficient.
+            try:
+                if flag != read.opt("ZF"):
+                    readDoesNotMeetCriteria = True
+            except KeyError:
+                readDoesNotMeetCriteria = True
  
  
-        selectQuery = string.join(selectClause, ",")
  
  
-        return selectQuery
+        if hasMismatch:
+            try:
+                mismatchTag = read.opt("MD")
+            except KeyError:
+                readDoesNotMeetCriteria = True
  
  
+        if strand in ["+", "-"] and getReadSense(read.is_reverse) != strand:
+            readDoesNotMeetCriteria = True
  
  
-    def getReadGroupQuery(self, findallOptimize, limit, combine5p):
-        groupBy = []
-        if findallOptimize:
-            groupBy = ["GROUP BY start, sense"]
+        return readDoesNotMeetCriteria
  
  
-        if limit > 0 and not combine5p:
-            groupBy.append("LIMIT %d" % limit)
  
  
-        groupQuery = string.join(groupBy)
+    def rejectMultiReadEntry(self, readID, doMulti=False, threshold=10):
+        rejectRead = False
+        if readID in self.multiReadIDCounts.keys():
+            if self.multiReadIDCounts[readID] > threshold or not doMulti:
+                rejectRead = True
  
  
-        return groupQuery
+        return rejectRead
  
  
      def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
                         flag="", withWeight=False, withFlag=False, withMismatch=False,
                         withID=False, withChrom=False, withPairID=False, readIDDict=False,
  
  
      def getSplicesDict(self, noSense=False, fullChrom=False, chrom="",
                         flag="", withWeight=False, withFlag=False, withMismatch=False,
                         withID=False, withChrom=False, withPairID=False, readIDDict=False,
-                       splitRead=False, hasMismatch=False, flagLike=False, start=-1,
-                       stop=-1, strand=""):
-        """ returns a dictionary of spliced reads in a variety of
-        formats and which can be restricted by chromosome or custom-flag.
-        Returns unique spliced reads for now.
+                       splitRead=False, hasMismatch=False, flagLike=False, start=None,
+                       stop=None, strand=""):
+        """ First pass of rewrite
+            1) Leave as much in place as possible
+            2) For now treat multis just like regular splices
+        """
          """
          whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
          selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
          """
          whereQuery = self.getReadWhereQuery(chrom, flag, flagLike, start, stop, hasMismatch, strand, splice=True)
          selectClause = "select ID, chrom, startL, stopL, startR, stopR, readID"
@@ -595,47 +518,55 @@ class ReadDataset():
  
          stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
          sql.execute(stmt)
  
          stmt = "%s from splices %s order by chrom, startL" % (selectQuery, whereQuery)
          sql.execute(stmt)
-        currentReadID = ""
-        currentChrom = ""
+        """
+
+        # Edits are starting here - trying to ignore the entire sql construct
+        bamFileIterator = self.bamfile.fetch(reference=chrom, start=start, end=stop)
          resultsDict = {}
          resultsDict = {}
-        for row in sql:
-            pairID = 0
-            readID = row["readID"]
+        pairID = 0
+        for read in bamFileIterator:
+            if not isSpliceEntry(read.cigar):
+                continue
+
+            sense = getReadSense(read.is_reverse)
+            pairReadSuffix = getPairedReadNumberSuffix(read)
+            readID = "%s%s" % (read.qname, pairReadSuffix)
              if fullChrom:
              if fullChrom:
-                chrom = row["chrom"]
+                chrom = self.bamfile.getrname(read.rname)
              else:
              else:
-                chrom = row["chrom"][3:]
+                chrom = self.bamfile.getrname(read.rname)[3:]
  
  
-            if not readIDDict and chrom != currentChrom:
-                resultsDict[chrom] = []
-                currentChrom = chrom
+            if readIDDict:
+                dictKey = read.qname
+            else:
                  dictKey = chrom
                  dictKey = chrom
-            elif readIDDict:
-                if "/" in readID:
-                    (theReadID, pairID) = readID.split("/")
-                else:
-                    theReadID = readID
-
-                if theReadID != currentReadID:
-                    resultsDict[theReadID] = []
-                    currentReadID = theReadID
-                    dictKey = theReadID
-
-            newrow = {"startL": int(row["startL"])}
-            newrow["stopL"] = int(row["stopL"])
-            newrow["startR"] = int(row["startR"])
-            newrow["stopR"] = int(row["stopR"])
+
+            if self.readsize is None:
+                self.calculateReadsizeFromCigar(read.cigar)
+
+            startL, startR, stopL, stopR = getSpliceBounds(read.pos, self.readsize, read.cigar)
+            newrow = {"startL": int(startL)}
+            newrow["stopL"] = int(stopL)
+            newrow["startR"] = int(startR)
+            newrow["stopR"] = int(stopR)
              if not noSense:
              if not noSense:
-                newrow["sense"] = row["sense"]
+                newrow["sense"] = sense
  
              if withWeight:
  
              if withWeight:
-                newrow["weight"] = float(row["weight"])
+                newrow["weight"] = 1.0
  
              if withFlag:
  
              if withFlag:
-                newrow["flag"] = row["flag"]
+                #newrow["flag"] = row["flag"]
+                pass
  
              if withMismatch:
  
              if withMismatch:
-                newrow["mismatch"] = row["mismatch"]
+                try:
+                    mismatchTag = read.opt("MD")
+                    mismatches = getMismatches(mismatchTag, read.seq, sense)
+                except KeyError:
+                    mismatches = ""
+
+                newrow["mismatch"] = mismatches
  
              if withID:
                  newrow["readID"] = readID
  
              if withID:
                  newrow["readID"] = readID
@@ -653,42 +584,53 @@ class ReadDataset():
                  rightDict = newrow
                  del rightDict["startL"]
                  del rightDict["stopL"]
                  rightDict = newrow
                  del rightDict["startL"]
                  del rightDict["stopL"]
-                resultsDict[dictKey].append(leftDict)
+                try:
+                    resultsDict[dictKey].append(leftDict)
+                except KeyError:
+                    resultsDict[dictKey] = [leftDict]
+
                  resultsDict[dictKey].append(rightDict)
              else:
                  resultsDict[dictKey].append(rightDict)
              else:
-                resultsDict[dictKey].append(newrow)
+                try:
+                    resultsDict[dictKey].append(newrow)
+                except KeyError:
+                    resultsDict[dictKey] = [newrow]
  
          return resultsDict
  
  
  
          return resultsDict
  
  
-    def getCounts(self, chrom="", rmin="", rmax="", uniqs=True, multi=False,
+    def getFullReadCount(self):
+        fullReadCount = {"uniq": 0,
+                         "multi": 0,
+                         "splice": 0
+        }
+
+        for chromosome in self.getChromosomes():
+            uniqCount, multiCount, spliceCount = self.getCounts(chrom=chromosome, reportCombined=False, multi=True, splices=True)
+            fullReadCount["uniq"] += uniqCount
+            fullReadCount["multi"] += multiCount
+            fullReadCount["splice"] += spliceCount
+
+        return fullReadCount
+
+
+    def getCounts(self, chrom=None, rmin=None, rmax=None, uniqs=True, multi=False,
                    splices=False, reportCombined=True, sense="both"):
          """ return read counts for a given region.
          """
          ucount = 0
          mcount = 0
          scount = 0
                    splices=False, reportCombined=True, sense="both"):
          """ return read counts for a given region.
          """
          ucount = 0
          mcount = 0
          scount = 0
-        restrict = ""
-        if sense in ["+", "-"]:
-            restrict = " sense ='%s' " % sense
+        readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=sense)
  
          if uniqs:
  
          if uniqs:
-            try:
-                ucount = float(self.getUniqsCount(chrom, rmin, rmax, restrict))
-            except:
-                ucount = 0
+            ucount = readCounter.uniqReadCount
  
          if multi:
  
          if multi:
-            try:
-                mcount = float(self.getMultiCount(chrom, rmin, rmax, restrict))
-            except:
-                mcount = 0
+            mcount = readCounter.multiReadCount
  
          if splices:
  
          if splices:
-            try:
-                scount = float(self.getSplicesCount(chrom, rmin, rmax, restrict))
-            except:
-                scount = 0
+            scount = readCounter.spliceReadCount
  
          if reportCombined:
              total = ucount + mcount + scount
  
          if reportCombined:
              total = ucount + mcount + scount
@@ -697,72 +639,32 @@ class ReadDataset():
              return (ucount, mcount, scount)
  
  
              return (ucount, mcount, scount)
  
  
-    def getTotalCounts(self, chrom="", rmin="", rmax=""):
-        """ return read counts for a given region.
-        """
-        return self.getCounts(chrom, rmin, rmax, uniqs=True, multi=True, splices=True, reportCombined=True, sense="both")
-
-
-    def getTableEntryCount(self, table, chrom="", rmin="", rmax="", restrict="", distinct=False, startField="start"):
-        """ returns the number of row in the specified table.
-        """
-        whereClause = []
-        count = 0
-
-        if chrom !=""  and chrom != self.memChrom:
-            whereClause = ["chrom='%s'" % chrom]
-
-        if rmin != "":
-            whereClause.append("%s >= %s" % (startField, str(rmin)))
-
-        if rmax != "":
-            whereClause.append("%s <= %s" % (startField, str(rmax)))
+    def getSplicesCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
+        readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
  
  
-        if restrict != "":
-            whereClause.append(restrict)
-
-        if len(whereClause) > 0:
-            whereStatement = string.join(whereClause, " and ")
-            whereQuery = "where %s" % whereStatement
-        else:
-            whereQuery = ""
+        return readCounter.spliceReadCount
  
  
-        sql = self.getSqlCursor()
  
  
-        if distinct:
-            sql.execute("select count(distinct chrom+%s+sense) from %s %s" % (startField, table, whereQuery))
-        else:
-            sql.execute("select sum(weight) from %s %s" % (table, whereQuery))
+    def getUniqsCount(self, chrom=None, rmin=None, rmax=None, restrict="both", distinct=False):
+        readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
  
  
-        result = sql.fetchone()
+        return readCounter.uniqReadCount
  
  
-        try:
-            count = int(result[0])
-        except:
-            count = 0
  
  
-        return count
-
-
-    def getSplicesCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
-        """ returns the number of row in the splices table.
-        """
-        # TODO: if the rds type is DNA should this just return zero?
-        return self.getTableEntryCount("splices", chrom, rmin, rmax, restrict, distinct, startField="startL")
+    def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
+        readCounter = self.getBamReadCounter(chrom=chrom, rmin=rmin, rmax=rmax, sense=restrict)
  
  
+        return readCounter.multiReadCount
  
  
-    def getUniqsCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
-        """ returns the number of distinct readIDs in the uniqs table.
-        """
-        return self.getTableEntryCount("uniqs", chrom, rmin, rmax, restrict, distinct)
  
  
+    def getBamReadCounter(self, chrom="", rmin=None, rmax=None, sense="both"):
+        readCounter = ReadCounter(self.multiReadIDCounts, sense)
+        self.bamfile.fetch(reference=chrom, start=rmin, end=rmax, callback=readCounter)
  
  
-    def getMultiCount(self, chrom="", rmin="", rmax="", restrict="", distinct=False):
-        """ returns the total weight of readIDs in the multi table.
-        """
-        return self.getTableEntryCount("multi", chrom, rmin, rmax, restrict, distinct)
+        return readCounter
  
  
  
  
+    #TODO: Primary
      def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
          """ get readID's.
          """
      def getReadIDs(self, uniqs=True, multi=False, splices=False, paired=False, limit=-1):
          """ get readID's.
          """
@@ -807,13 +709,13 @@ class ReadDataset():
              hitChromList.sort()
  
          snpDict = {}
              hitChromList.sort()
  
          snpDict = {}
-        for achrom in hitChromList:
+        for chromosome in hitChromList:
              if verbose:
              if verbose:
-                print "getting mismatches from chromosome %s" % (achrom)
+                print "getting mismatches from chromosome %s" % (chromosome)
  
  
-            snpDict[achrom] = []
+            snpDict[chromosome] = []
              if useSplices and self.dataType == "RNA":
              if useSplices and self.dataType == "RNA":
-                spliceDict = self.getSplicesDict(fullChrom=True, chrom=achrom, withMismatch=True, readIDDict=True, hasMismatch=True)
+                spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withMismatch=True, readIDDict=True, hasMismatch=True)
                  spliceIDList = spliceDict.keys()
                  for spliceID in spliceIDList:
                      spliceEntry = spliceDict[spliceID][0]
                  spliceIDList = spliceDict.keys()
                  for spliceID in spliceIDList:
                      spliceEntry = spliceDict[spliceID][0]
@@ -845,13 +747,13 @@ class ReadDataset():
                              secondhalf = change_pos - firsthalf
                              change_at = rightstart + secondhalf
  
                              secondhalf = change_pos - firsthalf
                              change_at = rightstart + secondhalf
  
-                        snpDict[achrom].append([startpos, change_at, change_base, change_from])
+                        snpDict[chromosome].append([startpos, change_at, change_base, change_from])
  
  
-            hitDict = self.getReadsDict(fullChrom=True, chrom=achrom, withMismatch=True, hasMismatch=True)
-            if achrom not in hitDict.keys():
+            hitDict = self.getReadsDict(fullChrom=True, chrom=chromosome, withMismatch=True, hasMismatch=True)
+            if chromosome not in hitDict.keys():
                  continue
  
                  continue
  
-            for readEntry in hitDict[achrom]:
+            for readEntry in hitDict[chromosome]:
                  start = readEntry["start"]
                  sense = readEntry["sense"]
                  mismatches = readEntry["mismatch"]
                  start = readEntry["start"]
                  sense = readEntry["sense"]
                  mismatches = readEntry["mismatch"]
@@ -871,7 +773,7 @@ class ReadDataset():
                          change_pos = readlen - int(mismatch[1:change_len-1]) + 1
  
                      change_at = start + change_pos - 1
                          change_pos = readlen - int(mismatch[1:change_len-1]) + 1
  
                      change_at = start + change_pos - 1
-                    snpDict[achrom].append([start, change_at, change_base, change_from])
+                    snpDict[chromosome].append([start, change_at, change_base, change_from])
  
          return snpDict
  
  
          return snpDict
  
@@ -925,7 +827,7 @@ class ReadDataset():
                  spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
              else:
                  spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
                  spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True, start=cstart, stop=cstop)
              else:
                  spliceDict = self.getSplicesDict(fullChrom=True, chrom=chromosome, withID=True)
-   
+
              if chromosome in spliceDict:
                  for spliceEntry in spliceDict[chromosome]:
                      Lstart = spliceEntry["startL"]
              if chromosome in spliceDict:
                  for spliceEntry in spliceDict[chromosome]:
                      Lstart = spliceEntry["startL"]
@@ -956,56 +858,19 @@ class ReadDataset():
  
  
      def insertMetadata(self, valuesList):
  
  
      def insertMetadata(self, valuesList):
-        """ inserts a list of (pname, pvalue) into the metadata
-        table.
-        """
-        self.dbcon.executemany("insert into metadata(name, value) values (?,?)", valuesList)
-        self.dbcon.commit()
+        for (pname, pvalue) in valuesList:
+            self.metadata[pname] = pvalue
  
  
      def updateMetadata(self, pname, newValue, originalValue=""):
  
  
      def updateMetadata(self, pname, newValue, originalValue=""):
-        """ update a metadata field given the original value and the new value.
-        """
-        stmt = "update metadata set value='%s' where name='%s'" % (str(newValue), pname)
          if originalValue != "":
          if originalValue != "":
-            stmt += " and value='%s' " % str(originalValue)
-
-        self.dbcon.execute(stmt)
-        self.dbcon.commit()
-
-
-    def insertUniqs(self, valuesList):
-        """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
-        into the uniqs table.
-        """
-        self.dbcon.executemany("insert into uniqs(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
-        self.dbcon.commit()
-
-
-    def insertMulti(self, valuesList):
-        """ inserts a list of (readID, chrom, start, stop, sense, weight, flag, mismatch)
-        into the multi table.
-        """
-        self.dbcon.executemany("insert into multi(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)", valuesList)
-        self.dbcon.commit()
-
-
-    def insertSplices(self, valuesList):
-        """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
-        into the splices table.
-        """
-        self.dbcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
-        self.dbcon.commit()
-
-
-    def insertMultisplices(self, valuesList):
-        """ inserts a list of (readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch)
-        into the multisplices table.
-        """
-        self.dbcon.executemany("insert into multisplices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", valuesList)
-        self.dbcon.commit()
+            if self.metadata[pname] == originalValue:
+                self.metadata[pname] = newValue
+        else:
+            self.metadata[pname] = newValue
  
  
  
  
+    #TODO: Primary
      def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
          """ update reads on file database in a list region of regions for a chromosome to have a new flag.
              regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
      def flagReads(self, regionsList, uniqs=True, multi=False, splices=False, sense="both"):
          """ update reads on file database in a list region of regions for a chromosome to have a new flag.
              regionsList must have 4 fields per region of the form (flag, chrom, start, stop) or, with
@@ -1028,186 +893,170 @@ class ReadDataset():
          self.dbcon.commit()
  
  
          self.dbcon.commit()
  
  
+    #TODO: Primary
      def setFlags(self, flag, uniqs=True, multi=True, splices=True):
          """ set the flag fields in the entire dataset.
          """
          if uniqs:
      def setFlags(self, flag, uniqs=True, multi=True, splices=True):
          """ set the flag fields in the entire dataset.
          """
          if uniqs:
-            self.dbcon.execute("UPDATE uniqs SET flag = '%s'" % flag)
+            self.setFlagsInDB("uniqs", flag)
  
          if multi:
  
          if multi:
-            self.dbcon.execute("UPDATE multi SET flag = '%s'" % flag)
+            self.setFlagsInDB("multi", flag)
  
          if self.dataType == "RNA" and splices:
  
          if self.dataType == "RNA" and splices:
-            self.dbcon.execute("UPDATE splices SET flag = '%s'" % flag)
+            self.setFlagsInDB("splices", flag)
  
          self.dbcon.commit()
  
  
  
          self.dbcon.commit()
  
  
+    #TODO: Secondary to setFlags()
+    def setFlagsInDB(self, table, flag):
+        """ set the flag field for every entry in a table.
+        """
+        self.dbcon.execute("UPDATE %s SET flag = '%s'" % (table, flag))
+
+
+    #TODO: Primary
      def resetFlags(self, uniqs=True, multi=True, splices=True):
          """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
          """
          self.setFlags("", uniqs, multi, splices)
  
  
      def resetFlags(self, uniqs=True, multi=True, splices=True):
          """ reset the flag fields in the entire dataset to clear. Useful for rerunning an analysis from scratch.
          """
          self.setFlags("", uniqs, multi, splices)
  
  
+    #TODO: Primary
      def reweighMultireads(self, readList):
          self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
  
  
      def setSynchronousPragma(self, value="ON"):
      def reweighMultireads(self, readList):
          self.dbcon.executemany("UPDATE multi SET weight = ? where chrom = ? and start = ? and readID = ? ", readList)
  
  
      def setSynchronousPragma(self, value="ON"):
-        try:
-            self.dbcon.execute("PRAGMA SYNCHRONOUS = %s" % value)
-        except:
-            print "warning: couldn't set PRAGMA SYNCHRONOUS = %s" % value
+        pass
  
  
      def setDBcache(self, cache, default=False):
  
  
      def setDBcache(self, cache, default=False):
-        self.dbcon.execute("PRAGMA CACHE_SIZE = %d" % cache)
-        if default:
-            self.dbcon.execute("PRAGMA DEFAULT_CACHE_SIZE = %d" % cache)
+        pass
  
  
      def execute(self, statement, returnResults=False):
  
  
      def execute(self, statement, returnResults=False):
-        sql = self.getSqlCursor()
-
-        sql.execute(statement)
-        if returnResults:
-            result = sql.fetchall()
-            return result
+        pass
  
  
      def executeCommit(self, statement):
  
  
      def executeCommit(self, statement):
-        self.execute(statement)
-
-        if self.memBacked:
-            self.memcon.commit()
-        else:
-            self.dbcon.commit()
+        pass
  
  
      def buildIndex(self, cache=100000):
  
  
      def buildIndex(self, cache=100000):
-        """ Builds the file indeces for the main tables.
-            Cache is the number of 1.5 kb pages to keep in memory.
-            100000 pages translates into 150MB of RAM, which is our default.
-        """
-        if cache > self.getDefaultCacheSize():
-            self.setDBcache(cache)
-        self.setSynchronousPragma("OFF")
-        self.dbcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
-        print "built uPosIndex"
-        self.dbcon.execute("CREATE INDEX uChromIndex on uniqs(chrom)")
-        print "built uChromIndex"
-        self.dbcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
-        print "built mPosIndex"
-        self.dbcon.execute("CREATE INDEX mChromIndex on multi(chrom)")
-        print "built mChromIndex"
-
-        if self.dataType == "RNA":
-            self.dbcon.execute("CREATE INDEX sPosIndex on splices(chrom, startL)")
-            print "built sPosIndex"
-            self.dbcon.execute("CREATE INDEX sPosIndex2 on splices(chrom, startR)")
-            print "built sPosIndex2"
-            self.dbcon.execute("CREATE INDEX sChromIndex on splices(chrom)")
-            print "built sChromIndex"
-
-        self.dbcon.commit()
-        self.setSynchronousPragma("ON")
+        pass
  
  
      def dropIndex(self):
  
  
      def dropIndex(self):
-        """ drops the file indices for the main tables.
-        """
-        try:
-            self.setSynchronousPragma("OFF")
-            self.dbcon.execute("DROP INDEX uPosIndex")
-            self.dbcon.execute("DROP INDEX uChromIndex")
-            self.dbcon.execute("DROP INDEX mPosIndex")
-            self.dbcon.execute("DROP INDEX mChromIndex")
+        pass
  
  
-            if self.dataType == "RNA":
-                self.dbcon.execute("DROP INDEX sPosIndex")
-                try:
-                    self.dbcon.execute("DROP INDEX sPosIndex2")
-                except:
-                    pass
  
  
-                self.dbcon.execute("DROP INDEX sChromIndex")
+    def memSync(self, chrom="", index=False):
+        pass
  
  
-            self.dbcon.commit()
-        except:
-            print "problem dropping index"
  
  
-        self.setSynchronousPragma("ON")
+    def copyDBEntriesToMemory(self, dbName, whereClause=""):
+        pass
  
  
  
  
-    def memSync(self, chrom="", index=False):
-        """ makes a copy of the dataset into memory for faster access.
-        Can be restricted to a "full" chromosome. Can also build the
-        memory indices.
-        """
-        self.memcon = ""
-        self.memcon = sqlite.connect(":memory:")
-        self.initializeTables(self.memcon)
-        cursor = self.dbcon.cursor()
-        whereclause = ""
-        if chrom != "":
-            print "memSync %s" % chrom
-            whereclause = " where chrom = '%s' " % chrom
-            self.memChrom = chrom
+    def copySpliceDBEntriesToMemory(self, whereClause=""):
+        pass
+
+
+def getReadSense(reverse):
+    if reverse:
+        sense = "-"
+    else:
+        sense = "+"
+
+    return sense
+
+
+def isSpliceEntry(cigarTupleList):
+    isSplice = False
+    for operation,length in cigarTupleList:
+        if operation == 3:
+            isSplice = True
+            break
+
+    return isSplice
+
+
+def getSpliceRightStart(start, cigarTupleList):
+    offset = 0
+
+    for operation,length in cigarTupleList:
+        if operation == 3:
+            stopL = int(start + offset)
+            startR = int(stopL + length)
+
+            return startR
          else:
          else:
-            self.memChrom = ""
-
-        self.memcon.execute("PRAGMA temp_store = MEMORY")
-        self.memcon.execute("PRAGMA CACHE_SIZE = 1000000")
-        # copy metadata to memory
-        self.memcon.execute("delete from metadata")
-        results = cursor.execute("select name, value from metadata")
-        results2 = []
-        for row in results:
-            results2.append((row["name"], row["value"]))
-
-        self.memcon.executemany("insert into metadata(name, value) values (?,?)", results2)
-
-        self.copyDBEntriesToMemory("uniqs", whereclause)
-        self.copyDBEntriesToMemory("multi", whereclause)
-        if self.dataType == "RNA":
-            self.copySpliceDBEntriesToMemory(whereclause)
-
-        if index:
-            if chrom != "":
-                self.memcon.execute("CREATE INDEX uPosIndex on uniqs(start)")
-                self.memcon.execute("CREATE INDEX mPosIndex on multi(start)")
-                if self.dataType == "RNA":
-                    self.memcon.execute("CREATE INDEX sPosLIndex on splices(startL)")
-                    self.memcon.execute("CREATE INDEX sPosRIndex on splices(startR)")
-            else:
-                self.memcon.execute("CREATE INDEX uPosIndex on uniqs(chrom, start)")
-                self.memcon.execute("CREATE INDEX mPosIndex on multi(chrom, start)")
-                if self.dataType == "RNA":
-                    self.memcon.execute("CREATE INDEX sPosLIndex on splices(chrom, startL)")
-                    self.memcon.execute("CREATE INDEX sPosRIndex on splices(chrom, startR)")
+            offset += length
  
  
-        self.memBacked = True
-        self.memcon.row_factory = sqlite.Row
-        self.memcon.commit()
  
  
+def getSpliceBounds(start, readsize, cigarTupleList):
+    offset = 0
+    #TODO: check operations here - we want to make sure we are adding the correct ones
+    for operation,length in cigarTupleList:
+        if operation == 3:
+            stopL = int(start + offset)
+            startR = int(stopL + length)
+            offset = 0
+        else:
+            offset += length
  
  
-    def copyDBEntriesToMemory(self, dbName, whereClause=""):
-        cursor = self.dbcon.cursor()
-        sourceEntries = cursor.execute("select chrom, start, stop, sense, weight, flag, mismatch, readID from %s %s" % (dbName, whereClause))
-        destinationEntries = []
-        for row in sourceEntries:
-            destinationEntries.append((row["readID"], row["chrom"], int(row["start"]), int(row["stop"]), row["sense"], row["weight"], row["flag"], row["mismatch"]))
+    stopR = startR + offset
  
  
-        self.memcon.executemany("insert into %s(ID, readID, chrom, start, stop, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?)" % dbName, destinationEntries)
+    return start, startR, stopL, stopR
  
  
  
  
-    def copySpliceDBEntriesToMemory(self, whereClause=""):
-        cursor = self.dbcon.cursor()
-        sourceEntries = cursor.execute("select chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch, readID from splices %s" % whereClause)
-        destinationEntries = []
-        for row in sourceEntries:
-            destinationEntries.append((row["readID"], row["chrom"], int(row["startL"]), int(row["stopL"]), int(row["startR"]), int(row["stopR"]), row["sense"],
-                                       row["weight"], row["flag"], row["mismatch"]))
+def getPairedReadNumberSuffix(read):
+    readSuffix = ""
+    if not isPairedRead(read):
+        return ""
+
+    if read.is_read1:
+        readSuffix = "/1"
+    elif read.is_read2:
+        readSuffix = "/2"
+
+    return readSuffix
+
+
+def isPairedRead(read):
+    return read.is_proper_pair and (read.is_read1 or read.is_read2)
+
+
+def getMismatches(mismatchTag, querySequence="", sense="+"):
+    output = []
+    deletionMarker = "^"
+    position = 0
+
+    lengths = re.findall("\d+", mismatchTag)
+    mismatchSequences = re.findall("\d+([ACGTN]|\\^[ACGTN]+)", mismatchTag)
+
+    for mismatchEntry in range(len(mismatchSequences)):
+        mismatch = mismatchSequences[mismatchEntry]
+        position = position + int(lengths[mismatchEntry])
+        if string.find(mismatch, deletionMarker) == 0:
+            continue
+
+        try:
+            if querySequence:
+                genomicNucleotide = querySequence[position]
+            else:
+                genomicNucleotide = "N"
+
+            if sense == "-":
+                mismatch = getReverseComplement(mismatch)
+                genomicNucleotide  = getReverseComplement(genomicNucleotide)
  
  
-        self.memcon.executemany("insert into splices(ID, readID, chrom, startL, stopL, startR, stopR, sense, weight, flag, mismatch) values (NULL,?,?,?,?,?,?,?,?,?,?)", destinationEntries)
+            erange1BasedElandCompatiblePosition = int(position + 1)
+            output.append("%s%d%s" % (mismatch, erange1BasedElandCompatiblePosition, genomicNucleotide))
+            position += 1
+        except IndexError:
+            return ""
  
  
+    return string.join(output, ",")
diff --git a/combineRPKMs.py b/combineRPKMs.py

index 77f30d6fc2acb2a9424634a138a03ee30e314f91..88b57ab547edeea3f9be5387780796aadbe72bc2 100755 (executable)
--- a/combineRPKMs.py
+++ b/combineRPKMs.py
@@ -63,6 +63,8 @@ def combineRPKMs(firstfileName, expandedfileName, finalfileName, outfileName, do
      outfile.write(header)
  
      finalfile = open(finalfileName)
      outfile.write(header)
  
      finalfile = open(finalfileName)
+    #TODO: QForAli - the output lines are driven by finalfile.  If there are genes in the first 2 that
+    #      are not in the finalfile then they will be lost.
      for line in finalfile:
          fields = line.strip().split()
          gene = fields[0]
      for line in finalfile:
          fields = line.strip().split()
          gene = fields[0]
@@ -101,4 +103,4 @@ def getRPKMDict(rpkmFileName, getGIDDict=False):
  
  
  if __name__ == "__main__":
  
  
  if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
+\ No newline at end of file
diff --git a/commoncode.py b/commoncode.py

index 6ef4edfcb3e10d75ae0eecea887f45581f9e4edb..cd449620f0fc88304f4e8adceee0e96e37e1295d 100755 (executable)
--- a/commoncode.py
+++ b/commoncode.py
@@ -192,6 +192,7 @@ def getMergedRegionsFromList(regionList, maxDist=1000, minHits=0, verbose=False,
      #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
      #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
      #      be merged with A but that will exit the loop and the merge with C will be missed.
      #      the case of 3 regions ABC that are in the input file as ACB as it goes now when processing C there
      #      will be no merge with A as B is needed to bridge the two.  When it comes time to process B it will
      #      be merged with A but that will exit the loop and the merge with C will be missed.
+    #TODO: QForAli - Are we going to require sorted region files to have this even work?
      for regionEntry in regionList:
          if regionEntry[0] == "#":
              if "pvalue" in regionEntry:
      for regionEntry in regionList:
          if regionEntry[0] == "#":
              if "pvalue" in regionEntry:
@@ -340,6 +341,10 @@ def findPeak(hitList, start, length, readlen=25, doWeight=False, leftPlus=False,
  
      # implementing a triangular smooth
      smoothArray = array("f", [0.] * length)
  
      # implementing a triangular smooth
      smoothArray = array("f", [0.] * length)
+    #TODO: QForAli - really?  We're going to insert MANUFACTURED data (0.0's) and use them in the calculation
+    # (probably) just to avoid IndexError exceptions.  Then we are going to completely ignore adjusting the
+    # last 2 elements of the array for (probably) the same issue.  Is this how erange is dealing with the
+    # edge cases?
      for pos in range(2,length -2):
          smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
  
      for pos in range(2,length -2):
          smoothArray[pos] = (seqArray[pos -2] + 2 * seqArray[pos - 1] + 3 * seqArray[pos] + 2 * seqArray[pos + 1] + seqArray[pos + 2]) / 9.0
  
diff --git a/findMotifs.py b/findMotifs.py

index a072d43432efe3d53a9455f8e92119401049f1bd..ee2957cfdc1f5913c96fe1d75948a33c075c63d7 100755 (executable)
--- a/findMotifs.py
+++ b/findMotifs.py
@@ -54,7 +54,7 @@ def getParser(usage):
      parser.add_option("--logo", action="store_true", dest="saveLogo")
      parser.add_option("--threshold", type="float", dest="threshold")
      parser.add_option("--prefix", dest="motifPrefix")
      parser.add_option("--logo", action="store_true", dest="saveLogo")
      parser.add_option("--threshold", type="float", dest="threshold")
      parser.add_option("--prefix", dest="motifPrefix")
-    parser.add_option("--numMotifs", dest="numMotifs")
+    parser.add_option("--numMotifs", type="int", dest="numMotifs")
      parser.add_option("--maxWidth", type="int", dest="maxWidth")
      parser.add_option("--maskLower", action="store_true", dest="maskLower")
  
      parser.add_option("--maxWidth", type="int", dest="maxWidth")
      parser.add_option("--maskLower", action="store_true", dest="maskLower")
  
@@ -64,7 +64,7 @@ def getParser(usage):
      doCisGreedy = getConfigBoolOption(configParser, section, "doCisGreedy", False)
      saveLogo = getConfigBoolOption(configParser, section, "saveLogo", False)
      threshold = getConfigFloatOption(configParser, section, "threshold", 75.)
      doCisGreedy = getConfigBoolOption(configParser, section, "doCisGreedy", False)
      saveLogo = getConfigBoolOption(configParser, section, "saveLogo", False)
      threshold = getConfigFloatOption(configParser, section, "threshold", 75.)
-    numMotifs = getConfigOption(configParser, section, "numMotifs", "10")
+    numMotifs = getConfigOption(configParser, section, "numMotifs", 10)
      maxWidth = getConfigIntOption(configParser, section, "maxWidth", 28)
      maskLower = getConfigBoolOption(configParser, section, "maskLower", False)
  
      maxWidth = getConfigIntOption(configParser, section, "maxWidth", 28)
      maskLower = getConfigBoolOption(configParser, section, "maskLower", False)
  
@@ -76,7 +76,7 @@ def getParser(usage):
  
  
  def findMotifs(expbase, fsafile, doMeme=False, doCisGreedy=False, saveLogo=False, threshold=75.,
  
  
  def findMotifs(expbase, fsafile, doMeme=False, doCisGreedy=False, saveLogo=False, threshold=75.,
-               numMotifs="10", maxWidth=28, maskLower=False, doCisSampler=False):
+               numMotifs=10, maxWidth=28, maskLower=False, doCisSampler=False):
  
      motifPrefix = expbase
  
  
      motifPrefix = expbase
  
diff --git a/findall.py b/findall.py

index 2e4123ab2e0261988282c7c9f11b02c474d61fac..9dd85df733f86417debf9b060e53f59230110b76 100755 (executable)
--- a/findall.py
+++ b/findall.py
@@ -390,6 +390,7 @@ def findall(regionFinder, hitfile, outfilename, logfilename="findall.log", outpu
      shiftDict = {}
      chromosomeList = getChromosomeListToProcess(hitRDS, controlRDS, doControl)
      for chromosome in chromosomeList:
      shiftDict = {}
      chromosomeList = getChromosomeListToProcess(hitRDS, controlRDS, doControl)
      for chromosome in chromosomeList:
+        #TODO: QForAli -Really? Use first chr shift value for all of them
          if regionFinder.shiftValue == "learn":
              learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p)
  
          if regionFinder.shiftValue == "learn":
              learnShift(regionFinder, hitRDS, chromosome, logfilename, outfilename, outfile, useMulti, doControl, controlRDS, combine5p)
  
@@ -958,4 +959,4 @@ def getBestShiftInDict(shiftDict):
  
  
  if __name__ == "__main__":
  
  
  if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
+\ No newline at end of file
diff --git a/geneLocusCounts.py b/geneLocusCounts.py

index 218017e10a0d9a04d0b1cd8adde8449ecb60dcb8..ab4412506e062cae890077aa19292ce22492b65e 100755 (executable)
--- a/geneLocusCounts.py
+++ b/geneLocusCounts.py
@@ -122,7 +122,7 @@ def geneLocusCounts(genome, hitfile, outfilename, upstream=0, downstream=0,
          print fullchrom
          hitRDS.memSync(fullchrom, index=True)
          for (start, stop, gid, length) in locusByChromDict[chrom]:
          print fullchrom
          hitRDS.memSync(fullchrom, index=True)
          for (start, stop, gid, length) in locusByChromDict[chrom]:
-            if not gidDict.has_key(gid):
+            if not gid in gidDict:
                  gidDict[gid] = {"count": 0, "length": length}
  
              gidDict[gid]["count"] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
                  gidDict[gid] = {"count": 0, "length": length}
  
              gidDict[gid]["count"] += hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
diff --git a/geneMrnaCounts.py b/geneMrnaCounts.py

index cf5065ab88c40279c7ca5dfb30b70bc99dfd2065..7b4a2cc819c30976692d9721ddb363756ebdf70a 100755 (executable)
--- a/geneMrnaCounts.py
+++ b/geneMrnaCounts.py
@@ -124,7 +124,7 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
              continue
  
          if countFeats:
              continue
  
          if countFeats:
-            seenFeaturesByChromDict[chrom] = []
+            seenFeaturesByChromDict[chrom] = set([])
  
          print "\nchr%s" % chrom
          fullchrom = "chr%s" % chrom
  
          print "\nchr%s" % chrom
          fullchrom = "chr%s" % chrom
@@ -137,18 +137,18 @@ def geneMrnaCounts(genomeName, hitfile, outfilename, trackStrand=False, doSplice
                      if featureSense == "R":
                          checkSense = "-"
  
                      if featureSense == "R":
                          checkSense = "-"
  
-                    regionList.append((gid, fullchrom, start, stop, checkSense))
+                    regionData = (gid, fullchrom, start, stop, checkSense)
                      count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
                  else:
                      count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices, sense=checkSense)
                  else:
-                    regionList.append((gid, fullchrom, start, stop))
+                    regionData = (gid, fullchrom, start, stop)
                      count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
                      count = hitRDS.getCounts(fullchrom, start, stop, uniqs=doUniqs, multi=doMulti, splices=doSplices)
-                    if count != 0:
-                        print count
  
                  gidCount[gid] += count
  
                  gidCount[gid] += count
+                if markGID:
+                    regionList.append(regionData)
+
                  if countFeats:
                  if countFeats:
-                    if (start, stop, gid, featureSense) not in seenFeaturesByChromDict[chrom]:
-                        seenFeaturesByChromDict[chrom].append((start, stop, gid, featureSense))
+                    seenFeaturesByChromDict[chrom].add((start, stop, gid, featureSense))
              except:
                  print "problem with %s - skipping" % gid
  
              except:
                  print "problem with %s - skipping" % gid
  
diff --git a/geneMrnaCountsWeighted.py b/geneMrnaCountsWeighted.py

index 74e7a0cd817e699af6b398960578f576a030f7e4..31213c3f7a4703f9d9c8c1e87f17f4c0854e4574 100755 (executable)
--- a/geneMrnaCountsWeighted.py
+++ b/geneMrnaCountsWeighted.py
@@ -165,7 +165,7 @@ def getReadGIDs(hitDict, fullchrom, featList, readlen, index):
      for read in hitDict[fullchrom]:
          tagStart = read["start"]
          tagReadID = read["readID"]
      for read in hitDict[fullchrom]:
          tagStart = read["start"]
          tagReadID = read["readID"]
-        if read.has_key("sense"):
+        if "sense" in read:
              tagSense = read["sense"]
              ignoreSense = False
  
              tagSense = read["sense"]
              ignoreSense = False
  
@@ -253,7 +253,7 @@ def getTagCount(uniqueCountDict, gid, gidReadDict, read2GidDict):
          try:
              tagValue = uniqueCountDict[gid]
          except KeyError:
          try:
              tagValue = uniqueCountDict[gid]
          except KeyError:
-            tagValue = 1
+            pass
  
          tagDenom = 0.
          for relatedGID in read2GidDict[readID]:
  
          tagDenom = 0.
          for relatedGID in read2GidDict[readID]:
diff --git a/getSNPGeneInfo.py b/getSNPGeneInfo.py

index edcb2ceae0103af5b810c4e98d5c0521544efd34..0fbc6795121db515e27151902531f0962aa05715 100755 (executable)
--- a/getSNPGeneInfo.py
+++ b/getSNPGeneInfo.py
@@ -116,7 +116,7 @@ def getSNPGeneInfo(genome, infilename, rpkmfilename, doCache=False, withSense=Tr
              symbol = "LOC%s" % geneID
  
          geneDescriptor = (symbol, geneID)
              symbol = "LOC%s" % geneID
  
          geneDescriptor = (symbol, geneID)
-        if geneDict.has_key(geneDescriptor):
+        if geneDescriptor in geneDict:
              geneDict[geneDescriptor]["position"].append(pos)
          else:
              geneDict[geneDescriptor] = {"position": [pos],
              geneDict[geneDescriptor]["position"].append(pos)
          else:
              geneDict[geneDescriptor] = {"position": [pos],
@@ -167,7 +167,7 @@ def getSNPGeneInfoList(geneDict, snpDict, rpkmDict, withSense):
                                 "geneID": geneID}
  
              rpkm = "N\A"
                                 "geneID": geneID}
  
              rpkm = "N\A"
-            if rpkmDict.has_key(geneID):
+            if geneID in rpkmDict:
                  rpkm = str(rpkmDict[geneID])
  
              snpGeneInfoDict["rpkm"] = rpkm
                  rpkm = str(rpkmDict[geneID])
  
              snpGeneInfoDict["rpkm"] = rpkm
diff --git a/getSNPs.py b/getSNPs.py

index 4778644832ba9b3734687a92421dd60bbeee93a2..64948d405c523146e378c00fa4e204ca3ec75a4b 100755 (executable)
--- a/getSNPs.py
+++ b/getSNPs.py
@@ -174,7 +174,7 @@ def getMatchDict(rds, chrom, withSplices=True):
      for read in readDict[chrom]:
          start = read["start"]
          stop = read["stop"]
      for read in readDict[chrom]:
          start = read["start"]
          stop = read["stop"]
-        if finalDict.has_key(start):
+        if start in finalDict:
              finalDict[start].append(stop)
          else:
              finalDict[start] = [stop]
              finalDict[start].append(stop)
          else:
              finalDict[start] = [stop]
@@ -193,7 +193,7 @@ def getMatchDict(rds, chrom, withSplices=True):
                  start = read["startR"]
                  stop = read["stopR"]
  
                  start = read["startR"]
                  stop = read["stopR"]
  
-            if finalDict.has_key(start):
+            if start in finalDict:
                  finalDict[start].append(stop)
              else:
                  finalDict[start] = [stop]
                  finalDict[start].append(stop)
              else:
                  finalDict[start] = [stop]
@@ -211,12 +211,12 @@ def getMismatchDict(rds, chrom, withSplices=True):
          back = "%s:%s" % (str(start), change)
          uniqBaseDict = {change: 1}
          totalBaseDict = {change: 1}
          back = "%s:%s" % (str(start), change)
          uniqBaseDict = {change: 1}
          totalBaseDict = {change: 1}
-        if mismatchDict.has_key(change_at):
+        if change_at in mismatchDict:
              pos = "%s:%s" % (str(start), change)
              totalCount = mismatchDict[change_at]["totalCount"]
              totalCount += 1
              totalBaseDict = mismatchDict[change_at]["totalBaseDict"]
              pos = "%s:%s" % (str(start), change)
              totalCount = mismatchDict[change_at]["totalCount"]
              totalCount += 1
              totalBaseDict = mismatchDict[change_at]["totalBaseDict"]
-            if totalBaseDict.has_key(change): 
+            if change in totalBaseDict: 
                  totalBaseDict[change] += 1
  
              uniqBaseDict = mismatchDict[change_at]["uniqBaseDict"]
                  totalBaseDict[change] += 1
  
              uniqBaseDict = mismatchDict[change_at]["uniqBaseDict"]
@@ -224,7 +224,7 @@ def getMismatchDict(rds, chrom, withSplices=True):
              back = mismatchDict[change_at]["back"]
              if pos not in back:
                  uniqueReadCount += 1
              back = mismatchDict[change_at]["back"]
              if pos not in back:
                  uniqueReadCount += 1
-                if uniqBaseDict.has_key(change):
+                if change in uniqBaseDict:
                      uniqBaseDict[change] += 1 # dict contains total unique read counts
  
                  back = "%s,%s" % (back, pos)
                      uniqBaseDict[change] += 1 # dict contains total unique read counts
  
                  back = "%s,%s" % (back, pos)
diff --git a/normalizeExpandedExonic.py b/normalizeExpandedExonic.py

index c677d563e18d3f2544fc3c70caee4148c743bf5f..cf02cb3af4e26d987a971817057c9e790dae874d 100644 (file)
--- a/normalizeExpandedExonic.py
+++ b/normalizeExpandedExonic.py
@@ -45,7 +45,10 @@ def main(argv=None):
          except IndexError:
              pass
  
          except IndexError:
              pass
  
-    normalizeExpandedExonic(genome, hitfile, uniquecountfile, splicecountfile, outfile,
+    RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=options.doCache, reportCount=False)    
+    uniqcount = RDS.getUniqsCount()
+    
+    normalizeExpandedExonic(genome, uniqcount, uniquecountfile, splicecountfile, outfile,
                              candidateLines, acceptedfilename, options.fieldID,
                              options.maxLength, options.doCache, options.extendGenome,
                              options.replaceModels)
                              candidateLines, acceptedfilename, options.fieldID,
                              options.maxLength, options.doCache, options.extendGenome,
                              options.replaceModels)
@@ -73,7 +76,7 @@ def makeParser(usage=""):
      return parser
  
  
      return parser
  
  
-def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfilename,
+def normalizeExpandedExonic(genome, uniqcount, uniquecountfilename, splicecountfilename,
                              outfilename, candidateLines=[], acceptedfilename="",
                              fieldID=0, maxLength=1000000000., doCache=False,
                              extendGenome="", replaceModels=False):
                              outfilename, candidateLines=[], acceptedfilename="",
                              fieldID=0, maxLength=1000000000., doCache=False,
                              extendGenome="", replaceModels=False):
@@ -104,8 +107,7 @@ def normalizeExpandedExonic(genome, hitfile, uniquecountfilename, splicecountfil
      if extendGenome != "":
          hg.extendFeatures(extendGenome, replace=replaceModels)
  
      if extendGenome != "":
          hg.extendFeatures(extendGenome, replace=replaceModels)
  
-    RDS = ReadDataset.ReadDataset(hitfile, verbose = True, cache=doCache, reportCount=False)    
-    uniqcount = RDS.getUniqsCount()
+    
      print "%d unique reads" % uniqcount
  
      splicecount = 0
      print "%d unique reads" % uniqcount
  
      splicecount = 0
diff --git a/normalizeFinalExonic.py b/normalizeFinalExonic.py

index 6c1ae3323555722deea8bf91c5254ceb310560cb..6b47a208122ed7d24638dfab16cf3862ffaf214f 100755 (executable)
--- a/normalizeFinalExonic.py
+++ b/normalizeFinalExonic.py
@@ -15,7 +15,7 @@ def main(argv=None):
      if not argv:
          argv = sys.argv
  
      if not argv:
          argv = sys.argv
  
-    usage = "usage: python %prog rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]"
+    usage = "usage: python normalizeFinalExonic rdsfile expandedRPKMfile multicountfile outfile [--multifraction] [--multifold] [--minrpkm minThreshold] [--cache] [--withGID]"
  
      parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
  
      parser = makeParser(usage)
      (options, args) = parser.parse_args(argv[1:])
@@ -29,7 +29,13 @@ def main(argv=None):
      multicountfile = args[2]
      outfilename = args[3]
  
      multicountfile = args[2]
      outfilename = args[3]
  
-    normalizeFinalExonic(rdsfilename, expandedRPKMfile, multicountfile, outfilename,
+    readCounts = {}
+    RDS = ReadDataset.ReadDataset(rdsfilename, verbose=True, cache=options.doCache, reportCount=False)
+    readCounts["uniq"] = RDS.getUniqsCount()
+    readCounts["splice"] = RDS.getSplicesCount()
+    readCounts["multi"] = RDS.getMultiCount()
+
+    normalizeFinalExonic(readCounts, expandedRPKMfile, multicountfile, outfilename,
                           options.reportFraction, options.reportFold, options.minThreshold,
                           options.doCache, options.writeGID)
  
                           options.reportFraction, options.reportFold, options.minThreshold,
                           options.doCache, options.writeGID)
  
@@ -56,7 +62,7 @@ def makeParser(usage=""):
      return parser
  
  
      return parser
  
  
-def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename, outfilename,
+def normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename,
                           reportFraction=False, reportFold=False, minThreshold=0., doCache=False,
                           writeGID=False):
  
                           reportFraction=False, reportFold=False, minThreshold=0., doCache=False,
                           writeGID=False):
  
@@ -69,10 +75,9 @@ def normalizeFinalExonic(rdsfilename, expandedRPKMfilename, multicountfilename,
      elif reportFold:
          print "reporting fold contribution of multireads"
  
      elif reportFold:
          print "reporting fold contribution of multireads"
  
-    RDS = ReadDataset.ReadDataset(rdsfilename, verbose=True, cache=doCache, reportCount=False)
-    uniqcount = RDS.getUniqsCount()
-    splicecount = RDS.getSplicesCount()
-    multicount = RDS.getMultiCount()
+    uniqcount = readCounts["uniq"]
+    splicecount = readCounts["splice"]
+    multicount = readCounts["multi"]
      countDict = {}
      multicountDict = {}
      lengthDict = {}
      countDict = {}
      multicountDict = {}
      lengthDict = {}
diff --git a/rnaEditing.py b/rnaEditing.py

index ce9f4ea4f81c121e174433bbca134954a451f021..f6f714a03c35b9e91117e34bad9329af25c7b990 100644 (file)
--- a/rnaEditing.py
+++ b/rnaEditing.py
@@ -1,7 +1,7 @@
  import sys
  import optparse
  import sys
  import optparse
-from erange import chksnp, getSNPs, getSNPGeneInfo, analyzego, getNovelSNPs, makeSNPtrack, rnaAToIFilter
-from erange.commoncode import countDuplicatesInList, getConfigParser, getConfigOption
+import chksnp, getSNPs, getSNPGeneInfo, analyzego, getNovelSNPs, makeSNPtrack, rnaAToIFilter
+from commoncode import countDuplicatesInList, getConfigParser, getConfigOption
  
  
  def main(argv=None):
  
  
  def main(argv=None):
diff --git a/runRNAPairedAnalysis.py b/runRNAPairedAnalysis.py

new file mode 100644 (file)

index 0000000..ebc25a5
--- /dev/null
+++ b/runRNAPairedAnalysis.py
@@ -0,0 +1,135 @@
+import sys
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigBoolOption, writeLog
+from checkrmask import checkrmask
+from geneMrnaCounts import geneMrnaCounts
+from geneMrnaCountsWeighted import geneMrnaCountsWeighted
+from regionCounts import regionCounts
+from rnafarPairs import rnaFarPairs
+from normalizeFinalExonic import normalizeFinalExonic
+from normalizeExpandedExonic import normalizeExpandedExonic
+from findall import findall, RegionFinder
+
+VERSION = "1.0"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print "runRNAPairedAnalysis: version %s" % VERSION
+    usage = "usage: python runRNAPairedAnalysis.py genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]"
+
+    parser = getParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 3:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    rdsprefix = args[1]
+    repeatmaskdb = args[2]
+    try:
+        modelfile = args[3]
+    except IndexError:
+        modelfile = ""
+
+    runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile=modelfile, replacemodels=options.replacemodels)
+
+
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--replacemodels", action="store_true", dest="replacemodels")
+
+    configParser = getConfigParser()
+    section = "RunRNAPairedAnalysis"
+    replacemodels = getConfigBoolOption(configParser, section, "replacemodels", False)
+    parser.set_defaults(replacemodels=replacemodels)
+
+    return parser
+
+
+def runRNAPairedAnalysis(genome, rdsprefix, repeatmaskdb, modelfile="", replacemodels=False):
+    """ based on original script runRNAPairedAnalysis.sh
+        usage:runRNAPairedAnalysis.sh genome rdsprefix repeatmaskdb [modelfile] [--replacemodels]
+            where rdsprefix is the name of the rds file without the .rds extension
+            use "none" for the repeatmaskdb if you do not have one
+    """
+
+    rdsfile = "%s.rds" % rdsprefix
+    logfile = "rna.log"
+
+    # log the parameters
+    message = "with parameters: %s %s %s" % (genome, rdsprefix, repeatmaskdb)
+    writeLog(logfile, "runRNAPairedAnalysis.py", message)
+
+    # count the unique reads falling on the gene models ; the nomatch files are 
+    # mappable reads that fell outside of the Cistematic gene models and not the 
+    # unmappable of Eland (i.e, the "NM" reads)
+    uniquecountfilename = "%s.uniqs.count" % rdsprefix
+    geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+
+    # calculate a first-pass RPKM to re-weigh the unique reads,
+    # using 'none' for the splice count
+    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
+    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    readCounts = {}
+    readCounts["uniq"] = ucount
+    readCounts["splice"] = mcount
+    readCounts["multi"] = scount
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+
+    # recount the unique reads with weights calculated during the first pass
+    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
+    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+
+    # count splice reads
+    splicecountfilename = "%s.splices.count" % rdsprefix
+    geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+
+    # find new regions outside of gene models with reads piled up 
+    newregionfilename = "%s.newregions.txt" % rdsprefix
+    regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
+    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+
+    # filter out new regions that overlap repeats more than a certain fraction
+    outFileName = "%s.newregions.repstatus" % rdsprefix
+    goodFileName = "%s.newregions.good" % rdsprefix
+    checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
+
+    # calculate the read densities
+    regionfilename = "%s.newregions.checked" % rdsprefix
+    regionCounts(regionfilename, rdsfile, goodFileName, flagRDS=True, cachePages=1, logfilename=logfile)
+
+    # map all candidate regions that have paired ends overlapping with known genes
+    candidatefilename = "%s.candidates.txt" % rdsprefix
+    rnaFarPairs(genome, goodFileName, rdsfile, candidatefilename, doCache=True)
+
+    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    # calculate expanded exonic read density
+    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
+    try:
+        candidatefile = open(candidatefilename)
+        candidateLines = candidatefile.readlines()
+        candidatefile.close()
+    except IOError:
+        candidateLines = []
+
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines,
+                            acceptedfilename=acceptedfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+
+    # weigh multi-reads
+    multicountfilename = "%s.multi.count" % rdsprefix
+    acceptfile = "%s.accepted.rpkm" % rdsprefix
+    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptfile, cachePages=1,
+                           extendGenome=modelfile, replaceModels=replacemodels)
+
+    # calculate final exonic read density
+    outfilename = "%s.final.rpkm" % rdsprefix
+    normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
+\ No newline at end of file
diff --git a/runSNPAnalysis.py b/runSNPAnalysis.py

new file mode 100644 (file)

index 0000000..e511d80
--- /dev/null
+++ b/runSNPAnalysis.py
@@ -0,0 +1,100 @@
+import sys
+import optparse
+from commoncode import writeLog
+from makeSNPtrack import makeSNPtrack
+from getNovelSNPs import getNovelSNPsFromFile
+from getSNPGeneInfo import writeSNPGeneInfo
+from chksnp import chkSNPFile
+from chkSNPrmask import chkSNPrmask
+from getSNPs import writeSNPsToFile
+
+VERSION = "1.0"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print "runSNPAnalysis: version %s" % VERSION
+    usage = "usage: python runSNPAnalysis.py genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile [cachepages]"
+
+    parser = getParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    rdsfile = args[1]
+    label = args[2]
+    rmaskdbfile = args[3]
+    dbsnpfile = args[4]
+    uniqStartMin = args[5]
+    totalRatio = args[6]
+    rpkmfile = args[7]
+    try:
+        cachepages = args[8]
+    except IndexError:
+        cachepages = None
+
+    runSNPAnalysis(genome, rdsfile, label, rmaskdbfile, dbsnpfile, uniqStartMin, totalRatio, rpkmfile, cachepages)
+
+
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+
+    return parser
+
+
+def runSNPAnalysis(genome, rdsfile, label, rmaskdbfile, dbsnpfile, uniqStartMin, totalRatio, rpkmfile, cachepages=None):
+    """ based on original script runSNPAnalysis.sh
+        usage: runSNPAnalysis.sh genome rdsfile label rmaskdbfile dbsnpfile uniqStartMin totalRatio rpkmfile [cachepages]
+               where for each position S:
+                   uniqStartMin = # independent reads supporting base change at S
+                   totalRatio = total # reads supporting base change at S / total # reads that pass through S
+    """
+
+    # log the parameters
+    message = "with parameters: %s %s %s %s %s %s %s %s" % (genome, rdsfile, label, rmaskdbfile, dbsnpfile, uniqStartMin, totalRatio, rpkmfile)
+    writeLog("rna.log", "runStandardAnalysis.py", message)
+
+    # get all SNPs by extracting it from the RDS
+    fullsnpfilename = "%s.snps.txt" % label
+    try:
+        snpCache = int(cachepages)
+    except TypeError, ValueError:
+        snpCache = 0
+
+    doCache = cachepages is not None
+    writeSNPsToFile(rdsfile, uniqStartMin, totalRatio, fullsnpfilename, doCache, cachePages=snpCache, forceChr=True)
+
+    # get SNPs in non-repeat regions only
+    snpfilename = "%s.nr_snps.txt" % label
+    chkSNPrmask(rmaskdbfile, fullsnpfilename, snpfilename, repeats=False, cachePages=cachepages)
+
+    # Check to see if SNPs are found in dbSNP
+    # if dbSNP128.db is not built yet, build it by running buildsnpdb.py - build snp database using the dbSNP database file downloaded from UCSC
+    # usage: python2.5 buildsnpdb.py snpdbdir snpdbname
+    # the database flat file must be in the snpdbdir directory
+    # To build dbSNP database file, run the following command
+    # python2.5 buildsnpdb.py snp128.txt dbSNP128
+
+    # get dbSNP info for SNPs that are found in the dbSNP database
+    dbsnpFileName = "%s.nr_dbsnp.txt" % label
+    chkSNPFile(dbsnpfile, snpfilename, dbsnpFileName, cachePages=cachepages)
+
+    # get gene info for the snps found in dbSNP
+    genefilename = "%s.nr_dbsnp_geneinfo.txt" % label
+    writeSNPGeneInfo(genome, dbsnpFileName, rpkmfile, genefilename, doCache=doCache)
+
+    # get gene info for snps that are not found in dbSNP
+    outfilename = "%s.nr_final.txt" % label
+    getNovelSNPsFromFile(genome, genefilename, outfilename)
+
+    # make bed file for displaying the snps on UCSC genome browser
+    bedfilename = "%s.nr_snps.bed" % label
+    makeSNPtrack(snpfilename, label, bedfilename)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
+\ No newline at end of file
diff --git a/runStandardAnalysis.py b/runStandardAnalysis.py

new file mode 100644 (file)

index 0000000..adfa375
--- /dev/null
+++ b/runStandardAnalysis.py
@@ -0,0 +1,131 @@
+import sys
+import optparse
+import ReadDataset
+from commoncode import getConfigParser, getConfigBoolOption, writeLog
+from checkrmask import checkrmask
+from geneMrnaCounts import geneMrnaCounts
+from geneMrnaCountsWeighted import geneMrnaCountsWeighted
+from getallgenes import getallgenes
+from normalizeFinalExonic import normalizeFinalExonic
+from normalizeExpandedExonic import normalizeExpandedExonic
+from findall import findall, RegionFinder
+
+VERSION = "1.0"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print "runStandardAnalysis: version %s" % VERSION
+    usage = "usage: python runStandardAnalysis.py genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]"
+
+    parser = getParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    rdsprefix = args[1]
+    repeatmaskdb = args[2]
+    bpradius = args[3]
+    try:
+        modelfile = args[4]
+    except IndexError:
+        modelfile = ""
+
+    runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile=modelfile, replacemodels=options.replacemodels)
+
+
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("--replacemodels", action="store_true", dest="replacemodels")
+
+    configParser = getConfigParser()
+    section = "RunStandardAnalysis"
+    replacemodels = getConfigBoolOption(configParser, section, "replacemodels", False)
+    parser.set_defaults(replacemodels=replacemodels)
+
+    return parser
+
+
+def runStandardAnalysis(genome, rdsprefix, repeatmaskdb, bpradius, modelfile="", replacemodels=False):
+    """ based on original script runStandardAnalysis.sh
+        usage: runStandardAnalysis.sh genome rdsprefix repeatmaskdb bpradius [modelfile] [--replacemodels]
+               where rdsprefix is the name of the rds file without the .rds extension
+               use "none" for the repeatmaskdb if you do not have one
+    """
+
+    rdsfile = "%s.rds" % rdsprefix
+    logfile = "rna.log"
+
+    # log the parameters
+    message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+    writeLog(logfile, "runStandardAnalysis.py", message)
+
+    # count the unique reads falling on the gene models ; the nomatch files are
+    # mappable reads that fell outside of the Cistematic gene models and not the
+    # unmappable of Eland (i.e, the "NM" reads)
+    uniquecountfilename = "%s.uniqs.count" % rdsprefix
+    geneMrnaCounts(genome, rdsfile, uniquecountfilename, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1, markGID=True)
+
+    # calculate a first-pass RPKM to re-weigh the unique reads,
+    # using 'none' for the splice count
+    splicecountfilename = "none"
+    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
+    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    readCounts = {}
+    readCounts["uniq"] = ucount
+    readCounts["splice"] = mcount
+    readCounts["multi"] = scount
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, splicecountfilename, initialrpkmfilename, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+
+    # recount the unique reads with weights calculated during the first pass
+    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
+    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, extendGenome=modelfile, replaceModels=replacemodels)
+
+    # count splice reads
+    splicecountfilename = "%s.splices.count" % rdsprefix
+    geneMrnaCounts(genome, rdsfile, splicecountfilename, doSplices=True, doUniqs=False, extendGenome=modelfile, replaceModels=replacemodels, cachePages=1)
+
+    # Alternative 1: find new regions outside of gene models with reads piled up
+    newregionfilename = "%s.newregions.txt" % rdsprefix
+    regionFinder = RegionFinder("RNAFAR", minHits=1, withFlag="NM")
+    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+
+    # Alternative 1: filter out new regions that overlap repeats more than a certain fraction
+    outFileName = "%s.newregions.repstatus" % rdsprefix
+    goodFileName = "%s.newregions.good" % rdsprefix
+    checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
+
+    # map all candidate regions that are within a given radius of a gene in bp
+    candidatefilename = "%s.candidates.txt" % rdsprefix
+    getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+
+    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    # calculate expanded exonic read density
+    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
+    try:
+        candidatefile = open(candidatefilename)
+        candidateLines = candidatefile.readlines()
+        candidatefile.close()
+    except IOError:
+        candidateLines = []
+
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
+                            doCache=True, extendGenome=modelfile, replaceModels=replacemodels)
+
+    # weigh multi-reads
+    multicountfilename = "%s.multi.count" % rdsprefix
+    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1, extendGenome=modelfile,
+                           replaceModels=replacemodels)
+
+    # calculate final exonic read density
+    outfilename = "%s.final.rpkm" % rdsprefix
+    normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
+\ No newline at end of file
diff --git a/runStrandedAnalysis.py b/runStrandedAnalysis.py

new file mode 100644 (file)

index 0000000..55c87ca
--- /dev/null
+++ b/runStrandedAnalysis.py
@@ -0,0 +1,129 @@
+import sys
+import optparse
+import ReadDataset
+from commoncode import writeLog
+from checkrmask import checkrmask
+from geneMrnaCounts import geneMrnaCounts
+from geneMrnaCountsWeighted import geneMrnaCountsWeighted
+from getallgenes import getallgenes
+from normalizeFinalExonic import normalizeFinalExonic
+from normalizeExpandedExonic import normalizeExpandedExonic
+from findall import findall, RegionFinder
+
+VERSION = "1.0"
+
+def main(argv=None):
+    if not argv:
+        argv = sys.argv
+
+    print "runStrandedAnalysis: version %s" % VERSION
+    usage = "usage: python runStrandedAnalysis.py genome rdsprefix repeatmaskdb bpradius"
+
+    parser = getParser(usage)
+    (options, args) = parser.parse_args(argv[1:])
+
+    if len(args) < 4:
+        print usage
+        sys.exit(1)
+
+    genome = args[0]
+    rdsprefix = args[1]
+    repeatmaskdb = args[2]
+    bpradius = args[3]
+
+    runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius)
+
+
+def getParser(usage):
+    parser = optparse.OptionParser(usage=usage)
+
+    return parser
+
+
+def runStrandedAnalysis(genome, rdsprefix, repeatmaskdb, bpradius):
+    """ based on original script runStrandedAnalysis.sh
+        usage: runStrandedAnalysis.sh genome rdsprefix repeatmaskdb bpradius
+               where rdsprefix is the name of the rds file without the .rds extension
+               use "none" for the repeatmaskdb if you do not have one
+    """
+
+    rdsfile = "%s.rds" % rdsprefix
+    logfile = "rna.log"
+
+    # log the parameters
+    message = "with parameters: %s %s %s %s" % (genome, rdsprefix, repeatmaskdb, bpradius)
+    writeLog(logfile, "runStrandedAnalysis.py", message)
+
+    # count the unique reads falling on the gene models ; the nomatch files are 
+    # mappable reads that fell outside of the Cistematic gene models and not the 
+    # unmappable of Eland (i.e, the "NM" reads)
+    uniquecountfilename = "%s.uniqs.count" % rdsprefix
+    geneMrnaCounts(genome, rdsfile, uniquecountfilename, trackStrand=True, cachePages=1, markGID=True)
+
+    # calculate a first-pass RPKM to re-weigh the unique reads,
+    # using 'none' for the splice count
+    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
+    RDS = ReadDataset.ReadDataset(rdsfile, verbose=True, cache=True, reportCount=False)
+    (ucount, mcount, scount) = RDS.getCounts(multi=True, splices=True, reportCombined=False)
+    readCounts = {}
+    readCounts["uniq"] = ucount
+    readCounts["splice"] = mcount
+    readCounts["multi"] = scount
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquecountfilename, "none", initialrpkmfilename, doCache=True)
+
+    # recount the unique reads with weights calculated during the first pass
+    initialrpkmfilename = "%s.firstpass.rpkm" % rdsprefix
+    uniquerecountfilename = "%s.uniqs.recount" % rdsprefix
+    geneMrnaCountsWeighted(genome, rdsfile, initialrpkmfilename, uniquerecountfilename, withUniqs=True, cachePages=1, ignoreSense=False)
+
+    # count splice reads
+    splicecountfilename = "%s.splices.count" % rdsprefix
+    geneMrnaCounts(genome, rdsfile, splicecountfilename, trackStrand=True, doSplices=True, doUniqs=False, cachePages=1)
+
+    # find new regions outside of gene models with reads piled up 
+    newregionfilename = "%s.newregions.txt" % rdsprefix
+    regionFinder = RegionFinder("RNAFARP", minHits=1, withFlag="NM", strandfilter="plus")
+    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False)
+
+    regionFinder = RegionFinder("RNAFARM", minHits=1, withFlag="NM", strandfilter="plus")
+    findall(regionFinder, rdsfile, newregionfilename, logfilename=logfile, rnaSettings=True, cachePages=1, useMulti=False, outputMode="a")
+
+    # filter out new regions that overlap repeats more than a certain fraction
+    newregionfilename = "%s.newregions.txt" % rdsprefix
+    outFileName = "%s.newregions.repstatus" % rdsprefix
+    goodFileName = "%s.newregions.good" % rdsprefix
+    checkrmask(repeatmaskdb, newregionfilename, outFileName, goodFileName, startField=1, cachePages=1, logfilename=logfile)
+
+    #TODO: these calls look wrong
+    # Alternative 2: use a precomputed list of "new" regions (outside of gene models)
+    #python $ERANGEPATH/regionCounts.py $3 $2.nomatch.bed $2.newregions.good $2.stillnomatch.bed
+    #python $ERANGEPATH/regionCounts.py $3 $2.rds $2.newregions.good
+
+    # map all candidate regions that are within a given radius of a gene in bp
+    candidatefilename = "%s.candidates.txt" % rdsprefix
+    getallgenes(genome, goodFileName, candidatefilename, maxRadius=bpradius, trackFar=True, doCache=True, trackStrand=True)
+
+    expandedRPKMfilename = "%s.expanded.rpkm" % rdsprefix
+    # calculate expanded exonic read density
+    acceptedfilename = "%s.accepted.rpkm" % rdsprefix
+    try:
+        candidatefile = open(candidatefilename)
+        candidateLines = candidatefile.readlines()
+        candidatefile.close()
+    except IOError:
+        candidateLines = []
+
+    normalizeExpandedExonic(genome, readCounts["uniq"], uniquerecountfilename, splicecountfilename, expandedRPKMfilename, candidateLines=candidateLines, acceptedfilename=acceptedfilename,
+                            doCache=True)
+
+    # weigh multi-reads
+    multicountfilename = "%s.multi.count" % rdsprefix
+    geneMrnaCountsWeighted(genome, rdsfile, expandedRPKMfilename, multicountfilename, withMulti=True, acceptfile=acceptedfilename, cachePages=1)
+
+    # calculate final exonic read density
+    outfilename = "%s.final.rpkm" % rdsprefix
+    normalizeFinalExonic(readCounts, expandedRPKMfilename, multicountfilename, outfilename, reportFraction=True, doCache=True, writeGID=True)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
+\ No newline at end of file
diff --git a/siteintersects.py b/siteintersects.py

index e012bb49436c92f50a1d8722182d0cd72a754897..58d9e3059ad8a587c10badd3fb8c7cc820462047 100755 (executable)
--- a/siteintersects.py
+++ b/siteintersects.py
@@ -126,6 +126,8 @@ def compareSites(siteFilename, rejectFilename, outfilename, siteDict, rejectSite
              continue
  
          siteNotCommon = True
              continue
  
          siteNotCommon = True
+        #TODO: as handed to me this is O(n^m) - probably could be better...
+        #TODO: QForAli - if there are multiple matches will count them but only write the first match to the file
          for (rstart, rstop, rline) in siteDict[chrom]:
              if regionsOverlap(start, stop, rstart, rstop):
                  commonSites += 1
          for (rstart, rstop, rline) in siteDict[chrom]:
              if regionsOverlap(start, stop, rstart, rstop):
                  commonSites += 1
@@ -162,4 +164,4 @@ def writeRejectSiteFile(siteRejectFilename, rejectSiteDict):
      
  
  if __name__ == "__main__":
      
  
  if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
+\ No newline at end of file
diff --git a/weighMultireads.py b/weighMultireads.py

index 9fd041d4462abb552eef87ebd82430177106231f..3fd6bd8e151b3f909034e856aeb5eb329db7c17f 100755 (executable)
--- a/weighMultireads.py
+++ b/weighMultireads.py
@@ -168,6 +168,7 @@ def reweighUsingPairs(RDS, pairDist, multiIDs, verbose=False):
                  chrom = "chr%s" % multiReadList[index][1]
                  reweighList.append((round(score,3), chrom, start, theID))
  
                  chrom = "chr%s" % multiReadList[index][1]
                  reweighList.append((round(score,3), chrom, start, theID))
  
+            #TODO: QForAli - Is this right? If match index is 0 are we doing nothing?
              if theMatch > 0:
                  RDS.reweighMultireads(reweighList)
                  fixedPair += 1
              if theMatch > 0:
                  RDS.reweighMultireads(reweighList)
                  fixedPair += 1
@@ -329,4 +330,4 @@ def reweighUsingRadius(RDS, radius, multiIDs, readsToSkip=[], verbose=False):
  
  
  if __name__ == "__main__":
  
  
  if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv)
+\ No newline at end of file
author	Sean Upchurch <sau@caltech.edu>
	Mon, 2 Jul 2012 17:17:00 +0000 (10:17 -0700)
committer	Sean Upchurch <sau@caltech.edu>
	Mon, 2 Jul 2012 17:17:00 +0000 (10:17 -0700)
MakeBamFromRds.py		patch \| blob \| history
MakeRdsFromBam.py		patch \| blob \| history
ReadDataset.py		patch \| blob \| history
combineRPKMs.py		patch \| blob \| history
commoncode.py		patch \| blob \| history
findMotifs.py		patch \| blob \| history
findall.py		patch \| blob \| history
geneLocusCounts.py		patch \| blob \| history
geneMrnaCounts.py		patch \| blob \| history
geneMrnaCountsWeighted.py		patch \| blob \| history
getSNPGeneInfo.py		patch \| blob \| history
getSNPs.py		patch \| blob \| history
normalizeExpandedExonic.py		patch \| blob \| history
normalizeFinalExonic.py		patch \| blob \| history
rnaEditing.py		patch \| blob \| history
runRNAPairedAnalysis.py	[new file with mode: 0644]	patch \| blob
runSNPAnalysis.py	[new file with mode: 0644]	patch \| blob
runStandardAnalysis.py	[new file with mode: 0644]	patch \| blob
runStrandedAnalysis.py	[new file with mode: 0644]	patch \| blob
siteintersects.py		patch \| blob \| history
weighMultireads.py		patch \| blob \| history