erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / genomes / __init__.py
diff --git a/cistematic/genomes/__init__.py b/cistematic/genomes/__init__.py
new file mode 100644 (file)
index 0000000..4dee924
--- /dev/null
@@ -0,0 +1,1109 @@
+###########################################################################
+#                                                                         #
+# C O P Y R I G H T   N O T I C E                                         #
+#  Copyright (c) 2003-10 by:                                              #
+#    * California Institute of Technology                                 #
+#                                                                         #
+#    All Rights Reserved.                                                 #
+#                                                                         #
+# Permission is hereby granted, free of charge, to any person             #
+# obtaining a copy of this software and associated documentation files    #
+# (the "Software"), to deal in the Software without restriction,          #
+# including without limitation the rights to use, copy, modify, merge,    #
+# publish, distribute, sublicense, and/or sell copies of the Software,    #
+# and to permit persons to whom the Software is furnished to do so,       #
+# subject to the following conditions:                                    #
+#                                                                         #
+# The above copyright notice and this permission notice shall be          #
+# included in all copies or substantial portions of the Software.         #
+#                                                                         #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
+# SOFTWARE.                                                               #
+###########################################################################
+#
+try:
+    from pysqlite2 import dbapi2 as sqlite
+except:
+    from sqlite3 import dbapi2 as sqlite
+
+import string
+from mmap import *
+from os import environ
+
+if environ.get("CISTEMATIC_ROOT"):
+    cisRoot = environ.get("CISTEMATIC_ROOT") 
+else:
+    cisRoot = "/proj/genome"
+
+__all__ = ["scerevisiae", "athaliana", "celegans", "cbriggsae", "cbrenneri", 
+           "cremanei", "dmelanogaster", "mmusculus", "hsapiens", "rnorvegicus",
+           "spurpuratus", "ggallus", "cfamiliaris", "mdomestica", "xtropicalis",
+           "btaurus", "drerio", "ecaballus"
+]
+
+supportedGenomes = ["athaliana", "cfamiliaris", "mmusculus", "hsapiens",
+                    "rnorvegicus", "ggallus", "scerevisiae", "celegans", "cbriggsae",
+                    "cbrenneri", "cremanei", "dmelanogaster", "mdomestica",
+                    "spurpuratus", "xtropicalis","btaurus", "drerio", "ecaballus"
+]
+
+geneDB  = {}
+chromDict  = {}
+chromRoot  = {}
+chromGeneEntries = {}
+checkGene = {}
+geneInfo   = {}
+getAllGenes = {}
+annotInfo = {}
+goInfo = {}
+allAnnotInfo = {}
+allGOInfo = {}
+
+def compNT(nt):
+    """ returns the complementary basepair to base nt
+    """
+    compDict = {"A": "T", "T": "A",
+                "G": "C", "C": "G",
+                "S": "S",
+                "W": "W",
+                "R": "Y", "Y": "R",
+                "M": "K", "K": "M",
+                "H": "D", "D": "H",
+                "B": "V", "V": "B",
+                "N": "N",
+                "a": "t", "t": "a",
+                "g": "c", "c": "g",
+                "n": "n",
+                "z": "z"
+    }
+
+    return compDict.get(nt, "N")
+
+
+def complement(sequence, length=0):
+    """ returns the complement of the sequence.
+    """
+    newSeq = ""
+    seqLength = len(sequence)
+    if length == seqLength:
+        seqList = list(sequence)
+        seqList.reverse()
+        return "".join(map(compNT, seqList))
+
+    for index in range(seqLength - 1, seqLength - length - 1, -1):
+        try:
+            newSeq += compNT(sequence[index])
+        except:
+            newSeq += "N"
+
+    return newSeq
+
+
+class Genome:
+    genome = ""
+    chromosome = ""
+    version = ""
+    dbFile = ""
+    supported = False
+    oldStyle = False
+    customAnnotations = False
+
+
+    def __init__(self, genome, chrom="", version="", dbFile="", inRAM=False):
+        self.genome = genome
+        if chrom != "":
+            self.chromosome = chrom
+
+        if version != "":
+            self.version = version
+
+        if dbFile != "":
+            self.dbFile = dbFile
+
+        if genome in supportedGenomes and dbFile == "":
+            self.dbFile = geneDB[genome]
+            self.supported = True
+
+        if inRAM:
+            self.memoryBacked = True
+            self.memoryDB = sqlite.connect(":memory:")
+            self.createGeneDB(inMemory=True)
+            sql = self.memoryDB.cursor()
+            try:
+                sql.execute("PRAGMA DEFAULT_CACHE_SIZE = 500000")
+                sql.execute("ATTACH '%s' as diskdb" % self.dbFile)
+                for table in ["gene_entries", "gene_annotation", "gene_ontology", "sequences", "chromosomes", "sequence_features"]:
+                    sql.execute("insert into %s select * from diskdb.%s" % (table, table))
+
+                sql.execute("DETACH diskdb")
+            except:
+                if self.dbFile != "":
+                    print "could not import %s" % self.dbFile
+
+            sql.close()
+            self.createIndices(inMemory=True)
+        else:
+            self.memoryBacked = False
+            self.memoryDB = ""
+
+
+    def setGeneDB(self, dbFile):
+        self.dbFile = dbFile
+        self.supported = False
+
+
+    def setChromosome(self, chrom):
+        self.chromosome = chrom
+
+
+    def checkGene(self, geneID):
+        """ returns True if the geneID matches an entry in the genome database.
+        """
+        (genome, gID) = geneID
+        if genome != self.genome:
+            return False
+
+        try:
+            stmt = "SELECT chromosome  from gene_entries where name = '%s' " % gID
+            res = self.queryDB(stmt)
+            if len(res) > 0:
+                return True
+        except:
+            pass
+
+        return False
+
+
+    def geneInfo(self, geneID, version="1"):
+        (genome, gID) = geneID
+        result = ""
+        if genome != self.genome:
+            return False
+
+        try:
+            stmt = "SELECT chromosome, start, stop, length, orientation from gene_entries where name = '%s' and version = '%s' " % (gID, version)
+            res = self.queryDB(stmt)
+            if len(res) > 0:
+                chrom = res[0]
+                start = int(res[1])
+                stop = int(res[2])
+                if start > stop:
+                    temp = stop
+                    stop = start
+                    start = temp
+
+                length = int(res[3])
+                orientation = res[4]
+                result = (chrom, start, stop, length, orientation)
+        except:
+            pass
+
+        return result
+
+
+    def getallGeneInfo(self, name="", chrom="", version=""):
+        resultDict = {}
+        chromList = []
+        stmt = "select name, chromosome, start, stop, orientation from gene_entries order by name, start "
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            (name, chromosome, start, stop, orientation) = entry
+            if name not in resultDict:
+                resultDict[name] = []
+
+            if chromosome not in chromList:
+                resultDict[chromosome] = []
+                chromList.append(chromosome)
+
+            resultDict[chromosome].append((name, chromosome, int(start), int(stop), orientation))
+
+        return resultDict
+
+
+    def leftGeneDistance(self, geneID, radius=50000, version="1"):
+        result = radius
+        res = self.geneInfo(geneID, version)
+        if res != "":
+            (chrom, start, stop, length, orientation) = res
+            if start > stop:
+                temp = stop
+                stop = start
+                start = temp
+
+            stmt = "SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) " % (chrom, start - radius, start, start - radius, start)
+            res = self.queryDB(stmt, fetchall=True)
+            for entry in res:
+                rstart = int(entry[1])
+                rstop = int(entry[2])
+                if rstart > rstop:
+                    temp = rstop
+                    rstop = rstart
+                    rstart = temp
+
+                thelength = start - rstart
+                if (start - rstop) > 0 and (start - rstop) < thelength:
+                    thelength = start - rstop
+
+                if thelength > 0 and thelength < result:
+                    result = thelength
+
+        return result
+
+    def rightGeneDistance(self, geneID, radius=50000, version="1"):
+        result = radius                
+        res = self.geneInfo(geneID, version)
+        if res != "":
+            (chrom, start, stop, length, orientation) = res
+            if start > stop:
+                temp = stop
+                stop = start
+                start = temp
+
+            stmt = "SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) " % (chrom, stop, stop + radius, stop, stop + radius)
+            res = self.queryDB(stmt, fetchall=True)
+            for entry in res:
+                rstart = int(entry[1])
+                rstop = int(entry[2])
+                if rstart > rstop:
+                    temp = rstop
+                    rstop = rstart
+                    rstart = temp
+
+                thelength = rstop - stop
+                if (rstart - stop) > 0 and (rstart - stop) < thelength:
+                    thelength = rstart - stop
+
+                if thelength > 0 and thelength < result:
+                    result = thelength
+
+        return result
+
+
+    def annotInfo(self, geneID):
+        (genome, gID) = geneID
+        result = []
+        if genome != self.genome:
+            return False
+
+        stmt = "SELECT description from gene_annotation where name = '%s'" % gID
+        res = self.queryDB(stmt, fetchall=True)
+        if len(res) > 0:
+            for entry in res:
+                result.append(entry[0])
+
+        return result
+
+
+    def goInfo(self, geneID):
+        (genome, gID) = geneID
+        result = []
+        if genome != self.genome:
+            return False
+
+        stmt = "SELECT GOID, objType, objName, isNot, GOterm, evidence from gene_ontology where name = '%s'" % gID
+        res = self.queryDB(stmt, fetchall=True)
+        if len(res) > 0:
+            for entry in res:
+                result.append(string.join(entry,"\t"))
+
+        return result
+
+
+    def chromInfo(self, chrom=""):
+        if chrom == "" and self.chromosome != "":
+            chrom = self.chromosome
+
+        stmt = "SELECT sequenceName, storageType from chromosomes where name = '%s' " % chrom
+        res = self.queryDB(stmt)
+        result = "%s\t%s" % (res[0], res[1])
+
+        return result
+
+
+    def allGIDs(self):
+        """ returns [ list of all orf names]
+        """
+        result = []
+        stmt = "SELECT distinct name from gene_entries"
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            result.append(entry[0])
+
+        return result
+
+
+    def allGIDsbyGOID(self, GOID):
+        """ returns [ list of all orf names] that match a particular GOID
+        """
+        result = []
+        stmt = "SELECT distinct name from gene_ontology where GOID = '%s' " % GOID
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            result.append(entry[0])
+
+        return result
+
+
+    def allGOIDs(self):
+        """ returns the list of GOID's in the genome
+        """
+        result = []
+        stmt = "SELECT distinct GOID from gene_ontology"
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            result.append(entry[0])
+
+        return result
+
+
+    def allGOterms(self):
+        """ returns the list of GOID's and their associated GO term in the genome
+        """
+        result = {}
+        stmt = "SELECT distinct GOID, GOterm from gene_ontology"
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            result[str(entry[0])] = str(entry[1])
+
+        return result
+
+
+    def getGOIDCount(self, GOID):
+        """ returns the match count for a particular GOID
+        """
+        stmt = "SELECT distinct name from gene_ontology where GOID = '%s' " % GOID
+        res = self.queryDB(stmt, fetchall=True)
+
+        return len(res)
+
+
+    def allAnnotInfo(self):
+        result = {}
+        stmt = "SELECT name, description from gene_annotation"
+        res = self.queryDB(stmt, fetchall=True)
+
+        for entry in res:
+            geneID = (self.genome, entry[0])
+            if geneID not in result:
+                result[geneID] = []
+
+            result[(self.genome,entry[0])].append(entry[1])
+
+        return result
+
+
+    def allGoInfo(self):
+        result = {}
+        stmt = "SELECT name, GOID, objType, objName, isNot, GOterm, evidence, other from gene_ontology order by name " 
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            geneID = (self.genome, entry[0])
+            if geneID not in result:
+                result[geneID] = []
+
+            result[(self.genome, entry[0])].append(string.join(entry[1:], "\t"))
+
+        return result
+
+
+    def allChromNames(self, partition=1, slice=0):
+        result = []
+        stmt = "SELECT distinct name from chromosomes"
+        res = self.queryDB(stmt, fetchall=True)
+        reslen = len(res)
+        for index in range(reslen):
+            if (index + slice) % partition == 0:
+                entry = res[index]
+                result.append(entry[0]) 
+
+        return result
+
+
+    def geneSeq(self, gID, maskCDS=False, maskLower=False, version="1"):
+        (chrom, start, stop, length, orientation) = self.geneInfo(gID, version)
+        seq = self.sequence(chrom, start, length, maskCDS, maskLower)
+        if orientation == "R":
+            seq = complement(seq, length)
+
+        return seq
+
+
+    def getGeneFeatures(self, gID, type="", version="1"):
+        results = []
+        featureClause = ""
+        (genome, geneid) = gID
+        if len(type) > 0:
+            featureClause = ' and type = "%s" ' % type
+
+        stmt = 'select type, chromosome, start, stop, orientation from sequence_features where name = "%s" and version = "%s"  %s order by start ' % (geneid, str(version), featureClause)
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            (type, chromosome, start, stop, orientation) = entry
+            results.append((type, chromosome, start, stop, orientation))
+
+        return results
+
+
+    def getallGeneFeatures(self):
+        resultDict = {}
+        stmt = "select name, type, chromosome, start, stop, orientation from sequence_features order by name, start "
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            (name, type, chromosome, start, stop, orientation) = entry
+            if name not in resultDict:
+                resultDict[name] = []
+
+            resultDict[name].append((type, chromosome, start, stop, orientation))
+
+        return resultDict
+
+
+    def getFeatureTypes(self, ftype=''):
+        """ Returns the distinct feature types available in the sequence_features
+            tables. Can optionally limit by feature type; the wild-card % can be 
+            used to search feature substrings.
+        """
+        results = []
+        whereClause = ""
+        useLike = False
+        if "%" in ftype:
+            useLike = True
+
+        if len(ftype) > 0 and not useLike:
+            whereClause = 'where type = "%s" ' % ftype
+        elif len(ftype) > 0:
+            whereClause = 'where type LIKE "%s" ' % ftype
+
+        stmt = "select distinct type from sequence_features %s" % whereClause
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            results.append(entry[0])
+
+        return results
+
+    def getFeatures(self, ftype, name="", chrom="", version =""):
+        """ Get features stored in sequence_features that match a feature type, 
+            optionally restricted by name/value, chromosome, or version. Will 
+            search for substrings when ftype and/or name are given with a % to 
+            indicate the location of the wildcard. Returns a dictionary of features 
+            with chromosomes as the keys.
+        """
+        results = {}
+        chromList = []
+        nameClause = ""
+        chromClause = ""
+        versionClause = ""
+        useLike = "="
+        if "%" in ftype:
+            useLike = "LIKE"
+
+        if len(name) > 0:
+            if "%" in name:
+                nameLike = "LIKE"
+            else:
+                nameLike = "="
+
+            nameClause = ' and name %s "%s" ' % (nameLike, name)
+
+        if len(chrom) > 0:
+            chromClause = ' and chromosome = "%s" ' % chrom
+
+        if len(version) > 0:
+            versionClause = ' and version = "%s" ' % version
+
+        stmt = 'select name, version, chromosome, start, stop, orientation, type from sequence_features where type %s "%s" %s %s %s order by type' % (useLike, ftype, chromClause, nameClause, versionClause)
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            (name, version, chromosome, start, stop, orientation, atype) = entry
+            if chromosome not in chromList:
+                results[chromosome] = []
+                chromList.append(chromosome)
+
+            results[chromosome].append((name, version, chromosome, start, stop, orientation, atype))
+
+        return results
+
+
+    def getFeaturesIntersecting(self, chrom, qstart, qlength, ftype=""):
+        """ Return features that are on a particular stretch of the genome. Can optionally
+            limit by feature type; the wild-card % can  be used to search feature substrings.
+        """
+        results = []
+        featureClause = ""
+        qstop = qstart + qlength
+        useLike = False
+        if "%" in ftype:
+            useLike = True
+
+        if len(ftype) > 0 and not useLike:
+            featureClause = ' and type = "%s" ' % ftype
+        elif len(ftype) > 0:
+            featureClause = ' and type LIKE "%s" ' % ftype
+
+        #select all features that encompass our start/stop
+        stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start < %d and stop > %d %s order by start' % (chrom, qstart, qstop, featureClause)
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            (chromosome, start, stop, orientation, name, version, atype) = entry
+            results.append((name, version, chromosome, start, stop, orientation, atype))
+
+        # select all features that have a "stop" between start and stop that aren't yet in the dataset
+        stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            (chromosome, start, stop, orientation, name, version, atype) = entry
+            if (name, version, chromosome, start, stop, orientation, atype) not in results:
+                results.append((name, version, chromosome, start, stop, orientation, atype))
+
+        # select all features on chromosome that have a "start" between start and stop
+        stmt = 'chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start >= %d and start <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            (chromosome, start, stop, orientation, name, version, atype) = entry
+            if (name, version, chromosome, start, stop, orientation, atype) not in results:
+                results.append((name, version, chromosome, start, stop, orientation, atype))
+
+        return results
+
+
+    def getGenesIntersecting(self, chrom, qstart, qlength):
+        """ Return features that are on a ptarticular stretch of the genome. Can optionally
+            limit by feature type; the wild-card % can  be used to search feature substrings.
+        """
+        results = []
+        qstop = qstart + qlength
+        atype = "model"
+        #select all features that encompass our start/stop
+        stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start < %d and stop > %d order by start' % (chrom, qstart, qstop)
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            (chromosome, start, stop, orientation, name, version) = entry
+            results.append((name, version, chromosome, start, stop, orientation, atype))
+
+        # select all features that have a "stop" between start and stop that aren't yet in the dataset
+        stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d order by start' % (chrom, qstart, qstop)
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            (chromosome, start, stop, orientation, name, version) = entry
+            if (name, version, chromosome, start, stop, orientation) not in results:
+                results.append((name, version, chromosome, start, stop, orientation, atype))
+
+        # select all features on chromosome that have a "start" between start and stop
+        stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start >= %d and start <= %d order by start' % (chrom, qstart, qstop)
+        res = self.queryDB(stmt, fetchall=True)
+        for entry in res:
+            (chromosome, start, stop, orientation, name, version) = entry
+            if (name, version, chromosome, start, stop, orientation) not in results:
+                results.append((name, version, chromosome, start, stop, orientation, atype))
+
+        return results
+
+
+    def sequence(self, chrom, start, length, maskCDS=False, maskLower=False):
+        seq = ""
+        if chrom == "" and self.chromosome != "":
+            chrom = self.chromosome
+
+        stmt = "select sequenceName, storageType from chromosomes where name = '%s' " % chrom
+        res = self.queryDB(stmt)
+        seqName = res[0]
+        seqType = res[1]
+        if seqType == "db":
+            stmt = "select sequence from sequences where name = '%s' " % seqName
+            res = self.queryDB(stmt)
+            seq = res[0][start: start + length]
+            res = ""
+        else:
+            chromFile = open("%s%s" % (cisRoot, seqName), "r")
+            mymap = mmap(chromFile.fileno(),1,access=ACCESS_READ)
+            mymap = mmap(chromFile.fileno(), mymap.size(), access=ACCESS_READ)
+            mymap.seek(start)
+            seq = mymap.read(length)
+            chromFile.close()
+
+        if maskCDS:
+            stop = start + length - 1
+            seqArray = list(seq)
+            features = self.getFeaturesIntersecting(chrom, start, length, "CDS")
+            for entry in features:
+                (name, geneID, chromosome, fstart, fstop, forientation, type) = entry
+                if fstart < start:
+                    fstart = start
+
+                if fstop > stop:
+                    fstop = stop
+
+                nstart = fstart - start
+                nstop = fstop - start + 1
+                for pos in range(nstop - nstart):
+                    seqArray[nstart + pos] = "N"
+
+            seq = string.join(seqArray, "")
+
+        if maskLower:
+            seqArray = list(seq)
+            for index in range(len(seqArray)):
+                if seqArray[index] in ["a", "c" , "g", "t"]:
+                    seqArray[index] = "N"
+
+            seq = string.join(seqArray, "")
+
+        return seq
+
+
+    def getChromosomeSequence(self, chrom=""):
+        seq = ""
+        if chrom == "" and self.chromosome != "":
+            chrom = self.chromosome
+
+        stmt = "select sequenceName, storageType from chromosomes where name = '%s' " % chrom
+        res = self.queryDB(stmt)
+        if res == None:
+            print "Could not find chromosome %s" % chrom
+            return ''
+
+        seqName = res[0]
+        seqType = res[1]
+        if seqType == "db":
+            res = self.queryDB('select sequence from sequences where name = "%s"' % chrom)
+            seq = res[0]
+            res = ""
+        else:
+            chromFile = open("%s%s" % (cisRoot, seqName), "r")
+            seq = chromFile.readline()
+            chromFile.close()
+
+        return seq
+
+
+    def chromGeneEntries(self, chrom="", lowerbound=-1, upperbound=-1):
+        result = []
+        if chrom == "" and self.chromosome != "":
+            chrom = self.chromosome
+
+        stmt = "select distinct start, stop, orientation, name from gene_entries where chromosome = '%s' " % chrom
+        res = self.queryDB(stmt, fetchall=True)
+        if lowerbound > 0 and upperbound > 0:
+            for entry in res:
+                start = int(entry[0])
+                stop = int(entry[1])
+                if stop < start:
+                    start, stop = stop, start
+
+                if stop < lowerbound or start > upperbound:
+                    continue
+
+                result.append((start, stop, entry[2], (self.genome, entry[3])))
+        else:
+            for entry in res:
+                start = int(entry[0])
+                stop = int(entry[1])
+                if stop < start:
+                    start, stop = stop, start
+
+                result.append((start, stop, entry[2], (self.genome, entry[3])))
+
+        return result
+
+
+    def createGeneDB(self, dbFile="", inMemory=False):
+        if len(dbFile) > 0:
+            self.dbFile = dbFile
+
+        tableDict = {"gene_entries": "ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start varchar, stop varchar, length varchar, orientation varchar, feature varchar",
+                     "gene_annotation": "ID INTEGER PRIMARY KEY, name varchar, description varchar",
+                     "gene_ontology": "ID INTEGER PRIMARY KEY, name varchar, GOID varchar, objType varchar, objName varchar, isNot varchar, GOterm varchar, evidence varchar, other varchar",
+                     "sequences": "ID INTEGER PRIMARY KEY, name varchar, sequenceLength varchar, sequenceType varchar, sequence varchar",
+                     "chromosomes": "ID INTEGER PRIMARY KEY, name varchar, sequenceName varchar, storageType varchar",
+                     "sequence_features": "ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start int, stop int, length varchar, orientation varchar, type varchar"
+        }
+        
+        for table in tableDict.keys():
+            stmt = "create table %s(%s)" % (table, tableDict[table])
+            self.writeDB(stmt, useMemory=inMemory)
+
+
+    def createIndices(self, inMemory=False):
+        indexDict = {"nameIndex1": ("gene_entries", "name"),
+                     "nameIndex2": ("gene_annotation", "name"),
+                     "nameIndex3": ("gene_ontology", "name"),
+                     "goidIndex": ("gene_ontology", "GOID"),
+                     "nameIndex4": ("sequences", "name"),
+                     "nameIndex5": ("chromosomes", "name"),
+                     "geneIDIndex": ("sequence_features", "name, type"),
+                     "posIndex": ("sequence_features", "chromosome, start, stop, type"),
+                     "typeIndex": ("sequence_features", "type")
+        }
+
+        for indexName in indexDict.keys():
+            (tableName, fields) = indexDict[indexName]
+            stmt = "CREATE INDEX %s on %s(%s)" % (indexName, tableName, fields)
+            self.writeDB(stmt, useMemory=inMemory)
+
+
+    def addGeneEntry(self, geneID, chrom, start, stop, orientation, feature="", version=1.0):
+        (genome, gID) = geneID
+        length = str(abs(int(start) - int(stop)) + 1)
+        stmt = "insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (gID, str(version), chrom, start, stop, length, orientation, feature)
+        self.writeDB(stmt)
+
+
+    def addFeatureEntry(self, name, geneID, chrom, start, stop, orientation, type=""):
+        (genome, gID) = geneID
+        length = str(abs(int(start) - int(stop)) + 1)
+        stmt = "insert into sequence_features(ID, name, geneID, chromosome, start, stop, length, orientation, type) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (name, gID, chrom, start, stop, length, orientation, type)
+        self.writeDB(stmt)
+
+
+    def addAnnotation(self, geneID, description):
+        (genome, gID) = geneID
+        stmt = "insert into gene_annotation(ID, name, description) values (NULL, '%s', '%s') " % (gID, description)
+        self.writeDB(stmt)
+
+
+    def addGoInfo(self, geneID, GOID, objType="", objName="", isNot="", GOterm="", evidence="", other=""):
+        (genome, gID) = geneID
+        stmt = "insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, '%s', '%s',  '%s', '%s', '%s', '%s', '%s', '%s') " % (gID, str(GOID), objType, objName, isNot, GOterm, evidence, other)
+        self.writeDB(stmt)
+
+
+    def addSequence(self, geneID, seq, seqType, seqLen="-1"):
+        (genome, gID) = geneID
+        if int(seqLen) < 0:
+            seqLen = str(len(seq))
+
+        stmt = "insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, '%s', '%s', '%s', '%s')" % (gID, str(seqLen), seqType, seq)
+        self.writeDB(stmt)
+
+
+    def addChromosomeEntry(self, chromo, seqName, storageType):
+        stmt = "insert into chromosomes(ID, name, sequenceName, storageType)  values (NULL, '%s', '%s', '%s')" % (chromo, seqName, storageType)
+        self.writeDB(stmt)
+
+
+    def addSequenceBatch(self, entryArray, inMemory=False):
+        stmtArray = []
+        stmt = "insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, ?, ?, ?, ?)" 
+        for entry in entryArray:
+            (geneID, seq, seqType, seqLen) = entry
+            (genome, gID) = geneID
+            if int(seqLen) < 0:
+                seqLen = str(len(seq))
+
+            stmtArray.append((gID, str(seqLen), seqType, seq))
+
+        self.writeBatchDB(stmt, stmtArray)
+
+
+    def addChromosomeEntryBatch(self, entryArray, inMemory=False):
+        stmt = "insert into chromosomes(ID, name, sequenceName, storageType)  values (NULL, ?, ?, ?)" 
+        self.writeBatchDB(stmt, entryArray, useMemory=inMemory)
+
+
+    def addGeneEntryBatch(self, entryArray, inMemory=False):
+        stmtArray = []
+        stmt = "insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) "
+        for entry in entryArray:
+            (geneID, chrom, start, stop, orientation, feature, version) = entry
+            (genome, gID) = geneID
+            length = str(abs(int(start) - int(stop)) + 1)
+            stmtArray.append((gID, str(version), chrom, start, stop, length, orientation, feature))
+
+        self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
+
+
+    def addFeatureEntryBatch(self, entryArray, inMemory=False):
+        stmtArray = []
+        stmt = "insert into sequence_features(ID, name, version, chromosome, start, stop, length, orientation, type) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) " 
+        for entry in entryArray:
+            (geneID, version, chrom, start, stop, orientation, type) = entry
+            (genome, gID) = geneID
+            length = str(abs(int(start) - int(stop)) + 1)
+            stmtArray.append((gID, version, chrom, int(start), int(stop), length, orientation, type))
+
+        self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
+
+
+    def addAnnotationBatch(self, entryArray, inMemory=False):
+        stmtArray = []
+        stmt = "insert into gene_annotation(ID, name, description) values (NULL, ?, ?) " 
+        for entry in entryArray:
+            (geneID, description) = entry
+            (genome, gID) = geneID
+            stmtArray.append((gID, description))
+
+        self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
+
+
+    def addGoInfoBatch(self, entryArray, inMemory=False):
+        stmtArray = []
+        stmt = "insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, ?, ?,  ?, ?, ?, ?, ?, ?) " 
+        for entry in entryArray:
+            (geneID, GOID, objType, objName, isNot, GOterm, evidence, other) = entry
+            (genome, gID) = geneID
+            stmtArray.append((gID, str(GOID), objType, objName, isNot, GOterm, evidence, other))
+
+        self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
+
+
+    def extendFeatures(self, featureFile, fileType="cistematic", replace=False):
+        geneEntryList = []
+        geneFeatureList = []
+        currentGene = ""
+        gstart = -1
+        gstop = -1
+        gchrom = ""
+        gsense = ""
+        ftype = ""
+        senseArray = {"+": "F",
+                      "-": "R",
+                      ".": "F"
+        }
+
+        featfile = open(featureFile)
+        if fileType == "cistematic":
+            for line in featfile:
+                if line[0] == "#":
+                    continue
+
+                fields = line.strip().split()
+                if currentGene == "":
+                    currentGene = fields[0]
+                    gchrom = fields[2]
+                    gstart = int(fields[3])
+                    gsense = senseArray[fields[5]]
+
+                if fields[0] != currentGene:
+                    geneEntryList.append(((self.genome, currentGene), gchrom, gstart, gstop, gsense, "Transcript", "1"))
+                    currentGene = fields[0]
+                    gchrom = fields[2]
+                    gstart = int(fields[3])
+                    gsense = senseArray[fields[5]]
+
+                lstart = int(fields[3])
+                gstop = int(fields[4])
+                ftype = fields[6]
+                geneFeatureList.append(((self.genome, currentGene), "1", gchrom, lstart, gstop, gsense, ftype))
+        elif fileType == "UCSC":
+            for line in featfile:
+                if line[0] == "#":
+                    continue
+
+                geneFields = line.split("\t")
+                exonNum = int(geneFields[8])
+                exonStarts = geneFields[9].split(",")
+                exonStops = geneFields[10].split(",")
+                gchrom = geneFields[2][3:]
+                gsense = senseArray[geneFields[3]]
+                gstop = int(geneFields[7]) - 1
+                gstart = int(geneFields[6]) - 1
+                geneID = ("generic", geneFields[1])
+                for index in range(exonNum):
+                    estart = int(exonStarts[index]) - 1
+                    estop = int(exonStops[index]) - 1
+                    if estart >= gstart and estop <= gstop:
+                        geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, "CDS"))
+                    elif estop <= gstart:
+                        if gsense == "F":
+                            fType = "5UTR"
+                        else:
+                            fType = "3UTR"
+
+                        geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, fType))
+                    elif estart >= gstop:
+                        if gsense == "F":
+                            fType = "3UTR"
+                        else:
+                            fType = "5UTR"
+
+                        geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, fType))
+                    elif estart <= gstop and estart > gstart:
+                        if gsense == 'F':
+                            fType = '3UTR'
+                        else:
+                            fType = '5UTR'
+
+                        geneFeatureList.append((geneID, "1", gchrom, estart, gstop, gsense, "CDS"))
+                        geneFeatureList.append((geneID, "1", gchrom, gstop + 1, estop, gsense, fType))
+                    elif estart < gstart and estop <= gstop:
+                        if gsense == "F":
+                            fType = "5UTR"
+                        else:
+                            fType = "3UTR"
+
+                        geneFeatureList.append((geneID, "1", gchrom, estart, gstart - 1, gsense, fType))
+                        geneFeatureList.append((geneID, "1", gchrom, gstart, estop, gsense, "CDS"))
+                    else:
+                        if gsense == "F":
+                            fType1 = "5UTR"
+                            fType2 = "3UTR"
+                        else:
+                            fType1 = "3UTR"
+                            fType2 = "5UTR"
+
+                        geneFeatureList.append((geneID, "1", gchrom, estart, gstart - 1, gsense, fType1))
+                        geneFeatureList.append((geneID, "1", gchrom, gstart, gstop, gsense, "CDS"))
+                        geneFeatureList.append((geneID, "1", gchrom, gstop + 1, estop - 1, gsense, fType2))
+        else:
+            print "%s format not supported yet"
+            featfile.close()
+            return
+
+        featfile.close()
+        if replace==True:
+            self.queryDB("delete from sequence_features", useMemory=True)
+            self.queryDB("delete from gene_entries", useMemory=True)        
+
+        self.addGeneEntryBatch(geneEntryList, inMemory=True)
+        self.addFeatureEntryBatch(geneFeatureList, inMemory=True)
+
+
+    def queryDB(self, stmt, fetchall=False, useMemory=False):
+        if useMemory or self.memoryBacked:
+            db = self.memoryDB
+        else:
+            db = sqlite.connect(self.dbFile, timeout = 60)
+
+        sql = db.cursor()
+        sql.execute(stmt)
+        if fetchall:
+            results = sql.fetchall()
+        else:
+            results = sql.fetchone()
+
+        sql.close()
+        if not (useMemory or self.memoryBacked):
+            db.close()         
+
+        return results
+
+
+    def writeDB(self, stmt, useMemory=False):
+        if useMemory:
+            db = self.memoryDB
+        else:
+            db = sqlite.connect(self.dbFile, timeout = 60)
+
+        sql = db.cursor()
+        sql.execute(stmt)
+        db.commit()
+        sql.close()
+        if not useMemory:
+            db.close()
+
+
+    def writeBatchDB(self, stmt, stmtArray, useMemory=False):
+        if useMemory:
+            db = self.memoryDB
+        else:
+            db = sqlite.connect(self.dbFile, timeout = 60)
+
+        sql = db.cursor()
+        try:
+            sql.executemany(stmt, stmtArray)
+        except:
+            print "writeBatchDB: problem with %s" % stmt
+
+        db.commit()
+        sql.close()
+        if not useMemory:
+            db.close()
+
+
+def processSql(sqlFile, keyFields={}):
+    """ process a UCSC formatted .sql file to identify the table name and the position of key fields. Specifying the 
+        name of important fields in keyFields is highly recommended. A key in keyFields represents the sql column 
+        name in the file, while the corresponding value corresponds to the feature name. Blank values mean that the 
+        same field name will be used. For example, one possible keyFields dict would be:
+        keyFields = {'chromStart':'start', 'chromEnd':'stop', 'chrom':'', 'name':'', 'score':'value', 'strand':'orientation'}
+    """
+    fields = {}
+    infile = open(sqlFile)
+    line = ""
+    while "CREATE TABLE" not in line:
+        line = infile.readline()
+        continue
+
+    tabFields = line.split()
+    tabName = tabFields[2]
+    if keyFields == {}:
+        fields["chrom"] = 1
+        fields["start"] = 2
+        fields["stop"] = 3
+        fields["name"] = 4
+    else:
+        index = 0
+        for line in infile:
+            lineFields = line.split()
+            if lineFields[0] in keyFields.keys():
+                if keyFields[lineFields[0]] == "":
+                    outfield = lineFields[0]
+                else:
+                    outfield = keyFields[lineFields[0]]
+
+                fields[outfield] = index
+
+            index += 1
+
+    infile.close()
+
+    return (tabName, fields)
+
+
+def processTrack(genomeObject, dataFile, typeName="MISC", dataFields={}, version="1"):
+    """ process data for a UCSC track, given an instantiated genome object, the data, the name for the feature 
+        type in sequence_features, the dataFields in the format returned by processSql(), and a version.
+        If genomeObject is the empty string '', then processTrack() will simply print out the aggregate records,
+        rather than use addFeatureEntryBatch() to record the added features.
+
+        Note that numberic values are overloaded into the name field using @ as a delimiter.
+    """
+    records = []
+    if dataFields == {}:
+        dataFields["chrom"] = 1
+        dataFields["start"] = 2
+        dataFields["stop"] = 3
+        dataFields["name"] = 4
+    
+    infile = open(dataFile)
+    for line in infile:
+        fields = line.strip().split("\t")
+        if "name" in dataFields:
+            recName = fields[dataFields["name"]]
+        else:
+            recName = typeName
+
+        if "value" in dataFields:
+            recName += "@" + fields[dataFields["value"]]
+
+        start = int(fields[dataFields["start"]])
+        stop = int(fields[dataFields["stop"]])
+        chrom = fields[dataFields["chrom"]][3:]
+        chrom.replace("_random", "rand")
+        orientation = "F"
+        if "orientation" in dataFields:
+            if fields[dataFields["orientation"]] == "-":
+                orientation = "R"
+
+        if "version" in dataFields:
+            version = fields[dataFields["version"]]
+
+        records.append((("placeholder", recName), version, chrom, start, stop, orientation, typeName.upper()))
+
+    if genomeObject != "":
+        genomeObject.addFeatureEntryBatch(records)
+    else:
+        print records
+
+
+# Voodoo to get recursive imports working
+for genome in supportedGenomes:
+    importString = "import %s" % genome
+    exec importString
+    geneDB[genome]  = eval("%s.geneDB" % genome)
\ No newline at end of file