+###########################################################################
+# #
+# C O P Y R I G H T N O T I C E #
+# Copyright (c) 2003-10 by: #
+# * California Institute of Technology #
+# #
+# All Rights Reserved. #
+# #
+# Permission is hereby granted, free of charge, to any person #
+# obtaining a copy of this software and associated documentation files #
+# (the "Software"), to deal in the Software without restriction, #
+# including without limitation the rights to use, copy, modify, merge, #
+# publish, distribute, sublicense, and/or sell copies of the Software, #
+# and to permit persons to whom the Software is furnished to do so, #
+# subject to the following conditions: #
+# #
+# The above copyright notice and this permission notice shall be #
+# included in all copies or substantial portions of the Software. #
+# #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
+# SOFTWARE. #
+###########################################################################
+#
+try:
+ from pysqlite2 import dbapi2 as sqlite
+except:
+ from sqlite3 import dbapi2 as sqlite
+
+import string
+from mmap import *
+from os import environ
+
+if environ.get("CISTEMATIC_ROOT"):
+ cisRoot = environ.get("CISTEMATIC_ROOT")
+else:
+ cisRoot = "/proj/genome"
+
+__all__ = ["scerevisiae", "athaliana", "celegans", "cbriggsae", "cbrenneri",
+ "cremanei", "dmelanogaster", "mmusculus", "hsapiens", "rnorvegicus",
+ "spurpuratus", "ggallus", "cfamiliaris", "mdomestica", "xtropicalis",
+ "btaurus", "drerio", "ecaballus"
+]
+
+supportedGenomes = ["athaliana", "cfamiliaris", "mmusculus", "hsapiens",
+ "rnorvegicus", "ggallus", "scerevisiae", "celegans", "cbriggsae",
+ "cbrenneri", "cremanei", "dmelanogaster", "mdomestica",
+ "spurpuratus", "xtropicalis","btaurus", "drerio", "ecaballus"
+]
+
+geneDB = {}
+chromDict = {}
+chromRoot = {}
+chromGeneEntries = {}
+checkGene = {}
+geneInfo = {}
+getAllGenes = {}
+annotInfo = {}
+goInfo = {}
+allAnnotInfo = {}
+allGOInfo = {}
+
+def compNT(nt):
+ """ returns the complementary basepair to base nt
+ """
+ compDict = {"A": "T", "T": "A",
+ "G": "C", "C": "G",
+ "S": "S",
+ "W": "W",
+ "R": "Y", "Y": "R",
+ "M": "K", "K": "M",
+ "H": "D", "D": "H",
+ "B": "V", "V": "B",
+ "N": "N",
+ "a": "t", "t": "a",
+ "g": "c", "c": "g",
+ "n": "n",
+ "z": "z"
+ }
+
+ return compDict.get(nt, "N")
+
+
+def complement(sequence, length=0):
+ """ returns the complement of the sequence.
+ """
+ newSeq = ""
+ seqLength = len(sequence)
+ if length == seqLength:
+ seqList = list(sequence)
+ seqList.reverse()
+ return "".join(map(compNT, seqList))
+
+ for index in range(seqLength - 1, seqLength - length - 1, -1):
+ try:
+ newSeq += compNT(sequence[index])
+ except:
+ newSeq += "N"
+
+ return newSeq
+
+
+class Genome:
+ genome = ""
+ chromosome = ""
+ version = ""
+ dbFile = ""
+ supported = False
+ oldStyle = False
+ customAnnotations = False
+
+
+ def __init__(self, genome, chrom="", version="", dbFile="", inRAM=False):
+ self.genome = genome
+ if chrom != "":
+ self.chromosome = chrom
+
+ if version != "":
+ self.version = version
+
+ if dbFile != "":
+ self.dbFile = dbFile
+
+ if genome in supportedGenomes and dbFile == "":
+ self.dbFile = geneDB[genome]
+ self.supported = True
+
+ if inRAM:
+ self.memoryBacked = True
+ self.memoryDB = sqlite.connect(":memory:")
+ self.createGeneDB(inMemory=True)
+ sql = self.memoryDB.cursor()
+ try:
+ sql.execute("PRAGMA DEFAULT_CACHE_SIZE = 500000")
+ sql.execute("ATTACH '%s' as diskdb" % self.dbFile)
+ for table in ["gene_entries", "gene_annotation", "gene_ontology", "sequences", "chromosomes", "sequence_features"]:
+ sql.execute("insert into %s select * from diskdb.%s" % (table, table))
+
+ sql.execute("DETACH diskdb")
+ except:
+ if self.dbFile != "":
+ print "could not import %s" % self.dbFile
+
+ sql.close()
+ self.createIndices(inMemory=True)
+ else:
+ self.memoryBacked = False
+ self.memoryDB = ""
+
+
+ def setGeneDB(self, dbFile):
+ self.dbFile = dbFile
+ self.supported = False
+
+
+ def setChromosome(self, chrom):
+ self.chromosome = chrom
+
+
+ def checkGene(self, geneID):
+ """ returns True if the geneID matches an entry in the genome database.
+ """
+ (genome, gID) = geneID
+ if genome != self.genome:
+ return False
+
+ try:
+ stmt = "SELECT chromosome from gene_entries where name = '%s' " % gID
+ res = self.queryDB(stmt)
+ if len(res) > 0:
+ return True
+ except:
+ pass
+
+ return False
+
+
+ def geneInfo(self, geneID, version="1"):
+ (genome, gID) = geneID
+ result = ""
+ if genome != self.genome:
+ return False
+
+ try:
+ stmt = "SELECT chromosome, start, stop, length, orientation from gene_entries where name = '%s' and version = '%s' " % (gID, version)
+ res = self.queryDB(stmt)
+ if len(res) > 0:
+ chrom = res[0]
+ start = int(res[1])
+ stop = int(res[2])
+ if start > stop:
+ temp = stop
+ stop = start
+ start = temp
+
+ length = int(res[3])
+ orientation = res[4]
+ result = (chrom, start, stop, length, orientation)
+ except:
+ pass
+
+ return result
+
+
+ def getallGeneInfo(self, name="", chrom="", version=""):
+ resultDict = {}
+ chromList = []
+ stmt = "select name, chromosome, start, stop, orientation from gene_entries order by name, start "
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ (name, chromosome, start, stop, orientation) = entry
+ if name not in resultDict:
+ resultDict[name] = []
+
+ if chromosome not in chromList:
+ resultDict[chromosome] = []
+ chromList.append(chromosome)
+
+ resultDict[chromosome].append((name, chromosome, int(start), int(stop), orientation))
+
+ return resultDict
+
+
+ def leftGeneDistance(self, geneID, radius=50000, version="1"):
+ result = radius
+ res = self.geneInfo(geneID, version)
+ if res != "":
+ (chrom, start, stop, length, orientation) = res
+ if start > stop:
+ temp = stop
+ stop = start
+ start = temp
+
+ stmt = "SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) " % (chrom, start - radius, start, start - radius, start)
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ rstart = int(entry[1])
+ rstop = int(entry[2])
+ if rstart > rstop:
+ temp = rstop
+ rstop = rstart
+ rstart = temp
+
+ thelength = start - rstart
+ if (start - rstop) > 0 and (start - rstop) < thelength:
+ thelength = start - rstop
+
+ if thelength > 0 and thelength < result:
+ result = thelength
+
+ return result
+
+ def rightGeneDistance(self, geneID, radius=50000, version="1"):
+ result = radius
+ res = self.geneInfo(geneID, version)
+ if res != "":
+ (chrom, start, stop, length, orientation) = res
+ if start > stop:
+ temp = stop
+ stop = start
+ start = temp
+
+ stmt = "SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) " % (chrom, stop, stop + radius, stop, stop + radius)
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ rstart = int(entry[1])
+ rstop = int(entry[2])
+ if rstart > rstop:
+ temp = rstop
+ rstop = rstart
+ rstart = temp
+
+ thelength = rstop - stop
+ if (rstart - stop) > 0 and (rstart - stop) < thelength:
+ thelength = rstart - stop
+
+ if thelength > 0 and thelength < result:
+ result = thelength
+
+ return result
+
+
+ def annotInfo(self, geneID):
+ (genome, gID) = geneID
+ result = []
+ if genome != self.genome:
+ return False
+
+ stmt = "SELECT description from gene_annotation where name = '%s'" % gID
+ res = self.queryDB(stmt, fetchall=True)
+ if len(res) > 0:
+ for entry in res:
+ result.append(entry[0])
+
+ return result
+
+
+ def goInfo(self, geneID):
+ (genome, gID) = geneID
+ result = []
+ if genome != self.genome:
+ return False
+
+ stmt = "SELECT GOID, objType, objName, isNot, GOterm, evidence from gene_ontology where name = '%s'" % gID
+ res = self.queryDB(stmt, fetchall=True)
+ if len(res) > 0:
+ for entry in res:
+ result.append(string.join(entry,"\t"))
+
+ return result
+
+
+ def chromInfo(self, chrom=""):
+ if chrom == "" and self.chromosome != "":
+ chrom = self.chromosome
+
+ stmt = "SELECT sequenceName, storageType from chromosomes where name = '%s' " % chrom
+ res = self.queryDB(stmt)
+ result = "%s\t%s" % (res[0], res[1])
+
+ return result
+
+
+ def allGIDs(self):
+ """ returns [ list of all orf names]
+ """
+ result = []
+ stmt = "SELECT distinct name from gene_entries"
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ result.append(entry[0])
+
+ return result
+
+
+ def allGIDsbyGOID(self, GOID):
+ """ returns [ list of all orf names] that match a particular GOID
+ """
+ result = []
+ stmt = "SELECT distinct name from gene_ontology where GOID = '%s' " % GOID
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ result.append(entry[0])
+
+ return result
+
+
+ def allGOIDs(self):
+ """ returns the list of GOID's in the genome
+ """
+ result = []
+ stmt = "SELECT distinct GOID from gene_ontology"
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ result.append(entry[0])
+
+ return result
+
+
+ def allGOterms(self):
+ """ returns the list of GOID's and their associated GO term in the genome
+ """
+ result = {}
+ stmt = "SELECT distinct GOID, GOterm from gene_ontology"
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ result[str(entry[0])] = str(entry[1])
+
+ return result
+
+
+ def getGOIDCount(self, GOID):
+ """ returns the match count for a particular GOID
+ """
+ stmt = "SELECT distinct name from gene_ontology where GOID = '%s' " % GOID
+ res = self.queryDB(stmt, fetchall=True)
+
+ return len(res)
+
+
+ def allAnnotInfo(self):
+ result = {}
+ stmt = "SELECT name, description from gene_annotation"
+ res = self.queryDB(stmt, fetchall=True)
+
+ for entry in res:
+ geneID = (self.genome, entry[0])
+ if geneID not in result:
+ result[geneID] = []
+
+ result[(self.genome,entry[0])].append(entry[1])
+
+ return result
+
+
+ def allGoInfo(self):
+ result = {}
+ stmt = "SELECT name, GOID, objType, objName, isNot, GOterm, evidence, other from gene_ontology order by name "
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ geneID = (self.genome, entry[0])
+ if geneID not in result:
+ result[geneID] = []
+
+ result[(self.genome, entry[0])].append(string.join(entry[1:], "\t"))
+
+ return result
+
+
+ def allChromNames(self, partition=1, slice=0):
+ result = []
+ stmt = "SELECT distinct name from chromosomes"
+ res = self.queryDB(stmt, fetchall=True)
+ reslen = len(res)
+ for index in range(reslen):
+ if (index + slice) % partition == 0:
+ entry = res[index]
+ result.append(entry[0])
+
+ return result
+
+
+ def geneSeq(self, gID, maskCDS=False, maskLower=False, version="1"):
+ (chrom, start, stop, length, orientation) = self.geneInfo(gID, version)
+ seq = self.sequence(chrom, start, length, maskCDS, maskLower)
+ if orientation == "R":
+ seq = complement(seq, length)
+
+ return seq
+
+
+ def getGeneFeatures(self, gID, type="", version="1"):
+ results = []
+ featureClause = ""
+ (genome, geneid) = gID
+ if len(type) > 0:
+ featureClause = ' and type = "%s" ' % type
+
+ stmt = 'select type, chromosome, start, stop, orientation from sequence_features where name = "%s" and version = "%s" %s order by start ' % (geneid, str(version), featureClause)
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ (type, chromosome, start, stop, orientation) = entry
+ results.append((type, chromosome, start, stop, orientation))
+
+ return results
+
+
+ def getallGeneFeatures(self):
+ resultDict = {}
+ stmt = "select name, type, chromosome, start, stop, orientation from sequence_features order by name, start "
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ (name, type, chromosome, start, stop, orientation) = entry
+ if name not in resultDict:
+ resultDict[name] = []
+
+ resultDict[name].append((type, chromosome, start, stop, orientation))
+
+ return resultDict
+
+
+ def getFeatureTypes(self, ftype=''):
+ """ Returns the distinct feature types available in the sequence_features
+ tables. Can optionally limit by feature type; the wild-card % can be
+ used to search feature substrings.
+ """
+ results = []
+ whereClause = ""
+ useLike = False
+ if "%" in ftype:
+ useLike = True
+
+ if len(ftype) > 0 and not useLike:
+ whereClause = 'where type = "%s" ' % ftype
+ elif len(ftype) > 0:
+ whereClause = 'where type LIKE "%s" ' % ftype
+
+ stmt = "select distinct type from sequence_features %s" % whereClause
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ results.append(entry[0])
+
+ return results
+
+ def getFeatures(self, ftype, name="", chrom="", version =""):
+ """ Get features stored in sequence_features that match a feature type,
+ optionally restricted by name/value, chromosome, or version. Will
+ search for substrings when ftype and/or name are given with a % to
+ indicate the location of the wildcard. Returns a dictionary of features
+ with chromosomes as the keys.
+ """
+ results = {}
+ chromList = []
+ nameClause = ""
+ chromClause = ""
+ versionClause = ""
+ useLike = "="
+ if "%" in ftype:
+ useLike = "LIKE"
+
+ if len(name) > 0:
+ if "%" in name:
+ nameLike = "LIKE"
+ else:
+ nameLike = "="
+
+ nameClause = ' and name %s "%s" ' % (nameLike, name)
+
+ if len(chrom) > 0:
+ chromClause = ' and chromosome = "%s" ' % chrom
+
+ if len(version) > 0:
+ versionClause = ' and version = "%s" ' % version
+
+ stmt = 'select name, version, chromosome, start, stop, orientation, type from sequence_features where type %s "%s" %s %s %s order by type' % (useLike, ftype, chromClause, nameClause, versionClause)
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ (name, version, chromosome, start, stop, orientation, atype) = entry
+ if chromosome not in chromList:
+ results[chromosome] = []
+ chromList.append(chromosome)
+
+ results[chromosome].append((name, version, chromosome, start, stop, orientation, atype))
+
+ return results
+
+
+ def getFeaturesIntersecting(self, chrom, qstart, qlength, ftype=""):
+ """ Return features that are on a particular stretch of the genome. Can optionally
+ limit by feature type; the wild-card % can be used to search feature substrings.
+ """
+ results = []
+ featureClause = ""
+ qstop = qstart + qlength
+ useLike = False
+ if "%" in ftype:
+ useLike = True
+
+ if len(ftype) > 0 and not useLike:
+ featureClause = ' and type = "%s" ' % ftype
+ elif len(ftype) > 0:
+ featureClause = ' and type LIKE "%s" ' % ftype
+
+ #select all features that encompass our start/stop
+ stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start < %d and stop > %d %s order by start' % (chrom, qstart, qstop, featureClause)
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ (chromosome, start, stop, orientation, name, version, atype) = entry
+ results.append((name, version, chromosome, start, stop, orientation, atype))
+
+ # select all features that have a "stop" between start and stop that aren't yet in the dataset
+ stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ (chromosome, start, stop, orientation, name, version, atype) = entry
+ if (name, version, chromosome, start, stop, orientation, atype) not in results:
+ results.append((name, version, chromosome, start, stop, orientation, atype))
+
+ # select all features on chromosome that have a "start" between start and stop
+ stmt = 'chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start >= %d and start <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ (chromosome, start, stop, orientation, name, version, atype) = entry
+ if (name, version, chromosome, start, stop, orientation, atype) not in results:
+ results.append((name, version, chromosome, start, stop, orientation, atype))
+
+ return results
+
+
+ def getGenesIntersecting(self, chrom, qstart, qlength):
+ """ Return features that are on a ptarticular stretch of the genome. Can optionally
+ limit by feature type; the wild-card % can be used to search feature substrings.
+ """
+ results = []
+ qstop = qstart + qlength
+ atype = "model"
+ #select all features that encompass our start/stop
+ stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start < %d and stop > %d order by start' % (chrom, qstart, qstop)
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ (chromosome, start, stop, orientation, name, version) = entry
+ results.append((name, version, chromosome, start, stop, orientation, atype))
+
+ # select all features that have a "stop" between start and stop that aren't yet in the dataset
+ stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d order by start' % (chrom, qstart, qstop)
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ (chromosome, start, stop, orientation, name, version) = entry
+ if (name, version, chromosome, start, stop, orientation) not in results:
+ results.append((name, version, chromosome, start, stop, orientation, atype))
+
+ # select all features on chromosome that have a "start" between start and stop
+ stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start >= %d and start <= %d order by start' % (chrom, qstart, qstop)
+ res = self.queryDB(stmt, fetchall=True)
+ for entry in res:
+ (chromosome, start, stop, orientation, name, version) = entry
+ if (name, version, chromosome, start, stop, orientation) not in results:
+ results.append((name, version, chromosome, start, stop, orientation, atype))
+
+ return results
+
+
+ def sequence(self, chrom, start, length, maskCDS=False, maskLower=False):
+ seq = ""
+ if chrom == "" and self.chromosome != "":
+ chrom = self.chromosome
+
+ stmt = "select sequenceName, storageType from chromosomes where name = '%s' " % chrom
+ res = self.queryDB(stmt)
+ seqName = res[0]
+ seqType = res[1]
+ if seqType == "db":
+ stmt = "select sequence from sequences where name = '%s' " % seqName
+ res = self.queryDB(stmt)
+ seq = res[0][start: start + length]
+ res = ""
+ else:
+ chromFile = open("%s%s" % (cisRoot, seqName), "r")
+ mymap = mmap(chromFile.fileno(),1,access=ACCESS_READ)
+ mymap = mmap(chromFile.fileno(), mymap.size(), access=ACCESS_READ)
+ mymap.seek(start)
+ seq = mymap.read(length)
+ chromFile.close()
+
+ if maskCDS:
+ stop = start + length - 1
+ seqArray = list(seq)
+ features = self.getFeaturesIntersecting(chrom, start, length, "CDS")
+ for entry in features:
+ (name, geneID, chromosome, fstart, fstop, forientation, type) = entry
+ if fstart < start:
+ fstart = start
+
+ if fstop > stop:
+ fstop = stop
+
+ nstart = fstart - start
+ nstop = fstop - start + 1
+ for pos in range(nstop - nstart):
+ seqArray[nstart + pos] = "N"
+
+ seq = string.join(seqArray, "")
+
+ if maskLower:
+ seqArray = list(seq)
+ for index in range(len(seqArray)):
+ if seqArray[index] in ["a", "c" , "g", "t"]:
+ seqArray[index] = "N"
+
+ seq = string.join(seqArray, "")
+
+ return seq
+
+
+ def getChromosomeSequence(self, chrom=""):
+ seq = ""
+ if chrom == "" and self.chromosome != "":
+ chrom = self.chromosome
+
+ stmt = "select sequenceName, storageType from chromosomes where name = '%s' " % chrom
+ res = self.queryDB(stmt)
+ if res == None:
+ print "Could not find chromosome %s" % chrom
+ return ''
+
+ seqName = res[0]
+ seqType = res[1]
+ if seqType == "db":
+ res = self.queryDB('select sequence from sequences where name = "%s"' % chrom)
+ seq = res[0]
+ res = ""
+ else:
+ chromFile = open("%s%s" % (cisRoot, seqName), "r")
+ seq = chromFile.readline()
+ chromFile.close()
+
+ return seq
+
+
+ def chromGeneEntries(self, chrom="", lowerbound=-1, upperbound=-1):
+ result = []
+ if chrom == "" and self.chromosome != "":
+ chrom = self.chromosome
+
+ stmt = "select distinct start, stop, orientation, name from gene_entries where chromosome = '%s' " % chrom
+ res = self.queryDB(stmt, fetchall=True)
+ if lowerbound > 0 and upperbound > 0:
+ for entry in res:
+ start = int(entry[0])
+ stop = int(entry[1])
+ if stop < start:
+ start, stop = stop, start
+
+ if stop < lowerbound or start > upperbound:
+ continue
+
+ result.append((start, stop, entry[2], (self.genome, entry[3])))
+ else:
+ for entry in res:
+ start = int(entry[0])
+ stop = int(entry[1])
+ if stop < start:
+ start, stop = stop, start
+
+ result.append((start, stop, entry[2], (self.genome, entry[3])))
+
+ return result
+
+
+ def createGeneDB(self, dbFile="", inMemory=False):
+ if len(dbFile) > 0:
+ self.dbFile = dbFile
+
+ tableDict = {"gene_entries": "ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start varchar, stop varchar, length varchar, orientation varchar, feature varchar",
+ "gene_annotation": "ID INTEGER PRIMARY KEY, name varchar, description varchar",
+ "gene_ontology": "ID INTEGER PRIMARY KEY, name varchar, GOID varchar, objType varchar, objName varchar, isNot varchar, GOterm varchar, evidence varchar, other varchar",
+ "sequences": "ID INTEGER PRIMARY KEY, name varchar, sequenceLength varchar, sequenceType varchar, sequence varchar",
+ "chromosomes": "ID INTEGER PRIMARY KEY, name varchar, sequenceName varchar, storageType varchar",
+ "sequence_features": "ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start int, stop int, length varchar, orientation varchar, type varchar"
+ }
+
+ for table in tableDict.keys():
+ stmt = "create table %s(%s)" % (table, tableDict[table])
+ self.writeDB(stmt, useMemory=inMemory)
+
+
+ def createIndices(self, inMemory=False):
+ indexDict = {"nameIndex1": ("gene_entries", "name"),
+ "nameIndex2": ("gene_annotation", "name"),
+ "nameIndex3": ("gene_ontology", "name"),
+ "goidIndex": ("gene_ontology", "GOID"),
+ "nameIndex4": ("sequences", "name"),
+ "nameIndex5": ("chromosomes", "name"),
+ "geneIDIndex": ("sequence_features", "name, type"),
+ "posIndex": ("sequence_features", "chromosome, start, stop, type"),
+ "typeIndex": ("sequence_features", "type")
+ }
+
+ for indexName in indexDict.keys():
+ (tableName, fields) = indexDict[indexName]
+ stmt = "CREATE INDEX %s on %s(%s)" % (indexName, tableName, fields)
+ self.writeDB(stmt, useMemory=inMemory)
+
+
+ def addGeneEntry(self, geneID, chrom, start, stop, orientation, feature="", version=1.0):
+ (genome, gID) = geneID
+ length = str(abs(int(start) - int(stop)) + 1)
+ stmt = "insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (gID, str(version), chrom, start, stop, length, orientation, feature)
+ self.writeDB(stmt)
+
+
+ def addFeatureEntry(self, name, geneID, chrom, start, stop, orientation, type=""):
+ (genome, gID) = geneID
+ length = str(abs(int(start) - int(stop)) + 1)
+ stmt = "insert into sequence_features(ID, name, geneID, chromosome, start, stop, length, orientation, type) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (name, gID, chrom, start, stop, length, orientation, type)
+ self.writeDB(stmt)
+
+
+ def addAnnotation(self, geneID, description):
+ (genome, gID) = geneID
+ stmt = "insert into gene_annotation(ID, name, description) values (NULL, '%s', '%s') " % (gID, description)
+ self.writeDB(stmt)
+
+
+ def addGoInfo(self, geneID, GOID, objType="", objName="", isNot="", GOterm="", evidence="", other=""):
+ (genome, gID) = geneID
+ stmt = "insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (gID, str(GOID), objType, objName, isNot, GOterm, evidence, other)
+ self.writeDB(stmt)
+
+
+ def addSequence(self, geneID, seq, seqType, seqLen="-1"):
+ (genome, gID) = geneID
+ if int(seqLen) < 0:
+ seqLen = str(len(seq))
+
+ stmt = "insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, '%s', '%s', '%s', '%s')" % (gID, str(seqLen), seqType, seq)
+ self.writeDB(stmt)
+
+
+ def addChromosomeEntry(self, chromo, seqName, storageType):
+ stmt = "insert into chromosomes(ID, name, sequenceName, storageType) values (NULL, '%s', '%s', '%s')" % (chromo, seqName, storageType)
+ self.writeDB(stmt)
+
+
+ def addSequenceBatch(self, entryArray, inMemory=False):
+ stmtArray = []
+ stmt = "insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, ?, ?, ?, ?)"
+ for entry in entryArray:
+ (geneID, seq, seqType, seqLen) = entry
+ (genome, gID) = geneID
+ if int(seqLen) < 0:
+ seqLen = str(len(seq))
+
+ stmtArray.append((gID, str(seqLen), seqType, seq))
+
+ self.writeBatchDB(stmt, stmtArray)
+
+
+ def addChromosomeEntryBatch(self, entryArray, inMemory=False):
+ stmt = "insert into chromosomes(ID, name, sequenceName, storageType) values (NULL, ?, ?, ?)"
+ self.writeBatchDB(stmt, entryArray, useMemory=inMemory)
+
+
+ def addGeneEntryBatch(self, entryArray, inMemory=False):
+ stmtArray = []
+ stmt = "insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) "
+ for entry in entryArray:
+ (geneID, chrom, start, stop, orientation, feature, version) = entry
+ (genome, gID) = geneID
+ length = str(abs(int(start) - int(stop)) + 1)
+ stmtArray.append((gID, str(version), chrom, start, stop, length, orientation, feature))
+
+ self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
+
+
+ def addFeatureEntryBatch(self, entryArray, inMemory=False):
+ stmtArray = []
+ stmt = "insert into sequence_features(ID, name, version, chromosome, start, stop, length, orientation, type) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) "
+ for entry in entryArray:
+ (geneID, version, chrom, start, stop, orientation, type) = entry
+ (genome, gID) = geneID
+ length = str(abs(int(start) - int(stop)) + 1)
+ stmtArray.append((gID, version, chrom, int(start), int(stop), length, orientation, type))
+
+ self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
+
+
+ def addAnnotationBatch(self, entryArray, inMemory=False):
+ stmtArray = []
+ stmt = "insert into gene_annotation(ID, name, description) values (NULL, ?, ?) "
+ for entry in entryArray:
+ (geneID, description) = entry
+ (genome, gID) = geneID
+ stmtArray.append((gID, description))
+
+ self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
+
+
+ def addGoInfoBatch(self, entryArray, inMemory=False):
+ stmtArray = []
+ stmt = "insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) "
+ for entry in entryArray:
+ (geneID, GOID, objType, objName, isNot, GOterm, evidence, other) = entry
+ (genome, gID) = geneID
+ stmtArray.append((gID, str(GOID), objType, objName, isNot, GOterm, evidence, other))
+
+ self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)
+
+
+ def extendFeatures(self, featureFile, fileType="cistematic", replace=False):
+ geneEntryList = []
+ geneFeatureList = []
+ currentGene = ""
+ gstart = -1
+ gstop = -1
+ gchrom = ""
+ gsense = ""
+ ftype = ""
+ senseArray = {"+": "F",
+ "-": "R",
+ ".": "F"
+ }
+
+ featfile = open(featureFile)
+ if fileType == "cistematic":
+ for line in featfile:
+ if line[0] == "#":
+ continue
+
+ fields = line.strip().split()
+ if currentGene == "":
+ currentGene = fields[0]
+ gchrom = fields[2]
+ gstart = int(fields[3])
+ gsense = senseArray[fields[5]]
+
+ if fields[0] != currentGene:
+ geneEntryList.append(((self.genome, currentGene), gchrom, gstart, gstop, gsense, "Transcript", "1"))
+ currentGene = fields[0]
+ gchrom = fields[2]
+ gstart = int(fields[3])
+ gsense = senseArray[fields[5]]
+
+ lstart = int(fields[3])
+ gstop = int(fields[4])
+ ftype = fields[6]
+ geneFeatureList.append(((self.genome, currentGene), "1", gchrom, lstart, gstop, gsense, ftype))
+ elif fileType == "UCSC":
+ for line in featfile:
+ if line[0] == "#":
+ continue
+
+ geneFields = line.split("\t")
+ exonNum = int(geneFields[8])
+ exonStarts = geneFields[9].split(",")
+ exonStops = geneFields[10].split(",")
+ gchrom = geneFields[2][3:]
+ gsense = senseArray[geneFields[3]]
+ gstop = int(geneFields[7]) - 1
+ gstart = int(geneFields[6]) - 1
+ geneID = ("generic", geneFields[1])
+ for index in range(exonNum):
+ estart = int(exonStarts[index]) - 1
+ estop = int(exonStops[index]) - 1
+ if estart >= gstart and estop <= gstop:
+ geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, "CDS"))
+ elif estop <= gstart:
+ if gsense == "F":
+ fType = "5UTR"
+ else:
+ fType = "3UTR"
+
+ geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, fType))
+ elif estart >= gstop:
+ if gsense == "F":
+ fType = "3UTR"
+ else:
+ fType = "5UTR"
+
+ geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, fType))
+ elif estart <= gstop and estart > gstart:
+ if gsense == 'F':
+ fType = '3UTR'
+ else:
+ fType = '5UTR'
+
+ geneFeatureList.append((geneID, "1", gchrom, estart, gstop, gsense, "CDS"))
+ geneFeatureList.append((geneID, "1", gchrom, gstop + 1, estop, gsense, fType))
+ elif estart < gstart and estop <= gstop:
+ if gsense == "F":
+ fType = "5UTR"
+ else:
+ fType = "3UTR"
+
+ geneFeatureList.append((geneID, "1", gchrom, estart, gstart - 1, gsense, fType))
+ geneFeatureList.append((geneID, "1", gchrom, gstart, estop, gsense, "CDS"))
+ else:
+ if gsense == "F":
+ fType1 = "5UTR"
+ fType2 = "3UTR"
+ else:
+ fType1 = "3UTR"
+ fType2 = "5UTR"
+
+ geneFeatureList.append((geneID, "1", gchrom, estart, gstart - 1, gsense, fType1))
+ geneFeatureList.append((geneID, "1", gchrom, gstart, gstop, gsense, "CDS"))
+ geneFeatureList.append((geneID, "1", gchrom, gstop + 1, estop - 1, gsense, fType2))
+ else:
+ print "%s format not supported yet"
+ featfile.close()
+ return
+
+ featfile.close()
+ if replace==True:
+ self.queryDB("delete from sequence_features", useMemory=True)
+ self.queryDB("delete from gene_entries", useMemory=True)
+
+ self.addGeneEntryBatch(geneEntryList, inMemory=True)
+ self.addFeatureEntryBatch(geneFeatureList, inMemory=True)
+
+
+ def queryDB(self, stmt, fetchall=False, useMemory=False):
+ if useMemory or self.memoryBacked:
+ db = self.memoryDB
+ else:
+ db = sqlite.connect(self.dbFile, timeout = 60)
+
+ sql = db.cursor()
+ sql.execute(stmt)
+ if fetchall:
+ results = sql.fetchall()
+ else:
+ results = sql.fetchone()
+
+ sql.close()
+ if not (useMemory or self.memoryBacked):
+ db.close()
+
+ return results
+
+
+ def writeDB(self, stmt, useMemory=False):
+ if useMemory:
+ db = self.memoryDB
+ else:
+ db = sqlite.connect(self.dbFile, timeout = 60)
+
+ sql = db.cursor()
+ sql.execute(stmt)
+ db.commit()
+ sql.close()
+ if not useMemory:
+ db.close()
+
+
+ def writeBatchDB(self, stmt, stmtArray, useMemory=False):
+ if useMemory:
+ db = self.memoryDB
+ else:
+ db = sqlite.connect(self.dbFile, timeout = 60)
+
+ sql = db.cursor()
+ try:
+ sql.executemany(stmt, stmtArray)
+ except:
+ print "writeBatchDB: problem with %s" % stmt
+
+ db.commit()
+ sql.close()
+ if not useMemory:
+ db.close()
+
+
+def processSql(sqlFile, keyFields={}):
+ """ process a UCSC formatted .sql file to identify the table name and the position of key fields. Specifying the
+ name of important fields in keyFields is highly recommended. A key in keyFields represents the sql column
+ name in the file, while the corresponding value corresponds to the feature name. Blank values mean that the
+ same field name will be used. For example, one possible keyFields dict would be:
+ keyFields = {'chromStart':'start', 'chromEnd':'stop', 'chrom':'', 'name':'', 'score':'value', 'strand':'orientation'}
+ """
+ fields = {}
+ infile = open(sqlFile)
+ line = ""
+ while "CREATE TABLE" not in line:
+ line = infile.readline()
+ continue
+
+ tabFields = line.split()
+ tabName = tabFields[2]
+ if keyFields == {}:
+ fields["chrom"] = 1
+ fields["start"] = 2
+ fields["stop"] = 3
+ fields["name"] = 4
+ else:
+ index = 0
+ for line in infile:
+ lineFields = line.split()
+ if lineFields[0] in keyFields.keys():
+ if keyFields[lineFields[0]] == "":
+ outfield = lineFields[0]
+ else:
+ outfield = keyFields[lineFields[0]]
+
+ fields[outfield] = index
+
+ index += 1
+
+ infile.close()
+
+ return (tabName, fields)
+
+
+def processTrack(genomeObject, dataFile, typeName="MISC", dataFields={}, version="1"):
+ """ process data for a UCSC track, given an instantiated genome object, the data, the name for the feature
+ type in sequence_features, the dataFields in the format returned by processSql(), and a version.
+ If genomeObject is the empty string '', then processTrack() will simply print out the aggregate records,
+ rather than use addFeatureEntryBatch() to record the added features.
+
+ Note that numberic values are overloaded into the name field using @ as a delimiter.
+ """
+ records = []
+ if dataFields == {}:
+ dataFields["chrom"] = 1
+ dataFields["start"] = 2
+ dataFields["stop"] = 3
+ dataFields["name"] = 4
+
+ infile = open(dataFile)
+ for line in infile:
+ fields = line.strip().split("\t")
+ if "name" in dataFields:
+ recName = fields[dataFields["name"]]
+ else:
+ recName = typeName
+
+ if "value" in dataFields:
+ recName += "@" + fields[dataFields["value"]]
+
+ start = int(fields[dataFields["start"]])
+ stop = int(fields[dataFields["stop"]])
+ chrom = fields[dataFields["chrom"]][3:]
+ chrom.replace("_random", "rand")
+ orientation = "F"
+ if "orientation" in dataFields:
+ if fields[dataFields["orientation"]] == "-":
+ orientation = "R"
+
+ if "version" in dataFields:
+ version = fields[dataFields["version"]]
+
+ records.append((("placeholder", recName), version, chrom, start, stop, orientation, typeName.upper()))
+
+ if genomeObject != "":
+ genomeObject.addFeatureEntryBatch(records)
+ else:
+ print records
+
+
+# Voodoo to get recursive imports working
+for genome in supportedGenomes:
+ importString = "import %s" % genome
+ exec importString
+ geneDB[genome] = eval("%s.geneDB" % genome)
\ No newline at end of file