X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fgenomes%2F__init__.py;fp=cistematic%2Fgenomes%2F__init__.py;h=4dee9248836da4133204428fd6a476ad759ce8a4;hp=0000000000000000000000000000000000000000;hb=bc30aca13e5ec397c92e67002fbf7a103130b828;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/cistematic/genomes/__init__.py b/cistematic/genomes/__init__.py new file mode 100644 index 0000000..4dee924 --- /dev/null +++ b/cistematic/genomes/__init__.py @@ -0,0 +1,1109 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +try: + from pysqlite2 import dbapi2 as sqlite +except: + from sqlite3 import dbapi2 as sqlite + +import string +from mmap import * +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +__all__ = ["scerevisiae", "athaliana", "celegans", "cbriggsae", "cbrenneri", + "cremanei", "dmelanogaster", "mmusculus", "hsapiens", "rnorvegicus", + "spurpuratus", "ggallus", "cfamiliaris", "mdomestica", "xtropicalis", + "btaurus", "drerio", "ecaballus" +] + +supportedGenomes = ["athaliana", "cfamiliaris", "mmusculus", "hsapiens", + "rnorvegicus", "ggallus", "scerevisiae", "celegans", "cbriggsae", + "cbrenneri", "cremanei", "dmelanogaster", "mdomestica", + "spurpuratus", "xtropicalis","btaurus", "drerio", "ecaballus" +] + +geneDB = {} +chromDict = {} +chromRoot = {} +chromGeneEntries = {} +checkGene = {} +geneInfo = {} +getAllGenes = {} +annotInfo = {} +goInfo = {} +allAnnotInfo = {} +allGOInfo = {} + +def compNT(nt): + """ returns the complementary basepair to base nt + """ + compDict = {"A": "T", "T": "A", + "G": "C", "C": "G", + "S": "S", + "W": "W", + "R": "Y", "Y": "R", + "M": "K", "K": "M", + "H": "D", "D": "H", + "B": "V", "V": "B", + "N": "N", + "a": "t", "t": "a", + "g": "c", "c": "g", + "n": "n", + "z": "z" + } + + return compDict.get(nt, "N") + + +def complement(sequence, length=0): + """ returns the complement of the sequence. + """ + newSeq = "" + seqLength = len(sequence) + if length == seqLength: + seqList = list(sequence) + seqList.reverse() + return "".join(map(compNT, seqList)) + + for index in range(seqLength - 1, seqLength - length - 1, -1): + try: + newSeq += compNT(sequence[index]) + except: + newSeq += "N" + + return newSeq + + +class Genome: + genome = "" + chromosome = "" + version = "" + dbFile = "" + supported = False + oldStyle = False + customAnnotations = False + + + def __init__(self, genome, chrom="", version="", dbFile="", inRAM=False): + self.genome = genome + if chrom != "": + self.chromosome = chrom + + if version != "": + self.version = version + + if dbFile != "": + self.dbFile = dbFile + + if genome in supportedGenomes and dbFile == "": + self.dbFile = geneDB[genome] + self.supported = True + + if inRAM: + self.memoryBacked = True + self.memoryDB = sqlite.connect(":memory:") + self.createGeneDB(inMemory=True) + sql = self.memoryDB.cursor() + try: + sql.execute("PRAGMA DEFAULT_CACHE_SIZE = 500000") + sql.execute("ATTACH '%s' as diskdb" % self.dbFile) + for table in ["gene_entries", "gene_annotation", "gene_ontology", "sequences", "chromosomes", "sequence_features"]: + sql.execute("insert into %s select * from diskdb.%s" % (table, table)) + + sql.execute("DETACH diskdb") + except: + if self.dbFile != "": + print "could not import %s" % self.dbFile + + sql.close() + self.createIndices(inMemory=True) + else: + self.memoryBacked = False + self.memoryDB = "" + + + def setGeneDB(self, dbFile): + self.dbFile = dbFile + self.supported = False + + + def setChromosome(self, chrom): + self.chromosome = chrom + + + def checkGene(self, geneID): + """ returns True if the geneID matches an entry in the genome database. + """ + (genome, gID) = geneID + if genome != self.genome: + return False + + try: + stmt = "SELECT chromosome from gene_entries where name = '%s' " % gID + res = self.queryDB(stmt) + if len(res) > 0: + return True + except: + pass + + return False + + + def geneInfo(self, geneID, version="1"): + (genome, gID) = geneID + result = "" + if genome != self.genome: + return False + + try: + stmt = "SELECT chromosome, start, stop, length, orientation from gene_entries where name = '%s' and version = '%s' " % (gID, version) + res = self.queryDB(stmt) + if len(res) > 0: + chrom = res[0] + start = int(res[1]) + stop = int(res[2]) + if start > stop: + temp = stop + stop = start + start = temp + + length = int(res[3]) + orientation = res[4] + result = (chrom, start, stop, length, orientation) + except: + pass + + return result + + + def getallGeneInfo(self, name="", chrom="", version=""): + resultDict = {} + chromList = [] + stmt = "select name, chromosome, start, stop, orientation from gene_entries order by name, start " + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (name, chromosome, start, stop, orientation) = entry + if name not in resultDict: + resultDict[name] = [] + + if chromosome not in chromList: + resultDict[chromosome] = [] + chromList.append(chromosome) + + resultDict[chromosome].append((name, chromosome, int(start), int(stop), orientation)) + + return resultDict + + + def leftGeneDistance(self, geneID, radius=50000, version="1"): + result = radius + res = self.geneInfo(geneID, version) + if res != "": + (chrom, start, stop, length, orientation) = res + if start > stop: + temp = stop + stop = start + start = temp + + stmt = "SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) " % (chrom, start - radius, start, start - radius, start) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + rstart = int(entry[1]) + rstop = int(entry[2]) + if rstart > rstop: + temp = rstop + rstop = rstart + rstart = temp + + thelength = start - rstart + if (start - rstop) > 0 and (start - rstop) < thelength: + thelength = start - rstop + + if thelength > 0 and thelength < result: + result = thelength + + return result + + def rightGeneDistance(self, geneID, radius=50000, version="1"): + result = radius + res = self.geneInfo(geneID, version) + if res != "": + (chrom, start, stop, length, orientation) = res + if start > stop: + temp = stop + stop = start + start = temp + + stmt = "SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) " % (chrom, stop, stop + radius, stop, stop + radius) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + rstart = int(entry[1]) + rstop = int(entry[2]) + if rstart > rstop: + temp = rstop + rstop = rstart + rstart = temp + + thelength = rstop - stop + if (rstart - stop) > 0 and (rstart - stop) < thelength: + thelength = rstart - stop + + if thelength > 0 and thelength < result: + result = thelength + + return result + + + def annotInfo(self, geneID): + (genome, gID) = geneID + result = [] + if genome != self.genome: + return False + + stmt = "SELECT description from gene_annotation where name = '%s'" % gID + res = self.queryDB(stmt, fetchall=True) + if len(res) > 0: + for entry in res: + result.append(entry[0]) + + return result + + + def goInfo(self, geneID): + (genome, gID) = geneID + result = [] + if genome != self.genome: + return False + + stmt = "SELECT GOID, objType, objName, isNot, GOterm, evidence from gene_ontology where name = '%s'" % gID + res = self.queryDB(stmt, fetchall=True) + if len(res) > 0: + for entry in res: + result.append(string.join(entry,"\t")) + + return result + + + def chromInfo(self, chrom=""): + if chrom == "" and self.chromosome != "": + chrom = self.chromosome + + stmt = "SELECT sequenceName, storageType from chromosomes where name = '%s' " % chrom + res = self.queryDB(stmt) + result = "%s\t%s" % (res[0], res[1]) + + return result + + + def allGIDs(self): + """ returns [ list of all orf names] + """ + result = [] + stmt = "SELECT distinct name from gene_entries" + res = self.queryDB(stmt, fetchall=True) + for entry in res: + result.append(entry[0]) + + return result + + + def allGIDsbyGOID(self, GOID): + """ returns [ list of all orf names] that match a particular GOID + """ + result = [] + stmt = "SELECT distinct name from gene_ontology where GOID = '%s' " % GOID + res = self.queryDB(stmt, fetchall=True) + for entry in res: + result.append(entry[0]) + + return result + + + def allGOIDs(self): + """ returns the list of GOID's in the genome + """ + result = [] + stmt = "SELECT distinct GOID from gene_ontology" + res = self.queryDB(stmt, fetchall=True) + for entry in res: + result.append(entry[0]) + + return result + + + def allGOterms(self): + """ returns the list of GOID's and their associated GO term in the genome + """ + result = {} + stmt = "SELECT distinct GOID, GOterm from gene_ontology" + res = self.queryDB(stmt, fetchall=True) + for entry in res: + result[str(entry[0])] = str(entry[1]) + + return result + + + def getGOIDCount(self, GOID): + """ returns the match count for a particular GOID + """ + stmt = "SELECT distinct name from gene_ontology where GOID = '%s' " % GOID + res = self.queryDB(stmt, fetchall=True) + + return len(res) + + + def allAnnotInfo(self): + result = {} + stmt = "SELECT name, description from gene_annotation" + res = self.queryDB(stmt, fetchall=True) + + for entry in res: + geneID = (self.genome, entry[0]) + if geneID not in result: + result[geneID] = [] + + result[(self.genome,entry[0])].append(entry[1]) + + return result + + + def allGoInfo(self): + result = {} + stmt = "SELECT name, GOID, objType, objName, isNot, GOterm, evidence, other from gene_ontology order by name " + res = self.queryDB(stmt, fetchall=True) + for entry in res: + geneID = (self.genome, entry[0]) + if geneID not in result: + result[geneID] = [] + + result[(self.genome, entry[0])].append(string.join(entry[1:], "\t")) + + return result + + + def allChromNames(self, partition=1, slice=0): + result = [] + stmt = "SELECT distinct name from chromosomes" + res = self.queryDB(stmt, fetchall=True) + reslen = len(res) + for index in range(reslen): + if (index + slice) % partition == 0: + entry = res[index] + result.append(entry[0]) + + return result + + + def geneSeq(self, gID, maskCDS=False, maskLower=False, version="1"): + (chrom, start, stop, length, orientation) = self.geneInfo(gID, version) + seq = self.sequence(chrom, start, length, maskCDS, maskLower) + if orientation == "R": + seq = complement(seq, length) + + return seq + + + def getGeneFeatures(self, gID, type="", version="1"): + results = [] + featureClause = "" + (genome, geneid) = gID + if len(type) > 0: + featureClause = ' and type = "%s" ' % type + + stmt = 'select type, chromosome, start, stop, orientation from sequence_features where name = "%s" and version = "%s" %s order by start ' % (geneid, str(version), featureClause) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (type, chromosome, start, stop, orientation) = entry + results.append((type, chromosome, start, stop, orientation)) + + return results + + + def getallGeneFeatures(self): + resultDict = {} + stmt = "select name, type, chromosome, start, stop, orientation from sequence_features order by name, start " + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (name, type, chromosome, start, stop, orientation) = entry + if name not in resultDict: + resultDict[name] = [] + + resultDict[name].append((type, chromosome, start, stop, orientation)) + + return resultDict + + + def getFeatureTypes(self, ftype=''): + """ Returns the distinct feature types available in the sequence_features + tables. Can optionally limit by feature type; the wild-card % can be + used to search feature substrings. + """ + results = [] + whereClause = "" + useLike = False + if "%" in ftype: + useLike = True + + if len(ftype) > 0 and not useLike: + whereClause = 'where type = "%s" ' % ftype + elif len(ftype) > 0: + whereClause = 'where type LIKE "%s" ' % ftype + + stmt = "select distinct type from sequence_features %s" % whereClause + res = self.queryDB(stmt, fetchall=True) + for entry in res: + results.append(entry[0]) + + return results + + def getFeatures(self, ftype, name="", chrom="", version =""): + """ Get features stored in sequence_features that match a feature type, + optionally restricted by name/value, chromosome, or version. Will + search for substrings when ftype and/or name are given with a % to + indicate the location of the wildcard. Returns a dictionary of features + with chromosomes as the keys. + """ + results = {} + chromList = [] + nameClause = "" + chromClause = "" + versionClause = "" + useLike = "=" + if "%" in ftype: + useLike = "LIKE" + + if len(name) > 0: + if "%" in name: + nameLike = "LIKE" + else: + nameLike = "=" + + nameClause = ' and name %s "%s" ' % (nameLike, name) + + if len(chrom) > 0: + chromClause = ' and chromosome = "%s" ' % chrom + + if len(version) > 0: + versionClause = ' and version = "%s" ' % version + + stmt = 'select name, version, chromosome, start, stop, orientation, type from sequence_features where type %s "%s" %s %s %s order by type' % (useLike, ftype, chromClause, nameClause, versionClause) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (name, version, chromosome, start, stop, orientation, atype) = entry + if chromosome not in chromList: + results[chromosome] = [] + chromList.append(chromosome) + + results[chromosome].append((name, version, chromosome, start, stop, orientation, atype)) + + return results + + + def getFeaturesIntersecting(self, chrom, qstart, qlength, ftype=""): + """ Return features that are on a particular stretch of the genome. Can optionally + limit by feature type; the wild-card % can be used to search feature substrings. + """ + results = [] + featureClause = "" + qstop = qstart + qlength + useLike = False + if "%" in ftype: + useLike = True + + if len(ftype) > 0 and not useLike: + featureClause = ' and type = "%s" ' % ftype + elif len(ftype) > 0: + featureClause = ' and type LIKE "%s" ' % ftype + + #select all features that encompass our start/stop + stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start < %d and stop > %d %s order by start' % (chrom, qstart, qstop, featureClause) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (chromosome, start, stop, orientation, name, version, atype) = entry + results.append((name, version, chromosome, start, stop, orientation, atype)) + + # select all features that have a "stop" between start and stop that aren't yet in the dataset + stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d %s order by start' % (chrom, qstart, qstop, featureClause) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (chromosome, start, stop, orientation, name, version, atype) = entry + if (name, version, chromosome, start, stop, orientation, atype) not in results: + results.append((name, version, chromosome, start, stop, orientation, atype)) + + # select all features on chromosome that have a "start" between start and stop + stmt = 'chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start >= %d and start <= %d %s order by start' % (chrom, qstart, qstop, featureClause) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (chromosome, start, stop, orientation, name, version, atype) = entry + if (name, version, chromosome, start, stop, orientation, atype) not in results: + results.append((name, version, chromosome, start, stop, orientation, atype)) + + return results + + + def getGenesIntersecting(self, chrom, qstart, qlength): + """ Return features that are on a ptarticular stretch of the genome. Can optionally + limit by feature type; the wild-card % can be used to search feature substrings. + """ + results = [] + qstop = qstart + qlength + atype = "model" + #select all features that encompass our start/stop + stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start < %d and stop > %d order by start' % (chrom, qstart, qstop) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (chromosome, start, stop, orientation, name, version) = entry + results.append((name, version, chromosome, start, stop, orientation, atype)) + + # select all features that have a "stop" between start and stop that aren't yet in the dataset + stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d order by start' % (chrom, qstart, qstop) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (chromosome, start, stop, orientation, name, version) = entry + if (name, version, chromosome, start, stop, orientation) not in results: + results.append((name, version, chromosome, start, stop, orientation, atype)) + + # select all features on chromosome that have a "start" between start and stop + stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start >= %d and start <= %d order by start' % (chrom, qstart, qstop) + res = self.queryDB(stmt, fetchall=True) + for entry in res: + (chromosome, start, stop, orientation, name, version) = entry + if (name, version, chromosome, start, stop, orientation) not in results: + results.append((name, version, chromosome, start, stop, orientation, atype)) + + return results + + + def sequence(self, chrom, start, length, maskCDS=False, maskLower=False): + seq = "" + if chrom == "" and self.chromosome != "": + chrom = self.chromosome + + stmt = "select sequenceName, storageType from chromosomes where name = '%s' " % chrom + res = self.queryDB(stmt) + seqName = res[0] + seqType = res[1] + if seqType == "db": + stmt = "select sequence from sequences where name = '%s' " % seqName + res = self.queryDB(stmt) + seq = res[0][start: start + length] + res = "" + else: + chromFile = open("%s%s" % (cisRoot, seqName), "r") + mymap = mmap(chromFile.fileno(),1,access=ACCESS_READ) + mymap = mmap(chromFile.fileno(), mymap.size(), access=ACCESS_READ) + mymap.seek(start) + seq = mymap.read(length) + chromFile.close() + + if maskCDS: + stop = start + length - 1 + seqArray = list(seq) + features = self.getFeaturesIntersecting(chrom, start, length, "CDS") + for entry in features: + (name, geneID, chromosome, fstart, fstop, forientation, type) = entry + if fstart < start: + fstart = start + + if fstop > stop: + fstop = stop + + nstart = fstart - start + nstop = fstop - start + 1 + for pos in range(nstop - nstart): + seqArray[nstart + pos] = "N" + + seq = string.join(seqArray, "") + + if maskLower: + seqArray = list(seq) + for index in range(len(seqArray)): + if seqArray[index] in ["a", "c" , "g", "t"]: + seqArray[index] = "N" + + seq = string.join(seqArray, "") + + return seq + + + def getChromosomeSequence(self, chrom=""): + seq = "" + if chrom == "" and self.chromosome != "": + chrom = self.chromosome + + stmt = "select sequenceName, storageType from chromosomes where name = '%s' " % chrom + res = self.queryDB(stmt) + if res == None: + print "Could not find chromosome %s" % chrom + return '' + + seqName = res[0] + seqType = res[1] + if seqType == "db": + res = self.queryDB('select sequence from sequences where name = "%s"' % chrom) + seq = res[0] + res = "" + else: + chromFile = open("%s%s" % (cisRoot, seqName), "r") + seq = chromFile.readline() + chromFile.close() + + return seq + + + def chromGeneEntries(self, chrom="", lowerbound=-1, upperbound=-1): + result = [] + if chrom == "" and self.chromosome != "": + chrom = self.chromosome + + stmt = "select distinct start, stop, orientation, name from gene_entries where chromosome = '%s' " % chrom + res = self.queryDB(stmt, fetchall=True) + if lowerbound > 0 and upperbound > 0: + for entry in res: + start = int(entry[0]) + stop = int(entry[1]) + if stop < start: + start, stop = stop, start + + if stop < lowerbound or start > upperbound: + continue + + result.append((start, stop, entry[2], (self.genome, entry[3]))) + else: + for entry in res: + start = int(entry[0]) + stop = int(entry[1]) + if stop < start: + start, stop = stop, start + + result.append((start, stop, entry[2], (self.genome, entry[3]))) + + return result + + + def createGeneDB(self, dbFile="", inMemory=False): + if len(dbFile) > 0: + self.dbFile = dbFile + + tableDict = {"gene_entries": "ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start varchar, stop varchar, length varchar, orientation varchar, feature varchar", + "gene_annotation": "ID INTEGER PRIMARY KEY, name varchar, description varchar", + "gene_ontology": "ID INTEGER PRIMARY KEY, name varchar, GOID varchar, objType varchar, objName varchar, isNot varchar, GOterm varchar, evidence varchar, other varchar", + "sequences": "ID INTEGER PRIMARY KEY, name varchar, sequenceLength varchar, sequenceType varchar, sequence varchar", + "chromosomes": "ID INTEGER PRIMARY KEY, name varchar, sequenceName varchar, storageType varchar", + "sequence_features": "ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start int, stop int, length varchar, orientation varchar, type varchar" + } + + for table in tableDict.keys(): + stmt = "create table %s(%s)" % (table, tableDict[table]) + self.writeDB(stmt, useMemory=inMemory) + + + def createIndices(self, inMemory=False): + indexDict = {"nameIndex1": ("gene_entries", "name"), + "nameIndex2": ("gene_annotation", "name"), + "nameIndex3": ("gene_ontology", "name"), + "goidIndex": ("gene_ontology", "GOID"), + "nameIndex4": ("sequences", "name"), + "nameIndex5": ("chromosomes", "name"), + "geneIDIndex": ("sequence_features", "name, type"), + "posIndex": ("sequence_features", "chromosome, start, stop, type"), + "typeIndex": ("sequence_features", "type") + } + + for indexName in indexDict.keys(): + (tableName, fields) = indexDict[indexName] + stmt = "CREATE INDEX %s on %s(%s)" % (indexName, tableName, fields) + self.writeDB(stmt, useMemory=inMemory) + + + def addGeneEntry(self, geneID, chrom, start, stop, orientation, feature="", version=1.0): + (genome, gID) = geneID + length = str(abs(int(start) - int(stop)) + 1) + stmt = "insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (gID, str(version), chrom, start, stop, length, orientation, feature) + self.writeDB(stmt) + + + def addFeatureEntry(self, name, geneID, chrom, start, stop, orientation, type=""): + (genome, gID) = geneID + length = str(abs(int(start) - int(stop)) + 1) + stmt = "insert into sequence_features(ID, name, geneID, chromosome, start, stop, length, orientation, type) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (name, gID, chrom, start, stop, length, orientation, type) + self.writeDB(stmt) + + + def addAnnotation(self, geneID, description): + (genome, gID) = geneID + stmt = "insert into gene_annotation(ID, name, description) values (NULL, '%s', '%s') " % (gID, description) + self.writeDB(stmt) + + + def addGoInfo(self, geneID, GOID, objType="", objName="", isNot="", GOterm="", evidence="", other=""): + (genome, gID) = geneID + stmt = "insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (gID, str(GOID), objType, objName, isNot, GOterm, evidence, other) + self.writeDB(stmt) + + + def addSequence(self, geneID, seq, seqType, seqLen="-1"): + (genome, gID) = geneID + if int(seqLen) < 0: + seqLen = str(len(seq)) + + stmt = "insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, '%s', '%s', '%s', '%s')" % (gID, str(seqLen), seqType, seq) + self.writeDB(stmt) + + + def addChromosomeEntry(self, chromo, seqName, storageType): + stmt = "insert into chromosomes(ID, name, sequenceName, storageType) values (NULL, '%s', '%s', '%s')" % (chromo, seqName, storageType) + self.writeDB(stmt) + + + def addSequenceBatch(self, entryArray, inMemory=False): + stmtArray = [] + stmt = "insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, ?, ?, ?, ?)" + for entry in entryArray: + (geneID, seq, seqType, seqLen) = entry + (genome, gID) = geneID + if int(seqLen) < 0: + seqLen = str(len(seq)) + + stmtArray.append((gID, str(seqLen), seqType, seq)) + + self.writeBatchDB(stmt, stmtArray) + + + def addChromosomeEntryBatch(self, entryArray, inMemory=False): + stmt = "insert into chromosomes(ID, name, sequenceName, storageType) values (NULL, ?, ?, ?)" + self.writeBatchDB(stmt, entryArray, useMemory=inMemory) + + + def addGeneEntryBatch(self, entryArray, inMemory=False): + stmtArray = [] + stmt = "insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) " + for entry in entryArray: + (geneID, chrom, start, stop, orientation, feature, version) = entry + (genome, gID) = geneID + length = str(abs(int(start) - int(stop)) + 1) + stmtArray.append((gID, str(version), chrom, start, stop, length, orientation, feature)) + + self.writeBatchDB(stmt, stmtArray, useMemory=inMemory) + + + def addFeatureEntryBatch(self, entryArray, inMemory=False): + stmtArray = [] + stmt = "insert into sequence_features(ID, name, version, chromosome, start, stop, length, orientation, type) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) " + for entry in entryArray: + (geneID, version, chrom, start, stop, orientation, type) = entry + (genome, gID) = geneID + length = str(abs(int(start) - int(stop)) + 1) + stmtArray.append((gID, version, chrom, int(start), int(stop), length, orientation, type)) + + self.writeBatchDB(stmt, stmtArray, useMemory=inMemory) + + + def addAnnotationBatch(self, entryArray, inMemory=False): + stmtArray = [] + stmt = "insert into gene_annotation(ID, name, description) values (NULL, ?, ?) " + for entry in entryArray: + (geneID, description) = entry + (genome, gID) = geneID + stmtArray.append((gID, description)) + + self.writeBatchDB(stmt, stmtArray, useMemory=inMemory) + + + def addGoInfoBatch(self, entryArray, inMemory=False): + stmtArray = [] + stmt = "insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) " + for entry in entryArray: + (geneID, GOID, objType, objName, isNot, GOterm, evidence, other) = entry + (genome, gID) = geneID + stmtArray.append((gID, str(GOID), objType, objName, isNot, GOterm, evidence, other)) + + self.writeBatchDB(stmt, stmtArray, useMemory=inMemory) + + + def extendFeatures(self, featureFile, fileType="cistematic", replace=False): + geneEntryList = [] + geneFeatureList = [] + currentGene = "" + gstart = -1 + gstop = -1 + gchrom = "" + gsense = "" + ftype = "" + senseArray = {"+": "F", + "-": "R", + ".": "F" + } + + featfile = open(featureFile) + if fileType == "cistematic": + for line in featfile: + if line[0] == "#": + continue + + fields = line.strip().split() + if currentGene == "": + currentGene = fields[0] + gchrom = fields[2] + gstart = int(fields[3]) + gsense = senseArray[fields[5]] + + if fields[0] != currentGene: + geneEntryList.append(((self.genome, currentGene), gchrom, gstart, gstop, gsense, "Transcript", "1")) + currentGene = fields[0] + gchrom = fields[2] + gstart = int(fields[3]) + gsense = senseArray[fields[5]] + + lstart = int(fields[3]) + gstop = int(fields[4]) + ftype = fields[6] + geneFeatureList.append(((self.genome, currentGene), "1", gchrom, lstart, gstop, gsense, ftype)) + elif fileType == "UCSC": + for line in featfile: + if line[0] == "#": + continue + + geneFields = line.split("\t") + exonNum = int(geneFields[8]) + exonStarts = geneFields[9].split(",") + exonStops = geneFields[10].split(",") + gchrom = geneFields[2][3:] + gsense = senseArray[geneFields[3]] + gstop = int(geneFields[7]) - 1 + gstart = int(geneFields[6]) - 1 + geneID = ("generic", geneFields[1]) + for index in range(exonNum): + estart = int(exonStarts[index]) - 1 + estop = int(exonStops[index]) - 1 + if estart >= gstart and estop <= gstop: + geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, "CDS")) + elif estop <= gstart: + if gsense == "F": + fType = "5UTR" + else: + fType = "3UTR" + + geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, fType)) + elif estart >= gstop: + if gsense == "F": + fType = "3UTR" + else: + fType = "5UTR" + + geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, fType)) + elif estart <= gstop and estart > gstart: + if gsense == 'F': + fType = '3UTR' + else: + fType = '5UTR' + + geneFeatureList.append((geneID, "1", gchrom, estart, gstop, gsense, "CDS")) + geneFeatureList.append((geneID, "1", gchrom, gstop + 1, estop, gsense, fType)) + elif estart < gstart and estop <= gstop: + if gsense == "F": + fType = "5UTR" + else: + fType = "3UTR" + + geneFeatureList.append((geneID, "1", gchrom, estart, gstart - 1, gsense, fType)) + geneFeatureList.append((geneID, "1", gchrom, gstart, estop, gsense, "CDS")) + else: + if gsense == "F": + fType1 = "5UTR" + fType2 = "3UTR" + else: + fType1 = "3UTR" + fType2 = "5UTR" + + geneFeatureList.append((geneID, "1", gchrom, estart, gstart - 1, gsense, fType1)) + geneFeatureList.append((geneID, "1", gchrom, gstart, gstop, gsense, "CDS")) + geneFeatureList.append((geneID, "1", gchrom, gstop + 1, estop - 1, gsense, fType2)) + else: + print "%s format not supported yet" + featfile.close() + return + + featfile.close() + if replace==True: + self.queryDB("delete from sequence_features", useMemory=True) + self.queryDB("delete from gene_entries", useMemory=True) + + self.addGeneEntryBatch(geneEntryList, inMemory=True) + self.addFeatureEntryBatch(geneFeatureList, inMemory=True) + + + def queryDB(self, stmt, fetchall=False, useMemory=False): + if useMemory or self.memoryBacked: + db = self.memoryDB + else: + db = sqlite.connect(self.dbFile, timeout = 60) + + sql = db.cursor() + sql.execute(stmt) + if fetchall: + results = sql.fetchall() + else: + results = sql.fetchone() + + sql.close() + if not (useMemory or self.memoryBacked): + db.close() + + return results + + + def writeDB(self, stmt, useMemory=False): + if useMemory: + db = self.memoryDB + else: + db = sqlite.connect(self.dbFile, timeout = 60) + + sql = db.cursor() + sql.execute(stmt) + db.commit() + sql.close() + if not useMemory: + db.close() + + + def writeBatchDB(self, stmt, stmtArray, useMemory=False): + if useMemory: + db = self.memoryDB + else: + db = sqlite.connect(self.dbFile, timeout = 60) + + sql = db.cursor() + try: + sql.executemany(stmt, stmtArray) + except: + print "writeBatchDB: problem with %s" % stmt + + db.commit() + sql.close() + if not useMemory: + db.close() + + +def processSql(sqlFile, keyFields={}): + """ process a UCSC formatted .sql file to identify the table name and the position of key fields. Specifying the + name of important fields in keyFields is highly recommended. A key in keyFields represents the sql column + name in the file, while the corresponding value corresponds to the feature name. Blank values mean that the + same field name will be used. For example, one possible keyFields dict would be: + keyFields = {'chromStart':'start', 'chromEnd':'stop', 'chrom':'', 'name':'', 'score':'value', 'strand':'orientation'} + """ + fields = {} + infile = open(sqlFile) + line = "" + while "CREATE TABLE" not in line: + line = infile.readline() + continue + + tabFields = line.split() + tabName = tabFields[2] + if keyFields == {}: + fields["chrom"] = 1 + fields["start"] = 2 + fields["stop"] = 3 + fields["name"] = 4 + else: + index = 0 + for line in infile: + lineFields = line.split() + if lineFields[0] in keyFields.keys(): + if keyFields[lineFields[0]] == "": + outfield = lineFields[0] + else: + outfield = keyFields[lineFields[0]] + + fields[outfield] = index + + index += 1 + + infile.close() + + return (tabName, fields) + + +def processTrack(genomeObject, dataFile, typeName="MISC", dataFields={}, version="1"): + """ process data for a UCSC track, given an instantiated genome object, the data, the name for the feature + type in sequence_features, the dataFields in the format returned by processSql(), and a version. + If genomeObject is the empty string '', then processTrack() will simply print out the aggregate records, + rather than use addFeatureEntryBatch() to record the added features. + + Note that numberic values are overloaded into the name field using @ as a delimiter. + """ + records = [] + if dataFields == {}: + dataFields["chrom"] = 1 + dataFields["start"] = 2 + dataFields["stop"] = 3 + dataFields["name"] = 4 + + infile = open(dataFile) + for line in infile: + fields = line.strip().split("\t") + if "name" in dataFields: + recName = fields[dataFields["name"]] + else: + recName = typeName + + if "value" in dataFields: + recName += "@" + fields[dataFields["value"]] + + start = int(fields[dataFields["start"]]) + stop = int(fields[dataFields["stop"]]) + chrom = fields[dataFields["chrom"]][3:] + chrom.replace("_random", "rand") + orientation = "F" + if "orientation" in dataFields: + if fields[dataFields["orientation"]] == "-": + orientation = "R" + + if "version" in dataFields: + version = fields[dataFields["version"]] + + records.append((("placeholder", recName), version, chrom, start, stop, orientation, typeName.upper())) + + if genomeObject != "": + genomeObject.addFeatureEntryBatch(records) + else: + print records + + +# Voodoo to get recursive imports working +for genome in supportedGenomes: + importString = "import %s" % genome + exec importString + geneDB[genome] = eval("%s.geneDB" % genome) \ No newline at end of file