X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fcore%2Fhomology.py;fp=cistematic%2Fcore%2Fhomology.py;h=079875326b1e7b51e4c0d124bb39321f1cc52610;hp=0000000000000000000000000000000000000000;hb=bc30aca13e5ec397c92e67002fbf7a103130b828;hpb=0d3e3112fd04c2e6b44a25cacef1d591658ad181 diff --git a/cistematic/core/homology.py b/cistematic/core/homology.py new file mode 100644 index 0000000..0798753 --- /dev/null +++ b/cistematic/core/homology.py @@ -0,0 +1,222 @@ +########################################################################### +# # +# C O P Y R I G H T N O T I C E # +# Copyright (c) 2003-10 by: # +# * California Institute of Technology # +# # +# All Rights Reserved. # +# # +# Permission is hereby granted, free of charge, to any person # +# obtaining a copy of this software and associated documentation files # +# (the "Software"), to deal in the Software without restriction, # +# including without limitation the rights to use, copy, modify, merge, # +# publish, distribute, sublicense, and/or sell copies of the Software, # +# and to permit persons to whom the Software is furnished to do so, # +# subject to the following conditions: # +# # +# The above copyright notice and this permission notice shall be # +# included in all copies or substantial portions of the Software. # +# # +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS # +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN # +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # +# SOFTWARE. # +########################################################################### +# +try: + from pysqlite2 import dbapi2 as sqlite +except: + from sqlite3 import dbapi2 as sqlite + +import tempfile, shutil, os +from os import environ + +if environ.get("CISTEMATIC_ROOT"): + cisRoot = environ.get("CISTEMATIC_ROOT") +else: + cisRoot = "/proj/genome" + +from cistematic.core.geneinfo import speciesMap + +dbPath = "%s/db/homologene.db" % cisRoot +homologeneGenomes = ["hsapiens", "mmusculus", "rnorvegicus", "celegans", + "cbriggsae", "cremanei", "dmelanogaster", "athaliana", + "ggallus", "cfamiliaris", "drerio", "scerevisiae"] + +if environ.get("CISTEMATIC_TEMP"): + cisTemp = environ.get("CISTEMATIC_TEMP") +else: + cisTemp = "/tmp" +tempfile.tempdir = cisTemp + + +class homologyDB: + """ The homologyDB class allows for the mapping and return of predefined homology relationships. + """ + startingGenome = "" + targetGenomes = [] + homologousGenes = {} + cachedDB = "" + + + def __init__(self, tGenomes=[], cache=False): + """ initialize the homologyDB object with a target genome and cache database, if desired. + """ + self.targetGenomes = tGenomes + if cache: + self.cacheDB() + + + def __del__(self): + """ cleanup copy in local cache, if present. + """ + if self.cachedDB != "": + self.uncacheDB() + + + def cacheDB(self): + """ copy homologyDB to a local cache. + """ + self.cachedDB = "%s.db" % tempfile.mktemp() + shutil.copyfile(dbPath, self.cachedDB) + + + def uncacheDB(self): + """ delete homologyDB from local cache. + """ + global cachedDB + if self.cachedDB != "": + try: + os.remove(self.cachedDB) + except: + print "could not delete %s" % self.cachedDB + + self.cachedDB = "" + + + def connectDB(self): + """ return a handle to the database. + """ + path = dbPath + if self.cachedDB != "": + path = self.cachedDB + + return sqlite.connect(path, timeout=60) + + + def loadDB(self): + """ deprecated. + """ + pass + + + def getHomologousGenes(self, geneID): + """ return list of geneIDs homologous to given geneID. Limit to target genomes if specified at initialization. + """ + db = self.connectDB() + cursor = db.cursor() + results = [] + (gen, gid) = geneID + cursor.execute("select homoloID from homolog where genome = :gen and gID = :gid", locals()) + groups = cursor.fetchall() + for hIDentry in groups: + homoloID = str(hIDentry[0]) + cursor.execute("select genome, gID from homolog where homoloID = :homoloID ", locals()) + genes = cursor.fetchall() + for gene in genes: + if gene != geneID: + (genome, gID) = gene + if len(self.targetGenomes) > 0: + if genome not in self.targetGenomes: + pass + else: + results.append((str(genome), str(gID))) + else: + results.append((str(genome), str(gID))) + + cursor.close() + db.close() + + return results + + +def buildHomologeneDB(hFile="homologene.data", hdb=dbPath): + """ Populate a new homologyDB database with homology relationships from homologene. + """ + inFile = open(hFile) + db = sqlite.connect(hdb) + cursor = db.cursor() + cursor.execute("create table homolog(ID INTEGER PRIMARY KEY, homoloID varchar, genome varchar, gID varchar)") + for line in inFile: + doInsert = False + sqlstmt = "INSERT into homolog(ID, homoloID, genome, gID) values (NULL, ?, ?, ?)" + field = line.split("\t") + if field[1] in speciesMap: + gid = field[2] + if speciesMap[field[1]] == "arabidopsis": + gid = field[3].upper() + + values = ("homologene-%s" % field[0], speciesMap[field[1]], gid.strip()) + doInsert = True + + if doInsert: + cursor.execute(sqlstmt, values) + + inFile.close() + sqlstmt = "CREATE INDEX idx1 on homolog(genome, gID)" + cursor.execute(sqlstmt) + sqlstmt = "CREATE INDEX idx2 on homolog(homoloID)" + cursor.execute(sqlstmt) + db.commit() + cursor.close() + db.close() + + +def addHomologs(genomeA, genomeB, entries, hdb=dbPath): + """ Specify homology relationships between geneIDs to be inserted into homology database. + The entries list contains doubles of the form (gIDa, gIDb) from genome A and genome B, respectively. + """ + mapping = {} + for (geneID1, geneID2) in entries: + mapping[geneID1] = geneID2 + + if len(mapping) < 1: + return + + db = sqlite.connect(hdb) + sql = db.cursor() + sql.execute('select * from homolog where genome = "%s" ' % genomeA) + results = sql.fetchall() + + stmt = "insert into homolog(ID, homoloID, genome, gID) values (NULL, ?, ?, ?) " + stmtArray = [] + + for entry in results: + (rowID, hID, genome, gID) = entry + if gID in mapping: + stmtArray.append((hID, genomeB, mapping[gID])) + del mapping[gID] + + topHID = 0 + if len(stmtArray) > 0: + print "Updating %d entries in homolog table" % len(stmtArray) + sql.executemany(stmt, stmtArray) + + stmtArray = [] + for gID in mapping: + topHID += 1 + homologID = "%s-%s-%s" % (genomeA, genomeB, str(topHID)) + stmtArray.append((homologID, genomeA, gID)) + stmtArray.append((homologID, genomeB, mapping[gID])) + + if len(mapping) > 0: + print "Adding %d new homology entries" % len(mapping) + sql.executemany(stmt, stmtArray) + + db.commit() + sql.close() + db.close() \ No newline at end of file