erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / core / homology.py
diff --git a/cistematic/core/homology.py b/cistematic/core/homology.py
new file mode 100644 (file)
index 0000000..0798753
--- /dev/null
@@ -0,0 +1,222 @@
+###########################################################################
+#                                                                         #
+# C O P Y R I G H T   N O T I C E                                         #
+#  Copyright (c) 2003-10 by:                                              #
+#    * California Institute of Technology                                 #
+#                                                                         #
+#    All Rights Reserved.                                                 #
+#                                                                         #
+# Permission is hereby granted, free of charge, to any person             #
+# obtaining a copy of this software and associated documentation files    #
+# (the "Software"), to deal in the Software without restriction,          #
+# including without limitation the rights to use, copy, modify, merge,    #
+# publish, distribute, sublicense, and/or sell copies of the Software,    #
+# and to permit persons to whom the Software is furnished to do so,       #
+# subject to the following conditions:                                    #
+#                                                                         #
+# The above copyright notice and this permission notice shall be          #
+# included in all copies or substantial portions of the Software.         #
+#                                                                         #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
+# SOFTWARE.                                                               #
+###########################################################################
+#
+try:
+    from pysqlite2 import dbapi2 as sqlite
+except:
+    from sqlite3 import dbapi2 as sqlite
+
+import tempfile, shutil, os
+from os import environ
+
+if environ.get("CISTEMATIC_ROOT"):
+    cisRoot = environ.get("CISTEMATIC_ROOT") 
+else:
+    cisRoot = "/proj/genome"
+
+from cistematic.core.geneinfo import speciesMap
+
+dbPath = "%s/db/homologene.db" % cisRoot
+homologeneGenomes = ["hsapiens", "mmusculus", "rnorvegicus", "celegans",
+                     "cbriggsae", "cremanei", "dmelanogaster", "athaliana",
+                     "ggallus", "cfamiliaris", "drerio", "scerevisiae"]
+
+if environ.get("CISTEMATIC_TEMP"):
+    cisTemp = environ.get("CISTEMATIC_TEMP")
+else:
+    cisTemp = "/tmp"
+tempfile.tempdir = cisTemp
+
+
+class homologyDB:
+    """ The homologyDB class allows for the mapping and return of predefined homology relationships.
+    """
+    startingGenome = ""
+    targetGenomes  = []
+    homologousGenes = {}    
+    cachedDB = ""
+
+
+    def __init__(self, tGenomes=[], cache=False):
+        """ initialize the homologyDB object with a target genome and cache database, if desired. 
+        """
+        self.targetGenomes = tGenomes
+        if cache:
+            self.cacheDB()
+
+
+    def __del__(self):
+        """ cleanup copy in local cache, if present. 
+        """
+        if self.cachedDB != "":
+            self.uncacheDB()
+
+
+    def cacheDB(self):
+        """ copy homologyDB to a local cache. 
+        """
+        self.cachedDB = "%s.db" % tempfile.mktemp()
+        shutil.copyfile(dbPath, self.cachedDB)
+
+
+    def uncacheDB(self):
+        """ delete homologyDB from local cache. 
+        """
+        global cachedDB
+        if self.cachedDB != "":
+            try:
+                os.remove(self.cachedDB)
+            except:
+                print "could not delete %s" % self.cachedDB
+
+            self.cachedDB = ""
+
+
+    def connectDB(self):
+        """ return a handle to the database. 
+        """
+        path = dbPath
+        if self.cachedDB != "":
+            path = self.cachedDB
+
+        return sqlite.connect(path, timeout=60)
+
+
+    def loadDB(self):
+        """ deprecated. 
+        """
+        pass
+
+
+    def getHomologousGenes(self, geneID):
+        """ return list of geneIDs homologous to given geneID. Limit to target genomes if specified at initialization. 
+        """
+        db = self.connectDB()
+        cursor = db.cursor()
+        results = []
+        (gen, gid) = geneID
+        cursor.execute("select homoloID from homolog where genome = :gen and gID = :gid", locals())
+        groups = cursor.fetchall()
+        for hIDentry in groups:
+            homoloID = str(hIDentry[0])
+            cursor.execute("select genome, gID from homolog where homoloID = :homoloID ", locals())
+            genes = cursor.fetchall()
+            for gene in genes:
+                if gene != geneID:
+                    (genome, gID) = gene
+                    if len(self.targetGenomes) > 0:
+                        if genome not in self.targetGenomes:
+                            pass
+                        else:
+                            results.append((str(genome), str(gID)))
+                    else:
+                        results.append((str(genome), str(gID)))
+
+            cursor.close()
+            db.close()
+
+        return results
+
+
+def buildHomologeneDB(hFile="homologene.data", hdb=dbPath):
+    """ Populate a new homologyDB database with homology relationships from homologene.
+    """
+    inFile = open(hFile)
+    db = sqlite.connect(hdb)
+    cursor = db.cursor()
+    cursor.execute("create table homolog(ID INTEGER PRIMARY KEY, homoloID varchar, genome varchar, gID varchar)")
+    for line in inFile:
+        doInsert = False
+        sqlstmt = "INSERT into homolog(ID, homoloID, genome, gID) values (NULL, ?, ?, ?)"
+        field = line.split("\t")
+        if field[1] in speciesMap:
+            gid = field[2]
+            if speciesMap[field[1]] == "arabidopsis":
+                gid = field[3].upper()
+
+            values =  ("homologene-%s" % field[0], speciesMap[field[1]], gid.strip())
+            doInsert = True
+
+        if doInsert:
+            cursor.execute(sqlstmt, values)
+
+    inFile.close()
+    sqlstmt = "CREATE INDEX idx1 on homolog(genome, gID)"
+    cursor.execute(sqlstmt)
+    sqlstmt = "CREATE INDEX idx2 on homolog(homoloID)"
+    cursor.execute(sqlstmt)
+    db.commit()
+    cursor.close()
+    db.close()
+
+
+def addHomologs(genomeA, genomeB, entries, hdb=dbPath):
+    """ Specify homology relationships between geneIDs to be inserted into homology database.
+        The entries list contains doubles of the form (gIDa, gIDb) from genome A and genome B, respectively.
+    """
+    mapping = {}
+    for (geneID1, geneID2) in entries:
+        mapping[geneID1] = geneID2
+
+    if len(mapping) < 1:
+        return
+
+    db = sqlite.connect(hdb)
+    sql = db.cursor()
+    sql.execute('select * from homolog where genome = "%s" ' % genomeA)
+    results = sql.fetchall()
+
+    stmt = "insert into homolog(ID, homoloID, genome, gID) values (NULL, ?, ?, ?) "
+    stmtArray = []
+
+    for entry in results:
+        (rowID, hID, genome, gID) = entry
+        if gID in mapping:
+            stmtArray.append((hID, genomeB, mapping[gID]))
+            del mapping[gID]
+
+    topHID = 0
+    if len(stmtArray) > 0:
+        print "Updating %d entries in homolog table" % len(stmtArray)
+        sql.executemany(stmt, stmtArray)
+
+    stmtArray = []
+    for gID in mapping:
+        topHID += 1
+        homologID = "%s-%s-%s" % (genomeA, genomeB, str(topHID))
+        stmtArray.append((homologID, genomeA, gID))
+        stmtArray.append((homologID, genomeB, mapping[gID]))
+
+    if len(mapping) > 0:
+        print "Adding %d new homology entries" % len(mapping)
+        sql.executemany(stmt, stmtArray)
+
+    db.commit()
+    sql.close()
+    db.close()
\ No newline at end of file