erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / core / homology.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 try:
31     from pysqlite2 import dbapi2 as sqlite
32 except:
33     from sqlite3 import dbapi2 as sqlite
34
35 import tempfile, shutil, os
36 from os import environ
37
38 if environ.get("CISTEMATIC_ROOT"):
39     cisRoot = environ.get("CISTEMATIC_ROOT") 
40 else:
41     cisRoot = "/proj/genome"
42
43 from cistematic.core.geneinfo import speciesMap
44
45 dbPath = "%s/db/homologene.db" % cisRoot
46 homologeneGenomes = ["hsapiens", "mmusculus", "rnorvegicus", "celegans",
47                      "cbriggsae", "cremanei", "dmelanogaster", "athaliana",
48                      "ggallus", "cfamiliaris", "drerio", "scerevisiae"]
49
50 if environ.get("CISTEMATIC_TEMP"):
51     cisTemp = environ.get("CISTEMATIC_TEMP")
52 else:
53     cisTemp = "/tmp"
54 tempfile.tempdir = cisTemp
55
56
57 class homologyDB:
58     """ The homologyDB class allows for the mapping and return of predefined homology relationships.
59     """
60     startingGenome = ""
61     targetGenomes  = []
62     homologousGenes = {}    
63     cachedDB = ""
64
65
66     def __init__(self, tGenomes=[], cache=False):
67         """ initialize the homologyDB object with a target genome and cache database, if desired. 
68         """
69         self.targetGenomes = tGenomes
70         if cache:
71             self.cacheDB()
72
73
74     def __del__(self):
75         """ cleanup copy in local cache, if present. 
76         """
77         if self.cachedDB != "":
78             self.uncacheDB()
79
80
81     def cacheDB(self):
82         """ copy homologyDB to a local cache. 
83         """
84         self.cachedDB = "%s.db" % tempfile.mktemp()
85         shutil.copyfile(dbPath, self.cachedDB)
86
87
88     def uncacheDB(self):
89         """ delete homologyDB from local cache. 
90         """
91         global cachedDB
92         if self.cachedDB != "":
93             try:
94                 os.remove(self.cachedDB)
95             except:
96                 print "could not delete %s" % self.cachedDB
97
98             self.cachedDB = ""
99
100
101     def connectDB(self):
102         """ return a handle to the database. 
103         """
104         path = dbPath
105         if self.cachedDB != "":
106             path = self.cachedDB
107
108         return sqlite.connect(path, timeout=60)
109
110
111     def loadDB(self):
112         """ deprecated. 
113         """
114         pass
115
116
117     def getHomologousGenes(self, geneID):
118         """ return list of geneIDs homologous to given geneID. Limit to target genomes if specified at initialization. 
119         """
120         db = self.connectDB()
121         cursor = db.cursor()
122         results = []
123         (gen, gid) = geneID
124         cursor.execute("select homoloID from homolog where genome = :gen and gID = :gid", locals())
125         groups = cursor.fetchall()
126         for hIDentry in groups:
127             homoloID = str(hIDentry[0])
128             cursor.execute("select genome, gID from homolog where homoloID = :homoloID ", locals())
129             genes = cursor.fetchall()
130             for gene in genes:
131                 if gene != geneID:
132                     (genome, gID) = gene
133                     if len(self.targetGenomes) > 0:
134                         if genome not in self.targetGenomes:
135                             pass
136                         else:
137                             results.append((str(genome), str(gID)))
138                     else:
139                         results.append((str(genome), str(gID)))
140
141             cursor.close()
142             db.close()
143
144         return results
145
146
147 def buildHomologeneDB(hFile="homologene.data", hdb=dbPath):
148     """ Populate a new homologyDB database with homology relationships from homologene.
149     """
150     inFile = open(hFile)
151     db = sqlite.connect(hdb)
152     cursor = db.cursor()
153     cursor.execute("create table homolog(ID INTEGER PRIMARY KEY, homoloID varchar, genome varchar, gID varchar)")
154     for line in inFile:
155         doInsert = False
156         sqlstmt = "INSERT into homolog(ID, homoloID, genome, gID) values (NULL, ?, ?, ?)"
157         field = line.split("\t")
158         if field[1] in speciesMap:
159             gid = field[2]
160             if speciesMap[field[1]] == "arabidopsis":
161                 gid = field[3].upper()
162
163             values =  ("homologene-%s" % field[0], speciesMap[field[1]], gid.strip())
164             doInsert = True
165
166         if doInsert:
167             cursor.execute(sqlstmt, values)
168
169     inFile.close()
170     sqlstmt = "CREATE INDEX idx1 on homolog(genome, gID)"
171     cursor.execute(sqlstmt)
172     sqlstmt = "CREATE INDEX idx2 on homolog(homoloID)"
173     cursor.execute(sqlstmt)
174     db.commit()
175     cursor.close()
176     db.close()
177
178
179 def addHomologs(genomeA, genomeB, entries, hdb=dbPath):
180     """ Specify homology relationships between geneIDs to be inserted into homology database.
181         The entries list contains doubles of the form (gIDa, gIDb) from genome A and genome B, respectively.
182     """
183     mapping = {}
184     for (geneID1, geneID2) in entries:
185         mapping[geneID1] = geneID2
186
187     if len(mapping) < 1:
188         return
189
190     db = sqlite.connect(hdb)
191     sql = db.cursor()
192     sql.execute('select * from homolog where genome = "%s" ' % genomeA)
193     results = sql.fetchall()
194
195     stmt = "insert into homolog(ID, homoloID, genome, gID) values (NULL, ?, ?, ?) "
196     stmtArray = []
197
198     for entry in results:
199         (rowID, hID, genome, gID) = entry
200         if gID in mapping:
201             stmtArray.append((hID, genomeB, mapping[gID]))
202             del mapping[gID]
203
204     topHID = 0
205     if len(stmtArray) > 0:
206         print "Updating %d entries in homolog table" % len(stmtArray)
207         sql.executemany(stmt, stmtArray)
208
209     stmtArray = []
210     for gID in mapping:
211         topHID += 1
212         homologID = "%s-%s-%s" % (genomeA, genomeB, str(topHID))
213         stmtArray.append((homologID, genomeA, gID))
214         stmtArray.append((homologID, genomeB, mapping[gID]))
215
216     if len(mapping) > 0:
217         print "Adding %d new homology entries" % len(mapping)
218         sql.executemany(stmt, stmtArray)
219
220     db.commit()
221     sql.close()
222     db.close()