###########################################################################
# C O P Y R I G H T N O T I C E #
# Copyright (c) 2003-10 by: #
# * California Institute of Technology #
# All Rights Reserved. #
# Permission is hereby granted, free of charge, to any person #
# obtaining a copy of this software and associated documentation files #
# (the "Software"), to deal in the Software without restriction, #
# including without limitation the rights to use, copy, modify, merge, #
# publish, distribute, sublicense, and/or sell copies of the Software, #
# and to permit persons to whom the Software is furnished to do so, #
# subject to the following conditions: #
# The above copyright notice and this permission notice shall be #
# included in all copies or substantial portions of the Software. #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
# SOFTWARE. #
###########################################################################
# Prefer the standalone pysqlite2 package when it is installed and fall back
# to the sqlite3 module from the standard library otherwise.
try:
    from pysqlite2 import dbapi2 as sqlite
except ImportError:
    from sqlite3 import dbapi2 as sqlite

import os
import shutil
import tempfile
from os import environ
# Root of the Cistematic installation. CISTEMATIC_ROOT overrides the built-in
# default whenever it is set to a non-empty value.
cisRoot = environ.get("CISTEMATIC_ROOT") or "/proj/genome"

# Location of the gene_info SQLite database below that root.
dbPath = "%s/db/gene_info.db" % cisRoot

# Directory used for temporary files, including cached database copies.
# NOTE(review): the fallback applied when CISTEMATIC_TEMP is unset was not
# visible in the reviewed text; the platform default is used here - confirm.
cisTemp = environ.get("CISTEMATIC_TEMP") or tempfile.gettempdir()

# Make every tempfile call in this process default to that directory.
tempfile.tempdir = cisTemp
# Maps the key found in the first tab-separated column of the NCBI gene data
# file (presumably the NCBI taxonomy id - confirm) to the genome name that is
# stored in the database.
# NOTE(review): the numbering of the reviewed text suggests that further
# entries sat between the ones listed here and that the closing brace was
# lost; only the visible entries are kept - restore the rest from the
# authoritative copy before rebuilding a database.
speciesMap = {
    "3702": "athaliana",
    "4932": "scerevisiae",
    "7227": "dmelanogaster",
    "7668": "spurpuratus",
    "8364": "xtropicalis",
    "9615": "cfamiliaris",
    "10116": "rnorvegicus",
}
# NOTE(review): the class statement and the def lines of __del__, cacheDB,
# uncacheDB and connectDB were missing from the reviewed text. connectDB is
# the name used by the query methods; cacheDB / uncacheDB are reconstructed
# names - confirm them against callers.
class geneinfoDB:
    """ The geneinfoDB class allows for the querying of NCBI gene data.

        A gene is addressed by a geneID pair (genome, gID). Queries run
        against the database at dbPath or, when a cache was requested,
        against a private temporary copy of that file.
    """
    # path of the private cached copy; "" means no cache is in use.
    cachedDB = ""


    def __init__(self, tGenomes=None, cache=False):
        """ initialize the geneinfoDB object with a target genome and cache database, if desired.

            tGenomes: list of genome names, stored as targetGenomes; a new
                      empty list is used when omitted, so instances never
                      share one default list.
            cache:    when true, copy the database to a local temporary
                      file and query that copy.
        """
        if tGenomes is None:
            tGenomes = []

        self.targetGenomes = tGenomes
        self.cachedDB = ""
        if cache:
            self.cacheDB()


    def __del__(self):
        """ cleanup copy in local cache, if present.
        """
        if self.cachedDB != "":
            self.uncacheDB()


    def cacheDB(self):
        """ copy geneinfoDB to a local cache.

            The copy is placed in the tempfile default directory and its
            path is stored in cachedDB. A previous copy is removed first.
            Errors raised while copying are propagated.
        """
        if self.cachedDB != "":
            self.uncacheDB()

        # mkstemp creates the file atomically, which avoids the name race
        # of tempfile.mktemp; only the path is needed, so close the handle.
        (handle, cachePath) = tempfile.mkstemp(suffix=".db")
        os.close(handle)
        try:
            shutil.copyfile(dbPath, cachePath)
        except Exception:
            # do not leave an empty placeholder behind when the copy fails.
            os.remove(cachePath)
            raise

        self.cachedDB = cachePath


    def uncacheDB(self):
        """ delete geneinfoDB from local cache.

            Best effort: a failure to delete is reported on stdout and the
            object goes back to using the main database either way.
        """
        if self.cachedDB != "":
            try:
                os.remove(self.cachedDB)
            except OSError:
                print("could not delete %s" % self.cachedDB)

            self.cachedDB = ""


    def connectDB(self):
        """ return a handle to the database.

            Connects to the cached copy when there is one, otherwise to
            dbPath. The caller is responsible for closing the handle.
        """
        if self.cachedDB != "":
            path = self.cachedDB
        else:
            path = dbPath

        return sqlite.connect(path, timeout=60)


    def _query(self, statement, params, single=False):
        """ run one select on a fresh connection and return its row(s).

            Returns cursor.fetchone() when single is true, otherwise
            cursor.fetchall(). Cursor and connection are released even when
            the statement raises.
        """
        db = self.connectDB()
        try:
            cursor = db.cursor()
            try:
                cursor.execute(statement, params)
                if single:
                    return cursor.fetchone()

                return cursor.fetchall()
            finally:
                cursor.close()
        finally:
            db.close()


    def getGeneInfo(self, geneID):
        """ returns (symbol, locustag, dbxrefs, chromosome, map_location) for a geneID.

            geneID is a (genome, gID) pair. Only the first matching row is
            used and every value is converted with str().
            NOTE(review): the value returned when nothing matches was not
            visible in the reviewed text; an empty list is used - confirm.
        """
        (gen, gid) = geneID
        entry = self._query("select symbol, locustag, dbxrefs, chromosome, map_location from gene_info where genome = :gen and gID = :gid ",
                            {"gen": gen, "gid": gid}, single=True)
        if entry is None:
            return []

        return tuple([str(value) for value in entry])


    def getallGeneInfo(self, genome, infoKey="gid"):
        """ returns a dictionary of one or more (symbol, locustag, dbxrefs, chromosome, map_location) per gID.
            acceptable infoKey arguments are: 'locus', and 'gid'.

            With infoKey 'locus' the dictionary is keyed by locustag and
            the second tuple element holds the gID instead. Any other
            infoKey value behaves like 'gid'.
        """
        rows = self._query("select gid, symbol, locustag, dbxrefs, chromosome, map_location from gene_info where genome = :genome",
                           {"genome": genome})
        resDict = {}
        for (gid, symbol, locustag, dbxrefs, chromosome, map_location) in rows:
            if infoKey == "locus":
                (key, partner) = (str(locustag), str(gid))
            else:
                (key, partner) = (str(gid), str(locustag))

            resDict.setdefault(key, []).append((str(symbol), partner, str(dbxrefs), str(chromosome), str(map_location)))

        return resDict


    def getDescription(self, geneID):
        """ returns a list of one or more gene description for a geneID.

            geneID is a (genome, gID) pair; the list is empty when the gene
            has no description rows.
        """
        (gen, gid) = geneID
        entries = self._query("select description from gene_description where genome = :gen and gID = :gid",
                              {"gen": gen, "gid": gid})

        return [str(entry[0]) for entry in entries]


    def geneIDSynonyms(self, geneID):
        """ returns a list of synonyms for a geneID.

            geneID is a (genome, gID) pair; the list is empty when the gene
            has no synonym rows.
        """
        (gen, gid) = geneID
        entries = self._query("select synonym from gene_synonyms where genome = :gen and gID = :gid",
                              {"gen": gen, "gid": gid})

        return [str(entry[0]) for entry in entries]


    def getGeneID(self, genome, synonym):
        """ returns a geneID given a genome and a synonym.

            The name is looked up as a gene symbol first, then as a
            registered synonym, then as a locus tag; the first hit is
            returned as (genome, gID).
            NOTE(review): the value returned when no lookup matches was not
            visible in the reviewed text; "" is used - confirm.
        """
        lookups = ("select gID from gene_info where genome= :genome and symbol= :synonym",
                   "select gID from gene_synonyms where genome = :genome and synonym = :synonym",
                   "select gID from gene_info where genome = :genome and locustag = :synonym")
        params = {"genome": genome, "synonym": synonym}
        # all lookups share one connection, released on every exit path.
        db = self.connectDB()
        try:
            cursor = db.cursor()
            try:
                for statement in lookups:
                    cursor.execute(statement, params)
                    entry = cursor.fetchone()
                    if entry is not None:
                        return (genome, str(entry[0]))
            finally:
                cursor.close()
        finally:
            db.close()

        return ""
def _registerGene(cursor, genome, field):
    """ insert the gene_info, gene_description and gene_synonyms rows for one parsed line.
    """
    gID = field[1]
    cursor.execute("INSERT into gene_info(ID, genome, gID, symbol, locustag, dbxrefs, chromosome, map_location) values (NULL, ?, ?, ?, ?, ?, ?, ?)",
                   (genome, gID, field[2], field[3], field[5], field[6], field[7]))

    descr = field[8].strip()
    # NOTE(review): the condition guarding this insert was not visible in the
    # reviewed text; empty values and the "-" placeholder (the same marker
    # the synonym loop skips) are left out - confirm.
    if descr not in ("", "-"):
        cursor.execute("INSERT into gene_description(ID, genome, gID, description) values (NULL, ?, ?, ?)",
                       (genome, gID, descr))

    synonymStmt = "INSERT into gene_synonyms(ID, genome, gID, synonym) values (NULL, ?, ?, ?)"
    # every gene is registered as a synonym of its own ID.
    cursor.execute(synonymStmt, (genome, gID, gID.strip()))
    for entry in field[4].split("|"):
        if entry != "-" and entry != gID.strip():
            try:
                cursor.execute(synonymStmt, (genome, gID, entry.strip()))
            except sqlite.OperationalError:
                # one bad synonym must not cost the gene its other synonyms;
                # report it instead of dropping it silently.
                print("could not register synonym %s for %s" % (entry.strip(), gID))


def buildgeneinfoDB(datafile, path=dbPath):
    """ populate geneinfo database from NCBI gene information.

        datafile: tab-separated text file. Columns 0-8 of each line are
                  read as genome key, gID, symbol, locustag, '|'-separated
                  synonyms, dbxrefs, chromosome, map_location and
                  description. Lines whose genome key is not in speciesMap
                  are ignored.
        path:     SQLite file to create the tables in.

        Lines that cannot be stored are reported on stdout and skipped.
        Creating the tables raises sqlite.OperationalError when they
        already exist in the target file.
    """
    inFile = open(datafile, "r")
    try:
        db = sqlite.connect(path)
        try:
            cursor = db.cursor()
            cursor.execute("create table gene_info(ID INTEGER PRIMARY KEY, genome varchar, gID varchar, symbol varchar, locustag varchar, dbxrefs varchar, chromosome varchar, map_location varchar)")
            cursor.execute("create table gene_description(ID INTEGER PRIMARY KEY, genome varchar, gID varchar, description varchar)")
            cursor.execute("create table gene_synonyms(ID INTEGER PRIMARY KEY, genome varchar, gID varchar, synonym varchar)")
            for line in inFile:
                # Values are now bound as parameters, so quotes no longer
                # need escaping; the substitution is kept so that rebuilt
                # databases hold the same strings as earlier builds.
                line = line.replace("'", "prime")
                field = line.split("\t")
                if field[0] not in speciesMap:
                    continue

                if len(field) < 9:
                    # too few columns: skip it rather than abort the build.
                    print("could not register %s" % (line))
                    continue

                try:
                    _registerGene(cursor, speciesMap[field[0]], field)
                except sqlite.OperationalError:
                    print("could not register %s" % (line))

            cursor.execute("create index genIdx1 on gene_info(genome)")
            cursor.execute("create index genIdx2 on gene_description(genome)")
            cursor.execute("create index genIdx3 on gene_synonyms(genome)")
            cursor.execute("create index gIDIdx1 on gene_info(gID)")
            cursor.execute("create index gIDIdx2 on gene_description(gID)")
            cursor.execute("create index gIDIdx3 on gene_synonyms(gID)")
            cursor.execute("create index synIdx on gene_synonyms(synonym)")
            # NOTE(review): no commit was visible in the reviewed text;
            # without one the inserted rows can be lost when the
            # connection is closed.
            db.commit()
        finally:
            db.close()
    finally:
        inFile.close()