1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Drosophila melanogaster
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/D_melanogaster/dmelanogaster.genedb" % cisRoot
70 def loadChromosome(db, chromID, chromPath, chromOut):
72 dmGenome = Genome("dmelanogaster", dbFile=db)
73 inFile = open(chromPath, "r")
74 line = inFile.readline()
76 seqArray.append(line.strip())
78 seq = string.join(seqArray, "")
81 print "Problems reading sequence from file"
83 print "writing to file %s" % chromOut
84 outFile = open("%s%s" % (cisRoot, chromOut), "w")
87 dmGenome.addChromosomeEntry(chromID, chromOut, "file")
90 def loadGeneEntries(db, gFile):
92 dmGenome = Genome("dmelanogaster", dbFile=db)
93 geneFile = open(gFile, "r")
96 cols = line.split("\t")
97 name = cols[1].split("-R")
108 geneID = ("dmelanogaster", gid)
110 gidVersion = version[name[1]]
114 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
116 print "Adding %d gene entries" % len(geneEntries)
117 dmGenome.addGeneEntryBatch(geneEntries)
120 def loadGeneFeatures(db, gfile):
121 geneFile = open(gfile, "r")
122 senseArray = {"+": "F",
128 for geneLine in geneFile:
129 geneFields = geneLine.split("\t")
130 exonNum = int(geneFields[8])
131 exonStarts = geneFields[9].split(",")
132 exonStops = geneFields[10].split(",")
133 chrom = geneFields[2][3:]
134 sense = senseArray[geneFields[3]]
135 gstop = int(geneFields[7]) - 1
136 gstart = int(geneFields[6]) - 1
137 name = geneFields[1].split("-R")
138 geneID = ("dmelanogaster", name[0])
140 gidVersion = version[name[1]]
144 for index in range(exonNum):
145 estart = int(exonStarts[index]) - 1
146 estop = int(exonStops[index]) - 1
147 if estart >= gstart and estop <= gstop:
148 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
149 elif estop <= gstart:
155 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType))
156 elif estart >= gstop:
162 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType))
163 elif estart <= gstop and estart > gstart:
169 insertArray.append((geneID, gidVersion, chrom, estart, gstop, sense, "CDS"))
170 insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop, sense, fType))
171 elif estart < gstart and estop <= gstop:
177 insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType))
178 insertArray.append((geneID, gidVersion, chrom, gstart, estop, sense, "CDS"))
187 insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType1))
188 insertArray.append((geneID, gidVersion, chrom, gstart, gstop, sense, "CDS"))
189 insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop - 1, sense, fType2))
192 dmGenome = Genome("dmelanogaster", dbFile=db)
193 print "Adding %d features" % len(insertArray)
194 dmGenome.addFeatureEntryBatch(insertArray)
197 def loadGeneAnnotations(db, annotPath):
199 annotFile = open(annotPath, "r")
200 dmGenome = Genome("dmelanogaster", dbFile=db)
201 for line in annotFile:
203 cols = line.split("\t")
204 if cols[0] != "7227":
213 geneAnnotations.append((("dmelanogaster", locID), string.replace(geneDesc.strip(), "'", "p")))
217 print "Adding %d annotations" % len(geneAnnotations)
218 dmGenome.addAnnotationBatch(geneAnnotations)
221 def loadGeneOntology(db, goPath, goDefPath, annotPath):
222 dmGenome = Genome("dmelanogaster", dbFile=db)
223 goDefFile = open(goDefPath, "r")
224 goFile = open(goPath, "r")
225 annotFile = open(annotPath, "r")
226 annotEntries = annotFile.readlines()
228 goDefEntries = goDefFile.readlines()
232 for goDefEntry in goDefEntries:
233 if goDefEntry[0] != "!":
234 cols = goDefEntry.split("\t")
235 goDefs[cols[0]] = (cols[1], cols[2].strip())
237 goEntries = goFile.readlines()
238 for annotEntry in annotEntries:
240 cols = annotEntry.split("\t")
241 if cols[0] != "7227":
244 locID = cols[3].strip()
245 geneName = cols[1].strip()
247 locus[geneName] = locID
251 for entry in goEntries:
255 if entry[:4] != "7227":
259 fields = entry.split("\t")
260 geneName = fields[1].strip()
261 locID = locus[geneName]
266 goArray.append((("dmelanogaster", locID), GOID, "", geneName, "", string.replace(goDefs[GOID][0], "'", "p"), goDefs[GOID][1], ""))
270 print "adding %d go entries" % len(goArray)
271 dmGenome.addGoInfoBatch(goArray)
274 def createDBFile(db):
275 dmGenome = Genome("dmelanogaster", dbFile=db)
276 dmGenome.createGeneDB(db)
279 def createDBindices(db):
280 dmGenome = Genome("dmelanogaster", dbFile=db)
281 dmGenome.createIndices()
284 def buildDmelanogasterDB(db=geneDB):
285 """ genes and annotations are from UCSC. GO association file is from geneontology.org.
287 genePath = "%s/download/flyBaseGene.txt" % cisRoot
288 annotPath = "%s/download/gene_info" % cisRoot
289 goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
290 goPath = "%s/download/gene2go" % cisRoot
291 chromos = {"2L": "%s/download/chr2L.fa" % cisRoot,
292 "2R": "%s/download/chr2R.fa" % cisRoot,
293 "2LHet": "%s/download/chr2LHet.fa" % cisRoot,
294 "2RHet": "%s/download/chr2RHet.fa" % cisRoot,
295 "3L": "%s/download/chr3L.fa" % cisRoot,
296 "3LHet": "%s/download/chr3LHet.fa" % cisRoot,
297 "3R": "%s/download/chr3R.fa" % cisRoot,
298 "3RHet": "%s/download/chr3RHet.fa" % cisRoot,
299 "4": "%s/download/chr4.fa" % cisRoot,
300 "X": "%s/download/chrX.fa" % cisRoot,
301 "XHet": "%s/download/chrXHet.fa" % cisRoot,
302 "YHet": "%s/download/chrYHet.fa" % cisRoot,
303 "U": "%s/download/chrU.fa" % cisRoot,
304 "Uextra": "%s/download/chrUextra.fa" % cisRoot,
305 "M": "%s/download/chrM.fa" % cisRoot
308 print "Creating database %s" % db
311 print "Adding gene entries"
312 loadGeneEntries(db, genePath)
314 print "Adding gene features"
315 loadGeneFeatures(db, genePath)
317 print "Adding gene annotations"
318 loadGeneAnnotations(db, annotPath)
320 print "Adding gene ontology"
321 loadGeneOntology(db, goPath, goDefPath, annotPath)
323 for chromID in chromos.keys():
324 print "Loading chromosome %s" % chromID
325 loadChromosome(db, chromID, chromos[chromID], "/D_melanogaster/chromo%s.bin" % chromID)
327 print "Creating Indices"
330 print "Finished creating database %s" % db