1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-13 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Drosophila melanogaster
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/D_melanogaster/dmelanogaster.genedb" % cisRoot
71 def buildDmelanogasterDB(db=geneDB):
72 """ genes and annotations are from UCSC. GO association file is from geneontology.org.
74 genePath = "%s/download/flyBaseGene.txt" % cisRoot
75 annotPath = "%s/download/gene_info" % cisRoot
76 goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
77 goPath = "%s/download/gene2go" % cisRoot
79 print "Creating database %s" % db
82 print "Adding gene entries"
83 loadGeneEntries(db, genePath)
85 print "Adding gene features"
86 loadGeneFeatures(db, genePath)
88 print "Adding gene annotations"
89 loadGeneAnnotations(db, annotPath)
91 print "Adding gene ontology"
92 loadGeneOntology(db, goPath, goDefPath, annotPath)
94 chromList = ["2L", "2R", "2LHet", "2RHet", "3L", "3R", "3LHet", "3RHet",
95 "4", "X", "XHet", "YHet", "U", "UExtra", "M"
97 for chromID in chromList:
98 print "Loading chromosome %s" % chromID
99 chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID)
100 loadChromosome(db, chromID, chromPath, "/D_melanogaster/chromo%s.bin" % chromID)
102 print "Creating Indices"
105 print "Finished creating database %s" % db
108 def createDBFile(db):
109 dmGenome = Genome("dmelanogaster", dbFile=db)
110 dmGenome.createGeneDB(db)
113 def loadGeneEntries(db, gFile):
115 dmGenome = Genome("dmelanogaster", dbFile=db)
116 geneFile = open(gFile, "r")
118 for line in geneFile:
119 cols = line.split("\t")
120 name = cols[1].split("-R")
131 geneID = ("dmelanogaster", gid)
133 gidVersion = version[name[1]]
137 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
139 print "Adding %d gene entries" % len(geneEntries)
140 dmGenome.addGeneEntryBatch(geneEntries)
143 def loadGeneFeatures(db, gfile):
144 geneFile = open(gfile, "r")
145 senseArray = {"+": "F",
151 for geneLine in geneFile:
152 geneFields = geneLine.split("\t")
153 exonNum = int(geneFields[8])
154 exonStarts = geneFields[9].split(",")
155 exonStops = geneFields[10].split(",")
156 chrom = geneFields[2][3:]
157 sense = senseArray[geneFields[3]]
158 gstop = int(geneFields[7]) - 1
159 gstart = int(geneFields[6]) - 1
160 name = geneFields[1].split("-R")
161 geneID = ("dmelanogaster", name[0])
163 gidVersion = version[name[1]]
167 for index in range(exonNum):
168 estart = int(exonStarts[index]) - 1
169 estop = int(exonStops[index]) - 1
170 if estart >= gstart and estop <= gstop:
171 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
172 elif estop <= gstart:
178 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType))
179 elif estart >= gstop:
185 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType))
186 elif estart <= gstop and estart > gstart:
192 insertArray.append((geneID, gidVersion, chrom, estart, gstop, sense, "CDS"))
193 insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop, sense, fType))
194 elif estart < gstart and estop <= gstop:
200 insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType))
201 insertArray.append((geneID, gidVersion, chrom, gstart, estop, sense, "CDS"))
210 insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType1))
211 insertArray.append((geneID, gidVersion, chrom, gstart, gstop, sense, "CDS"))
212 insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop - 1, sense, fType2))
215 dmGenome = Genome("dmelanogaster", dbFile=db)
216 print "Adding %d features" % len(insertArray)
217 dmGenome.addFeatureEntryBatch(insertArray)
220 def loadGeneAnnotations(db, annotPath):
222 annotFile = open(annotPath, "r")
223 dmGenome = Genome("dmelanogaster", dbFile=db)
224 for line in annotFile:
226 cols = line.split("\t")
227 if cols[0] != "7227":
236 geneAnnotations.append((("dmelanogaster", locID), string.replace(geneDesc.strip(), "'", "p")))
240 print "Adding %d annotations" % len(geneAnnotations)
241 dmGenome.addAnnotationBatch(geneAnnotations)
244 def loadGeneOntology(db, goPath, goDefPath, annotPath):
245 dmGenome = Genome("dmelanogaster", dbFile=db)
246 goDefFile = open(goDefPath, "r")
247 goFile = open(goPath, "r")
248 annotFile = open(annotPath, "r")
249 annotEntries = annotFile.readlines()
251 goDefEntries = goDefFile.readlines()
255 for goDefEntry in goDefEntries:
256 if goDefEntry[0] != "!":
257 cols = goDefEntry.split("\t")
258 goDefs[cols[0]] = (cols[1], cols[2].strip())
260 goEntries = goFile.readlines()
261 for annotEntry in annotEntries:
263 cols = annotEntry.split("\t")
264 if cols[0] != "7227":
267 locID = cols[3].strip()
268 geneName = cols[1].strip()
270 locus[geneName] = locID
274 for entry in goEntries:
278 if entry[:4] != "7227":
282 fields = entry.split("\t")
283 geneName = fields[1].strip()
284 locID = locus[geneName]
289 goArray.append((("dmelanogaster", locID), GOID, "", geneName, "", string.replace(goDefs[GOID][0], "'", "p"), goDefs[GOID][1], ""))
293 print "adding %d go entries" % len(goArray)
294 dmGenome.addGoInfoBatch(goArray)
297 def loadChromosome(db, chromID, chromPath, chromOut):
299 dmGenome = Genome("dmelanogaster", dbFile=db)
300 inFile = open(chromPath, "r")
301 line = inFile.readline()
303 seqArray.append(line.strip())
305 seq = string.join(seqArray, "")
308 print "Problems reading sequence from file"
310 print "writing to file %s" % chromOut
311 outFile = open("%s%s" % (cisRoot, chromOut), "w")
314 dmGenome.addChromosomeEntry(chromID, chromOut, "file")
317 def createDBindices(db):
318 dmGenome = Genome("dmelanogaster", dbFile=db)
319 dmGenome.createIndices()