1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Caenorhaditis elegans
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/C_elegans/celegans.genedb" % cisRoot
43 def loadChromosome(db, chromID, chromPath, chromOut):
45 ceGenome = Genome("celegans", dbFile=db)
46 inFile = open(chromPath, "r")
47 line = inFile.readline()
49 seqArray.append(line.strip())
51 seq = string.join(seqArray, "")
54 print "Problems reading sequence from file"
56 print "writing to file %s" % chromOut
57 outFile = open("%s%s" % (cisRoot, chromOut), "w")
60 ceGenome.addChromosomeEntry(chromID, chromOut, "file")
63 def loadGeneEntries(db, gffFile):
64 ceGenome = Genome("celegans", dbFile=db)
65 geneFile = open(gffFile, "r")
71 field = line.split("\t")
72 if field[1] != "Coding_transcript" and field[1] != "miRNA":
75 if field[2] != "Transcript" and field[2] != "miRNA_primary_transcript":
78 gidrev = field[8].split('"')
79 giddots = gidrev[1].split(".")
80 # we are ignoring gene models!!!!
81 if giddots[1][-1] in string.letters:
82 gidGene = giddots[1][:-1]
83 gidLetter = giddots[1][-1]
87 gid = "%s.%s" % (giddots[0], gidGene)
88 geneID = ("celegans", gid)
92 gidVersion = ord(gidLetter.lower()) - 96
94 print "problem processing %s - skipping" % gidrev[1]
97 start = int(field[3]) - 1
98 stop = int(field[4]) - 1
100 chrom = field[0].strip()
106 geneEntries.append((geneID, chrom, start, stop, sense, "Transcript", gidVersion))
108 print "Adding %d gene entries" % len(geneEntries)
109 ceGenome.addGeneEntryBatch(geneEntries)
112 def loadFeatureEntries(db, gffFile):
113 ceGenome = Genome("celegans", dbFile=db)
114 featureFile = open(gffFile, "r")
117 featureTranslation = {"coding_exon": "CDS",
118 "three_prime_UTR": "3UTR",
119 "five_prime_UTR": "5UTR"
122 for line in featureFile:
126 field = line.split("\t")
127 if field[1] not in ["Coding_transcript", "miRNA", "tRNAscan-SE-1.23"]:
130 if field[1] == "Coding_transcript" and field[2].strip() not in featureTranslation:
133 if field[1] in ["miRNA", "tRNAscan-SE-1.23"]:
134 featureType = "CDS" # identifying these as CDS will force their masking later on
136 featureType = featureTranslation[field[2].strip()]
138 gidrev = field[8].split('"')
139 giddots = gidrev[1].split(".")
140 # we are ignoring gene models!!!!
141 if giddots[1][-1] in string.letters:
142 gidGene = giddots[1][:-1]
143 gidLetter = giddots[1][-1]
148 gid = "%s.%s" % (giddots[0], gidGene)
149 geneID = ("celegans", gid)
153 gidVersion = ord(gidLetter.lower()) - 96
155 print "problem processing %s - skipping" % gidrev[1]
158 start = int(field[3]) - 1
159 stop = int(field[4]) - 1
161 chrom = field[0].strip()
167 if geneID not in seenFeatures:
168 seenFeatures[geneID] = []
170 if (gidVersion, start, stop, featureType) not in seenFeatures[geneID]:
171 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, featureType))
172 seenFeatures[geneID].append((gidVersion, start, stop, featureType))
174 print "Adding %d feature entries" % len(featureEntries)
175 ceGenome.addFeatureEntryBatch(featureEntries)
178 def loadGeneAnnotations(db, geneIDPath):
180 geneIDFile = open(geneIDPath, "r")
181 lines = geneIDFile.readlines()
183 ceGenome = Genome("celegans", dbFile=db)
185 field = line.split(",")
187 gid = field[2].strip()
188 geneID = "%s\t%s" % (field[0], field[1])
189 geneAnnotations.append((("celegans", gid), geneID))
193 print "Adding %d annotations" % len(geneAnnotations)
194 ceGenome.addAnnotationBatch(geneAnnotations)
197 def loadGeneOntology(db, goPath, goDefPath, geneIDPath):
198 ceGenome = Genome("celegans", version="WS200", dbFile=db)
199 geneIDFile = open(geneIDPath, "r")
200 goDefFile = open(goDefPath, "r")
201 goFile = open(goPath, "r")
202 lines = geneIDFile.readlines()
207 field = line.split(",")
208 # ugly C elegans hack - map both fields to gid, since either might be
210 if len(field[2].strip()) > 1:
211 geneIDmap[field[1]] = field[2].strip()
213 goDefEntries = goDefFile.readlines()
215 for goDefEntry in goDefEntries:
216 if goDefEntry[0] != "!":
217 cols = goDefEntry.split("\t")
218 goDefs[cols[0]] = (cols[1], cols[2].strip())
220 goEntries = goFile.readlines()
222 for line in goEntries:
226 fields = line.split("\t")
228 if name in geneIDmap:
229 name = geneIDmap[name]
234 GOIDarray = fields[4].split(" ")
237 objName = fields[10].split("|")
241 name = "%s|%s" % (name.strip(), fields[10])
244 GOterm = string.replace(goDefs[GOID][0], "'", "p")
246 print "could no map %s - using GOID only" % GOID
250 if gID not in seenGO:
253 if GOID not in seenGO[gID]:
254 seenGO[gID].append(GOID)
255 goArray.append((("celegans", gID), GOID, objType, name, isNot, GOterm, evidence, fields[1]))
257 print "Adding %d GO entries" % len(goArray)
258 ceGenome.addGoInfoBatch(goArray)
261 def createDBFile(db):
262 ceGenome = Genome("celegans", version="WS200", dbFile=db)
263 ceGenome.createGeneDB(db)
266 def createDBindices(db):
267 ceGenome = Genome("celegans", version="WS200", dbFile=db)
268 ceGenome.createIndices()
271 def buildCelegansDB(db=geneDB, downloadRoot=""):
272 if downloadRoot == "":
273 downloadRoot = "%s/download/" % cisRoot
275 geneIDPath = "%sgeneIDs.WS200" % downloadRoot
276 goDefPath = "%sGO.terms_and_ids" % downloadRoot
277 goPath = "%sgene_association.wb" % downloadRoot
279 # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks
280 chromos = {"I": "%sCHROMOSOME_I_softmasked.dna" % downloadRoot,
281 "II": "%sCHROMOSOME_II_softmasked.dna" % downloadRoot,
282 "III": "%sCHROMOSOME_III_softmasked.dna" % downloadRoot,
283 "IV": "%sCHROMOSOME_IV_softmasked.dna" % downloadRoot,
284 "V": "%sCHROMOSOME_V_softmasked.dna" % downloadRoot,
285 "X": "%sCHROMOSOME_X_softmasked.dna" % downloadRoot
288 # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz
289 gffPath = "%selegansWS200.gff" % downloadRoot
291 print "Creating database %s" % db
294 print "Adding gene entries"
295 loadGeneEntries(db, gffPath)
297 print "Adding feature entries"
298 loadFeatureEntries(db, gffPath)
300 print "Adding gene annotations"
301 loadGeneAnnotations(db, geneIDPath)
303 print "Adding gene ontology"
304 loadGeneOntology(db, goPath, goDefPath, geneIDPath)
306 for chromID in chromos:
307 print "Loading chromosome %s" % chromID
308 loadChromosome(db, chromID, chromos[chromID], "/C_elegans/chr%s.bin" % chromID)
310 print "Creating Indices"
313 print "Finished creating database %s" % db