1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-13 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Caenorhaditis elegans
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/C_elegans/celegans.genedb" % cisRoot
43 def buildCelegansDB(db=geneDB, downloadRoot=""):
44 if downloadRoot == "":
45 downloadRoot = "%s/download/" % cisRoot
47 geneIDPath = "%sgeneIDs.WS200" % downloadRoot
48 goDefPath = "%sGO.terms_and_ids" % downloadRoot
49 goPath = "%sgene_association.wb" % downloadRoot
51 # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz
52 gffPath = "%selegansWS200.gff" % downloadRoot
54 print "Creating database %s" % db
57 print "Adding gene entries"
58 loadGeneEntries(db, gffPath)
60 print "Adding feature entries"
61 loadFeatureEntries(db, gffPath)
63 print "Adding gene annotations"
64 loadGeneAnnotations(db, geneIDPath)
66 print "Adding gene ontology"
67 loadGeneOntology(db, goPath, goDefPath, geneIDPath)
69 # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks
70 for chromID in ["I", "II", "III", "IV", "V", "X"]:
71 print "Loading chromosome %s" % chromID
72 chromPath = "%sCHROMOSOME_%s_softmasked.dna" % (downloadRoot, chromID)
73 loadChromosome(db, chromID, chromPath, "/C_elegans/chr%s.bin" % chromID)
75 print "Creating Indices"
78 print "Finished creating database %s" % db
82 ceGenome = Genome("celegans", version="WS200", dbFile=db)
83 ceGenome.createGeneDB(db)
86 def loadGeneEntries(db, gffFile):
87 ceGenome = Genome("celegans", dbFile=db)
88 geneFile = open(gffFile, "r")
94 field = line.split("\t")
95 if field[1] != "Coding_transcript" and field[1] != "miRNA":
98 if field[2] != "Transcript" and field[2] != "miRNA_primary_transcript":
101 gidrev = field[8].split('"')
102 giddots = gidrev[1].split(".")
103 # we are ignoring gene models!!!!
104 if giddots[1][-1] in string.letters:
105 gidGene = giddots[1][:-1]
106 gidLetter = giddots[1][-1]
111 gid = "%s.%s" % (giddots[0], gidGene)
112 geneID = ("celegans", gid)
116 gidVersion = ord(gidLetter.lower()) - 96
118 print "problem processing %s - skipping" % gidrev[1]
121 start = int(field[3]) - 1
122 stop = int(field[4]) - 1
124 chrom = field[0].strip()
130 geneEntries.append((geneID, chrom, start, stop, sense, "Transcript", gidVersion))
132 print "Adding %d gene entries" % len(geneEntries)
133 ceGenome.addGeneEntryBatch(geneEntries)
136 def loadFeatureEntries(db, gffFile):
137 ceGenome = Genome("celegans", dbFile=db)
138 featureFile = open(gffFile, "r")
141 featureTranslation = {"coding_exon": "CDS",
142 "three_prime_UTR": "3UTR",
143 "five_prime_UTR": "5UTR"
146 for line in featureFile:
150 field = line.split("\t")
151 if field[1] not in ["Coding_transcript", "miRNA", "tRNAscan-SE-1.23"]:
154 if field[1] == "Coding_transcript" and field[2].strip() not in featureTranslation:
157 if field[1] in ["miRNA", "tRNAscan-SE-1.23"]:
158 featureType = "CDS" # identifying these as CDS will force their masking later on
160 featureType = featureTranslation[field[2].strip()]
162 gidrev = field[8].split('"')
163 giddots = gidrev[1].split(".")
164 # we are ignoring gene models!!!!
165 if giddots[1][-1] in string.letters:
166 gidGene = giddots[1][:-1]
167 gidLetter = giddots[1][-1]
172 gid = "%s.%s" % (giddots[0], gidGene)
173 geneID = ("celegans", gid)
177 gidVersion = ord(gidLetter.lower()) - 96
179 print "problem processing %s - skipping" % gidrev[1]
182 start = int(field[3]) - 1
183 stop = int(field[4]) - 1
185 chrom = field[0].strip()
191 if geneID not in seenFeatures:
192 seenFeatures[geneID] = []
194 if (gidVersion, start, stop, featureType) not in seenFeatures[geneID]:
195 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, featureType))
196 seenFeatures[geneID].append((gidVersion, start, stop, featureType))
198 print "Adding %d feature entries" % len(featureEntries)
199 ceGenome.addFeatureEntryBatch(featureEntries)
202 def loadGeneAnnotations(db, geneIDPath):
204 geneIDFile = open(geneIDPath, "r")
205 lines = geneIDFile.readlines()
207 ceGenome = Genome("celegans", dbFile=db)
209 field = line.split(",")
211 gid = field[2].strip()
212 geneID = "%s\t%s" % (field[0], field[1])
213 geneAnnotations.append((("celegans", gid), geneID))
217 print "Adding %d annotations" % len(geneAnnotations)
218 ceGenome.addAnnotationBatch(geneAnnotations)
221 def loadGeneOntology(db, goPath, goDefPath, geneIDPath):
222 ceGenome = Genome("celegans", version="WS200", dbFile=db)
223 geneIDFile = open(geneIDPath, "r")
224 goDefFile = open(goDefPath, "r")
225 goFile = open(goPath, "r")
226 lines = geneIDFile.readlines()
231 field = line.split(",")
232 # ugly C elegans hack - map both fields to gid, since either might be
234 if len(field[2].strip()) > 1:
235 geneIDmap[field[1]] = field[2].strip()
237 goDefEntries = goDefFile.readlines()
239 for goDefEntry in goDefEntries:
240 if goDefEntry[0] != "!":
241 cols = goDefEntry.split("\t")
242 goDefs[cols[0]] = (cols[1], cols[2].strip())
244 goEntries = goFile.readlines()
246 for line in goEntries:
250 fields = line.split("\t")
252 if name in geneIDmap:
253 name = geneIDmap[name]
258 GOIDarray = fields[4].split(" ")
261 objName = fields[10].split("|")
265 name = "%s|%s" % (name.strip(), fields[10])
268 GOterm = string.replace(goDefs[GOID][0], "'", "p")
270 print "could no map %s - using GOID only" % GOID
274 if gID not in seenGO:
277 if GOID not in seenGO[gID]:
278 seenGO[gID].append(GOID)
279 goArray.append((("celegans", gID), GOID, objType, name, isNot, GOterm, evidence, fields[1]))
281 print "Adding %d GO entries" % len(goArray)
282 ceGenome.addGoInfoBatch(goArray)
285 def loadChromosome(db, chromID, chromPath, chromOut):
287 ceGenome = Genome("celegans", dbFile=db)
288 inFile = open(chromPath, "r")
289 line = inFile.readline()
291 seqArray.append(line.strip())
293 seq = string.join(seqArray, "")
296 print "Problems reading sequence from file"
298 print "writing to file %s" % chromOut
299 outFile = open("%s%s" % (cisRoot, chromOut), "w")
302 ceGenome.addChromosomeEntry(chromID, chromOut, "file")
305 def createDBindices(db):
306 ceGenome = Genome("celegans", version="WS200", dbFile=db)
307 ceGenome.createIndices()