erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / genomes / celegans.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Caenorhaditis elegans
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT") 
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/C_elegans/celegans.genedb" % cisRoot
41
42
43 def loadChromosome(db, chromID, chromPath, chromOut):
44     seqArray = []
45     ceGenome = Genome("celegans", dbFile=db)
46     inFile = open(chromPath, "r")
47     line = inFile.readline()
48     for line in inFile:
49         seqArray.append(line.strip())
50
51     seq = string.join(seqArray, "")
52     seqLen = len(seq)
53     if seqLen < 1:
54         print "Problems reading sequence from file"
55
56     print "writing to file %s" % chromOut
57     outFile = open("%s%s" % (cisRoot, chromOut), "w")
58     outFile.write(seq)
59     outFile.close()
60     ceGenome.addChromosomeEntry(chromID, chromOut, "file")
61
62
63 def loadGeneEntries(db, gffFile):
64     ceGenome = Genome("celegans", dbFile=db)
65     geneFile = open(gffFile, "r")
66     geneEntries = []
67     for line in geneFile:
68         if line[0] == "#":
69             continue
70
71         field = line.split("\t")
72         if field[1] != "Coding_transcript" and field[1] != "miRNA":
73             continue
74
75         if field[2] != "Transcript" and field[2] != "miRNA_primary_transcript":
76             continue
77
78         gidrev = field[8].split('"')
79         giddots = gidrev[1].split(".")
80         # we are ignoring gene models!!!!
81         if giddots[1][-1] in string.letters:
82             gidGene = giddots[1][:-1]
83             gidLetter = giddots[1][-1]
84         else:
85             gidGene = giddots[1]
86             gidLetter = "a"
87         gid = "%s.%s" % (giddots[0], gidGene)
88         geneID = ("celegans", gid)
89         gidVersion = 1
90         if gidLetter != "a":
91             try:
92                 gidVersion = ord(gidLetter.lower()) - 96
93             except:
94                 print "problem processing %s - skipping" % gidrev[1]
95                 continue
96
97         start = int(field[3]) - 1
98         stop = int(field[4]) - 1
99         sense = field[6]
100         chrom = field[0].strip()
101         if sense == "+":
102             sense = "F"
103         else:
104             sense = "R"
105
106         geneEntries.append((geneID, chrom, start, stop, sense, "Transcript", gidVersion))
107
108     print "Adding %d gene entries" % len(geneEntries)
109     ceGenome.addGeneEntryBatch(geneEntries)
110
111
112 def loadFeatureEntries(db, gffFile):
113     ceGenome = Genome("celegans", dbFile=db)
114     featureFile = open(gffFile, "r")
115     featureEntries = []
116     seenFeatures = {}
117     featureTranslation = {"coding_exon": "CDS",
118                           "three_prime_UTR": "3UTR",
119                           "five_prime_UTR": "5UTR"
120     }
121
122     for line in featureFile:
123         if line[0] == "#":
124             continue
125
126         field = line.split("\t")
127         if field[1] not in ["Coding_transcript", "miRNA", "tRNAscan-SE-1.23"]:
128             continue
129
130         if field[1] == "Coding_transcript" and field[2].strip() not in featureTranslation:
131             continue
132
133         if field[1] in ["miRNA", "tRNAscan-SE-1.23"]:
134             featureType = "CDS" # identifying these as CDS will force their masking later on
135         else:
136             featureType = featureTranslation[field[2].strip()]
137
138         gidrev = field[8].split('"')
139         giddots = gidrev[1].split(".")
140         # we are ignoring gene models!!!!
141         if giddots[1][-1] in string.letters:
142             gidGene = giddots[1][:-1]
143             gidLetter = giddots[1][-1]
144         else:
145             gidGene = giddots[1]
146             gidLetter = "a"
147
148         gid = "%s.%s" % (giddots[0], gidGene)
149         geneID = ("celegans", gid)
150         gidVersion = 1
151         if gidLetter != "a":
152             try:
153                 gidVersion = ord(gidLetter.lower()) - 96
154             except:
155                 print "problem processing %s - skipping" % gidrev[1]
156                 continue
157
158         start = int(field[3]) - 1
159         stop = int(field[4]) - 1
160         sense = field[6]
161         chrom = field[0].strip()
162         if sense == "+":
163             sense = "F"
164         else:
165             sense = "R"
166
167         if geneID not in seenFeatures:
168             seenFeatures[geneID] = []
169
170         if (gidVersion, start, stop, featureType) not in seenFeatures[geneID]:
171             featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, featureType))
172             seenFeatures[geneID].append((gidVersion, start, stop, featureType))
173
174     print "Adding %d feature entries" % len(featureEntries)
175     ceGenome.addFeatureEntryBatch(featureEntries)
176
177
178 def loadGeneAnnotations(db, geneIDPath):
179     geneAnnotations = []
180     geneIDFile = open(geneIDPath, "r")  
181     lines = geneIDFile.readlines()
182     geneIDFile.close()
183     ceGenome = Genome("celegans", dbFile=db)
184     for line in lines:
185         field = line.split(",")
186         try:
187             gid = field[2].strip()
188             geneID = "%s\t%s" % (field[0], field[1])
189             geneAnnotations.append((("celegans", gid), geneID))
190         except:
191             pass
192
193     print "Adding %d annotations" % len(geneAnnotations)
194     ceGenome.addAnnotationBatch(geneAnnotations)
195
196
197 def loadGeneOntology(db, goPath, goDefPath, geneIDPath):
198     ceGenome = Genome("celegans", version="WS200", dbFile=db)
199     geneIDFile = open(geneIDPath, "r")
200     goDefFile = open(goDefPath, "r")
201     goFile = open(goPath, "r")
202     lines = geneIDFile.readlines()
203     geneIDFile.close()
204     geneIDmap = {}
205     seenGO = {}
206     for line in lines:
207         field = line.split(",")
208         # ugly C elegans hack - map both fields to gid, since either might be
209         # used by GO !
210         if len(field[2].strip()) > 1:
211             geneIDmap[field[1]] =  field[2].strip()
212
213     goDefEntries = goDefFile.readlines()
214     goDefs = {}
215     for goDefEntry in goDefEntries:
216         if goDefEntry[0] != "!":
217             cols = goDefEntry.split("\t")
218             goDefs[cols[0]] = (cols[1], cols[2].strip())
219
220     goEntries = goFile.readlines()
221     goArray = []
222     for line in goEntries:
223         if line[0] == "!":
224             continue
225
226         fields = line.split("\t")
227         name = fields[2]
228         if name in geneIDmap:
229             name = geneIDmap[name]
230
231         if name[-1] == "a":
232             name = name[:-1]
233
234         GOIDarray = fields[4].split(" ")
235         GOID = GOIDarray[0]
236         objType = fields[8]
237         objName = fields[10].split("|")
238         gID = name
239         isNot = fields[3]
240         if len(objName) > 1:
241             name = "%s|%s" % (name.strip(), fields[10])
242
243         try:
244             GOterm  = string.replace(goDefs[GOID][0], "'", "p")
245         except:
246             print "could no map %s - using GOID only" % GOID
247             GOterm = ""
248
249         evidence = fields[9]
250         if gID not in seenGO:
251             seenGO[gID] = []
252
253         if GOID not in seenGO[gID]:
254             seenGO[gID].append(GOID)
255             goArray.append((("celegans", gID), GOID, objType, name, isNot, GOterm, evidence, fields[1]))
256
257     print "Adding %d GO entries" % len(goArray)
258     ceGenome.addGoInfoBatch(goArray)
259
260
261 def createDBFile(db):
262     ceGenome = Genome("celegans", version="WS200",  dbFile=db)
263     ceGenome.createGeneDB(db)
264
265
266 def createDBindices(db):
267     ceGenome = Genome("celegans", version="WS200", dbFile=db)
268     ceGenome.createIndices()
269
270
271 def buildCelegansDB(db=geneDB, downloadRoot=""):
272     if downloadRoot == "":
273         downloadRoot = "%s/download/" % cisRoot
274
275     geneIDPath = "%sgeneIDs.WS200" % downloadRoot
276     goDefPath = "%sGO.terms_and_ids" % downloadRoot
277     goPath = "%sgene_association.wb" % downloadRoot
278
279     # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks
280     chromos = {"I": "%sCHROMOSOME_I_softmasked.dna" % downloadRoot,
281                "II": "%sCHROMOSOME_II_softmasked.dna" % downloadRoot,
282                "III": "%sCHROMOSOME_III_softmasked.dna" % downloadRoot,
283                "IV": "%sCHROMOSOME_IV_softmasked.dna" % downloadRoot,
284                "V": "%sCHROMOSOME_V_softmasked.dna" % downloadRoot,
285                "X": "%sCHROMOSOME_X_softmasked.dna" % downloadRoot
286     }
287
288     # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz
289     gffPath = "%selegansWS200.gff" % downloadRoot
290
291     print "Creating database %s" % db
292     createDBFile(db)
293
294     print "Adding gene entries" 
295     loadGeneEntries(db, gffPath)
296
297     print "Adding feature entries" 
298     loadFeatureEntries(db, gffPath)
299
300     print "Adding gene annotations"
301     loadGeneAnnotations(db, geneIDPath)
302
303     print "Adding gene ontology"
304     loadGeneOntology(db, goPath, goDefPath, geneIDPath)
305
306     for chromID in chromos:
307         print "Loading chromosome %s" % chromID
308         loadChromosome(db, chromID, chromos[chromID], "/C_elegans/chr%s.bin" % chromID)
309
310     print "Creating Indices"
311     createDBindices(db)
312
313     print "Finished creating database %s" % db