first pass cleanup of cistematic/genomes; change bamPreprocessing
[erange.git] / cistematic / genomes / celegans.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-13 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Caenorhaditis elegans
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT")
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/C_elegans/celegans.genedb" % cisRoot
41
42
43 def buildCelegansDB(db=geneDB, downloadRoot=""):
44     if downloadRoot == "":
45         downloadRoot = "%s/download/" % cisRoot
46
47     geneIDPath = "%sgeneIDs.WS200" % downloadRoot
48     goDefPath = "%sGO.terms_and_ids" % downloadRoot
49     goPath = "%sgene_association.wb" % downloadRoot
50
51     # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz
52     gffPath = "%selegansWS200.gff" % downloadRoot
53
54     print "Creating database %s" % db
55     createDBFile(db)
56
57     print "Adding gene entries"
58     loadGeneEntries(db, gffPath)
59
60     print "Adding feature entries"
61     loadFeatureEntries(db, gffPath)
62
63     print "Adding gene annotations"
64     loadGeneAnnotations(db, geneIDPath)
65
66     print "Adding gene ontology"
67     loadGeneOntology(db, goPath, goDefPath, geneIDPath)
68
69     # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks
70     for chromID in ["I", "II", "III", "IV", "V", "X"]:
71         print "Loading chromosome %s" % chromID
72         chromPath = "%sCHROMOSOME_%s_softmasked.dna" % (downloadRoot, chromID)
73         loadChromosome(db, chromID, chromPath, "/C_elegans/chr%s.bin" % chromID)
74
75     print "Creating Indices"
76     createDBindices(db)
77
78     print "Finished creating database %s" % db
79
80
81 def createDBFile(db):
82     ceGenome = Genome("celegans", version="WS200",  dbFile=db)
83     ceGenome.createGeneDB(db)
84
85
86 def loadGeneEntries(db, gffFile):
87     ceGenome = Genome("celegans", dbFile=db)
88     geneFile = open(gffFile, "r")
89     geneEntries = []
90     for line in geneFile:
91         if line[0] == "#":
92             continue
93
94         field = line.split("\t")
95         if field[1] != "Coding_transcript" and field[1] != "miRNA":
96             continue
97
98         if field[2] != "Transcript" and field[2] != "miRNA_primary_transcript":
99             continue
100
101         gidrev = field[8].split('"')
102         giddots = gidrev[1].split(".")
103         # we are ignoring gene models!!!!
104         if giddots[1][-1] in string.letters:
105             gidGene = giddots[1][:-1]
106             gidLetter = giddots[1][-1]
107         else:
108             gidGene = giddots[1]
109             gidLetter = "a"
110
111         gid = "%s.%s" % (giddots[0], gidGene)
112         geneID = ("celegans", gid)
113         gidVersion = 1
114         if gidLetter != "a":
115             try:
116                 gidVersion = ord(gidLetter.lower()) - 96
117             except:
118                 print "problem processing %s - skipping" % gidrev[1]
119                 continue
120
121         start = int(field[3]) - 1
122         stop = int(field[4]) - 1
123         sense = field[6]
124         chrom = field[0].strip()
125         if sense == "+":
126             sense = "F"
127         else:
128             sense = "R"
129
130         geneEntries.append((geneID, chrom, start, stop, sense, "Transcript", gidVersion))
131
132     print "Adding %d gene entries" % len(geneEntries)
133     ceGenome.addGeneEntryBatch(geneEntries)
134
135
136 def loadFeatureEntries(db, gffFile):
137     ceGenome = Genome("celegans", dbFile=db)
138     featureFile = open(gffFile, "r")
139     featureEntries = []
140     seenFeatures = {}
141     featureTranslation = {"coding_exon": "CDS",
142                           "three_prime_UTR": "3UTR",
143                           "five_prime_UTR": "5UTR"
144     }
145
146     for line in featureFile:
147         if line[0] == "#":
148             continue
149
150         field = line.split("\t")
151         if field[1] not in ["Coding_transcript", "miRNA", "tRNAscan-SE-1.23"]:
152             continue
153
154         if field[1] == "Coding_transcript" and field[2].strip() not in featureTranslation:
155             continue
156
157         if field[1] in ["miRNA", "tRNAscan-SE-1.23"]:
158             featureType = "CDS" # identifying these as CDS will force their masking later on
159         else:
160             featureType = featureTranslation[field[2].strip()]
161
162         gidrev = field[8].split('"')
163         giddots = gidrev[1].split(".")
164         # we are ignoring gene models!!!!
165         if giddots[1][-1] in string.letters:
166             gidGene = giddots[1][:-1]
167             gidLetter = giddots[1][-1]
168         else:
169             gidGene = giddots[1]
170             gidLetter = "a"
171
172         gid = "%s.%s" % (giddots[0], gidGene)
173         geneID = ("celegans", gid)
174         gidVersion = 1
175         if gidLetter != "a":
176             try:
177                 gidVersion = ord(gidLetter.lower()) - 96
178             except:
179                 print "problem processing %s - skipping" % gidrev[1]
180                 continue
181
182         start = int(field[3]) - 1
183         stop = int(field[4]) - 1
184         sense = field[6]
185         chrom = field[0].strip()
186         if sense == "+":
187             sense = "F"
188         else:
189             sense = "R"
190
191         if geneID not in seenFeatures:
192             seenFeatures[geneID] = []
193
194         if (gidVersion, start, stop, featureType) not in seenFeatures[geneID]:
195             featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, featureType))
196             seenFeatures[geneID].append((gidVersion, start, stop, featureType))
197
198     print "Adding %d feature entries" % len(featureEntries)
199     ceGenome.addFeatureEntryBatch(featureEntries)
200
201
202 def loadGeneAnnotations(db, geneIDPath):
203     geneAnnotations = []
204     geneIDFile = open(geneIDPath, "r")
205     lines = geneIDFile.readlines()
206     geneIDFile.close()
207     ceGenome = Genome("celegans", dbFile=db)
208     for line in lines:
209         field = line.split(",")
210         try:
211             gid = field[2].strip()
212             geneID = "%s\t%s" % (field[0], field[1])
213             geneAnnotations.append((("celegans", gid), geneID))
214         except:
215             pass
216
217     print "Adding %d annotations" % len(geneAnnotations)
218     ceGenome.addAnnotationBatch(geneAnnotations)
219
220
221 def loadGeneOntology(db, goPath, goDefPath, geneIDPath):
222     ceGenome = Genome("celegans", version="WS200", dbFile=db)
223     geneIDFile = open(geneIDPath, "r")
224     goDefFile = open(goDefPath, "r")
225     goFile = open(goPath, "r")
226     lines = geneIDFile.readlines()
227     geneIDFile.close()
228     geneIDmap = {}
229     seenGO = {}
230     for line in lines:
231         field = line.split(",")
232         # ugly C elegans hack - map both fields to gid, since either might be
233         # used by GO !
234         if len(field[2].strip()) > 1:
235             geneIDmap[field[1]] =  field[2].strip()
236
237     goDefEntries = goDefFile.readlines()
238     goDefs = {}
239     for goDefEntry in goDefEntries:
240         if goDefEntry[0] != "!":
241             cols = goDefEntry.split("\t")
242             goDefs[cols[0]] = (cols[1], cols[2].strip())
243
244     goEntries = goFile.readlines()
245     goArray = []
246     for line in goEntries:
247         if line[0] == "!":
248             continue
249
250         fields = line.split("\t")
251         name = fields[2]
252         if name in geneIDmap:
253             name = geneIDmap[name]
254
255         if name[-1] == "a":
256             name = name[:-1]
257
258         GOIDarray = fields[4].split(" ")
259         GOID = GOIDarray[0]
260         objType = fields[8]
261         objName = fields[10].split("|")
262         gID = name
263         isNot = fields[3]
264         if len(objName) > 1:
265             name = "%s|%s" % (name.strip(), fields[10])
266
267         try:
268             GOterm  = string.replace(goDefs[GOID][0], "'", "p")
269         except:
270             print "could no map %s - using GOID only" % GOID
271             GOterm = ""
272
273         evidence = fields[9]
274         if gID not in seenGO:
275             seenGO[gID] = []
276
277         if GOID not in seenGO[gID]:
278             seenGO[gID].append(GOID)
279             goArray.append((("celegans", gID), GOID, objType, name, isNot, GOterm, evidence, fields[1]))
280
281     print "Adding %d GO entries" % len(goArray)
282     ceGenome.addGoInfoBatch(goArray)
283
284
285 def loadChromosome(db, chromID, chromPath, chromOut):
286     seqArray = []
287     ceGenome = Genome("celegans", dbFile=db)
288     inFile = open(chromPath, "r")
289     line = inFile.readline()
290     for line in inFile:
291         seqArray.append(line.strip())
292
293     seq = string.join(seqArray, "")
294     seqLen = len(seq)
295     if seqLen < 1:
296         print "Problems reading sequence from file"
297
298     print "writing to file %s" % chromOut
299     outFile = open("%s%s" % (cisRoot, chromOut), "w")
300     outFile.write(seq)
301     outFile.close()
302     ceGenome.addChromosomeEntry(chromID, chromOut, "file")
303
304
305 def createDBindices(db):
306     ceGenome = Genome("celegans", version="WS200", dbFile=db)
307     ceGenome.createIndices()