erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / genomes / dmelanogaster.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Drosophila melanogaster
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT") 
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/D_melanogaster/dmelanogaster.genedb" % cisRoot
41
42 version = {"A": "1",
43            "B": "2",
44            "C": "3",
45            "D": "4",
46            "E": "5",
47            "F": "6",
48            "G": "7",
49            "H": "8",
50            "I": "9",
51            "J": "10",
52            "K": "11",
53            "L": "12",
54            "M": "13",
55            "N": "14",
56            "O": "15",
57            "P": "16",
58            "Q": "17",
59            "R": "18",
60            "S": "19",
61            "T": "20",
62            "U": "21",
63            "V": "22",
64            "W": "23",
65            "X": "24",
66            "Y": "25",
67            "Z": "26"
68 }
69
70 def loadChromosome(db, chromID, chromPath, chromOut):
71     seqArray = []
72     dmGenome = Genome("dmelanogaster", dbFile=db)
73     inFile = open(chromPath, "r")
74     line = inFile.readline()
75     for line in inFile:
76         seqArray.append(line.strip())
77
78     seq = string.join(seqArray, "")
79     seqLen = len(seq)
80     if seqLen < 1:
81         print "Problems reading sequence from file"
82
83     print "writing to file %s" % chromOut
84     outFile = open("%s%s" % (cisRoot, chromOut), "w")
85     outFile.write(seq)
86     outFile.close()
87     dmGenome.addChromosomeEntry(chromID, chromOut, "file")
88
89
90 def loadGeneEntries(db, gFile):
91     geneEntries = []
92     dmGenome = Genome("dmelanogaster", dbFile=db)
93     geneFile = open(gFile, "r")
94
95     for line in geneFile:
96         cols = line.split("\t")
97         name = cols[1].split("-R")
98         gid = name[0]
99         start = int(cols[4])
100         stop = int(cols[5])
101         sense = cols[3]
102         chrom = cols[2][3:]
103         if sense == "-":
104             sense = "R"
105         else:
106             sense = "F"
107
108         geneID = ("dmelanogaster", gid)
109         try:
110             gidVersion = version[name[1]]
111         except:
112             continue
113
114         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
115
116     print "Adding %d gene entries" % len(geneEntries)
117     dmGenome.addGeneEntryBatch(geneEntries)
118
119
120 def loadGeneFeatures(db, gfile):
121     geneFile = open(gfile, "r")
122     senseArray = {"+": "F",
123                   "-": "R",
124                   ".": "F"
125     }
126
127     insertArray = []
128     for geneLine in geneFile:
129         geneFields = geneLine.split("\t")
130         exonNum = int(geneFields[8])
131         exonStarts = geneFields[9].split(",")
132         exonStops = geneFields[10].split(",")
133         chrom = geneFields[2][3:]
134         sense = senseArray[geneFields[3]]
135         gstop = int(geneFields[7]) - 1
136         gstart = int(geneFields[6]) - 1
137         name = geneFields[1].split("-R")
138         geneID = ("dmelanogaster", name[0])
139         try:
140             gidVersion = version[name[1]]
141         except:
142             continue
143
144         for index in range(exonNum):
145             estart = int(exonStarts[index]) - 1
146             estop = int(exonStops[index]) - 1
147             if estart >= gstart and estop <= gstop:
148                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, "CDS"))
149             elif estop <= gstart:
150                 if sense == "F":
151                     fType = "5UTR"
152                 else:
153                     fType = "3UTR"
154
155                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType))
156             elif estart >= gstop:
157                 if sense == "F":
158                     fType = "3UTR"
159                 else:
160                     fType = "5UTR"
161
162                 insertArray.append((geneID, gidVersion, chrom, estart, estop, sense, fType))
163             elif estart <= gstop and estart > gstart:
164                 if sense == "F":
165                     fType = "3UTR"
166                 else:
167                     fType = "5UTR"
168
169                 insertArray.append((geneID, gidVersion, chrom, estart, gstop, sense, "CDS"))
170                 insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop, sense, fType))
171             elif estart < gstart and estop <= gstop:
172                 if sense == "F":
173                     fType = "5UTR"
174                 else:
175                     fType = "3UTR"
176
177                 insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType))
178                 insertArray.append((geneID, gidVersion, chrom, gstart, estop, sense, "CDS"))
179             else:
180                 if sense == "F":
181                     fType1 = "5UTR"
182                     fType2 = "3UTR"
183                 else:
184                     fType1 = "3UTR"
185                     fType2 = "5UTR"
186
187                 insertArray.append((geneID, gidVersion, chrom, estart, gstart - 1, sense, fType1))
188                 insertArray.append((geneID, gidVersion, chrom, gstart, gstop, sense, "CDS"))
189                 insertArray.append((geneID, gidVersion, chrom, gstop + 1, estop - 1, sense, fType2))
190
191     geneFile.close()
192     dmGenome = Genome("dmelanogaster", dbFile=db)
193     print "Adding %d features" % len(insertArray)
194     dmGenome.addFeatureEntryBatch(insertArray)
195
196
197 def loadGeneAnnotations(db, annotPath):
198     geneAnnotations = []
199     annotFile = open(annotPath, "r")  
200     dmGenome = Genome("dmelanogaster", dbFile=db)
201     for line in annotFile:
202         try:
203             cols = line.split("\t")
204             if cols[0] != "7227":
205                 continue
206
207             locID = cols[3]
208             if "Dmel_" in locID:
209                 locID = locID[5:]
210
211             geneDesc = cols[4]
212             if len(locID) > 0:
213                 geneAnnotations.append((("dmelanogaster", locID), string.replace(geneDesc.strip(), "'", "p")))
214         except:
215             pass
216
217     print "Adding %d annotations" % len(geneAnnotations)
218     dmGenome.addAnnotationBatch(geneAnnotations)
219
220
221 def loadGeneOntology(db, goPath, goDefPath, annotPath):
222     dmGenome = Genome("dmelanogaster", dbFile=db)
223     goDefFile = open(goDefPath, "r")
224     goFile = open(goPath, "r")
225     annotFile = open(annotPath, "r")  
226     annotEntries = annotFile.readlines()
227     annotFile.close()
228     goDefEntries = goDefFile.readlines()
229     goDefs = {}
230     locus = {}
231     goArray = []
232     for goDefEntry in goDefEntries:
233         if goDefEntry[0] != "!":
234             cols = goDefEntry.split("\t")
235             goDefs[cols[0]] = (cols[1], cols[2].strip())
236
237     goEntries = goFile.readlines()
238     for annotEntry in annotEntries:
239         try:
240             cols = annotEntry.split("\t")
241             if cols[0] != "7227":
242                 continue
243
244             locID = cols[3].strip()
245             geneName = cols[1].strip()
246             if len(locID) > 0:
247                 locus[geneName] = locID
248         except:
249             pass
250
251     for entry in goEntries:
252         if entry[0] == "!":
253             continue
254
255         if entry[:4] != "7227":
256             continue
257
258         try:
259             fields = entry.split("\t")
260             geneName = fields[1].strip()
261             locID = locus[geneName]
262             if "Dmel_" in locID:
263                 locID = locID[5:]
264
265             GOID = fields[2]
266             goArray.append((("dmelanogaster", locID), GOID, "", geneName, "", string.replace(goDefs[GOID][0], "'", "p"), goDefs[GOID][1], ""))
267         except:
268             pass
269
270     print "adding %d go entries" % len(goArray)
271     dmGenome.addGoInfoBatch(goArray)
272
273
274 def createDBFile(db):
275     dmGenome = Genome("dmelanogaster", dbFile=db)
276     dmGenome.createGeneDB(db)
277
278
279 def createDBindices(db):
280     dmGenome = Genome("dmelanogaster", dbFile=db)
281     dmGenome.createIndices()
282
283
284 def buildDmelanogasterDB(db=geneDB):
285     """ genes and annotations are from UCSC. GO association file is from geneontology.org. 
286     """
287     genePath = "%s/download/flyBaseGene.txt" % cisRoot
288     annotPath = "%s/download/gene_info" % cisRoot
289     goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
290     goPath = "%s/download/gene2go" % cisRoot
291     chromos = {"2L": "%s/download/chr2L.fa" % cisRoot,
292                "2R": "%s/download/chr2R.fa" % cisRoot,
293                "2LHet": "%s/download/chr2LHet.fa" % cisRoot,
294                "2RHet": "%s/download/chr2RHet.fa" % cisRoot,
295                "3L": "%s/download/chr3L.fa" % cisRoot,
296                "3LHet": "%s/download/chr3LHet.fa" % cisRoot,
297                "3R": "%s/download/chr3R.fa" % cisRoot,
298                "3RHet": "%s/download/chr3RHet.fa" % cisRoot,
299                "4": "%s/download/chr4.fa" % cisRoot,
300                "X": "%s/download/chrX.fa" % cisRoot,
301                "XHet": "%s/download/chrXHet.fa" % cisRoot,
302                "YHet": "%s/download/chrYHet.fa" % cisRoot,
303                "U": "%s/download/chrU.fa" % cisRoot,
304                "Uextra": "%s/download/chrUextra.fa" % cisRoot,
305                "M": "%s/download/chrM.fa" % cisRoot
306     }
307
308     print "Creating database %s" % db
309     createDBFile(db)
310
311     print "Adding gene entries"
312     loadGeneEntries(db, genePath)
313
314     print "Adding gene features"
315     loadGeneFeatures(db, genePath)
316
317     print "Adding gene annotations"
318     loadGeneAnnotations(db, annotPath)
319
320     print "Adding gene ontology"
321     loadGeneOntology(db, goPath, goDefPath, annotPath)
322
323     for chromID in chromos.keys():
324         print "Loading chromosome %s" % chromID
325         loadChromosome(db, chromID, chromos[chromID], "/D_melanogaster/chromo%s.bin" % chromID)
326
327     print "Creating Indices"
328     createDBindices(db)
329
330     print "Finished creating database %s" % db