erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / genomes / scerevisiae.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Saccharomyces cerevisiae
31 import string
32 from cistematic.genomes import Genome
33 from os import environ
34
35 if environ.get("CISTEMATIC_ROOT"):
36     cisRoot = environ.get("CISTEMATIC_ROOT") 
37 else:
38     cisRoot = "/proj/genome"
39
40 geneDB = "%s/S_cerevisiae/scerevisiae.genedb" % cisRoot
41
42
43 def loadChromosome(db, chromID, chromPath, chromOut):
44     seqArray = []
45     scGenome = Genome("scerevisiae", dbFile=db)
46     inFile = open(chromPath, "r")
47     line = inFile.readline()
48     for line in inFile:
49         seqArray.append(line.strip())
50
51     seq = string.join(seqArray, "")
52     seqLen = len(seq)
53     if seqLen < 1:
54         print "Problems reading sequence from file"
55
56     print "writing to file %s" % chromOut
57     outFile = open("%s%s" % (cisRoot, chromOut), "w")
58     outFile.write(seq)
59     outFile.close()
60     seq = ""
61     print "calling scGenome()"
62     scGenome.addChromosomeEntry(chromID, chromOut, "file")
63
64
65 def loadGeneEntries(db, gFile):
66     geneEntries = []
67     geneFeatures = []
68     scGenome = Genome("scerevisiae", dbFile=db)
69     geneFile = open(gFile, "r")
70     for line in geneFile:
71         field = line.split("\t")
72         if field[1] != "ORF":
73             continue
74
75         orfName = field[3].strip()
76         sense = field[11]
77         chrom = field[8].strip()
78         if sense == "W":
79             sense = "F"
80             try:
81                 start = int(field[9].strip()) - 1
82                 stop  = int(field[10].strip()) - 1
83             except:
84                 start = 0
85                 stop = 0
86         else:
87             sense = "R"
88             try:
89                 start = int(field[10].strip()) - 1
90                 stop  = int(field[9].strip()) - 1
91             except:
92                 start = 0
93                 stop = 0
94
95         geneID = ("scerevisiae", orfName)
96         gidVersion = 1
97         geneEntries.append((geneID, chrom, start, stop, sense, "chromosomal_feature", gidVersion))
98         geneFeatures.append((geneID, gidVersion, chrom, start, stop, sense, "CDS"))
99
100     print "loading %d gene entries" % len(geneEntries)
101     scGenome.addGeneEntryBatch(geneEntries)
102     print "loading %d gene features" % len(geneFeatures)
103     scGenome.addFeatureEntryBatch(geneFeatures)
104
105
106 def loadGeneAnnotations(db, annotPath):
107     geneAnnotations = []
108     annotFile = open(annotPath, "r")  
109     lines = annotFile.readlines()
110     annotFile.close()
111     scGenome = Genome("scerevisiae", dbFile=db)
112     for line in lines:
113         field = line.split("\t")
114         if field[1] != "ORF":
115             continue
116
117         try:
118             orfName = field[6].strip()
119             description =  field[15].strip()
120             geneAnnotations.append((("scerevisiae", orfName), string.replace(description, "'", "p")))
121         except:
122             pass
123
124     print "Adding %d annotations" % len(geneAnnotations)
125     scGenome.addAnnotationBatch(geneAnnotations)
126
127
128 def loadGeneOntology(db, goPath, goDefPath):
129     scGenome = Genome("scerevisiae", version="SGD1", dbFile=db)
130     goDefFile = open(goDefPath, "r")
131     goFile = open(goPath, "r")
132     goDefEntries = goDefFile.readlines()
133     goDefs = {}
134     for goDefEntry in goDefEntries:
135         if goDefEntry[0] != "!":
136             cols = goDefEntry.split("\t")
137             goDefs[cols[0]] = (cols[1], cols[2].strip())
138
139     goEntries = goFile.readlines()
140     goArray = []
141     for line in goEntries:
142         if line[0] == "!":
143             continue
144
145         fields = line.split("\t")
146         genes = fields[10].split("|")
147         gID = genes[0]
148         GOID = fields[4]
149         objType = fields[11]
150         objNameArray = fields[10].split("|")
151         objName = objNameArray[0]
152         isNot = fields[3]
153         try:
154             GOterm  = string.replace(goDefs[GOID][0], "'", "p")
155         except:
156             print "Could not translate %s" % (GOID)
157             GOterm = ""
158
159         evidence = fields[6]
160         goArray.append((("scerevisiae", gID), GOID[3:], objType, objName, isNot, GOterm, evidence, fields[1]))
161
162     scGenome.addGoInfoBatch(goArray)
163
164
165 def createDBFile(db):
166     scGenome = Genome("scerevisiae", version="SGD1",  dbFile=db)
167     scGenome.createGeneDB(db)
168
169
170 def createDBindices(db):
171     scGenome = Genome("scerevisiae", version="SGD1", dbFile=db)
172     scGenome.createIndices()
173
174
175 def buildScerevisiaeDB(db=geneDB):
176     genePath = "%s/download/SGD_features.tab" % cisRoot
177     goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
178     goPath = "%s/download/gene_association.sgd" % cisRoot
179     chromos = {"1": "%s/download/chr01.fsa" % cisRoot,
180                "2": "%s/download/chr02.fsa" % cisRoot,
181                "3": "%s/download/chr03.fsa" % cisRoot,
182                "4": "%s/download/chr04.fsa" % cisRoot,
183                "5": "%s/download/chr05.fsa" % cisRoot,
184                "6": "%s/download/chr06.fsa" % cisRoot,
185                "7": "%s/download/chr07.fsa" % cisRoot,
186                "8": "%s/download/chr08.fsa" % cisRoot,
187                "9": "%s/download/chr09.fsa" % cisRoot,
188                "10": "%s/download/chr10.fsa" % cisRoot,
189                "11": "%s/download/chr11.fsa" % cisRoot,
190                "12": "%s/download/chr12.fsa" % cisRoot,
191                "13": "%s/download/chr13.fsa" % cisRoot,
192                "14": "%s/download/chr14.fsa" % cisRoot,
193                "15": "%s/download/chr15.fsa" % cisRoot,
194                "16": "%s/download/chr16.fsa" % cisRoot
195     }
196
197     print "Creating database %s" % db
198     createDBFile(db)
199
200     print "Adding gene entries"
201     loadGeneEntries(db, genePath)
202
203     print "Adding gene annotations"
204     loadGeneAnnotations(db, genePath)
205
206     print "Adding gene ontology"
207     loadGeneOntology(db, goPath, goDefPath)
208
209     for chromID in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"]:
210         print "Loading chromosome %s" % chromID
211         loadChromosome(db, chromID, chromos[chromID], "/S_cerevisiae/chr%s.bin" % chromID)
212
213     print "Creating Indices"
214     createDBindices(db)
215
216     print "Finished creating database %s" % db