erange 4.0a dev release with integrated cistematic
[erange.git] / cistematic / genomes / rnorvegicus.py
1 ###########################################################################
2 #                                                                         #
3 # C O P Y R I G H T   N O T I C E                                         #
4 #  Copyright (c) 2003-10 by:                                              #
5 #    * California Institute of Technology                                 #
6 #                                                                         #
7 #    All Rights Reserved.                                                 #
8 #                                                                         #
9 # Permission is hereby granted, free of charge, to any person             #
10 # obtaining a copy of this software and associated documentation files    #
11 # (the "Software"), to deal in the Software without restriction,          #
12 # including without limitation the rights to use, copy, modify, merge,    #
13 # publish, distribute, sublicense, and/or sell copies of the Software,    #
14 # and to permit persons to whom the Software is furnished to do so,       #
15 # subject to the following conditions:                                    #
16 #                                                                         #
17 # The above copyright notice and this permission notice shall be          #
18 # included in all copies or substantial portions of the Software.         #
19 #                                                                         #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
27 # SOFTWARE.                                                               #
28 ###########################################################################
29 #
30 # data for Ratus Norvegicus
31 import string
32 from cistematic.genomes import Genome
33 from cistematic.core.geneinfo import geneinfoDB
34 from os import environ
35
36 if environ.get("CISTEMATIC_ROOT"):
37     cisRoot = environ.get("CISTEMATIC_ROOT") 
38 else:
39     cisRoot = "/proj/genome"
40
41 geneDB  = "%s/R_norvegicus/rnorvegicus.genedb" % cisRoot
42
43
44 def loadChromosome(db, chromID, chromPath, chromOut):
45     seqArray = []
46     rnGenome = Genome("rnorvegicus", dbFile=db)
47     inFile = open(chromPath, "r")
48     line = inFile.readline()
49     for line in inFile:
50         seqArray.append(line.strip())
51
52     seq = string.join(seqArray, "")
53     seqLen = len(seq)
54     if seqLen < 1:
55         print "Problems reading sequence from file"
56
57     print "writing to file %s" % chromOut
58     outFile = open("%s%s" % (cisRoot, chromOut), "w")
59     outFile.write(seq)
60     outFile.close()
61     rnGenome.addChromosomeEntry(chromID, chromOut, "file")
62
63
64 def loadGeneEntries(db, gFile):
65     """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
66     """
67     geneEntries = []
68     rnGenome = Genome("rnorvegicus", dbFile=db)
69     geneFile = open(gFile, "r")
70     for line in geneFile:
71         cols = line.split("\t")
72         if cols[11] != "GENE":
73             continue
74
75         if cols[12] == "Celera":
76             continue
77
78         name = cols[10].split(":")
79         gid = name[1]
80         start = int(cols[2]) - 1
81         stop = int(cols[3]) - 1
82         sense = cols[4]
83         chrom = cols[1].strip()
84         if sense == "+":
85             sense = "F"
86         else:
87             sense = "R"
88
89         geneID = ("rnorvegicus", gid)
90         gidVersion = 1
91         geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
92
93     print "Adding %d gene entries" % len(geneEntries)
94     rnGenome.addGeneEntryBatch(geneEntries)
95
96
97 def loadGeneFeatures(db, gFile):
98     """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
99     """
100     featureEntries = []
101     rnGenome = Genome("rnorvegicus", dbFile=db)
102     featureFile = open(gFile, "r")
103     for line in featureFile:
104         cols = line.split("\t")
105         if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
106             continue
107
108         if cols[12] == "Celera":
109             continue
110
111         fType = cols[11]
112         name = cols[10].split(":")
113         gid = name[1]
114         start = int(cols[2]) - 1
115         stop = int(cols[3]) - 1
116         sense = cols[4]
117         chrom = cols[1].strip()
118         if sense == "+":
119             sense = "F"
120         else:
121             sense = "R"
122
123         geneID = ("rnorvegicus", gid)
124         gidVersion = 1
125         featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
126
127     print "Adding %d feature entries" % len(featureEntries)
128     rnGenome.addFeatureEntryBatch(featureEntries)
129
130
131 def loadGeneAnnotations(db):
132     geneAnnotations = []
133     idb = geneinfoDB()  
134     rnGenome = Genome("rnorvegicus", dbFile=db)
135     gidList = rnGenome.allGIDs()
136     for locID in gidList:
137         gID = ("rnorvegicus", locID)
138         geneDescArray = idb.getDescription(gID)
139         geneDesc = ""
140         for entry in geneDescArray:
141             geneDesc += ","
142             geneDesc += entry.strip()
143
144         if len(geneDescArray) > 0:
145             geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
146
147     print "Adding %d annotations" % len(geneAnnotations)
148     rnGenome.addAnnotationBatch(geneAnnotations)
149
150
151 def loadGeneOntology(db, goPath, goDefPath):
152     rnGenome = Genome("rnorvegicus", dbFile=db)
153     goDefFile = open(goDefPath, "r")
154     goFile = open(goPath, "r")
155     idb = geneinfoDB()
156     goDefs = {}
157     goArray = []
158     for goDefEntry in goDefFile:
159         if goDefEntry[0] != "!":
160             cols = goDefEntry.split("\t")
161             goDefs[cols[0]] = (cols[1], cols[2].strip())
162
163     goEntries = goFile.readlines()
164     prevGID = ''
165     for entry in goEntries:
166         try:
167             fields = entry.split("\t")
168             if fields[0] != "10090":
169                 continue
170
171             locID = fields[1].strip()
172             gID = ("rnorvegicus", locID)
173             if prevGID != gID:
174                 prevGID = gID
175                 gene_name = ""
176                 synonyms = idb.geneIDSynonyms(gID)
177                 if len(synonyms) >0:
178                     for entry in synonyms:
179                         gene_name += ","    
180                         gene_name += entry
181                 else:
182                     gene_name = " "
183
184             goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
185         except:
186             print "locus ID %s could not be added" % locID
187             pass
188
189     print "adding %d go entries" % len(goArray)
190     rnGenome.addGoInfoBatch(goArray)
191
192
193 def createDBFile(db):
194     rnGenome = Genome("rnorvegicus",  dbFile=db)
195     rnGenome.createGeneDB(db)
196
197
198 def createDBindices(db):
199     rnGenome = Genome("rnorvegicus", dbFile=db)
200     rnGenome.createIndices()
201
202
203 def buildRatDB(db=geneDB, downloadDir="%s/download" % cisRoot):
204     genePath = "%s/seq_gene.md" % downloadDir
205     goDefPath = "%s/GO.terms_and_ids" % downloadDir
206     goPath = "%s/gene2go" % downloadDir
207     # ignoring all random chromosomes
208     chromos = {"1": "%s/chr1.fa" % downloadDir,
209                "2": "%s/chr2.fa" % downloadDir,
210                "3": "%s/chr3.fa" % downloadDir,
211                "4": "%s/chr4.fa" % downloadDir,
212                "5": "%s/chr5.fa" % downloadDir,
213                "6": "%s/chr6.fa" % downloadDir,
214                "7": "%s/chr7.fa" % downloadDir,
215                "8": "%s/chr8.fa" % downloadDir,
216                "9": "%s/chr9.fa" % downloadDir,
217                "10": "%s/chr10.fa" % downloadDir,
218                "11": "%s/chr11.fa" % downloadDir,
219                "12": "%s/chr12.fa" % downloadDir,
220                "13": "%s/chr13.fa" % downloadDir,
221                "14": "%s/chr14.fa" % downloadDir,
222                "15": "%s/chr15.fa" % downloadDir,
223                "16": "%s/chr16.fa" % downloadDir,
224                "17": "%s/chr17.fa" % downloadDir,
225                "18": "%s/chr18.fa" % downloadDir,
226                "19": "%s/chr19.fa" % downloadDir,
227                "Un": "%s/chrUn.fa" % downloadDir,
228                "X": "%s/chrX.fa" % downloadDir,
229                "20": "%s/chr20.fa" % downloadDir,
230                "M": "%s/chrM.fa" % downloadDir
231     }
232
233     print "Creating database %s" % db
234     createDBFile(db)
235
236     print "Adding gene entries"
237     loadGeneEntries(db, genePath)
238
239     print "Adding gene features"
240     loadGeneFeatures(db, genePath)
241
242     print "Adding gene annotations"
243     loadGeneAnnotations(db)
244
245     print "Adding gene ontology"
246     loadGeneOntology(db, goPath, goDefPath)
247
248     for chromID in chromos.keys():
249         print "Loading chromosome %s" % chromID
250         loadChromosome(db, chromID, chromos[chromID], "/R_norvegicus/chromo%s.bin" % chromID)
251
252     print "Creating Indices"
253     createDBindices(db)
254
255     print "Finished creating database %s" % db