1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Ratus Norvegicus
32 from cistematic.genomes import Genome
33 from cistematic.core.geneinfo import geneinfoDB
34 from os import environ
36 if environ.get("CISTEMATIC_ROOT"):
37 cisRoot = environ.get("CISTEMATIC_ROOT")
39 cisRoot = "/proj/genome"
41 geneDB = "%s/R_norvegicus/rnorvegicus.genedb" % cisRoot
44 def loadChromosome(db, chromID, chromPath, chromOut):
46 rnGenome = Genome("rnorvegicus", dbFile=db)
47 inFile = open(chromPath, "r")
48 line = inFile.readline()
50 seqArray.append(line.strip())
52 seq = string.join(seqArray, "")
55 print "Problems reading sequence from file"
57 print "writing to file %s" % chromOut
58 outFile = open("%s%s" % (cisRoot, chromOut), "w")
61 rnGenome.addChromosomeEntry(chromID, chromOut, "file")
64 def loadGeneEntries(db, gFile):
65 """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
68 rnGenome = Genome("rnorvegicus", dbFile=db)
69 geneFile = open(gFile, "r")
71 cols = line.split("\t")
72 if cols[11] != "GENE":
75 if cols[12] == "Celera":
78 name = cols[10].split(":")
80 start = int(cols[2]) - 1
81 stop = int(cols[3]) - 1
83 chrom = cols[1].strip()
89 geneID = ("rnorvegicus", gid)
91 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
93 print "Adding %d gene entries" % len(geneEntries)
94 rnGenome.addGeneEntryBatch(geneEntries)
97 def loadGeneFeatures(db, gFile):
98 """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
101 rnGenome = Genome("rnorvegicus", dbFile=db)
102 featureFile = open(gFile, "r")
103 for line in featureFile:
104 cols = line.split("\t")
105 if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
108 if cols[12] == "Celera":
112 name = cols[10].split(":")
114 start = int(cols[2]) - 1
115 stop = int(cols[3]) - 1
117 chrom = cols[1].strip()
123 geneID = ("rnorvegicus", gid)
125 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
127 print "Adding %d feature entries" % len(featureEntries)
128 rnGenome.addFeatureEntryBatch(featureEntries)
131 def loadGeneAnnotations(db):
134 rnGenome = Genome("rnorvegicus", dbFile=db)
135 gidList = rnGenome.allGIDs()
136 for locID in gidList:
137 gID = ("rnorvegicus", locID)
138 geneDescArray = idb.getDescription(gID)
140 for entry in geneDescArray:
142 geneDesc += entry.strip()
144 if len(geneDescArray) > 0:
145 geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
147 print "Adding %d annotations" % len(geneAnnotations)
148 rnGenome.addAnnotationBatch(geneAnnotations)
151 def loadGeneOntology(db, goPath, goDefPath):
152 rnGenome = Genome("rnorvegicus", dbFile=db)
153 goDefFile = open(goDefPath, "r")
154 goFile = open(goPath, "r")
158 for goDefEntry in goDefFile:
159 if goDefEntry[0] != "!":
160 cols = goDefEntry.split("\t")
161 goDefs[cols[0]] = (cols[1], cols[2].strip())
163 goEntries = goFile.readlines()
165 for entry in goEntries:
167 fields = entry.split("\t")
168 if fields[0] != "10090":
171 locID = fields[1].strip()
172 gID = ("rnorvegicus", locID)
176 synonyms = idb.geneIDSynonyms(gID)
178 for entry in synonyms:
184 goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
186 print "locus ID %s could not be added" % locID
189 print "adding %d go entries" % len(goArray)
190 rnGenome.addGoInfoBatch(goArray)
193 def createDBFile(db):
194 rnGenome = Genome("rnorvegicus", dbFile=db)
195 rnGenome.createGeneDB(db)
198 def createDBindices(db):
199 rnGenome = Genome("rnorvegicus", dbFile=db)
200 rnGenome.createIndices()
203 def buildRatDB(db=geneDB, downloadDir="%s/download" % cisRoot):
204 genePath = "%s/seq_gene.md" % downloadDir
205 goDefPath = "%s/GO.terms_and_ids" % downloadDir
206 goPath = "%s/gene2go" % downloadDir
207 # ignoring all random chromosomes
208 chromos = {"1": "%s/chr1.fa" % downloadDir,
209 "2": "%s/chr2.fa" % downloadDir,
210 "3": "%s/chr3.fa" % downloadDir,
211 "4": "%s/chr4.fa" % downloadDir,
212 "5": "%s/chr5.fa" % downloadDir,
213 "6": "%s/chr6.fa" % downloadDir,
214 "7": "%s/chr7.fa" % downloadDir,
215 "8": "%s/chr8.fa" % downloadDir,
216 "9": "%s/chr9.fa" % downloadDir,
217 "10": "%s/chr10.fa" % downloadDir,
218 "11": "%s/chr11.fa" % downloadDir,
219 "12": "%s/chr12.fa" % downloadDir,
220 "13": "%s/chr13.fa" % downloadDir,
221 "14": "%s/chr14.fa" % downloadDir,
222 "15": "%s/chr15.fa" % downloadDir,
223 "16": "%s/chr16.fa" % downloadDir,
224 "17": "%s/chr17.fa" % downloadDir,
225 "18": "%s/chr18.fa" % downloadDir,
226 "19": "%s/chr19.fa" % downloadDir,
227 "Un": "%s/chrUn.fa" % downloadDir,
228 "X": "%s/chrX.fa" % downloadDir,
229 "20": "%s/chr20.fa" % downloadDir,
230 "M": "%s/chrM.fa" % downloadDir
233 print "Creating database %s" % db
236 print "Adding gene entries"
237 loadGeneEntries(db, genePath)
239 print "Adding gene features"
240 loadGeneFeatures(db, genePath)
242 print "Adding gene annotations"
243 loadGeneAnnotations(db)
245 print "Adding gene ontology"
246 loadGeneOntology(db, goPath, goDefPath)
248 for chromID in chromos.keys():
249 print "Loading chromosome %s" % chromID
250 loadChromosome(db, chromID, chromos[chromID], "/R_norvegicus/chromo%s.bin" % chromID)
252 print "Creating Indices"
255 print "Finished creating database %s" % db