1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-13 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Ratus Norvegicus
32 from cistematic.genomes import Genome
33 from cistematic.core.geneinfo import geneinfoDB
34 from os import environ
36 if environ.get("CISTEMATIC_ROOT"):
37 cisRoot = environ.get("CISTEMATIC_ROOT")
39 cisRoot = "/proj/genome"
41 geneDB = "%s/R_norvegicus/rnorvegicus.genedb" % cisRoot
44 def buildRatDB(db=geneDB, downloadDir="%s/download" % cisRoot):
45 genePath = "%s/seq_gene.md" % downloadDir
46 goDefPath = "%s/GO.terms_and_ids" % downloadDir
47 goPath = "%s/gene2go" % downloadDir
49 print "Creating database %s" % db
52 print "Adding gene entries"
53 loadGeneEntries(db, genePath)
55 print "Adding gene features"
56 loadGeneFeatures(db, genePath)
58 print "Adding gene annotations"
59 loadGeneAnnotations(db)
61 print "Adding gene ontology"
62 loadGeneOntology(db, goPath, goDefPath)
64 # ignoring all random chromosomes
65 chromList = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
66 "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
69 for chromID in chromList:
70 print "Loading chromosome %s" % chromID
71 chromPath = "%s/chr%s.fa" % (downloadDir, chromID)
72 loadChromosome(db, chromID, chromPath, "/R_norvegicus/chromo%s.bin" % chromID)
74 print "Creating Indices"
77 print "Finished creating database %s" % db
81 rnGenome = Genome("rnorvegicus", dbFile=db)
82 rnGenome.createGeneDB(db)
85 def loadGeneEntries(db, gFile):
86 #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES
89 rnGenome = Genome("rnorvegicus", dbFile=db)
90 geneFile = open(gFile, "r")
92 cols = line.split("\t")
93 if cols[11] != "GENE":
96 if cols[12] == "Celera":
99 name = cols[10].split(":")
101 start = int(cols[2]) - 1
102 stop = int(cols[3]) - 1
104 chrom = cols[1].strip()
110 geneID = ("rnorvegicus", gid)
112 geneEntries.append((geneID, chrom, start, stop, sense, "gene", gidVersion))
114 print "Adding %d gene entries" % len(geneEntries)
115 rnGenome.addGeneEntryBatch(geneEntries)
118 def loadGeneFeatures(db, gFile):
119 """ Load gene features such as CDS, UTR, and PSEUDO from the gene file.
122 rnGenome = Genome("rnorvegicus", dbFile=db)
123 featureFile = open(gFile, "r")
124 for line in featureFile:
125 cols = line.split("\t")
126 if cols[11] not in ["CDS", "UTR", "PSEUDO"]:
129 if cols[12] == "Celera":
133 name = cols[10].split(":")
135 start = int(cols[2]) - 1
136 stop = int(cols[3]) - 1
138 chrom = cols[1].strip()
144 geneID = ("rnorvegicus", gid)
146 featureEntries.append((geneID, gidVersion, chrom, start, stop, sense, fType))
148 print "Adding %d feature entries" % len(featureEntries)
149 rnGenome.addFeatureEntryBatch(featureEntries)
152 def loadGeneAnnotations(db):
155 rnGenome = Genome("rnorvegicus", dbFile=db)
156 gidList = rnGenome.allGIDs()
157 for locID in gidList:
158 gID = ("rnorvegicus", locID)
159 geneDescArray = idb.getDescription(gID)
161 for entry in geneDescArray:
163 geneDesc += entry.strip()
165 if len(geneDescArray) > 0:
166 geneAnnotations.append((gID, string.replace(geneDesc[1:], "'", "p")))
168 print "Adding %d annotations" % len(geneAnnotations)
169 rnGenome.addAnnotationBatch(geneAnnotations)
172 def loadGeneOntology(db, goPath, goDefPath):
173 rnGenome = Genome("rnorvegicus", dbFile=db)
174 goDefFile = open(goDefPath, "r")
175 goFile = open(goPath, "r")
179 for goDefEntry in goDefFile:
180 if goDefEntry[0] != "!":
181 cols = goDefEntry.split("\t")
182 goDefs[cols[0]] = (cols[1], cols[2].strip())
184 goEntries = goFile.readlines()
186 for entry in goEntries:
188 fields = entry.split("\t")
189 if fields[0] != "10090":
192 locID = fields[1].strip()
193 gID = ("rnorvegicus", locID)
197 synonyms = idb.geneIDSynonyms(gID)
199 for entry in synonyms:
205 goArray.append((gID, fields[2], "", gene_name[1:], "", string.replace(goDefs[fields[2]][0], "'", "p"), goDefs[fields[2]][1], ""))
207 print "locus ID %s could not be added" % locID
210 print "adding %d go entries" % len(goArray)
211 rnGenome.addGoInfoBatch(goArray)
214 def loadChromosome(db, chromID, chromPath, chromOut):
216 rnGenome = Genome("rnorvegicus", dbFile=db)
217 inFile = open(chromPath, "r")
218 line = inFile.readline()
220 seqArray.append(line.strip())
222 seq = string.join(seqArray, "")
225 print "Problems reading sequence from file"
227 print "writing to file %s" % chromOut
228 outFile = open("%s%s" % (cisRoot, chromOut), "w")
231 rnGenome.addChromosomeEntry(chromID, chromOut, "file")
234 def createDBindices(db):
235 rnGenome = Genome("rnorvegicus", dbFile=db)
236 rnGenome.createIndices()