1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-10 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Arabidopsis thaliana
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/A_thaliana/athaliana.genedb" % cisRoot
42 chromSize = {"1": 30432563,
49 background = [0.3180185, 0.1819815, 0.1819815, 0.3180185]
50 genomeSize = 119186497
65 otherList = other.split(";")
67 for otherItem in otherList:
69 (name, value) = otherItem.split("=")
73 otherDict[name] = value
79 value = value.split(",")[0]
83 return (fType, gid, chrom, start, stop, sense, otherDict)
86 def loadChromosome(db, chromID, chromPath, chromOut):
88 atGenome = Genome("athaliana", dbFile=db)
90 inFile = open(chromPath, "r")
91 line = inFile.readline()
93 seqArray.append(line.strip())
95 seq = string.join(seqArray,"")
98 print "Problems reading sequence from file"
100 print "writing to file %s" % (chromOut)
101 outFile = open(cisRoot + chromOut, "w")
106 atGenome.addChromosomeEntry(chromID, chromOut, "file")
107 # Add alternative chromID - should be A-O and 01-09
108 atGenome.addChromosomeEntry("chromo%s" % chromID, chromOut, "file")
111 def loadGeneEntries(db, gFile):
113 atGenome = Genome("athaliana", dbFile=db)
114 geneFile = open(gFile, "r")
115 for line in geneFile:
116 if line[0] == "#" or len(line) < 10:
119 fields = line.strip().split("\t")
120 if fields[2] != "gene":
123 (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
124 geneID = ("athaliana", gid)
126 geneEntries.append((geneID, chrom, start, stop, sense, "gene", version))
128 print "inserting %d gene entries" % len(geneEntries)
129 atGenome.addGeneEntryBatch(geneEntries)
132 def loadFeatureEntries(db, gFile):
135 atGenome = Genome("athaliana", dbFile=db)
136 featureTranslation = {"CDS": "CDS",
137 "three_prime_UTR": "3UTR",
138 "five_prime_UTR": "5UTR",
142 geneFile = open(gFile, "r")
143 for line in geneFile:
144 fields = line.split("\t")
145 (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
146 if fType in ["ncRNA"]:
147 (locus, rev) = gid.split(".")
148 if gid not in trackedGenes:
149 trackedGenes.append(locus)
151 geneFile = open(gFile, "r")
152 for line in geneFile:
153 if line[0] == "c" or len(line) < 10:
156 fields = line.split("\t")
157 (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
158 locusField = gid.split('.')
160 (locus, rev) = locusField
166 if fType not in featureTranslation:
169 elif fType == "exon" and locus not in trackedGenes:
172 geneID = ("athaliana", locus)
173 featureEntries.append((geneID, rev, chrom, start, stop, sense, featureTranslation[fType]))
175 print "inserted %d feature entries" % len(featureEntries)
176 atGenome.addFeatureEntryBatch(featureEntries)
179 def loadGeneAnnotations(db, annotPath):
181 annotFile = open(annotPath, "r")
183 lines = annotFile.readlines()
186 atGenome = Genome("athaliana", dbFile=db)
188 field = line.split("\t")
190 orfName = field[0].strip()
192 (locus, rev) = orfName.split(".")
195 description = field[2].strip()
196 geneAnnotations.append((("athaliana", orfName), string.replace(description, "'", "p")))
200 print "Adding %d annotations" % len(geneAnnotations)
201 atGenome.addAnnotationBatch(geneAnnotations)
204 def loadGeneOntology(db, goPath):
205 atGenome = Genome("athaliana", dbFile=db)
206 goFile = open(goPath, "r")
207 goEntries = goFile.readlines()
209 for line in goEntries:
210 fields = line.split("\t")
213 objType = string.replace(fields[3], "'", "p")
214 objName = string.replace(fields[2], "'", "p")
218 goArray.append((("athaliana", gID), GOID[3:], objType, objName, isNot, GOterm, evidence, fields[9]))
220 atGenome.addGoInfoBatch(goArray)
223 def createDBFile(db):
224 atGenome = Genome("athaliana", dbFile=db)
225 atGenome.createGeneDB(db)
228 def createDBindices(db):
229 atGenome = Genome("athaliana", dbFile=db)
230 atGenome.createIndices()
233 def buildArabidopsisDB(db=geneDB, downloadDir="%s/download" % cisRoot):
234 genePath = "%s/TAIR9_GFF3_genes_transposons.gff" % downloadDir
235 annotPath = "%s/TAIR9_functional_descriptions" % downloadDir
236 goPath = "%s/ATH_GO_GOSLIM.txt" % downloadDir
238 chromos = {"1": "%s/chr1.fas" % downloadDir,
239 "2": "%s/chr2.fas" % downloadDir,
240 "3": "%s/chr3.fas" % downloadDir,
241 "4": "%s/chr4.fas" % downloadDir,
242 "5": "%s/chr5.fas" % downloadDir,
243 "C": "%s/chrC.fas" % downloadDir,
244 "M": "%s/chrM.fas" % downloadDir
247 print "Creating database %s" % db
250 print "Adding gene entries"
251 loadGeneEntries(db, genePath)
253 print "Adding feature entries"
254 loadFeatureEntries(db, genePath)
256 print "Adding gene annotations"
257 loadGeneAnnotations(db, annotPath)
259 print "Adding gene ontology"
260 loadGeneOntology(db, goPath)
262 for chromID in chromos.keys():
263 print "Loading chromosome %s" % chromID
264 loadChromosome(db, chromID, chromos[chromID], "/A_thaliana/chr%s.bin" % chromID)
266 print "Creating Indices"
269 print "Finished creating database %s" % db