1 ###########################################################################
3 # C O P Y R I G H T N O T I C E #
4 # Copyright (c) 2003-13 by: #
5 # * California Institute of Technology #
7 # All Rights Reserved. #
9 # Permission is hereby granted, free of charge, to any person #
10 # obtaining a copy of this software and associated documentation files #
11 # (the "Software"), to deal in the Software without restriction, #
12 # including without limitation the rights to use, copy, modify, merge, #
13 # publish, distribute, sublicense, and/or sell copies of the Software, #
14 # and to permit persons to whom the Software is furnished to do so, #
15 # subject to the following conditions: #
17 # The above copyright notice and this permission notice shall be #
18 # included in all copies or substantial portions of the Software. #
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
22 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
24 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
25 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
26 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
28 ###########################################################################
30 # data for Arabidopsis thaliana
32 from cistematic.genomes import Genome
33 from os import environ
35 if environ.get("CISTEMATIC_ROOT"):
36 cisRoot = environ.get("CISTEMATIC_ROOT")
38 cisRoot = "/proj/genome"
40 geneDB = "%s/A_thaliana/athaliana.genedb" % cisRoot
42 chromSize = {"1": 30432563,
49 background = [0.3180185, 0.1819815, 0.1819815, 0.3180185]
50 genomeSize = 119186497
65 otherList = other.split(";")
67 for otherItem in otherList:
69 (name, value) = otherItem.split("=")
73 otherDict[name] = value
79 value = value.split(",")[0]
83 return (fType, gid, chrom, start, stop, sense, otherDict)
86 def buildArabidopsisDB(db=geneDB, downloadDir="%s/download" % cisRoot):
87 genePath = "%s/TAIR9_GFF3_genes_transposons.gff" % downloadDir
88 annotPath = "%s/TAIR9_functional_descriptions" % downloadDir
89 goPath = "%s/ATH_GO_GOSLIM.txt" % downloadDir
91 print "Creating database %s" % db
94 print "Adding gene entries"
95 loadGeneEntries(db, genePath)
97 print "Adding feature entries"
98 loadFeatureEntries(db, genePath)
100 print "Adding gene annotations"
101 loadGeneAnnotations(db, annotPath)
103 print "Adding gene ontology"
104 loadGeneOntology(db, goPath)
106 for chromID in ["1", "2", "3", "4", "5", "C", "M"]:
107 print "Loading chromosome %s" % chromID
108 chromPath = "%s/chr%s.fas" % (downloadDir, chromID)
109 loadChromosome(db, chromID, chromPath, "/A_thaliana/chr%s.bin" % chromID)
111 print "Creating Indices"
114 print "Finished creating database %s" % db
117 def createDBFile(db):
118 atGenome = Genome("athaliana", dbFile=db)
119 atGenome.createGeneDB(db)
122 def loadGeneEntries(db, gFile):
124 atGenome = Genome("athaliana", dbFile=db)
125 geneFile = open(gFile, "r")
126 for line in geneFile:
127 if line[0] == "#" or len(line) < 10:
130 fields = line.strip().split("\t")
131 if fields[2] != "gene":
134 (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
135 geneID = ("athaliana", gid)
137 geneEntries.append((geneID, chrom, start, stop, sense, "gene", version))
139 print "inserting %d gene entries" % len(geneEntries)
140 atGenome.addGeneEntryBatch(geneEntries)
143 def loadFeatureEntries(db, gFile):
146 atGenome = Genome("athaliana", dbFile=db)
147 featureTranslation = {"CDS": "CDS",
148 "three_prime_UTR": "3UTR",
149 "five_prime_UTR": "5UTR",
154 geneFile = open(gFile, "r")
155 for line in geneFile:
156 fields = line.split("\t")
157 (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
158 if fType in ["ncRNA"]:
159 (locus, rev) = gid.split(".")
160 if gid not in trackedGenes:
161 trackedGenes.append(locus)
163 geneFile = open(gFile, "r")
164 for line in geneFile:
165 if line[0] == "c" or len(line) < 10:
168 fields = line.split("\t")
169 (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
170 locusField = gid.split('.')
172 (locus, rev) = locusField
178 if fType not in featureTranslation:
181 elif fType == "exon" and locus not in trackedGenes:
184 geneID = ("athaliana", locus)
185 featureEntries.append((geneID, rev, chrom, start, stop, sense, featureTranslation[fType]))
187 print "inserted %d feature entries" % len(featureEntries)
188 atGenome.addFeatureEntryBatch(featureEntries)
191 def loadGeneAnnotations(db, annotPath):
193 annotFile = open(annotPath, "r")
195 lines = annotFile.readlines()
198 atGenome = Genome("athaliana", dbFile=db)
200 field = line.split("\t")
202 orfName = field[0].strip()
204 (locus, rev) = orfName.split(".")
207 description = field[2].strip()
208 geneAnnotations.append((("athaliana", orfName), string.replace(description, "'", "p")))
212 print "Adding %d annotations" % len(geneAnnotations)
213 atGenome.addAnnotationBatch(geneAnnotations)
216 def loadGeneOntology(db, goPath):
217 atGenome = Genome("athaliana", dbFile=db)
218 goFile = open(goPath, "r")
219 goEntries = goFile.readlines()
221 for line in goEntries:
222 fields = line.split("\t")
225 objType = string.replace(fields[3], "'", "p")
226 objName = string.replace(fields[2], "'", "p")
230 goArray.append((("athaliana", gID), GOID[3:], objType, objName, isNot, GOterm, evidence, fields[9]))
232 atGenome.addGoInfoBatch(goArray)
235 def loadChromosome(db, chromID, chromPath, chromOut):
237 atGenome = Genome("athaliana", dbFile=db)
239 inFile = open(chromPath, "r")
240 line = inFile.readline()
242 seqArray.append(line.strip())
244 seq = string.join(seqArray,"")
247 print "Problems reading sequence from file"
249 print "writing to file %s" % (chromOut)
250 outFile = open(cisRoot + chromOut, "w")
255 atGenome.addChromosomeEntry(chromID, chromOut, "file")
256 # Add alternative chromID - should be A-O and 01-09
257 atGenome.addChromosomeEntry("chromo%s" % chromID, chromOut, "file")
260 def createDBindices(db):
261 atGenome = Genome("athaliana", dbFile=db)
262 atGenome.createIndices()