--- /dev/null
+###########################################################################
+# #
+# C O P Y R I G H T N O T I C E #
+# Copyright (c) 2003-10 by: #
+# * California Institute of Technology #
+# #
+# All Rights Reserved. #
+# #
+# Permission is hereby granted, free of charge, to any person #
+# obtaining a copy of this software and associated documentation files #
+# (the "Software"), to deal in the Software without restriction, #
+# including without limitation the rights to use, copy, modify, merge, #
+# publish, distribute, sublicense, and/or sell copies of the Software, #
+# and to permit persons to whom the Software is furnished to do so, #
+# subject to the following conditions: #
+# #
+# The above copyright notice and this permission notice shall be #
+# included in all copies or substantial portions of the Software. #
+# #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
+# SOFTWARE. #
+###########################################################################
+#
+# data for Arabidopsis thaliana
+import string
+from cistematic.genomes import Genome
+from os import environ
+
+if environ.get("CISTEMATIC_ROOT"):
+ cisRoot = environ.get("CISTEMATIC_ROOT")
+else:
+ cisRoot = "/proj/genome"
+
+geneDB = "%s/A_thaliana/athaliana.genedb" % cisRoot
+
+chromSize = {"1": 30432563,
+ "2": 19705359,
+ "3": 23470805,
+ "4": 18585042,
+ "5": 26992738
+}
+
+background = [0.3180185, 0.1819815, 0.1819815, 0.3180185]
+genomeSize = 119186497
+
+
+def decodeGFF3(cols):
+ fType = cols[2]
+ chrom = cols[0][3:]
+ start = int(cols[3])
+ stop = int(cols[4])
+ sense = cols[6]
+ if sense == "+":
+ sense = "F"
+ else:
+ sense = "R"
+
+ other = cols[-1]
+ otherList = other.split(";")
+ otherDict = {}
+ for otherItem in otherList:
+ try:
+ (name, value) = otherItem.split("=")
+ except:
+ continue
+
+ otherDict[name] = value
+ if name == "Name":
+ gid = value.strip()
+
+ if name == "Parent":
+ if "," in value:
+ value = value.split(",")[0]
+
+ gid = value.strip()
+
+ return (fType, gid, chrom, start, stop, sense, otherDict)
+
+
+def loadChromosome(db, chromID, chromPath, chromOut):
+ seqArray = []
+ atGenome = Genome("athaliana", dbFile=db)
+
+ inFile = open(chromPath, "r")
+ line = inFile.readline()
+ for line in inFile:
+ seqArray.append(line.strip())
+
+ seq = string.join(seqArray,"")
+ seqLen = len(seq)
+ if seqLen < 1:
+ print "Problems reading sequence from file"
+
+ print "writing to file %s" % (chromOut)
+ outFile = open(cisRoot + chromOut, "w")
+ outFile.write(seq)
+ outFile.close()
+ seq = ""
+
+ atGenome.addChromosomeEntry(chromID, chromOut, "file")
+ # Add alternative chromID - should be A-O and 01-09
+ atGenome.addChromosomeEntry("chromo%s" % chromID, chromOut, "file")
+
+
+def loadGeneEntries(db, gFile):
+ geneEntries = []
+ atGenome = Genome("athaliana", dbFile=db)
+ geneFile = open(gFile, "r")
+ for line in geneFile:
+ if line[0] == "#" or len(line) < 10:
+ continue
+
+ fields = line.strip().split("\t")
+ if fields[2] != "gene":
+ continue
+
+ (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
+ geneID = ("athaliana", gid)
+ version = 1
+ geneEntries.append((geneID, chrom, start, stop, sense, "gene", version))
+
+ print "inserting %d gene entries" % len(geneEntries)
+ atGenome.addGeneEntryBatch(geneEntries)
+
+
+def loadFeatureEntries(db, gFile):
+ featureEntries = []
+ trackedGenes = []
+ atGenome = Genome("athaliana", dbFile=db)
+ featureTranslation = {"CDS": "CDS",
+ "three_prime_UTR": "3UTR",
+ "five_prime_UTR": "5UTR",
+ "miRNA": "5UTR",
+ "exon": "5UTR"
+ }
+ geneFile = open(gFile, "r")
+ for line in geneFile:
+ fields = line.split("\t")
+ (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
+ if fType in ["ncRNA"]:
+ (locus, rev) = gid.split(".")
+ if gid not in trackedGenes:
+ trackedGenes.append(locus)
+
+ geneFile = open(gFile, "r")
+ for line in geneFile:
+ if line[0] == "c" or len(line) < 10:
+ continue
+
+ fields = line.split("\t")
+ (fType, gid, chrom, start, stop, sense, otherDict) = decodeGFF3(fields)
+ locusField = gid.split('.')
+ try:
+ (locus, rev) = locusField
+ rev = rev.strip()
+ except:
+ locus = gid
+ rev = 1
+
+ if fType not in featureTranslation:
+ continue
+
+ elif fType == "exon" and locus not in trackedGenes:
+ continue
+
+ geneID = ("athaliana", locus)
+ featureEntries.append((geneID, rev, chrom, start, stop, sense, featureTranslation[fType]))
+
+ print "inserted %d feature entries" % len(featureEntries)
+ atGenome.addFeatureEntryBatch(featureEntries)
+
+
+def loadGeneAnnotations(db, annotPath):
+ geneAnnotations = []
+ annotFile = open(annotPath, "r")
+ annotFile.readline()
+ lines = annotFile.readlines()
+ annotFile.close()
+
+ atGenome = Genome("athaliana", dbFile=db)
+ for line in lines:
+ field = line.split("\t")
+ try:
+ orfName = field[0].strip()
+ if "." in orfName:
+ (locus, rev) = orfName.split(".")
+ orfName = locus
+
+ description = field[2].strip()
+ geneAnnotations.append((("athaliana", orfName), string.replace(description, "'", "p")))
+ except:
+ pass
+
+ print "Adding %d annotations" % len(geneAnnotations)
+ atGenome.addAnnotationBatch(geneAnnotations)
+
+
+def loadGeneOntology(db, goPath):
+ atGenome = Genome("athaliana", dbFile=db)
+ goFile = open(goPath, "r")
+ goEntries = goFile.readlines()
+ goArray = []
+ for line in goEntries:
+ fields = line.split("\t")
+ gID = fields[0]
+ GOID = fields[4]
+ objType = string.replace(fields[3], "'", "p")
+ objName = string.replace(fields[2], "'", "p")
+ isNot = ""
+ GOterm = fields[7]
+ evidence = fields[8]
+ goArray.append((("athaliana", gID), GOID[3:], objType, objName, isNot, GOterm, evidence, fields[9]))
+
+ atGenome.addGoInfoBatch(goArray)
+
+
+def createDBFile(db):
+ atGenome = Genome("athaliana", dbFile=db)
+ atGenome.createGeneDB(db)
+
+
+def createDBindices(db):
+ atGenome = Genome("athaliana", dbFile=db)
+ atGenome.createIndices()
+
+
+def buildArabidopsisDB(db=geneDB, downloadDir="%s/download" % cisRoot):
+ genePath = "%s/TAIR9_GFF3_genes_transposons.gff" % downloadDir
+ annotPath = "%s/TAIR9_functional_descriptions" % downloadDir
+ goPath = "%s/ATH_GO_GOSLIM.txt" % downloadDir
+
+ chromos = {"1": "%s/chr1.fas" % downloadDir,
+ "2": "%s/chr2.fas" % downloadDir,
+ "3": "%s/chr3.fas" % downloadDir,
+ "4": "%s/chr4.fas" % downloadDir,
+ "5": "%s/chr5.fas" % downloadDir,
+ "C": "%s/chrC.fas" % downloadDir,
+ "M": "%s/chrM.fas" % downloadDir
+ }
+
+ print "Creating database %s" % db
+ createDBFile(db)
+
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath)
+
+ print "Adding feature entries"
+ loadFeatureEntries(db, genePath)
+
+ print "Adding gene annotations"
+ loadGeneAnnotations(db, annotPath)
+
+ print "Adding gene ontology"
+ loadGeneOntology(db, goPath)
+
+ for chromID in chromos.keys():
+ print "Loading chromosome %s" % chromID
+ loadChromosome(db, chromID, chromos[chromID], "/A_thaliana/chr%s.bin" % chromID)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
\ No newline at end of file