--- /dev/null
+###########################################################################
+# #
+# C O P Y R I G H T N O T I C E #
+# Copyright (c) 2003-10 by: #
+# * California Institute of Technology #
+# #
+# All Rights Reserved. #
+# #
+# Permission is hereby granted, free of charge, to any person #
+# obtaining a copy of this software and associated documentation files #
+# (the "Software"), to deal in the Software without restriction, #
+# including without limitation the rights to use, copy, modify, merge, #
+# publish, distribute, sublicense, and/or sell copies of the Software, #
+# and to permit persons to whom the Software is furnished to do so, #
+# subject to the following conditions: #
+# #
+# The above copyright notice and this permission notice shall be #
+# included in all copies or substantial portions of the Software. #
+# #
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, #
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF #
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND #
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS #
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN #
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN #
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #
+# SOFTWARE. #
+###########################################################################
+#
+# data for Saccharomyces cerevisiae
+import string
+from cistematic.genomes import Genome
+from os import environ
+
+if environ.get("CISTEMATIC_ROOT"):
+ cisRoot = environ.get("CISTEMATIC_ROOT")
+else:
+ cisRoot = "/proj/genome"
+
+geneDB = "%s/S_cerevisiae/scerevisiae.genedb" % cisRoot
+
+
+def loadChromosome(db, chromID, chromPath, chromOut):
+ seqArray = []
+ scGenome = Genome("scerevisiae", dbFile=db)
+ inFile = open(chromPath, "r")
+ line = inFile.readline()
+ for line in inFile:
+ seqArray.append(line.strip())
+
+ seq = string.join(seqArray, "")
+ seqLen = len(seq)
+ if seqLen < 1:
+ print "Problems reading sequence from file"
+
+ print "writing to file %s" % chromOut
+ outFile = open("%s%s" % (cisRoot, chromOut), "w")
+ outFile.write(seq)
+ outFile.close()
+ seq = ""
+ print "calling scGenome()"
+ scGenome.addChromosomeEntry(chromID, chromOut, "file")
+
+
+def loadGeneEntries(db, gFile):
+ geneEntries = []
+ geneFeatures = []
+ scGenome = Genome("scerevisiae", dbFile=db)
+ geneFile = open(gFile, "r")
+ for line in geneFile:
+ field = line.split("\t")
+ if field[1] != "ORF":
+ continue
+
+ orfName = field[3].strip()
+ sense = field[11]
+ chrom = field[8].strip()
+ if sense == "W":
+ sense = "F"
+ try:
+ start = int(field[9].strip()) - 1
+ stop = int(field[10].strip()) - 1
+ except:
+ start = 0
+ stop = 0
+ else:
+ sense = "R"
+ try:
+ start = int(field[10].strip()) - 1
+ stop = int(field[9].strip()) - 1
+ except:
+ start = 0
+ stop = 0
+
+ geneID = ("scerevisiae", orfName)
+ gidVersion = 1
+ geneEntries.append((geneID, chrom, start, stop, sense, "chromosomal_feature", gidVersion))
+ geneFeatures.append((geneID, gidVersion, chrom, start, stop, sense, "CDS"))
+
+ print "loading %d gene entries" % len(geneEntries)
+ scGenome.addGeneEntryBatch(geneEntries)
+ print "loading %d gene features" % len(geneFeatures)
+ scGenome.addFeatureEntryBatch(geneFeatures)
+
+
+def loadGeneAnnotations(db, annotPath):
+ geneAnnotations = []
+ annotFile = open(annotPath, "r")
+ lines = annotFile.readlines()
+ annotFile.close()
+ scGenome = Genome("scerevisiae", dbFile=db)
+ for line in lines:
+ field = line.split("\t")
+ if field[1] != "ORF":
+ continue
+
+ try:
+ orfName = field[6].strip()
+ description = field[15].strip()
+ geneAnnotations.append((("scerevisiae", orfName), string.replace(description, "'", "p")))
+ except:
+ pass
+
+ print "Adding %d annotations" % len(geneAnnotations)
+ scGenome.addAnnotationBatch(geneAnnotations)
+
+
+def loadGeneOntology(db, goPath, goDefPath):
+ scGenome = Genome("scerevisiae", version="SGD1", dbFile=db)
+ goDefFile = open(goDefPath, "r")
+ goFile = open(goPath, "r")
+ goDefEntries = goDefFile.readlines()
+ goDefs = {}
+ for goDefEntry in goDefEntries:
+ if goDefEntry[0] != "!":
+ cols = goDefEntry.split("\t")
+ goDefs[cols[0]] = (cols[1], cols[2].strip())
+
+ goEntries = goFile.readlines()
+ goArray = []
+ for line in goEntries:
+ if line[0] == "!":
+ continue
+
+ fields = line.split("\t")
+ genes = fields[10].split("|")
+ gID = genes[0]
+ GOID = fields[4]
+ objType = fields[11]
+ objNameArray = fields[10].split("|")
+ objName = objNameArray[0]
+ isNot = fields[3]
+ try:
+ GOterm = string.replace(goDefs[GOID][0], "'", "p")
+ except:
+ print "Could not translate %s" % (GOID)
+ GOterm = ""
+
+ evidence = fields[6]
+ goArray.append((("scerevisiae", gID), GOID[3:], objType, objName, isNot, GOterm, evidence, fields[1]))
+
+ scGenome.addGoInfoBatch(goArray)
+
+
+def createDBFile(db):
+ scGenome = Genome("scerevisiae", version="SGD1", dbFile=db)
+ scGenome.createGeneDB(db)
+
+
+def createDBindices(db):
+ scGenome = Genome("scerevisiae", version="SGD1", dbFile=db)
+ scGenome.createIndices()
+
+
+def buildScerevisiaeDB(db=geneDB):
+ genePath = "%s/download/SGD_features.tab" % cisRoot
+ goDefPath = "%s/download/GO.terms_and_ids" % cisRoot
+ goPath = "%s/download/gene_association.sgd" % cisRoot
+ chromos = {"1": "%s/download/chr01.fsa" % cisRoot,
+ "2": "%s/download/chr02.fsa" % cisRoot,
+ "3": "%s/download/chr03.fsa" % cisRoot,
+ "4": "%s/download/chr04.fsa" % cisRoot,
+ "5": "%s/download/chr05.fsa" % cisRoot,
+ "6": "%s/download/chr06.fsa" % cisRoot,
+ "7": "%s/download/chr07.fsa" % cisRoot,
+ "8": "%s/download/chr08.fsa" % cisRoot,
+ "9": "%s/download/chr09.fsa" % cisRoot,
+ "10": "%s/download/chr10.fsa" % cisRoot,
+ "11": "%s/download/chr11.fsa" % cisRoot,
+ "12": "%s/download/chr12.fsa" % cisRoot,
+ "13": "%s/download/chr13.fsa" % cisRoot,
+ "14": "%s/download/chr14.fsa" % cisRoot,
+ "15": "%s/download/chr15.fsa" % cisRoot,
+ "16": "%s/download/chr16.fsa" % cisRoot
+ }
+
+ print "Creating database %s" % db
+ createDBFile(db)
+
+ print "Adding gene entries"
+ loadGeneEntries(db, genePath)
+
+ print "Adding gene annotations"
+ loadGeneAnnotations(db, genePath)
+
+ print "Adding gene ontology"
+ loadGeneOntology(db, goPath, goDefPath)
+
+ for chromID in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"]:
+ print "Loading chromosome %s" % chromID
+ loadChromosome(db, chromID, chromos[chromID], "/S_cerevisiae/chr%s.bin" % chromID)
+
+ print "Creating Indices"
+ createDBindices(db)
+
+ print "Finished creating database %s" % db
\ No newline at end of file