X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fgenomes%2Fcelegans.py;fp=cistematic%2Fgenomes%2Fcelegans.py;h=e9287966e5379ea1fdb7eb45c315b6459b4d2825;hp=e3df297269ddba790fc7380c6010cce2cf876f3b;hb=4522d28194e3d1c048bced84038760d394038285;hpb=4ad5495359e4322da39868020a7398676261679e diff --git a/cistematic/genomes/celegans.py b/cistematic/genomes/celegans.py index e3df297..e928796 100644 --- a/cistematic/genomes/celegans.py +++ b/cistematic/genomes/celegans.py @@ -1,7 +1,7 @@ ########################################################################### # # # C O P Y R I G H T N O T I C E # -# Copyright (c) 2003-10 by: # +# Copyright (c) 2003-13 by: # # * California Institute of Technology # # # # All Rights Reserved. # @@ -33,31 +33,54 @@ from cistematic.genomes import Genome from os import environ if environ.get("CISTEMATIC_ROOT"): - cisRoot = environ.get("CISTEMATIC_ROOT") + cisRoot = environ.get("CISTEMATIC_ROOT") else: cisRoot = "/proj/genome" geneDB = "%s/C_elegans/celegans.genedb" % cisRoot -def loadChromosome(db, chromID, chromPath, chromOut): - seqArray = [] - ceGenome = Genome("celegans", dbFile=db) - inFile = open(chromPath, "r") - line = inFile.readline() - for line in inFile: - seqArray.append(line.strip()) +def buildCelegansDB(db=geneDB, downloadRoot=""): + if downloadRoot == "": + downloadRoot = "%s/download/" % cisRoot - seq = string.join(seqArray, "") - seqLen = len(seq) - if seqLen < 1: - print "Problems reading sequence from file" + geneIDPath = "%sgeneIDs.WS200" % downloadRoot + goDefPath = "%sGO.terms_and_ids" % downloadRoot + goPath = "%sgene_association.wb" % downloadRoot - print "writing to file %s" % chromOut - outFile = open("%s%s" % (cisRoot, chromOut), "w") - outFile.write(seq) - outFile.close() - ceGenome.addChromosomeEntry(chromID, chromOut, "file") + # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz + gffPath = "%selegansWS200.gff" % downloadRoot + + print "Creating database %s" % db + createDBFile(db) + + print "Adding gene entries" + loadGeneEntries(db, gffPath) + + print "Adding feature entries" + loadFeatureEntries(db, gffPath) + + print "Adding gene annotations" + loadGeneAnnotations(db, geneIDPath) + + print "Adding gene ontology" + loadGeneOntology(db, goPath, goDefPath, geneIDPath) + + # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks + for chromID in ["I", "II", "III", "IV", "V", "X"]: + print "Loading chromosome %s" % chromID + chromPath = "%sCHROMOSOME_%s_softmasked.dna" % (downloadRoot, chromID) + loadChromosome(db, chromID, chromPath, "/C_elegans/chr%s.bin" % chromID) + + print "Creating Indices" + createDBindices(db) + + print "Finished creating database %s" % db + + +def createDBFile(db): + ceGenome = Genome("celegans", version="WS200", dbFile=db) + ceGenome.createGeneDB(db) def loadGeneEntries(db, gffFile): @@ -84,6 +107,7 @@ def loadGeneEntries(db, gffFile): else: gidGene = giddots[1] gidLetter = "a" + gid = "%s.%s" % (giddots[0], gidGene) geneID = ("celegans", gid) gidVersion = 1 @@ -177,7 +201,7 @@ def loadFeatureEntries(db, gffFile): def loadGeneAnnotations(db, geneIDPath): geneAnnotations = [] - geneIDFile = open(geneIDPath, "r") + geneIDFile = open(geneIDPath, "r") lines = geneIDFile.readlines() geneIDFile.close() ceGenome = Genome("celegans", dbFile=db) @@ -258,56 +282,26 @@ def loadGeneOntology(db, goPath, goDefPath, geneIDPath): ceGenome.addGoInfoBatch(goArray) -def createDBFile(db): - ceGenome = Genome("celegans", version="WS200", dbFile=db) - ceGenome.createGeneDB(db) +def loadChromosome(db, chromID, chromPath, chromOut): + seqArray = [] + ceGenome = Genome("celegans", dbFile=db) + inFile = open(chromPath, "r") + line = inFile.readline() + for line in inFile: + seqArray.append(line.strip()) + + seq = string.join(seqArray, "") + seqLen = len(seq) + if seqLen < 1: + print "Problems reading sequence from file" + + print "writing to file %s" % chromOut + outFile = open("%s%s" % (cisRoot, chromOut), "w") + outFile.write(seq) + outFile.close() + ceGenome.addChromosomeEntry(chromID, chromOut, "file") def createDBindices(db): ceGenome = Genome("celegans", version="WS200", dbFile=db) ceGenome.createIndices() - - -def buildCelegansDB(db=geneDB, downloadRoot=""): - if downloadRoot == "": - downloadRoot = "%s/download/" % cisRoot - - geneIDPath = "%sgeneIDs.WS200" % downloadRoot - goDefPath = "%sGO.terms_and_ids" % downloadRoot - goPath = "%sgene_association.wb" % downloadRoot - - # can be found at ftp://caltech.wormbase.org/pub/schwarz/cisreg/softmasks - chromos = {"I": "%sCHROMOSOME_I_softmasked.dna" % downloadRoot, - "II": "%sCHROMOSOME_II_softmasked.dna" % downloadRoot, - "III": "%sCHROMOSOME_III_softmasked.dna" % downloadRoot, - "IV": "%sCHROMOSOME_IV_softmasked.dna" % downloadRoot, - "V": "%sCHROMOSOME_V_softmasked.dna" % downloadRoot, - "X": "%sCHROMOSOME_X_softmasked.dna" % downloadRoot - } - - # can be found at ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/genome_feature_tables/GFF2/elegansWS160.gff.gz - gffPath = "%selegansWS200.gff" % downloadRoot - - print "Creating database %s" % db - createDBFile(db) - - print "Adding gene entries" - loadGeneEntries(db, gffPath) - - print "Adding feature entries" - loadFeatureEntries(db, gffPath) - - print "Adding gene annotations" - loadGeneAnnotations(db, geneIDPath) - - print "Adding gene ontology" - loadGeneOntology(db, goPath, goDefPath, geneIDPath) - - for chromID in chromos: - print "Loading chromosome %s" % chromID - loadChromosome(db, chromID, chromos[chromID], "/C_elegans/chr%s.bin" % chromID) - - print "Creating Indices" - createDBindices(db) - - print "Finished creating database %s" % db \ No newline at end of file