X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fgenomes%2Fathaliana.py;fp=cistematic%2Fgenomes%2Fathaliana.py;h=9d2424639de8466d65510ce26ea0bf9402c5021c;hp=628be4dc4789ec61f8775429fd1d261020209063;hb=4522d28194e3d1c048bced84038760d394038285;hpb=4ad5495359e4322da39868020a7398676261679e diff --git a/cistematic/genomes/athaliana.py b/cistematic/genomes/athaliana.py index 628be4d..9d24246 100644 --- a/cistematic/genomes/athaliana.py +++ b/cistematic/genomes/athaliana.py @@ -1,7 +1,7 @@ ########################################################################### # # # C O P Y R I G H T N O T I C E # -# Copyright (c) 2003-10 by: # +# Copyright (c) 2003-13 by: # # * California Institute of Technology # # # # All Rights Reserved. # @@ -33,10 +33,10 @@ from cistematic.genomes import Genome from os import environ if environ.get("CISTEMATIC_ROOT"): - cisRoot = environ.get("CISTEMATIC_ROOT") + cisRoot = environ.get("CISTEMATIC_ROOT") else: cisRoot = "/proj/genome" - + geneDB = "%s/A_thaliana/athaliana.genedb" % cisRoot chromSize = {"1": 30432563, @@ -83,29 +83,40 @@ def decodeGFF3(cols): return (fType, gid, chrom, start, stop, sense, otherDict) -def loadChromosome(db, chromID, chromPath, chromOut): - seqArray = [] - atGenome = Genome("athaliana", dbFile=db) +def buildArabidopsisDB(db=geneDB, downloadDir="%s/download" % cisRoot): + genePath = "%s/TAIR9_GFF3_genes_transposons.gff" % downloadDir + annotPath = "%s/TAIR9_functional_descriptions" % downloadDir + goPath = "%s/ATH_GO_GOSLIM.txt" % downloadDir - inFile = open(chromPath, "r") - line = inFile.readline() - for line in inFile: - seqArray.append(line.strip()) + print "Creating database %s" % db + createDBFile(db) - seq = string.join(seqArray,"") - seqLen = len(seq) - if seqLen < 1: - print "Problems reading sequence from file" + print "Adding gene entries" + loadGeneEntries(db, genePath) - print "writing to file %s" % (chromOut) - outFile = open(cisRoot + chromOut, "w") - outFile.write(seq) - outFile.close() - seq = "" + print "Adding feature entries" + loadFeatureEntries(db, genePath) - atGenome.addChromosomeEntry(chromID, chromOut, "file") - # Add alternative chromID - should be A-O and 01-09 - atGenome.addChromosomeEntry("chromo%s" % chromID, chromOut, "file") + print "Adding gene annotations" + loadGeneAnnotations(db, annotPath) + + print "Adding gene ontology" + loadGeneOntology(db, goPath) + + for chromID in ["1", "2", "3", "4", "5", "C", "M"]: + print "Loading chromosome %s" % chromID + chromPath = "%s/chr%s.fas" % (downloadDir, chromID) + loadChromosome(db, chromID, chromPath, "/A_thaliana/chr%s.bin" % chromID) + + print "Creating Indices" + createDBindices(db) + + print "Finished creating database %s" % db + + +def createDBFile(db): + atGenome = Genome("athaliana", dbFile=db) + atGenome.createGeneDB(db) def loadGeneEntries(db, gFile): @@ -133,12 +144,13 @@ def loadFeatureEntries(db, gFile): featureEntries = [] trackedGenes = [] atGenome = Genome("athaliana", dbFile=db) - featureTranslation = {"CDS": "CDS", + featureTranslation = {"CDS": "CDS", "three_prime_UTR": "3UTR", "five_prime_UTR": "5UTR", "miRNA": "5UTR", "exon": "5UTR" } + geneFile = open(gFile, "r") for line in geneFile: fields = line.split("\t") @@ -178,7 +190,7 @@ def loadFeatureEntries(db, gFile): def loadGeneAnnotations(db, annotPath): geneAnnotations = [] - annotFile = open(annotPath, "r") + annotFile = open(annotPath, "r") annotFile.readline() lines = annotFile.readlines() annotFile.close() @@ -220,50 +232,31 @@ def loadGeneOntology(db, goPath): atGenome.addGoInfoBatch(goArray) -def createDBFile(db): - atGenome = Genome("athaliana", dbFile=db) - atGenome.createGeneDB(db) - - -def createDBindices(db): +def loadChromosome(db, chromID, chromPath, chromOut): + seqArray = [] atGenome = Genome("athaliana", dbFile=db) - atGenome.createIndices() - - -def buildArabidopsisDB(db=geneDB, downloadDir="%s/download" % cisRoot): - genePath = "%s/TAIR9_GFF3_genes_transposons.gff" % downloadDir - annotPath = "%s/TAIR9_functional_descriptions" % downloadDir - goPath = "%s/ATH_GO_GOSLIM.txt" % downloadDir - chromos = {"1": "%s/chr1.fas" % downloadDir, - "2": "%s/chr2.fas" % downloadDir, - "3": "%s/chr3.fas" % downloadDir, - "4": "%s/chr4.fas" % downloadDir, - "5": "%s/chr5.fas" % downloadDir, - "C": "%s/chrC.fas" % downloadDir, - "M": "%s/chrM.fas" % downloadDir - } - - print "Creating database %s" % db - createDBFile(db) - - print "Adding gene entries" - loadGeneEntries(db, genePath) - - print "Adding feature entries" - loadFeatureEntries(db, genePath) + inFile = open(chromPath, "r") + line = inFile.readline() + for line in inFile: + seqArray.append(line.strip()) - print "Adding gene annotations" - loadGeneAnnotations(db, annotPath) + seq = string.join(seqArray,"") + seqLen = len(seq) + if seqLen < 1: + print "Problems reading sequence from file" - print "Adding gene ontology" - loadGeneOntology(db, goPath) + print "writing to file %s" % (chromOut) + outFile = open(cisRoot + chromOut, "w") + outFile.write(seq) + outFile.close() + seq = "" - for chromID in chromos.keys(): - print "Loading chromosome %s" % chromID - loadChromosome(db, chromID, chromos[chromID], "/A_thaliana/chr%s.bin" % chromID) + atGenome.addChromosomeEntry(chromID, chromOut, "file") + # Add alternative chromID - should be A-O and 01-09 + atGenome.addChromosomeEntry("chromo%s" % chromID, chromOut, "file") - print "Creating Indices" - createDBindices(db) - print "Finished creating database %s" % db \ No newline at end of file +def createDBindices(db): + atGenome = Genome("athaliana", dbFile=db) + atGenome.createIndices()