X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fgenomes%2Fdmelanogaster.py;h=78ef7fb7594b11314ef1b7602c2a03fcd7cd521d;hp=5c9f53a58bbc4466689f9ee2dc40378db3367c10;hb=HEAD;hpb=4ad5495359e4322da39868020a7398676261679e diff --git a/cistematic/genomes/dmelanogaster.py b/cistematic/genomes/dmelanogaster.py index 5c9f53a..78ef7fb 100644 --- a/cistematic/genomes/dmelanogaster.py +++ b/cistematic/genomes/dmelanogaster.py @@ -1,7 +1,7 @@ ########################################################################### # # # C O P Y R I G H T N O T I C E # -# Copyright (c) 2003-10 by: # +# Copyright (c) 2003-13 by: # # * California Institute of Technology # # # # All Rights Reserved. # @@ -33,7 +33,7 @@ from cistematic.genomes import Genome from os import environ if environ.get("CISTEMATIC_ROOT"): - cisRoot = environ.get("CISTEMATIC_ROOT") + cisRoot = environ.get("CISTEMATIC_ROOT") else: cisRoot = "/proj/genome" @@ -67,24 +67,47 @@ version = {"A": "1", "Z": "26" } -def loadChromosome(db, chromID, chromPath, chromOut): - seqArray = [] - dmGenome = Genome("dmelanogaster", dbFile=db) - inFile = open(chromPath, "r") - line = inFile.readline() - for line in inFile: - seqArray.append(line.strip()) - seq = string.join(seqArray, "") - seqLen = len(seq) - if seqLen < 1: - print "Problems reading sequence from file" +def buildDmelanogasterDB(db=geneDB): + """ genes and annotations are from UCSC. GO association file is from geneontology.org. + """ + genePath = "%s/download/flyBaseGene.txt" % cisRoot + annotPath = "%s/download/gene_info" % cisRoot + goDefPath = "%s/download/GO.terms_and_ids" % cisRoot + goPath = "%s/download/gene2go" % cisRoot - print "writing to file %s" % chromOut - outFile = open("%s%s" % (cisRoot, chromOut), "w") - outFile.write(seq) - outFile.close() - dmGenome.addChromosomeEntry(chromID, chromOut, "file") + print "Creating database %s" % db + createDBFile(db) + + print "Adding gene entries" + loadGeneEntries(db, genePath) + + print "Adding gene features" + loadGeneFeatures(db, genePath) + + print "Adding gene annotations" + loadGeneAnnotations(db, annotPath) + + print "Adding gene ontology" + loadGeneOntology(db, goPath, goDefPath, annotPath) + + chromList = ["2L", "2R", "2LHet", "2RHet", "3L", "3R", "3LHet", "3RHet", + "4", "X", "XHet", "YHet", "U", "UExtra", "M" + ] + for chromID in chromList: + print "Loading chromosome %s" % chromID + chromPath = "%s/download/chr%s.fa" % (cisRoot, chromID) + loadChromosome(db, chromID, chromPath, "/D_melanogaster/chromo%s.bin" % chromID) + + print "Creating Indices" + createDBindices(db) + + print "Finished creating database %s" % db + + +def createDBFile(db): + dmGenome = Genome("dmelanogaster", dbFile=db) + dmGenome.createGeneDB(db) def loadGeneEntries(db, gFile): @@ -196,7 +219,7 @@ def loadGeneFeatures(db, gfile): def loadGeneAnnotations(db, annotPath): geneAnnotations = [] - annotFile = open(annotPath, "r") + annotFile = open(annotPath, "r") dmGenome = Genome("dmelanogaster", dbFile=db) for line in annotFile: try: @@ -222,7 +245,7 @@ def loadGeneOntology(db, goPath, goDefPath, annotPath): dmGenome = Genome("dmelanogaster", dbFile=db) goDefFile = open(goDefPath, "r") goFile = open(goPath, "r") - annotFile = open(annotPath, "r") + annotFile = open(annotPath, "r") annotEntries = annotFile.readlines() annotFile.close() goDefEntries = goDefFile.readlines() @@ -271,60 +294,26 @@ def loadGeneOntology(db, goPath, goDefPath, annotPath): dmGenome.addGoInfoBatch(goArray) -def createDBFile(db): +def loadChromosome(db, chromID, chromPath, chromOut): + seqArray = [] dmGenome = Genome("dmelanogaster", dbFile=db) - dmGenome.createGeneDB(db) + inFile = open(chromPath, "r") + line = inFile.readline() + for line in inFile: + seqArray.append(line.strip()) + + seq = string.join(seqArray, "") + seqLen = len(seq) + if seqLen < 1: + print "Problems reading sequence from file" + + print "writing to file %s" % chromOut + outFile = open("%s%s" % (cisRoot, chromOut), "w") + outFile.write(seq) + outFile.close() + dmGenome.addChromosomeEntry(chromID, chromOut, "file") def createDBindices(db): dmGenome = Genome("dmelanogaster", dbFile=db) dmGenome.createIndices() - - -def buildDmelanogasterDB(db=geneDB): - """ genes and annotations are from UCSC. GO association file is from geneontology.org. - """ - genePath = "%s/download/flyBaseGene.txt" % cisRoot - annotPath = "%s/download/gene_info" % cisRoot - goDefPath = "%s/download/GO.terms_and_ids" % cisRoot - goPath = "%s/download/gene2go" % cisRoot - chromos = {"2L": "%s/download/chr2L.fa" % cisRoot, - "2R": "%s/download/chr2R.fa" % cisRoot, - "2LHet": "%s/download/chr2LHet.fa" % cisRoot, - "2RHet": "%s/download/chr2RHet.fa" % cisRoot, - "3L": "%s/download/chr3L.fa" % cisRoot, - "3LHet": "%s/download/chr3LHet.fa" % cisRoot, - "3R": "%s/download/chr3R.fa" % cisRoot, - "3RHet": "%s/download/chr3RHet.fa" % cisRoot, - "4": "%s/download/chr4.fa" % cisRoot, - "X": "%s/download/chrX.fa" % cisRoot, - "XHet": "%s/download/chrXHet.fa" % cisRoot, - "YHet": "%s/download/chrYHet.fa" % cisRoot, - "U": "%s/download/chrU.fa" % cisRoot, - "Uextra": "%s/download/chrUextra.fa" % cisRoot, - "M": "%s/download/chrM.fa" % cisRoot - } - - print "Creating database %s" % db - createDBFile(db) - - print "Adding gene entries" - loadGeneEntries(db, genePath) - - print "Adding gene features" - loadGeneFeatures(db, genePath) - - print "Adding gene annotations" - loadGeneAnnotations(db, annotPath) - - print "Adding gene ontology" - loadGeneOntology(db, goPath, goDefPath, annotPath) - - for chromID in chromos.keys(): - print "Loading chromosome %s" % chromID - loadChromosome(db, chromID, chromos[chromID], "/D_melanogaster/chromo%s.bin" % chromID) - - print "Creating Indices" - createDBindices(db) - - print "Finished creating database %s" % db \ No newline at end of file