X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=erange.git;a=blobdiff_plain;f=cistematic%2Fgenomes%2Fmdomestica.py;fp=cistematic%2Fgenomes%2Fmdomestica.py;h=e1fc731dd8e39f88bcbc7d2500ad84cfc7b04fdc;hp=516e784bb7eaedd005a4776029731dd91aae576b;hb=4522d28194e3d1c048bced84038760d394038285;hpb=4ad5495359e4322da39868020a7398676261679e diff --git a/cistematic/genomes/mdomestica.py b/cistematic/genomes/mdomestica.py index 516e784..e1fc731 100644 --- a/cistematic/genomes/mdomestica.py +++ b/cistematic/genomes/mdomestica.py @@ -1,7 +1,7 @@ ########################################################################### # # # C O P Y R I G H T N O T I C E # -# Copyright (c) 2003-10 by: # +# Copyright (c) 2003-13 by: # # * California Institute of Technology # # # # All Rights Reserved. # @@ -33,51 +33,44 @@ from cistematic.genomes import Genome from os import environ if environ.get("CISTEMATIC_ROOT"): - cisRoot = environ.get("CISTEMATIC_ROOT") + cisRoot = environ.get("CISTEMATIC_ROOT") else: cisRoot = "/proj/genome" geneDB = "%s/M_domestica/mdomestica.genedb" % cisRoot -def loadChromosome(db, chromPath, chromOutPath): - seqArray = [] - seqLen = 0 - mdGenome = Genome("mdomestica", dbFile=db) - inFile = open(chromPath, "r") - header = inFile.readline() - while header != "": - seqArray = [] - seqLen = 0 - chromID = header.strip()[1:] - currentLine = inFile.readline() - while currentLine != "" and currentLine[0] != ">": - lineSeq = currentLine.strip() - seqLen += len(lineSeq) - seqArray.append(lineSeq) - currentLine = inFile.readline() +def buildMdomesticaDB(db=geneDB): + genePath = "%s/download/mondom/genscan.txt" % cisRoot + chromoPath = "%s/download/mondom/softMask.fa" % cisRoot + chromoOutPath = "/M_domestica/" - seq = string.join(seqArray, "") - if seqLen < 500000: - print "Added contig %s to database" % chromID - mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen)) - mdGenome.addChromosomeEntry(chromID, chromID, "db") - else: - outFileName = "%s%s.bin" % (chromOutPath, chromID) - outFile = open("%s%s" % (cisRoot, outFileName), "w") - outFile.write(seq) - outFile.close() - print "Added contig file %s to database" % outFileName - mdGenome.addChromosomeEntry(chromID, outFileName, "file") + print "Creating database %s" % db + createDBFile(db) - header = currentLine + print "Adding gene entries" + loadGeneEntries(db, genePath) - inFile.close() + print "Adding gene features" + loadGeneFeatures(db, genePath) + + print "Loading chromosomes" + loadChromosome(db, chromoPath, chromoOutPath) + + print "Creating Indices" + createDBindices(db) + + print "Finished creating database %s" % db + + +def createDBFile(db): + mdGenome = Genome("mdomestica", dbFile=db) + mdGenome.createGeneDB(db) def loadGeneEntries(db, gFile): - """ FIXME - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES - """ + #TODO: - NEED TO DEAL WITH ALTERNATIVE SPLICING ENTRIES + geneEntries = [] mdGenome = Genome("mdomestica", dbFile=db) geneFile = open(gFile, "r") @@ -103,7 +96,7 @@ def loadGeneEntries(db, gFile): def loadGeneAnnotations(db, annotPath): geneAnnotations = [] - annotFile = open(annotPath, "r") + annotFile = open(annotPath, "r") mdGenome = Genome("mdomestica", dbFile=db) for line in annotFile: try: @@ -191,34 +184,41 @@ def loadGeneFeatures(db, gfile): mdGenome.addFeatureEntryBatch(insertArray) -def createDBFile(db): - mdGenome = Genome("mdomestica", dbFile=db) - mdGenome.createGeneDB(db) - - -def createDBindices(db): +def loadChromosome(db, chromPath, chromOutPath): + seqArray = [] + seqLen = 0 mdGenome = Genome("mdomestica", dbFile=db) - mdGenome.createIndices() - - -def buildMdomesticaDB(db=geneDB): - genePath = "%s/download/mondom/genscan.txt" % cisRoot - chromoPath = "%s/download/mondom/softMask.fa" % cisRoot - chromoOutPath = "/M_domestica/" - - print "Creating database %s" % db - createDBFile(db) + inFile = open(chromPath, "r") + header = inFile.readline() + while header != "": + seqArray = [] + seqLen = 0 + chromID = header.strip()[1:] + currentLine = inFile.readline() + while currentLine != "" and currentLine[0] != ">": + lineSeq = currentLine.strip() + seqLen += len(lineSeq) + seqArray.append(lineSeq) + currentLine = inFile.readline() - print "Adding gene entries" - loadGeneEntries(db, genePath) + seq = string.join(seqArray, "") + if seqLen < 500000: + print "Added contig %s to database" % chromID + mdGenome.addSequence(("mdomestica", chromID), seq, "chromosome", str(seqLen)) + mdGenome.addChromosomeEntry(chromID, chromID, "db") + else: + outFileName = "%s%s.bin" % (chromOutPath, chromID) + outFile = open("%s%s" % (cisRoot, outFileName), "w") + outFile.write(seq) + outFile.close() + print "Added contig file %s to database" % outFileName + mdGenome.addChromosomeEntry(chromID, outFileName, "file") - print "Adding gene features" - loadGeneFeatures(db, genePath) + header = currentLine - print "Loading chromosomes" - loadChromosome(db, chromoPath, chromoOutPath) + inFile.close() - print "Creating Indices" - createDBindices(db) - print "Finished creating database %s" % db \ No newline at end of file +def createDBindices(db): + mdGenome = Genome("mdomestica", dbFile=db) + mdGenome.createIndices()