From: Brandon King Date: Sat, 17 Nov 2007 02:09:29 +0000 (+0000) Subject: [project @ Species and build to valid genome dir mapper!] X-Git-Tag: 0.1.0~63 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=249f6a9bd2464f10a0f87c9952fc72bc7713b9ce [project @ Species and build to valid genome dir mapper!] * Config generator returns genome dir with %(species|build)s * This module contains a dictionary generator which given a genome base dir, will generate a dictionary whos key is species|build and whos value is the valid genome dir * requires a file in each genome dir called _metainfo_ * containing: species|build --- diff --git a/bin/genome_mapper.py b/bin/genome_mapper.py new file mode 100644 index 0000000..aacc068 --- /dev/null +++ b/bin/genome_mapper.py @@ -0,0 +1,94 @@ +#!/usr/bin/python +import glob +import sys +import os + +import logging + +class DuplicateGenome(Exception): pass + + +def _has_metainfo(genome_dir): + metapath = os.path.join(genome_dir, '_metainfo_') + if os.path.isfile(metapath): + return True + else: + return False + +def getAvailableGenomes(genome_base_dir): + """ + raises IOError (on genome_base_dir not found) + raises DuplicateGenome on duplicate genomes found. + + returns a double dictionary (i.e. d[species][build] = path) + """ + + # Need valid directory + if not os.path.exists(genome_base_dir): + msg = "Directory does not exist: %s" % (genome_base_dir) + raise IOError, msg + + # Find all subdirectories + filepath_list = glob.glob(os.path.join(genome_base_dir, '*')) + potential_genome_dirs = \ + [ filepath for filepath in filepath_list if os.path.isdir(filepath)] + + # Get list of metadata files + genome_dir_list = \ + [ dirpath \ + for dirpath in potential_genome_dirs \ + if _has_metainfo(dirpath) ] + + # Genome double dictionary + d = {} + + for genome_dir in genome_dir_list: + line = open(os.path.join(genome_dir, '_metainfo_'), 'r').readline().strip() + + # Get species, build... log and skip on failure + try: + species, build = line.split('|') + except: + logging.warning('Skipping: Invalid metafile (%s) line: %s' \ + % (metafile, line)) + continue + + build_dict = d.setdefault(species, {}) + if build in build_dict: + msg = "Duplicate genome for %s|%s" % (species, build) + raise DuplicateGenome, msg + + build_dict[build] = genome_dir + + return d + + +def constructMapperDict(genome_dict): + """ + Creates a dictionary which can map the genome + in the eland config generator output to a local + genome path + + ie. 'Homo sapiens|hg18' -> + """ + mapper_dict = {} + for species in genome_dict.keys(): + for build in genome_dict[species]: + mapper_dict[species+'|'+build] = genome_dict[species][build] + + return mapper_dict + + +if __name__ == '__main__': + + if len(sys.argv) != 2: + print 'useage: %s ' % (sys.argv[0]) + sys.exit(1) + + d = getAvailableGenomes(sys.argv[1]) + d2 = constructMapperDict(d) + + for k,v in d2.items(): + print '%s: %s' % (k,v) + +