9 from htsworkflow.util.alphanum import alphanum
class DuplicateGenome(Exception):
    """Raised when the same species|build pair is found more than once."""
def _has_metainfo(genome_dir):
    """
    Report whether genome_dir contains a '_metainfo_' file.

    returns True when <genome_dir>/_metainfo_ exists and is a regular file
    (as judged by os.path.isfile), False otherwise.
    """
    return os.path.isfile(os.path.join(genome_dir, '_metainfo_'))
def getAvailableGenomes(genome_base_dir):
    """
    Build a species -> build -> path lookup from a directory of genomes.

    Every immediate subdirectory of genome_base_dir that contains a
    '_metainfo_' file is treated as a genome directory.  The first line of
    that file is expected to read 'species|build'; a directory whose first
    line does not split into exactly two '|' separated fields is logged
    and skipped.

    raises IOError (on genome_base_dir not found)
    raises DuplicateGenome on duplicate genomes found.

    returns a double dictionary (i.e. d[species][build] = path)
    """
    # Need valid directory
    if not os.path.exists(genome_base_dir):
        msg = "Directory does not exist: %s" % (genome_base_dir)
        raise IOError(msg)

    # Genome double dictionary
    d = {}

    # Find all subdirectories
    for genome_dir in glob.glob(os.path.join(genome_base_dir, '*')):
        if not os.path.isdir(genome_dir):
            continue

        # Only directories that carry a metadata file are genomes.  The
        # path is built once and reused for the check and the read.
        metapath = os.path.join(genome_dir, '_metainfo_')
        if not os.path.isfile(metapath):
            continue

        # Close the handle promptly instead of leaving it to the GC
        with open(metapath, 'r') as metafile:
            line = metafile.readline().strip()

        # Get species, build... log and skip on failure
        try:
            species, build = line.split('|')
        except ValueError:
            # unpacking fails when the line has too few or too many fields
            logging.warning('Skipping: Invalid metafile (%s) line: %s',
                            metapath, line)
            continue

        build_dict = d.setdefault(species, {})
        if build in build_dict:
            msg = "Duplicate genome for %s|%s" % (species, build)
            raise DuplicateGenome(msg)

        build_dict[build] = genome_dir

    return d
class constructMapperDict(object):
    """
    Emulate a dictionary to map genome|build names to paths.

    It uses the dictionary generated by getAvailableGenomes.
    """
    def __init__(self, genome_dict):
        # double dictionary: genome_dict[species][build] = path
        self.genome_dict = genome_dict

    def __getitem__(self, key):
        """
        Return the best match for key

        key is either 'species' or 'species|build'.  With only a species,
        the path of the build that sorts last under alphanum ordering is
        returned.  A lookup that cannot be resolved is logged and yields
        the placeholder string "NoGenomeAvailable" instead of raising
        KeyError.
        """
        # function-scope import: cmp_to_key adapts the cmp-style alphanum
        # comparator to the key= form that works on Python 2.7 and 3.x
        from functools import cmp_to_key

        elements = re.split(r"\|", key)

        try:
            if len(elements) == 1:
                # we have just the species name
                # get the set of builds
                builds = self.genome_dict[elements[0]]
                if not builds:
                    # no build to pick from; handled like an unknown key
                    raise KeyError("No builds available")

                # sort build names the way humans would
                keys = sorted(builds, key=cmp_to_key(alphanum))

                # return the path from the 'last' build name
                return builds[keys[-1]]

            elif len(elements) == 2:
                # we have species, and build name
                return self.genome_dict[elements[0]][elements[1]]
            else:
                raise KeyError("Unrecognized key")
        except KeyError:
            logging.error('Unrecognized genome identifier: %s', elements)
            return "NoGenomeAvailable"

    def _iter_entries(self):
        """Yield (species, build, path) for every genome in genome_dict."""
        for species, builds in self.genome_dict.items():
            for build, path in builds.items():
                yield species, build, path

    def keys(self):
        """
        Return a list with one entry per genome.

        Each entry is a one-element list holding the 'species|build' string.
        """
        # NOTE(review): the list wrapper means these entries cannot be passed
        # back to __getitem__ (re.split rejects a list).  Kept as-is for
        # compatibility -- confirm whether any caller relies on it.
        return [[species + '|' + build]
                for species, build, _ in self._iter_entries()]

    def values(self):
        """Return a list of every genome path, in the same order as keys()."""
        return [path for _, _, path in self._iter_entries()]

    def items(self):
        """Return a list of (key, path) pairs; key has the keys() shape."""
        return [([species + '|' + build], path)
                for species, build, path in self._iter_entries()]
if __name__ == '__main__':
    # Command-line entry point: print every genome found under the base
    # directory given as the single argument.
    if len(sys.argv) != 2:
        print('usage: %s <base_genome_dir>' % (sys.argv[0]))
        sys.exit(1)

    d = getAvailableGenomes(sys.argv[1])
    d2 = constructMapperDict(d)

    # single-argument print(...) behaves the same on Python 2 and 3
    for k, v in d2.items():
        print('%s: %s' % (k, v))