return most recent genome build for the pipeline config file.
[htsworkflow.git] / gaworkflow / pipeline / genome_mapper.py
index aacc06854873ffc1404a5739ab7ffeb5f85cd5d2..90c619ba9a64f9ce2af4700fd3f0f8bba2c8c383 100644 (file)
@@ -2,9 +2,12 @@
 import glob
 import sys
 import os
+import re
 
 import logging
 
+from gaworkflow.util.alphanum import alphanum
+
 class DuplicateGenome(Exception): pass
 
 
@@ -63,22 +66,62 @@ def getAvailableGenomes(genome_base_dir):
   return d
   
 
-def constructMapperDict(genome_dict):
-  """
-  Creates a dictionary which can map the genome
-  in the eland config generator output to a local
-  genome path
-
-  ie. 'Homo sapiens|hg18' -> <genome_dir>
-  """
-  mapper_dict = {}
-  for species in genome_dict.keys():
-    for build in genome_dict[species]:
-      mapper_dict[species+'|'+build] = genome_dict[species][build]
-
-  return mapper_dict
-
-
+class constructMapperDict(object):
+    """
+    Emulate a read-only dictionary mapping genome names to paths.
+
+    It wraps the nested {species: {build: path}} dictionary generated
+    by getAvailableGenomes. Lookups accept either 'species|build'
+    (ie. 'Homo sapiens|hg18') or a bare 'species' name, in which case
+    the path of the last build, in alphanum sort order, is returned.
+    """
+    def __init__(self, genome_dict):
+        # kept by reference, so later changes to genome_dict are visible
+        self.genome_dict = genome_dict
+
+    def __getitem__(self, key):
+        """
+        Return the best matching genome path for key.
+
+        Raises KeyError when key contains more than one '|'; lookup
+        errors from the underlying dictionaries propagate unchanged.
+        """
+        elements = re.split(r"\|", key)
+
+        if len(elements) == 1:
+            # we just have the species name, so get its set of builds
+            builds = self.genome_dict[elements[0]]
+
+            # sort build names the way humans would
+            keys = builds.keys()
+            keys.sort(cmp=alphanum)
+
+            # return the path from the 'last' build name
+            return builds[keys[-1]]
+        elif len(elements) == 2:
+            # we have species, and build name
+            return self.genome_dict[elements[0]][elements[1]]
+        else:
+            raise KeyError("Unrecognized key")
+
+    def keys(self):
+        """Return every 'species|build' name as a string."""
+        return [key for key, path in self.items()]
+
+    def values(self):
+        """Return every genome path, in the same order as keys()."""
+        return [path for key, path in self.items()]
+
+    def items(self):
+        """Return a list of ('species|build', path) tuples."""
+        items = []
+        for species in self.genome_dict.keys():
+            for build in self.genome_dict[species]:
+                # key is a plain string so it round-trips via __getitem__
+                key = species+'|'+build
+                items.append((key, self.genome_dict[species][build]))
+        return items
+
 if __name__ == '__main__':
 
   if len(sys.argv) != 2: