Return the most recent genome build for the pipeline config file when only a species name is given.

author Diane Trout <diane@caltech.edu>

Wed, 23 Jan 2008 01:52:40 +0000 (01:52 +0000)

committer Diane Trout <diane@caltech.edu>

Wed, 23 Jan 2008 01:52:40 +0000 (01:52 +0000)
author Diane Trout <diane@caltech.edu>
Wed, 23 Jan 2008 01:52:40 +0000 (01:52 +0000)
committer Diane Trout <diane@caltech.edu>
Wed, 23 Jan 2008 01:52:40 +0000 (01:52 +0000)
diff --git a/gaworkflow/pipeline/genome_mapper.py b/gaworkflow/pipeline/genome_mapper.py

index aacc06854873ffc1404a5739ab7ffeb5f85cd5d2..90c619ba9a64f9ce2af4700fd3f0f8bba2c8c383 100644 (file)
--- a/gaworkflow/pipeline/genome_mapper.py
+++ b/gaworkflow/pipeline/genome_mapper.py
@@ -2,9 +2,12 @@
  import glob
  import sys
  import os
+import re
  
  import logging
  
+from gaworkflow.util.alphanum import alphanum
+
  class DuplicateGenome(Exception): pass
  
  
@@ -63,22 +66,62 @@ def getAvailableGenomes(genome_base_dir):
    return d
    
  
-def constructMapperDict(genome_dict):
-  """
-  Creates a dictionary which can map the genome
-  in the eland config generator output to a local
-  genome path
-
-  ie. 'Homo sapiens|hg18' -> <genome_dir>
-  """
-  mapper_dict = {}
-  for species in genome_dict.keys():
-    for build in genome_dict[species]:
-      mapper_dict[species+'|'+build] = genome_dict[species][build]
-
-  return mapper_dict
-
-
+class constructMapperDict(object):
+    """
+    Emulate a read-only dictionary mapping genome names to paths.
+
+    It wraps the nested dictionary generated by getAvailableGenomes
+    (species -> build -> path) and accepts two key forms:
+
+      'species|build' -> path of that exact build
+      'species'       -> path of the build whose name sorts last
+                         under the alphanum ordering
+
+    Because only __getitem__ is needed for it, an instance can be used
+    as the mapping in "%(Homo sapiens|hg18)s" % mapper formatting.
+    """
+    def __init__(self, genome_dict):
+        # kept by reference, not copied: later changes to genome_dict
+        # are visible through this mapper
+        self.genome_dict = genome_dict
+
+    def __getitem__(self, key):
+        """
+        Return the best matching path for key.
+
+        Raises KeyError if the species or build is unknown, if the
+        species has no builds, or if key contains more than one '|'.
+        """
+        # a literal separator needs no regular expression
+        elements = key.split('|')
+
+        if len(elements) == 1:
+            # we just have the species name
+            # get the set of builds
+            builds = self.genome_dict[elements[0]]
+            if not builds:
+                # report a missing key instead of letting keys[-1]
+                # fail with an IndexError below
+                raise KeyError(key)
+
+            # sort build names the way humans would (mm9 before mm10)
+            keys = list(builds.keys())
+            keys.sort(cmp=alphanum)
+
+            # return the path from the 'last' build name
+            return builds[keys[-1]]
+
+        elif len(elements) == 2:
+            # we have species, and build name
+            return self.genome_dict[elements[0]][elements[1]]
+        else:
+            raise KeyError("Unrecognized key")
+
+    def _iter_items(self):
+        """
+        Yield a ('species|build', path) pair for every known build.
+        """
+        for species, builds in self.genome_dict.items():
+            for build, path in builds.items():
+                yield species + '|' + build, path
+
+    def keys(self):
+        """
+        Return every 'species|build' name as a list of strings.
+
+        Each returned key can be passed straight back to __getitem__.
+        """
+        return [key for key, path in self._iter_items()]
+
+    def values(self):
+        """
+        Return the path of every build, in the same order as keys().
+        """
+        return [path for key, path in self._iter_items()]
+
+    def items(self):
+        """
+        Return ('species|build', path) tuples for every build.
+        """
+        return list(self._iter_items())
+
+            
  if __name__ == '__main__':
  
    if len(sys.argv) != 2:
diff --git a/gaworkflow/pipeline/test/test_genome_mapper.py b/gaworkflow/pipeline/test/test_genome_mapper.py

new file mode 100644 (file)

index 0000000..c8366d1
--- /dev/null
+++ b/gaworkflow/pipeline/test/test_genome_mapper.py
@@ -0,0 +1,33 @@
+import unittest
+
+from StringIO import StringIO
+from gaworkflow.pipeline import genome_mapper
+
+class testGenomeMapper(unittest.TestCase):
+    """Exercise genome_mapper.constructMapperDict."""
+
+    def test_construct_mapper(self):
+        """Look up builds through '%' formatting and count the entries."""
+        mouse_builds = {'mm8': '/mm8', 'mm9': '/mm9', 'mm10': '/mm10'}
+        available = {
+            'Arabidopsis thaliana': {'v01212004': '/arabidopsis'},
+            'Homo sapiens': {'hg18': '/hg18'},
+            'Mus musculus': mouse_builds,
+            'Phage': {'174': '/phi'},
+        }
+        mapper = genome_mapper.constructMapperDict(available)
+
+        # (format template, expected path); the bare species name is
+        # expected to resolve to mm10 rather than mm9
+        lookups = [
+            ("%(Mus musculus|mm8)s", "/mm8"),
+            ("%(Phage|174)s", "/phi"),
+            ("%(Mus musculus)s", "/mm10"),
+            ("%(Mus musculus|mm8)s", "/mm8"),
+            ("%(Mus musculus|mm10)s", "/mm10"),
+        ]
+        for template, expected in lookups:
+            self.failUnlessEqual(template % mapper, expected)
+
+        # six species|build combinations are defined above
+        for listing in (mapper.keys, mapper.values, mapper.items):
+            self.failUnlessEqual(len(listing()), 6)
+
+
+        
+        
+def suite():
+    """Return a suite holding the 'test*' methods of testGenomeMapper."""
+    loader = unittest.TestLoader()
+    return loader.loadTestsFromTestCase(testGenomeMapper)
+
+# Run the tests returned by suite() when this file is executed as a script.
+if __name__ == "__main__":
+    unittest.main(defaultTest="suite")
diff --git a/gaworkflow/util/__init__.py b/gaworkflow/util/__init__.py

new file mode 100644 (file)

index 0000000..e69de29
diff --git a/gaworkflow/util/alphanum.py b/gaworkflow/util/alphanum.py

new file mode 100644 (file)

index 0000000..8893bdb
--- /dev/null
+++ b/gaworkflow/util/alphanum.py
@@ -0,0 +1,61 @@
+#\r
+# The Alphanum Algorithm is an improved sorting algorithm for strings\r
+# containing numbers.  Instead of sorting numbers in ASCII order like\r
+# a standard sort, this algorithm sorts numbers in numeric order.\r
+#\r
+# The Alphanum Algorithm is discussed at http://www.DaveKoelle.com\r
+#\r
+#* Python implementation provided by Chris Hulan (chris.hulan@gmail.com)\r
+#* Distributed under same license as original\r
+#\r
+# This library is free software; you can redistribute it and/or\r
+# modify it under the terms of the GNU Lesser General Public\r
+# License as published by the Free Software Foundation; either\r
+# version 2.1 of the License, or any later version.\r
+#\r
+# This library is distributed in the hope that it will be useful,\r
+# but WITHOUT ANY WARRANTY; without even the implied warranty of\r
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\r
+# Lesser General Public License for more details.\r
+#\r
+# You should have received a copy of the GNU Lesser General Public\r
+# License along with this library; if not, write to the Free Software\r
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA\r
+#\r
+\r
+import re\r
+\r
+#\r
+# TODO: Make decimal points be considered in the same class as digits\r
+#\r
+\r
+def chunkify(str):\r
+       """return a list of numbers and non-numeric substrings of +str+\r
+\r
+       the numeric substrings are converted to integer, non-numeric are left as is,\r
+       for example "mm10" gives ["mm", 10] and "a0b" gives ["a", 0, "b"];\r
+       an empty string gives an empty list\r
+       """\r
+       # every chunk is either a run of digits or a run of non-digits\r
+       chunks = re.findall(r"(\d+|\D+)", str)\r
+       # decide on the regex match itself, not on the converted value: the\r
+       # "match and int(x) or x" idiom left "0" as a string because int 0 is falsy\r
+       return [int(x) if re.match(r'\d', x) else x for x in chunks]\r
+\r
+def alphanum(a,b):\r
+       """cmp-style comparison of the strings +a+ and +b+, piece by piece\r
+\r
+       both arguments (for example file names mixing digits and other characters)\r
+       are split by chunkify into integers and non-numeric substrings; the two\r
+       chunk lists are then compared left to right, so digit runs are ordered\r
+       numerically and the remaining text lexicographically\r
+       """\r
+       # chunk both sides the same way, then let the built in list comparison decide\r
+       left, right = map(chunkify, (a, b))\r
+       return cmp(left, right)\r
+\r
+\r
+\r
+if __name__ == "__main__":\r
+       # demo: print a list of sample names ordered with alphanum\r
+       unsorted = [\r
+              "1000X Radonius Maximus", "10X Radonius", "200X Radonius",\r
+              "20X Radonius", "20X Radonius Prime", "30X Radonius",\r
+              "40X Radonius", "Allegia 50 Clasteron", "Allegia 500 Clasteron",\r
+              "Allegia 51 Clasteron", "Allegia 51B Clasteron",\r
+              "Allegia 52 Clasteron", "Allegia 60 Clasteron", "Alpha 100",\r
+              "Alpha 2", "Alpha 200", "Alpha 2A", "Alpha 2A-8000",\r
+              "Alpha 2A-900", "Callisto Morphamax", "Callisto Morphamax 500",\r
+              "Callisto Morphamax 5000", "Callisto Morphamax 600",\r
+              "Callisto Morphamax 700", "Callisto Morphamax 7000",\r
+              "Callisto Morphamax 7000 SE", "Callisto Morphamax 7000 SE2",\r
+              "QRS-60 Intrinsia Machine", "QRS-60F Intrinsia Machine",\r
+              "QRS-62 Intrinsia Machine", "QRS-62F Intrinsia Machine",\r
+              "Xiph Xlater 10000", "Xiph Xlater 2000", "Xiph Xlater 300",\r
+              "Xiph Xlater 40", "Xiph Xlater 5", "Xiph Xlater 50",\r
+              "Xiph Xlater 500", "Xiph Xlater 5000", "Xiph Xlater 58",\r
+       ]\r
+       # sort a copy so the input list keeps its original order\r
+       ordered = list(unsorted)\r
+       ordered.sort(alphanum)\r
+       print '+++++Sorted...++++'\r
+       print '\n'.join(ordered)\r
author	Diane Trout <diane@caltech.edu>
	Wed, 23 Jan 2008 01:52:40 +0000 (01:52 +0000)
committer	Diane Trout <diane@caltech.edu>
	Wed, 23 Jan 2008 01:52:40 +0000 (01:52 +0000)
gaworkflow/pipeline/genome_mapper.py		patch \| blob \| history
gaworkflow/pipeline/test/test_genome_mapper.py	[new file with mode: 0644]	patch \| blob
gaworkflow/util/__init__.py	[new file with mode: 0644]	patch \| blob
gaworkflow/util/alphanum.py	[new file with mode: 0644]	patch \| blob