From: Diane Trout Date: Wed, 23 Jan 2008 01:52:40 +0000 (+0000) Subject: return most recent genome build for the pipeline config file. X-Git-Tag: 0.1.0~2 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=60581e340d01f974805ef5895ee43d441ebf66db return most recent genome build for the pipeline config file. Brandon's original pipeline customization code replaced things like %(genome|build)s with the path to the ELAND genome files. What I did was make it possible to substitute keys like %(genome)s in addition to %(genome|build)s. The idea is that most config files will be set to use whatever is the "most recent" build, but hopefully at some point we'll provide some way of specifying which build. The way I defined the "most recent" genome build was to use the alphanum sort, which sorts mixed alpha/numeric strings in the 'natural' order instead of ASCII order, thus "mm10" > "mm8". For the genomes that we have installed right now, this works for everything but Arabidopsis, which appears to be using a version number of the form MMDDYYYY; if we changed it to YYYYMMDD, everything should sort correctly. --- diff --git a/gaworkflow/pipeline/genome_mapper.py b/gaworkflow/pipeline/genome_mapper.py index aacc068..90c619b 100644 --- a/gaworkflow/pipeline/genome_mapper.py +++ b/gaworkflow/pipeline/genome_mapper.py @@ -2,9 +2,12 @@ import glob import sys import os +import re import logging +from gaworkflow.util.alphanum import alphanum + class DuplicateGenome(Exception): pass @@ -63,22 +66,62 @@ def getAvailableGenomes(genome_base_dir): return d -def constructMapperDict(genome_dict): - """ - Creates a dictionary which can map the genome - in the eland config generator output to a local - genome path - - ie. 
'Homo sapiens|hg18' -> - """ - mapper_dict = {} - for species in genome_dict.keys(): - for build in genome_dict[species]: - mapper_dict[species+'|'+build] = genome_dict[species][build] - - return mapper_dict - - +class constructMapperDict(object): + """ + Emulate a dictionary to map genome|build names to paths. + + It uses the dictionary generated by getAvailableGenomes. + """ + def __init__(self, genome_dict): + self.genome_dict = genome_dict + + def __getitem__(self, key): + """ + Return the best match for key + """ + elements = re.split("\|", key) + + if len(elements) == 1: + # we just the species name + # get the set of builds + builds = self.genome_dict[elements[0]] + + # sort build names the way humans would + keys = builds.keys() + keys.sort(cmp=alphanum) + + # return the path from the 'last' build name + return builds[keys[-1]] + + elif len(elements) == 2: + # we have species, and build name + return self.genome_dict[elements[0]][elements[1]] + else: + raise KeyError("Unrecognized key") + + def keys(self): + keys = [] + for species in self.genome_dict.keys(): + for build in self.genome_dict[species]: + keys.append([species+'|'+build]) + return keys + + def values(self): + values = [] + for species in self.genome_dict.keys(): + for build in self.genome_dict[species]: + values.append(self.genome_dict[species][build]) + return values + + def items(self): + items = [] + for species in self.genome_dict.keys(): + for build in self.genome_dict[species]: + key = [species+'|'+build] + value = self.genome_dict[species][build] + items.append((key, value)) + return items + if __name__ == '__main__': if len(sys.argv) != 2: diff --git a/gaworkflow/pipeline/test/test_genome_mapper.py b/gaworkflow/pipeline/test/test_genome_mapper.py new file mode 100644 index 0000000..c8366d1 --- /dev/null +++ b/gaworkflow/pipeline/test/test_genome_mapper.py @@ -0,0 +1,33 @@ +import unittest + +from StringIO import StringIO +from gaworkflow.pipeline import genome_mapper + +class 
testGenomeMapper(unittest.TestCase): + def test_construct_mapper(self): + genomes = { + 'Arabidopsis thaliana': {'v01212004': '/arabidopsis'}, + 'Homo sapiens': {'hg18': '/hg18'}, + 'Mus musculus': {'mm8': '/mm8', + 'mm9': '/mm9', + 'mm10': '/mm10'}, + 'Phage': {'174': '/phi'}, + } + genome_map = genome_mapper.constructMapperDict(genomes) + + self.failUnlessEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8") + self.failUnlessEqual("%(Phage|174)s" % (genome_map), "/phi") + self.failUnlessEqual("%(Mus musculus)s" % (genome_map), "/mm10") + self.failUnlessEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8") + self.failUnlessEqual("%(Mus musculus|mm10)s" % (genome_map), "/mm10") + + self.failUnlessEqual(len(genome_map.keys()), 6) + self.failUnlessEqual(len(genome_map.values()), 6) + self.failUnlessEqual(len(genome_map.items()), 6) + + +def suite(): + return unittest.makeSuite(testGenomeMapper,'test') + +if __name__ == "__main__": + unittest.main(defaultTest="suite") diff --git a/gaworkflow/util/__init__.py b/gaworkflow/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gaworkflow/util/alphanum.py b/gaworkflow/util/alphanum.py new file mode 100644 index 0000000..8893bdb --- /dev/null +++ b/gaworkflow/util/alphanum.py @@ -0,0 +1,61 @@ +# +# The Alphanum Algorithm is an improved sorting algorithm for strings +# containing numbers. Instead of sorting numbers in ASCII order like +# a standard sort, this algorithm sorts numbers in numeric order. +# +# The Alphanum Algorithm is discussed at http://www.DaveKoelle.com +# +#* Python implementation provided by Chris Hulan (chris.hulan@gmail.com) +#* Distributed under same license as original +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or any later version. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# + +import re + +# +# TODO: Make decimal points be considered in the same class as digits +# + +def chunkify(str): + """return a list of numbers and non-numeric substrings of +str+ + + the numeric substrings are converted to integer, non-numeric are left as is + """ + chunks = re.findall("(\d+|\D+)",str) + chunks = [re.match('\d',x) and int(x) or x for x in chunks] #convert numeric strings to numbers + return chunks + +def alphanum(a,b): + """breaks +a+ and +b+ into pieces and returns left-to-right comparison of the pieces + + +a+ and +b+ are expected to be strings (for example file names) with numbers and non-numeric characters + Split the values into list of numbers and non numeric sub-strings and so comparison of numbers gives + Numeric sorting, comparison of non-numeric gives Lexicographic order + """ + # split strings into chunks + aChunks = chunkify(a) + bChunks = chunkify(b) + + return cmp(aChunks,bChunks) #built in comparison works once data is prepared + + + +if __name__ == "__main__": + unsorted = ["1000X Radonius Maximus","10X Radonius","200X Radonius","20X Radonius","20X Radonius Prime","30X Radonius","40X Radonius","Allegia 50 Clasteron","Allegia 500 Clasteron","Allegia 51 Clasteron","Allegia 51B Clasteron","Allegia 52 Clasteron","Allegia 60 Clasteron","Alpha 100","Alpha 2","Alpha 200","Alpha 2A","Alpha 2A-8000","Alpha 2A-900","Callisto Morphamax","Callisto Morphamax 500","Callisto Morphamax 5000","Callisto Morphamax 600","Callisto Morphamax 700","Callisto Morphamax 7000","Callisto 
Morphamax 7000 SE","Callisto Morphamax 7000 SE2","QRS-60 Intrinsia Machine","QRS-60F Intrinsia Machine","QRS-62 Intrinsia Machine","QRS-62F Intrinsia Machine","Xiph Xlater 10000","Xiph Xlater 2000","Xiph Xlater 300","Xiph Xlater 40","Xiph Xlater 5","Xiph Xlater 50","Xiph Xlater 500","Xiph Xlater 5000","Xiph Xlater 58"] + sorted = unsorted[:] + sorted.sort(alphanum) + print '+++++Sorted...++++' + print '\n'.join(sorted)