From: Diane Trout Date: Fri, 30 Jan 2015 21:47:01 +0000 (-0800) Subject: Convert alphanum sort from comparator to key operator X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=f0f14a8bd6c9aa2a2e4bf60b0c62e50c3d521451 Convert alphanum sort from comparator to key operator I found a much shorter algorithm over on stackoverflow. --- diff --git a/htsworkflow/pipelines/genome_mapper.py b/htsworkflow/pipelines/genome_mapper.py index 83bdecb..fc70590 100644 --- a/htsworkflow/pipelines/genome_mapper.py +++ b/htsworkflow/pipelines/genome_mapper.py @@ -6,7 +6,7 @@ import re import logging -from htsworkflow.util.alphanum import alphanum +from htsworkflow.util.alphanum import natural_sort_key LOGGER = logging.getLogger(__name__) class DuplicateGenome(Exception): pass @@ -89,7 +89,7 @@ class constructMapperDict(object): # sort build names the way humans would keys = builds.keys() - keys.sort(cmp=alphanum) + keys.sort(key=natural_sort_key) # return the path from the 'last' build name return builds[keys[-1]] diff --git a/htsworkflow/pipelines/retrieve_config.py b/htsworkflow/pipelines/retrieve_config.py index 3fdd8bf..a00bbd0 100644 --- a/htsworkflow/pipelines/retrieve_config.py +++ b/htsworkflow/pipelines/retrieve_config.py @@ -17,7 +17,7 @@ except ImportError as e: from htsworkflow.auth import apidata from htsworkflow.util import api -from htsworkflow.util import alphanum +from htsworkflow.util.alphanum import natural_sort_key from htsworkflow.util.url import normalize_url from htsworkflow.pipelines.genome_mapper import \ getAvailableGenomes, \ @@ -411,7 +411,7 @@ def format_pooled_libraries(shared, library): elif (type(sequences) == types.DictType): pooled = [] multiplex_ids = sequences.keys() - multiplex_ids.sort(cmp=alphanum.alphanum) + multiplex_ids.sort(key=natural_sort_key) for multiplex_id in multiplex_ids: sample = {} sample.update(shared) diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py index 
7acb294..b12af02 100644 --- a/htsworkflow/pipelines/runfolder.py +++ b/htsworkflow/pipelines/runfolder.py @@ -22,7 +22,6 @@ from htsworkflow.pipelines import ElementTree, \ VERSION_RE, USER_RE, \ LANES_PER_FLOWCELL, LANE_LIST from htsworkflow.pipelines.samplekey import LANE_SAMPLE_KEYS -from htsworkflow.util.alphanum import alphanum from htsworkflow.util.ethelp import indent, flatten from htsworkflow.util.queuecommands import QueueCommands diff --git a/htsworkflow/util/alphanum.py b/htsworkflow/util/alphanum.py index c9d6649..3729661 100644 --- a/htsworkflow/util/alphanum.py +++ b/htsworkflow/util/alphanum.py @@ -1,60 +1,13 @@ -# -# The Alphanum Algorithm is an improved sorting algorithm for strings -# containing numbers. Instead of sorting numbers in ASCII order like -# a standard sort, this algorithm sorts numbers in numeric order. -# -# The Alphanum Algorithm is discussed at http://www.DaveKoelle.com -# -#* Python implementation provided by Chris Hulan (chris.hulan@gmail.com) -#* Distributed under same license as original -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. 
-# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -# +# from http://stackoverflow.com/questions/4836710/does-python-have-a-built-in-function-for-string-natural-sort +# modified by Diane Trout import re -import types -# -# TODO: Make decimal points be considered in the same class as digits -# - -def chunkify(str): - """ - return a list of numbers and non-numeric substrings of +str+ - the numeric substrings are converted to integer, non-numeric are left as is - """ - if type(str) in types.StringTypes: - chunks = re.findall("(\d+|\D+)",str) - #convert numeric strings to numbers - chunks = [re.match('\d',x) and int(x) or x for x in chunks] - return chunks - elif type(str) in [types.IntType, types.LongType, types.FloatType]: - return [str] +def natural_sort_key(s, _nsre=re.compile('([0-9]+)')): + if isinstance(s, type("")) or isinstance(s, type(u"")): + return [int(text) if text.isdigit() else text.lower() + for text in re.split(_nsre, s)] + elif isinstance(s, int): + return [s] else: - raise ValueError("Unsupported type %s for input %s" % (type(str), str)) - -def alphanum(a,b): - """ - breaks +a+ and +b+ into pieces and returns left-to-right comparison of the pieces - - +a+ and +b+ are expected to be strings (for example file names) with numbers and non-numeric characters - Split the values into list of numbers and non numeric sub-strings and so comparison of numbers gives - Numeric sorting, comparison of non-numeric gives Lexicographic order - """ - # split strings into chunks - aChunks = chunkify(a) - bChunks = chunkify(b) - - return cmp(aChunks,bChunks) #built in comparison works once data is prepared + raise ValueError("Unsupported type %s for input %s" % (type(s), s)) diff --git a/htsworkflow/util/test/test_alphanum.py b/htsworkflow/util/test/test_alphanum.py index a47e682..b9ff493 100644 --- 
a/htsworkflow/util/test/test_alphanum.py +++ b/htsworkflow/util/test/test_alphanum.py @@ -2,43 +2,43 @@ import copy import os from unittest import TestCase -from htsworkflow.util.alphanum import alphanum +from htsworkflow.util.alphanum import natural_sort_key class testAlphanum(TestCase): def test_string(self): unsorted = ['z5', 'b3', 'b10', 'a001', 'a2'] sorted = [ 'a001', 'a2', 'b3', 'b10', 'z5'] scratch = copy.copy(unsorted) - scratch.sort(alphanum) + scratch.sort(key=natural_sort_key) - for i in xrange(len(scratch)): - self.failIfEqual(scratch[i], unsorted[i]) - for i in xrange(len(scratch)): - self.failUnlessEqual(scratch[i], sorted[i]) + for i, s in enumerate(scratch): + self.failIfEqual(s, unsorted[i]) + for i, s in enumerate(scratch): + self.failUnlessEqual(s, sorted[i]) def test_numbers(self): unsorted = [5,7,10,18,-1,3] sorted = [-1,3,5,7,10,18] scratch = copy.copy(unsorted) - scratch.sort(alphanum) + scratch.sort(key=natural_sort_key) - for i in xrange(len(scratch)): - self.failIfEqual(scratch[i], unsorted[i]) - for i in xrange(len(scratch)): - self.failUnlessEqual(scratch[i], sorted[i]) + for i, s in enumerate(scratch): + self.failIfEqual(s, unsorted[i]) + for i, s in enumerate(scratch): + self.failUnlessEqual(s, sorted[i]) def test_long_names(self): unsorted = ["1000X Radonius Maximus","10X Radonius","200X Radonius","20X Radonius","20X Radonius Prime","30X Radonius","40X Radonius","Allegia 50 Clasteron","Allegia 500 Clasteron","Allegia 51 Clasteron","Allegia 51B Clasteron","Allegia 52 Clasteron","Allegia 60 Clasteron","Alpha 100","Alpha 2","Alpha 200","Alpha 2A","Alpha 2A-8000","Alpha 2A-900","Callisto Morphamax","Callisto Morphamax 500","Callisto Morphamax 5000","Callisto Morphamax 600","Callisto Morphamax 700","Callisto Morphamax 7000","Callisto Morphamax 7000 SE","Callisto Morphamax 7000 SE2","QRS-60 Intrinsia Machine","QRS-60F Intrinsia Machine","QRS-62 Intrinsia Machine","QRS-62F Intrinsia Machine","Xiph Xlater 10000","Xiph Xlater 2000","Xiph 
Xlater 300","Xiph Xlater 40","Xiph Xlater 5","Xiph Xlater 50","Xiph Xlater 500","Xiph Xlater 5000","Xiph Xlater 58"] expected = ['10X Radonius', '20X Radonius', '20X Radonius Prime', '30X Radonius', '40X Radonius', '200X Radonius', '1000X Radonius Maximus', 'Allegia 50 Clasteron', 'Allegia 51 Clasteron', 'Allegia 51B Clasteron', 'Allegia 52 Clasteron', 'Allegia 60 Clasteron', 'Allegia 500 Clasteron', 'Alpha 2', 'Alpha 2A', 'Alpha 2A-900', 'Alpha 2A-8000', 'Alpha 100', 'Alpha 200', 'Callisto Morphamax', 'Callisto Morphamax 500', 'Callisto Morphamax 600', 'Callisto Morphamax 700', 'Callisto Morphamax 5000', 'Callisto Morphamax 7000', 'Callisto Morphamax 7000 SE', 'Callisto Morphamax 7000 SE2', 'QRS-60 Intrinsia Machine', 'QRS-60F Intrinsia Machine', 'QRS-62 Intrinsia Machine', 'QRS-62F Intrinsia Machine', 'Xiph Xlater 5', 'Xiph Xlater 40', 'Xiph Xlater 50', 'Xiph Xlater 58', 'Xiph Xlater 300', 'Xiph Xlater 500', 'Xiph Xlater 2000', 'Xiph Xlater 5000', 'Xiph Xlater 10000'] s = unsorted[:] - s.sort(alphanum) + s.sort(key=natural_sort_key) self.failUnlessEqual(s, expected) def test_bad_input(self): unsorted = [object(), (1,3j)] s = unsorted[:] - self.failUnlessRaises(ValueError, s.sort, alphanum) + self.failUnlessRaises(ValueError, s.sort, key=natural_sort_key) def suite():