From f12244468a66fedbbc758c2a2cbeba9d3a1ed9bc Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Mon, 5 Mar 2012 14:16:11 -0800 Subject: [PATCH] Use named tuple for sequence flowcell/start/stop/project dir extractor I was returning too much stuff from get_flowcell_cycle, so I thought a named tuple would make it more comprehensible. --- htsworkflow/pipelines/sequences.py | 11 ++++++++++- htsworkflow/pipelines/test/test_sequences.py | 18 +++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/htsworkflow/pipelines/sequences.py b/htsworkflow/pipelines/sequences.py index f21e48c..f3cc9fe 100644 --- a/htsworkflow/pipelines/sequences.py +++ b/htsworkflow/pipelines/sequences.py @@ -1,8 +1,10 @@ """ Utilities to work with the various eras of sequence archive files """ +import collections import logging import os +import types import re LOGGER = logging.getLogger(__name__) @@ -29,6 +31,9 @@ CREATE TABLE %(table)s ( """ %( {'table': SEQUENCE_TABLE_NAME} ) return cursor.execute(sql) +FlowcellPath = collections.namedtuple('FlowcellPath', + 'flowcell start stop project') + class SequenceFile(object): """ Simple container class that holds the path to a sequence archive @@ -118,6 +123,7 @@ def get_flowcell_cycle(path): """ Extract flowcell, cycle from pathname """ + path = os.path.normpath(path) project = None rest, tail = os.path.split(path) if tail.startswith('Project_'): @@ -140,7 +146,7 @@ def get_flowcell_cycle(path): if stop is not None: stop = int(stop) - return flowcell, start, stop, project + return FlowcellPath(flowcell, start, stop, project) def parse_srf(path, filename): flowcell_dir, start, stop, project = get_flowcell_cycle(path) @@ -243,6 +249,9 @@ def scan_for_sequences(dirs): Scan through a list of directories for sequence like files """ sequences = [] + if type(dirs) in types.StringTypes: + raise ValueError("You probably want a list or set, not a string") + for d in dirs: LOGGER.info("Scanning %s for sequences" % (d,)) if not os.path.exists(d): diff --git a/htsworkflow/pipelines/test/test_sequences.py b/htsworkflow/pipelines/test/test_sequences.py index cede8c2..157246a 100644 --- a/htsworkflow/pipelines/test/test_sequences.py +++ b/htsworkflow/pipelines/test/test_sequences.py @@ -4,10 +4,27 @@ import unittest from htsworkflow.pipelines import sequences + class SequenceFileTests(unittest.TestCase): """ Make sure the sequence archive class works """ + def test_get_flowcell_cycle(self): + tests = [ + ('/root/42BW9AAXX/C1-152', + sequences.FlowcellPath('42BW9AAXX', 1, 152, None)), + ('/root/42BW9AAXX/C1-152/', + sequences.FlowcellPath('42BW9AAXX', 1, 152, None)), + ('/root/42BW9AAXX/C1-152/Project_12345', + sequences.FlowcellPath('42BW9AAXX', 1, 152, 'Project_12345')), + ('/root/42BW9AAXX/C1-152/Project_12345/', + sequences.FlowcellPath('42BW9AAXX', 1, 152, 'Project_12345')), + ] + + for t in tests: + path = sequences.get_flowcell_cycle(t[0]) + self.failUnlessEqual(path, t[1]) + def test_flowcell_cycle(self): """ Make sure code to parse directory heirarchy works @@ -38,7 +55,6 @@ class SequenceFileTests(unittest.TestCase): path = '/root/42BW9AAXX/other' self.failUnlessRaises(ValueError, sequences.get_flowcell_cycle, path) - def test_srf(self): path = '/root/42BW9AAXX/C1-38' name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_4.srf' -- 2.30.2