Return a named tuple from the sequence flowcell/start/stop/project directory extractor

[htsworkflow.git] / htsworkflow / pipelines / sequences.py
diff --git a/htsworkflow/pipelines/sequences.py b/htsworkflow/pipelines/sequences.py

index f21e48c66aea85a3a6f71fdfab93e6dfcde40c7e..f3cc9fe6df28a3513d57f0f24f1b67939f429879 100644 (file)
--- a/htsworkflow/pipelines/sequences.py
+++ b/htsworkflow/pipelines/sequences.py
@@ -1,8 +1,10 @@
  """
  Utilities to work with the various eras of sequence archive files
  """
+import collections
  import logging
  import os
+import types
  import re
  
  LOGGER = logging.getLogger(__name__)
@@ -29,6 +31,9 @@ CREATE TABLE %(table)s (
  """ %( {'table': SEQUENCE_TABLE_NAME} )
      return cursor.execute(sql)
  
+FlowcellPath = collections.namedtuple('FlowcellPath',
+                                      'flowcell start stop project')
+
  class SequenceFile(object):
      """
      Simple container class that holds the path to a sequence archive
@@ -118,6 +123,7 @@ def get_flowcell_cycle(path):
      """
      Extract flowcell, cycle from pathname
      """
+    path = os.path.normpath(path)
      project = None
      rest, tail = os.path.split(path)
      if tail.startswith('Project_'):
@@ -140,7 +146,7 @@ def get_flowcell_cycle(path):
      if stop is not None:
          stop = int(stop)
  
-    return flowcell, start, stop, project
+    return FlowcellPath(flowcell, start, stop, project)
  
  def parse_srf(path, filename):
      flowcell_dir, start, stop, project = get_flowcell_cycle(path)
@@ -243,6 +249,9 @@ def scan_for_sequences(dirs):
      Scan through a list of directories for sequence like files
      """
      sequences = []
+    if type(dirs) in types.StringTypes:
+        raise ValueError("You probably want a list or set, not a string")
+
      for d in dirs:
          LOGGER.info("Scanning %s for sequences" % (d,))
          if not os.path.exists(d):