Use named tuple for sequence flowcell/start/stop/project dir extractor
author: Diane Trout <diane@caltech.edu>
Mon, 5 Mar 2012 22:16:11 +0000 (14:16 -0800)
committer: Diane Trout <diane@caltech.edu>
Mon, 5 Mar 2012 22:18:02 +0000 (14:18 -0800)
I was returning too much stuff from get_flowcell_cycle, so I thought
a named tuple would make it more comprehensible.

htsworkflow/pipelines/sequences.py
htsworkflow/pipelines/test/test_sequences.py

index f21e48c66aea85a3a6f71fdfab93e6dfcde40c7e..f3cc9fe6df28a3513d57f0f24f1b67939f429879 100644 (file)
@@ -1,8 +1,10 @@
 """
 Utilities to work with the various eras of sequence archive files
 """
+import collections
 import logging
 import os
+import types
 import re
 
 LOGGER = logging.getLogger(__name__)
@@ -29,6 +31,9 @@ CREATE TABLE %(table)s (
 """ %( {'table': SEQUENCE_TABLE_NAME} )
     return cursor.execute(sql)
 
+FlowcellPath = collections.namedtuple('FlowcellPath',
+                                      'flowcell start stop project')
+
 class SequenceFile(object):
     """
     Simple container class that holds the path to a sequence archive
@@ -118,6 +123,7 @@ def get_flowcell_cycle(path):
     """
     Extract flowcell, cycle from pathname
     """
+    path = os.path.normpath(path)
     project = None
     rest, tail = os.path.split(path)
     if tail.startswith('Project_'):
@@ -140,7 +146,7 @@ def get_flowcell_cycle(path):
     if stop is not None:
         stop = int(stop)
 
-    return flowcell, start, stop, project
+    return FlowcellPath(flowcell, start, stop, project)
 
 def parse_srf(path, filename):
     flowcell_dir, start, stop, project = get_flowcell_cycle(path)
@@ -243,6 +249,9 @@ def scan_for_sequences(dirs):
     Scan through a list of directories for sequence like files
     """
     sequences = []
+    if type(dirs) in types.StringTypes:
+        raise ValueError("You probably want a list or set, not a string")
+
     for d in dirs:
         LOGGER.info("Scanning %s for sequences" % (d,))
         if not os.path.exists(d):
index cede8c2b82644adf88bcfa13bb075ac1bce1f58a..157246a94e3eae277e78df2813f5670aabb89621 100644 (file)
@@ -4,10 +4,27 @@ import unittest
 
 from htsworkflow.pipelines import sequences
 
+
 class SequenceFileTests(unittest.TestCase):
     """
     Make sure the sequence archive class works
     """
+    def test_get_flowcell_cycle(self):
+        tests = [
+            ('/root/42BW9AAXX/C1-152',
+             sequences.FlowcellPath('42BW9AAXX', 1, 152, None)),
+            ('/root/42BW9AAXX/C1-152/',
+             sequences.FlowcellPath('42BW9AAXX', 1, 152, None)),
+            ('/root/42BW9AAXX/C1-152/Project_12345',
+             sequences.FlowcellPath('42BW9AAXX', 1, 152, 'Project_12345')),
+            ('/root/42BW9AAXX/C1-152/Project_12345/',
+             sequences.FlowcellPath('42BW9AAXX', 1, 152, 'Project_12345')),
+        ]
+
+        for t in tests:
+            path = sequences.get_flowcell_cycle(t[0])
+            self.failUnlessEqual(path, t[1])
+
     def test_flowcell_cycle(self):
         """
         Make sure code to parse directory heirarchy works
@@ -38,7 +55,6 @@ class SequenceFileTests(unittest.TestCase):
         path = '/root/42BW9AAXX/other'
         self.failUnlessRaises(ValueError, sequences.get_flowcell_cycle, path)
 
-
     def test_srf(self):
         path = '/root/42BW9AAXX/C1-38'
         name = 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_4.srf'