Add a FastqName class to create and parse standardized fastq names.
author Diane Trout <diane@caltech.edu>
Sat, 8 Dec 2012 01:40:39 +0000 (17:40 -0800)
committer Diane Trout <diane@caltech.edu>
Sat, 8 Dec 2012 01:40:39 +0000 (17:40 -0800)
I had a pretty standard naming convention for the fastq file names.
Instead of duplicating the code for creating & parsing them,
I thought I should try to localize the code.

So I just added htsworkflow.submission.fastqname

htsworkflow/submission/condorfastq.py
htsworkflow/submission/fastqname.py [new file with mode: 0644]
htsworkflow/submission/test/test_fastqname.py [new file with mode: 0644]

index d79502d2af7052a06e871cc0520af2e056b7ec7f..17e463351282b7b8091f922760e8b1fd69a2c934 100644 (file)
@@ -13,6 +13,7 @@ from htsworkflow.pipelines.samplekey import SampleKey
 from htsworkflow.pipelines import qseq2fastq
 from htsworkflow.pipelines import srf2fastq
 from htsworkflow.pipelines import desplit_fastq
+from htsworkflow.submission.fastqname import FastqName
 from htsworkflow.util.rdfhelp import get_model, dump_model, load_into_model, \
      fromTypedNode, \
      stripNamespace
@@ -231,18 +232,14 @@ WHERE {
                 'lib_id': seq.library_id,
                 'lane': seq.lane_number,
                 'read': seq.read,
-                'cycle': seq.cycle
+                'cycle': seq.cycle,
+                'is_paired': seq.ispaired
             }
 
-            if seq.ispaired:
-                target_name = fastq_paired_template % \
-                              filename_attributes
-            else:
-                target_name = fastq_single_template % \
-                              filename_attributes
+            fqName = FastqName(**filename_attributes)
 
             result_dir = result_map[seq.library_id]
-            target_pathname = os.path.join(result_dir, target_name)
+            target_pathname = os.path.join(result_dir, fqName.filename)
             if self.force or not os.path.exists(target_pathname):
                 t = needed_targets.setdefault(target_pathname, {})
                 t.setdefault(seq.filetype, []).append(seq)
diff --git a/htsworkflow/submission/fastqname.py b/htsworkflow/submission/fastqname.py
new file mode 100644 (file)
index 0000000..f749d40
--- /dev/null
@@ -0,0 +1,141 @@
+"""Standardize reading and writing fastq submission names.
+
+Two file name layouts are handled, see PAIRED_TEMPLATE and SINGLE_TEMPLATE;
+FASTQ_RE parses either of them back into its components.
+"""
+import collections
+import re
+
+try:
+    from collections.abc import Mapping
+except ImportError:
+    # Python 2 keeps the abstract base classes directly in collections.
+    Mapping = collections.Mapping
+
+PAIRED_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}_r{read}.fastq'
+SINGLE_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}.fastq'
+
+# Raw strings keep the regex escapes (\d, \.) out of Python's own string
+# escape handling.  The pattern is used with match(), so it is anchored at
+# the start of the name only; the _r<read> suffix is optional.
+FASTQ_RE = re.compile(
+    r'(?P<lib_id>[^_]+)_(?P<flowcell>[^_]+)_'
+    r'c(?P<cycle>\d+)_l(?P<lane>\d+)(_r(?P<read>\d))?\.fastq')
+
+
+class FastqName(Mapping):
+    """Build or parse a standardized submission fastq file name.
+
+    The name components listed in _attributes are available both as
+    attributes (fq.lane) and as mapping items (fq['lane']).  Create an
+    instance from the components or from an existing file name::
+
+        FastqName(lib_id='12345', flowcell='AABBCCDDXX', cycle=100, lane=1)
+        FastqName(filename='12345_AABBCCDDXX_c100_l1_r2.fastq')
+
+    Components parsed from a file name are stored as the matched strings.
+    """
+    _attributes = ('flowcell', 'lib_id', 'lane', 'read', 'cycle')
+
+    def __init__(self, is_paired=None, **kwargs):
+        """Initialize from a filename keyword or from name components.
+
+        :param is_paired: force the paired/single choice; when None it is
+            inferred from whether read has a value.
+        :param kwargs: either filename on its own, or any of the keys in
+            _attributes (other keywords are ignored in that case).
+        :raises ValueError: if filename does not match FASTQ_RE.
+        """
+        self._is_paired = is_paired
+        # Default every component so attribute access (fq.lane) agrees with
+        # item access (fq['lane']) even on an empty or partial name.
+        for k in self._attributes:
+            setattr(self, k, None)
+
+        if not kwargs:
+            return
+        if 'filename' in kwargs:
+            self._init_by_filename(**kwargs)
+        else:
+            self._init_by_attributes(**kwargs)
+
+    def _init_by_attributes(self, **kwargs):
+        """Set every component from kwargs; missing ones become None."""
+        for k in self._attributes:
+            self[k] = kwargs.get(k)
+
+    def _init_by_filename(self, filename):
+        """Set every component by parsing filename with FASTQ_RE.
+
+        :raises ValueError: if filename does not match FASTQ_RE.
+        """
+        match = FASTQ_RE.match(filename)
+        if match is None:
+            raise ValueError('Is "{0}" a submission fastq?'.format(filename))
+
+        for k in self._attributes:
+            self[k] = match.group(k)
+
+    def _get_is_paired(self):
+        """Return the explicit is_paired flag, else infer it from read."""
+        if self._is_paired is None:
+            return getattr(self, 'read', None) is not None
+        return self._is_paired
+
+    def _set_is_paired(self, value):
+        self._is_paired = value
+
+    is_paired = property(_get_is_paired, _set_is_paired)
+
+    def _is_valid(self):
+        """Return True when enough components are set to build a filename.
+
+        read is only required for paired names; everything else is always
+        required.
+        """
+        if self.is_paired and self['read'] is None:
+            return False
+        return all(self[k] is not None for k in self if k != 'read')
+
+    is_valid = property(_is_valid)
+
+    def _get_filename(self):
+        """Return the file name built from the current components.
+
+        :raises ValueError: if the name is not valid (see is_valid).
+        """
+        if not self.is_valid:
+            raise ValueError(
+                "Please set all needed variables before generating a filename")
+
+        T = PAIRED_TEMPLATE if self.is_paired else SINGLE_TEMPLATE
+        return T.format(**self)
+
+    filename = property(_get_filename)
+
+    def __iter__(self):
+        return iter(self._attributes)
+
+    def __getitem__(self, key):
+        """Return the component named key, or None if it has no value.
+
+        :raises KeyError: for keys outside _attributes, as the Mapping
+            protocol expects; get() and the in operator rely on this.
+        """
+        if key not in self._attributes:
+            raise KeyError(key)
+        return getattr(self, key, None)
+
+    def __setitem__(self, key, value):
+        """Set the component named key.
+
+        :raises ValueError: if key is not one of _attributes.
+        """
+        if key in self._attributes:
+            setattr(self, key, value)
+        else:
+            raise ValueError("Unrecognized key {0}".format(key))
+
+    def __len__(self):
+        """Return how many components currently have a value."""
+        return len([k for k in self if self[k] is not None])
diff --git a/htsworkflow/submission/test/test_fastqname.py b/htsworkflow/submission/test/test_fastqname.py
new file mode 100644 (file)
index 0000000..d51ad0e
--- /dev/null
@@ -0,0 +1,120 @@
+from unittest2 import TestCase
+from htsworkflow.submission.fastqname import FastqName
+
+
+class TestFastqName(TestCase):
+    """Check building and parsing of standardized fastq names."""
+
+    def test_init_empty(self):
+        fq = FastqName()
+        self.assertEqual(fq.is_valid, False)
+
+    def test_init_single_filename(self):
+        fq = FastqName(filename="12345_AABBCCDDXX_c100_l1.fastq")
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.is_paired, False)
+
+    # Formerly a second test_init_single_filename; the duplicate name
+    # replaced the test above in the class, so that one never ran.
+    def test_init_paired_filename(self):
+        fq = FastqName(filename="12345_AABBCCDDXX_c100_l1_r2.fastq")
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.read, "2")
+        self.assertEqual(fq['read'], "2")
+        self.assertEqual(fq.is_paired, True)
+
+    def test_init_bad_filename(self):
+        attribs = {'filename': 'asdf.bam'}
+        self.assertRaises(ValueError, FastqName, **attribs)
+
+    def test_init_single_attributes(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle="100", lane="1")
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.is_paired, False)
+        self.assertEqual(fq.filename, "12345_AABBCCDDXX_c100_l1.fastq")
+
+    def test_init_single_attributes_set_single(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle="100", lane="1", is_paired=False)
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.is_paired, False)
+
+    def test_init_single_attributes_set_paired(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle="100", lane="1", is_paired=True)
+        self.assertEqual(fq.is_valid, False)
+        self.assertEqual(fq.is_paired, True)
+
+    def test_init_paired_attributes(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle="100", lane="1", read="2")
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.read, "2")
+        self.assertEqual(fq['read'], "2")
+        self.assertEqual(fq.is_paired, True)
+        self.assertEqual(fq.filename, "12345_AABBCCDDXX_c100_l1_r2.fastq")
+
+    def test_init_paired_attributes_set_single(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle="100", lane="1", read="2", is_paired=False)
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.is_paired, False)
+
+    def test_init_paired_attributes_set_paired(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle="100", lane="1", read="2", is_paired=True)
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.is_paired, True)
+
+    def test_init_insufficient_attributes(self):
+        attribs = dict(lib_id="12345", flowcell="AABBCCDDXX")
+        fq = FastqName(**attribs)
+        self.assertEqual(fq.is_valid, False)
+
+    def test_set_unknown_key(self):
+        fq = FastqName()
+        self.assertRaises(ValueError, fq.__setitem__, 'bogus', 1)
+
+
+def suite():
+    """Return a TestSuite holding every TestFastqName test."""
+    from unittest2 import TestSuite, defaultTestLoader
+    tests = TestSuite()
+    tests.addTests(defaultTestLoader.loadTestsFromTestCase(TestFastqName))
+    return tests
+
+
+if __name__ == "__main__":
+    from unittest2 import main
+    main(defaultTest='suite')