From 8155bc04a7f91890b99593a6a1c2a5025b5e4cc6 Mon Sep 17 00:00:00 2001
From: Diane Trout
Date: Fri, 7 Dec 2012 17:40:39 -0800
Subject: [PATCH] Add a FastqName class to create and parse standardized fastq names.

I had a pretty standard naming convention for the fastq file names.
Instead of duplicating the code for creating & parsing them, I thought
I should try to localize the code, so I added
htsworkflow.submission.fastqname.
---
Review notes (ignored by git am):
 * This patch was re-flowed from a copy whose line breaks and indentation
   had been collapsed.  The two new files are reconstructed from Python
   syntax and match the 83/109 line counts in the hunk headers.  Indentation
   and blank context lines in the condorfastq.py hunks are a best-effort
   guess; if they do not apply, try `git am --ignore-whitespace`.
 * The author's e-mail address is missing from the From: header of that
   copy and has not been restored.
 * FASTQ_RE had lost the names of its named groups, leaving "(?P[^_]+)",
   which does not compile.  They are restored as lib_id, flowcell, cycle,
   lane and read (the keys used by FastqName._attributes and by the two
   filename templates), and the pattern strings are now raw strings.
 * TestFastqName defined test_init_single_filename twice, so the first
   (single-end) test was shadowed and never ran.  The second definition is
   renamed test_init_paired_filename.

 htsworkflow/submission/condorfastq.py         |  13 +--
 htsworkflow/submission/fastqname.py           |  83 +++++++++++++
 htsworkflow/submission/test/test_fastqname.py | 109 ++++++++++++++++++
 3 files changed, 197 insertions(+), 8 deletions(-)
 create mode 100644 htsworkflow/submission/fastqname.py
 create mode 100644 htsworkflow/submission/test/test_fastqname.py

diff --git a/htsworkflow/submission/condorfastq.py b/htsworkflow/submission/condorfastq.py
index d79502d..17e4633 100644
--- a/htsworkflow/submission/condorfastq.py
+++ b/htsworkflow/submission/condorfastq.py
@@ -13,6 +13,7 @@ from htsworkflow.pipelines.samplekey import SampleKey
 from htsworkflow.pipelines import qseq2fastq
 from htsworkflow.pipelines import srf2fastq
 from htsworkflow.pipelines import desplit_fastq
+from htsworkflow.submission.fastqname import FastqName
 from htsworkflow.util.rdfhelp import get_model, dump_model, load_into_model, \
      fromTypedNode, \
      stripNamespace
@@ -231,18 +232,14 @@ WHERE {
                 'lib_id': seq.library_id,
                 'lane': seq.lane_number,
                 'read': seq.read,
-                'cycle': seq.cycle
+                'cycle': seq.cycle,
+                'is_paired': seq.ispaired
             }
 
-            if seq.ispaired:
-                target_name = fastq_paired_template % \
-                              filename_attributes
-            else:
-                target_name = fastq_single_template % \
-                              filename_attributes
+            fqName = FastqName(**filename_attributes)
 
             result_dir = result_map[seq.library_id]
-            target_pathname = os.path.join(result_dir, target_name)
+            target_pathname = os.path.join(result_dir, fqName.filename)
             if self.force or not os.path.exists(target_pathname):
                 t = needed_targets.setdefault(target_pathname, {})
                 t.setdefault(seq.filetype, []).append(seq)
diff --git a/htsworkflow/submission/fastqname.py b/htsworkflow/submission/fastqname.py
new file mode 100644
index 0000000..f749d40
--- /dev/null
+++ b/htsworkflow/submission/fastqname.py
@@ -0,0 +1,83 @@
+"""Standardize reading and writing fastq submission names.
+"""
+import collections
+import re
+PAIRED_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}_r{read}.fastq'
+SINGLE_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}.fastq'
+
+FASTQ_RE = re.compile(
+    r'(?P<lib_id>[^_]+)_(?P<flowcell>[^_]+)_'\
+    r'c(?P<cycle>[\d]+)_l(?P<lane>[\d]+)(_r(?P<read>[\d]))?\.fastq')
+
+class FastqName(collections.Mapping):
+    def __init__(self, is_paired=None, **kwargs):
+        self._attributes = ('flowcell', 'lib_id', 'lane', 'read', 'cycle')
+        self._is_paired = is_paired
+
+        if len(kwargs) == 0:
+            return
+        if 'filename' in kwargs:
+            self._init_by_filename(**kwargs)
+        else:
+            self._init_by_attributes(**kwargs)
+
+    def _init_by_attributes(self, **kwargs):
+        for k in self._attributes:
+            value = None
+            if k in kwargs:
+                value = kwargs[k]
+            self[k] = value
+
+
+    def _init_by_filename(self, filename):
+        match = FASTQ_RE.match(filename)
+        if match is None:
+            raise ValueError('Is "{0}" a submission fastq?'.format(filename))
+
+        for k in self._attributes:
+            self[k] = match.group(k)
+
+    def _get_is_paired(self):
+        if self._is_paired is None:
+            return getattr(self, 'read', None) is not None
+        else:
+            return self._is_paired
+    def _set_is_paired(self, value):
+        self._is_paired = value
+    is_paired = property(_get_is_paired, _set_is_paired)
+
+    def _is_valid(self):
+        if self.is_paired and self['read'] is None:
+            return False
+
+        for k in self.keys():
+            if k == 'read':
+                continue
+            if self[k] is None:
+                return False
+        return True
+    is_valid = property(_is_valid)
+
+    def _get_filename(self):
+        if not self.is_valid:
+            raise ValueError(
+                "Please set all needed variables before generating a filename")
+
+        T = PAIRED_TEMPLATE if self.is_paired else SINGLE_TEMPLATE
+        return T.format(**self)
+    filename = property(_get_filename)
+
+    def __iter__(self):
+        return iter(self._attributes)
+
+    def __getitem__(self, key):
+        return getattr(self, key, None)
+
+    def __setitem__(self, key, value):
+        if key in self._attributes:
+            setattr(self, key, value)
+        else:
+            raise ValueError("Unrecognized key {0}".format(key))
+
+    def __len__(self):
+        return len([k for k in self if self[k] is not None])
diff --git a/htsworkflow/submission/test/test_fastqname.py b/htsworkflow/submission/test/test_fastqname.py
new file mode 100644
index 0000000..d51ad0e
--- /dev/null
+++ b/htsworkflow/submission/test/test_fastqname.py
@@ -0,0 +1,109 @@
+from unittest2 import TestCase
+from htsworkflow.submission.fastqname import FastqName
+
+class TestFastqName(TestCase):
+    def test_init_empty(self):
+        fq = FastqName()
+        self.assertEqual(fq.is_valid, False)
+
+    def test_init_single_filename(self):
+        fq = FastqName(filename="12345_AABBCCDDXX_c100_l1.fastq")
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.is_paired, False)
+
+    def test_init_paired_filename(self):
+        fq = FastqName(filename="12345_AABBCCDDXX_c100_l1_r2.fastq")
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.read, "2")
+        self.assertEqual(fq['read'], "2")
+        self.assertEqual(fq.is_paired, True)
+
+    def test_init_bad_filename(self):
+        attribs = {'filename': 'asdf.bam'}
+        self.assertRaises(ValueError, FastqName, **attribs)
+
+    def test_init_single_attributes(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle = "100", lane="1")
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.is_paired, False)
+        self.assertEqual(fq.filename, "12345_AABBCCDDXX_c100_l1.fastq")
+
+    def test_init_single_attributes_set_single(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle = "100", lane="1", is_paired=False)
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.is_paired, False)
+
+    def test_init_single_attributes_set_paired(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle = "100", lane="1", is_paired=True)
+        self.assertEqual(fq.is_valid, False)
+        self.assertEqual(fq.is_paired, True)
+
+    def test_init_paired_attributes(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle = "100", lane="1", read="2")
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.read, "2")
+        self.assertEqual(fq['read'], "2")
+        self.assertEqual(fq.is_paired, True)
+        self.assertEqual(fq.filename, "12345_AABBCCDDXX_c100_l1_r2.fastq")
+
+    def test_init_paired_attributes_set_single(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle = "100", lane="1", read="2", is_paired=False)
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.is_paired, False)
+
+    def test_init_paired_attributes_set_paired(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle = "100", lane="1", read="2", is_paired=True)
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.is_paired, True)
+
+    def test_init_insufficient_attributes(self):
+        attribs = dict(lib_id="12345", flowcell="AABBCCDDXX")
+        fq = FastqName(**attribs)
+        self.assertEqual(fq.is_valid, False)
+
+
+def suite():
+    from unittest2 import TestSuite, defaultTestLoader
+    suite = TestSuite()
+    suite.addTests(defaultTestLoader.loadTestsFromTestCase(TestFastqName))
+    return suite
+
+if __name__ == "__main__":
+    from unittest2 import main
+    main(defaultTest='suite')
-- 
2.30.2