from htsworkflow.pipelines import qseq2fastq
from htsworkflow.pipelines import srf2fastq
from htsworkflow.pipelines import desplit_fastq
+from htsworkflow.submission.fastqname import FastqName
from htsworkflow.util.rdfhelp import get_model, dump_model, load_into_model, \
fromTypedNode, \
stripNamespace
'lib_id': seq.library_id,
'lane': seq.lane_number,
'read': seq.read,
- 'cycle': seq.cycle
+ 'cycle': seq.cycle,
+ 'is_paired': seq.ispaired
}
- if seq.ispaired:
- target_name = fastq_paired_template % \
- filename_attributes
- else:
- target_name = fastq_single_template % \
- filename_attributes
+ fqName = FastqName(**filename_attributes)
result_dir = result_map[seq.library_id]
- target_pathname = os.path.join(result_dir, target_name)
+ target_pathname = os.path.join(result_dir, fqName.filename)
if self.force or not os.path.exists(target_pathname):
t = needed_targets.setdefault(target_pathname, {})
t.setdefault(seq.filetype, []).append(seq)
--- /dev/null
+"""Standardize reading and writing fastq submission names.
+"""
+import collections
+import re
+# Filename layouts for submission fastq files.  The paired layout is the
+# single layout with a trailing "_r<read>" field.
+PAIRED_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}_r{read}.fastq'
+SINGLE_TEMPLATE = '{lib_id}_{flowcell}_c{cycle}_l{lane}.fastq'
+
+# Parses names produced by the templates above back into their fields.
+# Raw strings keep the regex escapes (\d, \.) from being read as string
+# escapes.  The "read" group is optional and None for single-ended names.
+# The pattern has no end anchor, so text following ".fastq" is accepted
+# when it is used with match().
+FASTQ_RE = re.compile(
+    r'(?P<lib_id>[^_]+)_(?P<flowcell>[^_]+)_'
+    r'c(?P<cycle>[\d]+)_l(?P<lane>[\d]+)(_r(?P<read>[\d]))?\.fastq')
+
+class FastqName(collections.Mapping):
+    """Convert between fastq attributes and the standard submission filename.
+
+    Instances behave as a mapping over the keys 'flowcell', 'lib_id',
+    'lane', 'read' and 'cycle'; the same values are also available as
+    instance attributes.  Keys that have not been set read back as None.
+    """
+    def __init__(self, is_paired=None, **kwargs):
+        """Create a fastq name handler.
+
+        :param is_paired: True/False to force paired or single naming, or
+            None to infer it from whether 'read' has a value.
+        :param kwargs: either ``filename`` (parsed with FASTQ_RE) or any of
+            the attributes flowcell, lib_id, lane, read, cycle.  With no
+            keyword arguments the object starts out empty.
+        :raises ValueError: if ``filename`` is given and does not parse.
+        """
+        self._attributes = ('flowcell', 'lib_id', 'lane', 'read', 'cycle')
+        self._is_paired = is_paired
+
+        if not kwargs:
+            return
+        if 'filename' in kwargs:
+            self._init_by_filename(**kwargs)
+        else:
+            self._init_by_attributes(**kwargs)
+
+    def _init_by_attributes(self, **kwargs):
+        """Set every known attribute from kwargs; missing ones become None.
+
+        Keyword arguments that are not known attributes are ignored.
+        """
+        for k in self._attributes:
+            self[k] = kwargs.get(k)
+
+    def _init_by_filename(self, filename):
+        """Set every known attribute from the fields parsed out of filename.
+
+        :raises ValueError: if filename does not match FASTQ_RE.
+        """
+        match = FASTQ_RE.match(filename)
+        if match is None:
+            raise ValueError('Is "{0}" a submission fastq?'.format(filename))
+
+        for k in self._attributes:
+            self[k] = match.group(k)
+
+    def _get_is_paired(self):
+        # An explicit setting wins; otherwise a name counts as paired
+        # exactly when a read number is present.
+        if self._is_paired is None:
+            return getattr(self, 'read', None) is not None
+        return self._is_paired
+
+    def _set_is_paired(self, value):
+        self._is_paired = value
+    is_paired = property(_get_is_paired, _set_is_paired)
+
+    def _is_valid(self):
+        """Return True when enough attributes are set to build a filename.
+
+        'read' is only required for paired names; every other attribute
+        is always required.
+        """
+        if self.is_paired and self['read'] is None:
+            return False
+
+        return all(self[k] is not None
+                   for k in self._attributes if k != 'read')
+    is_valid = property(_is_valid)
+
+    def _get_filename(self):
+        """Return the submission filename for the current attributes.
+
+        :raises ValueError: if the attributes are not valid (see is_valid).
+        """
+        if not self.is_valid:
+            raise ValueError(
+                "Please set all needed variables before generating a filename")
+
+        T = PAIRED_TEMPLATE if self.is_paired else SINGLE_TEMPLATE
+        return T.format(**self)
+    filename = property(_get_filename)
+
+    def __iter__(self):
+        return iter(self._attributes)
+
+    def __getitem__(self, key):
+        # Only the declared attributes are mapping keys.  Raising KeyError
+        # for anything else keeps the inherited Mapping methods honest:
+        # "key in obj" is False for unknown keys, and properties such as
+        # filename cannot be reached through item access.
+        if key not in self._attributes:
+            raise KeyError(key)
+        return getattr(self, key, None)
+
+    def __setitem__(self, key, value):
+        if key in self._attributes:
+            setattr(self, key, value)
+        else:
+            raise ValueError("Unrecognized key {0}".format(key))
+
+    def __len__(self):
+        # Counts only attributes that currently have a value, so an empty
+        # FastqName is falsy.
+        # NOTE(review): this can be smaller than the number of keys
+        # yielded by __iter__ -- confirm that is intended.
+        return sum(1 for k in self._attributes if self[k] is not None)
toTypedNode, \
fromTypedNode
from htsworkflow.util.hashfile import make_md5sum
-
+from htsworkflow.submission.fastqname import FastqName
from htsworkflow.submission.daf import \
MetadataLookupException, \
get_submission_uri
an_analysis))
# add file specific information
- fileNode = self.link_file_to_classes(pathname,
- an_analysis,
- an_analysis_uri,
- analysis_dir)
+ fileNode = self.make_file_node(pathname, an_analysis)
self.add_md5s(filename, fileNode, analysis_dir)
+ self.add_fastq_metadata(filename, fileNode)
self.model.add_statement(
RDF.Statement(fileNode,
rdfNS['type'],
file_type))
LOGGER.debug("Done.")
- def link_file_to_classes(self, pathname, submissionNode, submission_uri, analysis_dir):
+ def make_file_node(self, pathname, submissionNode):
+ """Create file node and attach it to its submission.
+ """
# add file specific information
path, filename = os.path.split(pathname)
fileNode = RDF.Node(RDF.Uri('file://'+ os.path.abspath(pathname)))
self.model.add_statement(
RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
+ def add_fastq_metadata(self, filename, fileNode):
+     """Record attributes parsed from a submission fastq name on fileNode.
+
+     Filenames that FastqName cannot parse are skipped without adding
+     any statements.
+     """
+     # There is no separate fastq detection yet; a name the parser
+     # rejects is treated as "not a fastq" and ignored.
+     try:
+         parsed = FastqName(filename=filename)
+     except ValueError:
+         return
+
+     # (FastqName key, predicate used for it in the model)
+     term_map = [
+         ('flowcell', libraryOntology['flowcell_id']),
+         ('lib_id', libraryOntology['library_id']),
+         ('lane', libraryOntology['lane_number']),
+         ('read', libraryOntology['read']),
+         ('cycle', libraryOntology['read_length']),
+     ]
+     for attribute, predicate in term_map:
+         value = parsed.get(attribute)
+         if value is None:
+             # e.g. 'read' is absent for single-ended names
+             continue
+         self.model.append(
+             RDF.Statement(fileNode, predicate, toTypedNode(value)))
+
+
def _add_library_details_to_model(self, libNode):
# attributes that can have multiple values
set_attributes = set((libraryOntology['has_lane'],
--- /dev/null
+from unittest2 import TestCase
+from htsworkflow.submission.fastqname import FastqName
+
+class TestFastqName(TestCase):
+    """Exercise FastqName construction from filenames and from attributes."""
+
+    def test_init_empty(self):
+        fq = FastqName()
+        self.assertEqual(fq.is_valid, False)
+
+    def test_init_single_filename(self):
+        fq = FastqName(filename="12345_AABBCCDDXX_c100_l1.fastq")
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.is_paired, False)
+
+    # Previously also named test_init_single_filename, which replaced the
+    # single-ended test above in the class body so it never ran.
+    def test_init_paired_filename(self):
+        fq = FastqName(filename="12345_AABBCCDDXX_c100_l1_r2.fastq")
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.read, "2")
+        self.assertEqual(fq['read'], "2")
+        self.assertEqual(fq.is_paired, True)
+
+    def test_init_bad_filename(self):
+        attribs = {'filename': 'asdf.bam'}
+        self.assertRaises(ValueError, FastqName, **attribs)
+
+    def test_init_single_attributes(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle="100", lane="1")
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.is_paired, False)
+        self.assertEqual(fq.filename, "12345_AABBCCDDXX_c100_l1.fastq")
+
+    def test_init_single_attributes_set_single(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle="100", lane="1", is_paired=False)
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.is_paired, False)
+
+    def test_init_single_attributes_set_paired(self):
+        # Forcing paired naming without a read number cannot be valid.
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle="100", lane="1", is_paired=True)
+        self.assertEqual(fq.is_valid, False)
+        self.assertEqual(fq.is_paired, True)
+
+    def test_init_paired_attributes(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle="100", lane="1", read="2")
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.lib_id, "12345")
+        self.assertEqual(fq['lib_id'], "12345")
+        self.assertEqual(fq.flowcell, "AABBCCDDXX")
+        self.assertEqual(fq['flowcell'], "AABBCCDDXX")
+        self.assertEqual(fq.cycle, "100")
+        self.assertEqual(fq['cycle'], "100")
+        self.assertEqual(fq.lane, "1")
+        self.assertEqual(fq['lane'], "1")
+        self.assertEqual(fq.read, "2")
+        self.assertEqual(fq['read'], "2")
+        self.assertEqual(fq.is_paired, True)
+        self.assertEqual(fq.filename, "12345_AABBCCDDXX_c100_l1_r2.fastq")
+
+    def test_init_paired_attributes_set_single(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle="100", lane="1", read="2", is_paired=False)
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.is_paired, False)
+
+    def test_init_paired_attributes_set_paired(self):
+        fq = FastqName(lib_id="12345", flowcell="AABBCCDDXX",
+                       cycle="100", lane="1", read="2", is_paired=True)
+        self.assertEqual(fq.is_valid, True)
+        self.assertEqual(fq.is_paired, True)
+
+    def test_init_insufficient_attributes(self):
+        attribs = dict(lib_id="12345", flowcell="AABBCCDDXX")
+        fq = FastqName(**attribs)
+        self.assertEqual(fq.is_valid, False)
+
+
+def suite():
+    """Return a TestSuite holding every test defined on TestFastqName."""
+    from unittest2 import TestSuite, defaultTestLoader
+    fastq_tests = defaultTestLoader.loadTestsFromTestCase(TestFastqName)
+    collected = TestSuite()
+    collected.addTests(fastq_tests)
+    return collected
+
+# When run as a script, hand control to unittest2 and have it run the
+# tests returned by suite().
+if __name__ == "__main__":
+ from unittest2 import main
+ main(defaultTest='suite')
?file ucscDaf:filename ?filename ;
ucscDaf:md5sum ?md5sum ;
- dc:source ?source ;
+ libraryOntology:flowcell_id ?flowcell_id ;
a ?file_type .
+
?file_type a <{{file_class}}> ;
geoSoft:fileTypeLabel ?file_type_label .
- ?source libraryOntology:flowcell ?flowcell .
-
?flowcell libraryOntology:flowcell_id ?flowcell_id ;
libraryOntology:read_length ?read_length ;
libraryOntology:flowcell_type ?flowcell_type ;
-{% for name, value in series %}
-{{name}}={{value}}{% endfor %}{% for row in samples %}
-^SAMPLE={{row.name}}
+{% for name, value in series %}{{name}}={{value}}
+{% endfor %}{% for row in samples %}!Series_sample_id={{row.name}}
+{% endfor %}{% for row in samples %}^SAMPLE={{row.name}}
!Sample_type=SRA
!Sample_title={{row.name}}
!Sample_series_id={{ series_id }}
!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}
!sample_raw_file_single_or_paired-end_{{forloop.counter}}={{raw.flowcell_type}}{% endfor %}{% for sup in row.supplimental %}
!Sample_supplementary_file_{{forloop.counter}}={{sup.filename}}
-!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}
-{% endfor %}{% endfor %}
+!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}{% endfor %}
+{% endfor %}
\ No newline at end of file
return node
-def toTypedNode(value):
+def toTypedNode(value, language="en"):
"""Convert a python variable to a RDF Node with its closest xsd type
"""
if type(value) == types.BooleanType:
if value_type is not None:
node = RDF.Node(literal=value, datatype=value_type)
else:
- node = RDF.Node(literal=unicode(value).encode('utf-8'))
+ node = RDF.Node(literal=unicode(value).encode('utf-8'), language=language)
return node