From: Diane Trout Date: Wed, 12 Dec 2012 23:39:00 +0000 (-0800) Subject: Simplify linking fastq files to their library id. X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=447be4c5a3dcfac6550d3f06b8faf9198a8cd321 Simplify linking fastq files to their library id. Unlike my previous effort, which required the fastq generation script to generate dc:source entries to match fastqs to libraries, this version just parses the generated fastq filename. This does mean that a manually generated file might not work. I accomplished this by writing a class to generate the fastq (for submission) filenames and to parse them, so at least all that code is in one place. Also, after attaching the fastq metadata to the file node, I discovered that the website's use of language tags on strings made my query fail. So I changed toTypedNode to take an optional language tag (defaults to "en"). --- diff --git a/htsworkflow/submission/fastqname.py b/htsworkflow/submission/fastqname.py index f749d40..9dd52a0 100644 --- a/htsworkflow/submission/fastqname.py +++ b/htsworkflow/submission/fastqname.py @@ -10,7 +10,13 @@ FASTQ_RE = re.compile( 'c(?P[\d]+)_l(?P[\d]+)(_r(?P[\d]))?\.fastq') class FastqName(collections.Mapping): + """Utility class to convert to the standardized submission fastq name. + """ def __init__(self, is_paired=None, **kwargs): + """Create a fastq name handler. 
+ + Takes filename or common attributes like flowcell, lib_id, lane, read, cycle + """ self._attributes = ('flowcell', 'lib_id', 'lane', 'read', 'cycle') self._is_paired = is_paired @@ -28,7 +34,6 @@ class FastqName(collections.Mapping): value = kwargs[k] self[k] = value - def _init_by_filename(self, filename): match = FASTQ_RE.match(filename) if match is None: diff --git a/htsworkflow/submission/submission.py b/htsworkflow/submission/submission.py index 01bdb17..b3e2778 100644 --- a/htsworkflow/submission/submission.py +++ b/htsworkflow/submission/submission.py @@ -19,7 +19,7 @@ from htsworkflow.util.rdfhelp import \ toTypedNode, \ fromTypedNode from htsworkflow.util.hashfile import make_md5sum - +from htsworkflow.submission.fastqname import FastqName from htsworkflow.submission.daf import \ MetadataLookupException, \ get_submission_uri @@ -114,18 +114,18 @@ class Submission(object): an_analysis)) # add file specific information - fileNode = self.link_file_to_classes(pathname, - an_analysis, - an_analysis_uri, - analysis_dir) + fileNode = self.make_file_node(pathname, an_analysis) self.add_md5s(filename, fileNode, analysis_dir) + self.add_fastq_metadata(filename, fileNode) self.model.add_statement( RDF.Statement(fileNode, rdfNS['type'], file_type)) LOGGER.debug("Done.") - def link_file_to_classes(self, pathname, submissionNode, submission_uri, analysis_dir): + def make_file_node(self, pathname, submissionNode): + """Create file node and attach it to its submission. + """ # add file specific information path, filename = os.path.split(pathname) fileNode = RDF.Node(RDF.Uri('file://'+ os.path.abspath(pathname))) @@ -150,6 +150,25 @@ class Submission(object): self.model.add_statement( RDF.Statement(fileNode, dafTermOntology['md5sum'], md5)) + def add_fastq_metadata(self, filename, fileNode): + # How should I detect if this is actually a fastq file? 
+ try: + fqname = FastqName(filename=filename) + except ValueError: + # currently its just ignore it if the fastq name parser fails + return + + terms = [('flowcell', libraryOntology['flowcell_id']), + ('lib_id', libraryOntology['library_id']), + ('lane', libraryOntology['lane_number']), + ('read', libraryOntology['read']), + ('cycle', libraryOntology['read_length'])] + for file_term, model_term in terms: + value = fqname.get(file_term) + if value is not None: + s = RDF.Statement(fileNode, model_term, toTypedNode(value)) + self.model.append(s) + def _add_library_details_to_model(self, libNode): # attributes that can have multiple values set_attributes = set((libraryOntology['has_lane'], diff --git a/htsworkflow/templates/geo_fastqs.sparql b/htsworkflow/templates/geo_fastqs.sparql index 8f19c99..3ca5ab0 100644 --- a/htsworkflow/templates/geo_fastqs.sparql +++ b/htsworkflow/templates/geo_fastqs.sparql @@ -13,13 +13,12 @@ WHERE { ?file ucscDaf:filename ?filename ; ucscDaf:md5sum ?md5sum ; - dc:source ?source ; + libraryOntology:flowcell_id ?flowcell_id ; a ?file_type . + ?file_type a <{{file_class}}> ; geoSoft:fileTypeLabel ?file_type_label . - ?source libraryOntology:flowcell ?flowcell . 
- ?flowcell libraryOntology:flowcell_id ?flowcell_id ; libraryOntology:read_length ?read_length ; libraryOntology:flowcell_type ?flowcell_type ; diff --git a/htsworkflow/templates/geo_submission.soft b/htsworkflow/templates/geo_submission.soft index 8ae3ed9..4e50d00 100644 --- a/htsworkflow/templates/geo_submission.soft +++ b/htsworkflow/templates/geo_submission.soft @@ -1,6 +1,6 @@ -{% for name, value in series %} -{{name}}={{value}}{% endfor %}{% for row in samples %} -^SAMPLE={{row.name}} +{% for name, value in series %}{{name}}={{value}} +{% endfor %}{% for row in samples %}!Series_sample_id={{row.name}} +{% endfor %}{% for row in samples %}^SAMPLE={{row.name}} !Sample_type=SRA !Sample_title={{row.name}} !Sample_series_id={{ series_id }} @@ -30,5 +30,5 @@ !Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}} !sample_raw_file_single_or_paired-end_{{forloop.counter}}={{raw.flowcell_type}}{% endfor %}{% for sup in row.supplimental %} !Sample_supplementary_file_{{forloop.counter}}={{sup.filename}} -!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}} -{% endfor %}{% endfor %} +!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}{% endfor %} +{% endfor %} \ No newline at end of file diff --git a/htsworkflow/util/rdfhelp.py b/htsworkflow/util/rdfhelp.py index c116fc9..90b0e6a 100644 --- a/htsworkflow/util/rdfhelp.py +++ b/htsworkflow/util/rdfhelp.py @@ -85,7 +85,7 @@ def blankOrUri(value=None): return node -def toTypedNode(value): +def toTypedNode(value, language="en"): """Convert a python variable to a RDF Node with its closest xsd type """ if type(value) == types.BooleanType: @@ -113,7 +113,7 @@ def toTypedNode(value): if value_type is not None: node = RDF.Node(literal=value, datatype=value_type) else: - node = RDF.Node(literal=unicode(value).encode('utf-8')) + node = RDF.Node(literal=unicode(value).encode('utf-8'), language=language) return node