Simplify linking fastq files to their library id.
authorDiane Trout <diane@caltech.edu>
Wed, 12 Dec 2012 23:39:00 +0000 (15:39 -0800)
committerDiane Trout <diane@caltech.edu>
Wed, 12 Dec 2012 23:39:00 +0000 (15:39 -0800)
Unlike my previous effort, which required the fastq generation
script to generate dc:source entries to match fastqs to libraries,
this version just parses the generated fastq filename.

This does mean that a manually generated file might not work.

I accomplished this by writing a class that both generates the
submission fastq filenames and parses them, so at least
all of that code is in one place.

Also, after attaching the fastq metadata to the file node,
I discovered that the website's use of language tags on strings
made my query fail. So I changed toTypedNode to take an optional
language tag. (Defaults to "en".)

htsworkflow/submission/fastqname.py
htsworkflow/submission/submission.py
htsworkflow/templates/geo_fastqs.sparql
htsworkflow/templates/geo_submission.soft
htsworkflow/util/rdfhelp.py

index f749d4096246529b264d495d883cad6cb7c005fb..9dd52a0319fb9e9efa230e68344ef53156645be7 100644 (file)
@@ -10,7 +10,13 @@ FASTQ_RE = re.compile(
     'c(?P<cycle>[\d]+)_l(?P<lane>[\d]+)(_r(?P<read>[\d]))?\.fastq')
 
 class FastqName(collections.Mapping):
+    """Utility class to convert to the standardized submission fastq name.
+    """
     def __init__(self, is_paired=None, **kwargs):
+        """Create a fastq name handler.
+
+        Takes filename or common attributes like flowcell, lib_id, lane, read, cycle
+        """
         self._attributes = ('flowcell', 'lib_id', 'lane', 'read', 'cycle')
         self._is_paired = is_paired
 
@@ -28,7 +34,6 @@ class FastqName(collections.Mapping):
                 value = kwargs[k]
             self[k] = value
 
-
     def _init_by_filename(self, filename):
         match = FASTQ_RE.match(filename)
         if match is None:
index 01bdb17228689281b3f33e8aaaaa3af5ccda6d18..b3e2778ca490b8bbf4a111c1e2c0543f23910b00 100644 (file)
@@ -19,7 +19,7 @@ from htsworkflow.util.rdfhelp import \
      toTypedNode, \
      fromTypedNode
 from htsworkflow.util.hashfile import make_md5sum
-
+from htsworkflow.submission.fastqname import FastqName
 from htsworkflow.submission.daf import \
      MetadataLookupException, \
      get_submission_uri
@@ -114,18 +114,18 @@ class Submission(object):
                           an_analysis))
 
         # add file specific information
-        fileNode = self.link_file_to_classes(pathname,
-                                             an_analysis,
-                                             an_analysis_uri,
-                                             analysis_dir)
+        fileNode = self.make_file_node(pathname, an_analysis)
         self.add_md5s(filename, fileNode, analysis_dir)
+        self.add_fastq_metadata(filename, fileNode)
         self.model.add_statement(
             RDF.Statement(fileNode,
                           rdfNS['type'],
                           file_type))
         LOGGER.debug("Done.")
 
-    def link_file_to_classes(self, pathname, submissionNode, submission_uri, analysis_dir):
+    def make_file_node(self, pathname, submissionNode):
+        """Create file node and attach it to its submission.
+        """
         # add file specific information
         path, filename = os.path.split(pathname)
         fileNode = RDF.Node(RDF.Uri('file://'+ os.path.abspath(pathname)))
@@ -150,6 +150,25 @@ class Submission(object):
             self.model.add_statement(
                 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
 
+    def add_fastq_metadata(self, filename, fileNode):
+        # How should I detect if this is actually a fastq file?
+        try:
+            fqname = FastqName(filename=filename)
+        except ValueError:
+            # currently its just ignore it if the fastq name parser fails
+            return
+        
+        terms = [('flowcell', libraryOntology['flowcell_id']),
+                 ('lib_id', libraryOntology['library_id']),
+                 ('lane', libraryOntology['lane_number']),
+                 ('read', libraryOntology['read']),
+                 ('cycle', libraryOntology['read_length'])]
+        for file_term, model_term in terms:
+            value = fqname.get(file_term)
+            if value is not None:
+                s = RDF.Statement(fileNode, model_term, toTypedNode(value))
+                self.model.append(s)
+
     def _add_library_details_to_model(self, libNode):
         # attributes that can have multiple values
         set_attributes = set((libraryOntology['has_lane'],
index 8f19c994bcf6b1ad93eb662ad9cfc53a7c5cda55..3ca5ab0cdf40d2f0b14091a381fac6e389d1aceb 100644 (file)
@@ -13,13 +13,12 @@ WHERE {
 
   ?file ucscDaf:filename ?filename ;
         ucscDaf:md5sum ?md5sum ;
-        dc:source ?source ;
+        libraryOntology:flowcell_id ?flowcell_id ;
         a ?file_type .
+        
   ?file_type a <{{file_class}}> ;
              geoSoft:fileTypeLabel ?file_type_label .
 
-  ?source libraryOntology:flowcell ?flowcell .
-
   ?flowcell libraryOntology:flowcell_id ?flowcell_id ;
             libraryOntology:read_length ?read_length ;
             libraryOntology:flowcell_type ?flowcell_type ;
index 8ae3ed9ef78e0d737f1d7cada296a740e2ff553d..4e50d0021b87a3c352aa1be7cb9e85dc8effecb8 100644 (file)
@@ -1,6 +1,6 @@
-{% for name, value in series %}
-{{name}}={{value}}{% endfor %}{% for row in samples %}
-^SAMPLE={{row.name}}
+{% for name, value in series %}{{name}}={{value}}
+{% endfor %}{% for row in samples %}!Series_sample_id={{row.name}}
+{% endfor %}{% for row in samples %}^SAMPLE={{row.name}}
 !Sample_type=SRA
 !Sample_title={{row.name}}
 !Sample_series_id={{ series_id }}
@@ -30,5 +30,5 @@
 !Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}
 !sample_raw_file_single_or_paired-end_{{forloop.counter}}={{raw.flowcell_type}}{% endfor %}{% for sup in row.supplimental %}
 !Sample_supplementary_file_{{forloop.counter}}={{sup.filename}}
-!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}
-{% endfor %}{% endfor %}
+!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}{% endfor %}
+{% endfor %}
\ No newline at end of file
index c116fc9236c1f06641365a8bc6d3680ab727abea..90b0e6adb194d8e64d4f5ecbd677033ead87ac4a 100644 (file)
@@ -85,7 +85,7 @@ def blankOrUri(value=None):
     return node
 
 
-def toTypedNode(value):
+def toTypedNode(value, language="en"):
     """Convert a python variable to a RDF Node with its closest xsd type
     """
     if type(value) == types.BooleanType:
@@ -113,7 +113,7 @@ def toTypedNode(value):
     if value_type is not None:
         node = RDF.Node(literal=value, datatype=value_type)
     else:
-        node = RDF.Node(literal=unicode(value).encode('utf-8'))
+        node = RDF.Node(literal=unicode(value).encode('utf-8'), language=language)
     return node