Simplify linking fastq files to their library id.
authorDiane Trout <diane@caltech.edu>
Wed, 12 Dec 2012 23:39:00 +0000 (15:39 -0800)
committerDiane Trout <diane@caltech.edu>
Wed, 12 Dec 2012 23:39:00 +0000 (15:39 -0800)
Unlike my previous effort, which required the fastq generation
script to generate dc:source entries to match fastqs to libraries,
this version just parses the generated fastq filename.

This does mean that a manually generated file might not work.

I accomplished this by writing a class that both generates the
submission fastq filenames and parses them, so at least
all of that code is in one place.

Also, after attaching the fastq metadata to the file node,
I discovered that the website's use of language tags on strings
made my query fail. So I changed toTypedNode to take an optional
language tag. (Defaults to "en".)

htsworkflow/submission/fastqname.py
htsworkflow/submission/submission.py
htsworkflow/templates/geo_fastqs.sparql
htsworkflow/templates/geo_submission.soft
htsworkflow/util/rdfhelp.py

index f749d4096246529b264d495d883cad6cb7c005fb..9dd52a0319fb9e9efa230e68344ef53156645be7 100644 (file)
@@ -10,7 +10,13 @@ FASTQ_RE = re.compile(
     'c(?P<cycle>[\d]+)_l(?P<lane>[\d]+)(_r(?P<read>[\d]))?\.fastq')
 
 class FastqName(collections.Mapping):
+    """Utility class to convert to the standardized submission fastq name.
+    """
     def __init__(self, is_paired=None, **kwargs):
+        """Create a fastq name handler.
+
+        Takes filename or common attributes like flowcell, lib_id, lane, read, cycle
+        """
         self._attributes = ('flowcell', 'lib_id', 'lane', 'read', 'cycle')
         self._is_paired = is_paired
 
@@ -28,7 +34,6 @@ class FastqName(collections.Mapping):
                 value = kwargs[k]
             self[k] = value
 
-
     def _init_by_filename(self, filename):
         match = FASTQ_RE.match(filename)
         if match is None:
index 01bdb17228689281b3f33e8aaaaa3af5ccda6d18..b3e2778ca490b8bbf4a111c1e2c0543f23910b00 100644 (file)
@@ -19,7 +19,7 @@ from htsworkflow.util.rdfhelp import \
      toTypedNode, \
      fromTypedNode
 from htsworkflow.util.hashfile import make_md5sum
-
+from htsworkflow.submission.fastqname import FastqName
 from htsworkflow.submission.daf import \
      MetadataLookupException, \
      get_submission_uri
@@ -114,18 +114,18 @@ class Submission(object):
                           an_analysis))
 
         # add file specific information
-        fileNode = self.link_file_to_classes(pathname,
-                                             an_analysis,
-                                             an_analysis_uri,
-                                             analysis_dir)
+        fileNode = self.make_file_node(pathname, an_analysis)
         self.add_md5s(filename, fileNode, analysis_dir)
+        self.add_fastq_metadata(filename, fileNode)
         self.model.add_statement(
             RDF.Statement(fileNode,
                           rdfNS['type'],
                           file_type))
         LOGGER.debug("Done.")
 
-    def link_file_to_classes(self, pathname, submissionNode, submission_uri, analysis_dir):
+    def make_file_node(self, pathname, submissionNode):
+        """Create file node and attach it to its submission.
+        """
         # add file specific information
         path, filename = os.path.split(pathname)
         fileNode = RDF.Node(RDF.Uri('file://'+ os.path.abspath(pathname)))
@@ -150,6 +150,25 @@ class Submission(object):
             self.model.add_statement(
                 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
 
+    def add_fastq_metadata(self, filename, fileNode):
+        # How should I detect if this is actually a fastq file?
+        try:
+            fqname = FastqName(filename=filename)
+        except ValueError:
+            # currently its just ignore it if the fastq name parser fails
+            return
+        
+        terms = [('flowcell', libraryOntology['flowcell_id']),
+                 ('lib_id', libraryOntology['library_id']),
+                 ('lane', libraryOntology['lane_number']),
+                 ('read', libraryOntology['read']),
+                 ('cycle', libraryOntology['read_length'])]
+        for file_term, model_term in terms:
+            value = fqname.get(file_term)
+            if value is not None:
+                s = RDF.Statement(fileNode, model_term, toTypedNode(value))
+                self.model.append(s)
+
     def _add_library_details_to_model(self, libNode):
         # attributes that can have multiple values
         set_attributes = set((libraryOntology['has_lane'],
index 8f19c994bcf6b1ad93eb662ad9cfc53a7c5cda55..3ca5ab0cdf40d2f0b14091a381fac6e389d1aceb 100644 (file)
@@ -13,13 +13,12 @@ WHERE {
 
   ?file ucscDaf:filename ?filename ;
         ucscDaf:md5sum ?md5sum ;
-        dc:source ?source ;
+        libraryOntology:flowcell_id ?flowcell_id ;
         a ?file_type .
+        
   ?file_type a <{{file_class}}> ;
              geoSoft:fileTypeLabel ?file_type_label .
 
-  ?source libraryOntology:flowcell ?flowcell .
-
   ?flowcell libraryOntology:flowcell_id ?flowcell_id ;
             libraryOntology:read_length ?read_length ;
             libraryOntology:flowcell_type ?flowcell_type ;
index 8ae3ed9ef78e0d737f1d7cada296a740e2ff553d..4e50d0021b87a3c352aa1be7cb9e85dc8effecb8 100644 (file)
@@ -1,6 +1,6 @@
-{% for name, value in series %}
-{{name}}={{value}}{% endfor %}{% for row in samples %}
-^SAMPLE={{row.name}}
+{% for name, value in series %}{{name}}={{value}}
+{% endfor %}{% for row in samples %}!Series_sample_id={{row.name}}
+{% endfor %}{% for row in samples %}^SAMPLE={{row.name}}
 !Sample_type=SRA
 !Sample_title={{row.name}}
 !Sample_series_id={{ series_id }}
@@ -30,5 +30,5 @@
 !Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}
 !sample_raw_file_single_or_paired-end_{{forloop.counter}}={{raw.flowcell_type}}{% endfor %}{% for sup in row.supplimental %}
 !Sample_supplementary_file_{{forloop.counter}}={{sup.filename}}
-!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}
-{% endfor %}{% endfor %}
+!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}{% endfor %}
+{% endfor %}
\ No newline at end of file
index c116fc9236c1f06641365a8bc6d3680ab727abea..90b0e6adb194d8e64d4f5ecbd677033ead87ac4a 100644 (file)
@@ -85,7 +85,7 @@ def blankOrUri(value=None):
     return node
 
 
-def toTypedNode(value):
+def toTypedNode(value, language="en"):
     """Convert a python variable to a RDF Node with its closest xsd type
     """
     if type(value) == types.BooleanType:
@@ -113,7 +113,7 @@ def toTypedNode(value):
     if value_type is not None:
         node = RDF.Node(literal=value, datatype=value_type)
     else:
-        node = RDF.Node(literal=unicode(value).encode('utf-8'))
+        node = RDF.Node(literal=unicode(value).encode('utf-8'), language=language)
     return node