This might actually generate soft file with raw & supplemental data.

[htsworkflow.git] / htsworkflow / submission / geo.py
diff --git a/htsworkflow/submission/geo.py b/htsworkflow/submission/geo.py

index 737b1bb353458ceb0d961adf551a680cc1f9399f..097bea1083af8c1ffc39b3edea9a6e94e0a63fcd 100644 (file)
--- a/htsworkflow/submission/geo.py
+++ b/htsworkflow/submission/geo.py
@@ -1,4 +1,5 @@
  import logging
+import os
  
  import RDF
  
@@ -6,6 +7,8 @@ from htsworkflow.submission.submission import Submission
  
  from htsworkflow.util.rdfhelp import \
       fromTypedNode, \
+     geoSoftNS, \
+     stripNamespace, \
       submissionOntology
  
  from django.conf import settings
@@ -14,18 +17,40 @@ from django.template import Context, loader
  LOGGER = logging.getLogger(__name__)
  
  class GEOSubmission(Submission):
-    def __init__(self, name, model):
-        super(GEOSubmission, self).__init__(name, model)
+    def __init__(self, name, model, host):
+        super(GEOSubmission, self).__init__(name, model, host)
  
      def make_soft(self, result_map):
          samples = []
+        platform = self.get_platform_metadata()
+        platform_attribs = dict(platform)
+        platform_id = platform_attribs['^platform']
+        series = self.get_series_metadata()
+        series_attribs = dict(series)
+        series_id = series_attribs['^series']
          for lib_id, result_dir in result_map.items():
              an_analysis = self.get_submission_node(result_dir)
-            samples.append(self.get_sample_metadata(an_analysis))
+            metadata = self.get_sample_metadata(an_analysis)
+            if len(metadata) == 0:
+                errmsg = 'No metadata found for {0}'
+                LOGGER.error(errmsg.format(str(an_analysis),))
+                continue
+            elif len(metadata) > 1:
+                errmsg = 'Confused there are more than one sample for %s'
+                LOGGER.debug(errmsg % (str(an_analysis),))
+            metadata = metadata[0]
+            metadata['raw'] = self.get_raw_files(an_analysis)
+            metadata['supplimental'] = self.get_sample_files(an_analysis)
+            metadata['run'] = self.get_run_details(an_analysis)
+            samples.append(metadata)
  
          soft_template = loader.get_template('geo_submission.soft')
          context = Context({
-            'samples': samples
+            'platform': platform,
+            'series': series,
+            'samples': samples,
+            'platform_id': platform_id,
+            'series_id': series_id,
          })
          print str(soft_template.render(context))
  
@@ -39,19 +64,157 @@ class GEOSubmission(Submission):
          else:
              return True
  
+    def get_platform_metadata(self):
+        """Gather information for filling out sample section of a SOFT file
+        """
+        query_template = loader.get_template('geo_platform.sparql')
+        submission = str(self.submissionSetNS[''].uri)
+        context = Context({
+            'submission': submission,
+            })
+
+        results = self.execute_query(query_template, context)
+        return self.query_to_soft_dictionary(results, 'platform')
+
+    def get_series_metadata(self):
+        """Gather information for filling out sample section of a SOFT file
+        """
+        query_template = loader.get_template('geo_series.sparql')
+        submission = str(self.submissionSetNS[''].uri)
+        context = Context({
+            'submission': submission,
+            })
+
+        results = self.execute_query(query_template, context)
+        return self.query_to_soft_dictionary(results, 'series')
+
      def get_sample_metadata(self, analysis_node):
          """Gather information for filling out sample section of a SOFT file
          """
-        query_template = loader.get_template('geo_submission.sparql')
+        query_template = loader.get_template('geo_samples.sparql')
  
          context = Context({
              'submission': str(analysis_node.uri),
+            'submissionSet': str(self.submissionSetNS[''].uri),
              })
  
-        formatted_query = query_template.render(context)
-        query = RDF.SPARQLQuery(str(formatted_query))
-        rdfstream = query.execute(self.model)
-        results = []
-        for r in rdfstream:
-            results.append(r)
+        results = self.execute_query(query_template, context)
+        for r in results:
+            r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ')
+
          return results
+
+    def get_sample_files(self, analysis_node):
+        """Gather derived files
+        """
+        query_template = loader.get_template('geo_files.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            'file_class': str(geoSoftNS['supplemental'].uri)
+            })
+
+        return self.execute_query(query_template, context)
+
+    def get_raw_files(self, analysis_node):
+        """Gather raw data e.g. fastq files.
+        """
+        query_template = loader.get_template('geo_fastqs.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            'file_class': str(geoSoftNS['raw'].uri),
+            })
+
+        lanes = {}
+        for row in self.execute_query(query_template, context):
+            data = {}
+            for k, v in row.items():
+                data[k] = v
+            lane = str(data['lane'])
+            lanes.setdefault(lane, []).append(data)
+        result = []
+        for lane, files in lanes.items():
+            if len(files) > 2:
+                errmsg = "Don't know what to do with more than 2 raw files"
+                raise ValueError(errmsg)
+            elif len(files) == 2:
+                is_paired = True
+            elif len(files) == 1:
+                is_paired = False
+            elif len(files) == 0:
+                raise RuntimeError("Empty lane list discovered")
+            files = self._format_filename(files, is_paired)
+            files = self._format_flowcell_type(files, is_paired)
+            files = self._format_read_length(files, is_paired)
+            result.append(files[0])
+        return result
+
+    def _format_flowcell_type(self, files, is_paired):
+        """Used by get_raw_files to format value for single_or_paired-end
+        """
+        for f in files:
+            if 'flowcell_type' in f:
+                flowcell_type = fromTypedNode(f['flowcell_type'])
+                if flowcell_type is None:
+                    pass
+                elif flowcell_type.lower() == "paired":
+                    f['flowcell_type'] = 'paired-end'
+                else:
+                    f['flowcell_type'] = 'single'
+
+        return files
+
+    def _format_read_length(self, files, is_paired):
+        """Format
+        """
+        read_count = 2 if is_paired else 1
+        for f in files:
+            if 'read_length' in f:
+                read_length = str(fromTypedNode(f['read_length']))
+                f['read_length'] = ",".join([read_length] * read_count)
+        return files
+
+    def _format_filename(self, files, is_paired):
+        """Format file name for get_raw_files, also report if paired
+        """
+        if len(files) == 2:
+            # should be paired
+            f0 = files[0]
+            f1 = files[1]
+            f0['filename'] = "%s, %s" % (str(f0['filename']),
+                                         str(f1['filename']))
+            f0['md5sum'] = "%s, %s" % (str(f0['md5sum']),
+                                       str(f1['md5sum']))
+            del files[1]
+        else:
+            files[0]['filename'] = str(files[0]['filename'])
+            files[0]['md5sum'] = str(files[0]['md5sum'])
+        return files
+
+
+    def get_run_details(self, analysis_node):
+        """Get information about runs
+        """
+        query_template = loader.get_template('geo_run_details.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            })
+
+        return self.execute_query(query_template, context)
+
+    def query_to_soft_dictionary(self, results, heading):
+        attributes = []
+        for r in results:
+            name = stripNamespace(geoSoftNS, r['name'])
+            if name is not None:
+                if name.lower() == heading.lower():
+                    name = '^' + name
+                else:
+                    name = '!' + name
+                for v in fromTypedNode(r['value']).split(os.linesep):
+                    v = v.strip()
+                    if len(v) > 0:
+                        attributes.append((name, v))
+        return attributes