From b20de36b64c3de26b259c55c65a84d2f1513124d Mon Sep 17 00:00:00 2001
From: Diane Trout <diane@caltech.edu>
Date: Wed, 1 Aug 2012 16:18:13 -0700
Subject: [PATCH] GEO wanted both files for a paired lane to be on a single
 line. This required a bit of formatting code to massage the results of the
 sparql query into an acceptable form.

---
 htsworkflow/submission/geo.py                | 90 ++++++++++++++++++--
 htsworkflow/templates/geo_fastqs.sparql      | 31 +++++++
 htsworkflow/templates/geo_files.sparql       | 14 ++-
 htsworkflow/templates/geo_run_details.sparql |  6 +-
 htsworkflow/templates/geo_submission.soft    | 38 +++++----
 5 files changed, 150 insertions(+), 29 deletions(-)
 create mode 100644 htsworkflow/templates/geo_fastqs.sparql

diff --git a/htsworkflow/submission/geo.py b/htsworkflow/submission/geo.py
index ad66422..a3ac2f1 100644
--- a/htsworkflow/submission/geo.py
+++ b/htsworkflow/submission/geo.py
@@ -35,11 +35,8 @@ class GEOSubmission(Submission):
                 errmsg = 'Confused there are more than one samples for %s'
                 LOGGER.debug(errmsg % (str(an_analysis,)))
             metadata = metadata[0]
-            metadata['raw'] = self.get_sample_files(an_analysis,
-                                                    geoSoftNS['raw'])
-            metadata['supplimental'] = self.get_sample_files(
-                an_analysis,
-                geoSoftNS['supplemental'])
+            metadata['raw'] = self.get_raw_files(an_analysis)
+            metadata['supplimental'] = self.get_sample_files(an_analysis)
             metadata['run'] = self.get_run_details(an_analysis)
             samples.append(metadata)
 
@@ -103,18 +100,95 @@ class GEOSubmission(Submission):
 
         return results
 
-    def get_sample_files(self, analysis_node, file_class):
-        """Gather files
+    def get_sample_files(self, analysis_node):
+        """Gather derived (supplemental file class) files for a submission.
         """
         query_template = loader.get_template('geo_files.sparql')
 
         context = Context({
             'submission': str(analysis_node.uri),
-            'file_class': str(file_class)
+            'file_class': str(geoSoftNS['supplemental'])
             })
 
         return self.execute_query(query_template, context)
 
+    def get_raw_files(self, analysis_node):
+        """Gather raw data e.g. fastq files, one result row per lane.
+
+        Rows from geo_fastqs.sparql are grouped by lane; a lane with two
+        files is treated as paired and merged into one row. Raises
+        ValueError when a lane has more than two raw files.
+        """
+        query_template = loader.get_template('geo_fastqs.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            'file_class': str(geoSoftNS['raw']),
+            })
+
+        lanes = {}
+        for row in self.execute_query(query_template, context):
+            data = dict(row.items())
+            lanes.setdefault(str(data['lane']), []).append(data)
+        result = []
+        # The query has no ORDER BY and dict order is arbitrary, so sort
+        # lanes and files to make the generated submission reproducible.
+        for lane in sorted(lanes):
+            files = sorted(lanes[lane], key=lambda f: str(f['filename']))
+            if len(files) > 2:
+                errmsg = "Don't know what to do with more than 2 raw files"
+                raise ValueError(errmsg)
+            # setdefault(...).append() guarantees at least one file per lane
+            is_paired = len(files) == 2
+            files = self._format_filename(files, is_paired)
+            files = self._format_flowcell_type(files, is_paired)
+            files = self._format_read_length(files, is_paired)
+            result.append(files[0])
+        return result
+
+    def _format_flowcell_type(self, files, is_paired):
+        """Rewrite each row's flowcell_type as 'paired-end' or 'single'.
+
+        Helper for get_raw_files (single_or_paired-end value); rows with no
+        flowcell_type, or one that converts to None, are left untouched.
+        """
+        for entry in files:
+            if 'flowcell_type' not in entry:
+                continue
+            kind = fromTypedNode(entry['flowcell_type'])
+            if kind is not None:
+                is_pe = kind.lower() == "paired"
+                entry['flowcell_type'] = 'paired-end' if is_pe else 'single'
+        return files
+
+    def _format_read_length(self, files, is_paired):
+        """Repeat each row's read_length once per read, comma separated."""
+        copies = 2 if is_paired else 1
+        for entry in files:
+            if 'read_length' not in entry:
+                continue
+            length = str(fromTypedNode(entry['read_length']))
+            entry['read_length'] = ",".join([length] * copies)
+        return files
+
+    def _format_filename(self, files, is_paired):
+        """Stringify filename and md5sum for get_raw_files.
+
+        For exactly two rows (a paired lane) the second row's values are
+        joined onto the first as "a, b" and the second row is deleted.
+        """
+        first = files[0]
+        if len(files) == 2:
+            second = files[1]
+            for key in ('filename', 'md5sum'):
+                first[key] = "%s, %s" % (str(first[key]), str(second[key]))
+            del files[1]
+        else:
+            for key in ('filename', 'md5sum'):
+                first[key] = str(first[key])
+        return files
+
+
     def get_run_details(self, analysis_node):
         """Get information about runs
         """
diff --git a/htsworkflow/templates/geo_fastqs.sparql b/htsworkflow/templates/geo_fastqs.sparql
new file mode 100644
index 0000000..e7fcbc1
--- /dev/null
+++ b/htsworkflow/templates/geo_fastqs.sparql
@@ -0,0 +1,31 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+select distinct ?lane ?filename ?md5sum ?file_type ?file_type_label ?flowcell_id ?read_length ?flowcell_type ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
+WHERE {
+  <{{submission}}> submissionOntology:library ?library ;
+                   a submissionOntology:submission .
+  # files of the requested class, keyed by the lane they are attached to
+  ?file ucscDaf:filename ?filename ;
+        ucscDaf:md5sum ?md5sum ;
+        libraryOntology:has_lane ?lane ;
+        a ?file_type .
+  ?file_type a <{{file_class}}> ;
+             geoSoft:fileTypeLabel ?file_type_label .
+  # restrict to lanes of the submitted library and pull flowcell details
+  ?library libraryOntology:has_lane ?lane .
+  ?lane libraryOntology:flowcell ?flowcell .
+  ?flowcell libraryOntology:flowcell_id ?flowcell_id ;
+            libraryOntology:read_length ?read_length ;
+            libraryOntology:flowcell_type ?flowcell_type .
+  OPTIONAL { ?flowcell libraryOntology:image_software ?image_software ;
+                       libraryOntology:image_version ?image_version . }
+  OPTIONAL {?flowcell  libraryOntology:basecall_software ?basecall_software ;
+                       libraryOntology:basecall_version ?basecall_version . }
+  OPTIONAL {?flowcell  libraryOntology:sequenced_by ?sequencer .
+             ?sequencer libraryOntology:sequencer_model ?sequencer_model . }
+}
+
diff --git a/htsworkflow/templates/geo_files.sparql b/htsworkflow/templates/geo_files.sparql
index 7b66f4f..e3fcb9d 100644
--- a/htsworkflow/templates/geo_files.sparql
+++ b/htsworkflow/templates/geo_files.sparql
@@ -4,15 +4,27 @@ PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
 PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
 PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
 
-select distinct ?filename, ?md5sum, ?file_type ?file_type_label
+select distinct ?filename ?md5sum ?file_type ?file_type_label ?flowcell_id ?read_length ?flowcell_type ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
 WHERE {
   <{{submission}}> ucscDaf:has_file ?file ;
                    a submissionOntology:submission .
 
   ?file ucscDaf:filename ?filename ;
         ucscDaf:md5sum ?md5sum ;
+        libraryOntology:has_lane ?lane ;
         a ?file_type .
   ?file_type a <{{file_class}}> ;
              geoSoft:fileTypeLabel ?file_type_label .
 
+  OPTIONAL { ?lane libraryOntology:flowcell ?flowcell .
+             ?flowcell libraryOntology:flowcell_id ?flowcell_id ;
+                       libraryOntology:read_length ?read_length ;
+                       libraryOntology:flowcell_type ?flowcell_type ;
+                       libraryOntology:image_software ?image_software ;
+                       libraryOntology:image_version ?image_version ;
+                       libraryOntology:basecall_software ?basecall_software ;
+                       libraryOntology:basecall_version ?basecall_version ;
+                       libraryOntology:sequenced_by ?sequencer .
+             ?sequencer libraryOntology:sequencer_model ?sequencer_model
+  }
 }
\ No newline at end of file
diff --git a/htsworkflow/templates/geo_run_details.sparql b/htsworkflow/templates/geo_run_details.sparql
index aa7e438..dc97107 100644
--- a/htsworkflow/templates/geo_run_details.sparql
+++ b/htsworkflow/templates/geo_run_details.sparql
@@ -7,8 +7,7 @@ PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
 
 # right now we're just grabbing the sequencer model
 # it might make sense to report each of the flowcell/image software.
-#select distinct ?flowcell ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
-select distinct ?sequencer_model
+select ?flowcell ?read_length ?image_software ?image_version ?basecall_software ?basecall_version
 where {
   <{{submission}}> submissionOntology:library ?library ;
                    a submissionOntology:submission .
@@ -17,7 +16,8 @@ where {
            libraryOntology:has_lane ?lane ;
            a libraryOntology:library .
   OPTIONAL { ?flowcell libraryOntology:has_lane ?lane .
-             ?flowcell libraryOntology:image_software ?image_software ;
+             ?flowcell libraryOntology:read_length ?read_length ;
+                       libraryOntology:image_software ?image_software ;
                        libraryOntology:image_version ?image_version ;
                        libraryOntology:basecall_software ?basecall_software ;
                        libraryOntology:basecall_version ?basecall_version ;
diff --git a/htsworkflow/templates/geo_submission.soft b/htsworkflow/templates/geo_submission.soft
index 3541e56..19e6faa 100644
--- a/htsworkflow/templates/geo_submission.soft
+++ b/htsworkflow/templates/geo_submission.soft
@@ -1,15 +1,13 @@
 {% for name, value in series %}{{name}} = {{value}}
-{% endfor %}!Series_platform_id = {{ platform_id }}
+{% endfor %}
 {% for row in samples %}
 ^SAMPLE={{row.name}}
 !Sample_type=SRA
 !Sample_title={{row.name}}
-!Sample_series_id = {{ series_id }}{% for run in row.run %}
-!Sample_instrument_model = {{ run.sequencer_model }}{% endfor %}
-!Sample_channel_count = 1
-!Sample_organism_ch1 = {{ row.species_name }}
-!Sample_taxid_ch1 = {{ row.taxon_id }}
-!Sample_platform_id = {{ platform_id }}
+!Sample_series_id={{ series_id }}
+!Sample_channel_count=1
+!Sample_organism_ch1={{ row.species_name }}
+!Sample_taxid_ch1={{ row.taxon_id }}
 !Sample_source_name_ch1={{row.cell}}
 !Sample_library_strategy={{ row.experiment_type }}
 !Sample_library_source={{row.library_source}}
@@ -17,20 +15,26 @@
 !Sample_growth_protocol_ch1={{ row.growthProtocol|safe }}
 !Sample_extract_protocol={{ row.extractProtocol|safe }}
 !Sample_data_processing={{ row.dataProtocol|safe }}
-!Sample_molecule_ch1 = {{ row.extractMolecule }}
-!Sample_characteristics_ch1 = labExpId: {{ row.library_id }}
-!Sample_characteristics_ch1 = replicate: {{ row.replicate }}
+!Sample_molecule_ch1={{ row.extractMolecule }}
+!Sample_characteristics_ch1=labExpId: {{ row.library_id }}
+!Sample_characteristics_ch1=replicate: {{ row.replicate }}
 {% if row.cell %}{% spaceless %}
-!Sample_characteristics_ch1 = cell: {{ row.cell }}
+!Sample_characteristics_ch1=cell: {{ row.cell }}
 {% endspaceless %}{% endif %}
 {% if row.readType %}{% spaceless %}
-!Sample_characteristics_ch1 = readType: {{ row.readType }}
+!Sample_characteristics_ch1=readType: {{ row.readType }}
 {% endspaceless %}{% endif %}{% if row.antibody %}{% spaceless %}
-!Sample_characteristics_ch1 = cell: {{ row.antibody }}
-{% endspaceless %}{% endif %}{% for raw in row.raw %}
-!Sample_raw_file_{{forloop.counter}}={{raw.filename}}
+!Sample_characteristics_ch1=cell: {{ row.antibody }}
+{% endspaceless %}{% endif %}{% for run in row.run %}
+!Sample_characteristics_ch1=Illumina image processing pipeline version: {{ run.image_software }}-{{ run.image_version }}
+!Sample_characteristics_ch1=Illumina base-calling pipeline version: {{ run.basecall_software }}-{{ run.basecall_version }}{% endfor %}{% for raw in row.raw %}
+!Sample_raw_file_{{forloop.counter}}={{ raw.filename }}
 !Sample_raw_file_type_{{forloop.counter}}={{raw.file_type_label}}
-!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}{% endfor %}{% for sup in row.supplimental %}
+!Sample_raw_file_insert_size_{{forloop.counter}}={{ row.insertLength }}
+!Sample_raw_file_read_length_{{forloop.counter}}={{raw.read_length}}
+!Sample_raw_file_instrument_model_{{forloop.counter}}={{raw.sequencer_model}}
+!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}
+!Sample_raw_file_single_or_paired-end_{{forloop.counter}}={{raw.flowcell_type}}{% endfor %}{% for sup in row.supplimental %}
 !Sample_supplementary_file_{{forloop.counter}}={{sup.filename}}
 !Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}
-{% endfor %}{% endfor %}
\ No newline at end of file
+{% endfor %}{% endfor %}
-- 
2.30.2