From 697a1fe031741e6d7614127a2f16e69027578e10 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Fri, 11 May 2012 17:43:01 -0700 Subject: [PATCH] Properly constructing the geo soft file really needed multiple sparql queries. Most of this is glueing the various queries together into the soft file template. One significant url change that should make it easier to write turtle documents describing the library was to end the submission set URI with # instead of / this way .../SubmissionLog/SubName# is more clearly the same base as .../SubmissionLog/SubName#AttributeName. However this probably will break my older rule files. And I'm not checking for that... *sigh* --- htsworkflow/submission/geo.py | 91 +++++++++++++++++++-- htsworkflow/submission/submission.py | 56 +++++++++---- htsworkflow/templates/geo_files.sparql | 18 ++++ htsworkflow/templates/geo_platform.sparql | 14 ++++ htsworkflow/templates/geo_samples.sparql | 41 ++++++++++ htsworkflow/templates/geo_series.sparql | 14 ++++ htsworkflow/templates/geo_submission.soft | 42 ++++++++-- htsworkflow/templates/geo_submission.sparql | 33 -------- 8 files changed, 244 insertions(+), 65 deletions(-) create mode 100644 htsworkflow/templates/geo_files.sparql create mode 100644 htsworkflow/templates/geo_platform.sparql create mode 100644 htsworkflow/templates/geo_samples.sparql create mode 100644 htsworkflow/templates/geo_series.sparql delete mode 100644 htsworkflow/templates/geo_submission.sparql diff --git a/htsworkflow/submission/geo.py b/htsworkflow/submission/geo.py index 737b1bb..8ff349f 100644 --- a/htsworkflow/submission/geo.py +++ b/htsworkflow/submission/geo.py @@ -1,4 +1,5 @@ import logging +import os import RDF @@ -6,6 +7,8 @@ from htsworkflow.submission.submission import Submission from htsworkflow.util.rdfhelp import \ fromTypedNode, \ + geoSoftNS, \ + simplifyUri, \ submissionOntology from django.conf import settings @@ -19,13 +22,33 @@ class GEOSubmission(Submission): def make_soft(self, result_map): samples = 
[] + platform = self.get_platform_metadata() + platform_attribs = dict(platform) + platform_id = platform_attribs['^platform'] + series = self.get_series_metadata() + series_attribs = dict(series) + series_id = series_attribs['^series'] for lib_id, result_dir in result_map.items(): an_analysis = self.get_submission_node(result_dir) - samples.append(self.get_sample_metadata(an_analysis)) + metadata = self.get_sample_metadata(an_analysis) + if len(metadata) > 1: + errmsg = 'Confused there are more than one samples for %s' + LOGGER.debug(errmsg % (str(an_analysis,))) + metadata = metadata[0] + metadata['raw'] = self.get_sample_files(an_analysis, + geoSoftNS['raw']) + metadata['supplimental'] = self.get_sample_files( + an_analysis, + geoSoftNS['supplemental']) + samples.append(metadata) soft_template = loader.get_template('geo_submission.soft') context = Context({ - 'samples': samples + 'platform': platform, + 'series': series, + 'samples': samples, + 'platform_id': platform_id, + 'series_id': series_id, }) print str(soft_template.render(context)) @@ -39,19 +62,69 @@ class GEOSubmission(Submission): else: return True + def get_platform_metadata(self): + """Gather information for filling out sample section of a SOFT file + """ + query_template = loader.get_template('geo_platform.sparql') + submission = str(self.submissionSetNS[''].uri) + context = Context({ + 'submission': submission, + }) + + results = self.execute_query(query_template, context) + return self.query_to_soft_dictionary(results, 'platform') + + def get_series_metadata(self): + """Gather information for filling out sample section of a SOFT file + """ + query_template = loader.get_template('geo_series.sparql') + submission = str(self.submissionSetNS[''].uri) + context = Context({ + 'submission': submission, + }) + + results = self.execute_query(query_template, context) + return self.query_to_soft_dictionary(results, 'series') + def get_sample_metadata(self, analysis_node): """Gather information for filling 
out sample section of a SOFT file """ - query_template = loader.get_template('geo_submission.sparql') + query_template = loader.get_template('geo_samples.sparql') context = Context({ 'submission': str(analysis_node.uri), + 'submissionSet': str(self.submissionSetNS[''].uri), }) - formatted_query = query_template.render(context) - query = RDF.SPARQLQuery(str(formatted_query)) - rdfstream = query.execute(self.model) - results = [] - for r in rdfstream: - results.append(r) + results = self.execute_query(query_template, context) + for r in results: + + r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ') return results + + def get_sample_files(self, analysis_node, file_class): + """Gather files + """ + query_template = loader.get_template('geo_files.sparql') + + context = Context({ + 'submission': str(analysis_node.uri), + 'file_class': str(file_class) + }) + + return self.execute_query(query_template, context) + + def query_to_soft_dictionary(self, results, heading): + attributes = [] + for r in results: + name = simplifyUri(geoSoftNS, r['name']) + if name is not None: + if name.lower() == heading.lower(): + name = '^' + name + else: + name = '!' 
+ name + for v in fromTypedNode(r['value']).split(os.linesep): + v = v.strip() + if len(v) > 0: + attributes.append((name, v)) + return attributes diff --git a/htsworkflow/submission/submission.py b/htsworkflow/submission/submission.py index 98c25d5..e4ce90c 100644 --- a/htsworkflow/submission/submission.py +++ b/htsworkflow/submission/submission.py @@ -24,7 +24,7 @@ from htsworkflow.submission.daf import \ MetadataLookupException, \ get_submission_uri -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) class Submission(object): def __init__(self, name, model): @@ -32,7 +32,7 @@ class Submission(object): self.model = model self.submissionSet = get_submission_uri(self.name) - self.submissionSetNS = RDF.NS(str(self.submissionSet) + '/') + self.submissionSetNS = RDF.NS(str(self.submissionSet) + '#') self.libraryNS = RDF.NS('http://jumpgate.caltech.edu/library/') self.__view_map = None @@ -41,11 +41,11 @@ class Submission(object): """Examine files in our result directory """ for lib_id, result_dir in result_map.items(): - logger.info("Importing %s from %s" % (lib_id, result_dir)) + LOGGER.info("Importing %s from %s" % (lib_id, result_dir)) try: self.import_analysis_dir(result_dir, lib_id) except MetadataLookupException, e: - logger.error("Skipping %s: %s" % (lib_id, str(e))) + LOGGER.error("Skipping %s: %s" % (lib_id, str(e))) def import_analysis_dir(self, analysis_dir, library_id): """Import a submission directories and update our model as needed @@ -68,18 +68,28 @@ class Submission(object): """ path, filename = os.path.split(pathname) - logger.debug("Searching for view") - file_classification = self.find_best_match(filename) - if file_classification is None: - logger.warn("Unrecognized file: {0}".format(pathname)) + LOGGER.debug("Searching for view") + file_type = self.find_best_match(filename) + if file_type is None: + LOGGER.warn("Unrecognized file: {0}".format(pathname)) return None - if str(file_classification) == 
str(libraryOntology['ignore']): + if str(file_type) == str(libraryOntology['ignore']): return None an_analysis_name = self.make_submission_name(analysis_dir) an_analysis = self.get_submission_node(analysis_dir) an_analysis_uri = str(an_analysis.uri) + file_classification = self.model.get_target(file_type, + rdfNS['type']) + if file_classification is None: + errmsg = 'Could not find class for {0}' + LOGGER.warning(errmsg.format(str(file_type))) + return + self.model.add_statement( + RDF.Statement(self.submissionSetNS[''], + submissionOntology['has_submission'], + an_analysis)) self.model.add_statement(RDF.Statement(an_analysis, submissionOntology['name'], toTypedNode(an_analysis_name))) @@ -91,7 +101,7 @@ class Submission(object): submissionOntology['library'], libNode)) - logger.debug("Adding statements to {0}".format(str(an_analysis))) + LOGGER.debug("Adding statements to {0}".format(str(an_analysis))) # add track specific information self.model.add_statement( RDF.Statement(an_analysis, @@ -108,8 +118,11 @@ class Submission(object): an_analysis_uri, analysis_dir) self.add_md5s(filename, fileNode, analysis_dir) - - logger.debug("Done.") + self.model.add_statement( + RDF.Statement(fileNode, + rdfNS['type'], + file_type)) + LOGGER.debug("Done.") def link_file_to_classes(self, filename, submissionNode, submission_uri, analysis_dir): # add file specific information @@ -125,12 +138,12 @@ class Submission(object): return fileNode def add_md5s(self, filename, fileNode, analysis_dir): - logger.debug("Updating file md5sum") + LOGGER.debug("Updating file md5sum") submission_pathname = os.path.join(analysis_dir, filename) md5 = make_md5sum(submission_pathname) if md5 is None: errmsg = "Unable to produce md5sum for {0}" - logger.warning(errmsg.format(submission_pathname)) + LOGGER.warning(errmsg.format(submission_pathname)) else: self.model.add_statement( RDF.Statement(fileNode, dafTermOntology['md5sum'], md5)) @@ -178,11 +191,11 @@ class Submission(object): for s in
self.model.find_statements(filename_query): view_name = s.subject literal_re = s.object.literal_value['string'] - logger.debug("Found: %s" % (literal_re,)) + LOGGER.debug("Found: %s" % (literal_re,)) try: filename_re = re.compile(literal_re) except re.error, e: - logger.error("Unable to compile: %s" % (literal_re,)) + LOGGER.error("Unable to compile: %s" % (literal_re,)) patterns[literal_re] = view_name return patterns @@ -254,3 +267,14 @@ class Submission(object): "Unrecognized library type %s for %s" % \ (library_type, str(libNode))) + def execute_query(self, template, context): + """Execute the query, returning the results + """ + formatted_query = template.render(context) + LOGGER.debug(formatted_query) + query = RDF.SPARQLQuery(str(formatted_query)) + rdfstream = query.execute(self.model) + results = [] + for r in rdfstream: + results.append(r) + return results diff --git a/htsworkflow/templates/geo_files.sparql b/htsworkflow/templates/geo_files.sparql new file mode 100644 index 0000000..7b66f4f --- /dev/null +++ b/htsworkflow/templates/geo_files.sparql @@ -0,0 +1,18 @@ +PREFIX libraryOntology: +PREFIX submissionOntology: +PREFIX ucscDaf: +PREFIX ncbiTaxon: +PREFIX geoSoft: + +select distinct ?filename, ?md5sum, ?file_type ?file_type_label +WHERE { + <{{submission}}> ucscDaf:has_file ?file ; + a submissionOntology:submission . + + ?file ucscDaf:filename ?filename ; + ucscDaf:md5sum ?md5sum ; + a ?file_type . + ?file_type a <{{file_class}}> ; + geoSoft:fileTypeLabel ?file_type_label . + +} \ No newline at end of file diff --git a/htsworkflow/templates/geo_platform.sparql b/htsworkflow/templates/geo_platform.sparql new file mode 100644 index 0000000..4d224d7 --- /dev/null +++ b/htsworkflow/templates/geo_platform.sparql @@ -0,0 +1,14 @@ +PREFIX libraryOntology: +PREFIX submissionOntology: +PREFIX ucscDaf: +PREFIX ncbiTaxon: +PREFIX geoSoft: + + +select distinct ?name ?value +WHERE { + <{{submission}}> submissionOntology:has_platform ?platform . 
+ ?platform a geoSoft:platform . + + ?platform ?name ?value . +} diff --git a/htsworkflow/templates/geo_samples.sparql b/htsworkflow/templates/geo_samples.sparql new file mode 100644 index 0000000..850d99a --- /dev/null +++ b/htsworkflow/templates/geo_samples.sparql @@ -0,0 +1,41 @@ +PREFIX libraryOntology: +PREFIX submissionOntology: +PREFIX ucscDaf: +PREFIX ncbiTaxon: +PREFIX geoSoft: +PREFIX cells: + +select distinct ?name ?cell ?antibody ?sex ?control ?strain ?controlId ?library_id ?treatment ?protocol ?readType ?insertLength ?replicate, ?mapAlgorithm ?species_name ?taxon_id ?extractMolecule ?growthProtocol ?extractProtocol ?dataProtocol ?experiment_type ?library_selection ?library_source +WHERE { + <{{submission}}> a submissionOntology:submission . + + OPTIONAL { <{{submission}}> ucscDaf:control ?control } + OPTIONAL { <{{submission}}> ucscDaf:controlId ?controlId } + OPTIONAL { ?library libraryOntology:antibody ?antibody } + OPTIONAL { ?library libraryOntology:cell_line ?cell . + ?cell_line cells:cell ?cell ; + cells:documents ?growthProtocol . 
} + OPTIONAL { ?library ucscDaf:sex ?sex } + OPTIONAL { ?library libraryOntology:library_id ?library_id } + OPTIONAL { ?library libraryOntology:replicate ?replicate } + OPTIONAL { ?library libraryOntology:species ?species_name } + OPTIONAL { ?library libraryOntology:condition_term ?treatment } + OPTIONAL { ?library libraryOntology:experiment_type ?experiment_type } + OPTIONAL { ?library libraryOntology:librarySelection ?library_selection } + OPTIONAL { ?library libraryOntology:librarySource ?library_source } + OPTIONAL { <{{submissionSet}}> geoSoft:data_processing ?dataProtocol } + OPTIONAL { ?library libraryOntology:extractMolecule ?extractMolecule } + OPTIONAL { ?library libraryOntology:extractProtocol ?extractProtocol } + OPTIONAL { ?library ucscDaf:protocol ?protocol } + OPTIONAL { ?library ucscDaf:readType ?readType } + OPTIONAL { ?library ucscDaf:strain ?strain } + OPTIONAL { ?library libraryOntology:insert_size ?insertLength } + OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm } + + <{{submission}}> submissionOntology:library ?library ; + submissionOntology:name ?name . + ?species libraryOntology:species ?species_name ; + libraryOntology:taxon_id ?taxon_id . + + +} \ No newline at end of file diff --git a/htsworkflow/templates/geo_series.sparql b/htsworkflow/templates/geo_series.sparql new file mode 100644 index 0000000..815f311 --- /dev/null +++ b/htsworkflow/templates/geo_series.sparql @@ -0,0 +1,14 @@ +PREFIX libraryOntology: +PREFIX submissionOntology: +PREFIX ucscDaf: +PREFIX ncbiTaxon: +PREFIX geoSoft: + + +select distinct ?name ?value +WHERE { + <{{submission}}> submissionOntology:has_series ?series. + ?series a geoSoft:series . + + ?series ?name ?value . 
+} diff --git a/htsworkflow/templates/geo_submission.soft b/htsworkflow/templates/geo_submission.soft index ae2ac57..00c0b4d 100644 --- a/htsworkflow/templates/geo_submission.soft +++ b/htsworkflow/templates/geo_submission.soft @@ -1,11 +1,39 @@ -Soft template -!Platform_title = Illumina Genome Analyzer (Homo sapiens) -!Platform_geo_accession = GPL9052 -{% for sample in samples %}{% for row in sample %}{%if forloop.first %} +{% for name, value in series %}{{name}} = {{value}} +{% endfor %}!Series_platform_id = {{ platform_id }} +{% for row in samples %} ^SAMPLE={{row.name}} +!Sample_type=SRA !Sample_title={{row.name}} +!Sample_series_id = {{ series_id }} +!Sample_instrument_model = Illumina Genome Analyzer +!Sample_instrument_model = Illumina Genome Analyzer II +!Sample_instrument_model = Illumina Genome Analyzer IIx +!Sample_instrument_model = Illumina HiSeq 2000 +!Sample_channel_count = 1 !Sample_organism_ch1 = {{ row.species_name }} !Sample_taxid_ch1 = {{ row.taxon_id }} -{% spaceless %}{% if row.cell %}!Sample_characteristics_ch1 = cell: {{ row.cell }} -{% endif %}{% endspaceless %}{% endif %} -!Sample_supplementary_file_{{forloop.counter}}={{row.filename}}{% endfor %}{% endfor %} \ No newline at end of file +!Sample_platform_id = {{ platform_id }} +!Sample_source_name_ch1={{row.cell}} +!Sample_library_strategy={{ row.experiment_type }} +!Sample_library_source={{row.library_source}} +!Sample_library_selection={{ row.library_selection }} +!Sample_growth_protocol_ch1={{ row.growthProtocol|safe }} +!Sample_extract_protocol={{ row.extractProtocol|safe }} +!Sample_data_processing={{ row.dataProtocol|safe }} +!Sample_molecule_ch1 = {{ row.extractMolecule }} +!Sample_characteristics_ch1 = labExpId: {{ row.library_id }} +!Sample_characteristics_ch1 = replicate: {{ row.replicate }} +{% if row.cell %}{% spaceless %} +!Sample_characteristics_ch1 = cell: {{ row.cell }} +{% endspaceless %}{% endif %} +{% if row.readType %}{% spaceless %} +!Sample_characteristics_ch1 = 
readType: {{ row.readType }} +{% endspaceless %}{% endif %}{% if row.antibody %}{% spaceless %} +!Sample_characteristics_ch1 = antibody: {{ row.antibody }} +{% endspaceless %}{% endif %}{% for raw in row.raw %} +!Sample_raw_file_{{forloop.counter}}={{raw.filename}} +!Sample_raw_file_type_{{forloop.counter}}={{raw.file_type_label}} +!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}{% endfor %}{% for sup in row.supplimental %} +!Sample_supplementary_file_{{forloop.counter}}={{sup.filename}} +!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}} +{% endfor %}{% endfor %} \ No newline at end of file diff --git a/htsworkflow/templates/geo_submission.sparql b/htsworkflow/templates/geo_submission.sparql deleted file mode 100644 index 1d7cbb1..0000000 --- a/htsworkflow/templates/geo_submission.sparql +++ /dev/null @@ -1,33 +0,0 @@ -PREFIX libraryOntology: -PREFIX submissionOntology: -PREFIX ucscDaf: -PREFIX ncbiTaxon: - -select distinct ?name ?filename ?md5sum ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate, ?mapAlgorithm ?species_name ?taxon_id -WHERE { - <{{submission}}> a submissionOntology:submission . - - OPTIONAL { ?submission ucscDaf:control ?control } - #OPTIONAL { ?submission ucscDaf:controlId ?controlId } - #OPTIONAL { ?library libraryOntology:antibody ?antibody } - OPTIONAL { ?library libraryOntology:cell_line ?cell } .
- #OPTIONAL { ?library ucscDaf:sex ?sex } - OPTIONAL { ?library libraryOntology:library_id ?labExpId } - OPTIONAL { ?library libraryOntology:library_id ?labVersion } - OPTIONAL { ?library libraryOntology:replicate ?replicate } - OPTIONAL { ?library libraryOntology:species ?species_name } - - #OPTIONAL { ?library libraryOntology:condition_term ?treatment } - #OPTIONAL { ?library ucscDaf:protocol ?protocol } - #OPTIONAL { ?library ucscDaf:readType ?readType } - #OPTIONAL { ?library ucscDaf:strain ?strain } - #OPTIONAL { ?library libraryOntology:insert_size ?insertLength } - #OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm } - - <{{submission}}> submissionOntology:library ?library ; - ucscDaf:has_file ?file ; - submissionOntology:name ?name . - ?species libraryOntology:species ?species_name ; - libraryOntology:taxon_id ?taxon_id . - ?file ucscDaf:filename ?filename . -} -- 2.30.2