Properly constructing the GEO SOFT file required multiple SPARQL queries.
author Diane Trout <diane@caltech.edu>
Sat, 12 May 2012 00:43:01 +0000 (17:43 -0700)
committer Diane Trout <diane@caltech.edu>
Sat, 12 May 2012 00:43:01 +0000 (17:43 -0700)
Most of this is gluing the various queries together into the SOFT file
template.

One significant url change that should make it easier to
write turtle documents describing the library was to end the
submission set URI with # instead of /

this way .../SubmissionLog/SubName# is more clearly the same base as
.../SubmissionLog/SubName#AttributeName.

However, this will probably break my older rule files.

And I'm not checking for that... *sigh*

htsworkflow/submission/geo.py
htsworkflow/submission/submission.py
htsworkflow/templates/geo_files.sparql [new file with mode: 0644]
htsworkflow/templates/geo_platform.sparql [new file with mode: 0644]
htsworkflow/templates/geo_samples.sparql [new file with mode: 0644]
htsworkflow/templates/geo_series.sparql [new file with mode: 0644]
htsworkflow/templates/geo_submission.soft
htsworkflow/templates/geo_submission.sparql [deleted file]

index 737b1bb353458ceb0d961adf551a680cc1f9399f..8ff349fe9ae7444c5cc82f20c15e8458d6cc0599 100644 (file)
@@ -1,4 +1,5 @@
 import logging
+import os
 
 import RDF
 
@@ -6,6 +7,8 @@ from htsworkflow.submission.submission import Submission
 
 from htsworkflow.util.rdfhelp import \
      fromTypedNode, \
+     geoSoftNS, \
+     simplifyUri, \
      submissionOntology
 
 from django.conf import settings
@@ -19,13 +22,33 @@ class GEOSubmission(Submission):
 
     def make_soft(self, result_map):
         samples = []
+        platform = self.get_platform_metadata()
+        platform_attribs = dict(platform)
+        platform_id = platform_attribs['^platform']
+        series = self.get_series_metadata()
+        series_attribs = dict(series)
+        series_id = series_attribs['^series']
         for lib_id, result_dir in result_map.items():
             an_analysis = self.get_submission_node(result_dir)
-            samples.append(self.get_sample_metadata(an_analysis))
+            metadata = self.get_sample_metadata(an_analysis)
+            if len(metadata) > 1:
+                errmsg = 'Confused there are more than one samples for %s'
+                LOGGER.debug(errmsg % (str(an_analysis,)))
+            metadata = metadata[0]
+            metadata['raw'] = self.get_sample_files(an_analysis,
+                                                    geoSoftNS['raw'])
+            metadata['supplimental'] = self.get_sample_files(
+                an_analysis,
+                geoSoftNS['supplemental'])
+            samples.append(metadata)
 
         soft_template = loader.get_template('geo_submission.soft')
         context = Context({
-            'samples': samples
+            'platform': platform,
+            'series': series,
+            'samples': samples,
+            'platform_id': platform_id,
+            'series_id': series_id,
         })
         print str(soft_template.render(context))
 
@@ -39,19 +62,69 @@ class GEOSubmission(Submission):
         else:
             return True
 
+    def get_platform_metadata(self):
+        """Gather information for filling out sample section of a SOFT file
+        """
+        query_template = loader.get_template('geo_platform.sparql')
+        submission = str(self.submissionSetNS[''].uri)
+        context = Context({
+            'submission': submission,
+            })
+
+        results = self.execute_query(query_template, context)
+        return self.query_to_soft_dictionary(results, 'platform')
+
+    def get_series_metadata(self):
+        """Gather information for filling out sample section of a SOFT file
+        """
+        query_template = loader.get_template('geo_series.sparql')
+        submission = str(self.submissionSetNS[''].uri)
+        context = Context({
+            'submission': submission,
+            })
+
+        results = self.execute_query(query_template, context)
+        return self.query_to_soft_dictionary(results, 'series')
+
     def get_sample_metadata(self, analysis_node):
         """Gather information for filling out sample section of a SOFT file
         """
-        query_template = loader.get_template('geo_submission.sparql')
+        query_template = loader.get_template('geo_samples.sparql')
 
         context = Context({
             'submission': str(analysis_node.uri),
+            'submissionSet': str(self.submissionSetNS[''].uri),
             })
 
-        formatted_query = query_template.render(context)
-        query = RDF.SPARQLQuery(str(formatted_query))
-        rdfstream = query.execute(self.model)
-        results = []
-        for r in rdfstream:
-            results.append(r)
+        results = self.execute_query(query_template, context)
+        for r in results:
+
+            r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ')
         return results
+
+    def get_sample_files(self, analysis_node, file_class):
+        """Gather files
+        """
+        query_template = loader.get_template('geo_files.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            'file_class': str(file_class)
+            })
+
+        return self.execute_query(query_template, context)
+
+    def query_to_soft_dictionary(self, results, heading):
+        attributes = []
+        for r in results:
+            name = simplifyUri(geoSoftNS, r['name'])
+            if name is not None:
+                if name.lower() == heading.lower():
+                    name = '^' + name
+                else:
+                    name = '!' + name
+                for v in fromTypedNode(r['value']).split(os.linesep):
+                    v = v.strip()
+                    if len(v) > 0:
+                        attributes.append((name, v))
+        return attributes
index 98c25d57befc4047c4cc846ecd72f26fe02c740d..e4ce90c73b073287913ec2f76f5f1d5f0d9bd887 100644 (file)
@@ -24,7 +24,7 @@ from htsworkflow.submission.daf import \
      MetadataLookupException, \
      get_submission_uri
 
-logger = logging.getLogger(__name__)
+LOGGER = logging.getLogger(__name__)
 
 class Submission(object):
     def __init__(self, name, model):
@@ -32,7 +32,7 @@ class Submission(object):
         self.model = model
 
         self.submissionSet = get_submission_uri(self.name)
-        self.submissionSetNS = RDF.NS(str(self.submissionSet) + '/')
+        self.submissionSetNS = RDF.NS(str(self.submissionSet) + '#')
         self.libraryNS = RDF.NS('http://jumpgate.caltech.edu/library/')
 
         self.__view_map = None
@@ -41,11 +41,11 @@ class Submission(object):
         """Examine files in our result directory
         """
         for lib_id, result_dir in result_map.items():
-            logger.info("Importing %s from %s" % (lib_id, result_dir))
+            LOGGER.info("Importing %s from %s" % (lib_id, result_dir))
             try:
                 self.import_analysis_dir(result_dir, lib_id)
             except MetadataLookupException, e:
-                logger.error("Skipping %s: %s" % (lib_id, str(e)))
+                LOGGER.error("Skipping %s: %s" % (lib_id, str(e)))
 
     def import_analysis_dir(self, analysis_dir, library_id):
         """Import a submission directories and update our model as needed
@@ -68,18 +68,28 @@ class Submission(object):
         """
         path, filename = os.path.split(pathname)
 
-        logger.debug("Searching for view")
-        file_classification = self.find_best_match(filename)
-        if file_classification is None:
-            logger.warn("Unrecognized file: {0}".format(pathname))
+        LOGGER.debug("Searching for view")
+        file_type = self.find_best_match(filename)
+        if file_type is None:
+            LOGGER.warn("Unrecognized file: {0}".format(pathname))
             return None
-        if str(file_classification) == str(libraryOntology['ignore']):
+        if str(file_type) == str(libraryOntology['ignore']):
             return None
 
         an_analysis_name = self.make_submission_name(analysis_dir)
         an_analysis = self.get_submission_node(analysis_dir)
         an_analysis_uri = str(an_analysis.uri)
+        file_classification = self.model.get_target(file_type,
+                                                    rdfNS['type'])
+        if file_classification is None:
+            errmsg = 'Could not find class for {0}'
+            LOGGER.warning(errmsg.format(str(file_type)))
+            return
 
+        self.model.add_statement(
+            RDF.Statement(self.submissionSetNS[''],
+                          submissionOntology['has_submission'],
+                          an_analysis))
         self.model.add_statement(RDF.Statement(an_analysis,
                                                submissionOntology['name'],
                                                toTypedNode(an_analysis_name)))
@@ -91,7 +101,7 @@ class Submission(object):
                                                submissionOntology['library'],
                                                libNode))
 
-        logger.debug("Adding statements to {0}".format(str(an_analysis)))
+        LOGGER.debug("Adding statements to {0}".format(str(an_analysis)))
         # add track specific information
         self.model.add_statement(
             RDF.Statement(an_analysis,
@@ -108,8 +118,11 @@ class Submission(object):
                                              an_analysis_uri,
                                              analysis_dir)
         self.add_md5s(filename, fileNode, analysis_dir)
-
-        logger.debug("Done.")
+        self.model.add_statement(
+            RDF.Statement(fileNode,
+                          rdfNS['type'],
+                          file_type))
+        LOGGER.debug("Done.")
 
     def link_file_to_classes(self, filename, submissionNode, submission_uri, analysis_dir):
         # add file specific information
@@ -125,12 +138,12 @@ class Submission(object):
         return fileNode
 
     def add_md5s(self, filename, fileNode, analysis_dir):
-        logger.debug("Updating file md5sum")
+        LOGGER.debug("Updating file md5sum")
         submission_pathname = os.path.join(analysis_dir, filename)
         md5 = make_md5sum(submission_pathname)
         if md5 is None:
             errmsg = "Unable to produce md5sum for {0}"
-            logger.warning(errmsg.format(submission_pathname))
+            LOGGER.warning(errmsg.format(submission_pathname))
         else:
             self.model.add_statement(
                 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
@@ -178,11 +191,11 @@ class Submission(object):
         for s in self.model.find_statements(filename_query):
             view_name = s.subject
             literal_re = s.object.literal_value['string']
-            logger.debug("Found: %s" % (literal_re,))
+            LOGGER.debug("Found: %s" % (literal_re,))
             try:
                 filename_re = re.compile(literal_re)
             except re.error, e:
-                logger.error("Unable to compile: %s" % (literal_re,))
+                LOGGER.error("Unable to compile: %s" % (literal_re,))
             patterns[literal_re] = view_name
         return patterns
 
@@ -254,3 +267,14 @@ class Submission(object):
                 "Unrecognized library type %s for %s" % \
                 (library_type, str(libNode)))
 
+    def execute_query(self, template, context):
+        """Execute the query, returning the results
+        """
+        formatted_query = template.render(context)
+        LOGGER.debug(formatted_query)
+        query = RDF.SPARQLQuery(str(formatted_query))
+        rdfstream = query.execute(self.model)
+        results = []
+        for r in rdfstream:
+            results.append(r)
+        return results
diff --git a/htsworkflow/templates/geo_files.sparql b/htsworkflow/templates/geo_files.sparql
new file mode 100644 (file)
index 0000000..7b66f4f
--- /dev/null
@@ -0,0 +1,18 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+select distinct ?filename, ?md5sum, ?file_type ?file_type_label
+WHERE {
+  <{{submission}}> ucscDaf:has_file ?file ;
+                   a submissionOntology:submission .
+
+  ?file ucscDaf:filename ?filename ;
+        ucscDaf:md5sum ?md5sum ;
+        a ?file_type .
+  ?file_type a <{{file_class}}> ;
+             geoSoft:fileTypeLabel ?file_type_label .
+
+}
\ No newline at end of file
diff --git a/htsworkflow/templates/geo_platform.sparql b/htsworkflow/templates/geo_platform.sparql
new file mode 100644 (file)
index 0000000..4d224d7
--- /dev/null
@@ -0,0 +1,14 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+
+select distinct ?name ?value
+WHERE {
+  <{{submission}}> submissionOntology:has_platform ?platform .
+  ?platform a geoSoft:platform .
+
+  ?platform ?name ?value .
+}
diff --git a/htsworkflow/templates/geo_samples.sparql b/htsworkflow/templates/geo_samples.sparql
new file mode 100644 (file)
index 0000000..850d99a
--- /dev/null
@@ -0,0 +1,41 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
+
+select distinct ?name ?cell ?antibody ?sex ?control ?strain ?controlId ?library_id ?treatment ?protocol ?readType ?insertLength ?replicate, ?mapAlgorithm ?species_name ?taxon_id ?extractMolecule ?growthProtocol ?extractProtocol ?dataProtocol ?experiment_type ?library_selection ?library_source
+WHERE {
+  <{{submission}}> a submissionOntology:submission .
+
+  OPTIONAL { <{{submission}}> ucscDaf:control ?control }
+  OPTIONAL { <{{submission}}> ucscDaf:controlId ?controlId }
+  OPTIONAL { ?library libraryOntology:antibody ?antibody }
+  OPTIONAL { ?library libraryOntology:cell_line ?cell .
+             ?cell_line cells:cell ?cell ;
+                        cells:documents ?growthProtocol . }
+  OPTIONAL { ?library ucscDaf:sex ?sex }
+  OPTIONAL { ?library libraryOntology:library_id ?library_id }
+  OPTIONAL { ?library libraryOntology:replicate ?replicate }
+  OPTIONAL { ?library libraryOntology:species ?species_name }
+  OPTIONAL { ?library libraryOntology:condition_term ?treatment }
+  OPTIONAL { ?library libraryOntology:experiment_type ?experiment_type }
+  OPTIONAL { ?library libraryOntology:librarySelection ?library_selection }
+  OPTIONAL { ?library libraryOntology:librarySource ?library_source }
+  OPTIONAL { <{{submissionSet}}> geoSoft:data_processing ?dataProtocol }
+  OPTIONAL { ?library libraryOntology:extractMolecule ?extractMolecule }
+  OPTIONAL { ?library libraryOntology:extractProtocol ?extractProtocol }
+  OPTIONAL { ?library ucscDaf:protocol ?protocol }
+  OPTIONAL { ?library ucscDaf:readType ?readType }
+  OPTIONAL { ?library ucscDaf:strain ?strain }
+  OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
+  OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
+
+  <{{submission}}> submissionOntology:library ?library ;
+                   submissionOntology:name ?name .
+  ?species libraryOntology:species ?species_name ;
+           libraryOntology:taxon_id ?taxon_id .
+
+
+}
\ No newline at end of file
diff --git a/htsworkflow/templates/geo_series.sparql b/htsworkflow/templates/geo_series.sparql
new file mode 100644 (file)
index 0000000..815f311
--- /dev/null
@@ -0,0 +1,14 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+
+select distinct ?name ?value
+WHERE {
+  <{{submission}}> submissionOntology:has_series ?series.
+  ?series a geoSoft:series .
+
+  ?series ?name ?value .
+}
index ae2ac5760946a14d414fd87730812e477bdafb87..00c0b4dc77818789bfea8950507fbd1dba0c13fb 100644 (file)
@@ -1,11 +1,39 @@
-Soft template
-!Platform_title = Illumina Genome Analyzer (Homo sapiens)
-!Platform_geo_accession = GPL9052
-{% for sample in samples %}{% for row in sample %}{%if forloop.first %}
+{% for name, value in series %}{{name}} = {{value}}
+{% endfor %}!Series_platform_id = {{ platform_id }}
+{% for row in samples %}
 ^SAMPLE={{row.name}}
+!Sample_type=SRA
 !Sample_title={{row.name}}
+!Sample_series_id = {{ series_id }}
+!Sample_instrument_model = Illumina Genome Analyzer
+!Sample_instrument_model = Illumina Genome Analyzer II
+!Sample_instrument_model = Illumina Genome Analyzer IIx
+!Sample_instrument_model = Illumina HiSeq 2000
+!Sample_channel_count = 1
 !Sample_organism_ch1 = {{ row.species_name }}
 !Sample_taxid_ch1 = {{ row.taxon_id }}
-{% spaceless %}{% if row.cell %}!Sample_characteristics_ch1 = cell: {{ row.cell }}
-{% endif %}{% endspaceless %}{% endif %}
-!Sample_supplementary_file_{{forloop.counter}}={{row.filename}}{% endfor %}{% endfor %}
\ No newline at end of file
+!Sample_platform_id = {{ platform_id }}
+!Sample_source_name_ch1={{row.cell}}
+!Sample_library_strategy={{ row.experiment_type }}
+!Sample_library_source={{row.library_source}}
+!Sample_library_selection={{ row.library_selection }}
+!Sample_growth_protocol_ch1={{ row.growthProtocol|safe }}
+!Sample_extract_protocol={{ row.extractProtocol|safe }}
+!Sample_data_processing={{ row.dataProtocol|safe }}
+!Sample_molecule_ch1 = {{ row.extractMolecule }}
+!Sample_characteristics_ch1 = labExpId: {{ row.library_id }}
+!Sample_characteristics_ch1 = replicate: {{ row.replicate }}
+{% if row.cell %}{% spaceless %}
+!Sample_characteristics_ch1 = cell: {{ row.cell }}
+{% endspaceless %}{% endif %}
+{% if row.readType %}{% spaceless %}
+!Sample_characteristics_ch1 = readType: {{ row.readType }}
+{% endspaceless %}{% endif %}{% if row.antibody %}{% spaceless %}
+!Sample_characteristics_ch1 = cell: {{ row.antibody }}
+{% endspaceless %}{% endif %}{% for raw in row.raw %}
+!Sample_raw_file_{{forloop.counter}}={{raw.filename}}
+!Sample_raw_file_type_{{forloop.counter}}={{raw.file_type_label}}
+!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}{% endfor %}{% for sup in row.supplimental %}
+!Sample_supplementary_file_{{forloop.counter}}={{sup.filename}}
+!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}
+{% endfor %}{% endfor %}
\ No newline at end of file
diff --git a/htsworkflow/templates/geo_submission.sparql b/htsworkflow/templates/geo_submission.sparql
deleted file mode 100644 (file)
index 1d7cbb1..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
-PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
-PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
-PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
-PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
-
-select distinct ?name ?filename ?md5sum ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate, ?mapAlgorithm ?species_name ?taxon_id
-WHERE {
-  <{{submission}}> a submissionOntology:submission .
-
-  OPTIONAL { ?submission ucscDaf:control ?control }
-  #OPTIONAL { ?submission ucscDaf:controlId ?controlId }
-  #OPTIONAL { ?library libraryOntology:antibody ?antibody }
-  OPTIONAL { ?library libraryOntology:cell_line ?cell } .
-  #OPTIONAL { ?library ucscDaf:sex ?sex }
-  OPTIONAL { ?library libraryOntology:library_id ?labExpId }
-  OPTIONAL { ?library libraryOntology:library_id ?labVersion }
-  OPTIONAL { ?library libraryOntology:replicate ?replicate }
-  OPTIONAL { ?library libraryOntology:species ?species_name }
-  
-  #OPTIONAL { ?library libraryOntology:condition_term ?treatment }
-  #OPTIONAL { ?library ucscDaf:protocol ?protocol }
-  #OPTIONAL { ?library ucscDaf:readType ?readType }
-  #OPTIONAL { ?library ucscDaf:strain ?strain }
-  #OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
-  #OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
-
-  <{{submission}}> submissionOntology:library ?library ;
-                   ucscDaf:has_file ?file ;
-                   submissionOntology:name ?name .
-  ?species libraryOntology:species ?species_name ;
-           libraryOntology:taxon_id ?taxon_id .
-  ?file ucscDaf:filename ?filename .
-}