Properly constructing the GEO SOFT file required multiple SPARQL queries.
author Diane Trout <diane@caltech.edu>
Sat, 12 May 2012 00:43:01 +0000 (17:43 -0700)
committer Diane Trout <diane@caltech.edu>
Sat, 12 May 2012 00:43:01 +0000 (17:43 -0700)
Most of this is gluing the various queries together into the SOFT file
template.

One significant url change that should make it easier to
write turtle documents describing the library was to end the
submission set URI with # instead of /

this way .../SubmissionLog/SubName# is more clearly the same base as
.../SubmissionLog/SubName#AttributeName.

However, this will probably break my older rule files.

And I'm not checking for that... *sigh*

htsworkflow/submission/geo.py
htsworkflow/submission/submission.py
htsworkflow/templates/geo_files.sparql [new file with mode: 0644]
htsworkflow/templates/geo_platform.sparql [new file with mode: 0644]
htsworkflow/templates/geo_samples.sparql [new file with mode: 0644]
htsworkflow/templates/geo_series.sparql [new file with mode: 0644]
htsworkflow/templates/geo_submission.soft
htsworkflow/templates/geo_submission.sparql [deleted file]

index 737b1bb353458ceb0d961adf551a680cc1f9399f..8ff349fe9ae7444c5cc82f20c15e8458d6cc0599 100644 (file)
@@ -1,4 +1,5 @@
 import logging
+import os
 
 import RDF
 
@@ -6,6 +7,8 @@ from htsworkflow.submission.submission import Submission
 
 from htsworkflow.util.rdfhelp import \
      fromTypedNode, \
+     geoSoftNS, \
+     simplifyUri, \
      submissionOntology
 
 from django.conf import settings
@@ -19,13 +22,33 @@ class GEOSubmission(Submission):
 
     def make_soft(self, result_map):
         samples = []
+        platform = self.get_platform_metadata()
+        platform_attribs = dict(platform)
+        platform_id = platform_attribs['^platform']
+        series = self.get_series_metadata()
+        series_attribs = dict(series)
+        series_id = series_attribs['^series']
         for lib_id, result_dir in result_map.items():
             an_analysis = self.get_submission_node(result_dir)
-            samples.append(self.get_sample_metadata(an_analysis))
+            metadata = self.get_sample_metadata(an_analysis)
+            if len(metadata) > 1:
+                errmsg = 'Confused there are more than one samples for %s'
+                LOGGER.debug(errmsg % (str(an_analysis,)))
+            metadata = metadata[0]
+            metadata['raw'] = self.get_sample_files(an_analysis,
+                                                    geoSoftNS['raw'])
+            metadata['supplimental'] = self.get_sample_files(
+                an_analysis,
+                geoSoftNS['supplemental'])
+            samples.append(metadata)
 
         soft_template = loader.get_template('geo_submission.soft')
         context = Context({
-            'samples': samples
+            'platform': platform,
+            'series': series,
+            'samples': samples,
+            'platform_id': platform_id,
+            'series_id': series_id,
         })
         print str(soft_template.render(context))
 
@@ -39,19 +62,69 @@ class GEOSubmission(Submission):
         else:
             return True
 
+    def get_platform_metadata(self):
+        """Gather information for filling out sample section of a SOFT file
+        """
+        query_template = loader.get_template('geo_platform.sparql')
+        submission = str(self.submissionSetNS[''].uri)
+        context = Context({
+            'submission': submission,
+            })
+
+        results = self.execute_query(query_template, context)
+        return self.query_to_soft_dictionary(results, 'platform')
+
+    def get_series_metadata(self):
+        """Gather information for filling out sample section of a SOFT file
+        """
+        query_template = loader.get_template('geo_series.sparql')
+        submission = str(self.submissionSetNS[''].uri)
+        context = Context({
+            'submission': submission,
+            })
+
+        results = self.execute_query(query_template, context)
+        return self.query_to_soft_dictionary(results, 'series')
+
     def get_sample_metadata(self, analysis_node):
         """Gather information for filling out sample section of a SOFT file
         """
-        query_template = loader.get_template('geo_submission.sparql')
+        query_template = loader.get_template('geo_samples.sparql')
 
         context = Context({
             'submission': str(analysis_node.uri),
+            'submissionSet': str(self.submissionSetNS[''].uri),
             })
 
-        formatted_query = query_template.render(context)
-        query = RDF.SPARQLQuery(str(formatted_query))
-        rdfstream = query.execute(self.model)
-        results = []
-        for r in rdfstream:
-            results.append(r)
+        results = self.execute_query(query_template, context)
+        for r in results:
+
+            r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ')
         return results
+
+    def get_sample_files(self, analysis_node, file_class):
+        """Gather files
+        """
+        query_template = loader.get_template('geo_files.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            'file_class': str(file_class)
+            })
+
+        return self.execute_query(query_template, context)
+
+    def query_to_soft_dictionary(self, results, heading):
+        attributes = []
+        for r in results:
+            name = simplifyUri(geoSoftNS, r['name'])
+            if name is not None:
+                if name.lower() == heading.lower():
+                    name = '^' + name
+                else:
+                    name = '!' + name
+                for v in fromTypedNode(r['value']).split(os.linesep):
+                    v = v.strip()
+                    if len(v) > 0:
+                        attributes.append((name, v))
+        return attributes
index 98c25d57befc4047c4cc846ecd72f26fe02c740d..e4ce90c73b073287913ec2f76f5f1d5f0d9bd887 100644 (file)
@@ -24,7 +24,7 @@ from htsworkflow.submission.daf import \
      MetadataLookupException, \
      get_submission_uri
 
-logger = logging.getLogger(__name__)
+LOGGER = logging.getLogger(__name__)
 
 class Submission(object):
     def __init__(self, name, model):
@@ -32,7 +32,7 @@ class Submission(object):
         self.model = model
 
         self.submissionSet = get_submission_uri(self.name)
-        self.submissionSetNS = RDF.NS(str(self.submissionSet) + '/')
+        self.submissionSetNS = RDF.NS(str(self.submissionSet) + '#')
         self.libraryNS = RDF.NS('http://jumpgate.caltech.edu/library/')
 
         self.__view_map = None
@@ -41,11 +41,11 @@ class Submission(object):
         """Examine files in our result directory
         """
         for lib_id, result_dir in result_map.items():
-            logger.info("Importing %s from %s" % (lib_id, result_dir))
+            LOGGER.info("Importing %s from %s" % (lib_id, result_dir))
             try:
                 self.import_analysis_dir(result_dir, lib_id)
             except MetadataLookupException, e:
-                logger.error("Skipping %s: %s" % (lib_id, str(e)))
+                LOGGER.error("Skipping %s: %s" % (lib_id, str(e)))
 
     def import_analysis_dir(self, analysis_dir, library_id):
         """Import a submission directories and update our model as needed
@@ -68,18 +68,28 @@ class Submission(object):
         """
         path, filename = os.path.split(pathname)
 
-        logger.debug("Searching for view")
-        file_classification = self.find_best_match(filename)
-        if file_classification is None:
-            logger.warn("Unrecognized file: {0}".format(pathname))
+        LOGGER.debug("Searching for view")
+        file_type = self.find_best_match(filename)
+        if file_type is None:
+            LOGGER.warn("Unrecognized file: {0}".format(pathname))
             return None
-        if str(file_classification) == str(libraryOntology['ignore']):
+        if str(file_type) == str(libraryOntology['ignore']):
             return None
 
         an_analysis_name = self.make_submission_name(analysis_dir)
         an_analysis = self.get_submission_node(analysis_dir)
         an_analysis_uri = str(an_analysis.uri)
+        file_classification = self.model.get_target(file_type,
+                                                    rdfNS['type'])
+        if file_classification is None:
+            errmsg = 'Could not find class for {0}'
+            LOGGER.warning(errmsg.format(str(file_type)))
+            return
 
+        self.model.add_statement(
+            RDF.Statement(self.submissionSetNS[''],
+                          submissionOntology['has_submission'],
+                          an_analysis))
         self.model.add_statement(RDF.Statement(an_analysis,
                                                submissionOntology['name'],
                                                toTypedNode(an_analysis_name)))
@@ -91,7 +101,7 @@ class Submission(object):
                                                submissionOntology['library'],
                                                libNode))
 
-        logger.debug("Adding statements to {0}".format(str(an_analysis)))
+        LOGGER.debug("Adding statements to {0}".format(str(an_analysis)))
         # add track specific information
         self.model.add_statement(
             RDF.Statement(an_analysis,
@@ -108,8 +118,11 @@ class Submission(object):
                                              an_analysis_uri,
                                              analysis_dir)
         self.add_md5s(filename, fileNode, analysis_dir)
-
-        logger.debug("Done.")
+        self.model.add_statement(
+            RDF.Statement(fileNode,
+                          rdfNS['type'],
+                          file_type))
+        LOGGER.debug("Done.")
 
     def link_file_to_classes(self, filename, submissionNode, submission_uri, analysis_dir):
         # add file specific information
@@ -125,12 +138,12 @@ class Submission(object):
         return fileNode
 
     def add_md5s(self, filename, fileNode, analysis_dir):
-        logger.debug("Updating file md5sum")
+        LOGGER.debug("Updating file md5sum")
         submission_pathname = os.path.join(analysis_dir, filename)
         md5 = make_md5sum(submission_pathname)
         if md5 is None:
             errmsg = "Unable to produce md5sum for {0}"
-            logger.warning(errmsg.format(submission_pathname))
+            LOGGER.warning(errmsg.format(submission_pathname))
         else:
             self.model.add_statement(
                 RDF.Statement(fileNode, dafTermOntology['md5sum'], md5))
@@ -178,11 +191,11 @@ class Submission(object):
         for s in self.model.find_statements(filename_query):
             view_name = s.subject
             literal_re = s.object.literal_value['string']
-            logger.debug("Found: %s" % (literal_re,))
+            LOGGER.debug("Found: %s" % (literal_re,))
             try:
                 filename_re = re.compile(literal_re)
             except re.error, e:
-                logger.error("Unable to compile: %s" % (literal_re,))
+                LOGGER.error("Unable to compile: %s" % (literal_re,))
             patterns[literal_re] = view_name
         return patterns
 
@@ -254,3 +267,14 @@ class Submission(object):
                 "Unrecognized library type %s for %s" % \
                 (library_type, str(libNode)))
 
+    def execute_query(self, template, context):
+        """Execute the query, returning the results
+        """
+        formatted_query = template.render(context)
+        LOGGER.debug(formatted_query)
+        query = RDF.SPARQLQuery(str(formatted_query))
+        rdfstream = query.execute(self.model)
+        results = []
+        for r in rdfstream:
+            results.append(r)
+        return results
diff --git a/htsworkflow/templates/geo_files.sparql b/htsworkflow/templates/geo_files.sparql
new file mode 100644 (file)
index 0000000..7b66f4f
--- /dev/null
@@ -0,0 +1,18 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+select distinct ?filename, ?md5sum, ?file_type ?file_type_label
+WHERE {
+  <{{submission}}> ucscDaf:has_file ?file ;
+                   a submissionOntology:submission .
+
+  ?file ucscDaf:filename ?filename ;
+        ucscDaf:md5sum ?md5sum ;
+        a ?file_type .
+  ?file_type a <{{file_class}}> ;
+             geoSoft:fileTypeLabel ?file_type_label .
+
+}
\ No newline at end of file
diff --git a/htsworkflow/templates/geo_platform.sparql b/htsworkflow/templates/geo_platform.sparql
new file mode 100644 (file)
index 0000000..4d224d7
--- /dev/null
@@ -0,0 +1,14 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+
+select distinct ?name ?value
+WHERE {
+  <{{submission}}> submissionOntology:has_platform ?platform .
+  ?platform a geoSoft:platform .
+
+  ?platform ?name ?value .
+}
diff --git a/htsworkflow/templates/geo_samples.sparql b/htsworkflow/templates/geo_samples.sparql
new file mode 100644 (file)
index 0000000..850d99a
--- /dev/null
@@ -0,0 +1,41 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+PREFIX cells: <http://encodewiki.ucsc.edu/EncodeDCC/index.php/Cell_lines#>
+
+select distinct ?name ?cell ?antibody ?sex ?control ?strain ?controlId ?library_id ?treatment ?protocol ?readType ?insertLength ?replicate, ?mapAlgorithm ?species_name ?taxon_id ?extractMolecule ?growthProtocol ?extractProtocol ?dataProtocol ?experiment_type ?library_selection ?library_source
+WHERE {
+  <{{submission}}> a submissionOntology:submission .
+
+  OPTIONAL { <{{submission}}> ucscDaf:control ?control }
+  OPTIONAL { <{{submission}}> ucscDaf:controlId ?controlId }
+  OPTIONAL { ?library libraryOntology:antibody ?antibody }
+  OPTIONAL { ?library libraryOntology:cell_line ?cell .
+             ?cell_line cells:cell ?cell ;
+                        cells:documents ?growthProtocol . }
+  OPTIONAL { ?library ucscDaf:sex ?sex }
+  OPTIONAL { ?library libraryOntology:library_id ?library_id }
+  OPTIONAL { ?library libraryOntology:replicate ?replicate }
+  OPTIONAL { ?library libraryOntology:species ?species_name }
+  OPTIONAL { ?library libraryOntology:condition_term ?treatment }
+  OPTIONAL { ?library libraryOntology:experiment_type ?experiment_type }
+  OPTIONAL { ?library libraryOntology:librarySelection ?library_selection }
+  OPTIONAL { ?library libraryOntology:librarySource ?library_source }
+  OPTIONAL { <{{submissionSet}}> geoSoft:data_processing ?dataProtocol }
+  OPTIONAL { ?library libraryOntology:extractMolecule ?extractMolecule }
+  OPTIONAL { ?library libraryOntology:extractProtocol ?extractProtocol }
+  OPTIONAL { ?library ucscDaf:protocol ?protocol }
+  OPTIONAL { ?library ucscDaf:readType ?readType }
+  OPTIONAL { ?library ucscDaf:strain ?strain }
+  OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
+  OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
+
+  <{{submission}}> submissionOntology:library ?library ;
+                   submissionOntology:name ?name .
+  ?species libraryOntology:species ?species_name ;
+           libraryOntology:taxon_id ?taxon_id .
+
+
+}
\ No newline at end of file
diff --git a/htsworkflow/templates/geo_series.sparql b/htsworkflow/templates/geo_series.sparql
new file mode 100644 (file)
index 0000000..815f311
--- /dev/null
@@ -0,0 +1,14 @@
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+
+select distinct ?name ?value
+WHERE {
+  <{{submission}}> submissionOntology:has_series ?series.
+  ?series a geoSoft:series .
+
+  ?series ?name ?value .
+}
index ae2ac5760946a14d414fd87730812e477bdafb87..00c0b4dc77818789bfea8950507fbd1dba0c13fb 100644 (file)
@@ -1,11 +1,39 @@
-Soft template
-!Platform_title = Illumina Genome Analyzer (Homo sapiens)
-!Platform_geo_accession = GPL9052
-{% for sample in samples %}{% for row in sample %}{%if forloop.first %}
+{% for name, value in series %}{{name}} = {{value}}
+{% endfor %}!Series_platform_id = {{ platform_id }}
+{% for row in samples %}
 ^SAMPLE={{row.name}}
+!Sample_type=SRA
 !Sample_title={{row.name}}
+!Sample_series_id = {{ series_id }}
+!Sample_instrument_model = Illumina Genome Analyzer
+!Sample_instrument_model = Illumina Genome Analyzer II
+!Sample_instrument_model = Illumina Genome Analyzer IIx
+!Sample_instrument_model = Illumina HiSeq 2000
+!Sample_channel_count = 1
 !Sample_organism_ch1 = {{ row.species_name }}
 !Sample_taxid_ch1 = {{ row.taxon_id }}
-{% spaceless %}{% if row.cell %}!Sample_characteristics_ch1 = cell: {{ row.cell }}
-{% endif %}{% endspaceless %}{% endif %}
-!Sample_supplementary_file_{{forloop.counter}}={{row.filename}}{% endfor %}{% endfor %}
\ No newline at end of file
+!Sample_platform_id = {{ platform_id }}
+!Sample_source_name_ch1={{row.cell}}
+!Sample_library_strategy={{ row.experiment_type }}
+!Sample_library_source={{row.library_source}}
+!Sample_library_selection={{ row.library_selection }}
+!Sample_growth_protocol_ch1={{ row.growthProtocol|safe }}
+!Sample_extract_protocol={{ row.extractProtocol|safe }}
+!Sample_data_processing={{ row.dataProtocol|safe }}
+!Sample_molecule_ch1 = {{ row.extractMolecule }}
+!Sample_characteristics_ch1 = labExpId: {{ row.library_id }}
+!Sample_characteristics_ch1 = replicate: {{ row.replicate }}
+{% if row.cell %}{% spaceless %}
+!Sample_characteristics_ch1 = cell: {{ row.cell }}
+{% endspaceless %}{% endif %}
+{% if row.readType %}{% spaceless %}
+!Sample_characteristics_ch1 = readType: {{ row.readType }}
+{% endspaceless %}{% endif %}{% if row.antibody %}{% spaceless %}
+!Sample_characteristics_ch1 = cell: {{ row.antibody }}
+{% endspaceless %}{% endif %}{% for raw in row.raw %}
+!Sample_raw_file_{{forloop.counter}}={{raw.filename}}
+!Sample_raw_file_type_{{forloop.counter}}={{raw.file_type_label}}
+!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}{% endfor %}{% for sup in row.supplimental %}
+!Sample_supplementary_file_{{forloop.counter}}={{sup.filename}}
+!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}
+{% endfor %}{% endfor %}
\ No newline at end of file
diff --git a/htsworkflow/templates/geo_submission.sparql b/htsworkflow/templates/geo_submission.sparql
deleted file mode 100644 (file)
index 1d7cbb1..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
-PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
-PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
-PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
-PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
-
-select distinct ?name ?filename ?md5sum ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate, ?mapAlgorithm ?species_name ?taxon_id
-WHERE {
-  <{{submission}}> a submissionOntology:submission .
-
-  OPTIONAL { ?submission ucscDaf:control ?control }
-  #OPTIONAL { ?submission ucscDaf:controlId ?controlId }
-  #OPTIONAL { ?library libraryOntology:antibody ?antibody }
-  OPTIONAL { ?library libraryOntology:cell_line ?cell } .
-  #OPTIONAL { ?library ucscDaf:sex ?sex }
-  OPTIONAL { ?library libraryOntology:library_id ?labExpId }
-  OPTIONAL { ?library libraryOntology:library_id ?labVersion }
-  OPTIONAL { ?library libraryOntology:replicate ?replicate }
-  OPTIONAL { ?library libraryOntology:species ?species_name }
-  
-  #OPTIONAL { ?library libraryOntology:condition_term ?treatment }
-  #OPTIONAL { ?library ucscDaf:protocol ?protocol }
-  #OPTIONAL { ?library ucscDaf:readType ?readType }
-  #OPTIONAL { ?library ucscDaf:strain ?strain }
-  #OPTIONAL { ?library libraryOntology:insert_size ?insertLength }
-  #OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm }
-
-  <{{submission}}> submissionOntology:library ?library ;
-                   ucscDaf:has_file ?file ;
-                   submissionOntology:name ?name .
-  ?species libraryOntology:species ?species_name ;
-           libraryOntology:taxon_id ?taxon_id .
-  ?file ucscDaf:filename ?filename .
-}