From 697a1fe031741e6d7614127a2f16e69027578e10 Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Fri, 11 May 2012 17:43:01 -0700 Subject: [PATCH] Properly constructing the geo soft file really needed multiple sparql queries. Most of this is glueing the various queries together into the soft file template. One significant url change that should make it easier to write turtle documents describing the library was to end the submission set URI with # instead of / this way .../SubmissionLog/SubName# is more clearly the same base as .../SubmissionLog/SubName#AttributeName. However this probably will break my older rule files. And I'm not checking for that... *sigh* --- htsworkflow/submission/geo.py | 91 +++++++++++++++++++-- htsworkflow/submission/submission.py | 56 +++++++++---- htsworkflow/templates/geo_files.sparql | 18 ++++ htsworkflow/templates/geo_platform.sparql | 14 ++++ htsworkflow/templates/geo_samples.sparql | 41 ++++++++++ htsworkflow/templates/geo_series.sparql | 14 ++++ htsworkflow/templates/geo_submission.soft | 42 ++++++++-- htsworkflow/templates/geo_submission.sparql | 33 -------- 8 files changed, 244 insertions(+), 65 deletions(-) create mode 100644 htsworkflow/templates/geo_files.sparql create mode 100644 htsworkflow/templates/geo_platform.sparql create mode 100644 htsworkflow/templates/geo_samples.sparql create mode 100644 htsworkflow/templates/geo_series.sparql delete mode 100644 htsworkflow/templates/geo_submission.sparql diff --git a/htsworkflow/submission/geo.py b/htsworkflow/submission/geo.py index 737b1bb..8ff349f 100644 --- a/htsworkflow/submission/geo.py +++ b/htsworkflow/submission/geo.py @@ -1,4 +1,5 @@ import logging +import os import RDF @@ -6,6 +7,8 @@ from htsworkflow.submission.submission import Submission from htsworkflow.util.rdfhelp import \ fromTypedNode, \ + geoSoftNS, \ + simplifyUri, \ submissionOntology from django.conf import settings @@ -19,13 +22,33 @@ class GEOSubmission(Submission): def make_soft(self, result_map): samples = 
[] + platform = self.get_platform_metadata() + platform_attribs = dict(platform) + platform_id = platform_attribs['^platform'] + series = self.get_series_metadata() + series_attribs = dict(series) + series_id = series_attribs['^series'] for lib_id, result_dir in result_map.items(): an_analysis = self.get_submission_node(result_dir) - samples.append(self.get_sample_metadata(an_analysis)) + metadata = self.get_sample_metadata(an_analysis) + if len(metadata) > 1: + errmsg = 'Confused there are more than one samples for %s' + LOGGER.debug(errmsg % (str(an_analysis,))) + metadata = metadata[0] + metadata['raw'] = self.get_sample_files(an_analysis, + geoSoftNS['raw']) + metadata['supplimental'] = self.get_sample_files( + an_analysis, + geoSoftNS['supplemental']) + samples.append(metadata) soft_template = loader.get_template('geo_submission.soft') context = Context({ - 'samples': samples + 'platform': platform, + 'series': series, + 'samples': samples, + 'platform_id': platform_id, + 'series_id': series_id, }) print str(soft_template.render(context)) @@ -39,19 +62,69 @@ class GEOSubmission(Submission): else: return True + def get_platform_metadata(self): + """Gather information for filling out sample section of a SOFT file + """ + query_template = loader.get_template('geo_platform.sparql') + submission = str(self.submissionSetNS[''].uri) + context = Context({ + 'submission': submission, + }) + + results = self.execute_query(query_template, context) + return self.query_to_soft_dictionary(results, 'platform') + + def get_series_metadata(self): + """Gather information for filling out sample section of a SOFT file + """ + query_template = loader.get_template('geo_series.sparql') + submission = str(self.submissionSetNS[''].uri) + context = Context({ + 'submission': submission, + }) + + results = self.execute_query(query_template, context) + return self.query_to_soft_dictionary(results, 'series') + def get_sample_metadata(self, analysis_node): """Gather information for filling 
out sample section of a SOFT file """ - query_template = loader.get_template('geo_submission.sparql') + query_template = loader.get_template('geo_samples.sparql') context = Context({ 'submission': str(analysis_node.uri), + 'submissionSet': str(self.submissionSetNS[''].uri), }) - formatted_query = query_template.render(context) - query = RDF.SPARQLQuery(str(formatted_query)) - rdfstream = query.execute(self.model) - results = [] - for r in rdfstream: - results.append(r) + results = self.execute_query(query_template, context) + for r in results: + + r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ') return results + + def get_sample_files(self, analysis_node, file_class): + """Gather files + """ + query_template = loader.get_template('geo_files.sparql') + + context = Context({ + 'submission': str(analysis_node.uri), + 'file_class': str(file_class) + }) + + return self.execute_query(query_template, context) + + def query_to_soft_dictionary(self, results, heading): + attributes = [] + for r in results: + name = simplifyUri(geoSoftNS, r['name']) + if name is not None: + if name.lower() == heading.lower(): + name = '^' + name + else: + name = '!' 
+ name + for v in fromTypedNode(r['value']).split(os.linesep): + v = v.strip() + if len(v) > 0: + attributes.append((name, v)) + return attributes diff --git a/htsworkflow/submission/submission.py b/htsworkflow/submission/submission.py index 98c25d5..e4ce90c 100644 --- a/htsworkflow/submission/submission.py +++ b/htsworkflow/submission/submission.py @@ -24,7 +24,7 @@ from htsworkflow.submission.daf import \ MetadataLookupException, \ get_submission_uri -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) class Submission(object): def __init__(self, name, model): @@ -32,7 +32,7 @@ class Submission(object): self.model = model self.submissionSet = get_submission_uri(self.name) - self.submissionSetNS = RDF.NS(str(self.submissionSet) + '/') + self.submissionSetNS = RDF.NS(str(self.submissionSet) + '#') self.libraryNS = RDF.NS('http://jumpgate.caltech.edu/library/') self.__view_map = None @@ -41,11 +41,11 @@ class Submission(object): """Examine files in our result directory """ for lib_id, result_dir in result_map.items(): - logger.info("Importing %s from %s" % (lib_id, result_dir)) + LOGGER.info("Importing %s from %s" % (lib_id, result_dir)) try: self.import_analysis_dir(result_dir, lib_id) except MetadataLookupException, e: - logger.error("Skipping %s: %s" % (lib_id, str(e))) + LOGGER.error("Skipping %s: %s" % (lib_id, str(e))) def import_analysis_dir(self, analysis_dir, library_id): """Import a submission directories and update our model as needed @@ -68,18 +68,28 @@ class Submission(object): """ path, filename = os.path.split(pathname) - logger.debug("Searching for view") - file_classification = self.find_best_match(filename) - if file_classification is None: - logger.warn("Unrecognized file: {0}".format(pathname)) + LOGGER.debug("Searching for view") + file_type = self.find_best_match(filename) + if file_type is None: + LOGGER.warn("Unrecognized file: {0}".format(pathname)) return None - if str(file_classification) == 
str(libraryOntology['ignore']): + if str(file_type) == str(libraryOntology['ignore']): return None an_analysis_name = self.make_submission_name(analysis_dir) an_analysis = self.get_submission_node(analysis_dir) an_analysis_uri = str(an_analysis.uri) + file_classification = self.model.get_target(file_type, + rdfNS['type']) + if file_classification is None: + errmsg = 'Could not find class for {0}' + LOGGER.warning(errmsg.format(str(file_type))) + return + self.model.add_statement( + RDF.Statement(self.submissionSetNS[''], + submissionOntology['has_submission'], + an_analysis)) self.model.add_statement(RDF.Statement(an_analysis, submissionOntology['name'], toTypedNode(an_analysis_name))) @@ -91,7 +101,7 @@ class Submission(object): submissionOntology['library'], libNode)) - logger.debug("Adding statements to {0}".format(str(an_analysis))) + LOGGER.debug("Adding statements to {0}".format(str(an_analysis))) # add track specific information self.model.add_statement( RDF.Statement(an_analysis, @@ -108,8 +118,11 @@ class Submission(object): an_analysis_uri, analysis_dir) self.add_md5s(filename, fileNode, analysis_dir) - - logger.debug("Done.") + self.model.add_statement( + RDF.Statement(fileNode, + rdfNS['type'], + file_type)) + LOGGER.debug("Done.") def link_file_to_classes(self, filename, submissionNode, submission_uri, analysis_dir): # add file specific information @@ -125,12 +138,12 @@ class Submission(object): return fileNode def add_md5s(self, filename, fileNode, analysis_dir): - logger.debug("Updating file md5sum") + LOGGER.debug("Updating file md5sum") submission_pathname = os.path.join(analysis_dir, filename) md5 = make_md5sum(submission_pathname) if md5 is None: errmsg = "Unable to produce md5sum for {0}" - logger.warning(errmsg.format(submission_pathname)) + LOGGER.warning(errmsg.format(submission_pathname)) else: self.model.add_statement( RDF.Statement(fileNode, dafTermOntology['md5sum'], md5)) @@ -178,11 +191,11 @@ class Submission(object): for s in
self.model.find_statements(filename_query): view_name = s.subject literal_re = s.object.literal_value['string'] - logger.debug("Found: %s" % (literal_re,)) + LOGGER.debug("Found: %s" % (literal_re,)) try: filename_re = re.compile(literal_re) except re.error, e: - logger.error("Unable to compile: %s" % (literal_re,)) + LOGGER.error("Unable to compile: %s" % (literal_re,)) patterns[literal_re] = view_name return patterns @@ -254,3 +267,14 @@ class Submission(object): "Unrecognized library type %s for %s" % \ (library_type, str(libNode))) + def execute_query(self, template, context): + """Execute the query, returning the results + """ + formatted_query = template.render(context) + LOGGER.debug(formatted_query) + query = RDF.SPARQLQuery(str(formatted_query)) + rdfstream = query.execute(self.model) + results = [] + for r in rdfstream: + results.append(r) + return results diff --git a/htsworkflow/templates/geo_files.sparql b/htsworkflow/templates/geo_files.sparql new file mode 100644 index 0000000..7b66f4f --- /dev/null +++ b/htsworkflow/templates/geo_files.sparql @@ -0,0 +1,18 @@ +PREFIX libraryOntology: +PREFIX submissionOntology: +PREFIX ucscDaf: +PREFIX ncbiTaxon: +PREFIX geoSoft: + +select distinct ?filename, ?md5sum, ?file_type ?file_type_label +WHERE { + <{{submission}}> ucscDaf:has_file ?file ; + a submissionOntology:submission . + + ?file ucscDaf:filename ?filename ; + ucscDaf:md5sum ?md5sum ; + a ?file_type . + ?file_type a <{{file_class}}> ; + geoSoft:fileTypeLabel ?file_type_label . + +} \ No newline at end of file diff --git a/htsworkflow/templates/geo_platform.sparql b/htsworkflow/templates/geo_platform.sparql new file mode 100644 index 0000000..4d224d7 --- /dev/null +++ b/htsworkflow/templates/geo_platform.sparql @@ -0,0 +1,14 @@ +PREFIX libraryOntology: +PREFIX submissionOntology: +PREFIX ucscDaf: +PREFIX ncbiTaxon: +PREFIX geoSoft: + + +select distinct ?name ?value +WHERE { + <{{submission}}> submissionOntology:has_platform ?platform . 
+ ?platform a geoSoft:platform . + + ?platform ?name ?value . +} diff --git a/htsworkflow/templates/geo_samples.sparql b/htsworkflow/templates/geo_samples.sparql new file mode 100644 index 0000000..850d99a --- /dev/null +++ b/htsworkflow/templates/geo_samples.sparql @@ -0,0 +1,41 @@ +PREFIX libraryOntology: +PREFIX submissionOntology: +PREFIX ucscDaf: +PREFIX ncbiTaxon: +PREFIX geoSoft: +PREFIX cells: + +select distinct ?name ?cell ?antibody ?sex ?control ?strain ?controlId ?library_id ?treatment ?protocol ?readType ?insertLength ?replicate, ?mapAlgorithm ?species_name ?taxon_id ?extractMolecule ?growthProtocol ?extractProtocol ?dataProtocol ?experiment_type ?library_selection ?library_source +WHERE { + <{{submission}}> a submissionOntology:submission . + + OPTIONAL { <{{submission}}> ucscDaf:control ?control } + OPTIONAL { <{{submission}}> ucscDaf:controlId ?controlId } + OPTIONAL { ?library libraryOntology:antibody ?antibody } + OPTIONAL { ?library libraryOntology:cell_line ?cell . + ?cell_line cells:cell ?cell ; + cells:documents ?growthProtocol . 
} + OPTIONAL { ?library ucscDaf:sex ?sex } + OPTIONAL { ?library libraryOntology:library_id ?library_id } + OPTIONAL { ?library libraryOntology:replicate ?replicate } + OPTIONAL { ?library libraryOntology:species ?species_name } + OPTIONAL { ?library libraryOntology:condition_term ?treatment } + OPTIONAL { ?library libraryOntology:experiment_type ?experiment_type } + OPTIONAL { ?library libraryOntology:librarySelection ?library_selection } + OPTIONAL { ?library libraryOntology:librarySource ?library_source } + OPTIONAL { <{{submissionSet}}> geoSoft:data_processing ?dataProtocol } + OPTIONAL { ?library libraryOntology:extractMolecule ?extractMolecule } + OPTIONAL { ?library libraryOntology:extractProtocol ?extractProtocol } + OPTIONAL { ?library ucscDaf:protocol ?protocol } + OPTIONAL { ?library ucscDaf:readType ?readType } + OPTIONAL { ?library ucscDaf:strain ?strain } + OPTIONAL { ?library libraryOntology:insert_size ?insertLength } + OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm } + + <{{submission}}> submissionOntology:library ?library ; + submissionOntology:name ?name . + ?species libraryOntology:species ?species_name ; + libraryOntology:taxon_id ?taxon_id . + + +} \ No newline at end of file diff --git a/htsworkflow/templates/geo_series.sparql b/htsworkflow/templates/geo_series.sparql new file mode 100644 index 0000000..815f311 --- /dev/null +++ b/htsworkflow/templates/geo_series.sparql @@ -0,0 +1,14 @@ +PREFIX libraryOntology: +PREFIX submissionOntology: +PREFIX ucscDaf: +PREFIX ncbiTaxon: +PREFIX geoSoft: + + +select distinct ?name ?value +WHERE { + <{{submission}}> submissionOntology:has_series ?series. + ?series a geoSoft:series . + + ?series ?name ?value . 
+} diff --git a/htsworkflow/templates/geo_submission.soft b/htsworkflow/templates/geo_submission.soft index ae2ac57..00c0b4d 100644 --- a/htsworkflow/templates/geo_submission.soft +++ b/htsworkflow/templates/geo_submission.soft @@ -1,11 +1,39 @@ -Soft template -!Platform_title = Illumina Genome Analyzer (Homo sapiens) -!Platform_geo_accession = GPL9052 -{% for sample in samples %}{% for row in sample %}{%if forloop.first %} +{% for name, value in series %}{{name}} = {{value}} +{% endfor %}!Series_platform_id = {{ platform_id }} +{% for row in samples %} ^SAMPLE={{row.name}} +!Sample_type=SRA !Sample_title={{row.name}} +!Sample_series_id = {{ series_id }} +!Sample_instrument_model = Illumina Genome Analyzer +!Sample_instrument_model = Illumina Genome Analyzer II +!Sample_instrument_model = Illumina Genome Analyzer IIx +!Sample_instrument_model = Illumina HiSeq 2000 +!Sample_channel_count = 1 !Sample_organism_ch1 = {{ row.species_name }} !Sample_taxid_ch1 = {{ row.taxon_id }} -{% spaceless %}{% if row.cell %}!Sample_characteristics_ch1 = cell: {{ row.cell }} -{% endif %}{% endspaceless %}{% endif %} -!Sample_supplementary_file_{{forloop.counter}}={{row.filename}}{% endfor %}{% endfor %} \ No newline at end of file +!Sample_platform_id = {{ platform_id }} +!Sample_source_name_ch1={{row.cell}} +!Sample_library_strategy={{ row.experiment_type }} +!Sample_library_source={{row.library_source}} +!Sample_library_selection={{ row.library_selection }} +!Sample_growth_protocol_ch1={{ row.growthProtocol|safe }} +!Sample_extract_protocol={{ row.extractProtocol|safe }} +!Sample_data_processing={{ row.dataProtocol|safe }} +!Sample_molecule_ch1 = {{ row.extractMolecule }} +!Sample_characteristics_ch1 = labExpId: {{ row.library_id }} +!Sample_characteristics_ch1 = replicate: {{ row.replicate }} +{% if row.cell %}{% spaceless %} +!Sample_characteristics_ch1 = cell: {{ row.cell }} +{% endspaceless %}{% endif %} +{% if row.readType %}{% spaceless %} +!Sample_characteristics_ch1 = 
readType: {{ row.readType }} +{% endspaceless %}{% endif %}{% if row.antibody %}{% spaceless %} +!Sample_characteristics_ch1 = antibody: {{ row.antibody }} +{% endspaceless %}{% endif %}{% for raw in row.raw %} +!Sample_raw_file_{{forloop.counter}}={{raw.filename}} +!Sample_raw_file_type_{{forloop.counter}}={{raw.file_type_label}} +!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}{% endfor %}{% for sup in row.supplimental %} +!Sample_supplementary_file_{{forloop.counter}}={{sup.filename}} +!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}} +{% endfor %}{% endfor %} \ No newline at end of file diff --git a/htsworkflow/templates/geo_submission.sparql b/htsworkflow/templates/geo_submission.sparql deleted file mode 100644 index 1d7cbb1..0000000 --- a/htsworkflow/templates/geo_submission.sparql +++ /dev/null @@ -1,33 +0,0 @@ -PREFIX libraryOntology: -PREFIX submissionOntology: -PREFIX ucscDaf: -PREFIX ncbiTaxon: - -select distinct ?name ?filename ?md5sum ?cell ?antibody ?sex ?control ?strain ?controlId ?labExpId ?labVersion ?treatment ?protocol ?readType ?insertLength ?replicate, ?mapAlgorithm ?species_name ?taxon_id -WHERE { - <{{submission}}> a submissionOntology:submission . - - OPTIONAL { ?submission ucscDaf:control ?control } - #OPTIONAL { ?submission ucscDaf:controlId ?controlId } - #OPTIONAL { ?library libraryOntology:antibody ?antibody } - OPTIONAL { ?library libraryOntology:cell_line ?cell } .
- #OPTIONAL { ?library ucscDaf:sex ?sex } - OPTIONAL { ?library libraryOntology:library_id ?labExpId } - OPTIONAL { ?library libraryOntology:library_id ?labVersion } - OPTIONAL { ?library libraryOntology:replicate ?replicate } - OPTIONAL { ?library libraryOntology:species ?species_name } - - #OPTIONAL { ?library libraryOntology:condition_term ?treatment } - #OPTIONAL { ?library ucscDaf:protocol ?protocol } - #OPTIONAL { ?library ucscDaf:readType ?readType } - #OPTIONAL { ?library ucscDaf:strain ?strain } - #OPTIONAL { ?library libraryOntology:insert_size ?insertLength } - #OPTIONAL { ?library ucscDaf:mapAlgorithm ?mapAlgorithm } - - <{{submission}}> submissionOntology:library ?library ; - ucscDaf:has_file ?file ; - submissionOntology:name ?name . - ?species libraryOntology:species ?species_name ; - libraryOntology:taxon_id ?taxon_id . - ?file ucscDaf:filename ?filename . -} -- 2.30.2