From b20de36b64c3de26b259c55c65a84d2f1513124d Mon Sep 17 00:00:00 2001
From: Diane Trout
Date: Wed, 1 Aug 2012 16:18:13 -0700
Subject: [PATCH] GEO wanted both files for a paired lane to be on a single line.

This required a bit of formatting code to massage the results
of the sparql query into an acceptable form.
---
Review notes (not part of the commit message):

* geo_submission.soft: the "base-calling pipeline version" line now reports
  run.basecall_software / run.basecall_version; it previously repeated the
  image_software / image_version values.
* geo_submission.soft: the single_or_paired-end attribute is now spelled
  !Sample_raw_file_single_or_paired-end_N like its sibling raw file
  attributes (capital S, per-file counter).
* geo_submission.soft: the antibody characteristic is labelled "antibody:"
  instead of "cell:".
* geo.py: dropped the unreachable "len(files) == 0" branch in get_raw_files
  (every lane list is created by setdefault().append()) and expanded the
  helper docstrings.  Hunk header and diffstat were updated to match.
* NOTE(review): in this copy the PREFIX declarations have no IRI part and
  the From: header has no address; it looks like angle-bracketed text was
  stripped in transit.  Confirm against the repository before applying.
* NOTE(review): geo_files.sparql now requires libraryOntology:has_lane on
  every file; presumably supplemental files without a lane would no longer
  be returned.  Verify that this is intended.
* NOTE(review): leading whitespace on context lines was reconstructed and
  the index hashes are carried over unchanged; apply with
  "git apply --ignore-whitespace" or regenerate the patch.

 htsworkflow/submission/geo.py                | 99 ++++++++++++++++++--
 htsworkflow/templates/geo_fastqs.sparql      | 31 ++++++
 htsworkflow/templates/geo_files.sparql       | 14 ++-
 htsworkflow/templates/geo_run_details.sparql |  6 +-
 htsworkflow/templates/geo_submission.soft    | 38 ++++----
 5 files changed, 159 insertions(+), 29 deletions(-)
 create mode 100644 htsworkflow/templates/geo_fastqs.sparql

diff --git a/htsworkflow/submission/geo.py b/htsworkflow/submission/geo.py
index ad66422..a3ac2f1 100644
--- a/htsworkflow/submission/geo.py
+++ b/htsworkflow/submission/geo.py
@@ -35,11 +35,8 @@ class GEOSubmission(Submission):
                 errmsg = 'Confused there are more than one samples for %s'
                 LOGGER.debug(errmsg % (str(an_analysis,)))
             metadata = metadata[0]
-            metadata['raw'] = self.get_sample_files(an_analysis,
-                                                    geoSoftNS['raw'])
-            metadata['supplimental'] = self.get_sample_files(
-                an_analysis,
-                geoSoftNS['supplemental'])
+            metadata['raw'] = self.get_raw_files(an_analysis)
+            metadata['supplimental'] = self.get_sample_files(an_analysis)
             metadata['run'] = self.get_run_details(an_analysis)
             samples.append(metadata)
 
@@ -103,18 +100,104 @@ class GEOSubmission(Submission):
 
         return results
 
-    def get_sample_files(self, analysis_node, file_class):
-        """Gather files
+    def get_sample_files(self, analysis_node):
+        """Gather derived files
         """
         query_template = loader.get_template('geo_files.sparql')
 
         context = Context({
             'submission': str(analysis_node.uri),
-            'file_class': str(file_class)
+            'file_class': str(geoSoftNS['supplemental'])
             })
 
         return self.execute_query(query_template, context)
 
+    def get_raw_files(self, analysis_node):
+        """Gather raw data e.g. fastq files, one record per lane.
+
+        GEO wants both files of a paired lane on a single line, so the
+        query rows are grouped by lane and each group is merged into
+        its first record by the _format_* helpers.
+
+        Raises ValueError if a lane has more than two raw files.
+        """
+        query_template = loader.get_template('geo_fastqs.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            'file_class': str(geoSoftNS['raw']),
+            })
+
+        lanes = {}
+        for row in self.execute_query(query_template, context):
+            data = dict(row.items())
+            lanes.setdefault(str(data['lane']), []).append(data)
+
+        result = []
+        for files in lanes.values():
+            if len(files) > 2:
+                errmsg = "Don't know what to do with more than 2 raw files"
+                raise ValueError(errmsg)
+            # setdefault().append() above means a lane never has 0 files
+            is_paired = len(files) == 2
+            files = self._format_filename(files, is_paired)
+            files = self._format_flowcell_type(files, is_paired)
+            files = self._format_read_length(files, is_paired)
+            result.append(files[0])
+        return result
+
+    def _format_flowcell_type(self, files, is_paired):
+        """Used by get_raw_files to format value for single_or_paired-end
+
+        Rewrites each record's flowcell_type to 'paired-end' or 'single';
+        records with no flowcell_type, or one that converts to None, are
+        left untouched.  is_paired is currently unused.
+        """
+        for f in files:
+            if 'flowcell_type' not in f:
+                continue
+            flowcell_type = fromTypedNode(f['flowcell_type'])
+            if flowcell_type is None:
+                continue
+            if flowcell_type.lower() == "paired":
+                f['flowcell_type'] = 'paired-end'
+            else:
+                f['flowcell_type'] = 'single'
+
+        return files
+
+    def _format_read_length(self, files, is_paired):
+        """Used by get_raw_files to format the read_length value
+
+        The length is repeated once per read, comma separated, so a
+        paired lane reports the same length twice (e.g. "50,50").
+        """
+        read_count = 2 if is_paired else 1
+        for f in files:
+            if 'read_length' in f:
+                read_length = str(fromTypedNode(f['read_length']))
+                f['read_length'] = ",".join([read_length] * read_count)
+        return files
+
+    def _format_filename(self, files, is_paired):
+        """Used by get_raw_files to merge filenames and md5sums
+
+        When there are two records the second one's filename and md5sum
+        are appended to the first record and the second record is
+        deleted; otherwise the first record's values are just stringified.
+        """
+        if len(files) == 2:
+            f0, f1 = files
+            f0['filename'] = "%s, %s" % (str(f0['filename']),
+                                         str(f1['filename']))
+            f0['md5sum'] = "%s, %s" % (str(f0['md5sum']),
+                                       str(f1['md5sum']))
+            del files[1]
+        else:
+            files[0]['filename'] = str(files[0]['filename'])
+            files[0]['md5sum'] = str(files[0]['md5sum'])
+        return files
+
     def get_run_details(self, analysis_node):
         """Get information about runs
         """
diff --git a/htsworkflow/templates/geo_fastqs.sparql b/htsworkflow/templates/geo_fastqs.sparql
new file mode 100644
index 0000000..e7fcbc1
--- /dev/null
+++ b/htsworkflow/templates/geo_fastqs.sparql
@@ -0,0 +1,31 @@
+PREFIX libraryOntology:
+PREFIX submissionOntology:
+PREFIX ucscDaf:
+PREFIX ncbiTaxon:
+PREFIX geoSoft:
+
+select distinct ?lane ?filename, ?md5sum, ?file_type ?file_type_label ?flowcell_id ?read_length ?flowcell_type ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
+WHERE {
+  <{{submission}}> submissionOntology:library ?library ;
+                   a submissionOntology:submission .
+
+  ?file ucscDaf:filename ?filename ;
+        ucscDaf:md5sum ?md5sum ;
+        libraryOntology:has_lane ?lane ;
+        a ?file_type .
+  ?file_type a <{{file_class}}> ;
+             geoSoft:fileTypeLabel ?file_type_label .
+
+  ?library libraryOntology:has_lane ?lane .
+  ?lane libraryOntology:flowcell ?flowcell .
+  ?flowcell libraryOntology:flowcell_id ?flowcell_id ;
+            libraryOntology:read_length ?read_length ;
+            libraryOntology:flowcell_type ?flowcell_type ;
+  OPTIONAL { ?flowcell libraryOntology:image_software ?image_software ;
+                       libraryOntology:image_version ?image_version . }
+  OPTIONAL {?flowcell libraryOntology:basecall_software ?basecall_software ;
+                      libraryOntology:basecall_version ?basecall_version . }
+  OPTIONAL {?flowcell libraryOntology:sequenced_by ?sequencer .
+            ?sequencer libraryOntology:sequencer_model ?sequencer_model . }
+}
+
diff --git a/htsworkflow/templates/geo_files.sparql b/htsworkflow/templates/geo_files.sparql
index 7b66f4f..e3fcb9d 100644
--- a/htsworkflow/templates/geo_files.sparql
+++ b/htsworkflow/templates/geo_files.sparql
@@ -4,15 +4,27 @@ PREFIX ucscDaf:
 PREFIX ncbiTaxon:
 PREFIX geoSoft:
 
-select distinct ?filename, ?md5sum, ?file_type ?file_type_label
+select distinct ?filename, ?md5sum, ?file_type ?file_type_label ?flowcell_id ?read_length ?flowcell_type ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
 WHERE {
   <{{submission}}> ucscDaf:has_file ?file ;
                    a submissionOntology:submission .
 
   ?file ucscDaf:filename ?filename ;
         ucscDaf:md5sum ?md5sum ;
+        libraryOntology:has_lane ?lane ;
         a ?file_type .
   ?file_type a <{{file_class}}> ;
              geoSoft:fileTypeLabel ?file_type_label .
+  OPTIONAL { ?lane libraryOntology:flowcell ?flowcell .
+             ?flowcell libraryOntology:flowcell_id ?flowcell_id ;
+                       libraryOntology:read_length ?read_length ;
+                       libraryOntology:flowcell_type ?flowcell_type ;
+                       libraryOntology:image_software ?image_software ;
+                       libraryOntology:image_version ?image_version ;
+                       libraryOntology:basecall_software ?basecall_software ;
+                       libraryOntology:basecall_version ?basecall_version ;
+                       libraryOntology:sequenced_by ?sequencer .
+             ?sequencer libraryOntology:sequencer_model ?sequencer_model
+  }
 
 }
\ No newline at end of file
diff --git a/htsworkflow/templates/geo_run_details.sparql b/htsworkflow/templates/geo_run_details.sparql
index aa7e438..dc97107 100644
--- a/htsworkflow/templates/geo_run_details.sparql
+++ b/htsworkflow/templates/geo_run_details.sparql
@@ -7,8 +7,7 @@ PREFIX cells:
 
 # right now we're just grabbing the sequencer model
 # it might make sense to report each of the flowcell/image software.
-#select distinct ?flowcell ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
-select distinct ?sequencer_model
+select ?flowcell ?read_length ?image_software ?image_version ?basecall_software ?basecall_version
 where {
   <{{submission}}> submissionOntology:library ?library ;
                    a submissionOntology:submission .
@@ -17,7 +16,8 @@ where {
            libraryOntology:has_lane ?lane ;
            a libraryOntology:library .
   OPTIONAL { ?flowcell libraryOntology:has_lane ?lane .
-             ?flowcell libraryOntology:image_software ?image_software ;
+             ?flowcell libraryOntology:read_length ?read_length ;
+                       libraryOntology:image_software ?image_software ;
                        libraryOntology:image_version ?image_version ;
                        libraryOntology:basecall_software ?basecall_software ;
                        libraryOntology:basecall_version ?basecall_version ;
diff --git a/htsworkflow/templates/geo_submission.soft b/htsworkflow/templates/geo_submission.soft
index 3541e56..19e6faa 100644
--- a/htsworkflow/templates/geo_submission.soft
+++ b/htsworkflow/templates/geo_submission.soft
@@ -1,15 +1,13 @@
 {% for name, value in series %}{{name}} = {{value}}
-{% endfor %}!Series_platform_id = {{ platform_id }}
+{% endfor %}
 {% for row in samples %}
 ^SAMPLE={{row.name}}
 !Sample_type=SRA
 !Sample_title={{row.name}}
-!Sample_series_id = {{ series_id }}{% for run in row.run %}
-!Sample_instrument_model = {{ run.sequencer_model }}{% endfor %}
-!Sample_channel_count = 1
-!Sample_organism_ch1 = {{ row.species_name }}
-!Sample_taxid_ch1 = {{ row.taxon_id }}
-!Sample_platform_id = {{ platform_id }}
+!Sample_series_id={{ series_id }}
+!Sample_channel_count=1
+!Sample_organism_ch1={{ row.species_name }}
+!Sample_taxid_ch1={{ row.taxon_id }}
 !Sample_source_name_ch1={{row.cell}}
 !Sample_library_strategy={{ row.experiment_type }}
 !Sample_library_source={{row.library_source}}
@@ -17,20 +15,26 @@
 !Sample_growth_protocol_ch1={{ row.growthProtocol|safe }}
 !Sample_extract_protocol={{ row.extractProtocol|safe }}
 !Sample_data_processing={{ row.dataProtocol|safe }}
-!Sample_molecule_ch1 = {{ row.extractMolecule }}
-!Sample_characteristics_ch1 = labExpId: {{ row.library_id }}
-!Sample_characteristics_ch1 = replicate: {{ row.replicate }}
+!Sample_molecule_ch1={{ row.extractMolecule }}
+!Sample_characteristics_ch1=labExpId: {{ row.library_id }}
+!Sample_characteristics_ch1=replicate: {{ row.replicate }}
 {% if row.cell %}{% spaceless %}
-!Sample_characteristics_ch1 = cell: {{ row.cell }}
+!Sample_characteristics_ch1=cell: {{ row.cell }}
 {% endspaceless %}{% endif %}
 {% if row.readType %}{% spaceless %}
-!Sample_characteristics_ch1 = readType: {{ row.readType }}
+!Sample_characteristics_ch1=readType: {{ row.readType }}
 {% endspaceless %}{% endif %}{% if row.antibody %}{% spaceless %}
-!Sample_characteristics_ch1 = cell: {{ row.antibody }}
-{% endspaceless %}{% endif %}{% for raw in row.raw %}
-!Sample_raw_file_{{forloop.counter}}={{raw.filename}}
+!Sample_characteristics_ch1=antibody: {{ row.antibody }}
+{% endspaceless %}{% endif %}{% for run in row.run %}
+!Sample_characteristics_ch1=Illumina image processing pipeline version: {{ run.image_software }}-{{ run.image_version }}
+!Sample_characteristics_ch1=Illumina base-calling pipeline version: {{ run.basecall_software }}-{{ run.basecall_version }}{% endfor %}{% for raw in row.raw %}
+!Sample_raw_file_{{forloop.counter}}={{ raw.filename }}
 !Sample_raw_file_type_{{forloop.counter}}={{raw.file_type_label}}
-!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}{% endfor %}{% for sup in row.supplimental %}
+!Sample_raw_file_insert_size_{{forloop.counter}}={{ row.insertLength }}
+!Sample_raw_file_read_length_{{forloop.counter}}={{raw.read_length}}
+!Sample_raw_file_instrument_model_{{forloop.counter}}={{raw.sequencer_model}}
+!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}
+!Sample_raw_file_single_or_paired-end_{{forloop.counter}}={{raw.flowcell_type}}{% endfor %}{% for sup in row.supplimental %}
 !Sample_supplementary_file_{{forloop.counter}}={{sup.filename}}
 !Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}
-{% endfor %}{% endfor %}
\ No newline at end of file
+{% endfor %}{% endfor %}
-- 
2.30.2