errmsg = 'Confused there are more than one samples for %s'
LOGGER.debug(errmsg % (str(an_analysis,)))
metadata = metadata[0]
- metadata['raw'] = self.get_sample_files(an_analysis,
- geoSoftNS['raw'])
- metadata['supplimental'] = self.get_sample_files(
- an_analysis,
- geoSoftNS['supplemental'])
+ metadata['raw'] = self.get_raw_files(an_analysis)
+ metadata['supplimental'] = self.get_sample_files(an_analysis)
metadata['run'] = self.get_run_details(an_analysis)
samples.append(metadata)
return results
- def get_sample_files(self, analysis_node, file_class):
- """Gather files
+ def get_sample_files(self, analysis_node):
+ """Gather derived files
+
+ Renders the geo_files.sparql template for the submission at
+ analysis_node.uri, restricted to the geoSoft 'supplemental' file
+ class, and returns whatever execute_query returns for it.
"""
query_template = loader.get_template('geo_files.sparql')
context = Context({
'submission': str(analysis_node.uri),
- 'file_class': str(file_class)
+ 'file_class': str(geoSoftNS['supplemental'])
})
return self.execute_query(query_template, context)
+    def get_raw_files(self, analysis_node):
+        """Gather raw data e.g. fastq files.
+
+        Runs the geo_fastqs.sparql query for the submission at
+        analysis_node.uri (file class geoSoft 'raw') and groups the
+        returned rows by their 'lane' value.  Each lane is then collapsed
+        into a single record: two rows are treated as the two reads of a
+        paired run, one row as a single-read run.
+
+        Returns a list holding one dictionary per lane.
+
+        Raises ValueError when a lane has more than two raw files.
+        """
+        query_template = loader.get_template('geo_fastqs.sparql')
+
+        context = Context({
+            'submission': str(analysis_node.uri),
+            'file_class': str(geoSoftNS['raw']),
+        })
+
+        # Group the rows by lane.  A lane only appears in the mapping once
+        # a row has been appended for it, so no lane can have zero files.
+        lanes = {}
+        for row in self.execute_query(query_template, context):
+            data = dict(row.items())
+            lanes.setdefault(str(data['lane']), []).append(data)
+
+        result = []
+        for files in lanes.values():
+            if len(files) > 2:
+                errmsg = "Don't know what to do with more than 2 raw files"
+                raise ValueError(errmsg)
+            # exactly two files on one lane are the two reads of a pair
+            is_paired = len(files) == 2
+            files = self._format_filename(files, is_paired)
+            files = self._format_flowcell_type(files, is_paired)
+            files = self._format_read_length(files, is_paired)
+            # _format_filename folds a pair into the first record
+            result.append(files[0])
+        return result
+
+    def _format_flowcell_type(self, files, is_paired):
+        """Rewrite flowcell_type into the single_or_paired-end vocabulary.
+
+        Helper for get_raw_files.  Records without a 'flowcell_type' key,
+        or whose value converts to None, are left untouched.  A value of
+        "paired" (any letter case) becomes 'paired-end'; any other value
+        becomes 'single'.  is_paired is accepted but not consulted.
+
+        The records are modified in place and the same list is returned.
+        """
+        for record in files:
+            if 'flowcell_type' not in record:
+                continue
+            kind = fromTypedNode(record['flowcell_type'])
+            if kind is None:
+                continue
+            if kind.lower() == "paired":
+                record['flowcell_type'] = 'paired-end'
+            else:
+                record['flowcell_type'] = 'single'
+
+        return files
+
+    def _format_read_length(self, files, is_paired):
+        """Format read_length as a comma separated value, one entry per read.
+
+        Helper for get_raw_files.  For every record that has a
+        'read_length' key the converted value is repeated twice when
+        is_paired is true and once otherwise, joined with ",".
+
+        The records are modified in place and the same list is returned.
+        """
+        if is_paired:
+            repeats = 2
+        else:
+            repeats = 1
+        for record in files:
+            if 'read_length' not in record:
+                continue
+            length = str(fromTypedNode(record['read_length']))
+            record['read_length'] = ",".join(repeats * [length])
+        return files
+
+    def _format_filename(self, files, is_paired):
+        """Format the filename and md5sum values for get_raw_files.
+
+        When files holds exactly two records, their 'filename' and
+        'md5sum' values are merged into the first record as "a, b" and the
+        second record is removed from the list.  For any other length only
+        the first record's two values are converted with str().
+        is_paired is accepted but not consulted; the list length decides.
+
+        The list is modified in place and returned.
+        """
+        first = files[0]
+        if len(files) != 2:
+            for key in ('filename', 'md5sum'):
+                first[key] = str(first[key])
+            return files
+
+        # two files on one lane: fold the second read into the first
+        second = files[1]
+        for key in ('filename', 'md5sum'):
+            first[key] = "%s, %s" % (str(first[key]), str(second[key]))
+        files.pop()
+        return files
+
+
def get_run_details(self, analysis_node):
"""Get information about runs
"""
--- /dev/null
+PREFIX libraryOntology: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+PREFIX submissionOntology: <http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#>
+PREFIX ucscDaf: <http://jumpgate.caltech.edu/wiki/UcscDaf#>
+PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
+PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
+
+# Raw files attached to the lanes of a submission's libraries, together
+# with details of the flowcell each lane was run on.  The submission and
+# file_class template variables are filled in by the caller.
+select distinct ?lane ?filename ?md5sum ?file_type ?file_type_label ?flowcell_id ?read_length ?flowcell_type ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
+WHERE {
+  <{{submission}}> submissionOntology:library ?library ;
+                   a submissionOntology:submission .
+
+  ?file ucscDaf:filename ?filename ;
+        ucscDaf:md5sum ?md5sum ;
+        libraryOntology:has_lane ?lane ;
+        a ?file_type .
+  ?file_type a <{{file_class}}> ;
+             geoSoft:fileTypeLabel ?file_type_label .
+
+  # files are tied to the submission through a lane shared with its library
+  ?library libraryOntology:has_lane ?lane .
+  ?lane libraryOntology:flowcell ?flowcell .
+  ?flowcell libraryOntology:flowcell_id ?flowcell_id ;
+            libraryOntology:read_length ?read_length ;
+            libraryOntology:flowcell_type ?flowcell_type .
+  # each OPTIONAL group binds its variables only when the whole group matches
+  OPTIONAL { ?flowcell libraryOntology:image_software ?image_software ;
+                       libraryOntology:image_version ?image_version . }
+  OPTIONAL { ?flowcell libraryOntology:basecall_software ?basecall_software ;
+                       libraryOntology:basecall_version ?basecall_version . }
+  OPTIONAL { ?flowcell libraryOntology:sequenced_by ?sequencer .
+             ?sequencer libraryOntology:sequencer_model ?sequencer_model . }
+}
+
PREFIX ncbiTaxon: <http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=>
PREFIX geoSoft: <http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#>
-select distinct ?filename, ?md5sum, ?file_type ?file_type_label
+select distinct ?filename, ?md5sum, ?file_type ?file_type_label ?flowcell_id ?read_length ?flowcell_type ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
WHERE {
<{{submission}}> ucscDaf:has_file ?file ;
a submissionOntology:submission .
?file ucscDaf:filename ?filename ;
ucscDaf:md5sum ?md5sum ;
+ libraryOntology:has_lane ?lane ;
a ?file_type .
?file_type a <{{file_class}}> ;
geoSoft:fileTypeLabel ?file_type_label .
+ # NOTE(review): has_lane above is a required pattern, so a file with no
+ # lane does not match at all.  The flowcell details below form a single
+ # OPTIONAL group and are bound only when every pattern in it matches.
+ # Confirm both are intended for supplemental files.
+ OPTIONAL { ?lane libraryOntology:flowcell ?flowcell .
+ ?flowcell libraryOntology:flowcell_id ?flowcell_id ;
+ libraryOntology:read_length ?read_length ;
+ libraryOntology:flowcell_type ?flowcell_type ;
+ libraryOntology:image_software ?image_software ;
+ libraryOntology:image_version ?image_version ;
+ libraryOntology:basecall_software ?basecall_software ;
+ libraryOntology:basecall_version ?basecall_version ;
+ libraryOntology:sequenced_by ?sequencer .
+ ?sequencer libraryOntology:sequencer_model ?sequencer_model
+ }
}
\ No newline at end of file
# right now we're just grabbing the sequencer model
# it might make sense to report each of the flowcell/image software.
-#select distinct ?flowcell ?image_software ?image_version ?basecall_software ?basecall_version ?sequencer_model
-select distinct ?sequencer_model
+select ?flowcell ?read_length ?image_software ?image_version ?basecall_software ?basecall_version
where {
<{{submission}}> submissionOntology:library ?library ;
a submissionOntology:submission .
libraryOntology:has_lane ?lane ;
a libraryOntology:library .
OPTIONAL { ?flowcell libraryOntology:has_lane ?lane .
- ?flowcell libraryOntology:image_software ?image_software ;
+ ?flowcell libraryOntology:read_length ?read_length ;
+ libraryOntology:image_software ?image_software ;
libraryOntology:image_version ?image_version ;
libraryOntology:basecall_software ?basecall_software ;
libraryOntology:basecall_version ?basecall_version ;
{% for name, value in series %}{{name}} = {{value}}
-{% endfor %}!Series_platform_id = {{ platform_id }}
+{% endfor %}
{% for row in samples %}
^SAMPLE={{row.name}}
!Sample_type=SRA
!Sample_title={{row.name}}
-!Sample_series_id = {{ series_id }}{% for run in row.run %}
-!Sample_instrument_model = {{ run.sequencer_model }}{% endfor %}
-!Sample_channel_count = 1
-!Sample_organism_ch1 = {{ row.species_name }}
-!Sample_taxid_ch1 = {{ row.taxon_id }}
-!Sample_platform_id = {{ platform_id }}
+!Sample_series_id={{ series_id }}
+!Sample_channel_count=1
+!Sample_organism_ch1={{ row.species_name }}
+!Sample_taxid_ch1={{ row.taxon_id }}
!Sample_source_name_ch1={{row.cell}}
!Sample_library_strategy={{ row.experiment_type }}
!Sample_library_source={{row.library_source}}
!Sample_growth_protocol_ch1={{ row.growthProtocol|safe }}
!Sample_extract_protocol={{ row.extractProtocol|safe }}
!Sample_data_processing={{ row.dataProtocol|safe }}
-!Sample_molecule_ch1 = {{ row.extractMolecule }}
-!Sample_characteristics_ch1 = labExpId: {{ row.library_id }}
-!Sample_characteristics_ch1 = replicate: {{ row.replicate }}
+!Sample_molecule_ch1={{ row.extractMolecule }}
+!Sample_characteristics_ch1=labExpId: {{ row.library_id }}
+!Sample_characteristics_ch1=replicate: {{ row.replicate }}
{% if row.cell %}{% spaceless %}
-!Sample_characteristics_ch1 = cell: {{ row.cell }}
+!Sample_characteristics_ch1=cell: {{ row.cell }}
{% endspaceless %}{% endif %}
{% if row.readType %}{% spaceless %}
-!Sample_characteristics_ch1 = readType: {{ row.readType }}
+!Sample_characteristics_ch1=readType: {{ row.readType }}
{% endspaceless %}{% endif %}{% if row.antibody %}{% spaceless %}
-!Sample_characteristics_ch1 = cell: {{ row.antibody }}
-{% endspaceless %}{% endif %}{% for raw in row.raw %}
-!Sample_raw_file_{{forloop.counter}}={{raw.filename}}
+!Sample_characteristics_ch1=antibody: {{ row.antibody }}
+{% endspaceless %}{% endif %}{% for run in row.run %}{# one pair of pipeline version lines per run #}
+!Sample_characteristics_ch1=Illumina image processing pipeline version: {{ run.image_software }}-{{ run.image_version }}
+!Sample_characteristics_ch1=Illumina base-calling pipeline version: {{ run.basecall_software }}-{{ run.basecall_version }}{% endfor %}{% for raw in row.raw %}{# per-file labels end in the loop counter #}
+!Sample_raw_file_{{forloop.counter}}={{ raw.filename }}
!Sample_raw_file_type_{{forloop.counter}}={{raw.file_type_label}}
-!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}{% endfor %}{% for sup in row.supplimental %}
+!Sample_raw_file_insert_size_{{forloop.counter}}={{ row.insertLength }}
+!Sample_raw_file_read_length_{{forloop.counter}}={{raw.read_length}}
+!Sample_raw_file_instrument_model_{{forloop.counter}}={{raw.sequencer_model}}
+!Sample_raw_file_checksum_{{forloop.counter}}={{raw.md5sum}}
+!Sample_raw_file_single_or_paired-end_{{forloop.counter}}={{raw.flowcell_type}}{% endfor %}{% for sup in row.supplimental %}
!Sample_supplementary_file_{{forloop.counter}}={{sup.filename}}
!Sample_supplementary_file_checksum_{{forloop.counter}}={{sup.md5sum}}
-{% endfor %}{% endfor %}
\ No newline at end of file
+{% endfor %}{% endfor %}