1 from __future__ import print_function
8 from htsworkflow.submission.submission import Submission
10 from htsworkflow.util.rdfhelp import \
16 from django.conf import settings
17 from django.template import Context, loader
19 LOGGER = logging.getLogger(__name__)
21 class GEOSubmission(Submission):
def __init__(self, name, model, host):
    """Create a GEO submission.

    The three arguments are handed, unchanged and in the same order, to
    the ``Submission`` base-class initializer; nothing else is set up
    in this method.
    """
    parent = super(GEOSubmission, self)
    parent.__init__(name, model, host)
25 def make_soft(self, result_map):
27 platform = self.get_platform_metadata()
28 platform_attribs = dict(platform)
29 platform_id = platform_attribs['^platform']
30 series = self.get_series_metadata()
31 series_attribs = dict(series)
32 series_id = series_attribs['^series']
33 for lib_id, result_dir in result_map.items():
34 an_analysis = self.get_submission_node(result_dir)
35 metadata = self.get_sample_metadata(an_analysis)
36 if len(metadata) == 0:
37 errmsg = 'No metadata found for {0}'
38 LOGGER.error(errmsg.format(str(an_analysis),))
40 elif len(metadata) > 1:
41 errmsg = 'Confused there are more than one sample for %s'
42 LOGGER.debug(errmsg % (str(an_analysis),))
43 metadata = metadata[0]
44 metadata['raw'] = self.get_raw_files(an_analysis)
45 metadata['supplimental'] = self.get_sample_files(an_analysis)
46 metadata['run'] = self.get_run_details(an_analysis)
47 samples.append(metadata)
49 soft_template = loader.get_template('geo_submission.soft')
54 'platform_id': platform_id,
55 'series_id': series_id,
57 print(str(soft_template.render(context)))
59 def check_for_name(self, analysis_node):
61 self.model.get_target(analysis_node,
62 submissionOntology['name']))
64 logger.error("Need name for %s" % (str(analysis_node)))
69 def get_platform_metadata(self):
70 """Gather information for filling out sample section of a SOFT file
72 query_template = loader.get_template('geo_platform.sparql')
73 submission = str(self.submissionSetNS[''].uri)
75 'submission': submission,
78 results = self.execute_query(query_template, context)
79 return self.query_to_soft_dictionary(results, 'platform')
81 def get_series_metadata(self):
82 """Gather information for filling out sample section of a SOFT file
84 query_template = loader.get_template('geo_series.sparql')
85 submission = str(self.submissionSetNS[''].uri)
87 'submission': submission,
90 results = self.execute_query(query_template, context)
91 return self.query_to_soft_dictionary(results, 'series')
93 def get_sample_metadata(self, analysis_node):
94 """Gather information for filling out sample section of a SOFT file
96 query_template = loader.get_template('geo_samples.sparql')
99 'submission': str(analysis_node.uri),
100 'submissionSet': str(self.submissionSetNS[''].uri),
103 results = self.execute_query(query_template, context)
105 r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ')
109 def get_sample_files(self, analysis_node):
110 """Gather derived files
112 query_template = loader.get_template('geo_files.sparql')
115 'submission': str(analysis_node.uri),
116 'file_class': str(geoSoftNS['supplemental'].uri)
119 return self.execute_query(query_template, context)
121 def get_raw_files(self, analysis_node):
122 """Gather raw data e.g. fastq files.
124 query_template = loader.get_template('geo_fastqs.sparql')
127 'submission': str(analysis_node.uri),
128 'file_class': str(geoSoftNS['raw'].uri),
132 for row in self.execute_query(query_template, context):
134 for k, v in row.items():
136 library = str(data['library'])
137 lanes.setdefault(library, []).append(data)
139 for library, files in lanes.items():
141 errmsg = "Don't know what to do with more than 2 raw files"
142 raise ValueError(errmsg)
143 elif len(files) == 2:
145 elif len(files) == 1:
147 elif len(files) == 0:
148 raise RuntimeError("Empty library list discovered")
149 files = self._format_filename(files, is_paired)
150 files = self._format_flowcell_type(files, is_paired)
151 files = self._format_read_length(files, is_paired)
152 result.append(files[0])
155 def _format_flowcell_type(self, files, is_paired):
156 """Used by get_raw_files to format value for single_or_paired-end
159 if 'flowcell_type' in f:
160 flowcell_type = fromTypedNode(f['flowcell_type'])
161 if flowcell_type is None:
163 elif flowcell_type.lower() == "paired":
164 f['flowcell_type'] = 'paired-end'
166 f['flowcell_type'] = 'single'
170 def _format_read_length(self, files, is_paired):
173 read_count = 2 if is_paired else 1
175 if 'read_length' in f:
176 read_length = str(fromTypedNode(f['read_length']))
177 f['read_length'] = ",".join([read_length] * read_count)
180 def _format_filename(self, files, is_paired):
181 """Format file name for get_raw_files, also report if paired
187 f0['filename'] = "%s, %s" % (str(f0['filename']),
189 f0['md5sum'] = "%s, %s" % (str(f0['md5sum']),
193 files[0]['filename'] = str(files[0]['filename'])
194 files[0]['md5sum'] = str(files[0]['md5sum'])
198 def get_run_details(self, analysis_node):
199 """Get information about runs
201 query_template = loader.get_template('geo_run_details.sparql')
204 'submission': str(analysis_node.uri),
207 return self.execute_query(query_template, context)
209 def query_to_soft_dictionary(self, results, heading):
212 name = strip_namespace(geoSoftNS, r['name'])
214 if name.lower() == heading.lower():
218 for v in fromTypedNode(r['value']).split(os.linesep):
221 attributes.append((name, v))