6 from htsworkflow.submission.submission import Submission
8 from htsworkflow.util.rdfhelp import \
14 from django.conf import settings
15 from django.template import Context, loader
# Module-level logger, named after this module, shared by everything below.
LOGGER = logging.getLogger(__name__)
class GEOSubmission(Submission):
    """Build a GEO SOFT submission document from the submission model.

    Metadata is extracted by running SPARQL queries rendered from Django
    templates (``geo_*.sparql``); the results are then fed to the
    ``geo_submission.soft`` template.
    """

    def __init__(self, name, model):
        """Forward the submission name and model to ``Submission``."""
        super(GEOSubmission, self).__init__(name, model)

    def make_soft(self, result_map):
        """Render the SOFT document for a set of results and print it.

        :param result_map: mapping of library id to result directory; only
            the result directories are used here.
        :raises KeyError: if the platform or series metadata lacks its
            ``^platform`` / ``^series`` heading entry.
        """
        samples = []
        platform = self.get_platform_metadata()
        platform_id = dict(platform)['^platform']
        series = self.get_series_metadata()
        series_id = dict(series)['^series']

        for result_dir in result_map.values():
            an_analysis = self.get_submission_node(result_dir)
            metadata = self.get_sample_metadata(an_analysis)
            # NOTE(review): the guard on this message was not visible in the
            # reviewed text; "more than one" suggests len(metadata) > 1 --
            # confirm.
            if len(metadata) > 1:
                LOGGER.debug(
                    'Confused there are more than one samples for %s',
                    str(an_analysis))
            # Only the first sample record is submitted.
            metadata = metadata[0]
            metadata['raw'] = self.get_raw_files(an_analysis)
            # The misspelled key is read by the template; keep it as is.
            metadata['supplimental'] = self.get_sample_files(an_analysis)
            metadata['run'] = self.get_run_details(an_analysis)
            samples.append(metadata)

        soft_template = loader.get_template('geo_submission.soft')
        # NOTE(review): 'platform', 'series' and 'samples' are assumed to be
        # the template variable names -- verify against geo_submission.soft.
        context = Context({
            'platform': platform,
            'series': series,
            'samples': samples,
            'platform_id': platform_id,
            'series_id': series_id,
        })
        # Parenthesised single-argument print behaves the same on Python 2
        # and Python 3.
        print(str(soft_template.render(context)))

    def check_for_name(self, analysis_node):
        """Report whether an analysis node has a submission name.

        :returns: True when a name is attached to ``analysis_node``;
            otherwise an error is logged and False is returned.
        """
        # NOTE(review): the wrapping call and the None test were not visible
        # in the reviewed text; presumably the target is converted with
        # fromTypedNode and a missing name yields None -- confirm.
        name = fromTypedNode(
            self.model.get_target(analysis_node,
                                  submissionOntology['name']))
        if name is None:
            LOGGER.error("Need name for %s", str(analysis_node))
            return False
        return True

    def get_platform_metadata(self):
        """Gather information for filling out platform section of a SOFT file
        """
        query_template = loader.get_template('geo_platform.sparql')
        submission = str(self.submissionSetNS[''].uri)
        context = Context({
            'submission': submission,
        })

        results = self.execute_query(query_template, context)
        return self.query_to_soft_dictionary(results, 'platform')

    def get_series_metadata(self):
        """Gather information for filling out series section of a SOFT file
        """
        query_template = loader.get_template('geo_series.sparql')
        submission = str(self.submissionSetNS[''].uri)
        context = Context({
            'submission': submission,
        })

        results = self.execute_query(query_template, context)
        return self.query_to_soft_dictionary(results, 'series')

    def get_sample_metadata(self, analysis_node):
        """Gather information for filling out sample section of a SOFT file

        Newlines inside each record's ``dataProtocol`` value are replaced
        with spaces before the records are returned.
        """
        query_template = loader.get_template('geo_samples.sparql')

        context = Context({
            'submission': str(analysis_node.uri),
            'submissionSet': str(self.submissionSetNS[''].uri),
        })

        results = self.execute_query(query_template, context)
        for record in results:
            record['dataProtocol'] = \
                str(record['dataProtocol']).replace('\n', ' ')

        return results

    def get_sample_files(self, analysis_node):
        """Gather derived (supplemental) files for an analysis node
        """
        query_template = loader.get_template('geo_files.sparql')

        context = Context({
            'submission': str(analysis_node.uri),
            'file_class': str(geoSoftNS['supplemental'])
        })

        return self.execute_query(query_template, context)

    def get_raw_files(self, analysis_node):
        """Gather raw data e.g. fastq files.

        Query rows are grouped by lane. Each lane must have one raw file
        or two (treated as paired) and contributes one formatted record.

        :returns: list with one record per lane.
        :raises ValueError: if a lane has more than two raw files.
        :raises RuntimeError: if a lane has no files.
        """
        query_template = loader.get_template('geo_fastqs.sparql')

        context = Context({
            'submission': str(analysis_node.uri),
            'file_class': str(geoSoftNS['raw']),
        })

        lanes = {}
        for row in self.execute_query(query_template, context):
            # Copy the row into a plain dict so it can be modified below.
            data = dict(row.items())
            lane = str(data['lane'])
            lanes.setdefault(lane, []).append(data)

        result = []
        for files in lanes.values():
            file_count = len(files)
            if file_count > 2:
                errmsg = "Don't know what to do with more than 2 raw files"
                raise ValueError(errmsg)
            if file_count == 0:
                raise RuntimeError("Empty lane list discovered")
            is_paired = (file_count == 2)
            files = self._format_filename(files, is_paired)
            files = self._format_flowcell_type(files, is_paired)
            files = self._format_read_length(files, is_paired)
            result.append(files[0])
        return result

    def _format_flowcell_type(self, files, is_paired):
        """Used by get_raw_files to format value for single_or_paired-end

        A ``flowcell_type`` of "paired" (any case) becomes 'paired-end' and
        any other non-None value becomes 'single'. Records without the key,
        or whose value converts to None, are left untouched.
        """
        for entry in files:
            if 'flowcell_type' not in entry:
                continue
            flowcell_type = fromTypedNode(entry['flowcell_type'])
            if flowcell_type is None:
                continue
            if flowcell_type.lower() == "paired":
                entry['flowcell_type'] = 'paired-end'
            else:
                entry['flowcell_type'] = 'single'

        return files

    def _format_read_length(self, files, is_paired):
        """Used by get_raw_files to format the read_length value

        The read length is written once for single reads and twice, comma
        separated, for paired reads.
        """
        read_count = 2 if is_paired else 1
        for entry in files:
            if 'read_length' in entry:
                read_length = str(fromTypedNode(entry['read_length']))
                entry['read_length'] = ",".join([read_length] * read_count)
        return files

    def _format_filename(self, files, is_paired):
        """Format file name for get_raw_files, also report if paired

        With two records, the first one receives both file names and both
        md5sums joined by ", ". Otherwise the single record's values are
        converted to strings.
        """
        # NOTE(review): the branch condition, the second operand of each
        # "%s, %s" and the removal of the second record were not visible in
        # the reviewed text; reconstructed from the visible assignments --
        # confirm.
        if len(files) == 2:
            f0, f1 = files
            f0['filename'] = "%s, %s" % (str(f0['filename']),
                                         str(f1['filename']))
            f0['md5sum'] = "%s, %s" % (str(f0['md5sum']),
                                       str(f1['md5sum']))
            # Later formatting steps then see one merged record.
            del files[1]
        else:
            files[0]['filename'] = str(files[0]['filename'])
            files[0]['md5sum'] = str(files[0]['md5sum'])
        return files

    def get_run_details(self, analysis_node):
        """Get information about runs
        """
        query_template = loader.get_template('geo_run_details.sparql')

        context = Context({
            'submission': str(analysis_node.uri),
        })

        return self.execute_query(query_template, context)

    def query_to_soft_dictionary(self, results, heading):
        """Convert query rows into a list of (name, value) SOFT attributes

        Each row's ``name`` is shortened relative to ``geoSoftNS``. The
        entry whose name matches ``heading`` (case-insensitively) is
        prefixed with '^'; make_soft looks up '^platform' and '^series'.
        Multi-line values yield one attribute per line.
        """
        attributes = []
        for row in results:
            name = simplifyUri(geoSoftNS, row['name'])
            # NOTE(review): the None guard, the '!' prefix for non-heading
            # names, and the strip / empty-line filter were not visible in
            # the reviewed text -- confirm.
            if name is None:
                continue
            if name.lower() == heading.lower():
                name = '^' + name
            else:
                name = '!' + name
            for v in fromTypedNode(row['value']).split(os.linesep):
                v = v.strip()
                if v:
                    attributes.append((name, v))
        return attributes