convert several modules not covered by unit tests to use the print function
htsworkflow/submission/geo.py
from __future__ import print_function

import logging
import os

import RDF

from htsworkflow.submission.submission import Submission

from htsworkflow.util.rdfhelp import \
     fromTypedNode, \
     geoSoftNS, \
     strip_namespace, \
     submissionOntology

from django.conf import settings
from django.template import Context, loader

LOGGER = logging.getLogger(__name__)

class GEOSubmission(Submission):
    def __init__(self, name, model, host):
        super(GEOSubmission, self).__init__(name, model, host)

    def make_soft(self, result_map):
        samples = []
        platform = self.get_platform_metadata()
        platform_attribs = dict(platform)
        platform_id = platform_attribs['^platform']
        series = self.get_series_metadata()
        series_attribs = dict(series)
        series_id = series_attribs['^series']
        for lib_id, result_dir in result_map.items():
            an_analysis = self.get_submission_node(result_dir)
            metadata = self.get_sample_metadata(an_analysis)
            if len(metadata) == 0:
                errmsg = 'No metadata found for {0}'
                LOGGER.error(errmsg.format(str(an_analysis)))
                continue
            elif len(metadata) > 1:
                errmsg = 'Expected one sample for %s, found %d; using the first'
                LOGGER.debug(errmsg % (str(an_analysis), len(metadata)))
            metadata = metadata[0]
            metadata['raw'] = self.get_raw_files(an_analysis)
            metadata['supplimental'] = self.get_sample_files(an_analysis)
            metadata['run'] = self.get_run_details(an_analysis)
            samples.append(metadata)

        soft_template = loader.get_template('geo_submission.soft')
        context = Context({
            'platform': platform,
            'series': series,
            'samples': samples,
            'platform_id': platform_id,
            'series_id': series_id,
        })
        print(soft_template.render(context))

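    # Illustrative note (not part of the original module): the rendered
    # geo_submission.soft text is expected to follow the usual GEO SOFT
    # layout, roughly
    #
    #   ^PLATFORM = <platform id>
    #   !Platform_... = ...
    #   ^SERIES = <series id>
    #   !Series_... = ...
    #   ^SAMPLE = <library id>
    #   !Sample_... = ...
    #
    # The exact entity and attribute names depend on the geo_submission.soft
    # template and on what query_to_soft_dictionary() returns below.
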
    def check_for_name(self, analysis_node):
        name = fromTypedNode(
            self.model.get_target(analysis_node,
                                  submissionOntology['name']))
        if name is None:
            LOGGER.error("Need name for %s" % (str(analysis_node)))
            return False
        else:
            return True

    def get_platform_metadata(self):
        """Gather information for filling out platform section of a SOFT file
        """
        query_template = loader.get_template('geo_platform.sparql')
        submission = str(self.submissionSetNS[''].uri)
        context = Context({
            'submission': submission,
            })

        results = self.execute_query(query_template, context)
        return self.query_to_soft_dictionary(results, 'platform')

    def get_series_metadata(self):
        """Gather information for filling out series section of a SOFT file
        """
        query_template = loader.get_template('geo_series.sparql')
        submission = str(self.submissionSetNS[''].uri)
        context = Context({
            'submission': submission,
            })

        results = self.execute_query(query_template, context)
        return self.query_to_soft_dictionary(results, 'series')

    def get_sample_metadata(self, analysis_node):
        """Gather information for filling out sample section of a SOFT file
        """
        query_template = loader.get_template('geo_samples.sparql')

        context = Context({
            'submission': str(analysis_node.uri),
            'submissionSet': str(self.submissionSetNS[''].uri),
            })

        results = self.execute_query(query_template, context)
        for r in results:
            r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ')

        return results

    def get_sample_files(self, analysis_node):
        """Gather derived files
        """
        query_template = loader.get_template('geo_files.sparql')

        context = Context({
            'submission': str(analysis_node.uri),
            'file_class': str(geoSoftNS['supplemental'].uri)
            })

        return self.execute_query(query_template, context)

    def get_raw_files(self, analysis_node):
        """Gather raw data e.g. fastq files.
        """
        query_template = loader.get_template('geo_fastqs.sparql')

        context = Context({
            'submission': str(analysis_node.uri),
            'file_class': str(geoSoftNS['raw'].uri),
            })

        lanes = {}
        for row in self.execute_query(query_template, context):
            data = {}
            for k, v in row.items():
                data[k] = v
            library = str(data['library'])
            lanes.setdefault(library, []).append(data)
        result = []
        for library, files in lanes.items():
            if len(files) > 2:
                errmsg = "Don't know what to do with more than 2 raw files"
                raise ValueError(errmsg)
            elif len(files) == 2:
                is_paired = True
            elif len(files) == 1:
                is_paired = False
            elif len(files) == 0:
                raise RuntimeError("Empty library list discovered")
            files = self._format_filename(files, is_paired)
            files = self._format_flowcell_type(files, is_paired)
            files = self._format_read_length(files, is_paired)
            result.append(files[0])
        return result

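    # Illustrative sketch (hypothetical values, not from the original code):
    # for a paired-end library the query typically returns two fastq rows,
    # which the helpers below collapse into a single entry, e.g.
    #
    #   {'library': '12345', 'filename': 'a_r1.fastq.gz', 'md5sum': 'aaa...',
    #    'flowcell_type': 'Paired', 'read_length': '100'}
    #   {'library': '12345', 'filename': 'a_r2.fastq.gz', 'md5sum': 'bbb...',
    #    'flowcell_type': 'Paired', 'read_length': '100'}
    #
    # becomes
    #
    #   {'filename': 'a_r1.fastq.gz, a_r2.fastq.gz',
    #    'md5sum': 'aaa..., bbb...',
    #    'flowcell_type': 'paired-end',
    #    'read_length': '100,100', ...}
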
    def _format_flowcell_type(self, files, is_paired):
        """Used by get_raw_files to format the single or paired-end value
        """
        for f in files:
            if 'flowcell_type' in f:
                flowcell_type = fromTypedNode(f['flowcell_type'])
                if flowcell_type is None:
                    pass
                elif flowcell_type.lower() == "paired":
                    f['flowcell_type'] = 'paired-end'
                else:
                    f['flowcell_type'] = 'single'

        return files

    def _format_read_length(self, files, is_paired):
        """Format the read length value for get_raw_files, once per read
        """
        read_count = 2 if is_paired else 1
        for f in files:
            if 'read_length' in f:
                read_length = str(fromTypedNode(f['read_length']))
                f['read_length'] = ",".join([read_length] * read_count)
        return files

    def _format_filename(self, files, is_paired):
        """Format file names for get_raw_files, merging paired files into one entry
        """
        if len(files) == 2:
            # should be paired
            f0 = files[0]
            f1 = files[1]
            f0['filename'] = "%s, %s" % (str(f0['filename']),
                                         str(f1['filename']))
            f0['md5sum'] = "%s, %s" % (str(f0['md5sum']),
                                       str(f1['md5sum']))
            del files[1]
        else:
            files[0]['filename'] = str(files[0]['filename'])
            files[0]['md5sum'] = str(files[0]['md5sum'])
        return files

    def get_run_details(self, analysis_node):
        """Get information about runs
        """
        query_template = loader.get_template('geo_run_details.sparql')

        context = Context({
            'submission': str(analysis_node.uri),
            })

        return self.execute_query(query_template, context)

    def query_to_soft_dictionary(self, results, heading):
        attributes = []
        for r in results:
            name = strip_namespace(geoSoftNS, r['name'])
            if name is not None:
                if name.lower() == heading.lower():
                    name = '^' + name
                else:
                    name = '!' + name
                for v in fromTypedNode(r['value']).split(os.linesep):
                    v = v.strip()
                    if len(v) > 0:
                        attributes.append((name, v))
        return attributes
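
    # Illustrative sketch (hypothetical values): given query rows whose
    # geoSoft-stripped names are 'series' and 'series_title',
    # query_to_soft_dictionary(results, 'series') returns pairs like
    #
    #   [('^series', 'my_submission_name'),
    #    ('!series_title', 'Example RNA-seq series')]
    #
    # i.e. the entity line gets a '^' prefix and every other attribute a '!',
    # matching the prefixes used in GEO SOFT files.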