6137875b1ad86048b0c08080f6fb947d090e9471
[htsworkflow.git] / htsworkflow / submission / geo.py
1 import logging
2 import os
3
4 import RDF
5
6 from htsworkflow.submission.submission import Submission
7
8 from htsworkflow.util.rdfhelp import \
9      fromTypedNode, \
10      geoSoftNS, \
11      stripNamespace, \
12      submissionOntology
13
14 from django.conf import settings
15 from django.template import Context, loader
16
17 LOGGER = logging.getLogger(__name__)
18
19 class GEOSubmission(Submission):
20     def __init__(self, name, model):
21         super(GEOSubmission, self).__init__(name, model)
22
23     def make_soft(self, result_map):
24         samples = []
25         platform = self.get_platform_metadata()
26         platform_attribs = dict(platform)
27         platform_id = platform_attribs['^platform']
28         series = self.get_series_metadata()
29         series_attribs = dict(series)
30         series_id = series_attribs['^series']
31         for lib_id, result_dir in result_map.items():
32             an_analysis = self.get_submission_node(result_dir)
33             metadata = self.get_sample_metadata(an_analysis)
34             if len(metadata) > 1:
35                 errmsg = 'Confused there are more than one samples for %s'
36                 LOGGER.debug(errmsg % (str(an_analysis,)))
37             metadata = metadata[0]
38             metadata['raw'] = self.get_raw_files(an_analysis)
39             metadata['supplimental'] = self.get_sample_files(an_analysis)
40             metadata['run'] = self.get_run_details(an_analysis)
41             samples.append(metadata)
42
43         soft_template = loader.get_template('geo_submission.soft')
44         context = Context({
45             'platform': platform,
46             'series': series,
47             'samples': samples,
48             'platform_id': platform_id,
49             'series_id': series_id,
50         })
51         print str(soft_template.render(context))
52
53     def check_for_name(self, analysis_node):
54         name = fromTypedNode(
55             self.model.get_target(analysis_node,
56                                   submissionOntology['name']))
57         if name is None:
58             logger.error("Need name for %s" % (str(analysis_node)))
59             return False
60         else:
61             return True
62
63     def get_platform_metadata(self):
64         """Gather information for filling out sample section of a SOFT file
65         """
66         query_template = loader.get_template('geo_platform.sparql')
67         submission = str(self.submissionSetNS[''].uri)
68         context = Context({
69             'submission': submission,
70             })
71
72         results = self.execute_query(query_template, context)
73         return self.query_to_soft_dictionary(results, 'platform')
74
75     def get_series_metadata(self):
76         """Gather information for filling out sample section of a SOFT file
77         """
78         query_template = loader.get_template('geo_series.sparql')
79         submission = str(self.submissionSetNS[''].uri)
80         context = Context({
81             'submission': submission,
82             })
83
84         results = self.execute_query(query_template, context)
85         return self.query_to_soft_dictionary(results, 'series')
86
87     def get_sample_metadata(self, analysis_node):
88         """Gather information for filling out sample section of a SOFT file
89         """
90         query_template = loader.get_template('geo_samples.sparql')
91
92         context = Context({
93             'submission': str(analysis_node.uri),
94             'submissionSet': str(self.submissionSetNS[''].uri),
95             })
96
97         results = self.execute_query(query_template, context)
98         for r in results:
99             r['dataProtocol'] = str(r['dataProtocol']).replace('\n', ' ')
100
101         return results
102
103     def get_sample_files(self, analysis_node):
104         """Gather derived files
105         """
106         query_template = loader.get_template('geo_files.sparql')
107
108         context = Context({
109             'submission': str(analysis_node.uri),
110             'file_class': str(geoSoftNS['supplemental'])
111             })
112
113         return self.execute_query(query_template, context)
114
115     def get_raw_files(self, analysis_node):
116         """Gather raw data e.g. fastq files.
117         """
118         query_template = loader.get_template('geo_fastqs.sparql')
119
120         context = Context({
121             'submission': str(analysis_node.uri),
122             'file_class': str(geoSoftNS['raw']),
123             })
124
125         lanes = {}
126         for row in self.execute_query(query_template, context):
127             data = {}
128             for k, v in row.items():
129                 data[k] = v
130             lane = str(data['lane'])
131             lanes.setdefault(lane, []).append(data)
132         result = []
133         for lane, files in lanes.items():
134             if len(files) > 2:
135                 errmsg = "Don't know what to do with more than 2 raw files"
136                 raise ValueError(errmsg)
137             elif len(files) == 2:
138                 is_paired = True
139             elif len(files) == 1:
140                 is_paired = False
141             elif len(files) == 0:
142                 raise RuntimeError("Empty lane list discovered")
143             files = self._format_filename(files, is_paired)
144             files = self._format_flowcell_type(files, is_paired)
145             files = self._format_read_length(files, is_paired)
146             result.append(files[0])
147         return result
148
149     def _format_flowcell_type(self, files, is_paired):
150         """Used by get_raw_files to format value for single_or_paired-end
151         """
152         for f in files:
153             if 'flowcell_type' in f:
154                 flowcell_type = fromTypedNode(f['flowcell_type'])
155                 if flowcell_type is None:
156                     pass
157                 elif flowcell_type.lower() == "paired":
158                     f['flowcell_type'] = 'paired-end'
159                 else:
160                     f['flowcell_type'] = 'single'
161
162         return files
163
164     def _format_read_length(self, files, is_paired):
165         """Format
166         """
167         read_count = 2 if is_paired else 1
168         for f in files:
169             if 'read_length' in f:
170                 read_length = str(fromTypedNode(f['read_length']))
171                 f['read_length'] = ",".join([read_length] * read_count)
172         return files
173
174     def _format_filename(self, files, is_paired):
175         """Format file name for get_raw_files, also report if paired
176         """
177         if len(files) == 2:
178             # should be paired
179             f0 = files[0]
180             f1 = files[1]
181             f0['filename'] = "%s, %s" % (str(f0['filename']),
182                                          str(f1['filename']))
183             f0['md5sum'] = "%s, %s" % (str(f0['md5sum']),
184                                        str(f1['md5sum']))
185             del files[1]
186         else:
187             files[0]['filename'] = str(files[0]['filename'])
188             files[0]['md5sum'] = str(files[0]['md5sum'])
189         return files
190
191
192     def get_run_details(self, analysis_node):
193         """Get information about runs
194         """
195         query_template = loader.get_template('geo_run_details.sparql')
196
197         context = Context({
198             'submission': str(analysis_node.uri),
199             })
200
201         return self.execute_query(query_template, context)
202
203     def query_to_soft_dictionary(self, results, heading):
204         attributes = []
205         for r in results:
206             name = stripNamespace(geoSoftNS, r['name'])
207             if name is not None:
208                 if name.lower() == heading.lower():
209                     name = '^' + name
210                 else:
211                     name = '!' + name
212                 for v in fromTypedNode(r['value']).split(os.linesep):
213                     v = v.strip()
214                     if len(v) > 0:
215                         attributes.append((name, v))
216         return attributes