Report if the Summary.htm file is missing the Lane Results Summary block.
[htsworkflow.git] / htsworkflow / pipelines / summary.py
1 """
2 Analyze the Summary.htm file produced by GERALD
3 """
4 import logging
5 import types
6 from pprint import pprint
7
8 from htsworkflow.pipelines.runfolder import ElementTree
9 from htsworkflow.util.ethelp import indent, flatten
10
# Not-a-number placeholder; parse_mean_range returns (nan, nan) for cells
# that read "unknown" or contain only a bare "+/-".
nan = float('nan')
12
class Summary(object):
    """
    Extract some useful information from the Summary.htm file

    The parsed data is kept in ``lane_results``: a list holding one
    dictionary per end (read), each mapping a lane number to the
    LaneResultSummary for that lane.  Indexing and len() on a Summary
    are forwarded to that list.
    """
    XML_VERSION = 3
    SUMMARY = 'Summary'

    class LaneResultSummary(object):
        """
        Parse the LaneResultSummary table out of Summary.htm
        Mostly for the cluster number
        """
        LANE_RESULT_SUMMARY = 'LaneResultSummary'
        # XML tag name -> name of the attribute that stores its value
        TAGS = {
            'LaneYield': 'lane_yield',
            'Cluster': 'cluster',  # Raw
            'ClusterPF': 'cluster_pass_filter',
            'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
            'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
            'PercentPassFilterClusters': 'percent_pass_filter_clusters',
            'PercentPassFilterAlign': 'percent_pass_filter_align',
            'AverageAlignmentScore': 'average_alignment_score',
            'PercentErrorRate': 'percent_error_rate',
        }

        def __init__(self, html=None, xml=None):
            """
            Create an empty record, optionally filled from a source.

            html -- a flattened table row (sequence of cell strings)
            xml  -- a LaneResultSummary element previously produced by
                    get_elements()
            """
            self.lane = None
            self.end = 0
            self.lane_yield = None
            self.cluster = None
            self.cluster_pass_filter = None
            self.average_first_cycle_intensity = None
            self.percent_intensity_after_20_cycles = None
            self.percent_pass_filter_clusters = None
            self.percent_pass_filter_align = None
            self.average_alignment_score = None
            self.percent_error_rate = None

            if html is not None:
                self.set_elements_from_html(html)
            if xml is not None:
                self.set_elements(xml)

        def set_elements_from_html(self, data):
            """
            Fill in the attributes from one flattened table row.

            Rows with 8 cells use the pre-0.3 pipeline layout, rows with
            10 cells use the 0.3 layout.  Any other length raises
            RuntimeError.
            """
            if len(data) not in (8, 10):
                raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))

            # same in pre-0.3.0 Summary file and 0.3 summary file
            self.lane = int(data[0])

            if len(data) == 8:
                # this is the < 0.3 Pipeline version
                (self.cluster,
                 self.average_first_cycle_intensity,
                 self.percent_intensity_after_20_cycles,
                 self.percent_pass_filter_clusters,
                 self.percent_pass_filter_align,
                 self.average_alignment_score,
                 self.percent_error_rate,
                ) = [parse_mean_range(x) for x in data[1:]]
            else:
                # this is the >= 0.3 summary file; the yield cell is
                # stored as-is, the remaining cells are parsed
                parsed_data = [parse_mean_range(x) for x in data[2:]]
                self.lane_yield = data[1]
                (self.cluster,
                 self.cluster_pass_filter,
                 self.average_first_cycle_intensity,
                 self.percent_intensity_after_20_cycles,
                 self.percent_pass_filter_clusters,
                 self.percent_pass_filter_align,
                 self.average_alignment_score,
                 self.percent_error_rate,
                ) = parsed_data

        def get_elements(self):
            """
            Return a LaneResultSummary element describing this lane.

            Attributes that are still None are left out.  Tuple/list
            values become mean/deviation elements; anything else becomes
            the text of a plain sub-element.
            """
            lane_result = ElementTree.Element(
                Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
                {'lane': str(self.lane), 'end': str(self.end)})
            for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
                value = getattr(self, variable_name)
                if value is None:
                    continue
                if isinstance(value, (tuple, list)):
                    # it looks like a sequence
                    make_mean_range_element(lane_result, tag, *value)
                else:
                    element = ElementTree.SubElement(lane_result, tag)
                    # parse_mean_range yields a bare number for one-token
                    # cells; element text has to be a string to serialize
                    if isinstance(value, (int, float)):
                        value = str(value)
                    element.text = value
            return lane_result

        def set_elements(self, tree):
            """
            Fill in the attributes from a LaneResultSummary element.

            Raises ValueError if tree has a different tag.  Children
            with unknown tags, or whose value cannot be read, are
            logged and skipped.
            """
            if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
                raise ValueError('Expected %s' % (
                        Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
            self.lane = int(tree.attrib['lane'])
            # default to the first end, for the older summary files
            # that are single ended
            self.end = int(tree.attrib.get('end', 0))
            tags = Summary.LaneResultSummary.TAGS
            for element in list(tree):
                variable_name = tags.get(element.tag)
                if variable_name is None:
                    logging.warning('Unrecognized tag %s' % (element.tag,))
                    continue
                try:
                    value = parse_summary_element(element)
                except KeyError:
                    # still skipped, as before, but no longer reported
                    # as an unrecognized tag
                    logging.warning('Malformed %s element' % (element.tag,))
                    continue
                setattr(self, variable_name, value)

    def __init__(self, filename=None, xml=None):
        """
        filename -- path of a Summary.htm file to parse
        xml      -- a Summary element previously produced by get_elements()
        """
        # lane results is a list of 1 or 2 ends containing
        # a dictionary of all the lanes reported in this
        # summary file
        self.lane_results = [{}]

        if filename is not None:
            self._extract_lane_results(filename)
        if xml is not None:
            self.set_elements(xml)

    def __getitem__(self, key):
        return self.lane_results[key]

    def __len__(self):
        return len(self.lane_results)

    def _flattened_row(self, row):
        """
        flatten the children of a <tr>...</tr>
        """
        return [flatten(x) for x in list(row)]

    def _parse_table(self, table):
        """
        Return every row of the table, header rows included, as a list
        of flattened cell lists.
        """
        return [self._flattened_row(r) for r in list(table)]

    def _extract_named_tables(self, pathname):
        """
        extract all the 'named' tables from a Summary.htm file
        and return as a dictionary

        Named tables are <h2>...</h2><table>...</table> pairs
        The contents of the h2 tag is considered to the name
        of the table.
        """
        # tree = ElementTree.parse(pathname).getroot()
        # hack for 1.1rc1, this should be removed when possible.
        with open(pathname) as stream:
            file_body = stream.read()
        file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=')
        tree = ElementTree.fromstring(file_body)
        body = tree.find('body')
        tables = {}
        children = list(body)
        # pair each child with its successor so a trailing <h2> with
        # nothing after it cannot index past the end of the body
        for heading, candidate in zip(children, children[1:]):
            if heading.tag == 'h2' and candidate.tag == 'table':
                # we have an interesting table
                tables[flatten(heading)] = self._parse_table(candidate)
        return tables

    def _extract_lane_results(self, pathname):
        """
        Load every Lane Results Summary table found in pathname.

        Logs a warning when the file contains none of the known
        table names.
        """
        tables = self._extract_named_tables(pathname)
        table_names = [ ('Lane Results Summary', 0),
                        ('Lane Results Summary : Read 1', 0),
                        ('Lane Results Summary : Read 2', 1),]
        found = False
        for name, end in table_names:
            if name in tables:
                self._extract_lane_results_for_end(tables, name, end)
                found = True
        # only complain when none of the tables was present
        if not found:
            logging.warning("No Lane Results Summary Found in %s" % (pathname,))

    def _extract_lane_results_for_end(self, tables, table_name, end):
        """
        extract the Lane Results Summary table
        """
        # parse lane result summary
        lane_summary = tables[table_name]
        column_count = len(lane_summary[-1])
        # this is version 1 of the summary file
        if column_count == 8:
            # strip the header, keep the lane by lane data
            lane_summary = lane_summary[1:]

        # column_count = 10 is version 2 of the summary file
        #              = 9  is version 3 of the Summary.htm file
        elif column_count in (9, 10):
            # lane_summary[0] is a different less specific header row
            # and lane_summary[1] is the real header;
            # after the last lane, there's a set of chip wide averages
            lane_summary = lane_summary[2:10]

        # append extra dictionaries if needed
        while len(self.lane_results) < (end + 1):
            self.lane_results.append({})

        for r in lane_summary:
            lrs = Summary.LaneResultSummary(html=r)
            lrs.end = end
            self.lane_results[lrs.end][lrs.lane] = lrs

    def get_elements(self):
        """
        Return a Summary element holding one LaneResultSummary child
        per lane of every end.
        """
        summary = ElementTree.Element(Summary.SUMMARY,
                                      {'version': str(Summary.XML_VERSION)})
        for end in self.lane_results:
            for lane in end.values():
                summary.append(lane.get_elements())
        return summary

    def set_elements(self, tree):
        """
        Load lane results from a Summary element.

        Raises ValueError if tree has a different tag.  A tree whose
        version is newer than XML_VERSION is loaded anyway, with a
        warning.
        """
        if tree.tag != Summary.SUMMARY:
            raise ValueError("Expected %s" % (Summary.SUMMARY,))
        xml_version = int(tree.attrib.get('version', 0))
        if xml_version > Summary.XML_VERSION:
            logging.warning('Summary XML tree is a higher version than this class')
        for element in list(tree):
            lrs = Summary.LaneResultSummary()
            lrs.set_elements(element)
            while len(self.lane_results) < (lrs.end + 1):
                self.lane_results.append({})
            self.lane_results[lrs.end][lrs.lane] = lrs

    def is_paired_end(self):
        """
        True when results for two ends have been loaded.
        """
        return len(self.lane_results) == 2

    def dump(self):
        """
        Debugging function, report current object
        """
        pass
252
def tonumber(v):
    """
    Convert a value to int if its an int otherwise a float.

    Strings such as "12" become int and strings such as "4.5" become
    float.  A float argument is returned as an int only when it holds a
    whole number; otherwise it is returned unchanged, because int()
    would silently truncate it.
    """
    if isinstance(v, float):
        # is_integer() is False for nan and the infinities as well
        return int(v) if v.is_integer() else v
    try:
        return int(v)
    except ValueError:
        return float(v)
262
def parse_mean_range(value):
    """
    Parse a table cell such as "123 +/- 4.5".

    Returns (nan, nan) for "unknown" or a lone "+/-", a single number
    when the cell holds exactly one other token, and a
    (mean, deviation) pair for the three token form.  Raises
    RuntimeError when the middle token is not "+/-".
    """
    if value.strip() == 'unknown':
        return nan, nan

    tokens = value.split()
    if len(tokens) == 1:
        only = tokens[0]
        if only == '+/-':
            return nan, nan
        return tonumber(only)

    # any other token count must be exactly mean, separator, deviation
    average, separator, deviation = tokens
    if separator != '+/-':
        raise RuntimeError("Summary.htm file format changed")
    return tonumber(average), tonumber(deviation)
281
def make_mean_range_element(parent, name, mean, deviation):
    """
    Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>

    The element is appended to parent and returned.  Both values are
    stored as their str() form, which exists on Python 2 and Python 3
    alike (the unicode builtin does not).
    """
    attributes = {'mean': str(mean), 'deviation': str(deviation)}
    return ElementTree.SubElement(parent, name, attributes)
290
def parse_mean_range_element(element):
    """
    Read the 'mean' and 'deviation' attributes of element and return
    them as a (mean, deviation) tuple of numbers.
    """
    attributes = element.attrib
    mean = tonumber(attributes['mean'])
    deviation = tonumber(attributes['deviation'])
    return (mean, deviation)
297
def parse_summary_element(element):
    """
    Return the value stored in a summary element.

    An element without attributes is a simple element and yields its
    text; an element with attributes is read as a mean/deviation pair.
    """
    if not element.attrib:
        return element.text
    return parse_mean_range_element(element)