dc323ff08f9193673c46d591be772041a27afcb5
[htsworkflow.git] / htsworkflow / pipelines / summary.py
1 """
2 Analyze the Summary.htm file produced by GERALD
3 """
import logging
import types
from pprint import pprint

from htsworkflow.pipelines.runfolder import ElementTree
from htsworkflow.util.ethelp import indent, flatten
9
# Placeholder returned by parse_mean_range for cells that hold no number
# ('unknown' or a bare '+/-').
nan = float("nan")
11
class Summary(object):
    """
    Extract some useful information from the Summary.htm file

    The parsed data is kept in ``lane_results``: a list holding one
    dictionary per read end, each mapping a lane number to the
    LaneResultSummary for that lane.  Indexing and len() on a Summary
    are forwarded to that list.
    """
    XML_VERSION = 3
    SUMMARY = 'Summary'

    class LaneResultSummary(object):
        """
        Parse the LaneResultSummary table out of Summary.htm
        Mostly for the cluster number
        """
        LANE_RESULT_SUMMARY = 'LaneResultSummary'
        # XML tag name -> name of the attribute that stores its value
        TAGS = {
          'LaneYield': 'lane_yield',
          'Cluster': 'cluster', # Raw
          'ClusterPF': 'cluster_pass_filter',
          'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
          'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
          'PercentPassFilterClusters': 'percent_pass_filter_clusters',
          'PercentPassFilterAlign': 'percent_pass_filter_align',
          'AverageAlignmentScore': 'average_alignment_score',
          'PercentErrorRate': 'percent_error_rate'
        }

        def __init__(self, html=None, xml=None):
            """
            Create a lane summary, optionally loading it right away.

            :param html: flattened cells of one table row, passed to
                         set_elements_from_html
            :param xml: a LaneResultSummary element, passed to set_elements
            """
            self.lane = None
            self.end = 0
            self.lane_yield = None
            self.cluster = None
            self.cluster_pass_filter = None
            self.average_first_cycle_intensity = None
            self.percent_intensity_after_20_cycles = None
            self.percent_pass_filter_clusters = None
            self.percent_pass_filter_align = None
            self.average_alignment_score = None
            self.percent_error_rate = None

            if html is not None:
                self.set_elements_from_html(html)
            if xml is not None:
                self.set_elements(xml)

        def set_elements_from_html(self, data):
            """
            Fill in the attributes from the cells of one lane row.

            :param data: sequence of 8 cells (pre-0.3 pipeline) or
                         10 cells (0.3 and later); cell 0 is the lane number
            :raises RuntimeError: if the row has any other number of cells
            """
            # NOTE(review): _extract_lane_results_for_end also lets tables
            # whose last row has 9 cells through, but a 9 cell row is
            # rejected here -- confirm what the version 3 rows look like.
            if len(data) not in (8, 10):
                raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))

            # same in pre-0.3.0 Summary file and 0.3 summary file
            self.lane = int(data[0])

            if len(data) == 8:
                # this is the < 0.3 Pipeline version
                parsed_data = [parse_mean_range(x) for x in data[1:]]
                self.cluster = parsed_data[0]
                self.average_first_cycle_intensity = parsed_data[1]
                self.percent_intensity_after_20_cycles = parsed_data[2]
                self.percent_pass_filter_clusters = parsed_data[3]
                self.percent_pass_filter_align = parsed_data[4]
                self.average_alignment_score = parsed_data[5]
                self.percent_error_rate = parsed_data[6]
            else:
                # this is the >= 0.3 summary file
                parsed_data = [parse_mean_range(x) for x in data[2:]]
                # the yield cell is stored as-is, without parsing
                self.lane_yield = data[1]
                self.cluster = parsed_data[0]
                self.cluster_pass_filter = parsed_data[1]
                self.average_first_cycle_intensity = parsed_data[2]
                self.percent_intensity_after_20_cycles = parsed_data[3]
                self.percent_pass_filter_clusters = parsed_data[4]
                self.percent_pass_filter_align = parsed_data[5]
                self.average_alignment_score = parsed_data[6]
                self.percent_error_rate = parsed_data[7]

        def get_elements(self):
            """
            Build a LaneResultSummary element describing this lane.

            Attributes that are None are left out.  Tuple/list values are
            written as mean/deviation attributes, anything else as the
            text of the element.
            """
            lane_result = ElementTree.Element(
                            Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
                            {'lane': str(self.lane), 'end': str(self.end)})
            for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
                value = getattr(self, variable_name)
                if value is None:
                    continue
                if isinstance(value, (tuple, list)):
                    # it looks like a (mean, deviation) sequence
                    make_mean_range_element(lane_result, tag, *value)
                else:
                    element = ElementTree.SubElement(lane_result, tag)
                    # parse_mean_range hands back bare numbers for single
                    # value cells; element text has to be a string or the
                    # tree cannot be serialized.
                    if not isinstance(value, (str, type(u''))):
                        value = str(value)
                    element.text = value
            return lane_result

        def set_elements(self, tree):
            """
            Load the attributes from a LaneResultSummary element.

            Child elements with a tag that is not in TAGS are reported
            with a warning and skipped.

            :raises ValueError: if tree is not a LaneResultSummary element
            """
            if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
                raise ValueError('Expected %s' % (
                        Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
            self.lane = int(tree.attrib['lane'])
            # default to the first end, for the older summary files
            # that are single ended
            self.end = int(tree.attrib.get('end', 0))
            tags = Summary.LaneResultSummary.TAGS
            for element in list(tree):
                variable_name = tags.get(element.tag)
                if variable_name is None:
                    logging.warning('Unrecognized tag %s', element.tag)
                    continue
                setattr(self, variable_name, parse_summary_element(element))

    def __init__(self, filename=None, xml=None):
        """
        Create a summary, optionally loading it right away.

        :param filename: path of a Summary.htm file to parse
        :param xml: a Summary element, passed to set_elements
        """
        # lane results is a list of 1 or 2 ends containing
        # a dictionary of all the lanes reported in this
        # summary file
        self.lane_results = [{}]

        if filename is not None:
            self._extract_lane_results(filename)
        if xml is not None:
            self.set_elements(xml)

    def __getitem__(self, key):
        return self.lane_results[key]

    def __len__(self):
        return len(self.lane_results)

    def _flattened_row(self, row):
        """
        flatten the children of a <tr>...</tr>
        """
        return [flatten(cell) for cell in row]

    def _parse_table(self, table):
        """
        Return every row of the table, header rows included, as a list
        of flattened cells.
        """
        return [self._flattened_row(row) for row in table]

    def _extract_named_tables(self, pathname):
        """
        extract all the 'named' tables from a Summary.htm file
        and return as a dictionary

        Named tables are <h2>...</h2><table>...</table> pairs
        The contents of the h2 tag is considered to be the name
        of the table.
        """
        # tree = ElementTree.parse(pathname).getroot()
        # hack for 1.1rc1, this should be removed when possible.
        stream = open(pathname)
        try:
            file_body = stream.read()
        finally:
            stream.close()
        # the unescaped '<' would otherwise break the XML parser
        file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=')
        tree = ElementTree.fromstring(file_body)
        body = tree.find('body')
        tables = {}
        children = list(body)
        # walk neighbouring pairs; pairing stops before the last child so
        # a trailing <h2> cannot index past the end of the body
        for heading, candidate in zip(children, children[1:]):
            if heading.tag == 'h2' and candidate.tag == 'table':
                # we have an interesting table
                tables[flatten(heading)] = self._parse_table(candidate)
        return tables

    def _extract_lane_results(self, pathname):
        """
        Parse pathname and load every lane results table found in it.
        """
        tables = self._extract_named_tables(pathname)
        # (table title, index of the read end it describes)
        table_names = [ ('Lane Results Summary', 0),
                        ('Lane Results Summary : Read 1', 0),
                        ('Lane Results Summary : Read 2', 1),]
        for name, end in table_names:
            if name in tables:
                self._extract_lane_results_for_end(tables, name, end)

    def _extract_lane_results_for_end(self, tables, table_name, end):
        """
        extract the Lane Results Summary table

        :param tables: dictionary returned by _extract_named_tables
        :param table_name: key of the table to load
        :param end: index into lane_results the lanes are stored under
        """
        # parse lane result summary
        lane_summary = tables[table_name]
        columns = len(lane_summary[-1])
        if columns == 8:
            # this is version 1 of the summary file,
            # strip the header and keep the lane by lane data
            lane_summary = lane_summary[1:]
        elif columns in (9, 10):
            # 10 is version 2 of the summary file
            # 9  is version 3 of the Summary.htm file
            # lane_summary[0] is a different less specific header row,
            # and after the last lane there's a set of chip wide averages
            lane_summary = lane_summary[2:10]

        # append extra dictionaries if needed
        while len(self.lane_results) <= end:
            self.lane_results.append({})

        for row in lane_summary:
            lrs = Summary.LaneResultSummary(html=row)
            lrs.end = end
            self.lane_results[lrs.end][lrs.lane] = lrs

    def get_elements(self):
        """
        Build a Summary element holding one child per lane and end.
        """
        summary = ElementTree.Element(Summary.SUMMARY,
                                      {'version': str(Summary.XML_VERSION)})
        for end in self.lane_results:
            for lane in end.values():
                summary.append(lane.get_elements())
        return summary

    def set_elements(self, tree):
        """
        Load lane results from a Summary element.

        A tree with a version newer than XML_VERSION is still read, after
        logging a warning.

        :raises ValueError: if tree is not a Summary element
        """
        if tree.tag != Summary.SUMMARY:
            raise ValueError("Expected %s" % (Summary.SUMMARY,))
        xml_version = int(tree.attrib.get('version', 0))
        if xml_version > Summary.XML_VERSION:
            logging.warning('Summary XML tree is a higher version than this class')
        for element in list(tree):
            lrs = Summary.LaneResultSummary()
            lrs.set_elements(element)
            while len(self.lane_results) <= lrs.end:
                self.lane_results.append({})
            self.lane_results[lrs.end][lrs.lane] = lrs

    def is_paired_end(self):
        """
        True when results for exactly two ends have been loaded.
        """
        return len(self.lane_results) == 2

    def dump(self):
        """
        Debugging function, report current object
        """
        pass
249
def tonumber(v):
    """
    Convert a value to int if its an int otherwise a float.

    :param v: value to convert, e.g. the text of a table cell
    :returns: int(v) when that succeeds, float(v) otherwise
    :raises ValueError: if v can be read as neither an int nor a float
    """
    try:
        return int(v)
    except ValueError:
        # not an integer literal, fall back to a float
        return float(v)
259
def parse_mean_range(value):
    """
    Parse values like 123 +/- 4.5

    :param value: text of one table cell
    :returns: a (mean, deviation) tuple for 'mean +/- deviation' cells,
              (nan, nan) for 'unknown' or a bare '+/-', and a single
              number (not a tuple) for a cell holding just one value
    :raises RuntimeError: if the cell does not have one of those shapes
    """
    if value.strip() == 'unknown':
        return nan, nan

    values = value.split()
    if len(values) == 1:
        if values[0] == '+/-':
            return nan, nan
        return tonumber(values[0])

    # check the shape before unpacking, so empty or otherwise malformed
    # cells are reported as a format change instead of an unpacking error
    if len(values) != 3 or values[1] != '+/-':
        raise RuntimeError("Summary.htm file format changed")
    average, _, deviation = values
    return tonumber(average), tonumber(deviation)
278
def make_mean_range_element(parent, name, mean, deviation):
    """
    Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>

    :param parent: element the new subelement is appended to
    :param name: tag of the new subelement
    :param mean: value stored, as text, in the 'mean' attribute
    :param deviation: value stored, as text, in the 'deviation' attribute
    :returns: the newly created subelement
    """
    # unicode on Python 2, str on Python 3
    text_type = type(u'')
    attributes = {
        'mean': text_type(mean),
        'deviation': text_type(deviation),
    }
    return ElementTree.SubElement(parent, name, attributes)
287
def parse_mean_range_element(element):
    """
    Read the 'mean' and 'deviation' attributes of element and return
    them as a (mean, deviation) tuple of numbers.
    """
    attributes = element.attrib
    mean = tonumber(attributes['mean'])
    deviation = tonumber(attributes['deviation'])
    return (mean, deviation)
294
def parse_summary_element(element):
    """
    Return the value stored in a summary element.

    An element carrying attributes is read as a mean/deviation element;
    one without any attributes yields its text.
    """
    if not element.attrib:
        # simple element, the value is the text itself
        return element.text
    return parse_mean_range_element(element)