72b3e5e058058e8e4715b0c033b314a8b25ff3ef
[htsworkflow.git] / htsworkflow / pipelines / summary.py
1 """
2 Analyze the Summary.htm file produced by GERALD
3 """
import logging
import types

from htsworkflow.pipelines.runfolder import ElementTree
from htsworkflow.util.ethelp import indent, flatten
8
class Summary(object):
    """
    Extract some useful information from the Summary.htm file

    The per-lane results are kept in ``lane_results``: a list with one
    dictionary per read end (index 0 for the first/only end, index 1
    for the second end of a paired run).  Each dictionary maps a lane
    number to a ``Summary.LaneResultSummary``.

    The object can be filled either from a Summary.htm file
    (``filename``) or from an element tree previously produced by
    ``get_elements`` (``xml``).
    """
    # value of the 'version' attribute written by get_elements
    XML_VERSION = 3
    # tag name of the root element written by get_elements
    SUMMARY = 'Summary'

    class LaneResultSummary(object):
        """
        Parse the LaneResultSummary table out of Summary.htm
        Mostly for the cluster number
        """
        LANE_RESULT_SUMMARY = 'LaneResultSummary'
        # maps XML tag name -> attribute name on this object
        TAGS = {
            'LaneYield': 'lane_yield',
            'Cluster': 'cluster',  # Raw
            'ClusterPF': 'cluster_pass_filter',
            'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
            'PercentIntensityAfter20Cycles':
                'percent_intensity_after_20_cycles',
            'PercentPassFilterClusters': 'percent_pass_filter_clusters',
            'PercentPassFilterAlign': 'percent_pass_filter_align',
            'AverageAlignmentScore': 'average_alignment_score',
            'PercentErrorRate': 'percent_error_rate',
        }

        def __init__(self, html=None, xml=None):
            """
            Create a lane result, optionally initialising it.

            :param html: one flattened table row (sequence of cell
                         strings) from Summary.htm
            :param xml: a LaneResultSummary element as produced by
                        get_elements
            """
            self.lane = None
            self.end = 0
            self.lane_yield = None
            self.cluster = None
            self.cluster_pass_filter = None
            self.average_first_cycle_intensity = None
            self.percent_intensity_after_20_cycles = None
            self.percent_pass_filter_clusters = None
            self.percent_pass_filter_align = None
            self.average_alignment_score = None
            self.percent_error_rate = None

            if html is not None:
                self.set_elements_from_html(html)
            if xml is not None:
                self.set_elements(xml)

        def set_elements_from_html(self, data):
            """
            Fill in this object from one row of the lane results table.

            :param data: sequence of 8 cells (pre-0.3 pipeline) or
                         10 cells (0.3 and later); the first cell is
                         the lane number
            :raises RuntimeError: if the row has any other length
            """
            cell_count = len(data)
            if cell_count not in (8, 10):
                raise RuntimeError("Summary.htm file format changed")

            # same in pre-0.3.0 Summary file and 0.3 summary file
            self.lane = int(data[0])

            if cell_count == 8:
                # this is the < 0.3 Pipeline version
                (self.cluster,
                 self.average_first_cycle_intensity,
                 self.percent_intensity_after_20_cycles,
                 self.percent_pass_filter_clusters,
                 self.percent_pass_filter_align,
                 self.average_alignment_score,
                 self.percent_error_rate,
                ) = [parse_mean_range(x) for x in data[1:]]
            else:
                # this is the >= 0.3 summary file
                parsed_data = [parse_mean_range(x) for x in data[2:]]
                # the yield cell is stored as-is, it is not a
                # "mean +/- deviation" value
                self.lane_yield = data[1]
                (self.cluster,
                 self.cluster_pass_filter,
                 self.average_first_cycle_intensity,
                 self.percent_intensity_after_20_cycles,
                 self.percent_pass_filter_clusters,
                 self.percent_pass_filter_align,
                 self.average_alignment_score,
                 self.percent_error_rate,
                ) = parsed_data

        def get_elements(self):
            """
            Return this lane result as a LaneResultSummary element.

            Attributes that are still None are left out.  Tuple/list
            values become mean/deviation elements, anything else is
            stored as the text of a sub element.
            """
            lane_result = ElementTree.Element(
                Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
                {'lane': str(self.lane), 'end': str(self.end)})
            for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
                value = getattr(self, variable_name)
                if value is None:
                    continue
                if isinstance(value, (tuple, list)):
                    # it looks like a (mean, deviation) sequence
                    make_mean_range_element(lane_result, tag, *value)
                else:
                    element = ElementTree.SubElement(lane_result, tag)
                    element.text = value
            return lane_result

        def set_elements(self, tree):
            """
            Fill in this object from a LaneResultSummary element.

            Child elements whose tag is not listed in TAGS are
            reported with a warning and skipped.

            :raises ValueError: if tree is not a LaneResultSummary
            """
            if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
                raise ValueError('Expected %s' % (
                        Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
            self.lane = int(tree.attrib['lane'])
            # default to the first end, for the older summary files
            # that are single ended
            self.end = int(tree.attrib.get('end', 0))
            tags = Summary.LaneResultSummary.TAGS
            for element in list(tree):
                # only the tag lookup is guarded, so that a KeyError
                # raised while parsing the element itself is not
                # misreported as an unknown tag
                variable_name = tags.get(element.tag)
                if variable_name is None:
                    logging.warning('Unrecognized tag %s', element.tag)
                    continue
                setattr(self, variable_name, parse_summary_element(element))

    def __init__(self, filename=None, xml=None):
        """
        :param filename: path of a Summary.htm file to parse
        :param xml: a Summary element as produced by get_elements
        """
        # lane results is a list of 1 or 2 ends containing
        # a dictionary of all the lanes reported in this
        # summary file
        self.lane_results = [{}]

        if filename is not None:
            self._extract_lane_results(filename)
        if xml is not None:
            self.set_elements(xml)

    def __getitem__(self, key):
        """Return the lane dictionary for read end ``key``."""
        return self.lane_results[key]

    def __len__(self):
        """Return the number of read ends stored."""
        return len(self.lane_results)

    def _ensure_end(self, end):
        """
        Make sure lane_results has a dictionary for read end ``end``.
        """
        while len(self.lane_results) < (end + 1):
            self.lane_results.append({})

    def _flattened_row(self, row):
        """
        flatten the children of a <tr>...</tr>
        """
        return [flatten(cell) for cell in list(row)]

    def _parse_table(self, table):
        """
        Return every row of the table, header rows included, as a
        list of flattened cell lists.
        """
        return [self._flattened_row(row) for row in list(table)]

    def _extract_named_tables(self, pathname):
        """
        extract all the 'named' tables from a Summary.htm file
        and return as a dictionary

        Named tables are <h2>...</h2><table>...</table> pairs
        The contents of the h2 tag is considered to be the name
        of the table.

        :raises RuntimeError: if the document has no <body> element
        """
        # tree = ElementTree.parse(pathname).getroot()
        # hack for 1.1rc1, this should be removed when possible.
        with open(pathname) as stream:
            file_body = stream.read()
        file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=')
        tree = ElementTree.fromstring(file_body)
        body = tree.find('body')
        if body is None:
            raise RuntimeError("Summary.htm file format changed")

        tables = {}
        children = list(body)
        # pair each element with its successor; a trailing <h2> simply
        # has no partner instead of indexing past the end
        for heading, candidate in zip(children, children[1:]):
            if heading.tag == 'h2' and candidate.tag == 'table':
                # we have an interesting table
                tables[flatten(heading)] = self._parse_table(candidate)
        return tables

    def _extract_lane_results(self, pathname):
        """
        Read the lane result tables found in pathname into
        lane_results.
        """
        tables = self._extract_named_tables(pathname)
        table_names = [('Lane Results Summary', 0),
                       ('Lane Results Summary : Read 1', 0),
                       ('Lane Results Summary : Read 2', 1)]
        for name, end in table_names:
            if name in tables:
                self._extract_lane_results_for_end(tables, name, end)

    def _extract_lane_results_for_end(self, tables, table_name, end):
        """
        extract the Lane Results Summary table

        :param tables: dictionary returned by _extract_named_tables
        :param table_name: key of the table to read
        :param end: index of the read end the table describes
        """
        # parse lane result summary
        lane_summary = tables[table_name]
        if lane_summary:
            # decide on the layout once, before any rows are removed,
            # so a table holding nothing but headers cannot fail
            row_width = len(lane_summary[-1])
            if row_width == 8:
                # this is version 1 of the summary file:
                # strip the header, keep the lane by lane data
                lane_summary = lane_summary[1:]
            elif row_width == 10:
                # this is version 2 of the summary file:
                # lane_summary[0] is a different less specific header
                # row and lane_summary[1] the real header.
                # after the last lane, there's a set of chip wide
                # averages, so only rows 2..9 are kept
                lane_summary = lane_summary[2:10]

        # append an extra dictionary if needed
        self._ensure_end(end)

        for row in lane_summary:
            lrs = Summary.LaneResultSummary(html=row)
            lrs.end = end
            self.lane_results[lrs.end][lrs.lane] = lrs

    def get_elements(self):
        """
        Return a Summary element holding one LaneResultSummary
        element for every lane of every end.
        """
        summary = ElementTree.Element(Summary.SUMMARY,
                                      {'version': str(Summary.XML_VERSION)})
        for end in self.lane_results:
            for lane in end.values():
                summary.append(lane.get_elements())
        return summary

    def set_elements(self, tree):
        """
        Load lane results from a Summary element.

        A tree whose version attribute is higher than XML_VERSION is
        still loaded, after logging a warning.

        :raises ValueError: if tree is not a Summary element
        """
        if tree.tag != Summary.SUMMARY:
            raise ValueError("Expected %s" % (Summary.SUMMARY,))
        xml_version = int(tree.attrib.get('version', 0))
        if xml_version > Summary.XML_VERSION:
            logging.warning(
                'Summary XML tree is a higher version than this class')
        for element in list(tree):
            lrs = Summary.LaneResultSummary()
            lrs.set_elements(element)
            logging.debug('Loaded lane result: end %s lane %s',
                          lrs.end, lrs.lane)
            self._ensure_end(lrs.end)
            self.lane_results[lrs.end][lrs.lane] = lrs

    def is_paired_end(self):
        """Return True when results for exactly two ends are stored."""
        return len(self.lane_results) == 2

    def dump(self):
        """
        Debugging function, report current object

        Currently a placeholder that does nothing.
        """
        pass
246
def tonumber(v):
    """
    Convert a value to int if its an int otherwise a float.

    Strings such as '12' become the int 12 and strings such as '4.5'
    become the float 4.5.  A float argument keeps its fractional part:
    it is only turned into an int when it holds a whole number.

    :raises ValueError: if v is a string that is neither an int
                        nor a float
    """
    if isinstance(v, float):
        # int() would silently truncate 2.5 to 2
        if v.is_integer():
            return int(v)
        return v
    try:
        return int(v)
    except ValueError:
        return float(v)
256
def parse_mean_range(value):
    """
    Split a "mean +/- deviation" cell, e.g. 123 +/- 4.5, into numbers.

    Returns a (mean, deviation) pair.  A cell that only says
    'unknown' is reported as (0, 0).  Raises RuntimeError when the
    middle token is not '+/-'.
    """
    if value.strip() == 'unknown':
        return (0, 0)

    mean_text, separator, deviation_text = value.split()
    if separator == '+/-':
        return (tonumber(mean_text), tonumber(deviation_text))
    raise RuntimeError("Summary.htm file format changed")
268
def make_mean_range_element(parent, name, mean, deviation):
    """
    Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>

    :param parent: element the new subelement is attached to
    :param name: tag of the new subelement
    :param mean: value stored, as text, in the 'mean' attribute
    :param deviation: value stored, as text, in the 'deviation' attribute
    :return: the newly created subelement
    """
    # str() gives the same text for numbers as unicode() did and,
    # unlike unicode(), also exists on Python 3
    attributes = {'mean': str(mean),
                  'deviation': str(deviation)}
    return ElementTree.SubElement(parent, name, attributes)
277
def parse_mean_range_element(element):
    """
    Read the 'mean' and 'deviation' attributes of element and
    return them as a (mean, deviation) pair of numbers.
    """
    attributes = element.attrib
    mean = tonumber(attributes['mean'])
    deviation = tonumber(attributes['deviation'])
    return (mean, deviation)
284
def parse_summary_element(element):
    """
    Return the value stored in a summary element.

    An element carrying attributes is treated as a mean/deviation
    element; one without attributes is a simple element whose text
    is returned unchanged.
    """
    if element.attrib:
        return parse_mean_range_element(element)
    return element.text