Test 1.1rc1 style runs, which unfortunately require a hack for parsing
[htsworkflow.git] / htsworkflow / pipelines / summary.py
1 """
2 Analyze the Summary.htm file produced by GERALD
3 """
import logging
import types

from htsworkflow.pipelines.runfolder import ElementTree
from htsworkflow.util.ethelp import indent, flatten
8
class Summary(object):
    """
    Extract some useful information from the Summary.htm file

    The object behaves like a read-only mapping from lane identifier to
    a LaneResultSummary.  It can be filled either from a Summary.htm file
    (filename=...) or from an ElementTree element previously produced by
    get_elements() (xml=...).
    """
    XML_VERSION = 2
    SUMMARY = 'Summary'

    class LaneResultSummary(object):
        """
        Parse the LaneResultSummary table out of Summary.htm
        Mostly for the cluster number
        """
        LANE_RESULT_SUMMARY = 'LaneResultSummary'
        # XML tag name -> attribute name on this object
        TAGS = {
          'LaneYield': 'lane_yield',
          'Cluster': 'cluster', # Raw
          'ClusterPF': 'cluster_pass_filter',
          'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
          'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
          'PercentPassFilterClusters': 'percent_pass_filter_clusters',
          'PercentPassFilterAlign': 'percent_pass_filter_align',
          'AverageAlignmentScore': 'average_alignment_score',
          'PercentErrorRate': 'percent_error_rate'
        }

        def __init__(self, html=None, xml=None):
            """
            Create an empty lane summary, optionally filling it in.

            html -- one flattened row of the Lane Results Summary table
            xml  -- a LaneResultSummary element as built by get_elements()
            """
            self.lane = None
            self.lane_yield = None
            self.cluster = None
            self.cluster_pass_filter = None
            self.average_first_cycle_intensity = None
            self.percent_intensity_after_20_cycles = None
            self.percent_pass_filter_clusters = None
            self.percent_pass_filter_align = None
            self.average_alignment_score = None
            self.percent_error_rate = None

            if html is not None:
                self.set_elements_from_html(html)
            if xml is not None:
                self.set_elements(xml)

        def set_elements_from_html(self, data):
            """
            Fill in the attributes from one row of table cells.

            Rows with 8 cells are treated as the pre-0.3 pipeline layout,
            rows with 10 cells as the 0.3 and later layout.  Any other
            length raises RuntimeError.
            """
            if len(data) not in (8, 10):
                raise RuntimeError("Summary.htm file format changed")

            # same in pre-0.3.0 Summary file and 0.3 summary file
            self.lane = data[0]

            if len(data) == 8:
                parsed_data = [ parse_mean_range(x) for x in data[1:] ]
                # this is the < 0.3 Pipeline version
                self.cluster = parsed_data[0]
                self.average_first_cycle_intensity = parsed_data[1]
                self.percent_intensity_after_20_cycles = parsed_data[2]
                self.percent_pass_filter_clusters = parsed_data[3]
                self.percent_pass_filter_align = parsed_data[4]
                self.average_alignment_score = parsed_data[5]
                self.percent_error_rate = parsed_data[6]
            else:
                parsed_data = [ parse_mean_range(x) for x in data[2:] ]
                # this is the >= 0.3 summary file
                # lane yield is kept as the raw cell text, not a mean/range
                self.lane_yield = data[1]
                self.cluster = parsed_data[0]
                self.cluster_pass_filter = parsed_data[1]
                self.average_first_cycle_intensity = parsed_data[2]
                self.percent_intensity_after_20_cycles = parsed_data[3]
                self.percent_pass_filter_clusters = parsed_data[4]
                self.percent_pass_filter_align = parsed_data[5]
                self.average_alignment_score = parsed_data[6]
                self.percent_error_rate = parsed_data[7]

        def get_elements(self):
            """
            Return this lane as a LaneResultSummary ElementTree element.

            Attributes that are None are left out.  Tuple/list values are
            written as mean/deviation attributes, anything else as the
            element text.
            """
            lane_result = ElementTree.Element(
                            Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
                            {'lane': self.lane})
            for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
                value = getattr(self, variable_name)
                if value is None:
                    continue
                # it looks like a sequence
                if isinstance(value, (tuple, list)):
                    make_mean_range_element(lane_result, tag, *value)
                else:
                    element = ElementTree.SubElement(lane_result, tag)
                    element.text = value
            return lane_result

        def set_elements(self, tree):
            """
            Fill in the attributes from a LaneResultSummary element.

            Raises ValueError if tree is not a LaneResultSummary element.
            Children with unknown tags, or whose mean/deviation attributes
            are incomplete, are logged and skipped.
            """
            if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
                raise ValueError('Expected %s' % (
                        Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
            self.lane = tree.attrib['lane']
            tags = Summary.LaneResultSummary.TAGS
            for element in tree:
                variable_name = tags.get(element.tag)
                if variable_name is None:
                    logging.warning('Unrecognized tag %s' % (element.tag,))
                    continue
                try:
                    value = parse_summary_element(element)
                except KeyError:
                    # known tag, but mean or deviation attribute is missing;
                    # skip it rather than report it as an unknown tag
                    logging.warning(
                        'Malformed %s element, missing mean or deviation' % (
                            element.tag,))
                    continue
                setattr(self, variable_name, value)

    def __init__(self, filename=None, xml=None):
        """
        filename -- path of a Summary.htm file to parse
        xml      -- a Summary element as built by get_elements()
        """
        self.lane_results = {}

        if filename is not None:
            self._extract_lane_results(filename)
        if xml is not None:
            self.set_elements(xml)

    def __getitem__(self, key):
        return self.lane_results[key]

    def __len__(self):
        return len(self.lane_results)

    def keys(self):
        return self.lane_results.keys()

    def values(self):
        return self.lane_results.values()

    def items(self):
        return self.lane_results.items()

    def _flattened_row(self, row):
        """
        flatten the children of a <tr>...</tr>
        """
        # iterate the element directly; Element.getchildren() is gone
        # from newer ElementTree releases
        return [flatten(x) for x in row]

    def _parse_table(self, table):
        """
        Return every row of the table, header rows included, as a list
        of flattened cell lists.
        """
        return [self._flattened_row(r) for r in table]

    def _extract_named_tables(self, pathname):
        """
        extract all the 'named' tables from a Summary.htm file
        and return as a dictionary

        Named tables are <h2>...</h2><table>...</table> pairs
        The contents of the h2 tag is considered to the name
        of the table.

        Raises RuntimeError if the document has no <body> element.
        """
        # tree = ElementTree.parse(pathname).getroot()
        # hack for 1.1rc1, this should be removed when possible.
        # The 1.1rc1 file contains a bare '<' that is not well formed.
        with open(pathname) as stream:
            file_body = stream.read()
        file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=')
        tree = ElementTree.fromstring(file_body)
        body = tree.find('body')
        if body is None:
            raise RuntimeError("Summary.htm file format changed")

        tables = {}
        children = list(body)
        # pair each element with its successor so a trailing <h2> can
        # never index past the end of the body
        for heading, table in zip(children, children[1:]):
            if heading.tag == 'h2' and table.tag == 'table':
                # we have an interesting table
                tables[flatten(heading)] = self._parse_table(table)
        return tables

    def _extract_lane_results(self, pathname):
        """
        extract the Lane Results Summary table
        """
        tables = self._extract_named_tables(pathname)

        # parse lane result summary
        lane_summary = tables['Lane Results Summary']
        columns = len(lane_summary[-1])
        if columns == 8:
            # this is version 1 of the summary file
            # strip the single header row, the rest is lane by lane data
            lane_summary = lane_summary[1:]
        elif columns == 10:
            # this is version 2 of the summary file
            # lane_summary[0] is a different less specific header row and
            # lane_summary[1] the real header; after the last lane, there's
            # a set of chip wide averages, so only keep rows 2 through 9
            lane_summary = lane_summary[2:10]

        for r in lane_summary:
            lrs = Summary.LaneResultSummary(html=r)
            self.lane_results[lrs.lane] = lrs

    def get_elements(self):
        """
        Return the whole summary as a Summary ElementTree element with
        one LaneResultSummary child per lane.
        """
        summary = ElementTree.Element(Summary.SUMMARY,
                                      {'version': str(Summary.XML_VERSION)})
        for lane in self.lane_results.values():
            summary.append(lane.get_elements())
        return summary

    def set_elements(self, tree):
        """
        Load lane results from a Summary element.

        Raises ValueError if tree is not a Summary element.  A version
        attribute newer than XML_VERSION only produces a warning.
        """
        if tree.tag != Summary.SUMMARY:
            raise ValueError("Expected %s" % (Summary.SUMMARY,))
        xml_version = int(tree.attrib.get('version', 0))
        if xml_version > Summary.XML_VERSION:
            logging.warning(
                'Summary XML tree is a higher version than this class')
        for element in tree:
            lrs = Summary.LaneResultSummary()
            lrs.set_elements(element)
            self.lane_results[lrs.lane] = lrs

    def dump(self):
        """
        Debugging function, report current object
        """
        pass
230
def tonumber(v):
    """
    Convert a value to int if its an int otherwise a float.

    The value is tried as an int first; if int() rejects it with a
    ValueError (for example the text '4.5') it is converted with float()
    instead.  A ValueError from float() is not caught, so text that is
    not a number at all raises ValueError.
    """
    try:
        return int(v)
    except ValueError:
        return float(v)
240
def parse_mean_range(value):
    """
    Turn text such as '123 +/- 4.5' into a (mean, deviation) pair.

    The text 'unknown' (ignoring surrounding whitespace) gives (0, 0).
    The text must split into exactly three whitespace separated parts;
    RuntimeError is raised when the middle part is not '+/-'.
    """
    if value.strip() != 'unknown':
        average, separator, deviation = value.split()
        if separator == '+/-':
            return tonumber(average), tonumber(deviation)
        raise RuntimeError("Summary.htm file format changed")
    return 0, 0
252
def make_mean_range_element(parent, name, mean, deviation):
    """
    Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>

    The new element is appended to parent and returned.  mean and
    deviation are stored as their str() text so the attributes can be
    serialized.
    """
    # str() rather than unicode(): same text for numbers, and the name
    # exists on every Python version
    attributes = {'mean': str(mean),
                  'deviation': str(deviation)}
    return ElementTree.SubElement(parent, name, attributes)
261
def parse_mean_range_element(element):
    """
    Read the 'mean' and 'deviation' attributes of element and return
    them as a (mean, deviation) tuple of numbers.
    """
    attributes = element.attrib
    mean = tonumber(attributes['mean'])
    deviation = tonumber(attributes['deviation'])
    return (mean, deviation)
268
def parse_summary_element(element):
    """
    Return the value stored in a summary element.

    An element carrying any attributes is read as a mean/deviation
    element; one without attributes yields its text.
    """
    if element.attrib:
        return parse_mean_range_element(element)
    return element.text