2 Analyze the Summary.htm file produced by GERALD
6 from pprint import pprint
8 from htsworkflow.pipelines.runfolder import ElementTree
9 from htsworkflow.util.ethelp import indent, flatten
13 class Summary(object):
15 Extract some useful information from the Summary.htm file
20 class LaneResultSummary(object):
22 Parse the LaneResultSummary table out of Summary.htm
23 Mostly for the cluster number
25 LANE_RESULT_SUMMARY = 'LaneResultSummary'
27 'LaneYield': 'lane_yield',
28 'Cluster': 'cluster', # Raw
29 'ClusterPF': 'cluster_pass_filter',
30 'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
31 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
32 'PercentPassFilterClusters': 'percent_pass_filter_clusters',
33 'PercentPassFilterAlign': 'percent_pass_filter_align',
34 'AverageAlignmentScore': 'average_alignment_score',
35 'PercentErrorRate': 'percent_error_rate'
38 def __init__(self, html=None, xml=None):
41 self.lane_yield = None
43 self.cluster_pass_filter = None
44 self.average_first_cycle_intensity = None
45 self.percent_intensity_after_20_cycles = None
46 self.percent_pass_filter_clusters = None
47 self.percent_pass_filter_align = None
48 self.average_alignment_score = None
49 self.percent_error_rate = None
52 self.set_elements_from_html(html)
54 self.set_elements(xml)
def set_elements_from_html(self, data):
    """
    Fill in this lane summary from one flattened row of the HTML table.

    data is a sequence of cell strings whose first cell is the lane
    number. Rows with 8 cells use the < 0.3 pipeline layout; rows with
    10 cells use the >= 0.3 layout, which also carries the lane yield
    and the pass filter cluster count.

    Raises RuntimeError when the row has any other number of cells.
    """
    column_count = len(data)
    if column_count not in (8, 10):
        raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (column_count,))

    # same in pre-0.3.0 Summary file and 0.3 summary file
    self.lane = int(data[0])

    if column_count == 8:
        # this is the < 0.3 Pipeline version
        parsed_data = [parse_mean_range(x) for x in data[1:]]
        (self.cluster,
         self.average_first_cycle_intensity,
         self.percent_intensity_after_20_cycles,
         self.percent_pass_filter_clusters,
         self.percent_pass_filter_align,
         self.average_alignment_score,
         self.percent_error_rate) = parsed_data
    else:
        # this is the >= 0.3 summary file
        parsed_data = [parse_mean_range(x) for x in data[2:]]
        # the yield cell is stored exactly as it appears in the row
        self.lane_yield = data[1]
        (self.cluster,
         self.cluster_pass_filter,
         self.average_first_cycle_intensity,
         self.percent_intensity_after_20_cycles,
         self.percent_pass_filter_clusters,
         self.percent_pass_filter_align,
         self.average_alignment_score,
         self.percent_error_rate) = parsed_data
86 def get_elements(self):
87 lane_result = ElementTree.Element(
88 Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
89 {'lane': str(self.lane), 'end': str(self.end)})
90 for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
91 value = getattr(self, variable_name)
94 # it looks like a sequence
95 elif type(value) in (types.TupleType, types.ListType):
96 element = make_mean_range_element(
102 element = ElementTree.SubElement(lane_result, tag)
def set_elements(self, tree):
    """
    Load this lane summary from a LaneResultSummary element.

    The lane number comes from the required 'lane' attribute and the end
    from the optional 'end' attribute. Each child element whose tag is a
    key of TAGS is parsed with parse_summary_element and stored on the
    attribute that TAGS names for it.

    Raises ValueError when tree is not a LaneResultSummary element.
    """
    if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
        raise ValueError('Expected %s' % (
                Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
    self.lane = int(tree.attrib['lane'])
    # default to the first end, for the older summary files
    # that are single ended
    self.end = int(tree.attrib.get('end', 0))
    tags = Summary.LaneResultSummary.TAGS
    for element in list(tree):
        try:
            variable_name = tags[element.tag]
            setattr(self, variable_name,
                    parse_summary_element(element))
        except KeyError:
            # a child this class does not know about is reported and skipped
            logging.warning('Unrecognized tag %s', element.tag)
123 def __init__(self, filename=None, xml=None):
124 # lane results is a list of 1 or 2 ends containing
125 # a dictionary of all the lanes reported in this
127 self.lane_results = [{}]
129 if filename is not None:
130 self._extract_lane_results(filename)
132 self.set_elements(xml)
134 def __getitem__(self, key):
135 return self.lane_results[key]
138 return len(self.lane_results)
def _flattened_row(self, row):
    """
    flatten the children of a <tr>...</tr>

    Returns a list holding the result of flatten() for each child of row.
    """
    # iterate the element itself; Element.getchildren() is deprecated
    return [flatten(cell) for cell in row]
146 def _parse_table(self, table):
148 assumes the first line is the header of a table,
149 and that the remaining rows are data
151 rows = table.getchildren()
154 data.append(self._flattened_row(r))
def _extract_named_tables(self, pathname):
    """
    extract all the 'named' tables from a Summary.htm file
    and return as a dictionary

    Named tables are <h2>...</h2><table>...</table> pairs
    The contents of the h2 tag is considered to be the name
    of the table.

    Returns a dictionary mapping each table name to the value
    self._parse_table returns for the table that follows it.
    """
    # tree = ElementTree.parse(pathname).getroot()
    # hack for 1.1rc1, this should be removed when possible.
    with open(pathname) as stream:
        file_body = stream.read()
    # a bare '<' in text is not well-formed markup, so escape it
    # before the text is handed to the parser
    file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=')
    tree = ElementTree.fromstring(file_body)
    body = tree.find('body')

    tables = {}
    children = list(body)
    # pair each element with its successor, so a trailing <h2> with
    # nothing after it cannot index past the end of the body
    for heading, following in zip(children, children[1:]):
        if heading.tag == 'h2' and following.tag == 'table':
            # we have an interesting table
            name = flatten(heading)
            tables[name] = self._parse_table(following)
    return tables
182 def _extract_lane_results(self, pathname):
183 tables = self._extract_named_tables(pathname)
184 table_names = [ ('Lane Results Summary', 0),
185 ('Lane Results Summary : Read 1', 0),
186 ('Lane Results Summary : Read 2', 1),]
187 for name, end in table_names:
188 if tables.has_key(name):
189 self._extract_lane_results_for_end(tables, name, end)
191 logging.warning("No Lane Results Summary Found in %s" % (pathname,))
193 def _extract_lane_results_for_end(self, tables, table_name, end):
195 extract the Lane Results Summary table
197 # parse lane result summary
198 lane_summary = tables[table_name]
199 # this is version 1 of the summary file
200 if len(lane_summary[-1]) == 8:
202 headers = lane_summary[0]
203 # grab the lane by lane data
204 lane_summary = lane_summary[1:]
206 # len(lane_summary[-1] = 10 is version 2 of the summary file
207 # = 9 is version 3 of the Summary.htm file
208 elif len(lane_summary[-1]) in (9, 10):
209 # lane_summary[0] is a different less specific header row
210 headers = lane_summary[1]
211 lane_summary = lane_summary[2:10]
212 # after the last lane, there's a set of chip wide averages
214 # append an extra dictionary if needed
215 if len(self.lane_results) < (end + 1):
216 self.lane_results.append({})
218 for r in lane_summary:
219 lrs = Summary.LaneResultSummary(html=r)
221 self.lane_results[lrs.end][lrs.lane] = lrs
def get_elements(self):
    """
    Build an element tree holding every lane result of every end.

    The root element is tagged Summary.SUMMARY and carries the
    Summary.XML_VERSION as its 'version' attribute; each lane contributes
    the element returned by its own get_elements().

    Returns the root element.
    """
    summary = ElementTree.Element(Summary.SUMMARY,
                                  {'version': unicode(Summary.XML_VERSION)})
    for end in self.lane_results:
        for lane in end.values():
            summary.append(lane.get_elements())
    # hand the finished tree back to the caller
    return summary
def set_elements(self, tree):
    """
    Load lane results from a Summary element tree.

    Each child of tree is handed to a new LaneResultSummary, which is
    then stored in self.lane_results[end][lane].

    Raises ValueError when the root tag is not Summary.SUMMARY.
    """
    if tree.tag != Summary.SUMMARY:
        # raise rather than return the exception, so a wrong tree
        # is not silently accepted
        raise ValueError("Expected %s" % (Summary.SUMMARY,))
    xml_version = int(tree.attrib.get('version', 0))
    if xml_version > Summary.XML_VERSION:
        logging.warning('Summary XML tree is a higher version than this class')
    for element in list(tree):
        lrs = Summary.LaneResultSummary()
        lrs.set_elements(element)
        # grow the per-end list until there is a slot for this end
        while len(self.lane_results) < (lrs.end + 1):
            self.lane_results.append({})
        self.lane_results[lrs.end][lrs.lane] = lrs
244 def is_paired_end(self):
245 return len(self.lane_results) == 2
249 Debugging function, report current object
255 Convert a value to int if its an int otherwise a float.
259 except ValueError, e:
263 def parse_mean_range(value):
265 Parse values like 123 +/- 4.5
267 if value.strip() == 'unknown':
270 values = value.split()
272 if values[0] == '+/-':
275 return tonumber(values[0])
277 average, pm, deviation = values
279 raise RuntimeError("Summary.htm file format changed")
280 return tonumber(average), tonumber(deviation)
def make_mean_range_element(parent, name, mean, deviation):
    """
    Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>

    The subelement is created under parent, with mean and deviation
    stored as unicode attribute values.

    Returns the new subelement.
    """
    attributes = {'mean': unicode(mean),
                  'deviation': unicode(deviation)}
    # callers assign the result, so hand the element back
    return ElementTree.SubElement(parent, name, attributes)
def parse_mean_range_element(element):
    """
    Read the mean and deviation attributes back out of an element.

    Returns a (mean, deviation) tuple, each converted with tonumber.
    """
    attributes = element.attrib
    mean = tonumber(attributes['mean'])
    deviation = tonumber(attributes['deviation'])
    return (mean, deviation)
298 def parse_summary_element(element):
300 Determine if we have a simple element or a mean/deviation element
302 if len(element.attrib) > 0:
303 return parse_mean_range_element(element)