2 Analyze the Summary.htm file produced by GERALD
6 from htsworkflow.pipelines.runfolder import ElementTree
7 from htsworkflow.util.ethelp import indent, flatten
11 Extract some useful information from the Summary.htm file
16 class LaneResultSummary(object):
18 Parse the LaneResultSummary table out of Summary.htm
19 Mostly for the cluster number
21 LANE_RESULT_SUMMARY = 'LaneResultSummary'
23 'LaneYield': 'lane_yield',
24 'Cluster': 'cluster', # Raw
25 'ClusterPF': 'cluster_pass_filter',
26 'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
27 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
28 'PercentPassFilterClusters': 'percent_pass_filter_clusters',
29 'PercentPassFilterAlign': 'percent_pass_filter_align',
30 'AverageAlignmentScore': 'average_alignment_score',
31 'PercentErrorRate': 'percent_error_rate'
def __init__(self, html=None, xml=None):
    """
    Start with every field unset, then load from whichever source
    was supplied.

    html, when given, is handed to set_elements_from_html();
    xml, when given, is handed to set_elements().
    """
    self.lane = None
    # every attribute named in TAGS (lane_yield, cluster, ...) starts
    # out as None until one of the loaders fills it in
    for attribute_name in Summary.LaneResultSummary.TAGS.values():
        setattr(self, attribute_name, None)

    if html is not None:
        self.set_elements_from_html(html)
    if xml is not None:
        self.set_elements(xml)
def set_elements_from_html(self, data):
    """
    Fill in this lane's fields from one row of flattened table cells.

    data holds 8 cells (pipeline < 0.3) or 10 cells (pipeline >= 0.3).
    The first cell is stored as the lane. In the 10 cell layout the
    second cell is stored unparsed as lane_yield. Every remaining cell
    goes through parse_mean_range().

    Raises RuntimeError if data has any other length.
    """
    cell_count = len(data)
    if cell_count not in (8, 10):
        raise RuntimeError("Summary.htm file format changed")

    # same in pre-0.3.0 Summary file and 0.3 summary file
    self.lane = data[0]

    if cell_count == 8:
        # this is the < 0.3 Pipeline version
        (self.cluster,
         self.average_first_cycle_intensity,
         self.percent_intensity_after_20_cycles,
         self.percent_pass_filter_clusters,
         self.percent_pass_filter_align,
         self.average_alignment_score,
         self.percent_error_rate) = [parse_mean_range(cell)
                                     for cell in data[1:]]
    else:
        # this is the >= 0.3 summary file
        measurements = [parse_mean_range(cell) for cell in data[2:]]
        self.lane_yield = data[1]
        (self.cluster,
         self.cluster_pass_filter,
         self.average_first_cycle_intensity,
         self.percent_intensity_after_20_cycles,
         self.percent_pass_filter_clusters,
         self.percent_pass_filter_align,
         self.average_alignment_score,
         self.percent_error_rate) = measurements
81 def get_elements(self):
82 lane_result = ElementTree.Element(
83 Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
85 for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
86 value = getattr(self, variable_name)
89 # it looks like a sequence
90 elif type(value) in (types.TupleType, types.ListType):
91 element = make_mean_range_element(
97 element = ElementTree.SubElement(lane_result, tag)
def set_elements(self, tree):
    """
    Load this lane's fields from a LaneResultSummary XML element.

    The lane is read from the element's 'lane' attribute. Each child
    whose tag appears in TAGS is parsed with parse_summary_element()
    and stored on the attribute TAGS names for it. A KeyError raised
    while handling a child is logged as an unrecognized tag and the
    child is skipped.

    Raises ValueError if tree's tag is not LANE_RESULT_SUMMARY.
    """
    expected_tag = Summary.LaneResultSummary.LANE_RESULT_SUMMARY
    if tree.tag != expected_tag:
        raise ValueError('Expected %s' % (expected_tag,))
    self.lane = tree.attrib['lane']
    known_tags = Summary.LaneResultSummary.TAGS
    for child in list(tree):
        try:
            setattr(self, known_tags[child.tag],
                    parse_summary_element(child))
        except KeyError:
            logging.warn('Unrecognized tag %s' % (child.tag,))
def __init__(self, filename=None, xml=None):
    """
    Create an empty summary, then load lane results from a
    Summary.htm file and/or a previously built XML element.

    filename, when given, is handed to _extract_lane_results();
    xml, when given, is handed to set_elements().
    """
    self.lane_results = {}

    # (source, loader) pairs, applied in this order when present
    loaders = ((filename, self._extract_lane_results),
               (xml, self.set_elements))
    for source, load in loaders:
        if source is not None:
            load(source)
def __getitem__(self, key):
    """
    Return the lane result stored under key in self.lane_results.
    """
    results_by_lane = self.lane_results
    return results_by_lane[key]
127 return len(self.lane_results)
130 return self.lane_results.keys()
133 return self.lane_results.values()
136 return self.lane_results.items()
def _flattened_row(self, row):
    """
    flatten the children of a <tr>...</tr>

    Returns a list holding the flatten() result for each direct
    child of row, in document order.
    """
    # Iterate the element itself: Element.getchildren() is deprecated
    # and was removed in Python 3.9.
    return [flatten(cell) for cell in row]
def _parse_table(self, table):
    """
    assumes the first line is the header of a table,
    and that the remaining rows are data

    Returns one list of flattened cells per direct child of table,
    in document order. The header row is returned along with the
    data rows; it is not removed here.
    """
    # Iterate the element itself: Element.getchildren() is deprecated
    # and was removed in Python 3.9.
    return [self._flattened_row(row) for row in table]
def _extract_named_tables(self, pathname):
    """
    extract all the 'named' tables from a Summary.htm file
    and return as a dictionary

    Named tables are <h2>...</h2><table>...</table> pairs
    The contents of the h2 tag is considered to be the name
    of the table.

    Returns a dict mapping the flattened h2 text to the rows
    produced by _parse_table() for the table that follows it.
    """
    from xml.sax.saxutils import escape

    # tree = ElementTree.parse(pathname).getroot()
    # hack for 1.1rc1, this should be removed when possible.
    stream = open(pathname)
    try:
        file_body = stream.read()
    finally:
        # always release the file handle, even if the read fails
        stream.close()
    # A bare '<' inside text is not well-formed XML, so the marker has
    # to be entity-escaped before the parser sees it.
    chastity_marker = 'CHASTITY<='
    file_body = file_body.replace(chastity_marker, escape(chastity_marker))
    tree = ElementTree.fromstring(file_body)
    body = tree.find('body')

    tables = {}
    children = list(body)
    # Pair each child with its successor; the final child has no
    # successor, so a trailing <h2> is simply not a named table.
    for heading, candidate in zip(children, children[1:]):
        if heading.tag == 'h2' and candidate.tag == 'table':
            # we have an interesting table
            tables[flatten(heading)] = self._parse_table(candidate)
    return tables
def _extract_lane_results(self, pathname):
    """
    extract the Lane Results Summary table

    Builds one LaneResultSummary per lane row and stores it in
    self.lane_results keyed by its lane.

    Raises KeyError if the file has no 'Lane Results Summary' table
    and IndexError if that table has no rows.
    """
    tables = self._extract_named_tables(pathname)

    # parse lane result summary
    lane_summary = tables['Lane Results Summary']

    # The width of the last row identifies the layout. Decide once:
    # testing again after slicing would index an empty list when a
    # version 1 table holds nothing but its header row.
    last_row_width = len(lane_summary[-1])
    if last_row_width == 8:
        # this is version 1 of the summary file:
        # strip the header, keep the lane by lane data
        lane_summary = lane_summary[1:]
    elif last_row_width == 10:
        # this is version 2 of the summary file:
        # lane_summary[0] is a different less specific header row and
        # lane_summary[1] is the header; after the last lane, there's
        # a set of chip wide averages, so only rows 2 through 9 are kept
        lane_summary = lane_summary[2:10]

    for row in lane_summary:
        lrs = Summary.LaneResultSummary(html=row)
        self.lane_results[lrs.lane] = lrs
def get_elements(self):
    """
    Build the XML element for this summary.

    The element is tagged Summary.SUMMARY, carries the XML version as
    its 'version' attribute and gets one child per stored lane result.
    """
    attributes = {'version': unicode(Summary.XML_VERSION)}
    summary = ElementTree.Element(Summary.SUMMARY, attributes)
    for lane_result in self.lane_results.values():
        summary.append(lane_result.get_elements())
    return summary
def set_elements(self, tree):
    """
    Load lane results from a Summary XML element.

    Each child of tree is handed to a new LaneResultSummary, which is
    then stored in self.lane_results keyed by its lane. A warning is
    logged when the element's 'version' attribute is higher than
    Summary.XML_VERSION; a missing attribute counts as version 0.

    Raises ValueError if tree's tag is not Summary.SUMMARY.
    """
    if tree.tag != Summary.SUMMARY:
        # raise rather than return: a returned exception object is
        # ignored by callers and the wrong element would be accepted
        raise ValueError("Expected %s" % (Summary.SUMMARY,))
    xml_version = int(tree.attrib.get('version', 0))
    if xml_version > Summary.XML_VERSION:
        logging.warning('Summary XML tree is a higher version than this class')
    for element in list(tree):
        lrs = Summary.LaneResultSummary()
        lrs.set_elements(element)
        self.lane_results[lrs.lane] = lrs
227 Debugging function, report current object
233 Convert a value to int if its an int otherwise a float.
237 except ValueError, e:
241 def parse_mean_range(value):
243 Parse values like 123 +/- 4.5
245 if value.strip() == 'unknown':
248 average, pm, deviation = value.split()
250 raise RuntimeError("Summary.htm file format changed")
251 return tonumber(average), tonumber(deviation)
def make_mean_range_element(parent, name, mean, deviation):
    """
    Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>

    The new element is attached to parent and returned; both values
    are stored as unicode attribute strings.
    """
    attributes = {
        'mean': unicode(mean),
        'deviation': unicode(deviation),
    }
    return ElementTree.SubElement(parent, name, attributes)
def parse_mean_range_element(element):
    """
    Grab mean/deviation out of element

    Returns a (mean, deviation) tuple, each attribute converted
    with tonumber().
    """
    attributes = element.attrib
    mean = tonumber(attributes['mean'])
    deviation = tonumber(attributes['deviation'])
    return (mean, deviation)
269 def parse_summary_element(element):
271 Determine if we have a simple element or a mean/deviation element
273 if len(element.attrib) > 0:
274 return parse_mean_range_element(element)