2 Analyze the Summary.htm file produced by GERALD
6 from pprint import pprint
8 from htsworkflow.pipelines.runfolder import ElementTree
9 from htsworkflow.util.ethelp import indent, flatten
11 LOGGER = logging.getLogger(__name__)
14 class Summary(object):
16 Extract some useful information from the Summary.htm file
21 class LaneResultSummary(object):
23 Parse the LaneResultSummary table out of Summary.htm
24 Mostly for the cluster number
26 LANE_RESULT_SUMMARY = 'LaneResultSummary'
28 'LaneYield': 'lane_yield',
29 'Cluster': 'cluster', # Raw
30 'ClusterPF': 'cluster_pass_filter',
31 'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
32 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
33 'PercentPassFilterClusters': 'percent_pass_filter_clusters',
34 'PercentPassFilterAlign': 'percent_pass_filter_align',
35 'AverageAlignmentScore': 'average_alignment_score',
36 'PercentErrorRate': 'percent_error_rate'
38 # These are tags that have mean/stdev as found in the GERALD Summary.xml file
40 #'laneYield': 'lane_yield', #this is just a number
41 'clusterCountRaw': 'cluster', # Raw
42 'clusterCountPF': 'cluster_pass_filter',
43 'oneSig': 'average_first_cycle_intensity',
44 'signal20AsPctOf1': 'percent_intensity_after_20_cycles',
45 'percentClustersPF': 'percent_pass_filter_clusters',
46 'percentUniquelyAlignedPF': 'percent_pass_filter_align',
47 'averageAlignScorePF': 'average_alignment_score',
48 'errorPF': 'percent_error_rate'
def __init__(self, html=None, xml=None):
    """
    Create a lane summary with every statistic unset.

    html, when given, is passed to set_elements_from_html.
    xml, when given, is passed to set_elements.
    """
    self.lane = None
    # NOTE(review): the initial value of `end` is not visible in this
    # view; 0 mirrors the "first end" default used by set_elements --
    # confirm against the full file.
    self.end = 0
    for statistic in ('lane_yield',
                      'cluster',
                      'cluster_pass_filter',
                      'average_first_cycle_intensity',
                      'percent_intensity_after_20_cycles',
                      'percent_pass_filter_clusters',
                      'percent_pass_filter_align',
                      'average_alignment_score',
                      'percent_error_rate'):
        setattr(self, statistic, None)

    if html is not None:
        self.set_elements_from_html(html)
    if xml is not None:
        self.set_elements(xml)
def set_elements_from_html(self, data):
    """
    Fill in this lane's statistics from one row of a Summary.htm table.

    data is the sequence of cell values for the row, lane number first.
    Rows with 8 cells are the layout used before pipeline 0.3; rows with
    10 cells are the 0.3 and later layout, which adds the lane yield and
    the pass filter cluster count.

    Raises RuntimeError when the row has any other number of cells.
    """
    column_count = len(data)
    if column_count not in (8, 10):
        raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (column_count,))

    # same in pre-0.3.0 Summary file and 0.3 summary file
    self.lane = int(data[0])

    if column_count == 8:
        # this is the < 0.3 Pipeline version
        first_statistic = 1
        attributes = ('cluster',
                      'average_first_cycle_intensity',
                      'percent_intensity_after_20_cycles',
                      'percent_pass_filter_clusters',
                      'percent_pass_filter_align',
                      'average_alignment_score',
                      'percent_error_rate')
    else:
        # this is the >= 0.3 summary file
        first_statistic = 2
        attributes = ('cluster',
                      'cluster_pass_filter',
                      'average_first_cycle_intensity',
                      'percent_intensity_after_20_cycles',
                      'percent_pass_filter_clusters',
                      'percent_pass_filter_align',
                      'average_alignment_score',
                      'percent_error_rate')

    # parse every cell before storing anything, so a malformed cell
    # leaves the statistics untouched
    parsed_data = [parse_mean_range(x) for x in data[first_statistic:]]
    if column_count == 10:
        # the lane yield cell is stored as found, it is not parsed
        self.lane_yield = data[1]
    for attribute, value in zip(attributes, parsed_data):
        setattr(self, attribute, value)
def set_elements_from_gerald_xml(self, read, element):
    """
    Fill in this lane's statistics from a GERALD Summary.xml lane element.

    read is stored as the end this lane belongs to.
    element is searched for laneNumber, an optional laneYield and one
    child for each name in GERALD_TAGS.
    """
    self.lane = int(element.find('laneNumber').text)
    self.end = read

    yield_node = element.find('laneYield')
    self.lane_yield = None if yield_node is None else int(yield_node.text)

    gerald_tags = Summary.LaneResultSummary.GERALD_TAGS
    for gerald_name, attribute in gerald_tags.items():
        node = element.find(gerald_name)
        if node is None:
            LOGGER.info("Couldn't find %s", gerald_name)
        # a missing node is still handed on to parse_xml_mean_range
        setattr(self, attribute, parse_xml_mean_range(node))
def get_elements(self):
    """
    Build a LaneResultSummary element describing this lane.

    The lane and end are written as attributes.  Each statistic named in
    TAGS becomes a child element: sequences are written with
    make_mean_range_element, other values as the text of the child.
    Statistics that are None are left out.
    """
    lane_result = ElementTree.Element(
        Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
        {'lane': unicode(self.lane), 'end': unicode(self.end)})
    for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
        value = getattr(self, variable_name)
        if value is None:
            continue
        # isinstance also accepts subclasses of tuple and list, and does
        # not depend on the Python 2 only types.TupleType/ListType names
        if isinstance(value, (tuple, list)):
            # it looks like a sequence
            make_mean_range_element(lane_result, tag, *value)
        else:
            element = ElementTree.SubElement(lane_result, tag)
            element.text = unicode(value)
    return lane_result
def set_elements(self, tree):
    """
    Load this lane's values from a LaneResultSummary element.

    tree must be an element tagged LANE_RESULT_SUMMARY, otherwise
    ValueError is raised.  Each child whose tag is listed in TAGS is
    parsed with parse_summary_element and stored on the matching
    attribute.  A KeyError raised while handling a child is logged as an
    unrecognized tag and that child is skipped.
    """
    if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
        raise ValueError('Expected %s' % (
            Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
    self.lane = int(tree.attrib['lane'])
    # default to the first end, for the older summary files
    # that are single ended
    self.end = int(tree.attrib.get('end', 0))
    tags = Summary.LaneResultSummary.TAGS
    for element in list(tree):
        try:
            variable_name = tags[element.tag]
            setattr(self, variable_name,
                    parse_summary_element(element))
        except KeyError:
            LOGGER.warning('Unrecognized tag %s' % (element.tag,))
def __init__(self, filename=None, xml=None):
    """
    Create a summary, optionally loading it straight away.

    filename, when given, is read with _extract_lane_results.
    xml, when given, is loaded with set_elements.
    """
    # lane results is a list of 1 or 2 ends containing
    # a dictionary of all the lanes reported in this
    # summary file
    first_end = {}
    self.lane_results = [first_end]

    if filename is not None:
        self._extract_lane_results(filename)
    if xml is not None:
        self.set_elements(xml)
def __getitem__(self, key):
    """
    Return the entry of lane_results stored under key.
    """
    ends = self.lane_results
    return ends[key]
166 return len(self.lane_results)
def _flattened_row(self, row):
    """
    flatten the children of a <tr>...</tr>

    Returns a list holding the result of flatten for each child of row.
    """
    # iterate the element directly; getchildren() is deprecated and
    # no longer exists in current ElementTree releases
    return [flatten(cell) for cell in row]
def _parse_table(self, table):
    """
    Flatten every row of a table element.

    Returns one list of flattened cells per row, in document order.
    Header rows are included; _extract_lane_results_for_end reads the
    headers from the start of the result.
    """
    # iterate the element directly; getchildren() is deprecated and
    # no longer exists in current ElementTree releases
    return [self._flattened_row(row) for row in table]
def _extract_lane_results(self, pathname):
    """
    Extract just the lane results.
    Currently those are the only ones we care about.

    The lanes found in pathname are stored in self.lane_results by
    _extract_named_tables; nothing is kept from its result here.
    """
    self._extract_named_tables(pathname)
def _extract_named_tables(self, pathname):
    """
    extract all the 'named' tables from a Summary.htm file
    and return as a dictionary

    Named tables are <h2>...</h2><table>...</table> pairs
    The contents of the h2 tag is considered to be the name
    of the table.

    pathname may also be a GERALD Summary.xml file; the root tag decides
    which extraction is used.  Lane results found along the way are
    stored in self.lane_results.  A warning is logged when the first end
    has no lanes after parsing.
    """
    # tree = ElementTree.parse(pathname).getroot()
    # hack for 1.1rc1, this should be removed when possible.
    with open(pathname) as stream:
        file_body = stream.read()
    # a bare '<' is not well formed, escape it so the parser accepts it
    file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=')
    tree = ElementTree.fromstring(file_body)

    # stays empty when the root tag is neither of the known formats
    tables = {}
    # are we reading the xml or the html version of the Summary file?
    root_tag = tree.tag.lower()
    if root_tag == 'summary':
        tables = self._extract_named_tables_from_gerald_xml(tree)
    elif root_tag == 'html':
        tables = self._extract_named_tables_from_html(tree)
        # table title and the index of the end it describes
        table_names = [ ('Lane Results Summary', 0),
                        ('Lane Results Summary : Read 1', 0),
                        ('Lane Results Summary : Read 2', 1),]
        for name, end in table_names:
            if name in tables:
                self._extract_lane_results_for_end(tables, name, end)

    if len(self.lane_results[0]) == 0:
        LOGGER.warning("No Lane Results Summary Found in %s" % (pathname,))
    return tables
def _extract_named_tables_from_gerald_xml(self, tree):
    """
    Extract useful named tables from a gerald created Summary.xml file

    Every lane under the 'Read' child of a LaneResultsSummary element is
    loaded into self.lane_results.  The returned dictionary is left
    empty.
    """
    # using the function to convert to lower instead of just writing it
    # makes the tag easier to read (IMO)
    useful_tables = ['LaneResultsSummary'.lower(),]

    # NOTE(review): nothing is stored in tables in the visible code --
    # confirm that callers only rely on the lane_results side effect.
    tables = {}
    for child in tree:
        if child.tag.lower() in useful_tables:
            read_tree = child.find('Read')
            # the -1 turns readNumber into a 0 based index
            read = int(read_tree.find('readNumber').text)-1
            for element in read_tree:
                if element.tag.lower() == "lane":
                    lrs = Summary.LaneResultSummary()
                    lrs.set_elements_from_gerald_xml(read, element)
                    # append extra dictionaries if needed, as the other
                    # loaders do, so indexing by end cannot fail
                    while len(self.lane_results) < (lrs.end + 1):
                        self.lane_results.append({})
                    self.lane_results[lrs.end][lrs.lane] = lrs
    # probably not useful
    return tables
248 ###### START HTML Table Extraction ########
def _extract_named_tables_from_html(self, tree):
    """
    Collect every <h2> that is immediately followed by a <table>.

    Returns a dictionary mapping the flattened text of the h2 to the
    rows of the table as returned by _parse_table.
    """
    body = tree.find('body')
    children = list(body)
    tables = {}
    # walk neighbouring pairs, so an h2 that is the last child of body
    # has no partner and is skipped instead of indexing past the end
    for heading, follower in zip(children, children[1:]):
        if heading.tag == 'h2' and follower.tag == 'table':
            # we have an interesting table
            name = flatten(heading)
            tables[name] = self._parse_table(follower)
    return tables
def _extract_lane_results_for_end(self, tables, table_name, end):
    """
    extract the Lane Results Summary table

    tables is the dictionary of parsed tables, table_name the key of the
    table to read and end the index in self.lane_results the lanes are
    stored under.
    """
    # parse lane result summary
    lane_summary = tables[table_name]
    column_count = len(lane_summary[-1])
    # this is version 1 of the summary file
    if column_count == 8:
        # lane_summary[0] is the header row,
        # grab the lane by lane data
        lane_summary = lane_summary[1:]

    # len(lane_summary[-1] = 10 is version 2 of the summary file
    #                      = 9  is version 3 of the Summary.htm file
    # NOTE(review): set_elements_from_html only accepts rows of 8 or 10
    # cells, so a 9 column table raises RuntimeError there -- confirm
    # how version 3 rows are meant to be handled.
    elif column_count in (9, 10):
        # lane_summary[0] is a different less specific header row
        # and lane_summary[1] the real header row;
        # after the last lane, there's a set of chip wide averages
        lane_summary = lane_summary[2:10]

    # append an extra dictionary if needed
    if len(self.lane_results) < (end + 1):
        self.lane_results.append({})

    for r in lane_summary:
        lrs = Summary.LaneResultSummary(html=r)
        lrs.end = end
        self.lane_results[lrs.end][lrs.lane] = lrs
290 ###### END HTML Table Extraction ########
def get_elements(self):
    """
    Build a Summary element holding one child for every lane of every end.

    The element carries XML_VERSION in its version attribute.
    """
    attributes = {'version': unicode(Summary.XML_VERSION)}
    summary = ElementTree.Element(Summary.SUMMARY, attributes)
    for lanes_of_end in self.lane_results:
        for lane_summary in lanes_of_end.values():
            summary.append(lane_summary.get_elements())
    return summary
def set_elements(self, tree):
    """
    Load the lane results stored in a Summary element.

    tree must be an element tagged SUMMARY, otherwise ValueError is
    raised.  A warning is logged when the tree declares a version newer
    than XML_VERSION.  Every child is loaded as a LaneResultSummary and
    stored in self.lane_results under its end and lane.
    """
    if tree.tag != Summary.SUMMARY:
        # the exception has to be raised; returning it let a tree of the
        # wrong type go unnoticed
        raise ValueError("Expected %s" % (Summary.SUMMARY,))
    xml_version = int(tree.attrib.get('version', 0))
    if xml_version > Summary.XML_VERSION:
        LOGGER.warning('Summary XML tree is a higher version than this class')
    for element in list(tree):
        lrs = Summary.LaneResultSummary()
        lrs.set_elements(element)
        # append an extra dictionary if needed
        if len(self.lane_results) < (lrs.end + 1):
            self.lane_results.append({})
        self.lane_results[lrs.end][lrs.lane] = lrs
def is_paired_end(self):
    """
    Return True when lane_results holds exactly two ends.
    """
    number_of_ends = len(self.lane_results)
    return number_of_ends == 2
318 Debugging function, report current object
320 tree = self.get_elements()
321 print ElementTree.tostring(tree)
def tonumber(v):
    """
    Convert a value to int if it's an int otherwise a float.

    Raises ValueError when v can be read as neither.
    """
    try:
        return int(v)
    except ValueError:
        # not an integer literal, fall back to float
        return float(v)
333 def parse_mean_range(value):
335 Parse values like 123 +/- 4.5
337 if value.strip() == 'unknown':
340 values = value.split()
342 if values[0] == '+/-':
345 return tonumber(values[0])
347 average, pm, deviation = values
349 raise RuntimeError("Summary.htm file format changed")
350 return tonumber(average), tonumber(deviation)
def make_mean_range_element(parent, name, mean, deviation):
    """
    Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>

    The new element is attached to parent and returned.
    """
    attributes = {
        'mean': unicode(mean),
        'deviation': unicode(deviation),
    }
    return ElementTree.SubElement(parent, name, attributes)
def parse_mean_range_element(element):
    """
    Grab mean/deviation out of element

    Returns a (mean, deviation) tuple of numbers read from the
    attributes of the same names.
    """
    attributes = element.attrib
    mean = tonumber(attributes['mean'])
    deviation = tonumber(attributes['deviation'])
    return (mean, deviation)
def parse_summary_element(element):
    """
    Determine if we have a simple element or a mean/deviation element

    An element carrying attributes is read with parse_mean_range_element.
    """
    if not element.attrib:
        # NOTE(review): the fallback for a simple element is not visible
        # in this view; returning its text is assumed -- confirm.
        return element.text
    return parse_mean_range_element(element)
def parse_xml_mean_range(element):
    """
    Extract mean/stddev children from an element as a tuple

    Returns None when element is None; set_elements_from_gerald_xml
    passes the result of a failed find straight through.
    A child without text is reported as nan.

    Raises RuntimeError when the mean or stdev child is missing.
    """
    if element is None:
        return None

    mean = element.find('mean')
    stddev = element.find('stdev')
    if mean is None or stddev is None:
        raise RuntimeError("Summary.xml file format changed, expected mean/stddev tags")

    def number_or_nan(node):
        # an empty tag has no text to convert
        if node.text is None:
            return float('nan')
        return tonumber(node.text)

    return (number_or_nan(mean), number_or_nan(stddev))
401 if __name__ == "__main__":
403 from optparse import OptionParser
404 parser = OptionParser('%prog [Summary.xml/Summary.htm]+')
405 opts, args = parser.parse_args()
407 parser.error('need at least one xml/html file')