2 Analyze the Summary.htm file produced by GERALD
6 from pprint import pprint
8 from htsworkflow.pipelines.runfolder import ElementTree
9 from htsworkflow.util.ethelp import indent, flatten
13 class Summary(object):
15 Extract some useful information from the Summary.htm file
20 class LaneResultSummary(object):
22 Parse the LaneResultSummary table out of Summary.htm
23 Mostly for the cluster number
25 LANE_RESULT_SUMMARY = 'LaneResultSummary'
27 'LaneYield': 'lane_yield',
28 'Cluster': 'cluster', # Raw
29 'ClusterPF': 'cluster_pass_filter',
30 'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
31 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
32 'PercentPassFilterClusters': 'percent_pass_filter_clusters',
33 'PercentPassFilterAlign': 'percent_pass_filter_align',
34 'AverageAlignmentScore': 'average_alignment_score',
35 'PercentErrorRate': 'percent_error_rate'
37 # These are tags that have mean/stdev as found in the GERALD Summary.xml file
39 #'laneYield': 'lane_yield', #this is just a number
40 'clusterCountRaw': 'cluster', # Raw
41 'clusterCountPF': 'cluster_pass_filter',
42 'oneSig': 'average_first_cycle_intensity',
43 'signal20AsPctOf1': 'percent_intensity_after_20_cycles',
44 'percentClustersPF': 'percent_pass_filter_clusters',
45 'percentUniquelyAlignedPF': 'percent_pass_filter_align',
46 'averageAlignScorePF': 'average_alignment_score',
47 'errorPF': 'percent_error_rate'
50 def __init__(self, html=None, xml=None):
53 self.lane_yield = None
55 self.cluster_pass_filter = None
56 self.average_first_cycle_intensity = None
57 self.percent_intensity_after_20_cycles = None
58 self.percent_pass_filter_clusters = None
59 self.percent_pass_filter_align = None
60 self.average_alignment_score = None
61 self.percent_error_rate = None
64 self.set_elements_from_html(html)
66 self.set_elements(xml)
68 def set_elements_from_html(self, data):
69 if not len(data) in (8,10):
70 raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))
72 # same in pre-0.3.0 Summary file and 0.3 summary file
73 self.lane = int(data[0])
76 parsed_data = [ parse_mean_range(x) for x in data[1:] ]
77 # this is the < 0.3 Pipeline version
78 self.cluster = parsed_data[0]
79 self.average_first_cycle_intensity = parsed_data[1]
80 self.percent_intensity_after_20_cycles = parsed_data[2]
81 self.percent_pass_filter_clusters = parsed_data[3]
82 self.percent_pass_filter_align = parsed_data[4]
83 self.average_alignment_score = parsed_data[5]
84 self.percent_error_rate = parsed_data[6]
86 parsed_data = [ parse_mean_range(x) for x in data[2:] ]
87 # this is the >= 0.3 summary file
88 self.lane_yield = data[1]
89 self.cluster = parsed_data[0]
90 self.cluster_pass_filter = parsed_data[1]
91 self.average_first_cycle_intensity = parsed_data[2]
92 self.percent_intensity_after_20_cycles = parsed_data[3]
93 self.percent_pass_filter_clusters = parsed_data[4]
94 self.percent_pass_filter_align = parsed_data[5]
95 self.average_alignment_score = parsed_data[6]
96 self.percent_error_rate = parsed_data[7]
98 def set_elements_from_gerald_xml(self, read, element):
99 self.lane = int(element.find('laneNumber').text)
101 self.lane_yield = int(element.find('laneYield').text)
103 for GeraldName, LRSName in Summary.LaneResultSummary.GERALD_TAGS.items():
104 node = element.find(GeraldName)
106 logging.info("Couldn't find %s" % (GeraldName))
107 setattr(self, LRSName, parse_xml_mean_range(node))
109 def get_elements(self):
110 lane_result = ElementTree.Element(
111 Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
112 {'lane': unicode(self.lane), 'end': unicode(self.end)})
113 for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
114 value = getattr(self, variable_name)
117 # it looks like a sequence
118 elif type(value) in (types.TupleType, types.ListType):
119 element = make_mean_range_element(
125 element = ElementTree.SubElement(lane_result, tag)
126 element.text = unicode(value)
129 def set_elements(self, tree):
130 if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
131 raise ValueError('Expected %s' % (
132 Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
133 self.lane = int(tree.attrib['lane'])
134 # default to the first end, for the older summary files
135 # that are single ended
136 self.end = int(tree.attrib.get('end', 0))
137 tags = Summary.LaneResultSummary.TAGS
138 for element in list(tree):
140 variable_name = tags[element.tag]
141 setattr(self, variable_name,
142 parse_summary_element(element))
144 logging.warn('Unrecognized tag %s' % (element.tag,))
146 def __init__(self, filename=None, xml=None):
147 # lane results is a list of 1 or 2 ends containing
148 # a dictionary of all the lanes reported in this
150 self.lane_results = [{}]
152 if filename is not None:
153 self._extract_lane_results(filename)
155 self.set_elements(xml)
157 def __getitem__(self, key):
158 return self.lane_results[key]
161 return len(self.lane_results)
163 def _flattened_row(self, row):
165 flatten the children of a <tr>...</tr>
167 return [flatten(x) for x in row.getchildren() ]
169 def _parse_table(self, table):
171 assumes the first line is the header of a table,
172 and that the remaining rows are data
174 rows = table.getchildren()
177 data.append(self._flattened_row(r))
180 def _extract_lane_results(self, pathname):
182 Extract just the lane results.
183 Currently those are the only ones we care about.
186 tables = self._extract_named_tables(pathname)
189 def _extract_named_tables(self, pathname):
191 extract all the 'named' tables from a Summary.htm file
192 and return as a dictionary
194 Named tables are <h2>...</h2><table>...</table> pairs
195 The contents of the h2 tag is considered to the name
198 # tree = ElementTree.parse(pathname).getroot()
199 # hack for 1.1rc1, this should be removed when possible.
200 file_body = open(pathname).read()
201 file_body = file_body.replace('CHASTITY<=', 'CHASTITY<=')
202 tree = ElementTree.fromstring(file_body)
204 # are we reading the xml or the html version of the Summary file?
205 if tree.tag.lower() == 'summary':
207 tables = self._extract_named_tables_from_gerald_xml(tree)
208 elif tree.tag.lower() == 'html':
210 tables = self._extract_named_tables_from_html(tree)
211 table_names = [ ('Lane Results Summary', 0),
212 ('Lane Results Summary : Read 1', 0),
213 ('Lane Results Summary : Read 2', 1),]
214 for name, end in table_names:
215 if tables.has_key(name):
216 self._extract_lane_results_for_end(tables, name, end)
218 if len(self.lane_results[0]) == 0:
219 logging.warning("No Lane Results Summary Found in %s" % (pathname,))
221 def _extract_named_tables_from_gerald_xml(self, tree):
223 Extract useful named tables from a gerald created Summary.xml file
225 # using the function to convert to lower instead of just writing it
226 # makes the tag easier to read (IMO)
227 useful_tables = ['LaneResultsSummary'.lower(),]
230 for child in tree.getchildren():
231 if child.tag.lower() in useful_tables:
232 read_tree = child.find('Read')
234 read = int(read_tree.find('readNumber').text)-1
235 for element in read_tree.getchildren():
236 if element.tag.lower() == "lane":
237 lrs = Summary.LaneResultSummary()
238 lrs.set_elements_from_gerald_xml(read, element)
239 self.lane_results[lrs.end][lrs.lane] = lrs
240 # probably not useful
243 ###### START HTML Table Extraction ########
244 def _extract_named_tables_from_html(self, tree):
245 body = tree.find('body')
247 for i in range(len(body)):
248 if body[i].tag == 'h2' and body[i+1].tag == 'table':
249 # we have an interesting table
250 name = flatten(body[i])
252 data = self._parse_table(table)
256 def _extract_lane_results_for_end(self, tables, table_name, end):
258 extract the Lane Results Summary table
260 # parse lane result summary
261 lane_summary = tables[table_name]
262 # this is version 1 of the summary file
263 if len(lane_summary[-1]) == 8:
265 headers = lane_summary[0]
266 # grab the lane by lane data
267 lane_summary = lane_summary[1:]
269 # len(lane_summary[-1] = 10 is version 2 of the summary file
270 # = 9 is version 3 of the Summary.htm file
271 elif len(lane_summary[-1]) in (9, 10):
272 # lane_summary[0] is a different less specific header row
273 headers = lane_summary[1]
274 lane_summary = lane_summary[2:10]
275 # after the last lane, there's a set of chip wide averages
277 # append an extra dictionary if needed
278 if len(self.lane_results) < (end + 1):
279 self.lane_results.append({})
281 for r in lane_summary:
282 lrs = Summary.LaneResultSummary(html=r)
284 self.lane_results[lrs.end][lrs.lane] = lrs
285 ###### END HTML Table Extraction ########
287 def get_elements(self):
288 summary = ElementTree.Element(Summary.SUMMARY,
289 {'version': unicode(Summary.XML_VERSION)})
290 for end in self.lane_results:
291 for lane in end.values():
292 summary.append(lane.get_elements())
295 def set_elements(self, tree):
296 if tree.tag != Summary.SUMMARY:
297 return ValueError("Expected %s" % (Summary.SUMMARY,))
298 xml_version = int(tree.attrib.get('version', 0))
299 if xml_version > Summary.XML_VERSION:
300 logging.warn('Summary XML tree is a higher version than this class')
301 for element in list(tree):
302 lrs = Summary.LaneResultSummary()
303 lrs.set_elements(element)
304 if len(self.lane_results) < (lrs.end + 1):
305 self.lane_results.append({})
306 self.lane_results[lrs.end][lrs.lane] = lrs
308 def is_paired_end(self):
309 return len(self.lane_results) == 2
313 Debugging function, report current object
315 tree = self.get_elements()
316 print ElementTree.tostring(tree)
320 Convert a value to int if its an int otherwise a float.
324 except ValueError, e:
328 def parse_mean_range(value):
330 Parse values like 123 +/- 4.5
332 if value.strip() == 'unknown':
335 values = value.split()
337 if values[0] == '+/-':
340 return tonumber(values[0])
342 average, pm, deviation = values
344 raise RuntimeError("Summary.htm file format changed")
345 return tonumber(average), tonumber(deviation)
347 def make_mean_range_element(parent, name, mean, deviation):
349 Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
351 element = ElementTree.SubElement(parent, name,
352 { 'mean': unicode(mean),
353 'deviation': unicode(deviation)})
356 def parse_mean_range_element(element):
358 Grab mean/deviation out of element
360 return (tonumber(element.attrib['mean']),
361 tonumber(element.attrib['deviation']))
363 def parse_summary_element(element):
365 Determine if we have a simple element or a mean/deviation element
367 if len(element.attrib) > 0:
368 return parse_mean_range_element(element)
372 def parse_xml_mean_range(element):
374 Extract mean/stddev children from an element as a tuple
379 mean = element.find('mean')
380 stddev = element.find('stdev')
381 if mean is None or stddev is None:
382 raise RuntimeError("Summary.xml file format changed, expected mean/stddev tags")
384 return (tonumber(mean.text), tonumber(stddev.text))
386 if __name__ == "__main__":
388 from optparse import OptionParser
389 parser = OptionParser('%prog [Summary.xml/Summary.htm]+')
390 opts, args = parser.parse_args()
392 parser.error('need at least one xml/html file')