2 Analyze the Summary.htm file produced by GERALD
4 from __future__ import print_function
10 from pprint import pprint
13 from lxml import etree
14 from htsworkflow.util.ethelp import indent, flatten
16 LOGGER = logging.getLogger(__name__)
19 class Summary(object):
21 Extract some useful information from the Summary.htm file
26 def __init__(self, filename=None, xml=None):
27 # lane results is a list of 1 or 2 ends containing
28 # a dictionary of all the lanes reported in this
30 self.lane_results = [{}]
32 if filename is not None:
33 self._extract_lane_results(filename)
35 self.set_elements(xml)
37 def __getitem__(self, key):
38 return self.lane_results[key]
41 return len(self.lane_results)
43 def get_elements(self):
44 summary = etree.Element(Summary.SUMMARY,
45 {'version': unicode(Summary.XML_VERSION)})
46 for end in self.lane_results:
47 for lane in end.values():
48 summary.append(lane.get_elements())
51 def set_elements(self, tree):
52 if tree.tag != Summary.SUMMARY:
53 return ValueError("Expected %s" % (Summary.SUMMARY,))
54 xml_version = int(tree.attrib.get('version', 0))
55 if xml_version > Summary.XML_VERSION:
56 LOGGER.warn('Summary XML tree is a higher version than this class')
57 for element in list(tree):
58 lrs = LaneResultSummaryGA()
59 lrs.set_elements(element)
60 if len(self.lane_results) < (lrs.end + 1):
61 self.lane_results.append({})
62 self.lane_results[lrs.end][lrs.lane] = lrs
64 def is_paired_end(self):
65 return len(self.lane_results) == 2
69 Debugging function, report current object
71 tree = self.get_elements()
72 print(etree.tostring(tree))
74 class SummaryGA(Summary):
75 def __init__(self, filename=None, xml=None):
76 super(SummaryGA, self).__init__(filename, xml)
78 def _flattened_row(self, row):
80 flatten the children of a <tr>...</tr>
82 return [flatten(x) for x in row.getchildren() ]
84 def _parse_table(self, table):
86 assumes the first line is the header of a table,
87 and that the remaining rows are data
89 rows = table.getchildren()
92 data.append(self._flattened_row(r))
95 def _extract_lane_results(self, pathname):
97 Extract just the lane results.
98 Currently those are the only ones we care about.
101 tables = self._extract_named_tables(pathname)
104 def _extract_named_tables(self, pathname):
106 extract all the 'named' tables from a Summary.htm file
107 and return as a dictionary
109 Named tables are <h2>...</h2><table>...</table> pairs
110 The contents of the h2 tag is considered to the name
113 if isxml_file(pathname):
114 tree = etree.parse(pathname).getroot()
117 tree = html.parse(pathname).getroot()
118 # tree = etree.parse(pathname).getroot()
119 # hack for 1.1rc1, this should be removed when possible.
120 #file_body = open(pathname).read()
121 #file_body = file_body.replace('CHASTITY<=', 'CHASTITY<=')
122 #tree = etree.fromstring(file_body)
124 # are we reading the xml or the html version of the Summary file?
125 if tree.tag.lower() == 'summary':
127 tables = self._extract_named_tables_from_gerald_xml(tree)
128 elif tree.tag.lower() == 'html':
130 tables = self._extract_named_tables_from_html(tree)
131 table_names = [ ('Lane Results Summary', 0),
132 ('Lane Results Summary : Read 1', 0),
133 ('Lane Results Summary : Read 2', 1),]
134 for name, end in table_names:
136 self._extract_lane_results_for_end(tables, name, end)
138 if len(self.lane_results[0]) == 0:
139 LOGGER.warning("No Lane Results Summary Found in %s" % (pathname,))
141 def _extract_named_tables_from_gerald_xml(self, tree):
143 Extract useful named tables from a gerald created Summary.xml file
145 # using the function to convert to lower instead of just writing it
146 # makes the tag easier to read (IMO)
147 useful_tables = ['LaneResultsSummary'.lower(),]
150 for child in tree.getchildren():
151 if child.tag.lower() in useful_tables:
152 read_tree = child.find('Read')
154 read = int(read_tree.find('readNumber').text)-1
155 for element in read_tree.getchildren():
156 if element.tag.lower() == "lane":
157 lrs = LaneResultSummaryGA()
158 lrs.set_elements_from_gerald_xml(read, element)
159 self.lane_results[lrs.end][lrs.lane] = lrs
160 # probably not useful
163 ###### START HTML Table Extraction ########
164 def _extract_named_tables_from_html(self, tree):
166 for t in tree.findall('*//table'):
167 previous = t.getprevious()
169 previous = t.getparent()
171 if previous.tag.lower() == 'div':
172 previous = previous.getprevious()
174 if previous.tag in ('h2', 'p'):
176 name = flatten(previous)
177 data = self._parse_table(t)
181 def _extract_lane_results_for_end(self, tables, table_name, end):
183 extract the Lane Results Summary table
185 # parse lane result summary
186 lane_summary = tables[table_name]
187 # this is version 1 of the summary file
188 if len(lane_summary[-1]) == 8:
190 headers = lane_summary[0]
191 # grab the lane by lane data
192 lane_summary = lane_summary[1:]
194 # len(lane_summary[-1] = 10 is version 2 of the summary file
195 # = 9 is version 3 of the Summary.htm file
196 elif len(lane_summary[-1]) in (9, 10):
197 # lane_summary[0] is a different less specific header row
198 headers = lane_summary[1]
199 lane_summary = lane_summary[2:10]
200 # after the last lane, there's a set of chip wide averages
202 # append an extra dictionary if needed
203 if len(self.lane_results) < (end + 1):
204 self.lane_results.append({})
206 for r in lane_summary:
207 lrs = LaneResultSummaryGA(html=r)
209 self.lane_results[lrs.end][lrs.lane] = lrs
210 ###### END HTML Table Extraction ########
213 class SummaryHiSeq(Summary):
214 def __init__(self, filename=None, xml=None):
215 super(SummaryHiSeq, self).__init__(filename, xml)
217 def _extract_lane_results(self, filename):
218 read1 = os.path.join(filename, 'read1.xml')
219 read2 = os.path.join(filename, 'read2.xml')
221 if os.path.exists(read1):
222 self._extract_lane_results_for_end(read1, 0)
224 LOGGER.warn("No read1.xml at %s." % (read1,))
225 if os.path.exists(read2):
226 self.lane_results.append({})
227 self._extract_lane_results_for_end(read2, 1)
229 def _extract_lane_results_for_end(self, filename, end):
230 self.tree = etree.parse(filename)
231 root = self.tree.getroot()
232 for lane in root.getchildren():
233 lrs = LaneResultSummaryHiSeq(data=lane)
235 self.lane_results[lrs.end][lrs.lane] = lrs
238 class LaneResultSummary(object):
240 Parse the LaneResultSummary table out of Summary.htm
241 Mostly for the cluster number
243 LANE_RESULT_SUMMARY = 'LaneResultSummary'
245 'LaneYield': 'lane_yield',
246 'Cluster': 'cluster', # Raw
247 'ClusterPF': 'cluster_pass_filter',
248 'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
249 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
250 'PercentPassFilterClusters': 'percent_pass_filter_clusters',
251 'PercentPassFilterAlign': 'percent_pass_filter_align',
252 'AverageAlignmentScore': 'average_alignment_score',
253 'PercentErrorRate': 'percent_error_rate'
255 # These are tags that have mean/stdev as found in the GERALD Summary.xml file
257 #'laneYield': 'lane_yield', #this is just a number
258 'clusterCountRaw': 'cluster', # Raw
259 'clusterCountPF': 'cluster_pass_filter',
260 'oneSig': 'average_first_cycle_intensity',
261 'signal20AsPctOf1': 'percent_intensity_after_20_cycles',
262 'percentClustersPF': 'percent_pass_filter_clusters',
263 'percentUniquelyAlignedPF': 'percent_pass_filter_align',
264 'averageAlignScorePF': 'average_alignment_score',
265 'errorPF': 'percent_error_rate'
268 def __init__(self, html=None, xml=None):
271 self.lane_yield = None
273 self.cluster_pass_filter = None
274 self.average_first_cycle_intensity = None
275 self.percent_intensity_after_20_cycles = None
276 self.percent_pass_filter_clusters = None
277 self.percent_pass_filter_align = None
278 self.average_alignment_score = None
279 self.percent_error_rate = None
282 def get_elements(self):
283 lane_result = etree.Element(
284 LaneResultSummary.LANE_RESULT_SUMMARY,
285 {'lane': unicode(self.lane), 'end': unicode(self.end)})
286 for tag, variable_name in LaneResultSummary.TAGS.items():
287 value = getattr(self, variable_name)
290 # it looks like a sequence
291 elif type(value) in (types.TupleType, types.ListType):
292 element = make_mean_range_element(
298 element = etree.SubElement(lane_result, tag)
299 element.text = unicode(value)
302 def set_elements(self, tree):
303 if tree.tag != LaneResultSummary.LANE_RESULT_SUMMARY:
304 raise ValueError('Expected %s' % (
305 LaneResultSummary.LANE_RESULT_SUMMARY))
306 self.lane = int(tree.attrib['lane'])
307 # default to the first end, for the older summary files
308 # that are single ended
309 self.end = int(tree.attrib.get('end', 0))
310 tags = LaneResultSummary.TAGS
311 for element in list(tree):
313 variable_name = tags[element.tag]
314 setattr(self, variable_name,
315 parse_summary_element(element))
316 except KeyError as e:
317 LOGGER.warn('Unrecognized tag %s' % (element.tag,))
320 class LaneResultSummaryGA(LaneResultSummary):
321 def __init__(self, html=None, xml=None):
322 super(LaneResultSummaryGA, self).__init__(html, xml)
325 self.set_elements_from_source(html)
327 self.set_elements(xml)
329 def set_elements_from_gerald_xml(self, read, element):
330 self.lane = int(element.find('laneNumber').text)
332 lane_yield_node = element.find('laneYield')
333 if lane_yield_node is not None:
334 self.lane_yield = int(lane_yield_node.text)
336 self.lane_yield = None
338 for GeraldName, LRSName in LaneResultSummary.GERALD_TAGS.items():
339 node = element.find(GeraldName)
341 LOGGER.info("Couldn't find %s" % (GeraldName))
342 setattr(self, LRSName, parse_xml_mean_range(node))
344 def set_elements_from_source(self, data):
345 """Read from an initial summary data file. Either xml or html
347 if not len(data) in (8,10):
348 raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))
350 # same in pre-0.3.0 Summary file and 0.3 summary file
351 self.lane = int(data[0])
354 parsed_data = [ parse_mean_range(x) for x in data[1:] ]
355 # this is the < 0.3 Pipeline version
356 self.cluster = parsed_data[0]
357 self.average_first_cycle_intensity = parsed_data[1]
358 self.percent_intensity_after_20_cycles = parsed_data[2]
359 self.percent_pass_filter_clusters = parsed_data[3]
360 self.percent_pass_filter_align = parsed_data[4]
361 self.average_alignment_score = parsed_data[5]
362 self.percent_error_rate = parsed_data[6]
363 elif len(data) == 10:
364 parsed_data = [ parse_mean_range(x) for x in data[2:] ]
365 # this is the >= 0.3 summary file
366 self.lane_yield = data[1]
367 self.cluster = parsed_data[0]
368 self.cluster_pass_filter = parsed_data[1]
369 self.average_first_cycle_intensity = parsed_data[2]
370 self.percent_intensity_after_20_cycles = parsed_data[3]
371 self.percent_pass_filter_clusters = parsed_data[4]
372 self.percent_pass_filter_align = parsed_data[5]
373 self.average_alignment_score = parsed_data[6]
374 self.percent_error_rate = parsed_data[7]
377 class LaneResultSummaryHiSeq(LaneResultSummary):
378 def __init__(self, data=None, xml=None):
379 super(LaneResultSummaryHiSeq, self).__init__(data, xml)
382 self.set_elements_from_source(data)
384 self.set_elements(xml)
386 def set_elements_from_source(self, element):
387 """Read from an initial summary data file. Either xml or html
389 # same in pre-0.3.0 Summary file and 0.3 summary file
390 lane = element.attrib
391 self.lane = int(lane['key'])
392 #self.lane_yield = data[1]
393 self.cluster = (int(lane['ClustersRaw']),
394 float(lane['ClustersRawSD']))
395 self.cluster_pass_filter = (int(lane['ClustersPF']),
396 float(lane['ClustersPFSD']))
397 self.average_first_cycle_intensity = (int(lane['FirstCycleIntPF']),
398 float(lane['FirstCycleIntPFSD']))
399 self.percent_intensity_after_20_cycles = (
400 float(lane['PrcIntensityAfter20CyclesPF']),
401 float(lane['PrcIntensityAfter20CyclesPFSD']))
402 self.percent_pass_filter_clusters = (
403 float(lane['PrcPFClusters']),
404 float(lane['PrcPFClustersSD']))
405 self.percent_pass_filter_align = (
406 float(lane['PrcAlign']),
407 float(lane['PrcAlignSD']))
408 self.percent_error_rate = (
409 float(lane['ErrRatePhiX']),
410 float(lane['ErrRatePhiXSD']),)
415 Convert a value to int if its an int otherwise a float.
419 except ValueError as e:
423 def parse_mean_range(value):
425 Parse values like 123 +/- 4.5
427 if value.strip() == 'unknown':
430 values = value.split()
432 if values[0] == '+/-':
435 return tonumber(values[0])
437 average, pm, deviation = values
439 raise RuntimeError("Summary.htm file format changed")
440 return tonumber(average), tonumber(deviation)
442 def make_mean_range_element(parent, name, mean, deviation):
444 Make an etree subelement <Name mean='mean', deviation='deviation'/>
446 element = etree.SubElement(parent, name,
447 { 'mean': unicode(mean),
448 'deviation': unicode(deviation)})
451 def parse_mean_range_element(element):
453 Grab mean/deviation out of element
455 return (tonumber(element.attrib['mean']),
456 tonumber(element.attrib['deviation']))
458 def parse_summary_element(element):
460 Determine if we have a simple element or a mean/deviation element
462 if len(element.attrib) > 0:
463 return parse_mean_range_element(element)
467 def parse_xml_mean_range(element):
469 Extract mean/stddev children from an element as a tuple
474 mean = element.find('mean')
475 stddev = element.find('stdev')
476 if mean is None or stddev is None:
477 raise RuntimeError("Summary.xml file format changed, expected mean/stddev tags")
478 if mean.text is None:
479 mean_value = float('nan')
481 mean_value = tonumber(mean.text)
483 if stddev.text is None:
484 stddev_value = float('nan')
486 stddev_value = tonumber(stddev.text)
489 return (mean_value, stddev_value)
491 def isxml_file(fname):
492 with open(fname,'r') as stream:
493 return isxml_stream(stream)
495 def isxml_stream(stream):
496 """Return true or false if its sort of an xml file
499 line = stream.readline()
501 if re.match("<\?xml.*?>", line):
507 if __name__ == "__main__":
509 from optparse import OptionParser
510 parser = OptionParser('%prog [Summary.xml/Summary.htm]+')
511 opts, args = parser.parse_args()
513 parser.error('need at least one xml/html file')