2 Analyze the Summary.htm file produced by GERALD
8 from pprint import pprint
10 #from htsworkflow.pipelines.runfolder import ElementTree
12 from lxml import etree
13 from htsworkflow.util.ethelp import indent, flatten
15 LOGGER = logging.getLogger(__name__)
18 class Summary(object):
20 Extract some useful information from the Summary.htm file
25 def __init__(self, filename=None, xml=None):
26 # lane results is a list of 1 or 2 ends containing
27 # a dictionary of all the lanes reported in this
29 self.lane_results = [{}]
31 if filename is not None:
32 self._extract_lane_results(filename)
34 self.set_elements(xml)
36 def __getitem__(self, key):
37 return self.lane_results[key]
40 return len(self.lane_results)
42 def get_elements(self):
43 summary = etree.Element(Summary.SUMMARY,
44 {'version': unicode(Summary.XML_VERSION)})
45 for end in self.lane_results:
46 for lane in end.values():
47 summary.append(lane.get_elements())
50 def set_elements(self, tree):
51 if tree.tag != Summary.SUMMARY:
52 return ValueError("Expected %s" % (Summary.SUMMARY,))
53 xml_version = int(tree.attrib.get('version', 0))
54 if xml_version > Summary.XML_VERSION:
55 LOGGER.warn('Summary XML tree is a higher version than this class')
56 for element in list(tree):
57 lrs = LaneResultSummaryGA()
58 lrs.set_elements(element)
59 if len(self.lane_results) < (lrs.end + 1):
60 self.lane_results.append({})
61 self.lane_results[lrs.end][lrs.lane] = lrs
63 def is_paired_end(self):
64 return len(self.lane_results) == 2
68 Debugging function, report current object
70 tree = self.get_elements()
71 print etree.tostring(tree)
73 class SummaryGA(Summary):
74 def __init__(self, filename=None, xml=None):
75 super(SummaryGA, self).__init__(filename, xml)
77 def _flattened_row(self, row):
79 flatten the children of a <tr>...</tr>
81 return [flatten(x) for x in row.getchildren() ]
83 def _parse_table(self, table):
85 assumes the first line is the header of a table,
86 and that the remaining rows are data
88 rows = table.getchildren()
91 data.append(self._flattened_row(r))
94 def _extract_lane_results(self, pathname):
96 Extract just the lane results.
97 Currently those are the only ones we care about.
100 tables = self._extract_named_tables(pathname)
103 def _extract_named_tables(self, pathname):
105 extract all the 'named' tables from a Summary.htm file
106 and return as a dictionary
108 Named tables are <h2>...</h2><table>...</table> pairs
109 The contents of the h2 tag is considered to the name
112 if isxml_file(pathname):
113 tree = etree.parse(pathname).getroot()
116 tree = html.parse(pathname).getroot()
117 # tree = etree.parse(pathname).getroot()
118 # hack for 1.1rc1, this should be removed when possible.
119 #file_body = open(pathname).read()
120 #file_body = file_body.replace('CHASTITY<=', 'CHASTITY<=')
121 #tree = etree.fromstring(file_body)
123 # are we reading the xml or the html version of the Summary file?
124 if tree.tag.lower() == 'summary':
126 tables = self._extract_named_tables_from_gerald_xml(tree)
127 elif tree.tag.lower() == 'html':
129 tables = self._extract_named_tables_from_html(tree)
130 table_names = [ ('Lane Results Summary', 0),
131 ('Lane Results Summary : Read 1', 0),
132 ('Lane Results Summary : Read 2', 1),]
133 for name, end in table_names:
134 if tables.has_key(name):
135 self._extract_lane_results_for_end(tables, name, end)
137 if len(self.lane_results[0]) == 0:
138 LOGGER.warning("No Lane Results Summary Found in %s" % (pathname,))
140 def _extract_named_tables_from_gerald_xml(self, tree):
142 Extract useful named tables from a gerald created Summary.xml file
144 # using the function to convert to lower instead of just writing it
145 # makes the tag easier to read (IMO)
146 useful_tables = ['LaneResultsSummary'.lower(),]
149 for child in tree.getchildren():
150 if child.tag.lower() in useful_tables:
151 read_tree = child.find('Read')
153 read = int(read_tree.find('readNumber').text)-1
154 for element in read_tree.getchildren():
155 if element.tag.lower() == "lane":
156 lrs = LaneResultSummaryGA()
157 lrs.set_elements_from_gerald_xml(read, element)
158 self.lane_results[lrs.end][lrs.lane] = lrs
159 # probably not useful
162 ###### START HTML Table Extraction ########
163 def _extract_named_tables_from_html(self, tree):
165 for t in tree.findall('*//table'):
166 previous = t.getprevious()
168 previous = t.getparent()
170 if previous.tag.lower() == 'div':
171 previous = previous.getprevious()
173 if previous.tag in ('h2', 'p'):
175 name = flatten(previous)
176 data = self._parse_table(t)
180 def _extract_lane_results_for_end(self, tables, table_name, end):
182 extract the Lane Results Summary table
184 # parse lane result summary
185 lane_summary = tables[table_name]
186 # this is version 1 of the summary file
187 if len(lane_summary[-1]) == 8:
189 headers = lane_summary[0]
190 # grab the lane by lane data
191 lane_summary = lane_summary[1:]
193 # len(lane_summary[-1] = 10 is version 2 of the summary file
194 # = 9 is version 3 of the Summary.htm file
195 elif len(lane_summary[-1]) in (9, 10):
196 # lane_summary[0] is a different less specific header row
197 headers = lane_summary[1]
198 lane_summary = lane_summary[2:10]
199 # after the last lane, there's a set of chip wide averages
201 # append an extra dictionary if needed
202 if len(self.lane_results) < (end + 1):
203 self.lane_results.append({})
205 for r in lane_summary:
206 lrs = LaneResultSummaryGA(html=r)
208 self.lane_results[lrs.end][lrs.lane] = lrs
209 ###### END HTML Table Extraction ########
212 class SummaryHiSeq(Summary):
213 def __init__(self, filename=None, xml=None):
214 super(SummaryHiSeq, self).__init__(filename, xml)
216 def _extract_lane_results(self, filename):
217 read1 = os.path.join(filename, 'read1.xml')
218 read2 = os.path.join(filename, 'read2.xml')
220 if os.path.exists(read1):
221 self._extract_lane_results_for_end(read1, 0)
223 LOGGER.warn("No read1.xml at %s." % (read1,))
224 if os.path.exists(read2):
225 self.lane_results.append({})
226 self._extract_lane_results_for_end(read2, 1)
228 def _extract_lane_results_for_end(self, filename, end):
229 self.tree = etree.parse(filename)
230 root = self.tree.getroot()
231 for lane in root.getchildren():
232 lrs = LaneResultSummaryHiSeq(data=lane)
234 self.lane_results[lrs.end][lrs.lane] = lrs
237 class LaneResultSummary(object):
239 Parse the LaneResultSummary table out of Summary.htm
240 Mostly for the cluster number
242 LANE_RESULT_SUMMARY = 'LaneResultSummary'
244 'LaneYield': 'lane_yield',
245 'Cluster': 'cluster', # Raw
246 'ClusterPF': 'cluster_pass_filter',
247 'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
248 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
249 'PercentPassFilterClusters': 'percent_pass_filter_clusters',
250 'PercentPassFilterAlign': 'percent_pass_filter_align',
251 'AverageAlignmentScore': 'average_alignment_score',
252 'PercentErrorRate': 'percent_error_rate'
254 # These are tags that have mean/stdev as found in the GERALD Summary.xml file
256 #'laneYield': 'lane_yield', #this is just a number
257 'clusterCountRaw': 'cluster', # Raw
258 'clusterCountPF': 'cluster_pass_filter',
259 'oneSig': 'average_first_cycle_intensity',
260 'signal20AsPctOf1': 'percent_intensity_after_20_cycles',
261 'percentClustersPF': 'percent_pass_filter_clusters',
262 'percentUniquelyAlignedPF': 'percent_pass_filter_align',
263 'averageAlignScorePF': 'average_alignment_score',
264 'errorPF': 'percent_error_rate'
267 def __init__(self, html=None, xml=None):
270 self.lane_yield = None
272 self.cluster_pass_filter = None
273 self.average_first_cycle_intensity = None
274 self.percent_intensity_after_20_cycles = None
275 self.percent_pass_filter_clusters = None
276 self.percent_pass_filter_align = None
277 self.average_alignment_score = None
278 self.percent_error_rate = None
281 def get_elements(self):
282 lane_result = etree.Element(
283 LaneResultSummary.LANE_RESULT_SUMMARY,
284 {'lane': unicode(self.lane), 'end': unicode(self.end)})
285 for tag, variable_name in LaneResultSummary.TAGS.items():
286 value = getattr(self, variable_name)
289 # it looks like a sequence
290 elif type(value) in (types.TupleType, types.ListType):
291 element = make_mean_range_element(
297 element = etree.SubElement(lane_result, tag)
298 element.text = unicode(value)
301 def set_elements(self, tree):
302 if tree.tag != LaneResultSummary.LANE_RESULT_SUMMARY:
303 raise ValueError('Expected %s' % (
304 LaneResultSummary.LANE_RESULT_SUMMARY))
305 self.lane = int(tree.attrib['lane'])
306 # default to the first end, for the older summary files
307 # that are single ended
308 self.end = int(tree.attrib.get('end', 0))
309 tags = LaneResultSummary.TAGS
310 for element in list(tree):
312 variable_name = tags[element.tag]
313 setattr(self, variable_name,
314 parse_summary_element(element))
316 LOGGER.warn('Unrecognized tag %s' % (element.tag,))
319 class LaneResultSummaryGA(LaneResultSummary):
320 def __init__(self, html=None, xml=None):
321 super(LaneResultSummaryGA, self).__init__(html, xml)
324 self.set_elements_from_source(html)
326 self.set_elements(xml)
328 def set_elements_from_gerald_xml(self, read, element):
329 self.lane = int(element.find('laneNumber').text)
331 lane_yield_node = element.find('laneYield')
332 if lane_yield_node is not None:
333 self.lane_yield = int(lane_yield_node.text)
335 self.lane_yield = None
337 for GeraldName, LRSName in LaneResultSummary.GERALD_TAGS.items():
338 node = element.find(GeraldName)
340 LOGGER.info("Couldn't find %s" % (GeraldName))
341 setattr(self, LRSName, parse_xml_mean_range(node))
343 def set_elements_from_source(self, data):
344 """Read from an initial summary data file. Either xml or html
346 if not len(data) in (8,10):
347 raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))
349 # same in pre-0.3.0 Summary file and 0.3 summary file
350 self.lane = int(data[0])
353 parsed_data = [ parse_mean_range(x) for x in data[1:] ]
354 # this is the < 0.3 Pipeline version
355 self.cluster = parsed_data[0]
356 self.average_first_cycle_intensity = parsed_data[1]
357 self.percent_intensity_after_20_cycles = parsed_data[2]
358 self.percent_pass_filter_clusters = parsed_data[3]
359 self.percent_pass_filter_align = parsed_data[4]
360 self.average_alignment_score = parsed_data[5]
361 self.percent_error_rate = parsed_data[6]
362 elif len(data) == 10:
363 parsed_data = [ parse_mean_range(x) for x in data[2:] ]
364 # this is the >= 0.3 summary file
365 self.lane_yield = data[1]
366 self.cluster = parsed_data[0]
367 self.cluster_pass_filter = parsed_data[1]
368 self.average_first_cycle_intensity = parsed_data[2]
369 self.percent_intensity_after_20_cycles = parsed_data[3]
370 self.percent_pass_filter_clusters = parsed_data[4]
371 self.percent_pass_filter_align = parsed_data[5]
372 self.average_alignment_score = parsed_data[6]
373 self.percent_error_rate = parsed_data[7]
376 class LaneResultSummaryHiSeq(LaneResultSummary):
377 def __init__(self, data=None, xml=None):
378 super(LaneResultSummaryHiSeq, self).__init__(data, xml)
381 self.set_elements_from_source(data)
383 self.set_elements(xml)
385 def set_elements_from_source(self, element):
386 """Read from an initial summary data file. Either xml or html
388 # same in pre-0.3.0 Summary file and 0.3 summary file
389 lane = element.attrib
390 self.lane = int(lane['key'])
391 #self.lane_yield = data[1]
392 self.cluster = (int(lane['ClustersRaw']),
393 float(lane['ClustersRawSD']))
394 self.cluster_pass_filter = (int(lane['ClustersPF']),
395 float(lane['ClustersPFSD']))
396 self.average_first_cycle_intensity = (int(lane['FirstCycleIntPF']),
397 float(lane['FirstCycleIntPFSD']))
398 self.percent_intensity_after_20_cycles = (
399 float(lane['PrcIntensityAfter20CyclesPF']),
400 float(lane['PrcIntensityAfter20CyclesPFSD']))
401 self.percent_pass_filter_clusters = (
402 float(lane['PrcPFClusters']),
403 float(lane['PrcPFClustersSD']))
404 self.percent_pass_filter_align = (
405 float(lane['PrcAlign']),
406 float(lane['PrcAlignSD']))
407 self.percent_error_rate = (
408 float(lane['ErrRatePhiX']),
409 float(lane['ErrRatePhiXSD']),)
414 Convert a value to int if its an int otherwise a float.
418 except ValueError, e:
422 def parse_mean_range(value):
424 Parse values like 123 +/- 4.5
426 if value.strip() == 'unknown':
429 values = value.split()
431 if values[0] == '+/-':
434 return tonumber(values[0])
436 average, pm, deviation = values
438 raise RuntimeError("Summary.htm file format changed")
439 return tonumber(average), tonumber(deviation)
441 def make_mean_range_element(parent, name, mean, deviation):
443 Make an etree subelement <Name mean='mean', deviation='deviation'/>
445 element = etree.SubElement(parent, name,
446 { 'mean': unicode(mean),
447 'deviation': unicode(deviation)})
450 def parse_mean_range_element(element):
452 Grab mean/deviation out of element
454 return (tonumber(element.attrib['mean']),
455 tonumber(element.attrib['deviation']))
457 def parse_summary_element(element):
459 Determine if we have a simple element or a mean/deviation element
461 if len(element.attrib) > 0:
462 return parse_mean_range_element(element)
466 def parse_xml_mean_range(element):
468 Extract mean/stddev children from an element as a tuple
473 mean = element.find('mean')
474 stddev = element.find('stdev')
475 if mean is None or stddev is None:
476 raise RuntimeError("Summary.xml file format changed, expected mean/stddev tags")
477 if mean.text is None:
478 mean_value = float('nan')
480 mean_value = tonumber(mean.text)
482 if stddev.text is None:
483 stddev_value = float('nan')
485 stddev_value = tonumber(stddev.text)
488 return (mean_value, stddev_value)
490 def isxml_file(fname):
491 with open(fname,'r') as stream:
492 return isxml_stream(stream)
494 def isxml_stream(stream):
495 """Return true or false if its sort of an xml file
498 line = stream.readline()
500 if re.match("<\?xml.*?>", line):
506 if __name__ == "__main__":
508 from optparse import OptionParser
509 parser = OptionParser('%prog [Summary.xml/Summary.htm]+')
510 opts, args = parser.parse_args()
512 parser.error('need at least one xml/html file')