2 Analyze the Summary.htm file produced by GERALD
7 from pprint import pprint
9 #from htsworkflow.pipelines.runfolder import ElementTree
11 from lxml import etree
12 from htsworkflow.util.ethelp import indent, flatten
14 LOGGER = logging.getLogger(__name__)
17 class Summary(object):
19 Extract some useful information from the Summary.htm file
24 class LaneResultSummary(object):
26 Parse the LaneResultSummary table out of Summary.htm
27 Mostly for the cluster number
29 LANE_RESULT_SUMMARY = 'LaneResultSummary'
31 'LaneYield': 'lane_yield',
32 'Cluster': 'cluster', # Raw
33 'ClusterPF': 'cluster_pass_filter',
34 'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
35 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
36 'PercentPassFilterClusters': 'percent_pass_filter_clusters',
37 'PercentPassFilterAlign': 'percent_pass_filter_align',
38 'AverageAlignmentScore': 'average_alignment_score',
39 'PercentErrorRate': 'percent_error_rate'
41 # These are tags that have mean/stdev as found in the GERALD Summary.xml file
43 #'laneYield': 'lane_yield', #this is just a number
44 'clusterCountRaw': 'cluster', # Raw
45 'clusterCountPF': 'cluster_pass_filter',
46 'oneSig': 'average_first_cycle_intensity',
47 'signal20AsPctOf1': 'percent_intensity_after_20_cycles',
48 'percentClustersPF': 'percent_pass_filter_clusters',
49 'percentUniquelyAlignedPF': 'percent_pass_filter_align',
50 'averageAlignScorePF': 'average_alignment_score',
51 'errorPF': 'percent_error_rate'
54 def __init__(self, html=None, xml=None):
57 self.lane_yield = None
59 self.cluster_pass_filter = None
60 self.average_first_cycle_intensity = None
61 self.percent_intensity_after_20_cycles = None
62 self.percent_pass_filter_clusters = None
63 self.percent_pass_filter_align = None
64 self.average_alignment_score = None
65 self.percent_error_rate = None
68 self.set_elements_from_source(html)
70 self.set_elements(xml)
72 def set_elements_from_source(self, data):
73 """Read from an initial summary data file. Either xml or html
75 if not len(data) in (8,10):
76 raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))
78 # same in pre-0.3.0 Summary file and 0.3 summary file
79 self.lane = int(data[0])
82 parsed_data = [ parse_mean_range(x) for x in data[1:] ]
83 # this is the < 0.3 Pipeline version
84 self.cluster = parsed_data[0]
85 self.average_first_cycle_intensity = parsed_data[1]
86 self.percent_intensity_after_20_cycles = parsed_data[2]
87 self.percent_pass_filter_clusters = parsed_data[3]
88 self.percent_pass_filter_align = parsed_data[4]
89 self.average_alignment_score = parsed_data[5]
90 self.percent_error_rate = parsed_data[6]
92 parsed_data = [ parse_mean_range(x) for x in data[2:] ]
93 # this is the >= 0.3 summary file
94 self.lane_yield = data[1]
95 self.cluster = parsed_data[0]
96 self.cluster_pass_filter = parsed_data[1]
97 self.average_first_cycle_intensity = parsed_data[2]
98 self.percent_intensity_after_20_cycles = parsed_data[3]
99 self.percent_pass_filter_clusters = parsed_data[4]
100 self.percent_pass_filter_align = parsed_data[5]
101 self.average_alignment_score = parsed_data[6]
102 self.percent_error_rate = parsed_data[7]
104 def set_elements_from_gerald_xml(self, read, element):
105 self.lane = int(element.find('laneNumber').text)
107 lane_yield_node = element.find('laneYield')
108 if lane_yield_node is not None:
109 self.lane_yield = int(lane_yield_node.text)
111 self.lane_yield = None
113 for GeraldName, LRSName in Summary.LaneResultSummary.GERALD_TAGS.items():
114 node = element.find(GeraldName)
116 LOGGER.info("Couldn't find %s" % (GeraldName))
117 setattr(self, LRSName, parse_xml_mean_range(node))
119 def get_elements(self):
120 lane_result = etree.Element(
121 Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
122 {'lane': unicode(self.lane), 'end': unicode(self.end)})
123 for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
124 value = getattr(self, variable_name)
127 # it looks like a sequence
128 elif type(value) in (types.TupleType, types.ListType):
129 element = make_mean_range_element(
135 element = etree.SubElement(lane_result, tag)
136 element.text = unicode(value)
139 def set_elements(self, tree):
140 if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
141 raise ValueError('Expected %s' % (
142 Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
143 self.lane = int(tree.attrib['lane'])
144 # default to the first end, for the older summary files
145 # that are single ended
146 self.end = int(tree.attrib.get('end', 0))
147 tags = Summary.LaneResultSummary.TAGS
148 for element in list(tree):
150 variable_name = tags[element.tag]
151 setattr(self, variable_name,
152 parse_summary_element(element))
154 LOGGER.warn('Unrecognized tag %s' % (element.tag,))
156 def __init__(self, filename=None, xml=None):
157 # lane results is a list of 1 or 2 ends containing
158 # a dictionary of all the lanes reported in this
160 self.lane_results = [{}]
162 if filename is not None:
163 self._extract_lane_results(filename)
165 self.set_elements(xml)
167 def __getitem__(self, key):
168 return self.lane_results[key]
171 return len(self.lane_results)
173 def _flattened_row(self, row):
175 flatten the children of a <tr>...</tr>
177 return [flatten(x) for x in row.getchildren() ]
179 def _parse_table(self, table):
181 assumes the first line is the header of a table,
182 and that the remaining rows are data
184 rows = table.getchildren()
187 data.append(self._flattened_row(r))
190 def _extract_lane_results(self, pathname):
192 Extract just the lane results.
193 Currently those are the only ones we care about.
196 tables = self._extract_named_tables(pathname)
199 def _extract_named_tables(self, pathname):
201 extract all the 'named' tables from a Summary.htm file
202 and return as a dictionary
204 Named tables are <h2>...</h2><table>...</table> pairs
205 The contents of the h2 tag is considered to the name
208 if isxml_file(pathname):
209 tree = etree.parse(pathname).getroot()
212 tree = html.parse(pathname).getroot()
213 # tree = etree.parse(pathname).getroot()
214 # hack for 1.1rc1, this should be removed when possible.
215 #file_body = open(pathname).read()
216 #file_body = file_body.replace('CHASTITY<=', 'CHASTITY<=')
217 #tree = etree.fromstring(file_body)
219 # are we reading the xml or the html version of the Summary file?
220 if tree.tag.lower() == 'summary':
222 tables = self._extract_named_tables_from_gerald_xml(tree)
223 elif tree.tag.lower() == 'html':
225 tables = self._extract_named_tables_from_html(tree)
226 table_names = [ ('Lane Results Summary', 0),
227 ('Lane Results Summary : Read 1', 0),
228 ('Lane Results Summary : Read 2', 1),]
229 for name, end in table_names:
230 if tables.has_key(name):
231 self._extract_lane_results_for_end(tables, name, end)
233 if len(self.lane_results[0]) == 0:
234 LOGGER.warning("No Lane Results Summary Found in %s" % (pathname,))
236 def _extract_named_tables_from_gerald_xml(self, tree):
238 Extract useful named tables from a gerald created Summary.xml file
240 # using the function to convert to lower instead of just writing it
241 # makes the tag easier to read (IMO)
242 useful_tables = ['LaneResultsSummary'.lower(),]
245 for child in tree.getchildren():
246 if child.tag.lower() in useful_tables:
247 read_tree = child.find('Read')
249 read = int(read_tree.find('readNumber').text)-1
250 for element in read_tree.getchildren():
251 if element.tag.lower() == "lane":
252 lrs = Summary.LaneResultSummary()
253 lrs.set_elements_from_gerald_xml(read, element)
254 self.lane_results[lrs.end][lrs.lane] = lrs
255 # probably not useful
258 ###### START HTML Table Extraction ########
259 def _extract_named_tables_from_html(self, tree):
261 for t in tree.findall('*//table'):
262 previous = t.getprevious()
264 previous = t.getparent()
266 if previous.tag.lower() == 'div':
267 previous = previous.getprevious()
269 if previous.tag in ('h2', 'p'):
271 name = flatten(previous)
272 data = self._parse_table(t)
276 def _extract_lane_results_for_end(self, tables, table_name, end):
278 extract the Lane Results Summary table
280 # parse lane result summary
281 lane_summary = tables[table_name]
282 # this is version 1 of the summary file
283 if len(lane_summary[-1]) == 8:
285 headers = lane_summary[0]
286 # grab the lane by lane data
287 lane_summary = lane_summary[1:]
289 # len(lane_summary[-1] = 10 is version 2 of the summary file
290 # = 9 is version 3 of the Summary.htm file
291 elif len(lane_summary[-1]) in (9, 10):
292 # lane_summary[0] is a different less specific header row
293 headers = lane_summary[1]
294 lane_summary = lane_summary[2:10]
295 # after the last lane, there's a set of chip wide averages
297 # append an extra dictionary if needed
298 if len(self.lane_results) < (end + 1):
299 self.lane_results.append({})
301 for r in lane_summary:
302 lrs = Summary.LaneResultSummary(html=r)
304 self.lane_results[lrs.end][lrs.lane] = lrs
305 ###### END HTML Table Extraction ########
307 def get_elements(self):
308 summary = etree.Element(Summary.SUMMARY,
309 {'version': unicode(Summary.XML_VERSION)})
310 for end in self.lane_results:
311 for lane in end.values():
312 summary.append(lane.get_elements())
315 def set_elements(self, tree):
316 if tree.tag != Summary.SUMMARY:
317 return ValueError("Expected %s" % (Summary.SUMMARY,))
318 xml_version = int(tree.attrib.get('version', 0))
319 if xml_version > Summary.XML_VERSION:
320 LOGGER.warn('Summary XML tree is a higher version than this class')
321 for element in list(tree):
322 lrs = Summary.LaneResultSummary()
323 lrs.set_elements(element)
324 if len(self.lane_results) < (lrs.end + 1):
325 self.lane_results.append({})
326 self.lane_results[lrs.end][lrs.lane] = lrs
328 def is_paired_end(self):
329 return len(self.lane_results) == 2
333 Debugging function, report current object
335 tree = self.get_elements()
336 print etree.tostring(tree)
340 Convert a value to int if its an int otherwise a float.
344 except ValueError, e:
348 def parse_mean_range(value):
350 Parse values like 123 +/- 4.5
352 if value.strip() == 'unknown':
355 values = value.split()
357 if values[0] == '+/-':
360 return tonumber(values[0])
362 average, pm, deviation = values
364 raise RuntimeError("Summary.htm file format changed")
365 return tonumber(average), tonumber(deviation)
367 def make_mean_range_element(parent, name, mean, deviation):
369 Make an etree subelement <Name mean='mean', deviation='deviation'/>
371 element = etree.SubElement(parent, name,
372 { 'mean': unicode(mean),
373 'deviation': unicode(deviation)})
376 def parse_mean_range_element(element):
378 Grab mean/deviation out of element
380 return (tonumber(element.attrib['mean']),
381 tonumber(element.attrib['deviation']))
383 def parse_summary_element(element):
385 Determine if we have a simple element or a mean/deviation element
387 if len(element.attrib) > 0:
388 return parse_mean_range_element(element)
392 def parse_xml_mean_range(element):
394 Extract mean/stddev children from an element as a tuple
399 mean = element.find('mean')
400 stddev = element.find('stdev')
401 if mean is None or stddev is None:
402 raise RuntimeError("Summary.xml file format changed, expected mean/stddev tags")
403 if mean.text is None:
404 mean_value = float('nan')
406 mean_value = tonumber(mean.text)
408 if stddev.text is None:
409 stddev_value = float('nan')
411 stddev_value = tonumber(stddev.text)
414 return (mean_value, stddev_value)
416 def isxml_file(fname):
417 with open(fname,'r') as stream:
418 return isxml_stream(stream)
420 def isxml_stream(stream):
421 """Return true or false if its sort of an xml file
424 line = stream.readline()
426 if re.match("<\?xml.*?>", line):
432 if __name__ == "__main__":
434 from optparse import OptionParser
435 parser = OptionParser('%prog [Summary.xml/Summary.htm]+')
436 opts, args = parser.parse_args()
438 parser.error('need at least one xml/html file')