8c5ab7f91215d7c38198c124e1014b6637dd738a
[htsworkflow.git] / htsworkflow / pipelines / summary.py
1 """
2 Analyze the Summary.htm file produced by GERALD
3 """
4 import logging
5 import re
6 import types
7 from pprint import pprint
8
9 #from htsworkflow.pipelines.runfolder import ElementTree
10 from lxml import html
11 from lxml import etree
12 from htsworkflow.util.ethelp import indent, flatten
13
14 LOGGER = logging.getLogger(__name__)
15 nan = float('nan')
16
17 class Summary(object):
18     """
19     Extract some useful information from the Summary.htm file
20     """
21     XML_VERSION = 3
22     SUMMARY = 'Summary'
23
24     class LaneResultSummary(object):
25         """
26         Parse the LaneResultSummary table out of Summary.htm
27         Mostly for the cluster number
28         """
29         LANE_RESULT_SUMMARY = 'LaneResultSummary'
30         TAGS = {
31           'LaneYield': 'lane_yield',
32           'Cluster': 'cluster', # Raw
33           'ClusterPF': 'cluster_pass_filter',
34           'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
35           'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
36           'PercentPassFilterClusters': 'percent_pass_filter_clusters',
37           'PercentPassFilterAlign': 'percent_pass_filter_align',
38           'AverageAlignmentScore': 'average_alignment_score',
39           'PercentErrorRate': 'percent_error_rate'
40         }
41         # These are tags that have mean/stdev as found in the GERALD Summary.xml file
42         GERALD_TAGS = {
43           #'laneYield': 'lane_yield', #this is just a number
44           'clusterCountRaw': 'cluster', # Raw
45           'clusterCountPF': 'cluster_pass_filter',
46           'oneSig': 'average_first_cycle_intensity',
47           'signal20AsPctOf1': 'percent_intensity_after_20_cycles',
48           'percentClustersPF': 'percent_pass_filter_clusters',
49           'percentUniquelyAlignedPF': 'percent_pass_filter_align',
50           'averageAlignScorePF': 'average_alignment_score',
51           'errorPF': 'percent_error_rate'
52         }
53
54         def __init__(self, html=None, xml=None):
55             self.lane = None
56             self.end = 0
57             self.lane_yield = None
58             self.cluster = None
59             self.cluster_pass_filter = None
60             self.average_first_cycle_intensity = None
61             self.percent_intensity_after_20_cycles = None
62             self.percent_pass_filter_clusters = None
63             self.percent_pass_filter_align = None
64             self.average_alignment_score = None
65             self.percent_error_rate = None
66
67             if html is not None:
68                 self.set_elements_from_source(html)
69             if xml is not None:
70                 self.set_elements(xml)
71
72         def set_elements_from_source(self, data):
73             """Read from an initial summary data file. Either xml or html
74             """
75             if not len(data) in (8,10):
76                 raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))
77
78             # same in pre-0.3.0 Summary file and 0.3 summary file
79             self.lane = int(data[0])
80
81             if len(data) == 8:
82                 parsed_data = [ parse_mean_range(x) for x in data[1:] ]
83                 # this is the < 0.3 Pipeline version
84                 self.cluster = parsed_data[0]
85                 self.average_first_cycle_intensity = parsed_data[1]
86                 self.percent_intensity_after_20_cycles = parsed_data[2]
87                 self.percent_pass_filter_clusters = parsed_data[3]
88                 self.percent_pass_filter_align = parsed_data[4]
89                 self.average_alignment_score = parsed_data[5]
90                 self.percent_error_rate = parsed_data[6]
91             elif len(data) == 10:
92                 parsed_data = [ parse_mean_range(x) for x in data[2:] ]
93                 # this is the >= 0.3 summary file
94                 self.lane_yield = data[1]
95                 self.cluster = parsed_data[0]
96                 self.cluster_pass_filter = parsed_data[1]
97                 self.average_first_cycle_intensity = parsed_data[2]
98                 self.percent_intensity_after_20_cycles = parsed_data[3]
99                 self.percent_pass_filter_clusters = parsed_data[4]
100                 self.percent_pass_filter_align = parsed_data[5]
101                 self.average_alignment_score = parsed_data[6]
102                 self.percent_error_rate = parsed_data[7]
103
104         def set_elements_from_gerald_xml(self, read, element):
105             self.lane = int(element.find('laneNumber').text)
106             self.end = read
107             lane_yield_node = element.find('laneYield')
108             if lane_yield_node is not None:
109                 self.lane_yield = int(lane_yield_node.text)
110             else:
111                 self.lane_yield = None
112
113             for GeraldName, LRSName in Summary.LaneResultSummary.GERALD_TAGS.items():
114                 node = element.find(GeraldName)
115                 if node is None:
116                     LOGGER.info("Couldn't find %s" % (GeraldName))
117                 setattr(self, LRSName, parse_xml_mean_range(node))
118
119         def get_elements(self):
120             lane_result = etree.Element(
121                             Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
122                             {'lane': unicode(self.lane), 'end': unicode(self.end)})
123             for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
124                 value = getattr(self, variable_name)
125                 if value is None:
126                     continue
127                 # it looks like a sequence
128                 elif type(value) in (types.TupleType, types.ListType):
129                     element = make_mean_range_element(
130                       lane_result,
131                       tag,
132                       *value
133                     )
134                 else:
135                     element = etree.SubElement(lane_result, tag)
136                     element.text = unicode(value)
137             return lane_result
138
139         def set_elements(self, tree):
140             if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
141                 raise ValueError('Expected %s' % (
142                         Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
143             self.lane = int(tree.attrib['lane'])
144             # default to the first end, for the older summary files
145             # that are single ended
146             self.end = int(tree.attrib.get('end', 0))
147             tags = Summary.LaneResultSummary.TAGS
148             for element in list(tree):
149                 try:
150                     variable_name = tags[element.tag]
151                     setattr(self, variable_name,
152                             parse_summary_element(element))
153                 except KeyError, e:
154                     LOGGER.warn('Unrecognized tag %s' % (element.tag,))
155
156     def __init__(self, filename=None, xml=None):
157         # lane results is a list of 1 or 2 ends containing
158         # a dictionary of all the lanes reported in this
159         # summary file
160         self.lane_results = [{}]
161
162         if filename is not None:
163             self._extract_lane_results(filename)
164         if xml is not None:
165             self.set_elements(xml)
166
167     def __getitem__(self, key):
168         return self.lane_results[key]
169
170     def __len__(self):
171         return len(self.lane_results)
172
173     def _flattened_row(self, row):
174         """
175         flatten the children of a <tr>...</tr>
176         """
177         return [flatten(x) for x in row.getchildren() ]
178
179     def _parse_table(self, table):
180         """
181         assumes the first line is the header of a table,
182         and that the remaining rows are data
183         """
184         rows = table.getchildren()
185         data = []
186         for r in rows:
187             data.append(self._flattened_row(r))
188         return data
189
190     def _extract_lane_results(self, pathname):
191         """
192         Extract just the lane results.
193         Currently those are the only ones we care about.
194         """
195
196         tables = self._extract_named_tables(pathname)
197
198
199     def _extract_named_tables(self, pathname):
200         """
201         extract all the 'named' tables from a Summary.htm file
202         and return as a dictionary
203
204         Named tables are <h2>...</h2><table>...</table> pairs
205         The contents of the h2 tag is considered to the name
206         of the table.
207         """
208         if isxml_file(pathname):
209             tree = etree.parse(pathname).getroot()
210         else:
211             # html
212             tree = html.parse(pathname).getroot()
213         # tree = etree.parse(pathname).getroot()
214         # hack for 1.1rc1, this should be removed when possible.
215         #file_body = open(pathname).read()
216         #file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=')
217         #tree = etree.fromstring(file_body)
218
219         # are we reading the xml or the html version of the Summary file?
220         if tree.tag.lower() == 'summary':
221             # summary version
222             tables = self._extract_named_tables_from_gerald_xml(tree)
223         elif tree.tag.lower() == 'html':
224             # html version
225             tables = self._extract_named_tables_from_html(tree)
226             table_names = [ ('Lane Results Summary', 0),
227                             ('Lane Results Summary : Read 1', 0),
228                             ('Lane Results Summary : Read 2', 1),]
229             for name, end in table_names:
230                 if tables.has_key(name):
231                     self._extract_lane_results_for_end(tables, name, end)
232
233         if len(self.lane_results[0])  == 0:
234             LOGGER.warning("No Lane Results Summary Found in %s" % (pathname,))
235
236     def _extract_named_tables_from_gerald_xml(self, tree):
237         """
238         Extract useful named tables from a gerald created Summary.xml file
239         """
240         # using the function to convert to lower instead of just writing it
241         # makes the tag easier to read (IMO)
242         useful_tables = ['LaneResultsSummary'.lower(),]
243
244         tables ={}
245         for child in tree.getchildren():
246             if child.tag.lower() in  useful_tables:
247                 read_tree = child.find('Read')
248                 # we want 0 based.
249                 read = int(read_tree.find('readNumber').text)-1
250                 for element in read_tree.getchildren():
251                     if element.tag.lower() == "lane":
252                         lrs = Summary.LaneResultSummary()
253                         lrs.set_elements_from_gerald_xml(read, element)
254                         self.lane_results[lrs.end][lrs.lane] = lrs
255         # probably not useful
256         return tables
257
258     ###### START HTML Table Extraction ########
259     def _extract_named_tables_from_html(self, tree):
260         tables = {}
261         for t in tree.findall('*//table'):
262             previous = t.getprevious()
263             if previous is None:
264                 previous = t.getparent()
265
266             if previous.tag.lower() == 'div':
267                 previous = previous.getprevious()
268
269             if previous.tag in ('h2', 'p'):
270                 # we have a table
271                 name = flatten(previous)
272                 data = self._parse_table(t)
273                 tables[name] = data
274         return tables
275
276     def _extract_lane_results_for_end(self, tables, table_name, end):
277         """
278         extract the Lane Results Summary table
279         """
280         # parse lane result summary
281         lane_summary = tables[table_name]
282         # this is version 1 of the summary file
283         if len(lane_summary[-1]) == 8:
284             # strip header
285             headers = lane_summary[0]
286             # grab the lane by lane data
287             lane_summary = lane_summary[1:]
288
289         # len(lane_summary[-1] = 10 is version 2 of the summary file
290         #                      = 9  is version 3 of the Summary.htm file
291         elif len(lane_summary[-1]) in (9, 10):
292             # lane_summary[0] is a different less specific header row
293             headers = lane_summary[1]
294             lane_summary = lane_summary[2:10]
295             # after the last lane, there's a set of chip wide averages
296
297         # append an extra dictionary if needed
298         if len(self.lane_results) < (end + 1):
299           self.lane_results.append({})
300
301         for r in lane_summary:
302             lrs = Summary.LaneResultSummary(html=r)
303             lrs.end = end
304             self.lane_results[lrs.end][lrs.lane] = lrs
305     ###### END HTML Table Extraction ########
306
307     def get_elements(self):
308         summary = etree.Element(Summary.SUMMARY,
309                                       {'version': unicode(Summary.XML_VERSION)})
310         for end in self.lane_results:
311             for lane in end.values():
312                 summary.append(lane.get_elements())
313         return summary
314
315     def set_elements(self, tree):
316         if tree.tag != Summary.SUMMARY:
317             return ValueError("Expected %s" % (Summary.SUMMARY,))
318         xml_version = int(tree.attrib.get('version', 0))
319         if xml_version > Summary.XML_VERSION:
320             LOGGER.warn('Summary XML tree is a higher version than this class')
321         for element in list(tree):
322             lrs = Summary.LaneResultSummary()
323             lrs.set_elements(element)
324             if len(self.lane_results) < (lrs.end + 1):
325               self.lane_results.append({})
326             self.lane_results[lrs.end][lrs.lane] = lrs
327
328     def is_paired_end(self):
329       return len(self.lane_results) == 2
330
331     def dump(self):
332         """
333         Debugging function, report current object
334         """
335         tree = self.get_elements()
336         print etree.tostring(tree)
337
338 def tonumber(v):
339     """
340     Convert a value to int if its an int otherwise a float.
341     """
342     try:
343         v = int(v)
344     except ValueError, e:
345         v = float(v)
346     return v
347
348 def parse_mean_range(value):
349     """
350     Parse values like 123 +/- 4.5
351     """
352     if value.strip() == 'unknown':
353         return nan, nan
354
355     values = value.split()
356     if len(values) == 1:
357         if values[0] == '+/-':
358             return nan,nan
359         else:
360             return tonumber(values[0])
361
362     average, pm, deviation = values
363     if pm != '+/-':
364         raise RuntimeError("Summary.htm file format changed")
365     return tonumber(average), tonumber(deviation)
366
367 def make_mean_range_element(parent, name, mean, deviation):
368     """
369     Make an etree subelement <Name mean='mean', deviation='deviation'/>
370     """
371     element = etree.SubElement(parent, name,
372                                      { 'mean': unicode(mean),
373                                        'deviation': unicode(deviation)})
374     return element
375
376 def parse_mean_range_element(element):
377     """
378     Grab mean/deviation out of element
379     """
380     return (tonumber(element.attrib['mean']),
381             tonumber(element.attrib['deviation']))
382
383 def parse_summary_element(element):
384     """
385     Determine if we have a simple element or a mean/deviation element
386     """
387     if len(element.attrib) > 0:
388         return parse_mean_range_element(element)
389     else:
390         return element.text
391
392 def parse_xml_mean_range(element):
393     """
394     Extract mean/stddev children from an element as a tuple
395     """
396     if element is None:
397         return None
398
399     mean = element.find('mean')
400     stddev = element.find('stdev')
401     if mean is None or stddev is None:
402         raise RuntimeError("Summary.xml file format changed, expected mean/stddev tags")
403     if mean.text is None:
404         mean_value = float('nan')
405     else:
406         mean_value = tonumber(mean.text)
407
408     if stddev.text is None:
409         stddev_value = float('nan')
410     else:
411         stddev_value = tonumber(stddev.text)
412
413
414     return (mean_value, stddev_value)
415
416 def isxml_file(fname):
417     with open(fname,'r') as stream:
418         return isxml_stream(stream)
419
420 def isxml_stream(stream):
421     """Return true or false if its sort of an xml file
422     """
423     pos = stream.tell()
424     line = stream.readline()
425     stream.seek(pos)
426     if re.match("<\?xml.*?>", line):
427         # attempt at xml
428         return True
429     else:
430         return False
431
432 if __name__ == "__main__":
433     # test code
434     from optparse import OptionParser
435     parser = OptionParser('%prog [Summary.xml/Summary.htm]+')
436     opts, args = parser.parse_args()
437     if len(args) == 0:
438         parser.error('need at least one xml/html file')
439     for fname in args:
440         s = Summary(fname)
441         s.dump()
442