Initial port to python3
[htsworkflow.git] / htsworkflow / pipelines / summary.py
1 """
2 Analyze the Summary.htm file produced by GERALD
3 """
4 import os
5 import logging
6 import re
7 import types
8 from pprint import pprint
9
10 from lxml import html
11 from lxml import etree
12 from htsworkflow.util.ethelp import indent, flatten
13
14 LOGGER = logging.getLogger(__name__)
15 nan = float('nan')
16
17 class Summary(object):
18     """
19     Extract some useful information from the Summary.htm file
20     """
21     XML_VERSION = 3
22     SUMMARY = 'Summary'
23
24     def __init__(self, filename=None, xml=None):
25         # lane results is a list of 1 or 2 ends containing
26         # a dictionary of all the lanes reported in this
27         # summary file
28         self.lane_results = [{}]
29
30         if filename is not None:
31             self._extract_lane_results(filename)
32         if xml is not None:
33             self.set_elements(xml)
34
35     def __getitem__(self, key):
36         return self.lane_results[key]
37
38     def __len__(self):
39         return len(self.lane_results)
40
41     def get_elements(self):
42         summary = etree.Element(Summary.SUMMARY,
43                                       {'version': str(Summary.XML_VERSION)})
44         for end in self.lane_results:
45             for lane in list(end.values()):
46                 summary.append(lane.get_elements())
47         return summary
48
49     def set_elements(self, tree):
50         if tree.tag != Summary.SUMMARY:
51             return ValueError("Expected %s" % (Summary.SUMMARY,))
52         xml_version = int(tree.attrib.get('version', 0))
53         if xml_version > Summary.XML_VERSION:
54             LOGGER.warn('Summary XML tree is a higher version than this class')
55         for element in list(tree):
56             lrs = LaneResultSummaryGA()
57             lrs.set_elements(element)
58             if len(self.lane_results) < (lrs.end + 1):
59               self.lane_results.append({})
60             self.lane_results[lrs.end][lrs.lane] = lrs
61
62     def is_paired_end(self):
63       return len(self.lane_results) == 2
64
65     def dump(self):
66         """
67         Debugging function, report current object
68         """
69         tree = self.get_elements()
70         print(etree.tostring(tree))
71
72 class SummaryGA(Summary):
73     def __init__(self, filename=None, xml=None):
74         super(SummaryGA, self).__init__(filename, xml)
75
76     def _flattened_row(self, row):
77         """
78         flatten the children of a <tr>...</tr>
79         """
80         return [flatten(x) for x in row.getchildren() ]
81
82     def _parse_table(self, table):
83         """
84         assumes the first line is the header of a table,
85         and that the remaining rows are data
86         """
87         rows = table.getchildren()
88         data = []
89         for r in rows:
90             data.append(self._flattened_row(r))
91         return data
92
93     def _extract_lane_results(self, pathname):
94         """
95         Extract just the lane results.
96         Currently those are the only ones we care about.
97         """
98
99         tables = self._extract_named_tables(pathname)
100
101
102     def _extract_named_tables(self, pathname):
103         """
104         extract all the 'named' tables from a Summary.htm file
105         and return as a dictionary
106
107         Named tables are <h2>...</h2><table>...</table> pairs
108         The contents of the h2 tag is considered to the name
109         of the table.
110         """
111         if isxml_file(pathname):
112             tree = etree.parse(pathname).getroot()
113         else:
114             # html
115             tree = html.parse(pathname).getroot()
116         # tree = etree.parse(pathname).getroot()
117         # hack for 1.1rc1, this should be removed when possible.
118         #file_body = open(pathname).read()
119         #file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=')
120         #tree = etree.fromstring(file_body)
121
122         # are we reading the xml or the html version of the Summary file?
123         if tree.tag.lower() == 'summary':
124             # summary version
125             tables = self._extract_named_tables_from_gerald_xml(tree)
126         elif tree.tag.lower() == 'html':
127             # html version
128             tables = self._extract_named_tables_from_html(tree)
129             table_names = [ ('Lane Results Summary', 0),
130                             ('Lane Results Summary : Read 1', 0),
131                             ('Lane Results Summary : Read 2', 1),]
132             for name, end in table_names:
133                 if name in tables:
134                     self._extract_lane_results_for_end(tables, name, end)
135
136         if len(self.lane_results[0])  == 0:
137             LOGGER.warning("No Lane Results Summary Found in %s" % (pathname,))
138
139     def _extract_named_tables_from_gerald_xml(self, tree):
140         """
141         Extract useful named tables from a gerald created Summary.xml file
142         """
143         # using the function to convert to lower instead of just writing it
144         # makes the tag easier to read (IMO)
145         useful_tables = ['LaneResultsSummary'.lower(),]
146
147         tables ={}
148         for child in tree.getchildren():
149             if child.tag.lower() in  useful_tables:
150                 read_tree = child.find('Read')
151                 # we want 0 based.
152                 read = int(read_tree.find('readNumber').text)-1
153                 for element in read_tree.getchildren():
154                     if element.tag.lower() == "lane":
155                         lrs = LaneResultSummaryGA()
156                         lrs.set_elements_from_gerald_xml(read, element)
157                         self.lane_results[lrs.end][lrs.lane] = lrs
158         # probably not useful
159         return tables
160
161     ###### START HTML Table Extraction ########
162     def _extract_named_tables_from_html(self, tree):
163         tables = {}
164         for t in tree.findall('*//table'):
165             previous = t.getprevious()
166             if previous is None:
167                 previous = t.getparent()
168
169             if previous.tag.lower() == 'div':
170                 previous = previous.getprevious()
171
172             if previous.tag in ('h2', 'p'):
173                 # we have a table
174                 name = flatten(previous)
175                 data = self._parse_table(t)
176                 tables[name] = data
177         return tables
178
179     def _extract_lane_results_for_end(self, tables, table_name, end):
180         """
181         extract the Lane Results Summary table
182         """
183         # parse lane result summary
184         lane_summary = tables[table_name]
185         # this is version 1 of the summary file
186         if len(lane_summary[-1]) == 8:
187             # strip header
188             headers = lane_summary[0]
189             # grab the lane by lane data
190             lane_summary = lane_summary[1:]
191
192         # len(lane_summary[-1] = 10 is version 2 of the summary file
193         #                      = 9  is version 3 of the Summary.htm file
194         elif len(lane_summary[-1]) in (9, 10):
195             # lane_summary[0] is a different less specific header row
196             headers = lane_summary[1]
197             lane_summary = lane_summary[2:10]
198             # after the last lane, there's a set of chip wide averages
199
200         # append an extra dictionary if needed
201         if len(self.lane_results) < (end + 1):
202           self.lane_results.append({})
203
204         for r in lane_summary:
205             lrs = LaneResultSummaryGA(html=r)
206             lrs.end = end
207             self.lane_results[lrs.end][lrs.lane] = lrs
208     ###### END HTML Table Extraction ########
209
210
211 class SummaryHiSeq(Summary):
212     def __init__(self, filename=None, xml=None):
213         super(SummaryHiSeq, self).__init__(filename, xml)
214
215     def _extract_lane_results(self, filename):
216         read1 = os.path.join(filename, 'read1.xml')
217         read2 = os.path.join(filename, 'read2.xml')
218
219         if os.path.exists(read1):
220             self._extract_lane_results_for_end(read1, 0)
221         else:
222             LOGGER.warn("No read1.xml at %s." % (read1,))
223         if os.path.exists(read2):
224             self.lane_results.append({})
225             self._extract_lane_results_for_end(read2, 1)
226
227     def _extract_lane_results_for_end(self, filename, end):
228         self.tree = etree.parse(filename)
229         root = self.tree.getroot()
230         for lane in root.getchildren():
231             lrs = LaneResultSummaryHiSeq(data=lane)
232             lrs.end = end
233             self.lane_results[lrs.end][lrs.lane] = lrs
234
235
236 class LaneResultSummary(object):
237     """
238     Parse the LaneResultSummary table out of Summary.htm
239     Mostly for the cluster number
240     """
241     LANE_RESULT_SUMMARY = 'LaneResultSummary'
242     TAGS = {
243       'LaneYield': 'lane_yield',
244       'Cluster': 'cluster', # Raw
245       'ClusterPF': 'cluster_pass_filter',
246       'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
247       'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
248       'PercentPassFilterClusters': 'percent_pass_filter_clusters',
249       'PercentPassFilterAlign': 'percent_pass_filter_align',
250       'AverageAlignmentScore': 'average_alignment_score',
251       'PercentErrorRate': 'percent_error_rate'
252     }
253     # These are tags that have mean/stdev as found in the GERALD Summary.xml file
254     GERALD_TAGS = {
255       #'laneYield': 'lane_yield', #this is just a number
256       'clusterCountRaw': 'cluster', # Raw
257       'clusterCountPF': 'cluster_pass_filter',
258       'oneSig': 'average_first_cycle_intensity',
259       'signal20AsPctOf1': 'percent_intensity_after_20_cycles',
260       'percentClustersPF': 'percent_pass_filter_clusters',
261       'percentUniquelyAlignedPF': 'percent_pass_filter_align',
262       'averageAlignScorePF': 'average_alignment_score',
263       'errorPF': 'percent_error_rate'
264     }
265
266     def __init__(self, html=None, xml=None):
267         self.lane = None
268         self.end = 0
269         self.lane_yield = None
270         self.cluster = None
271         self.cluster_pass_filter = None
272         self.average_first_cycle_intensity = None
273         self.percent_intensity_after_20_cycles = None
274         self.percent_pass_filter_clusters = None
275         self.percent_pass_filter_align = None
276         self.average_alignment_score = None
277         self.percent_error_rate = None
278
279
280     def get_elements(self):
281         lane_result = etree.Element(
282                         LaneResultSummary.LANE_RESULT_SUMMARY,
283                         {'lane': str(self.lane), 'end': str(self.end)})
284         for tag, variable_name in list(LaneResultSummary.TAGS.items()):
285             value = getattr(self, variable_name)
286             if value is None:
287                 continue
288             # it looks like a sequence
289             elif type(value) in (tuple, list):
290                 element = make_mean_range_element(
291                   lane_result,
292                   tag,
293                   *value
294                 )
295             else:
296                 element = etree.SubElement(lane_result, tag)
297                 element.text = str(value)
298         return lane_result
299
300     def set_elements(self, tree):
301         if tree.tag != LaneResultSummary.LANE_RESULT_SUMMARY:
302             raise ValueError('Expected %s' % (
303                     LaneResultSummary.LANE_RESULT_SUMMARY))
304         self.lane = int(tree.attrib['lane'])
305         # default to the first end, for the older summary files
306         # that are single ended
307         self.end = int(tree.attrib.get('end', 0))
308         tags = LaneResultSummary.TAGS
309         for element in list(tree):
310             try:
311                 variable_name = tags[element.tag]
312                 setattr(self, variable_name,
313                         parse_summary_element(element))
314             except KeyError as e:
315                 LOGGER.warn('Unrecognized tag %s' % (element.tag,))
316
317
318 class LaneResultSummaryGA(LaneResultSummary):
319     def __init__(self, html=None, xml=None):
320         super(LaneResultSummaryGA, self).__init__(html, xml)
321
322         if html is not None:
323             self.set_elements_from_source(html)
324         if xml is not None:
325             self.set_elements(xml)
326
327     def set_elements_from_gerald_xml(self, read, element):
328         self.lane = int(element.find('laneNumber').text)
329         self.end = read
330         lane_yield_node = element.find('laneYield')
331         if lane_yield_node is not None:
332             self.lane_yield = int(lane_yield_node.text)
333         else:
334             self.lane_yield = None
335
336         for GeraldName, LRSName in list(LaneResultSummary.GERALD_TAGS.items()):
337             node = element.find(GeraldName)
338             if node is None:
339                 LOGGER.info("Couldn't find %s" % (GeraldName))
340             setattr(self, LRSName, parse_xml_mean_range(node))
341
342     def set_elements_from_source(self, data):
343         """Read from an initial summary data file. Either xml or html
344         """
345         if not len(data) in (8,10):
346             raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))
347
348         # same in pre-0.3.0 Summary file and 0.3 summary file
349         self.lane = int(data[0])
350
351         if len(data) == 8:
352             parsed_data = [ parse_mean_range(x) for x in data[1:] ]
353             # this is the < 0.3 Pipeline version
354             self.cluster = parsed_data[0]
355             self.average_first_cycle_intensity = parsed_data[1]
356             self.percent_intensity_after_20_cycles = parsed_data[2]
357             self.percent_pass_filter_clusters = parsed_data[3]
358             self.percent_pass_filter_align = parsed_data[4]
359             self.average_alignment_score = parsed_data[5]
360             self.percent_error_rate = parsed_data[6]
361         elif len(data) == 10:
362             parsed_data = [ parse_mean_range(x) for x in data[2:] ]
363             # this is the >= 0.3 summary file
364             self.lane_yield = data[1]
365             self.cluster = parsed_data[0]
366             self.cluster_pass_filter = parsed_data[1]
367             self.average_first_cycle_intensity = parsed_data[2]
368             self.percent_intensity_after_20_cycles = parsed_data[3]
369             self.percent_pass_filter_clusters = parsed_data[4]
370             self.percent_pass_filter_align = parsed_data[5]
371             self.average_alignment_score = parsed_data[6]
372             self.percent_error_rate = parsed_data[7]
373
374
375 class LaneResultSummaryHiSeq(LaneResultSummary):
376     def __init__(self, data=None, xml=None):
377         super(LaneResultSummaryHiSeq, self).__init__(data, xml)
378
379         if data is not None:
380             self.set_elements_from_source(data)
381         if xml is not None:
382             self.set_elements(xml)
383
384     def set_elements_from_source(self, element):
385         """Read from an initial summary data file. Either xml or html
386         """
387         # same in pre-0.3.0 Summary file and 0.3 summary file
388         lane = element.attrib
389         self.lane = int(lane['key'])
390         #self.lane_yield = data[1]
391         self.cluster = (int(lane['ClustersRaw']),
392                         float(lane['ClustersRawSD']))
393         self.cluster_pass_filter = (int(lane['ClustersPF']),
394                                     float(lane['ClustersPFSD']))
395         self.average_first_cycle_intensity = (int(lane['FirstCycleIntPF']),
396                                               float(lane['FirstCycleIntPFSD']))
397         self.percent_intensity_after_20_cycles = (
398             float(lane['PrcIntensityAfter20CyclesPF']),
399             float(lane['PrcIntensityAfter20CyclesPFSD']))
400         self.percent_pass_filter_clusters = (
401             float(lane['PrcPFClusters']),
402             float(lane['PrcPFClustersSD']))
403         self.percent_pass_filter_align = (
404             float(lane['PrcAlign']),
405             float(lane['PrcAlignSD']))
406         self.percent_error_rate = (
407             float(lane['ErrRatePhiX']),
408             float(lane['ErrRatePhiXSD']),)
409
410
411 def tonumber(v):
412     """
413     Convert a value to int if its an int otherwise a float.
414     """
415     try:
416         v = int(v)
417     except ValueError as e:
418         v = float(v)
419     return v
420
421 def parse_mean_range(value):
422     """
423     Parse values like 123 +/- 4.5
424     """
425     if value.strip() == 'unknown':
426         return nan, nan
427
428     values = value.split()
429     if len(values) == 1:
430         if values[0] == '+/-':
431             return nan,nan
432         else:
433             return tonumber(values[0])
434
435     average, pm, deviation = values
436     if pm != '+/-':
437         raise RuntimeError("Summary.htm file format changed")
438     return tonumber(average), tonumber(deviation)
439
440 def make_mean_range_element(parent, name, mean, deviation):
441     """
442     Make an etree subelement <Name mean='mean', deviation='deviation'/>
443     """
444     element = etree.SubElement(parent, name,
445                                      { 'mean': str(mean),
446                                        'deviation': str(deviation)})
447     return element
448
449 def parse_mean_range_element(element):
450     """
451     Grab mean/deviation out of element
452     """
453     return (tonumber(element.attrib['mean']),
454             tonumber(element.attrib['deviation']))
455
456 def parse_summary_element(element):
457     """
458     Determine if we have a simple element or a mean/deviation element
459     """
460     if len(element.attrib) > 0:
461         return parse_mean_range_element(element)
462     else:
463         return element.text
464
465 def parse_xml_mean_range(element):
466     """
467     Extract mean/stddev children from an element as a tuple
468     """
469     if element is None:
470         return None
471
472     mean = element.find('mean')
473     stddev = element.find('stdev')
474     if mean is None or stddev is None:
475         raise RuntimeError("Summary.xml file format changed, expected mean/stddev tags")
476     if mean.text is None:
477         mean_value = float('nan')
478     else:
479         mean_value = tonumber(mean.text)
480
481     if stddev.text is None:
482         stddev_value = float('nan')
483     else:
484         stddev_value = tonumber(stddev.text)
485
486
487     return (mean_value, stddev_value)
488
489 def isxml_file(fname):
490     with open(fname,'r') as stream:
491         return isxml_stream(stream)
492
493 def isxml_stream(stream):
494     """Return true or false if its sort of an xml file
495     """
496     pos = stream.tell()
497     line = stream.readline()
498     stream.seek(pos)
499     if re.match("<\?xml.*?>", line):
500         # attempt at xml
501         return True
502     else:
503         return False
504
505 if __name__ == "__main__":
506     # test code
507     from optparse import OptionParser
508     parser = OptionParser('%prog [Summary.xml/Summary.htm]+')
509     opts, args = parser.parse_args()
510     if len(args) == 0:
511         parser.error('need at least one xml/html file')
512     for fname in args:
513         s = Summary(fname)
514         s.dump()
515