convert to print_function
[htsworkflow.git] / htsworkflow / pipelines / summary.py
1 """
2 Analyze the Summary.htm file produced by GERALD
3 """
4 from __future__ import print_function
5
6 import os
7 import logging
8 import re
9 import types
10 from pprint import pprint
11
12 from lxml import html
13 from lxml import etree
14 from htsworkflow.util.ethelp import indent, flatten
15
16 LOGGER = logging.getLogger(__name__)
17 nan = float('nan')
18
19 class Summary(object):
20     """
21     Extract some useful information from the Summary.htm file
22     """
23     XML_VERSION = 3
24     SUMMARY = 'Summary'
25
26     def __init__(self, filename=None, xml=None):
27         # lane results is a list of 1 or 2 ends containing
28         # a dictionary of all the lanes reported in this
29         # summary file
30         self.lane_results = [{}]
31
32         if filename is not None:
33             self._extract_lane_results(filename)
34         if xml is not None:
35             self.set_elements(xml)
36
37     def __getitem__(self, key):
38         return self.lane_results[key]
39
40     def __len__(self):
41         return len(self.lane_results)
42
43     def get_elements(self):
44         summary = etree.Element(Summary.SUMMARY,
45                                       {'version': unicode(Summary.XML_VERSION)})
46         for end in self.lane_results:
47             for lane in end.values():
48                 summary.append(lane.get_elements())
49         return summary
50
51     def set_elements(self, tree):
52         if tree.tag != Summary.SUMMARY:
53             return ValueError("Expected %s" % (Summary.SUMMARY,))
54         xml_version = int(tree.attrib.get('version', 0))
55         if xml_version > Summary.XML_VERSION:
56             LOGGER.warn('Summary XML tree is a higher version than this class')
57         for element in list(tree):
58             lrs = LaneResultSummaryGA()
59             lrs.set_elements(element)
60             if len(self.lane_results) < (lrs.end + 1):
61               self.lane_results.append({})
62             self.lane_results[lrs.end][lrs.lane] = lrs
63
64     def is_paired_end(self):
65       return len(self.lane_results) == 2
66
67     def dump(self):
68         """
69         Debugging function, report current object
70         """
71         tree = self.get_elements()
72         print(etree.tostring(tree))
73
74 class SummaryGA(Summary):
75     def __init__(self, filename=None, xml=None):
76         super(SummaryGA, self).__init__(filename, xml)
77
78     def _flattened_row(self, row):
79         """
80         flatten the children of a <tr>...</tr>
81         """
82         return [flatten(x) for x in row.getchildren() ]
83
84     def _parse_table(self, table):
85         """
86         assumes the first line is the header of a table,
87         and that the remaining rows are data
88         """
89         rows = table.getchildren()
90         data = []
91         for r in rows:
92             data.append(self._flattened_row(r))
93         return data
94
95     def _extract_lane_results(self, pathname):
96         """
97         Extract just the lane results.
98         Currently those are the only ones we care about.
99         """
100
101         tables = self._extract_named_tables(pathname)
102
103
104     def _extract_named_tables(self, pathname):
105         """
106         extract all the 'named' tables from a Summary.htm file
107         and return as a dictionary
108
109         Named tables are <h2>...</h2><table>...</table> pairs
110         The contents of the h2 tag is considered to the name
111         of the table.
112         """
113         if isxml_file(pathname):
114             tree = etree.parse(pathname).getroot()
115         else:
116             # html
117             tree = html.parse(pathname).getroot()
118         # tree = etree.parse(pathname).getroot()
119         # hack for 1.1rc1, this should be removed when possible.
120         #file_body = open(pathname).read()
121         #file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=')
122         #tree = etree.fromstring(file_body)
123
124         # are we reading the xml or the html version of the Summary file?
125         if tree.tag.lower() == 'summary':
126             # summary version
127             tables = self._extract_named_tables_from_gerald_xml(tree)
128         elif tree.tag.lower() == 'html':
129             # html version
130             tables = self._extract_named_tables_from_html(tree)
131             table_names = [ ('Lane Results Summary', 0),
132                             ('Lane Results Summary : Read 1', 0),
133                             ('Lane Results Summary : Read 2', 1),]
134             for name, end in table_names:
135                 if tables.has_key(name):
136                     self._extract_lane_results_for_end(tables, name, end)
137
138         if len(self.lane_results[0])  == 0:
139             LOGGER.warning("No Lane Results Summary Found in %s" % (pathname,))
140
141     def _extract_named_tables_from_gerald_xml(self, tree):
142         """
143         Extract useful named tables from a gerald created Summary.xml file
144         """
145         # using the function to convert to lower instead of just writing it
146         # makes the tag easier to read (IMO)
147         useful_tables = ['LaneResultsSummary'.lower(),]
148
149         tables ={}
150         for child in tree.getchildren():
151             if child.tag.lower() in  useful_tables:
152                 read_tree = child.find('Read')
153                 # we want 0 based.
154                 read = int(read_tree.find('readNumber').text)-1
155                 for element in read_tree.getchildren():
156                     if element.tag.lower() == "lane":
157                         lrs = LaneResultSummaryGA()
158                         lrs.set_elements_from_gerald_xml(read, element)
159                         self.lane_results[lrs.end][lrs.lane] = lrs
160         # probably not useful
161         return tables
162
163     ###### START HTML Table Extraction ########
164     def _extract_named_tables_from_html(self, tree):
165         tables = {}
166         for t in tree.findall('*//table'):
167             previous = t.getprevious()
168             if previous is None:
169                 previous = t.getparent()
170
171             if previous.tag.lower() == 'div':
172                 previous = previous.getprevious()
173
174             if previous.tag in ('h2', 'p'):
175                 # we have a table
176                 name = flatten(previous)
177                 data = self._parse_table(t)
178                 tables[name] = data
179         return tables
180
181     def _extract_lane_results_for_end(self, tables, table_name, end):
182         """
183         extract the Lane Results Summary table
184         """
185         # parse lane result summary
186         lane_summary = tables[table_name]
187         # this is version 1 of the summary file
188         if len(lane_summary[-1]) == 8:
189             # strip header
190             headers = lane_summary[0]
191             # grab the lane by lane data
192             lane_summary = lane_summary[1:]
193
194         # len(lane_summary[-1] = 10 is version 2 of the summary file
195         #                      = 9  is version 3 of the Summary.htm file
196         elif len(lane_summary[-1]) in (9, 10):
197             # lane_summary[0] is a different less specific header row
198             headers = lane_summary[1]
199             lane_summary = lane_summary[2:10]
200             # after the last lane, there's a set of chip wide averages
201
202         # append an extra dictionary if needed
203         if len(self.lane_results) < (end + 1):
204           self.lane_results.append({})
205
206         for r in lane_summary:
207             lrs = LaneResultSummaryGA(html=r)
208             lrs.end = end
209             self.lane_results[lrs.end][lrs.lane] = lrs
210     ###### END HTML Table Extraction ########
211
212
213 class SummaryHiSeq(Summary):
214     def __init__(self, filename=None, xml=None):
215         super(SummaryHiSeq, self).__init__(filename, xml)
216
217     def _extract_lane_results(self, filename):
218         read1 = os.path.join(filename, 'read1.xml')
219         read2 = os.path.join(filename, 'read2.xml')
220
221         if os.path.exists(read1):
222             self._extract_lane_results_for_end(read1, 0)
223         else:
224             LOGGER.warn("No read1.xml at %s." % (read1,))
225         if os.path.exists(read2):
226             self.lane_results.append({})
227             self._extract_lane_results_for_end(read2, 1)
228
229     def _extract_lane_results_for_end(self, filename, end):
230         self.tree = etree.parse(filename)
231         root = self.tree.getroot()
232         for lane in root.getchildren():
233             lrs = LaneResultSummaryHiSeq(data=lane)
234             lrs.end = end
235             self.lane_results[lrs.end][lrs.lane] = lrs
236
237
238 class LaneResultSummary(object):
239     """
240     Parse the LaneResultSummary table out of Summary.htm
241     Mostly for the cluster number
242     """
243     LANE_RESULT_SUMMARY = 'LaneResultSummary'
244     TAGS = {
245       'LaneYield': 'lane_yield',
246       'Cluster': 'cluster', # Raw
247       'ClusterPF': 'cluster_pass_filter',
248       'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
249       'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
250       'PercentPassFilterClusters': 'percent_pass_filter_clusters',
251       'PercentPassFilterAlign': 'percent_pass_filter_align',
252       'AverageAlignmentScore': 'average_alignment_score',
253       'PercentErrorRate': 'percent_error_rate'
254     }
255     # These are tags that have mean/stdev as found in the GERALD Summary.xml file
256     GERALD_TAGS = {
257       #'laneYield': 'lane_yield', #this is just a number
258       'clusterCountRaw': 'cluster', # Raw
259       'clusterCountPF': 'cluster_pass_filter',
260       'oneSig': 'average_first_cycle_intensity',
261       'signal20AsPctOf1': 'percent_intensity_after_20_cycles',
262       'percentClustersPF': 'percent_pass_filter_clusters',
263       'percentUniquelyAlignedPF': 'percent_pass_filter_align',
264       'averageAlignScorePF': 'average_alignment_score',
265       'errorPF': 'percent_error_rate'
266     }
267
268     def __init__(self, html=None, xml=None):
269         self.lane = None
270         self.end = 0
271         self.lane_yield = None
272         self.cluster = None
273         self.cluster_pass_filter = None
274         self.average_first_cycle_intensity = None
275         self.percent_intensity_after_20_cycles = None
276         self.percent_pass_filter_clusters = None
277         self.percent_pass_filter_align = None
278         self.average_alignment_score = None
279         self.percent_error_rate = None
280
281
282     def get_elements(self):
283         lane_result = etree.Element(
284                         LaneResultSummary.LANE_RESULT_SUMMARY,
285                         {'lane': unicode(self.lane), 'end': unicode(self.end)})
286         for tag, variable_name in LaneResultSummary.TAGS.items():
287             value = getattr(self, variable_name)
288             if value is None:
289                 continue
290             # it looks like a sequence
291             elif type(value) in (types.TupleType, types.ListType):
292                 element = make_mean_range_element(
293                   lane_result,
294                   tag,
295                   *value
296                 )
297             else:
298                 element = etree.SubElement(lane_result, tag)
299                 element.text = unicode(value)
300         return lane_result
301
302     def set_elements(self, tree):
303         if tree.tag != LaneResultSummary.LANE_RESULT_SUMMARY:
304             raise ValueError('Expected %s' % (
305                     LaneResultSummary.LANE_RESULT_SUMMARY))
306         self.lane = int(tree.attrib['lane'])
307         # default to the first end, for the older summary files
308         # that are single ended
309         self.end = int(tree.attrib.get('end', 0))
310         tags = LaneResultSummary.TAGS
311         for element in list(tree):
312             try:
313                 variable_name = tags[element.tag]
314                 setattr(self, variable_name,
315                         parse_summary_element(element))
316             except KeyError as e:
317                 LOGGER.warn('Unrecognized tag %s' % (element.tag,))
318
319
320 class LaneResultSummaryGA(LaneResultSummary):
321     def __init__(self, html=None, xml=None):
322         super(LaneResultSummaryGA, self).__init__(html, xml)
323
324         if html is not None:
325             self.set_elements_from_source(html)
326         if xml is not None:
327             self.set_elements(xml)
328
329     def set_elements_from_gerald_xml(self, read, element):
330         self.lane = int(element.find('laneNumber').text)
331         self.end = read
332         lane_yield_node = element.find('laneYield')
333         if lane_yield_node is not None:
334             self.lane_yield = int(lane_yield_node.text)
335         else:
336             self.lane_yield = None
337
338         for GeraldName, LRSName in LaneResultSummary.GERALD_TAGS.items():
339             node = element.find(GeraldName)
340             if node is None:
341                 LOGGER.info("Couldn't find %s" % (GeraldName))
342             setattr(self, LRSName, parse_xml_mean_range(node))
343
344     def set_elements_from_source(self, data):
345         """Read from an initial summary data file. Either xml or html
346         """
347         if not len(data) in (8,10):
348             raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))
349
350         # same in pre-0.3.0 Summary file and 0.3 summary file
351         self.lane = int(data[0])
352
353         if len(data) == 8:
354             parsed_data = [ parse_mean_range(x) for x in data[1:] ]
355             # this is the < 0.3 Pipeline version
356             self.cluster = parsed_data[0]
357             self.average_first_cycle_intensity = parsed_data[1]
358             self.percent_intensity_after_20_cycles = parsed_data[2]
359             self.percent_pass_filter_clusters = parsed_data[3]
360             self.percent_pass_filter_align = parsed_data[4]
361             self.average_alignment_score = parsed_data[5]
362             self.percent_error_rate = parsed_data[6]
363         elif len(data) == 10:
364             parsed_data = [ parse_mean_range(x) for x in data[2:] ]
365             # this is the >= 0.3 summary file
366             self.lane_yield = data[1]
367             self.cluster = parsed_data[0]
368             self.cluster_pass_filter = parsed_data[1]
369             self.average_first_cycle_intensity = parsed_data[2]
370             self.percent_intensity_after_20_cycles = parsed_data[3]
371             self.percent_pass_filter_clusters = parsed_data[4]
372             self.percent_pass_filter_align = parsed_data[5]
373             self.average_alignment_score = parsed_data[6]
374             self.percent_error_rate = parsed_data[7]
375
376
377 class LaneResultSummaryHiSeq(LaneResultSummary):
378     def __init__(self, data=None, xml=None):
379         super(LaneResultSummaryHiSeq, self).__init__(data, xml)
380
381         if data is not None:
382             self.set_elements_from_source(data)
383         if xml is not None:
384             self.set_elements(xml)
385
386     def set_elements_from_source(self, element):
387         """Read from an initial summary data file. Either xml or html
388         """
389         # same in pre-0.3.0 Summary file and 0.3 summary file
390         lane = element.attrib
391         self.lane = int(lane['key'])
392         #self.lane_yield = data[1]
393         self.cluster = (int(lane['ClustersRaw']),
394                         float(lane['ClustersRawSD']))
395         self.cluster_pass_filter = (int(lane['ClustersPF']),
396                                     float(lane['ClustersPFSD']))
397         self.average_first_cycle_intensity = (int(lane['FirstCycleIntPF']),
398                                               float(lane['FirstCycleIntPFSD']))
399         self.percent_intensity_after_20_cycles = (
400             float(lane['PrcIntensityAfter20CyclesPF']),
401             float(lane['PrcIntensityAfter20CyclesPFSD']))
402         self.percent_pass_filter_clusters = (
403             float(lane['PrcPFClusters']),
404             float(lane['PrcPFClustersSD']))
405         self.percent_pass_filter_align = (
406             float(lane['PrcAlign']),
407             float(lane['PrcAlignSD']))
408         self.percent_error_rate = (
409             float(lane['ErrRatePhiX']),
410             float(lane['ErrRatePhiXSD']),)
411
412
413 def tonumber(v):
414     """
415     Convert a value to int if its an int otherwise a float.
416     """
417     try:
418         v = int(v)
419     except ValueError as e:
420         v = float(v)
421     return v
422
423 def parse_mean_range(value):
424     """
425     Parse values like 123 +/- 4.5
426     """
427     if value.strip() == 'unknown':
428         return nan, nan
429
430     values = value.split()
431     if len(values) == 1:
432         if values[0] == '+/-':
433             return nan,nan
434         else:
435             return tonumber(values[0])
436
437     average, pm, deviation = values
438     if pm != '+/-':
439         raise RuntimeError("Summary.htm file format changed")
440     return tonumber(average), tonumber(deviation)
441
442 def make_mean_range_element(parent, name, mean, deviation):
443     """
444     Make an etree subelement <Name mean='mean', deviation='deviation'/>
445     """
446     element = etree.SubElement(parent, name,
447                                      { 'mean': unicode(mean),
448                                        'deviation': unicode(deviation)})
449     return element
450
451 def parse_mean_range_element(element):
452     """
453     Grab mean/deviation out of element
454     """
455     return (tonumber(element.attrib['mean']),
456             tonumber(element.attrib['deviation']))
457
458 def parse_summary_element(element):
459     """
460     Determine if we have a simple element or a mean/deviation element
461     """
462     if len(element.attrib) > 0:
463         return parse_mean_range_element(element)
464     else:
465         return element.text
466
467 def parse_xml_mean_range(element):
468     """
469     Extract mean/stddev children from an element as a tuple
470     """
471     if element is None:
472         return None
473
474     mean = element.find('mean')
475     stddev = element.find('stdev')
476     if mean is None or stddev is None:
477         raise RuntimeError("Summary.xml file format changed, expected mean/stddev tags")
478     if mean.text is None:
479         mean_value = float('nan')
480     else:
481         mean_value = tonumber(mean.text)
482
483     if stddev.text is None:
484         stddev_value = float('nan')
485     else:
486         stddev_value = tonumber(stddev.text)
487
488
489     return (mean_value, stddev_value)
490
491 def isxml_file(fname):
492     with open(fname,'r') as stream:
493         return isxml_stream(stream)
494
495 def isxml_stream(stream):
496     """Return true or false if its sort of an xml file
497     """
498     pos = stream.tell()
499     line = stream.readline()
500     stream.seek(pos)
501     if re.match("<\?xml.*?>", line):
502         # attempt at xml
503         return True
504     else:
505         return False
506
507 if __name__ == "__main__":
508     # test code
509     from optparse import OptionParser
510     parser = OptionParser('%prog [Summary.xml/Summary.htm]+')
511     opts, args = parser.parse_args()
512     if len(args) == 0:
513         parser.error('need at least one xml/html file')
514     for fname in args:
515         s = Summary(fname)
516         s.dump()
517