6fdda43cf8b4abcee39100e4204df7182fa4a48b
[htsworkflow.git] / htsworkflow / pipelines / summary.py
1 """
2 Analyze the Summary.htm file produced by GERALD
3 """
4 import os
5 import logging
6 import re
7 import types
8 from pprint import pprint
9
10 #from htsworkflow.pipelines.runfolder import ElementTree
11 from lxml import html
12 from lxml import etree
13 from htsworkflow.util.ethelp import indent, flatten
14
15 LOGGER = logging.getLogger(__name__)
16 nan = float('nan')
17
18 class Summary(object):
19     """
20     Extract some useful information from the Summary.htm file
21     """
22     XML_VERSION = 3
23     SUMMARY = 'Summary'
24
25     def __init__(self, filename=None, xml=None):
26         # lane results is a list of 1 or 2 ends containing
27         # a dictionary of all the lanes reported in this
28         # summary file
29         self.lane_results = [{}]
30
31         if filename is not None:
32             self._extract_lane_results(filename)
33         if xml is not None:
34             self.set_elements(xml)
35
36     def __getitem__(self, key):
37         return self.lane_results[key]
38
39     def __len__(self):
40         return len(self.lane_results)
41
42     def get_elements(self):
43         summary = etree.Element(Summary.SUMMARY,
44                                       {'version': unicode(Summary.XML_VERSION)})
45         for end in self.lane_results:
46             for lane in end.values():
47                 summary.append(lane.get_elements())
48         return summary
49
50     def set_elements(self, tree):
51         if tree.tag != Summary.SUMMARY:
52             return ValueError("Expected %s" % (Summary.SUMMARY,))
53         xml_version = int(tree.attrib.get('version', 0))
54         if xml_version > Summary.XML_VERSION:
55             LOGGER.warn('Summary XML tree is a higher version than this class')
56         for element in list(tree):
57             lrs = LaneResultSummaryGA()
58             lrs.set_elements(element)
59             if len(self.lane_results) < (lrs.end + 1):
60               self.lane_results.append({})
61             self.lane_results[lrs.end][lrs.lane] = lrs
62
63     def is_paired_end(self):
64       return len(self.lane_results) == 2
65
66     def dump(self):
67         """
68         Debugging function, report current object
69         """
70         tree = self.get_elements()
71         print etree.tostring(tree)
72
73 class SummaryGA(Summary):
74     def __init__(self, filename=None, xml=None):
75         super(SummaryGA, self).__init__(filename, xml)
76
77     def _flattened_row(self, row):
78         """
79         flatten the children of a <tr>...</tr>
80         """
81         return [flatten(x) for x in row.getchildren() ]
82
83     def _parse_table(self, table):
84         """
85         assumes the first line is the header of a table,
86         and that the remaining rows are data
87         """
88         rows = table.getchildren()
89         data = []
90         for r in rows:
91             data.append(self._flattened_row(r))
92         return data
93
94     def _extract_lane_results(self, pathname):
95         """
96         Extract just the lane results.
97         Currently those are the only ones we care about.
98         """
99
100         tables = self._extract_named_tables(pathname)
101
102
103     def _extract_named_tables(self, pathname):
104         """
105         extract all the 'named' tables from a Summary.htm file
106         and return as a dictionary
107
108         Named tables are <h2>...</h2><table>...</table> pairs
109         The contents of the h2 tag is considered to the name
110         of the table.
111         """
112         if isxml_file(pathname):
113             tree = etree.parse(pathname).getroot()
114         else:
115             # html
116             tree = html.parse(pathname).getroot()
117         # tree = etree.parse(pathname).getroot()
118         # hack for 1.1rc1, this should be removed when possible.
119         #file_body = open(pathname).read()
120         #file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=')
121         #tree = etree.fromstring(file_body)
122
123         # are we reading the xml or the html version of the Summary file?
124         if tree.tag.lower() == 'summary':
125             # summary version
126             tables = self._extract_named_tables_from_gerald_xml(tree)
127         elif tree.tag.lower() == 'html':
128             # html version
129             tables = self._extract_named_tables_from_html(tree)
130             table_names = [ ('Lane Results Summary', 0),
131                             ('Lane Results Summary : Read 1', 0),
132                             ('Lane Results Summary : Read 2', 1),]
133             for name, end in table_names:
134                 if tables.has_key(name):
135                     self._extract_lane_results_for_end(tables, name, end)
136
137         if len(self.lane_results[0])  == 0:
138             LOGGER.warning("No Lane Results Summary Found in %s" % (pathname,))
139
140     def _extract_named_tables_from_gerald_xml(self, tree):
141         """
142         Extract useful named tables from a gerald created Summary.xml file
143         """
144         # using the function to convert to lower instead of just writing it
145         # makes the tag easier to read (IMO)
146         useful_tables = ['LaneResultsSummary'.lower(),]
147
148         tables ={}
149         for child in tree.getchildren():
150             if child.tag.lower() in  useful_tables:
151                 read_tree = child.find('Read')
152                 # we want 0 based.
153                 read = int(read_tree.find('readNumber').text)-1
154                 for element in read_tree.getchildren():
155                     if element.tag.lower() == "lane":
156                         lrs = LaneResultSummaryGA()
157                         lrs.set_elements_from_gerald_xml(read, element)
158                         self.lane_results[lrs.end][lrs.lane] = lrs
159         # probably not useful
160         return tables
161
162     ###### START HTML Table Extraction ########
163     def _extract_named_tables_from_html(self, tree):
164         tables = {}
165         for t in tree.findall('*//table'):
166             previous = t.getprevious()
167             if previous is None:
168                 previous = t.getparent()
169
170             if previous.tag.lower() == 'div':
171                 previous = previous.getprevious()
172
173             if previous.tag in ('h2', 'p'):
174                 # we have a table
175                 name = flatten(previous)
176                 data = self._parse_table(t)
177                 tables[name] = data
178         return tables
179
180     def _extract_lane_results_for_end(self, tables, table_name, end):
181         """
182         extract the Lane Results Summary table
183         """
184         # parse lane result summary
185         lane_summary = tables[table_name]
186         # this is version 1 of the summary file
187         if len(lane_summary[-1]) == 8:
188             # strip header
189             headers = lane_summary[0]
190             # grab the lane by lane data
191             lane_summary = lane_summary[1:]
192
193         # len(lane_summary[-1] = 10 is version 2 of the summary file
194         #                      = 9  is version 3 of the Summary.htm file
195         elif len(lane_summary[-1]) in (9, 10):
196             # lane_summary[0] is a different less specific header row
197             headers = lane_summary[1]
198             lane_summary = lane_summary[2:10]
199             # after the last lane, there's a set of chip wide averages
200
201         # append an extra dictionary if needed
202         if len(self.lane_results) < (end + 1):
203           self.lane_results.append({})
204
205         for r in lane_summary:
206             lrs = LaneResultSummaryGA(html=r)
207             lrs.end = end
208             self.lane_results[lrs.end][lrs.lane] = lrs
209     ###### END HTML Table Extraction ########
210
211
212 class SummaryHiSeq(Summary):
213     def __init__(self, filename=None, xml=None):
214         super(SummaryHiSeq, self).__init__(filename, xml)
215
216     def _extract_lane_results(self, filename):
217         read1 = os.path.join(filename, 'read1.xml')
218         read2 = os.path.join(filename, 'read2.xml')
219
220         if os.path.exists(read1):
221             self._extract_lane_results_for_end(read1, 0)
222         else:
223             LOGGER.warn("No read1.xml at %s." % (read1,))
224         if os.path.exists(read2):
225             self.lane_results.append({})
226             self._extract_lane_results_for_end(read2, 1)
227
228     def _extract_lane_results_for_end(self, filename, end):
229         self.tree = etree.parse(filename)
230         root = self.tree.getroot()
231         for lane in root.getchildren():
232             lrs = LaneResultSummaryHiSeq(data=lane)
233             lrs.end = end
234             self.lane_results[lrs.end][lrs.lane] = lrs
235
236
237 class LaneResultSummary(object):
238     """
239     Parse the LaneResultSummary table out of Summary.htm
240     Mostly for the cluster number
241     """
242     LANE_RESULT_SUMMARY = 'LaneResultSummary'
243     TAGS = {
244       'LaneYield': 'lane_yield',
245       'Cluster': 'cluster', # Raw
246       'ClusterPF': 'cluster_pass_filter',
247       'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
248       'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
249       'PercentPassFilterClusters': 'percent_pass_filter_clusters',
250       'PercentPassFilterAlign': 'percent_pass_filter_align',
251       'AverageAlignmentScore': 'average_alignment_score',
252       'PercentErrorRate': 'percent_error_rate'
253     }
254     # These are tags that have mean/stdev as found in the GERALD Summary.xml file
255     GERALD_TAGS = {
256       #'laneYield': 'lane_yield', #this is just a number
257       'clusterCountRaw': 'cluster', # Raw
258       'clusterCountPF': 'cluster_pass_filter',
259       'oneSig': 'average_first_cycle_intensity',
260       'signal20AsPctOf1': 'percent_intensity_after_20_cycles',
261       'percentClustersPF': 'percent_pass_filter_clusters',
262       'percentUniquelyAlignedPF': 'percent_pass_filter_align',
263       'averageAlignScorePF': 'average_alignment_score',
264       'errorPF': 'percent_error_rate'
265     }
266
267     def __init__(self, html=None, xml=None):
268         self.lane = None
269         self.end = 0
270         self.lane_yield = None
271         self.cluster = None
272         self.cluster_pass_filter = None
273         self.average_first_cycle_intensity = None
274         self.percent_intensity_after_20_cycles = None
275         self.percent_pass_filter_clusters = None
276         self.percent_pass_filter_align = None
277         self.average_alignment_score = None
278         self.percent_error_rate = None
279
280
281     def get_elements(self):
282         lane_result = etree.Element(
283                         LaneResultSummary.LANE_RESULT_SUMMARY,
284                         {'lane': unicode(self.lane), 'end': unicode(self.end)})
285         for tag, variable_name in LaneResultSummary.TAGS.items():
286             value = getattr(self, variable_name)
287             if value is None:
288                 continue
289             # it looks like a sequence
290             elif type(value) in (types.TupleType, types.ListType):
291                 element = make_mean_range_element(
292                   lane_result,
293                   tag,
294                   *value
295                 )
296             else:
297                 element = etree.SubElement(lane_result, tag)
298                 element.text = unicode(value)
299         return lane_result
300
301     def set_elements(self, tree):
302         if tree.tag != LaneResultSummary.LANE_RESULT_SUMMARY:
303             raise ValueError('Expected %s' % (
304                     LaneResultSummary.LANE_RESULT_SUMMARY))
305         self.lane = int(tree.attrib['lane'])
306         # default to the first end, for the older summary files
307         # that are single ended
308         self.end = int(tree.attrib.get('end', 0))
309         tags = LaneResultSummary.TAGS
310         for element in list(tree):
311             try:
312                 variable_name = tags[element.tag]
313                 setattr(self, variable_name,
314                         parse_summary_element(element))
315             except KeyError, e:
316                 LOGGER.warn('Unrecognized tag %s' % (element.tag,))
317
318
319 class LaneResultSummaryGA(LaneResultSummary):
320     def __init__(self, html=None, xml=None):
321         super(LaneResultSummaryGA, self).__init__(html, xml)
322
323         if html is not None:
324             self.set_elements_from_source(html)
325         if xml is not None:
326             self.set_elements(xml)
327
328     def set_elements_from_gerald_xml(self, read, element):
329         self.lane = int(element.find('laneNumber').text)
330         self.end = read
331         lane_yield_node = element.find('laneYield')
332         if lane_yield_node is not None:
333             self.lane_yield = int(lane_yield_node.text)
334         else:
335             self.lane_yield = None
336
337         for GeraldName, LRSName in LaneResultSummary.GERALD_TAGS.items():
338             node = element.find(GeraldName)
339             if node is None:
340                 LOGGER.info("Couldn't find %s" % (GeraldName))
341             setattr(self, LRSName, parse_xml_mean_range(node))
342
343     def set_elements_from_source(self, data):
344         """Read from an initial summary data file. Either xml or html
345         """
346         if not len(data) in (8,10):
347             raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))
348
349         # same in pre-0.3.0 Summary file and 0.3 summary file
350         self.lane = int(data[0])
351
352         if len(data) == 8:
353             parsed_data = [ parse_mean_range(x) for x in data[1:] ]
354             # this is the < 0.3 Pipeline version
355             self.cluster = parsed_data[0]
356             self.average_first_cycle_intensity = parsed_data[1]
357             self.percent_intensity_after_20_cycles = parsed_data[2]
358             self.percent_pass_filter_clusters = parsed_data[3]
359             self.percent_pass_filter_align = parsed_data[4]
360             self.average_alignment_score = parsed_data[5]
361             self.percent_error_rate = parsed_data[6]
362         elif len(data) == 10:
363             parsed_data = [ parse_mean_range(x) for x in data[2:] ]
364             # this is the >= 0.3 summary file
365             self.lane_yield = data[1]
366             self.cluster = parsed_data[0]
367             self.cluster_pass_filter = parsed_data[1]
368             self.average_first_cycle_intensity = parsed_data[2]
369             self.percent_intensity_after_20_cycles = parsed_data[3]
370             self.percent_pass_filter_clusters = parsed_data[4]
371             self.percent_pass_filter_align = parsed_data[5]
372             self.average_alignment_score = parsed_data[6]
373             self.percent_error_rate = parsed_data[7]
374
375
376 class LaneResultSummaryHiSeq(LaneResultSummary):
377     def __init__(self, data=None, xml=None):
378         super(LaneResultSummaryHiSeq, self).__init__(data, xml)
379
380         if data is not None:
381             self.set_elements_from_source(data)
382         if xml is not None:
383             self.set_elements(xml)
384
385     def set_elements_from_source(self, element):
386         """Read from an initial summary data file. Either xml or html
387         """
388         # same in pre-0.3.0 Summary file and 0.3 summary file
389         lane = element.attrib
390         self.lane = int(lane['key'])
391         #self.lane_yield = data[1]
392         self.cluster = (int(lane['ClustersRaw']),
393                         float(lane['ClustersRawSD']))
394         self.cluster_pass_filter = (int(lane['ClustersPF']),
395                                     float(lane['ClustersPFSD']))
396         self.average_first_cycle_intensity = (int(lane['FirstCycleIntPF']),
397                                               float(lane['FirstCycleIntPFSD']))
398         self.percent_intensity_after_20_cycles = (
399             float(lane['PrcIntensityAfter20CyclesPF']),
400             float(lane['PrcIntensityAfter20CyclesPFSD']))
401         self.percent_pass_filter_clusters = (
402             float(lane['PrcPFClusters']),
403             float(lane['PrcPFClustersSD']))
404         self.percent_pass_filter_align = (
405             float(lane['PrcAlign']),
406             float(lane['PrcAlignSD']))
407         self.percent_error_rate = (
408             float(lane['ErrRatePhiX']),
409             float(lane['ErrRatePhiXSD']),)
410
411
412 def tonumber(v):
413     """
414     Convert a value to int if its an int otherwise a float.
415     """
416     try:
417         v = int(v)
418     except ValueError, e:
419         v = float(v)
420     return v
421
422 def parse_mean_range(value):
423     """
424     Parse values like 123 +/- 4.5
425     """
426     if value.strip() == 'unknown':
427         return nan, nan
428
429     values = value.split()
430     if len(values) == 1:
431         if values[0] == '+/-':
432             return nan,nan
433         else:
434             return tonumber(values[0])
435
436     average, pm, deviation = values
437     if pm != '+/-':
438         raise RuntimeError("Summary.htm file format changed")
439     return tonumber(average), tonumber(deviation)
440
441 def make_mean_range_element(parent, name, mean, deviation):
442     """
443     Make an etree subelement <Name mean='mean', deviation='deviation'/>
444     """
445     element = etree.SubElement(parent, name,
446                                      { 'mean': unicode(mean),
447                                        'deviation': unicode(deviation)})
448     return element
449
450 def parse_mean_range_element(element):
451     """
452     Grab mean/deviation out of element
453     """
454     return (tonumber(element.attrib['mean']),
455             tonumber(element.attrib['deviation']))
456
457 def parse_summary_element(element):
458     """
459     Determine if we have a simple element or a mean/deviation element
460     """
461     if len(element.attrib) > 0:
462         return parse_mean_range_element(element)
463     else:
464         return element.text
465
466 def parse_xml_mean_range(element):
467     """
468     Extract mean/stddev children from an element as a tuple
469     """
470     if element is None:
471         return None
472
473     mean = element.find('mean')
474     stddev = element.find('stdev')
475     if mean is None or stddev is None:
476         raise RuntimeError("Summary.xml file format changed, expected mean/stddev tags")
477     if mean.text is None:
478         mean_value = float('nan')
479     else:
480         mean_value = tonumber(mean.text)
481
482     if stddev.text is None:
483         stddev_value = float('nan')
484     else:
485         stddev_value = tonumber(stddev.text)
486
487
488     return (mean_value, stddev_value)
489
490 def isxml_file(fname):
491     with open(fname,'r') as stream:
492         return isxml_stream(stream)
493
494 def isxml_stream(stream):
495     """Return true or false if its sort of an xml file
496     """
497     pos = stream.tell()
498     line = stream.readline()
499     stream.seek(pos)
500     if re.match("<\?xml.*?>", line):
501         # attempt at xml
502         return True
503     else:
504         return False
505
506 if __name__ == "__main__":
507     # test code
508     from optparse import OptionParser
509     parser = OptionParser('%prog [Summary.xml/Summary.htm]+')
510     opts, args = parser.parse_args()
511     if len(args) == 0:
512         parser.error('need at least one xml/html file')
513     for fname in args:
514         s = Summary(fname)
515         s.dump()
516