Handle the case when a sequencing lane lacks any yield information.
[htsworkflow.git] / htsworkflow / pipelines / summary.py
1 """
2 Analyze the Summary.htm file produced by GERALD
3 """
4 import logging
5 import types
6 from pprint import pprint
7
8 from htsworkflow.pipelines.runfolder import ElementTree
9 from htsworkflow.util.ethelp import indent, flatten
10
11 nan = float('nan')
12
13 class Summary(object):
14     """
15     Extract some useful information from the Summary.htm file
16     """
17     XML_VERSION = 3
18     SUMMARY = 'Summary'
19
20     class LaneResultSummary(object):
21         """
22         Parse the LaneResultSummary table out of Summary.htm
23         Mostly for the cluster number
24         """
25         LANE_RESULT_SUMMARY = 'LaneResultSummary'
26         TAGS = {
27           'LaneYield': 'lane_yield',
28           'Cluster': 'cluster', # Raw
29           'ClusterPF': 'cluster_pass_filter',
30           'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
31           'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
32           'PercentPassFilterClusters': 'percent_pass_filter_clusters',
33           'PercentPassFilterAlign': 'percent_pass_filter_align',
34           'AverageAlignmentScore': 'average_alignment_score',
35           'PercentErrorRate': 'percent_error_rate'
36         }
37         # These are tags that have mean/stdev as found in the GERALD Summary.xml file
38         GERALD_TAGS = {
39           #'laneYield': 'lane_yield', #this is just a number
40           'clusterCountRaw': 'cluster', # Raw
41           'clusterCountPF': 'cluster_pass_filter',
42           'oneSig': 'average_first_cycle_intensity',
43           'signal20AsPctOf1': 'percent_intensity_after_20_cycles',
44           'percentClustersPF': 'percent_pass_filter_clusters',
45           'percentUniquelyAlignedPF': 'percent_pass_filter_align',
46           'averageAlignScorePF': 'average_alignment_score',
47           'errorPF': 'percent_error_rate'
48         }
49
50         def __init__(self, html=None, xml=None):
51             self.lane = None
52             self.end = 0
53             self.lane_yield = None
54             self.cluster = None
55             self.cluster_pass_filter = None
56             self.average_first_cycle_intensity = None
57             self.percent_intensity_after_20_cycles = None
58             self.percent_pass_filter_clusters = None
59             self.percent_pass_filter_align = None
60             self.average_alignment_score = None
61             self.percent_error_rate = None
62
63             if html is not None:
64                 self.set_elements_from_html(html)
65             if xml is not None:
66                 self.set_elements(xml)
67
68         def set_elements_from_html(self, data):
69             if not len(data) in (8,10):
70                 raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))
71
72             # same in pre-0.3.0 Summary file and 0.3 summary file
73             self.lane = int(data[0])
74
75             if len(data) == 8:
76                 parsed_data = [ parse_mean_range(x) for x in data[1:] ]
77                 # this is the < 0.3 Pipeline version
78                 self.cluster = parsed_data[0]
79                 self.average_first_cycle_intensity = parsed_data[1]
80                 self.percent_intensity_after_20_cycles = parsed_data[2]
81                 self.percent_pass_filter_clusters = parsed_data[3]
82                 self.percent_pass_filter_align = parsed_data[4]
83                 self.average_alignment_score = parsed_data[5]
84                 self.percent_error_rate = parsed_data[6]
85             elif len(data) == 10:
86                 parsed_data = [ parse_mean_range(x) for x in data[2:] ]
87                 # this is the >= 0.3 summary file
88                 self.lane_yield = data[1]
89                 self.cluster = parsed_data[0]
90                 self.cluster_pass_filter = parsed_data[1]
91                 self.average_first_cycle_intensity = parsed_data[2]
92                 self.percent_intensity_after_20_cycles = parsed_data[3]
93                 self.percent_pass_filter_clusters = parsed_data[4]
94                 self.percent_pass_filter_align = parsed_data[5]
95                 self.average_alignment_score = parsed_data[6]
96                 self.percent_error_rate = parsed_data[7]
97
98         def set_elements_from_gerald_xml(self, read, element):
99             self.lane = int(element.find('laneNumber').text)
100             self.end = read
101             lane_yield_node = element.find('laneYield')
102             if lane_yield_node is not None:
103                 self.lane_yield = int(lane_yield_node.text)
104             else:
105                 self.lane_yield = None
106
107             for GeraldName, LRSName in Summary.LaneResultSummary.GERALD_TAGS.items():
108                 node = element.find(GeraldName)
109                 if node is None:
110                     logging.info("Couldn't find %s" % (GeraldName))
111                 setattr(self, LRSName, parse_xml_mean_range(node))
112                                                                       
113         def get_elements(self):
114             lane_result = ElementTree.Element(
115                             Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
116                             {'lane': unicode(self.lane), 'end': unicode(self.end)})
117             for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
118                 value = getattr(self, variable_name)
119                 if value is None:
120                     continue
121                 # it looks like a sequence
122                 elif type(value) in (types.TupleType, types.ListType):
123                     element = make_mean_range_element(
124                       lane_result,
125                       tag,
126                       *value
127                     )
128                 else:
129                     element = ElementTree.SubElement(lane_result, tag)
130                     element.text = unicode(value)
131             return lane_result
132
133         def set_elements(self, tree):
134             if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
135                 raise ValueError('Expected %s' % (
136                         Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
137             self.lane = int(tree.attrib['lane'])
138             # default to the first end, for the older summary files
139             # that are single ended
140             self.end = int(tree.attrib.get('end', 0))
141             tags = Summary.LaneResultSummary.TAGS
142             for element in list(tree):
143                 try:
144                     variable_name = tags[element.tag]
145                     setattr(self, variable_name,
146                             parse_summary_element(element))
147                 except KeyError, e:
148                     logging.warn('Unrecognized tag %s' % (element.tag,))
149
150     def __init__(self, filename=None, xml=None):
151         # lane results is a list of 1 or 2 ends containing
152         # a dictionary of all the lanes reported in this
153         # summary file
154         self.lane_results = [{}]
155
156         if filename is not None:
157             self._extract_lane_results(filename)
158         if xml is not None:
159             self.set_elements(xml)
160
161     def __getitem__(self, key):
162         return self.lane_results[key]
163
164     def __len__(self):
165         return len(self.lane_results)
166
167     def _flattened_row(self, row):
168         """
169         flatten the children of a <tr>...</tr>
170         """
171         return [flatten(x) for x in row.getchildren() ]
172
173     def _parse_table(self, table):
174         """
175         assumes the first line is the header of a table,
176         and that the remaining rows are data
177         """
178         rows = table.getchildren()
179         data = []
180         for r in rows:
181             data.append(self._flattened_row(r))
182         return data
183
184     def _extract_lane_results(self, pathname):
185         """
186         Extract just the lane results.
187         Currently those are the only ones we care about.
188         """
189         
190         tables = self._extract_named_tables(pathname)
191
192
193     def _extract_named_tables(self, pathname):
194         """
195         extract all the 'named' tables from a Summary.htm file
196         and return as a dictionary
197
198         Named tables are <h2>...</h2><table>...</table> pairs
199         The contents of the h2 tag is considered to the name
200         of the table.
201         """
202         # tree = ElementTree.parse(pathname).getroot()
203         # hack for 1.1rc1, this should be removed when possible.
204         file_body = open(pathname).read()
205         file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=')
206         tree = ElementTree.fromstring(file_body)
207
208         # are we reading the xml or the html version of the Summary file?
209         if tree.tag.lower() == 'summary':
210             # summary version
211             tables = self._extract_named_tables_from_gerald_xml(tree)
212         elif tree.tag.lower() == 'html':
213             # html version
214             tables = self._extract_named_tables_from_html(tree)
215             table_names = [ ('Lane Results Summary', 0),
216                             ('Lane Results Summary : Read 1', 0),
217                             ('Lane Results Summary : Read 2', 1),]
218             for name, end in table_names:
219                 if tables.has_key(name):
220                     self._extract_lane_results_for_end(tables, name, end)
221
222         if len(self.lane_results[0])  == 0:
223             logging.warning("No Lane Results Summary Found in %s" % (pathname,))
224
225     def _extract_named_tables_from_gerald_xml(self, tree):
226         """
227         Extract useful named tables from a gerald created Summary.xml file
228         """
229         # using the function to convert to lower instead of just writing it
230         # makes the tag easier to read (IMO)
231         useful_tables = ['LaneResultsSummary'.lower(),]
232
233         tables ={}
234         for child in tree.getchildren():
235             if child.tag.lower() in  useful_tables:
236                 read_tree = child.find('Read')
237                 # we want 0 based.
238                 read = int(read_tree.find('readNumber').text)-1
239                 for element in read_tree.getchildren():
240                     if element.tag.lower() == "lane":
241                         lrs = Summary.LaneResultSummary()
242                         lrs.set_elements_from_gerald_xml(read, element)
243                         self.lane_results[lrs.end][lrs.lane] = lrs
244         # probably not useful
245         return tables
246         
247     ###### START HTML Table Extraction ########
248     def _extract_named_tables_from_html(self, tree):
249         body = tree.find('body')
250         tables = {}
251         for i in range(len(body)):
252             if body[i].tag == 'h2' and body[i+1].tag == 'table':
253                 # we have an interesting table
254                 name = flatten(body[i])
255                 table = body[i+1]
256                 data = self._parse_table(table)
257                 tables[name] = data
258         return tables
259
260     def _extract_lane_results_for_end(self, tables, table_name, end):
261         """
262         extract the Lane Results Summary table
263         """
264         # parse lane result summary
265         lane_summary = tables[table_name]
266         # this is version 1 of the summary file
267         if len(lane_summary[-1]) == 8:
268             # strip header
269             headers = lane_summary[0]
270             # grab the lane by lane data
271             lane_summary = lane_summary[1:]
272
273         # len(lane_summary[-1] = 10 is version 2 of the summary file
274         #                      = 9  is version 3 of the Summary.htm file
275         elif len(lane_summary[-1]) in (9, 10):
276             # lane_summary[0] is a different less specific header row
277             headers = lane_summary[1]
278             lane_summary = lane_summary[2:10]
279             # after the last lane, there's a set of chip wide averages
280
281         # append an extra dictionary if needed
282         if len(self.lane_results) < (end + 1):
283           self.lane_results.append({})
284
285         for r in lane_summary:
286             lrs = Summary.LaneResultSummary(html=r)
287             lrs.end = end
288             self.lane_results[lrs.end][lrs.lane] = lrs
289     ###### END HTML Table Extraction ########
290
291     def get_elements(self):
292         summary = ElementTree.Element(Summary.SUMMARY,
293                                       {'version': unicode(Summary.XML_VERSION)})
294         for end in self.lane_results:
295             for lane in end.values():
296                 summary.append(lane.get_elements())
297         return summary
298
299     def set_elements(self, tree):
300         if tree.tag != Summary.SUMMARY:
301             return ValueError("Expected %s" % (Summary.SUMMARY,))
302         xml_version = int(tree.attrib.get('version', 0))
303         if xml_version > Summary.XML_VERSION:
304             logging.warn('Summary XML tree is a higher version than this class')
305         for element in list(tree):
306             lrs = Summary.LaneResultSummary()
307             lrs.set_elements(element)
308             if len(self.lane_results) < (lrs.end + 1):
309               self.lane_results.append({})
310             self.lane_results[lrs.end][lrs.lane] = lrs
311
312     def is_paired_end(self):
313       return len(self.lane_results) == 2
314
315     def dump(self):
316         """
317         Debugging function, report current object
318         """
319         tree = self.get_elements()
320         print ElementTree.tostring(tree)
321
322 def tonumber(v):
323     """
324     Convert a value to int if its an int otherwise a float.
325     """
326     try:
327         v = int(v)
328     except ValueError, e:
329         v = float(v)
330     return v
331
332 def parse_mean_range(value):
333     """
334     Parse values like 123 +/- 4.5
335     """
336     if value.strip() == 'unknown':
337         return nan, nan
338
339     values = value.split()
340     if len(values) == 1:
341         if values[0] == '+/-':
342             return nan,nan
343         else:
344             return tonumber(values[0])
345
346     average, pm, deviation = values
347     if pm != '+/-':
348         raise RuntimeError("Summary.htm file format changed")
349     return tonumber(average), tonumber(deviation)
350
351 def make_mean_range_element(parent, name, mean, deviation):
352     """
353     Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
354     """
355     element = ElementTree.SubElement(parent, name,
356                                      { 'mean': unicode(mean),
357                                        'deviation': unicode(deviation)})
358     return element
359
360 def parse_mean_range_element(element):
361     """
362     Grab mean/deviation out of element
363     """
364     return (tonumber(element.attrib['mean']),
365             tonumber(element.attrib['deviation']))
366
367 def parse_summary_element(element):
368     """
369     Determine if we have a simple element or a mean/deviation element
370     """
371     if len(element.attrib) > 0:
372         return parse_mean_range_element(element)
373     else:
374         return element.text
375
376 def parse_xml_mean_range(element):
377     """
378     Extract mean/stddev children from an element as a tuple
379     """
380     if element is None:
381         return None
382     
383     mean = element.find('mean')
384     stddev = element.find('stdev')
385     if mean is None or stddev is None:
386         raise RuntimeError("Summary.xml file format changed, expected mean/stddev tags")
387     if mean.text is None: 
388         mean_value = float('nan')
389     else:
390         mean_value = tonumber(mean.text)
391
392     if stddev.text is None: 
393         stddev_value = float('nan')
394     else:
395         stddev_value = tonumber(stddev.text)
396
397
398     return (mean_value, stddev_value)
399
400 if __name__ == "__main__":
401     # test code
402     from optparse import OptionParser
403     parser = OptionParser('%prog [Summary.xml/Summary.htm]+')
404     opts, args = parser.parse_args()
405     if len(args) == 0:
406         parser.error('need at least one xml/html file')
407     for fname in args:
408         s = Summary(fname)
409         s.dump()
410