Update summary script to read from the GERALD Summary.xml file
[htsworkflow.git] / htsworkflow / pipelines / summary.py
1 """
2 Analyze the Summary.htm file produced by GERALD
3 """
4 import logging
5 import types
6 from pprint import pprint
7
8 from htsworkflow.pipelines.runfolder import ElementTree
9 from htsworkflow.util.ethelp import indent, flatten
10
11 nan = float('nan')
12
13 class Summary(object):
14     """
15     Extract some useful information from the Summary.htm file
16     """
17     XML_VERSION = 3
18     SUMMARY = 'Summary'
19
20     class LaneResultSummary(object):
21         """
22         Parse the LaneResultSummary table out of Summary.htm
23         Mostly for the cluster number
24         """
25         LANE_RESULT_SUMMARY = 'LaneResultSummary'
26         TAGS = {
27           'LaneYield': 'lane_yield',
28           'Cluster': 'cluster', # Raw
29           'ClusterPF': 'cluster_pass_filter',
30           'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
31           'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
32           'PercentPassFilterClusters': 'percent_pass_filter_clusters',
33           'PercentPassFilterAlign': 'percent_pass_filter_align',
34           'AverageAlignmentScore': 'average_alignment_score',
35           'PercentErrorRate': 'percent_error_rate'
36         }
37         # These are tags that have mean/stdev as found in the GERALD Summary.xml file
38         GERALD_TAGS = {
39           #'laneYield': 'lane_yield', #this is just a number
40           'clusterCountRaw': 'cluster', # Raw
41           'clusterCountPF': 'cluster_pass_filter',
42           'oneSig': 'average_first_cycle_intensity',
43           'signal20AsPctOf1': 'percent_intensity_after_20_cycles',
44           'percentClustersPF': 'percent_pass_filter_clusters',
45           'percentUniquelyAlignedPF': 'percent_pass_filter_align',
46           'averageAlignScorePF': 'average_alignment_score',
47           'errorPF': 'percent_error_rate'
48         }
49
50         def __init__(self, html=None, xml=None):
51             self.lane = None
52             self.end = 0
53             self.lane_yield = None
54             self.cluster = None
55             self.cluster_pass_filter = None
56             self.average_first_cycle_intensity = None
57             self.percent_intensity_after_20_cycles = None
58             self.percent_pass_filter_clusters = None
59             self.percent_pass_filter_align = None
60             self.average_alignment_score = None
61             self.percent_error_rate = None
62
63             if html is not None:
64                 self.set_elements_from_html(html)
65             if xml is not None:
66                 self.set_elements(xml)
67
68         def set_elements_from_html(self, data):
69             if not len(data) in (8,10):
70                 raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))
71
72             # same in pre-0.3.0 Summary file and 0.3 summary file
73             self.lane = int(data[0])
74
75             if len(data) == 8:
76                 parsed_data = [ parse_mean_range(x) for x in data[1:] ]
77                 # this is the < 0.3 Pipeline version
78                 self.cluster = parsed_data[0]
79                 self.average_first_cycle_intensity = parsed_data[1]
80                 self.percent_intensity_after_20_cycles = parsed_data[2]
81                 self.percent_pass_filter_clusters = parsed_data[3]
82                 self.percent_pass_filter_align = parsed_data[4]
83                 self.average_alignment_score = parsed_data[5]
84                 self.percent_error_rate = parsed_data[6]
85             elif len(data) == 10:
86                 parsed_data = [ parse_mean_range(x) for x in data[2:] ]
87                 # this is the >= 0.3 summary file
88                 self.lane_yield = data[1]
89                 self.cluster = parsed_data[0]
90                 self.cluster_pass_filter = parsed_data[1]
91                 self.average_first_cycle_intensity = parsed_data[2]
92                 self.percent_intensity_after_20_cycles = parsed_data[3]
93                 self.percent_pass_filter_clusters = parsed_data[4]
94                 self.percent_pass_filter_align = parsed_data[5]
95                 self.average_alignment_score = parsed_data[6]
96                 self.percent_error_rate = parsed_data[7]
97
98         def set_elements_from_gerald_xml(self, read, element):
99             self.lane = int(element.find('laneNumber').text)
100             self.end = read
101             self.lane_yield = int(element.find('laneYield').text)
102
103             for GeraldName, LRSName in Summary.LaneResultSummary.GERALD_TAGS.items():
104                 node = element.find(GeraldName)
105                 if node is None:
106                     logging.info("Couldn't find %s" % (GeraldName))
107                 setattr(self, LRSName, parse_xml_mean_range(node))
108                                                                       
109         def get_elements(self):
110             lane_result = ElementTree.Element(
111                             Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
112                             {'lane': unicode(self.lane), 'end': unicode(self.end)})
113             for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
114                 value = getattr(self, variable_name)
115                 if value is None:
116                     continue
117                 # it looks like a sequence
118                 elif type(value) in (types.TupleType, types.ListType):
119                     element = make_mean_range_element(
120                       lane_result,
121                       tag,
122                       *value
123                     )
124                 else:
125                     element = ElementTree.SubElement(lane_result, tag)
126                     element.text = unicode(value)
127             return lane_result
128
129         def set_elements(self, tree):
130             if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
131                 raise ValueError('Expected %s' % (
132                         Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
133             self.lane = int(tree.attrib['lane'])
134             # default to the first end, for the older summary files
135             # that are single ended
136             self.end = int(tree.attrib.get('end', 0))
137             tags = Summary.LaneResultSummary.TAGS
138             for element in list(tree):
139                 try:
140                     variable_name = tags[element.tag]
141                     setattr(self, variable_name,
142                             parse_summary_element(element))
143                 except KeyError, e:
144                     logging.warn('Unrecognized tag %s' % (element.tag,))
145
146     def __init__(self, filename=None, xml=None):
147         # lane results is a list of 1 or 2 ends containing
148         # a dictionary of all the lanes reported in this
149         # summary file
150         self.lane_results = [{}]
151
152         if filename is not None:
153             self._extract_lane_results(filename)
154         if xml is not None:
155             self.set_elements(xml)
156
157     def __getitem__(self, key):
158         return self.lane_results[key]
159
160     def __len__(self):
161         return len(self.lane_results)
162
163     def _flattened_row(self, row):
164         """
165         flatten the children of a <tr>...</tr>
166         """
167         return [flatten(x) for x in row.getchildren() ]
168
169     def _parse_table(self, table):
170         """
171         assumes the first line is the header of a table,
172         and that the remaining rows are data
173         """
174         rows = table.getchildren()
175         data = []
176         for r in rows:
177             data.append(self._flattened_row(r))
178         return data
179
180     def _extract_lane_results(self, pathname):
181         """
182         Extract just the lane results.
183         Currently those are the only ones we care about.
184         """
185         
186         tables = self._extract_named_tables(pathname)
187
188
189     def _extract_named_tables(self, pathname):
190         """
191         extract all the 'named' tables from a Summary.htm file
192         and return as a dictionary
193
194         Named tables are <h2>...</h2><table>...</table> pairs
195         The contents of the h2 tag is considered to the name
196         of the table.
197         """
198         # tree = ElementTree.parse(pathname).getroot()
199         # hack for 1.1rc1, this should be removed when possible.
200         file_body = open(pathname).read()
201         file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=')
202         tree = ElementTree.fromstring(file_body)
203
204         # are we reading the xml or the html version of the Summary file?
205         if tree.tag.lower() == 'summary':
206             # summary version
207             tables = self._extract_named_tables_from_gerald_xml(tree)
208         elif tree.tag.lower() == 'html':
209             # html version
210             tables = self._extract_named_tables_from_html(tree)
211             table_names = [ ('Lane Results Summary', 0),
212                             ('Lane Results Summary : Read 1', 0),
213                             ('Lane Results Summary : Read 2', 1),]
214             for name, end in table_names:
215                 if tables.has_key(name):
216                     self._extract_lane_results_for_end(tables, name, end)
217
218         if len(self.lane_results[0])  == 0:
219             logging.warning("No Lane Results Summary Found in %s" % (pathname,))
220
221     def _extract_named_tables_from_gerald_xml(self, tree):
222         """
223         Extract useful named tables from a gerald created Summary.xml file
224         """
225         # using the function to convert to lower instead of just writing it
226         # makes the tag easier to read (IMO)
227         useful_tables = ['LaneResultsSummary'.lower(),]
228
229         tables ={}
230         for child in tree.getchildren():
231             if child.tag.lower() in  useful_tables:
232                 read_tree = child.find('Read')
233                 # we want 0 based.
234                 read = int(read_tree.find('readNumber').text)-1
235                 for element in read_tree.getchildren():
236                     if element.tag.lower() == "lane":
237                         lrs = Summary.LaneResultSummary()
238                         lrs.set_elements_from_gerald_xml(read, element)
239                         self.lane_results[lrs.end][lrs.lane] = lrs
240         # probably not useful
241         return tables
242         
243     ###### START HTML Table Extraction ########
244     def _extract_named_tables_from_html(self, tree):
245         body = tree.find('body')
246         tables = {}
247         for i in range(len(body)):
248             if body[i].tag == 'h2' and body[i+1].tag == 'table':
249                 # we have an interesting table
250                 name = flatten(body[i])
251                 table = body[i+1]
252                 data = self._parse_table(table)
253                 tables[name] = data
254         return tables
255
256     def _extract_lane_results_for_end(self, tables, table_name, end):
257         """
258         extract the Lane Results Summary table
259         """
260         # parse lane result summary
261         lane_summary = tables[table_name]
262         # this is version 1 of the summary file
263         if len(lane_summary[-1]) == 8:
264             # strip header
265             headers = lane_summary[0]
266             # grab the lane by lane data
267             lane_summary = lane_summary[1:]
268
269         # len(lane_summary[-1] = 10 is version 2 of the summary file
270         #                      = 9  is version 3 of the Summary.htm file
271         elif len(lane_summary[-1]) in (9, 10):
272             # lane_summary[0] is a different less specific header row
273             headers = lane_summary[1]
274             lane_summary = lane_summary[2:10]
275             # after the last lane, there's a set of chip wide averages
276
277         # append an extra dictionary if needed
278         if len(self.lane_results) < (end + 1):
279           self.lane_results.append({})
280
281         for r in lane_summary:
282             lrs = Summary.LaneResultSummary(html=r)
283             lrs.end = end
284             self.lane_results[lrs.end][lrs.lane] = lrs
285     ###### END HTML Table Extraction ########
286
287     def get_elements(self):
288         summary = ElementTree.Element(Summary.SUMMARY,
289                                       {'version': unicode(Summary.XML_VERSION)})
290         for end in self.lane_results:
291             for lane in end.values():
292                 summary.append(lane.get_elements())
293         return summary
294
295     def set_elements(self, tree):
296         if tree.tag != Summary.SUMMARY:
297             return ValueError("Expected %s" % (Summary.SUMMARY,))
298         xml_version = int(tree.attrib.get('version', 0))
299         if xml_version > Summary.XML_VERSION:
300             logging.warn('Summary XML tree is a higher version than this class')
301         for element in list(tree):
302             lrs = Summary.LaneResultSummary()
303             lrs.set_elements(element)
304             if len(self.lane_results) < (lrs.end + 1):
305               self.lane_results.append({})
306             self.lane_results[lrs.end][lrs.lane] = lrs
307
308     def is_paired_end(self):
309       return len(self.lane_results) == 2
310
311     def dump(self):
312         """
313         Debugging function, report current object
314         """
315         tree = self.get_elements()
316         print ElementTree.tostring(tree)
317
318 def tonumber(v):
319     """
320     Convert a value to int if its an int otherwise a float.
321     """
322     try:
323         v = int(v)
324     except ValueError, e:
325         v = float(v)
326     return v
327
328 def parse_mean_range(value):
329     """
330     Parse values like 123 +/- 4.5
331     """
332     if value.strip() == 'unknown':
333         return nan, nan
334
335     values = value.split()
336     if len(values) == 1:
337         if values[0] == '+/-':
338             return nan,nan
339         else:
340             return tonumber(values[0])
341
342     average, pm, deviation = values
343     if pm != '+/-':
344         raise RuntimeError("Summary.htm file format changed")
345     return tonumber(average), tonumber(deviation)
346
347 def make_mean_range_element(parent, name, mean, deviation):
348     """
349     Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
350     """
351     element = ElementTree.SubElement(parent, name,
352                                      { 'mean': unicode(mean),
353                                        'deviation': unicode(deviation)})
354     return element
355
356 def parse_mean_range_element(element):
357     """
358     Grab mean/deviation out of element
359     """
360     return (tonumber(element.attrib['mean']),
361             tonumber(element.attrib['deviation']))
362
363 def parse_summary_element(element):
364     """
365     Determine if we have a simple element or a mean/deviation element
366     """
367     if len(element.attrib) > 0:
368         return parse_mean_range_element(element)
369     else:
370         return element.text
371
372 def parse_xml_mean_range(element):
373     """
374     Extract mean/stddev children from an element as a tuple
375     """
376     if element is None:
377         return None
378     
379     mean = element.find('mean')
380     stddev = element.find('stdev')
381     if mean is None or stddev is None:
382         raise RuntimeError("Summary.xml file format changed, expected mean/stddev tags")
383
384     return (tonumber(mean.text), tonumber(stddev.text))
385
386 if __name__ == "__main__":
387     # test code
388     from optparse import OptionParser
389     parser = OptionParser('%prog [Summary.xml/Summary.htm]+')
390     opts, args = parser.parse_args()
391     if len(args) == 0:
392         parser.error('need at least one xml/html file')
393     for fname in args:
394         s = Summary(fname)
395         s.dump()
396