Use a logger initialized to the module name much more consistently.
[htsworkflow.git] / htsworkflow / pipelines / summary.py
1 """
2 Analyze the Summary.htm file produced by GERALD
3 """
4 import logging
5 import types
6 from pprint import pprint
7
8 from htsworkflow.pipelines.runfolder import ElementTree
9 from htsworkflow.util.ethelp import indent, flatten
10
11 LOGGER = logging.getLogger(__name__)
12 nan = float('nan')
13
14 class Summary(object):
15     """
16     Extract some useful information from the Summary.htm file
17     """
18     XML_VERSION = 3
19     SUMMARY = 'Summary'
20
21     class LaneResultSummary(object):
22         """
23         Parse the LaneResultSummary table out of Summary.htm
24         Mostly for the cluster number
25         """
26         LANE_RESULT_SUMMARY = 'LaneResultSummary'
27         TAGS = {
28           'LaneYield': 'lane_yield',
29           'Cluster': 'cluster', # Raw
30           'ClusterPF': 'cluster_pass_filter',
31           'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
32           'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
33           'PercentPassFilterClusters': 'percent_pass_filter_clusters',
34           'PercentPassFilterAlign': 'percent_pass_filter_align',
35           'AverageAlignmentScore': 'average_alignment_score',
36           'PercentErrorRate': 'percent_error_rate'
37         }
38         # These are tags that have mean/stdev as found in the GERALD Summary.xml file
39         GERALD_TAGS = {
40           #'laneYield': 'lane_yield', #this is just a number
41           'clusterCountRaw': 'cluster', # Raw
42           'clusterCountPF': 'cluster_pass_filter',
43           'oneSig': 'average_first_cycle_intensity',
44           'signal20AsPctOf1': 'percent_intensity_after_20_cycles',
45           'percentClustersPF': 'percent_pass_filter_clusters',
46           'percentUniquelyAlignedPF': 'percent_pass_filter_align',
47           'averageAlignScorePF': 'average_alignment_score',
48           'errorPF': 'percent_error_rate'
49         }
50
51         def __init__(self, html=None, xml=None):
52             self.lane = None
53             self.end = 0
54             self.lane_yield = None
55             self.cluster = None
56             self.cluster_pass_filter = None
57             self.average_first_cycle_intensity = None
58             self.percent_intensity_after_20_cycles = None
59             self.percent_pass_filter_clusters = None
60             self.percent_pass_filter_align = None
61             self.average_alignment_score = None
62             self.percent_error_rate = None
63
64             if html is not None:
65                 self.set_elements_from_html(html)
66             if xml is not None:
67                 self.set_elements(xml)
68
69         def set_elements_from_html(self, data):
70             if not len(data) in (8,10):
71                 raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))
72
73             # same in pre-0.3.0 Summary file and 0.3 summary file
74             self.lane = int(data[0])
75
76             if len(data) == 8:
77                 parsed_data = [ parse_mean_range(x) for x in data[1:] ]
78                 # this is the < 0.3 Pipeline version
79                 self.cluster = parsed_data[0]
80                 self.average_first_cycle_intensity = parsed_data[1]
81                 self.percent_intensity_after_20_cycles = parsed_data[2]
82                 self.percent_pass_filter_clusters = parsed_data[3]
83                 self.percent_pass_filter_align = parsed_data[4]
84                 self.average_alignment_score = parsed_data[5]
85                 self.percent_error_rate = parsed_data[6]
86             elif len(data) == 10:
87                 parsed_data = [ parse_mean_range(x) for x in data[2:] ]
88                 # this is the >= 0.3 summary file
89                 self.lane_yield = data[1]
90                 self.cluster = parsed_data[0]
91                 self.cluster_pass_filter = parsed_data[1]
92                 self.average_first_cycle_intensity = parsed_data[2]
93                 self.percent_intensity_after_20_cycles = parsed_data[3]
94                 self.percent_pass_filter_clusters = parsed_data[4]
95                 self.percent_pass_filter_align = parsed_data[5]
96                 self.average_alignment_score = parsed_data[6]
97                 self.percent_error_rate = parsed_data[7]
98
99         def set_elements_from_gerald_xml(self, read, element):
100             self.lane = int(element.find('laneNumber').text)
101             self.end = read
102             lane_yield_node = element.find('laneYield')
103             if lane_yield_node is not None:
104                 self.lane_yield = int(lane_yield_node.text)
105             else:
106                 self.lane_yield = None
107
108             for GeraldName, LRSName in Summary.LaneResultSummary.GERALD_TAGS.items():
109                 node = element.find(GeraldName)
110                 if node is None:
111                     LOGGER.info("Couldn't find %s" % (GeraldName))
112                 setattr(self, LRSName, parse_xml_mean_range(node))
113
114         def get_elements(self):
115             lane_result = ElementTree.Element(
116                             Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
117                             {'lane': unicode(self.lane), 'end': unicode(self.end)})
118             for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
119                 value = getattr(self, variable_name)
120                 if value is None:
121                     continue
122                 # it looks like a sequence
123                 elif type(value) in (types.TupleType, types.ListType):
124                     element = make_mean_range_element(
125                       lane_result,
126                       tag,
127                       *value
128                     )
129                 else:
130                     element = ElementTree.SubElement(lane_result, tag)
131                     element.text = unicode(value)
132             return lane_result
133
134         def set_elements(self, tree):
135             if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
136                 raise ValueError('Expected %s' % (
137                         Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
138             self.lane = int(tree.attrib['lane'])
139             # default to the first end, for the older summary files
140             # that are single ended
141             self.end = int(tree.attrib.get('end', 0))
142             tags = Summary.LaneResultSummary.TAGS
143             for element in list(tree):
144                 try:
145                     variable_name = tags[element.tag]
146                     setattr(self, variable_name,
147                             parse_summary_element(element))
148                 except KeyError, e:
149                     LOGGER.warn('Unrecognized tag %s' % (element.tag,))
150
151     def __init__(self, filename=None, xml=None):
152         # lane results is a list of 1 or 2 ends containing
153         # a dictionary of all the lanes reported in this
154         # summary file
155         self.lane_results = [{}]
156
157         if filename is not None:
158             self._extract_lane_results(filename)
159         if xml is not None:
160             self.set_elements(xml)
161
162     def __getitem__(self, key):
163         return self.lane_results[key]
164
165     def __len__(self):
166         return len(self.lane_results)
167
168     def _flattened_row(self, row):
169         """
170         flatten the children of a <tr>...</tr>
171         """
172         return [flatten(x) for x in row.getchildren() ]
173
174     def _parse_table(self, table):
175         """
176         assumes the first line is the header of a table,
177         and that the remaining rows are data
178         """
179         rows = table.getchildren()
180         data = []
181         for r in rows:
182             data.append(self._flattened_row(r))
183         return data
184
185     def _extract_lane_results(self, pathname):
186         """
187         Extract just the lane results.
188         Currently those are the only ones we care about.
189         """
190
191         tables = self._extract_named_tables(pathname)
192
193
194     def _extract_named_tables(self, pathname):
195         """
196         extract all the 'named' tables from a Summary.htm file
197         and return as a dictionary
198
199         Named tables are <h2>...</h2><table>...</table> pairs
200         The contents of the h2 tag is considered to the name
201         of the table.
202         """
203         # tree = ElementTree.parse(pathname).getroot()
204         # hack for 1.1rc1, this should be removed when possible.
205         file_body = open(pathname).read()
206         file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=')
207         tree = ElementTree.fromstring(file_body)
208
209         # are we reading the xml or the html version of the Summary file?
210         if tree.tag.lower() == 'summary':
211             # summary version
212             tables = self._extract_named_tables_from_gerald_xml(tree)
213         elif tree.tag.lower() == 'html':
214             # html version
215             tables = self._extract_named_tables_from_html(tree)
216             table_names = [ ('Lane Results Summary', 0),
217                             ('Lane Results Summary : Read 1', 0),
218                             ('Lane Results Summary : Read 2', 1),]
219             for name, end in table_names:
220                 if tables.has_key(name):
221                     self._extract_lane_results_for_end(tables, name, end)
222
223         if len(self.lane_results[0])  == 0:
224             LOGGER.warning("No Lane Results Summary Found in %s" % (pathname,))
225
226     def _extract_named_tables_from_gerald_xml(self, tree):
227         """
228         Extract useful named tables from a gerald created Summary.xml file
229         """
230         # using the function to convert to lower instead of just writing it
231         # makes the tag easier to read (IMO)
232         useful_tables = ['LaneResultsSummary'.lower(),]
233
234         tables ={}
235         for child in tree.getchildren():
236             if child.tag.lower() in  useful_tables:
237                 read_tree = child.find('Read')
238                 # we want 0 based.
239                 read = int(read_tree.find('readNumber').text)-1
240                 for element in read_tree.getchildren():
241                     if element.tag.lower() == "lane":
242                         lrs = Summary.LaneResultSummary()
243                         lrs.set_elements_from_gerald_xml(read, element)
244                         self.lane_results[lrs.end][lrs.lane] = lrs
245         # probably not useful
246         return tables
247
248     ###### START HTML Table Extraction ########
249     def _extract_named_tables_from_html(self, tree):
250         body = tree.find('body')
251         tables = {}
252         for i in range(len(body)):
253             if body[i].tag == 'h2' and body[i+1].tag == 'table':
254                 # we have an interesting table
255                 name = flatten(body[i])
256                 table = body[i+1]
257                 data = self._parse_table(table)
258                 tables[name] = data
259         return tables
260
261     def _extract_lane_results_for_end(self, tables, table_name, end):
262         """
263         extract the Lane Results Summary table
264         """
265         # parse lane result summary
266         lane_summary = tables[table_name]
267         # this is version 1 of the summary file
268         if len(lane_summary[-1]) == 8:
269             # strip header
270             headers = lane_summary[0]
271             # grab the lane by lane data
272             lane_summary = lane_summary[1:]
273
274         # len(lane_summary[-1] = 10 is version 2 of the summary file
275         #                      = 9  is version 3 of the Summary.htm file
276         elif len(lane_summary[-1]) in (9, 10):
277             # lane_summary[0] is a different less specific header row
278             headers = lane_summary[1]
279             lane_summary = lane_summary[2:10]
280             # after the last lane, there's a set of chip wide averages
281
282         # append an extra dictionary if needed
283         if len(self.lane_results) < (end + 1):
284           self.lane_results.append({})
285
286         for r in lane_summary:
287             lrs = Summary.LaneResultSummary(html=r)
288             lrs.end = end
289             self.lane_results[lrs.end][lrs.lane] = lrs
290     ###### END HTML Table Extraction ########
291
292     def get_elements(self):
293         summary = ElementTree.Element(Summary.SUMMARY,
294                                       {'version': unicode(Summary.XML_VERSION)})
295         for end in self.lane_results:
296             for lane in end.values():
297                 summary.append(lane.get_elements())
298         return summary
299
300     def set_elements(self, tree):
301         if tree.tag != Summary.SUMMARY:
302             return ValueError("Expected %s" % (Summary.SUMMARY,))
303         xml_version = int(tree.attrib.get('version', 0))
304         if xml_version > Summary.XML_VERSION:
305             LOGGER.warn('Summary XML tree is a higher version than this class')
306         for element in list(tree):
307             lrs = Summary.LaneResultSummary()
308             lrs.set_elements(element)
309             if len(self.lane_results) < (lrs.end + 1):
310               self.lane_results.append({})
311             self.lane_results[lrs.end][lrs.lane] = lrs
312
313     def is_paired_end(self):
314       return len(self.lane_results) == 2
315
316     def dump(self):
317         """
318         Debugging function, report current object
319         """
320         tree = self.get_elements()
321         print ElementTree.tostring(tree)
322
323 def tonumber(v):
324     """
325     Convert a value to int if its an int otherwise a float.
326     """
327     try:
328         v = int(v)
329     except ValueError, e:
330         v = float(v)
331     return v
332
333 def parse_mean_range(value):
334     """
335     Parse values like 123 +/- 4.5
336     """
337     if value.strip() == 'unknown':
338         return nan, nan
339
340     values = value.split()
341     if len(values) == 1:
342         if values[0] == '+/-':
343             return nan,nan
344         else:
345             return tonumber(values[0])
346
347     average, pm, deviation = values
348     if pm != '+/-':
349         raise RuntimeError("Summary.htm file format changed")
350     return tonumber(average), tonumber(deviation)
351
352 def make_mean_range_element(parent, name, mean, deviation):
353     """
354     Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
355     """
356     element = ElementTree.SubElement(parent, name,
357                                      { 'mean': unicode(mean),
358                                        'deviation': unicode(deviation)})
359     return element
360
361 def parse_mean_range_element(element):
362     """
363     Grab mean/deviation out of element
364     """
365     return (tonumber(element.attrib['mean']),
366             tonumber(element.attrib['deviation']))
367
368 def parse_summary_element(element):
369     """
370     Determine if we have a simple element or a mean/deviation element
371     """
372     if len(element.attrib) > 0:
373         return parse_mean_range_element(element)
374     else:
375         return element.text
376
377 def parse_xml_mean_range(element):
378     """
379     Extract mean/stddev children from an element as a tuple
380     """
381     if element is None:
382         return None
383
384     mean = element.find('mean')
385     stddev = element.find('stdev')
386     if mean is None or stddev is None:
387         raise RuntimeError("Summary.xml file format changed, expected mean/stddev tags")
388     if mean.text is None:
389         mean_value = float('nan')
390     else:
391         mean_value = tonumber(mean.text)
392
393     if stddev.text is None:
394         stddev_value = float('nan')
395     else:
396         stddev_value = tonumber(stddev.text)
397
398
399     return (mean_value, stddev_value)
400
401 if __name__ == "__main__":
402     # test code
403     from optparse import OptionParser
404     parser = OptionParser('%prog [Summary.xml/Summary.htm]+')
405     opts, args = parser.parse_args()
406     if len(args) == 0:
407         parser.error('need at least one xml/html file')
408     for fname in args:
409         s = Summary(fname)
410         s.dump()
411