bce8184b7bd126978de6ab8800adf28981cd7a17
[htsworkflow.git] / htsworkflow / pipelines / summary.py
1 """
2 Analyze the Summary.htm file produced by GERALD
3 """
import logging
import types
from pprint import pprint

from htsworkflow.pipelines.runfolder import ElementTree
from htsworkflow.util.ethelp import indent, flatten
9
class Summary(object):
    """
    Extract some useful information from the Summary.htm file

    lane_results is a list holding one dictionary per read end; each
    dictionary maps a lane number to its LaneResultSummary.
    """
    XML_VERSION = 3
    SUMMARY = 'Summary'

    class LaneResultSummary(object):
        """
        Parse the LaneResultSummary table out of Summary.htm
        Mostly for the cluster number
        """
        LANE_RESULT_SUMMARY = 'LaneResultSummary'
        # XML tag name -> name of the attribute that stores its value
        TAGS = {
          'LaneYield': 'lane_yield',
          'Cluster': 'cluster', # Raw
          'ClusterPF': 'cluster_pass_filter',
          'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
          'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
          'PercentPassFilterClusters': 'percent_pass_filter_clusters',
          'PercentPassFilterAlign': 'percent_pass_filter_align',
          'AverageAlignmentScore': 'average_alignment_score',
          'PercentErrorRate': 'percent_error_rate'
        }

        def __init__(self, html=None, xml=None):
            """
            Create an empty lane summary, optionally filling it in

            html -- a flattened table row (list of cell strings)
            xml  -- a LaneResultSummary element as made by get_elements
            """
            self.lane = None
            self.end = 0
            self.lane_yield = None
            self.cluster = None
            self.cluster_pass_filter = None
            self.average_first_cycle_intensity = None
            self.percent_intensity_after_20_cycles = None
            self.percent_pass_filter_clusters = None
            self.percent_pass_filter_align = None
            self.average_alignment_score = None
            self.percent_error_rate = None

            if html is not None:
                self.set_elements_from_html(html)
            if xml is not None:
                self.set_elements(xml)

        def set_elements_from_html(self, data):
            """
            Fill in this object from one flattened table row

            Rows must have 8 cells (pre-0.3 pipeline) or 10 cells
            (0.3 and later); anything else raises RuntimeError.
            """
            column_count = len(data)
            if column_count not in (8, 10):
                raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (column_count,))

            # same in pre-0.3.0 Summary file and 0.3 summary file
            self.lane = int(data[0])

            if column_count == 8:
                parsed_data = [ parse_mean_range(x) for x in data[1:] ]
                # this is the < 0.3 Pipeline version
                self.cluster = parsed_data[0]
                self.average_first_cycle_intensity = parsed_data[1]
                self.percent_intensity_after_20_cycles = parsed_data[2]
                self.percent_pass_filter_clusters = parsed_data[3]
                self.percent_pass_filter_align = parsed_data[4]
                self.average_alignment_score = parsed_data[5]
                self.percent_error_rate = parsed_data[6]
            else:
                parsed_data = [ parse_mean_range(x) for x in data[2:] ]
                # this is the >= 0.3 summary file
                # the yield cell is stored as text, not as a mean/deviation
                self.lane_yield = data[1]
                self.cluster = parsed_data[0]
                self.cluster_pass_filter = parsed_data[1]
                self.average_first_cycle_intensity = parsed_data[2]
                self.percent_intensity_after_20_cycles = parsed_data[3]
                self.percent_pass_filter_clusters = parsed_data[4]
                self.percent_pass_filter_align = parsed_data[5]
                self.average_alignment_score = parsed_data[6]
                self.percent_error_rate = parsed_data[7]

        def get_elements(self):
            """
            Return this lane as a LaneResultSummary element

            Attributes that are still None are left out. Tuple or list
            values become mean/deviation elements, anything else is
            stored as the element text.
            """
            lane_result = ElementTree.Element(
                            Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
                            {'lane': str(self.lane), 'end': str(self.end)})
            for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
                value = getattr(self, variable_name)
                if value is None:
                    continue
                # it looks like a sequence
                if isinstance(value, (tuple, list)):
                    make_mean_range_element(lane_result, tag, *value)
                else:
                    element = ElementTree.SubElement(lane_result, tag)
                    element.text = value
            return lane_result

        def set_elements(self, tree):
            """
            Fill in this object from a LaneResultSummary element

            Raises ValueError if tree has a different tag. Child
            elements with unknown tags are logged and skipped.
            """
            if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
                raise ValueError('Expected %s' % (
                        Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
            self.lane = int(tree.attrib['lane'])
            # default to the first end, for the older summary files
            # that are single ended
            self.end = int(tree.attrib.get('end', 0))
            tags = Summary.LaneResultSummary.TAGS
            for element in list(tree):
                variable_name = tags.get(element.tag)
                if variable_name is None:
                    logging.warning('Unrecognized tag %s', element.tag)
                    continue
                # parsed outside of the tag lookup so a malformed element
                # is not misreported as an unrecognized tag
                setattr(self, variable_name, parse_summary_element(element))

    def __init__(self, filename=None, xml=None):
        """
        Create a summary, optionally loading it

        filename -- path to a Summary.htm file to parse
        xml      -- a Summary element as made by get_elements
        """
        # lane results is a list of 1 or 2 ends containing
        # a dictionary of all the lanes reported in this
        # summary file
        self.lane_results = [{}]

        if filename is not None:
            self._extract_lane_results(filename)
        if xml is not None:
            self.set_elements(xml)

    def __getitem__(self, key):
        """
        Return the lane dictionary for read end number key
        """
        return self.lane_results[key]

    def __len__(self):
        """
        Return how many read ends are stored
        """
        return len(self.lane_results)

    def _flattened_row(self, row):
        """
        flatten the children of a <tr>...</tr>
        """
        return [ flatten(cell) for cell in row ]

    def _parse_table(self, table):
        """
        Return every row of table, header rows included, as a list
        of flattened cells
        """
        return [ self._flattened_row(row) for row in table ]

    def _extract_named_tables(self, pathname):
        """
        extract all the 'named' tables from a Summary.htm file
        and return as a dictionary

        Named tables are <h2>...</h2><table>...</table> pairs
        The contents of the h2 tag is considered to be the name
        of the table.
        """
        # tree = ElementTree.parse(pathname).getroot()
        # hack for 1.1rc1, this should be removed when possible.
        stream = open(pathname)
        try:
            file_body = stream.read()
        finally:
            stream.close()
        file_body = file_body.replace('CHASTITY<=', 'CHASTITY&lt;=')
        tree = ElementTree.fromstring(file_body)
        body = tree.find('body')
        tables = {}
        children = list(body)
        # walk adjacent pairs so an <h2> that ends the body is simply
        # ignored instead of indexing past the last child
        for heading, table in zip(children, children[1:]):
            if heading.tag == 'h2' and table.tag == 'table':
                # we have an interesting table
                tables[flatten(heading)] = self._parse_table(table)
        return tables

    def _extract_lane_results(self, pathname):
        """
        Load every lane result table found in the file at pathname
        """
        tables = self._extract_named_tables(pathname)
        table_names = [ ('Lane Results Summary', 0),
                        ('Lane Results Summary : Read 1', 0),
                        ('Lane Results Summary : Read 2', 1),]
        for name, end in table_names:
            if name in tables:
                self._extract_lane_results_for_end(tables, name, end)

    def _extract_lane_results_for_end(self, tables, table_name, end):
        """
        extract the Lane Results Summary table

        tables     -- dictionary returned by _extract_named_tables
        table_name -- which table to read
        end        -- index of the read end the lanes are stored under
        """
        # parse lane result summary
        lane_summary = tables[table_name]
        # this is version 1 of the summary file
        if len(lane_summary[-1]) == 8:
            # strip header, keep the lane by lane data
            lane_summary = lane_summary[1:]

        # len(lane_summary[-1]) = 10 is version 2 of the summary file
        #                       = 9  is version 3 of the Summary.htm file
        # NOTE(review): 9 column rows pass this test, but
        # LaneResultSummary.set_elements_from_html only accepts 8 or 10
        # cells, so they currently end in a RuntimeError -- confirm intent
        elif len(lane_summary[-1]) in (9, 10):
            # lane_summary[0] is a different less specific header row
            # and lane_summary[1] is the column header row
            # after the last lane, there's a set of chip wide averages
            lane_summary = lane_summary[2:10]

        # append extra dictionaries if needed
        while len(self.lane_results) < (end + 1):
            self.lane_results.append({})

        for r in lane_summary:
            lrs = Summary.LaneResultSummary(html=r)
            lrs.end = end
            self.lane_results[lrs.end][lrs.lane] = lrs

    def get_elements(self):
        """
        Return a Summary element containing every stored lane
        """
        summary = ElementTree.Element(Summary.SUMMARY,
                                      {'version': str(Summary.XML_VERSION)})
        for end in self.lane_results:
            for lane in end.values():
                summary.append(lane.get_elements())
        return summary

    def set_elements(self, tree):
        """
        Load lanes from a Summary element

        Raises ValueError if tree has a different tag. A version
        attribute newer than XML_VERSION only logs a warning.
        """
        if tree.tag != Summary.SUMMARY:
            raise ValueError("Expected %s" % (Summary.SUMMARY,))
        xml_version = int(tree.attrib.get('version', 0))
        if xml_version > Summary.XML_VERSION:
            logging.warning('Summary XML tree is a higher version than this class')
        for element in list(tree):
            lrs = Summary.LaneResultSummary()
            lrs.set_elements(element)
            while len(self.lane_results) < (lrs.end + 1):
                self.lane_results.append({})
            self.lane_results[lrs.end][lrs.lane] = lrs

    def is_paired_end(self):
        """
        Return True when exactly two read ends are stored
        """
        return len(self.lane_results) == 2

    def dump(self):
        """
        Debugging function, report current object
        """
        pass
247
def tonumber(v):
    """
    Convert a value to int if its an int otherwise a float.

    Raises ValueError if v can be parsed as neither.
    """
    try:
        return int(v)
    except ValueError:
        # not an integer literal, fall back to float
        return float(v)
257
def parse_mean_range(value):
    """
    Split a cell such as '123 +/- 4.5' into a (mean, deviation) pair

    Both numbers are converted with tonumber. A cell that only
    contains 'unknown' gives (0, 0). RuntimeError is raised when
    the middle token is not '+/-'.
    """
    if value.strip() == 'unknown':
        return 0, 0

    fields = value.split()
    mean_text, separator, deviation_text = fields
    if separator != '+/-':
        raise RuntimeError("Summary.htm file format changed")
    mean = tonumber(mean_text)
    spread = tonumber(deviation_text)
    return mean, spread
269
def make_mean_range_element(parent, name, mean, deviation):
    """
    Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>

    The new element is appended to parent and returned; mean and
    deviation are stored as their str() text.
    """
    # str() rather than unicode(): it exists on every Python version
    attributes = {'mean': str(mean), 'deviation': str(deviation)}
    return ElementTree.SubElement(parent, name, attributes)
278
def parse_mean_range_element(element):
    """
    Read the 'mean' and 'deviation' attributes of element

    Returns them as a (mean, deviation) tuple of numbers
    converted with tonumber.
    """
    attributes = element.attrib
    mean = tonumber(attributes['mean'])
    deviation = tonumber(attributes['deviation'])
    return (mean, deviation)
285
def parse_summary_element(element):
    """
    Return the value stored in a summary element

    An element without attributes is a simple element and its text
    is returned; otherwise it is read as a mean/deviation element.
    """
    if not element.attrib:
        return element.text
    return parse_mean_range_element(element)