"""
Analyze the Summary.htm file produced by GERALD
"""
+import logging
import types
+from pprint import pprint
from htsworkflow.pipelines.runfolder import ElementTree
from htsworkflow.util.ethelp import indent, flatten
+nan = float('nan')
+
class Summary(object):
"""
Extract some useful information from the Summary.htm file
"""
- XML_VERSION = 2
+ XML_VERSION = 3
SUMMARY = 'Summary'
class LaneResultSummary(object):
def __init__(self, html=None, xml=None):
self.lane = None
+ self.end = 0
self.lane_yield = None
self.cluster = None
self.cluster_pass_filter = None
def set_elements_from_html(self, data):
if not len(data) in (8,10):
- raise RuntimeError("Summary.htm file format changed")
+ raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))
# same in pre-0.3.0 Summary file and 0.3 summary file
- self.lane = data[0]
+ self.lane = int(data[0])
if len(data) == 8:
parsed_data = [ parse_mean_range(x) for x in data[1:] ]
def get_elements(self):
lane_result = ElementTree.Element(
Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
- {'lane': self.lane})
+ {'lane': str(self.lane), 'end': str(self.end)})
for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
value = getattr(self, variable_name)
if value is None:
if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
raise ValueError('Expected %s' % (
Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
- self.lane = tree.attrib['lane']
+ self.lane = int(tree.attrib['lane'])
+ # default to the first end, for the older summary files
+ # that are single ended
+ self.end = int(tree.attrib.get('end', 0))
tags = Summary.LaneResultSummary.TAGS
for element in list(tree):
try:
logging.warn('Unrecognized tag %s' % (element.tag,))
def __init__(self, filename=None, xml=None):
- self.lane_results = {}
+ # lane_results is a list with one entry per end (1 or 2 entries);
+ # each entry is a dictionary of the lanes reported for that end
+ # in this summary file
+ self.lane_results = [{}]
if filename is not None:
self._extract_lane_results(filename)
def __len__(self):
return len(self.lane_results)
- def keys(self):
- return self.lane_results.keys()
-
- def values(self):
- return self.lane_results.values()
-
- def items(self):
- return self.lane_results.items()
-
def _flattened_row(self, row):
"""
flatten the children of a <tr>...</tr>
return tables
def _extract_lane_results(self, pathname):
+ tables = self._extract_named_tables(pathname)
+ table_names = [ ('Lane Results Summary', 0),
+ ('Lane Results Summary : Read 1', 0),
+ ('Lane Results Summary : Read 2', 1),]
+ for name, end in table_names:
+ if tables.has_key(name):
+ self._extract_lane_results_for_end(tables, name, end)
+ else:
+ logging.warning("No Lane Results Summary Found in %s" % (pathname,))
+
+ def _extract_lane_results_for_end(self, tables, table_name, end):
"""
extract the Lane Results Summary table
"""
-
- tables = self._extract_named_tables(pathname)
-
# parse lane result summary
- lane_summary = tables['Lane Results Summary']
+ lane_summary = tables[table_name]
# this is version 1 of the summary file
if len(lane_summary[-1]) == 8:
# strip header
# grab the lane by lane data
lane_summary = lane_summary[1:]
- # this is version 2 of the summary file
- if len(lane_summary[-1]) == 10:
+ # len(lane_summary[-1]) == 10 is version 2 of the summary file
+ # len(lane_summary[-1]) == 9 is version 3 of the Summary.htm file
+ elif len(lane_summary[-1]) in (9, 10):
# lane_summary[0] is a different less specific header row
headers = lane_summary[1]
lane_summary = lane_summary[2:10]
# after the last lane, there's a set of chip wide averages
+ # append an extra dictionary if needed
+ if len(self.lane_results) < (end + 1):
+ self.lane_results.append({})
+
for r in lane_summary:
lrs = Summary.LaneResultSummary(html=r)
- self.lane_results[lrs.lane] = lrs
+ lrs.end = end
+ self.lane_results[lrs.end][lrs.lane] = lrs
def get_elements(self):
    """
    Build the Summary element with one child per lane result

    The root element is tagged Summary.SUMMARY and carries the XML
    version in its 'version' attribute.  The lane results of every
    end in lane_results are appended in turn.
    """
    attributes = {'version': unicode(Summary.XML_VERSION)}
    root = ElementTree.Element(Summary.SUMMARY, attributes)
    for lanes_for_end in self.lane_results:
        for lane_result in lanes_for_end.values():
            root.append(lane_result.get_elements())
    return root
def set_elements(self, tree):
for element in list(tree):
lrs = Summary.LaneResultSummary()
lrs.set_elements(element)
- self.lane_results[lrs.lane] = lrs
+ if len(self.lane_results) < (lrs.end + 1):
+ self.lane_results.append({})
+ self.lane_results[lrs.end][lrs.lane] = lrs
+
def is_paired_end(self):
    """
    Report whether lane_results holds exactly two ends
    """
    end_count = len(self.lane_results)
    return end_count == 2
def dump(self):
"""
Parse values like 123 +/- 4.5
"""
if value.strip() == 'unknown':
- return 0, 0
+ return nan, nan
+
+ values = value.split()
+ if len(values) == 1:
+ if values[0] == '+/-':
+ return nan,nan
+ else:
+ return tonumber(values[0])
- average, pm, deviation = value.split()
+ average, pm, deviation = values
if pm != '+/-':
raise RuntimeError("Summary.htm file format changed")
return tonumber(average), tonumber(deviation)