"""
Analyze the Summary.htm or Summary.xml file produced by GERALD
"""
+import logging
import types
+from pprint import pprint
from htsworkflow.pipelines.runfolder import ElementTree
from htsworkflow.util.ethelp import indent, flatten
+nan = float('nan')
+
class Summary(object):
"""
Extract some useful information from the Summary.htm file
'AverageAlignmentScore': 'average_alignment_score',
'PercentErrorRate': 'percent_error_rate'
}
+ # These are tags that have mean/stdev as found in the GERALD Summary.xml file
+ GERALD_TAGS = {
+ #'laneYield': 'lane_yield', #this is just a number
+ 'clusterCountRaw': 'cluster', # Raw
+ 'clusterCountPF': 'cluster_pass_filter',
+ 'oneSig': 'average_first_cycle_intensity',
+ 'signal20AsPctOf1': 'percent_intensity_after_20_cycles',
+ 'percentClustersPF': 'percent_pass_filter_clusters',
+ 'percentUniquelyAlignedPF': 'percent_pass_filter_align',
+ 'averageAlignScorePF': 'average_alignment_score',
+ 'errorPF': 'percent_error_rate'
+ }
def __init__(self, html=None, xml=None):
self.lane = None
def set_elements_from_html(self, data):
if not len(data) in (8,10):
- raise RuntimeError("Summary.htm file format changed")
+ raise RuntimeError("Summary.htm file format changed, len(data)=%d" % (len(data),))
# same in pre-0.3.0 Summary file and 0.3 summary file
self.lane = int(data[0])
self.average_alignment_score = parsed_data[6]
self.percent_error_rate = parsed_data[7]
def set_elements_from_gerald_xml(self, read, element):
    """
    Populate this lane summary from one lane element of a GERALD
    Summary.xml file.

    read is stored as-is in self.end; element is the ElementTree node
    whose children (laneNumber, laneYield and the GERALD_TAGS entries)
    describe the lane.

    Every attribute named in GERALD_TAGS is set to the result of
    parse_xml_mean_range, which is None when the tag is absent.
    self.lane_yield is set to None when laneYield is absent.

    Raises RuntimeError if the required laneNumber child is missing.
    """
    lane_number = element.find('laneNumber')
    if lane_number is None:
        # without a lane number the result cannot be filed anywhere
        raise RuntimeError("Summary.xml file format changed, expected laneNumber tag")
    self.lane = int(lane_number.text)
    self.end = read

    # laneYield is a plain number rather than a mean/stdev pair, so it is
    # read on its own; tolerate its absence like the other optional tags.
    lane_yield = element.find('laneYield')
    if lane_yield is None:
        logging.info("Couldn't find %s", 'laneYield')
        self.lane_yield = None
    else:
        self.lane_yield = int(lane_yield.text)

    for GeraldName, LRSName in Summary.LaneResultSummary.GERALD_TAGS.items():
        node = element.find(GeraldName)
        if node is None:
            logging.info("Couldn't find %s", GeraldName)
        # parse_xml_mean_range returns None for a missing node
        setattr(self, LRSName, parse_xml_mean_range(node))
+
def get_elements(self):
lane_result = ElementTree.Element(
Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
- {'lane': str(self.lane), 'end': str(self.end)})
+ {'lane': unicode(self.lane), 'end': unicode(self.end)})
for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
value = getattr(self, variable_name)
if value is None:
)
else:
element = ElementTree.SubElement(lane_result, tag)
- element.text = value
+ element.text = unicode(value)
return lane_result
def set_elements(self, tree):
data.append(self._flattened_row(r))
return data
+ def _extract_lane_results(self, pathname):
+ """
+ Extract just the lane results.
+ Currently those are the only ones we care about.
+ """
+
+ tables = self._extract_named_tables(pathname)
+
+
def _extract_named_tables(self, pathname):
"""
extract all the 'named' tables from a Summary.htm file
file_body = open(pathname).read()
file_body = file_body.replace('CHASTITY<=', 'CHASTITY<=')
tree = ElementTree.fromstring(file_body)
+
+ # are we reading the xml or the html version of the Summary file?
+ if tree.tag.lower() == 'summary':
+ # summary version
+ tables = self._extract_named_tables_from_gerald_xml(tree)
+ elif tree.tag.lower() == 'html':
+ # html version
+ tables = self._extract_named_tables_from_html(tree)
+ table_names = [ ('Lane Results Summary', 0),
+ ('Lane Results Summary : Read 1', 0),
+ ('Lane Results Summary : Read 2', 1),]
+ for name, end in table_names:
+ if tables.has_key(name):
+ self._extract_lane_results_for_end(tables, name, end)
+
+ if len(self.lane_results[0]) == 0:
+ logging.warning("No Lane Results Summary Found in %s" % (pathname,))
+
+ def _extract_named_tables_from_gerald_xml(self, tree):
+ """
+ Extract useful named tables from a gerald created Summary.xml file
+ """
+ # using the function to convert to lower instead of just writing it
+ # makes the tag easier to read (IMO)
+ useful_tables = ['LaneResultsSummary'.lower(),]
+
+ tables ={}
+ for child in tree.getchildren():
+ if child.tag.lower() in useful_tables:
+ read_tree = child.find('Read')
+ # we want 0 based.
+ read = int(read_tree.find('readNumber').text)-1
+ for element in read_tree.getchildren():
+ if element.tag.lower() == "lane":
+ lrs = Summary.LaneResultSummary()
+ lrs.set_elements_from_gerald_xml(read, element)
+ self.lane_results[lrs.end][lrs.lane] = lrs
+ # probably not useful
+ return tables
+
+ ###### START HTML Table Extraction ########
+ def _extract_named_tables_from_html(self, tree):
body = tree.find('body')
tables = {}
for i in range(len(body)):
tables[name] = data
return tables
- def _extract_lane_results(self, pathname):
- tables = self._extract_named_tables(pathname)
- table_names = [ ('Lane Results Summary', 0),
- ('Lane Results Summary : Read 1', 0),
- ('Lane Results Summary : Read 2', 1),]
- for name, end in table_names:
- if tables.has_key(name):
- self._extract_lane_results_for_end(tables, name, end)
-
def _extract_lane_results_for_end(self, tables, table_name, end):
"""
extract the Lane Results Summary table
# grab the lane by lane data
lane_summary = lane_summary[1:]
- # this is version 2 of the summary file
- if len(lane_summary[-1]) == 10:
+ # len(lane_summary[-1] = 10 is version 2 of the summary file
+ # = 9 is version 3 of the Summary.htm file
+ elif len(lane_summary[-1]) in (9, 10):
# lane_summary[0] is a different less specific header row
headers = lane_summary[1]
lane_summary = lane_summary[2:10]
lrs = Summary.LaneResultSummary(html=r)
lrs.end = end
self.lane_results[lrs.end][lrs.lane] = lrs
+ ###### END HTML Table Extraction ########
def get_elements(self):
summary = ElementTree.Element(Summary.SUMMARY,
for element in list(tree):
lrs = Summary.LaneResultSummary()
lrs.set_elements(element)
- print lrs.end, lrs.lane
if len(self.lane_results) < (lrs.end + 1):
self.lane_results.append({})
self.lane_results[lrs.end][lrs.lane] = lrs
"""
Debugging function, report current object
"""
- pass
+ tree = self.get_elements()
+ print ElementTree.tostring(tree)
def tonumber(v):
"""
Parse values like 123 +/- 4.5
"""
if value.strip() == 'unknown':
- return 0, 0
+ return nan, nan
- average, pm, deviation = value.split()
+ values = value.split()
+ if len(values) == 1:
+ if values[0] == '+/-':
+ return nan,nan
+ else:
+ return tonumber(values[0])
+
+ average, pm, deviation = values
if pm != '+/-':
raise RuntimeError("Summary.htm file format changed")
return tonumber(average), tonumber(deviation)
return parse_mean_range_element(element)
else:
return element.text
+
def parse_xml_mean_range(element):
    """
    Extract the mean/stdev children of element as a tuple.

    Returns None when element is None; otherwise returns a
    (mean, stdev) tuple with the text of each child converted by
    tonumber.

    Raises RuntimeError if either the mean or the stdev child is missing.
    """
    if element is None:
        return None

    mean = element.find('mean')
    stdev = element.find('stdev')
    if mean is None or stdev is None:
        # report the tag name that is actually searched for ('stdev')
        raise RuntimeError("Summary.xml file format changed, expected mean/stdev tags")

    return (tonumber(mean.text), tonumber(stdev.text))
+
if __name__ == "__main__":
    # test code: dump every summary file named on the command line
    from optparse import OptionParser

    option_parser = OptionParser('%prog [Summary.xml/Summary.htm]+')
    options, filenames = option_parser.parse_args()
    if not filenames:
        option_parser.error('need at least one xml/html file')
    for filename in filenames:
        Summary(filename).dump()
+