From 3e38bdc59e4d60f978206b6c0c4d79bcdc65da59 Mon Sep 17 00:00:00 2001
From: Diane Trout <diane@caltech.edu>
Date: Thu, 3 Jul 2008 00:16:50 +0000
Subject: [PATCH] Partially handle the changed Summary.htm file from the 0.3
 version of the GAPipeline.

This update is incomplete as I'm pretty sure the xml serialization code
for the run xml file will break. However it does generate the summary
report for both the old summary file and the new post 0.3 file.

I also need to add unit tests for parsing and serializing the 0.3
file format.
---
 gaworkflow/pipeline/gerald.py | 96 +++++++++++++++++++++++++++--------
 1 file changed, 75 insertions(+), 21 deletions(-)
diff --git a/gaworkflow/pipeline/gerald.py b/gaworkflow/pipeline/gerald.py
index f4f0cbc..3e43f1b 100644
--- a/gaworkflow/pipeline/gerald.py
+++ b/gaworkflow/pipeline/gerald.py
@@ -253,19 +253,29 @@ class Summary(object):
             if xml is not None:
                 self.set_elements(xml)
 
-        def set_elements_from_html(self, row_element):
-            data = [ flatten(x) for x in row_element ]
-            if len(data) != 8:
+        def set_elements_from_html(self, data):
+            if not len(data) in (8,10):
                 raise RuntimeError("Summary.htm file format changed")
 
+            # same in pre-0.3.0 Summary file and 0.3 summary file
             self.lane = data[0]
-            self.cluster = parse_mean_range(data[1])
-            self.average_first_cycle_intensity = parse_mean_range(data[2])
-            self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
-            self.percent_pass_filter_clusters = parse_mean_range(data[4])
-            self.percent_pass_filter_align = parse_mean_range(data[5])
-            self.average_alignment_score = parse_mean_range(data[6])
-            self.percent_error_rate = parse_mean_range(data[7])
+
+            if len(data) == 8:
+                # this is the < 0.3 Pipeline version
+                self.cluster = parse_mean_range(data[1])
+                self.average_first_cycle_intensity = parse_mean_range(data[2])
+                self.percent_intensity_after_20_cycles = \
+                    parse_mean_range(data[3])
+                self.percent_pass_filter_clusters = parse_mean_range(data[4])
+                self.percent_pass_filter_align = parse_mean_range(data[5])
+                self.average_alignment_score = parse_mean_range(data[6])
+                self.percent_error_rate = parse_mean_range(data[7])
+            elif len(data) == 10:
+                # this is the >= 0.3 summary file
+                self.cluster_raw = data[1]
+                self.cluster = parse_mean_range(data[2])
+                # FIXME: think of generic way to capture the variable data
+                
 
         def get_elements(self):
             lane_result = ElementTree.Element(
@@ -316,24 +326,68 @@ class Summary(object):
     def items(self):
         return self.lane_results.items()
 
+    def _flattened_row(self, row):
+        """
+        flatten the children of a <tr>...</tr>
+        """
+        return [flatten(x) for x in row.getchildren() ]
+    
+    def _parse_table(self, table):
+        """
+        assumes the first line is the header of a table, 
+        and that the remaining rows are data
+        """
+        rows = table.getchildren()
+        data = []
+        for r in rows:
+            data.append(self._flattened_row(r))
+        return data
+    
+    def _extract_named_tables(self, pathname):
+        """
+        extract all the 'named' tables from a Summary.htm file
+        and return as a dictionary
+        
+        Named tables are <h2>...</h2><table>...</table> pairs
+        The contents of the h2 tag is considered to the name
+        of the table.
+        """
+        tree = ElementTree.parse(pathname).getroot()
+        body = tree.find('body')
+        tables = {}
+        for i in range(len(body)):
+            if body[i].tag == 'h2' and body[i+1].tag == 'table':
+                # we have an interesting table
+                name = flatten(body[i])
+                table = body[i+1]
+                data = self._parse_table(table)
+                tables[name] = data
+        return tables
+
     def _extract_lane_results(self, pathname):
         """
         extract the Lane Results Summary table
         """
-        tree = ElementTree.parse(pathname).getroot()
-        if flatten(tree.findall('*//h2')[3]) != 'Lane Results Summary':
-            raise RuntimeError("Summary.htm file format changed")
 
-        tables = tree.findall('*//table')
+        tables = self._extract_named_tables(pathname)
 
         # parse lane result summary
-        lane_summary = tables[2]
-        rows = lane_summary.getchildren()
-        headers = rows[0].getchildren()
-        if flatten(headers[2]) != 'Av 1st Cycle Int ':
-            raise RuntimeError("Summary.htm file format changed")
-
-        for r in rows[1:]:
+        lane_summary = tables['Lane Results Summary']
+        # this is version 1 of the summary file
+        if len(lane_summary[-1]) == 8:
+            # strip header
+            headers = lane_summary[0]
+            # grab the lane by lane data
+            lane_summary = lane_summary[1:]
+
+        # this is version 2 of the summary file
+        if len(lane_summary[-1]) == 10:
+            # lane_summary[0] is a different less specific header row
+            headers = lane_summary[1]
+            lane_summary = lane_summary[2:10]
+            # after the last lane, there's a set of chip wide averages
+
+        for r in lane_summary:
             lrs = Summary.LaneResultSummary(html=r)
             self.lane_results[lrs.lane] = lrs
 
-- 
2.30.2