From 10781eaf8aa6d87eb147d469dd73ab1a0ce633bb Mon Sep 17 00:00:00 2001
From: Diane Trout <diane@caltech.edu>
Date: Thu, 24 Apr 2008 00:24:26 +0000
Subject: [PATCH] Report cluster results with the rest of the lane summary
 information. This involved breaking names like "s_1" into their sample and
 lane identifiers and then exclusively using the lane identifiers.

One complexity is that I still had to treat the lane IDs as keys into
a dictionary instead of offsets into a list, because the lanes
are labeled in the range 1..8, whereas Python's list indexes would
be 0..7.

I also changed the report code to return a string instead of printing
directly to stdout, which makes it easier to integrate into code
that emails the summary report.
---
 scripts/runfolder.py | 79 ++++++++++++++++++++++++++++----------------
 1 file changed, 50 insertions(+), 29 deletions(-)

diff --git a/scripts/runfolder.py b/scripts/runfolder.py
index ae04db2..ffc32a6 100644
--- a/scripts/runfolder.py
+++ b/scripts/runfolder.py
@@ -188,7 +188,10 @@ class GERALD(object):
             if container is None or \
                len(container.getchildren()) != LANES_PER_FLOWCELL:
                 raise RuntimeError('GERALD config.xml file changed')
-            element = container.find(self._key)
+            lanes = [x.tag.split('_')[1] for x in container.getchildren()]
+            index = lanes.index(self._key)
+            #element = container.find(self._key)
+            element = container[index]
             return element.text
         def _get_analysis(self):
             return self.__get_attribute('ANALYSIS')
@@ -218,7 +221,11 @@ class GERALD(object):
         def keys(self):
             if self._keys is None:
                 analysis = self._tree.find('LaneSpecificRunParameters/ANALYSIS')
-                self._keys = [ x.tag for x in analysis]
+                # According to the pipeline specs, I think the field names
+                # are sampleName_laneID, with sampleName defaulting to "s".
+                # Since lane IDs are constant, let's use them consistently
+                # as the keys.
+                self._keys = [ x.tag.split('_')[1] for x in analysis]
             return self._keys
         def values(self):
             return [ self[x] for x in self.keys() ]
@@ -339,7 +346,7 @@ class Summary(object):
     def __init__(self, pathname):
         self.pathname = pathname
         self.tree = ElementTree.parse(pathname).getroot()
-        self.lane_results = []
+        self.lane_results = {}
 
         self._extract_lane_results()
 
@@ -350,18 +357,22 @@ class Summary(object):
         if flatten(self.tree.findall('*//h2')[3]) != 'Lane Results Summary':
             raise RuntimeError("Summary.htm file format changed")
 
-        table = self.tree.findall('*//table')[2]
-        rows = table.getchildren()
+        tables = self.tree.findall('*//table')
+
+        # parse lane result summary
+        lane_summary = tables[2]
+        rows = lane_summary.getchildren()
         headers = rows[0].getchildren()
         if flatten(headers[2]) != 'Av 1st Cycle Int ':
             raise RuntimeError("Summary.htm file format changed")
 
         for r in rows[1:]:
-            self.lane_results.append(LaneResultSummary(r))
+            lrs = LaneResultSummary(r)
+            self.lane_results[lrs.lane] = lrs
 
     def _get_elements(self):
         summary = ElementTree.Element('Summary')
-        for lane in self.lane_results:
+        for lane in self.lane_results.values():
             summary.append(lane.elements)
         return summary
     elements = property(_get_elements)
@@ -386,7 +397,9 @@ class ELAND(object):
             self.pathname = pathname
             # extract the sample name
             path, name = os.path.split(self.pathname)
-            self.sample_name = name.replace("_eland_result.txt","")
+            split_name = name.split('_')
+            self.sample_name = split_name[0]
+            self.lane_id = split_name[1]
             self._reads = None
             self._mapped_reads = None
             self._fasta_map = {}
@@ -417,7 +430,7 @@ class ELAND(object):
             
             reads = 0
             mapped_reads = {}
-            genome_dir = self.gerald.lanes[self.sample_name].eland_genome
+            genome_dir = self.gerald.lanes[self.lane_id].eland_genome
             self._build_fasta_map(genome_dir)
             match_codes = {'NM':0, 'QC':0, 'RM':0, 
                            'U0':0, 'U1':0, 'U2':0,
@@ -457,7 +470,7 @@ class ELAND(object):
         self.results = {}
         for f in glob(os.path.join(basedir, "*_eland_result.txt")):
             eland_result = ELAND.ElandResult(gerald, f)
-            self.results[eland_result.sample_name] = eland_result
+            self.results[eland_result.lane_id] = eland_result
 
 class PipelineRun(object):
     """
@@ -580,28 +593,36 @@ def summary_report(runs):
     """
     Summarize cluster numbers and mapped read counts for a runfolder
     """
+    report = []
     for run in runs:
         # print a run name?
-        print 'Summary for', run.name
-        for lane in run.gerald.summary.lane_results:
-            print 'lane', lane.lane, 'clusters', lane.cluster[0], '+/-',
-            print lane.cluster[1]
-        print ""
+        report.append('Summary for %s' % (run.name,))
 	# sort the report
-	sample_keys = run.gerald.eland_results.results.keys()
-	sample_keys.sort(alphanum)
-	for sample in sample_keys:
-            print '---'
-	    result = run.gerald.eland_results.results[sample]
-            print "Sample name", sample
-            print "Total Reads", result.reads
+	eland_keys = run.gerald.eland_results.results.keys()
+	eland_keys.sort(alphanum)
+
+        lane_results = run.gerald.summary.lane_results
+	for lane_id in eland_keys:
+	    result = run.gerald.eland_results.results[lane_id]
+            report.append("Sample name %s" % (result.sample_name))
+            report.append("Lane id %s" % (result.lane_id,))
+            cluster = lane_results[result.lane_id].cluster
+            report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
+            report.append("Total Reads: %d" % (result.reads))
             mc = result._match_codes
-	    print "No Match", mc['NM']
-	    print "QC Failed", mc['QC']
-            print 'Unique (0,1,2 mismatches)', mc['U0'], mc['U1'], mc['U2']
-            print 'Repeat (0,1,2 mismatches)', mc['R0'], mc['R1'], mc['R2']
-            print "Mapped Reads"
-            pprint(summarize_mapped_reads(result.mapped_reads))
+	    report.append("No Match: %d" % (mc['NM']))
+	    report.append("QC Failed: %d" % (mc['QC']))
+            report.append('Unique (0,1,2 mismatches) %d %d %d' % \
+                          (mc['U0'], mc['U1'], mc['U2']))
+            report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
+                          (mc['R0'], mc['R1'], mc['R2']))
+            report.append("Mapped Reads")
+            mapped_reads = summarize_mapped_reads(result.mapped_reads)
+            for name, counts in mapped_reads.items():
+              report.append("  %s: %d" % (name, counts))
+            report.append('---')
+            report.append('')
+        return os.linesep.join(report)
 
 def make_parser():
     usage = 'usage: %prog [options] runfolder_root_dir'
@@ -632,7 +653,7 @@ def main(cmdlist=None):
     for runfolder in args:
         runs = get_runs(runfolder)
         if opt.summary:
-            summary_report(runs)
+            print summary_report(runs)
         if opt.archive:
             extract_run_parameters(runs)
 
-- 
2.30.2