Move runfolder analysis classes out of scripts/runfolder.py into seperate files
authorDiane Trout <diane@caltech.edu>
Fri, 9 May 2008 03:21:16 +0000 (03:21 +0000)
committerDiane Trout <diane@caltech.edu>
Fri, 9 May 2008 03:21:16 +0000 (03:21 +0000)
Also rename runfolder.py to runfolder

This was a really annoying patch to make, I wanted to do two major things,
be able to construct the runfolder configuration extracting classes
from the xml file I was creating, and to make unit tests to make sure
all the code was at least somewhat correct.

Writing all of the xml serialization code was really annoying and dull,
there was probably some nifty metaprogramming way of solving it, but
I didn't feel like figuring it out, as I really need to move on to
more important parts of the project.

I wanted to rename runfolder.py to runfolder as the solexa pipeline code
has a runfolder.py (and if anyone has a better name for the script that's
supposed to dump the runfolder xml file, let me know).

Also in working on the xml serialization code, I extended the serialization
for the eland files, this version now dumps the genome_map and the
eland statistics, like reads, match counts and the like. It does
mean that the --archive mode will take longer, but it also means
I'll have enough information to generate the run statistics later.

Now I might have to redo this if we figure out if we should be handling
the realign files instead.

gaworkflow/pipeline/bustard.py [new file with mode: 0644]
gaworkflow/pipeline/firecrest.py [new file with mode: 0644]
gaworkflow/pipeline/gerald.py [new file with mode: 0644]
gaworkflow/pipeline/runfolder.py [new file with mode: 0644]
gaworkflow/pipeline/test/test_runfolder.py [new file with mode: 0644]
gaworkflow/util/ethelp.py [new file with mode: 0644]
gaworkflow/util/test/test_ethelp.py [new file with mode: 0644]
scripts/runfolder [new file with mode: 0644]
scripts/runfolder.py [deleted file]

diff --git a/gaworkflow/pipeline/bustard.py b/gaworkflow/pipeline/bustard.py
new file mode 100644 (file)
index 0000000..221617e
--- /dev/null
@@ -0,0 +1,145 @@
+
+from datetime import datetime
+from glob import glob
+import logging
+import os
+import time
+import re
+
+from gaworkflow.pipeline.runfolder import \
+   ElementTree, \
+   VERSION_RE, \
+   EUROPEAN_STRPTIME
+
+class Phasing(object):
+    PHASING = 'Phasing'
+    PREPHASING = 'Prephasing'
+
+    def __init__(self, fromfile=None, xml=None):
+        self.lane = None
+        self.phasing = None
+        self.prephasing = None
+
+        if fromfile is not None:
+            self._initialize_from_file(fromfile)
+        elif xml is not None:
+            self.set_elements(xml)
+
+    def _initialize_from_file(self, pathname):
+        path, name = os.path.split(pathname)
+        basename, ext = os.path.splitext(name)
+        # the last character of the param base filename should be the
+        # lane number
+        tree = ElementTree.parse(pathname).getroot()
+        self.set_elements(tree)
+        self.lane = int(basename[-1])
+
+    def get_elements(self):
+        root = ElementTree.Element(Phasing.PHASING, {'lane': str(self.lane)})
+        phasing = ElementTree.SubElement(root, Phasing.PHASING)
+        phasing.text = str(self.phasing)
+        prephasing = ElementTree.SubElement(root, Phasing.PREPHASING)
+        prephasing.text = str(self.prephasing)
+        return root
+
+    def set_elements(self, tree):
+        if tree.tag not in ('Phasing', 'Parameters'):
+            raise ValueError('exptected Phasing or Parameters')
+        lane = tree.attrib.get('lane', None)
+        if lane is not None:
+            self.lane = int(lane)
+        for element in list(tree):
+            if element.tag == Phasing.PHASING:
+                self.phasing = float(element.text)
+            elif element.tag == Phasing.PREPHASING:
+                self.prephasing = float(element.text)
+
+class Bustard(object):
+    XML_VERSION = 1
+
+    # Xml Tags
+    BUSTARD = 'Bustard'
+    SOFTWARE_VERSION = 'version'
+    DATE = 'run_time'
+    USER = 'user'
+    PARAMETERS = 'Parameters'
+
+    def __init__(self, xml=None):
+        self.version = None
+        self.date = datetime.now()
+        self.user = None
+        self.phasing = {}
+
+        if xml is not None:
+            self.set_elements(xml)
+
+    def _get_time(self):
+        return time.mktime(self.date.timetuple())
+    time = property(_get_time, doc='return run time as seconds since epoch')
+
+    def dump(self):
+        print "Bustard version:", self.version
+        print "Run date", self.date
+        print "user:", self.user
+        for lane, tree in self.phasing.items():
+            print lane
+            print tree
+
+    def get_elements(self):
+        root = ElementTree.Element('Bustard', 
+                                   {'version': str(Bustard.XML_VERSION)})
+        version = ElementTree.SubElement(root, Bustard.SOFTWARE_VERSION)
+        version.text = self.version
+        run_date = ElementTree.SubElement(root, Bustard.DATE)
+        run_date.text = str(self.time)
+        user = ElementTree.SubElement(root, Bustard.USER)
+        user.text = self.user
+        params = ElementTree.SubElement(root, Bustard.PARAMETERS)
+        for p in self.phasing.values():
+            params.append(p.get_elements())
+        return root
+
+    def set_elements(self, tree):
+        if tree.tag != Bustard.BUSTARD:
+            raise ValueError('Expected "Bustard" SubElements')
+        xml_version = int(tree.attrib.get('version', 0))
+        if xml_version > Bustard.XML_VERSION:
+            logging.warn('Bustard XML tree is a higher version than this class')
+        for element in list(tree):
+            if element.tag == Bustard.SOFTWARE_VERSION:
+                self.version = element.text
+            elif element.tag == Bustard.DATE:
+                self.date = datetime.fromtimestamp(float(element.text))
+            elif element.tag == Bustard.USER:
+                self.user = element.text
+            elif element.tag == Bustard.PARAMETERS:
+                for param in element:
+                    p = Phasing(xml=param)
+                    self.phasing[p.lane] = p
+            else:
+                raise ValueError("Unrecognized tag: %s" % (element.tag,))
+        
+
+
+def bustard(pathname):
+    """
+    Construct a Bustard object from pathname
+    """
+    b = Bustard()
+    path, name = os.path.split(pathname)
+    groups = name.split("_")
+    version = re.search(VERSION_RE, groups[0])
+    b.version = version.group(1)
+    b.date = datetime.strptime(groups[1], EUROPEAN_STRPTIME)
+    b.user = groups[2]
+    paramfiles = glob(os.path.join(pathname, "params?.xml"))
+    for paramfile in paramfiles:
+        phasing = Phasing(paramfile)
+        assert (phasing.lane >= 1 and phasing.lane <= 8)
+        b.phasing[phasing.lane] = phasing
+    return b
+
+def fromxml(tree):
+    b = Bustard()
+    b.set_elements(tree)
+    return b
diff --git a/gaworkflow/pipeline/firecrest.py b/gaworkflow/pipeline/firecrest.py
new file mode 100644 (file)
index 0000000..a0a30b3
--- /dev/null
@@ -0,0 +1,126 @@
+"""
+Extract information about the Firecrest run
+
+Firecrest - class holding the properties we found
+firecrest - Firecrest factory function initalized from a directory name
+fromxml - Firecrest factory function initalized from an xml dump from
+          the Firecrest object.
+"""
+
+from datetime import datetime
+import os
+import re
+import time
+
+from gaworkflow.pipeline.runfolder import \
+   ElementTree, \
+   VERSION_RE, \
+   EUROPEAN_STRPTIME
+
+class Firecrest(object):
+    XML_VERSION=1
+
+    # xml tag names
+    FIRECREST = 'Firecrest'
+    SOFTWARE_VERSION = 'version'
+    START = 'FirstCycle'
+    STOP = 'LastCycle'
+    DATE = 'run_time'
+    USER = 'user'
+    MATRIX = 'matrix'
+
+    def __init__(self, xml=None):
+        self.start = None
+        self.stop = None
+        self.version = None
+        self.date = datetime.now()
+        self.user = None
+        self.matrix = None
+
+        if xml is not None:
+            self.set_elements(xml)
+        
+    def _get_time(self):
+        return time.mktime(self.date.timetuple())
+    time = property(_get_time, doc='return run time as seconds since epoch')
+
+    def dump(self):
+        print "Starting cycle:", self.start
+        print "Ending cycle:", self.stop
+        print "Firecrest version:", self.version
+        print "Run date:", self.date
+        print "user:", self.user
+
+    def get_elements(self):
+        attribs = {'version': str(Firecrest.XML_VERSION) }
+        root = ElementTree.Element(Firecrest.FIRECREST, attrib=attribs)
+        version = ElementTree.SubElement(root, Firecrest.SOFTWARE_VERSION)
+        version.text = self.version
+        start_cycle = ElementTree.SubElement(root, Firecrest.START)
+        start_cycle.text = str(self.start)
+        stop_cycle = ElementTree.SubElement(root, Firecrest.STOP)
+        stop_cycle.text = str(self.stop)
+        run_date = ElementTree.SubElement(root, Firecrest.DATE)
+        run_date.text = str(self.time)
+        user = ElementTree.SubElement(root, Firecrest.USER)
+        user.text = self.user
+        matrix = ElementTree.SubElement(root, Firecrest.MATRIX)
+        matrix.text = self.matrix
+        return root
+
+    def set_elements(self, tree):
+        if tree.tag != Firecrest.FIRECREST:
+            raise ValueError('Expected "Firecrest" SubElements')
+        xml_version = int(tree.attrib.get('version', 0))
+        if xml_version > Firecrest.XML_VERSION:
+            logging.warn('Firecrest XML tree is a higher version than this class')
+        for element in list(tree):
+            if element.tag == Firecrest.SOFTWARE_VERSION:
+                self.version = element.text
+            elif element.tag == Firecrest.START:
+                self.start = int(element.text)
+            elif element.tag == Firecrest.STOP:
+                self.stop = int(element.text)
+            elif element.tag == Firecrest.DATE:
+                self.date = datetime.fromtimestamp(float(element.text))
+            elif element.tag == Firecrest.USER:
+                self.user = element.text
+            elif element.tag == Firecrest.MATRIX:
+                self.matrix = element.text
+            else:
+                raise ValueError("Unrecognized tag: %s" % (element.tag,))
+
+def firecrest(pathname):
+    """
+    Examine the directory at pathname and initalize a Firecrest object
+    """
+    f = Firecrest()
+
+    # parse firecrest directory name
+    path, name = os.path.split(pathname)
+    groups = name.split('_')
+    # grab the start/stop cycle information
+    cycle = re.match("C([0-9]+)-([0-9]+)", groups[0])
+    f.start = int(cycle.group(1))
+    f.stop = int(cycle.group(2))
+    # firecrest version
+    version = re.search(VERSION_RE, groups[1])
+    f.version = (version.group(1))
+    # datetime
+    f.date = datetime.strptime(groups[2], EUROPEAN_STRPTIME)
+    # username
+    f.user = groups[3]
+
+    # should I parse this deeper than just stashing the 
+    # contents of the matrix file?
+    matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
+    f.matrix = open(matrix_pathname, 'r').read()
+    return f
+
+def fromxml(tree):
+    """
+    Initialize a Firecrest object from an element tree node
+    """
+    f = Firecrest()
+    f.set_elements(tree)
+    return f
diff --git a/gaworkflow/pipeline/gerald.py b/gaworkflow/pipeline/gerald.py
new file mode 100644 (file)
index 0000000..1f15eea
--- /dev/null
@@ -0,0 +1,571 @@
+"""
+Provide access to information stored in the GERALD directory.
+"""
+from datetime import datetime, date
+from glob import glob
+import logging
+import os
+import stat
+import time
+
+from gaworkflow.pipeline.runfolder import \
+   ElementTree, \
+   EUROPEAN_STRPTIME, \
+   LANES_PER_FLOWCELL, \
+   VERSION_RE
+from gaworkflow.util.ethelp import indent, flatten
+
+class Gerald(object):
+    """
+    Capture meaning out of the GERALD directory
+    """
+    XML_VERSION = 1
+    GERALD='Gerald'
+    RUN_PARAMETERS='RunParameters'
+    SUMMARY='Summary'
+
+    class LaneParameters(object):
+        """
+        Make it easy to access elements of LaneSpecificRunParameters from python
+        """
+        def __init__(self, gerald, key):
+            self._gerald = gerald
+            self._key = key
+        
+        def __get_attribute(self, xml_tag):
+            subtree = self._gerald.tree.find('LaneSpecificRunParameters')
+            container = subtree.find(xml_tag)
+            if container is None or \
+               len(container.getchildren()) != LANES_PER_FLOWCELL:
+                raise RuntimeError('GERALD config.xml file changed')
+            lanes = [x.tag.split('_')[1] for x in container.getchildren()]
+            index = lanes.index(self._key)
+            #element = container.find(self._key)
+            element = container[index]
+            return element.text
+        def _get_analysis(self):
+            return self.__get_attribute('ANALYSIS')
+        analysis = property(_get_analysis)
+
+        def _get_eland_genome(self):
+            return self.__get_attribute('ELAND_GENOME')
+        eland_genome = property(_get_eland_genome)
+
+        def _get_read_length(self):
+            return self.__get_attribute('READ_LENGTH')
+        read_length = property(_get_read_length)
+
+        def _get_use_bases(self):
+            return self.__get_attribute('USE_BASES')
+        use_bases = property(_get_use_bases)
+
+    class LaneSpecificRunParameters(object):
+        """
+        Provide access to LaneSpecificRunParameters
+        """
+        def __init__(self, gerald):
+            self._gerald = gerald
+            self._keys = None
+        def __getitem__(self, key):
+            return Gerald.LaneParameters(self._gerald, key)
+        def keys(self):
+            if self._keys is None:
+                tree = self._gerald.tree
+                analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
+                # according to the pipeline specs I think their fields 
+                # are sampleName_laneID, with sampleName defaulting to s
+                # since laneIDs are constant lets just try using 
+                # those consistently.
+                self._keys = [ x.tag.split('_')[1] for x in analysis]
+            return self._keys
+        def values(self):
+            return [ self[x] for x in self.keys() ]
+        def items(self):
+            return zip(self.keys(), self.values())
+        def __len__(self):
+            return len(self.keys())
+
+    def __init__(self, xml=None):
+        self.pathname = None
+        self.tree = None
+
+        # parse lane parameters out of the config.xml file
+        self.lanes = Gerald.LaneSpecificRunParameters(self)
+
+        self.summary = None
+        self.eland_results = None
+
+        if xml is not None:
+            self.set_elements(xml)
+
+    def _get_date(self):
+        if self.tree is None:
+            return datetime.today()
+        timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
+        return datetime.strptime(timestamp, '%c')
+    date = property(_get_date)
+
+    def _get_time(self):
+        return time.mktime(self.date.timetuple())
+    time = property(_get_time, doc='return run time as seconds since epoch')
+
+    def _get_version(self):
+        if self.tree is None:
+            return None
+        return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
+    version = property(_get_version)
+
+    def dump(self):
+        """
+        Debugging function, report current object
+        """
+        print 'Gerald version:', self.version
+        print 'Gerald run date:', self.date
+        print 'Gerald config.xml:', self.tree
+        self.summary.dump()
+
+    def get_elements(self):
+        if self.tree is None or self.summary is None:
+            return None
+
+        gerald = ElementTree.Element(Gerald.GERALD, 
+                                     {'version': unicode(Gerald.XML_VERSION)})
+        gerald.append(self.tree)
+        gerald.append(self.summary.get_elements())
+        if self.eland_results:
+            gerald.append(self.eland_results.get_elements())
+        return gerald
+
+    def set_elements(self, tree):
+        if tree.tag !=  Gerald.GERALD:
+            raise ValueError('exptected GERALD')
+        xml_version = int(tree.attrib.get('version', 0))
+        if xml_version > Gerald.XML_VERSION:
+            logging.warn('XML tree is a higher version than this class')
+        for element in list(tree):
+            tag = element.tag.lower()
+            if tag == Gerald.RUN_PARAMETERS.lower():
+                self.tree = element
+            elif tag == Gerald.SUMMARY.lower():
+                self.summary = Summary(xml=element)
+            elif tag == ELAND.ELAND.lower():
+                self.eland_results = ELAND(xml=element)
+            else:
+                logging.warn("Unrecognized tag %s" % (element.tag,))
+        
+
+def gerald(pathname):
+    g = Gerald()
+    g.pathname = pathname
+    path, name = os.path.split(pathname)
+    config_pathname = os.path.join(pathname, 'config.xml')
+    g.tree = ElementTree.parse(config_pathname).getroot()
+
+    # parse Summary.htm file
+    summary_pathname = os.path.join(pathname, 'Summary.htm')
+    g.summary = Summary(summary_pathname)
+    # parse eland files
+    g.eland_results = eland(g.pathname, g)
+    return g
+
+def tonumber(v):
+    """
+    Convert a value to int if its an int otherwise a float.
+    """
+    try:
+        v = int(v)
+    except ValueError, e:
+        v = float(v)
+    return v
+
+def parse_mean_range(value):
+    """
+    Parse values like 123 +/- 4.5
+    """
+    if value.strip() == 'unknown':
+       return 0, 0
+
+    average, pm, deviation = value.split()
+    if pm != '+/-':
+        raise RuntimeError("Summary.htm file format changed")
+    return tonumber(average), tonumber(deviation)
+
+def make_mean_range_element(parent, name, mean, deviation):
+    """
+    Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
+    """
+    element = ElementTree.SubElement(parent, name,
+                                     { 'mean': unicode(mean),
+                                       'deviation': unicode(deviation)})
+    return element
+
+def parse_mean_range_element(element):
+    """
+    Grab mean/deviation out of element
+    """
+    return (tonumber(element.attrib['mean']), 
+            tonumber(element.attrib['deviation']))
+
+class Summary(object):
+    """
+    Extract some useful information from the Summary.htm file
+    """
+    XML_VERSION = 1
+    SUMMARY = 'Summary'
+
+    class LaneResultSummary(object):
+        """
+        Parse the LaneResultSummary table out of Summary.htm
+        Mostly for the cluster number
+        """
+        LANE_RESULT_SUMMARY = 'LaneResultSummary'
+        TAGS = { 
+          'Cluster': 'cluster',
+          'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
+          'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
+          'PercentPassFilterClusters': 'percent_pass_filter_clusters',
+          'PercentPassFilterAlign': 'percent_pass_filter_align',
+          'AverageAlignmentScore': 'average_alignment_score',
+          'PercentErrorRate': 'percent_error_rate'
+        }
+                 
+        def __init__(self, html=None, xml=None):
+            self.lane = None
+            self.cluster = None
+            self.average_first_cycle_intensity = None
+            self.percent_intensity_after_20_cycles = None
+            self.percent_pass_filter_clusters = None
+            self.percent_pass_filter_align = None
+            self.average_alignment_score = None
+            self.percent_error_rate = None
+
+            if html is not None:
+                self.set_elements_from_html(html)
+            if xml is not None:
+                self.set_elements(xml)
+
+        def set_elements_from_html(self, row_element):
+            data = [ flatten(x) for x in row_element ]
+            if len(data) != 8:
+                raise RuntimeError("Summary.htm file format changed")
+
+            self.lane = data[0]
+            self.cluster = parse_mean_range(data[1])
+            self.average_first_cycle_intensity = parse_mean_range(data[2])
+            self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
+            self.percent_pass_filter_clusters = parse_mean_range(data[4])
+            self.percent_pass_filter_align = parse_mean_range(data[5])
+            self.average_alignment_score = parse_mean_range(data[6])
+            self.percent_error_rate = parse_mean_range(data[7])
+
+        def get_elements(self):
+            lane_result = ElementTree.Element(
+                            Summary.LaneResultSummary.LANE_RESULT_SUMMARY, 
+                            {'lane': self.lane})
+            for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
+                element = make_mean_range_element(
+                    lane_result,
+                    tag,
+                    *getattr(self, variable_name)
+                )
+            return lane_result
+
+        def set_elements(self, tree):
+            if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
+                raise ValueError('Expected %s' % (
+                        Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
+            self.lane = tree.attrib['lane']
+            tags = Summary.LaneResultSummary.TAGS
+            for element in list(tree):
+                try:
+                    variable_name = tags[element.tag]
+                    setattr(self, variable_name, 
+                            parse_mean_range_element(element))
+                except KeyError, e:
+                    logging.warn('Unrecognized tag %s' % (element.tag,))
+
+    def __init__(self, filename=None, xml=None):
+        self.lane_results = {}
+
+        if filename is not None:
+            self._extract_lane_results(filename)
+        if xml is not None:
+            self.set_elements(xml)
+
+    def __getitem__(self, key):
+        return self.lane_results[key]
+
+    def __len__(self):
+        return len(self.lane_results)
+
+    def keys(self):
+        return self.lane_results.keys()
+
+    def values(self):
+        return self.lane_results.values()
+
+    def items(self):
+        return self.lane_results.items()
+
+    def _extract_lane_results(self, pathname):
+        """
+        extract the Lane Results Summary table
+        """
+        tree = ElementTree.parse(pathname).getroot()
+        if flatten(tree.findall('*//h2')[3]) != 'Lane Results Summary':
+            raise RuntimeError("Summary.htm file format changed")
+
+        tables = tree.findall('*//table')
+
+        # parse lane result summary
+        lane_summary = tables[2]
+        rows = lane_summary.getchildren()
+        headers = rows[0].getchildren()
+        if flatten(headers[2]) != 'Av 1st Cycle Int ':
+            raise RuntimeError("Summary.htm file format changed")
+
+        for r in rows[1:]:
+            lrs = Summary.LaneResultSummary(html=r)
+            self.lane_results[lrs.lane] = lrs
+
+    def get_elements(self):
+        summary = ElementTree.Element(Summary.SUMMARY, 
+                                      {'version': unicode(Summary.XML_VERSION)})
+        for lane in self.lane_results.values():
+            summary.append(lane.get_elements())
+        return summary
+
+    def set_elements(self, tree):
+        if tree.tag != Summary.SUMMARY:
+            return ValueError("Expected %s" % (Summary.SUMMARY,))
+        xml_version = int(tree.attrib.get('version', 0))
+        if xml_version > Summary.XML_VERSION:
+            logging.warn('Summary XML tree is a higher version than this class')
+        for element in list(tree):
+            lrs = Summary.LaneResultSummary()
+            lrs.set_elements(element)
+            self.lane_results[lrs.lane] = lrs
+
+    def dump(self):
+        """
+        Debugging function, report current object
+        """
+        pass
+
+
+def build_genome_fasta_map(genome_dir):
+    # build fasta to fasta file map
+    genome = genome_dir.split(os.path.sep)[-1]
+    fasta_map = {}
+    for vld_file in glob(os.path.join(genome_dir, '*.vld')):
+        is_link = False
+        if os.path.islink(vld_file):
+            is_link = True
+        vld_file = os.path.realpath(vld_file)
+        path, vld_name = os.path.split(vld_file)
+        name, ext = os.path.splitext(vld_name)
+        if is_link:
+            fasta_map[name] = name
+        else:
+            fasta_map[name] = os.path.join(genome, name)
+    return fasta_map
+    
+class ElandLane(object):
+    """
+    Process an eland result file
+    """
+    XML_VERSION = 1
+    LANE = 'ElandLane'
+    GENOME_MAP = 'GenomeMap'
+    GENOME_ITEM = 'GenomeItem'
+    MAPPED_READS = 'MappedReads'
+    MAPPED_ITEM = 'MappedItem'
+    MATCH_CODES = 'MatchCodes'
+    MATCH_ITEM = 'Code'
+    READS = 'Reads'
+
+    def __init__(self, pathname=None, genome_map=None, xml=None):
+        self.pathname = pathname
+        self._reads = None
+        self._mapped_reads = {}
+        self._match_codes = {}
+        if genome_map is None:
+            genome_map = {}
+        self.genome_map = genome_map
+        
+        if pathname is not None:
+            self._update()
+        if xml is not None:
+            self.set_elements(xml)
+
+    def _update(self):
+        """
+        Actually read the file and actually count the reads
+        """
+        # can't do anything if we don't have a file to process
+        if self.pathname is None:
+            return
+
+        if os.stat(self.pathname)[stat.ST_SIZE] == 0:
+            raise RuntimeError("Eland isn't done, try again later.")
+
+        reads = 0
+        mapped_reads = {}
+
+        match_codes = {'NM':0, 'QC':0, 'RM':0, 
+                       'U0':0, 'U1':0, 'U2':0,
+                       'R0':0, 'R1':0, 'R2':0,
+                      }
+        for line in open(self.pathname):
+            reads += 1
+            fields = line.split()
+            # code = fields[2]
+            # match_codes[code] = match_codes.setdefault(code, 0) + 1
+            # the QC/NM etc codes are in the 3rd field and always present
+            match_codes[fields[2]] += 1
+            # ignore lines that don't have a fasta filename
+            if len(fields) < 7:
+                continue
+            fasta = self.genome_map.get(fields[6], fields[6])
+            mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
+        self._match_codes = match_codes
+        self._mapped_reads = mapped_reads
+        self._reads = reads
+
+    def _get_reads(self):
+        if self._reads is None:
+            self._update()
+        return self._reads
+    reads = property(_get_reads)
+
+    def _get_mapped_reads(self):
+        if self._mapped_reads is None:
+            self._update()
+        return self._mapped_reads
+    mapped_reads = property(_get_mapped_reads)
+
+    def _get_match_codes(self):
+        if self._match_codes is None:
+            self._update()
+        return self._match_codes
+    match_codes = property(_get_match_codes)
+
+    def get_elements(self):
+        lane = ElementTree.Element(ElandLane.LANE, 
+                                   {'version': 
+                                    unicode(ElandLane.XML_VERSION)})
+        genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
+        for k, v in self.genome_map.items():
+            item = ElementTree.SubElement(
+                genome_map, ElandLane.GENOME_ITEM, 
+                {'name':k, 'value':unicode(v)})
+        mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
+        for k, v in self.mapped_reads.items():
+            item = ElementTree.SubElement(
+                mapped_reads, ElandLane.MAPPED_ITEM, 
+                {'name':k, 'value':unicode(v)})
+        match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
+        for k, v in self.match_codes.items():
+            item = ElementTree.SubElement(
+                match_codes, ElandLane.MATCH_ITEM, 
+                {'name':k, 'value':unicode(v)})
+        reads = ElementTree.SubElement(lane, ElandLane.READS)
+        reads.text = unicode(self.reads)
+
+        return lane
+
+    def set_elements(self, tree):
+        if tree.tag != ElandLane.LANE:
+            raise ValueError('Exptecting %s' % (ElandLane.LANE,))
+        for element in tree:
+            tag = element.tag.lower()
+            if tag == ElandLane.GENOME_MAP.lower():
+                for child in element:
+                    name = child.attrib['name']
+                    value = child.attrib['value']
+                    self.genome_map[name] = value
+            elif tag == ElandLane.MAPPED_READS.lower():
+                for child in element:
+                    name = child.attrib['name']
+                    value = child.attrib['value']
+                    self._mapped_reads[name] = int(value)
+            elif tag == ElandLane.MATCH_CODES.lower():
+                for child in element:
+                    name = child.attrib['name']
+                    value = int(child.attrib['value'])
+                    self._match_codes[name] = value
+            elif tag == ElandLane.READS.lower():
+                self._reads = int(element.text)
+            else:
+                logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
+
+
+class ELAND(object):
+    """
+    Summarize information from eland files
+    """
+    XML_VERSION = 1
+
+    ELAND = 'ElandCollection'
+    LANE = 'Lane'
+    LANE_ID = 'id'
+
+    def __init__(self, xml=None):
+        # we need information from the gerald config.xml
+        self.results = {}
+        
+        if xml is not None:
+            self.set_elements(xml)
+
+    def __len__(self):
+        return len(self.results)
+
+    def keys(self):
+        return self.results.keys()
+    
+    def values(self):
+        return self.results.values()
+
+    def items(self):
+        return self.results.items()
+
+    def __getitem__(self, key):
+        return self.results[key]
+
+    def get_elements(self):
+        root = ElementTree.Element(ELAND.ELAND, 
+                                   {'version': unicode(ELAND.XML_VERSION)})
+        for lane_id, lane in self.results.items():
+            eland_lane = lane.get_elements()
+            eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
+            root.append(eland_lane)
+        return root
+
+    def set_elements(self, tree):
+        if tree.tag.lower() != ELAND.ELAND.lower():
+            raise ValueError('Expecting %s', ELAND.ELAND)
+        for element in list(tree):
+            lane_id = element.attrib[ELAND.LANE_ID]
+            lane = ElandLane(xml=element)
+            self.results[lane_id] = lane
+
+def eland(basedir, gerald=None, genome_maps=None):
+    e = ELAND()
+    for pathname in glob(os.path.join(basedir, "*_eland_result.txt")):
+        # extract the sample name
+        path, name = os.path.split(pathname)
+        split_name = name.split('_')
+        sample_name = split_name[0]
+        lane_id = split_name[1]
+
+        if genome_maps is not None:
+            genome_map = genome_maps[lane_id]
+        elif gerald is not None:
+            genome_dir = gerald.lanes[lane_id].eland_genome
+            genome_map = build_genome_fasta_map(genome_dir)
+        else:
+            genome_map = {}
+
+        eland_result = ElandLane(pathname, genome_map)
+        e.results[lane_id] = eland_result
+    return e
diff --git a/gaworkflow/pipeline/runfolder.py b/gaworkflow/pipeline/runfolder.py
new file mode 100644 (file)
index 0000000..1219627
--- /dev/null
@@ -0,0 +1,218 @@
+"""
+Core information needed to inspect a runfolder.
+"""
+from glob import glob
+import logging
+import os
+import re
+import stat
+import sys
+import time
+
+try:
+  from xml.etree import ElementTree
+except ImportError, e:
+  from elementtree import ElementTree
+
+EUROPEAN_STRPTIME = "%d-%m-%Y"
+EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
+VERSION_RE = "([0-9\.]+)"
+USER_RE = "([a-zA-Z0-9]+)"
+LANES_PER_FLOWCELL = 8
+
+from gaworkflow.util.alphanum import alphanum
+from gaworkflow.util.ethelp import indent, flatten
+
+
+class PipelineRun(object):
+    """
+    Capture "interesting" information about a pipeline run
+    """
+    XML_VERSION = 1
+    PIPELINE_RUN = 'PipelineRun'
+    FLOWCELL_ID = 'FlowcellID'
+
+    def __init__(self, pathname=None, firecrest=None, bustard=None, gerald=None, xml=None):
+        self.pathname = pathname
+        self._name = None
+        self._flowcell_id = None
+        self.firecrest = firecrest
+        self.bustard = bustard
+        self.gerald = gerald
+
+        if xml is not None:
+          self.set_elements(xml)
+    
+    def _get_flowcell_id(self):
+        # extract flowcell ID
+        if self._flowcell_id is None:
+          config_dir = os.path.join(self.pathname, 'Config')
+          flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
+         if os.path.exists(flowcell_id_path):
+            flowcell_id_tree = ElementTree.parse(flowcell_id_path)
+            self._flowcell_id = flowcell_id_tree.findtext('Text')
+         else:
+            path_fields = self.pathname.split('_')
+            if len(path_fields) > 0:
+              # guessing last element of filename
+              flowcell_id = path_fields[-1]
+            else:
+              flowcell_id = 'unknown'
+              
+           logging.warning(
+             "Flowcell idwas not found, guessing %s" % (
+                flowcell_id))
+           self._flowcell_id = flowcell_id
+        return self._flowcell_id
+    flowcell_id = property(_get_flowcell_id)
+
+    def get_elements(self):
+        """
+        make one master xml file from all of our sub-components.
+        """
+        root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
+        flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
+        flowcell.text = self.flowcell_id
+        root.append(self.firecrest.get_elements())
+        root.append(self.bustard.get_elements())
+        root.append(self.gerald.get_elements())
+        return root
+
+    def set_elements(self, tree):
+        # this file gets imported by all the others,
+        # so we need to hide the imports to avoid a cyclic imports
+        from gaworkflow.pipeline import firecrest
+        from gaworkflow.pipeline import bustard
+        from gaworkflow.pipeline import gerald
+
+        tag = tree.tag.lower()
+        if tag != PipelineRun.PIPELINE_RUN.lower():
+          raise ValueError('Pipeline Run Expecting %s got %s' % (
+              PipelineRun.PIPELINE_RUN, tag))
+        for element in tree:
+          tag = element.tag.lower()
+          if tag == PipelineRun.FLOWCELL_ID.lower():
+            self._flowcell_id = element.text
+          #ok the xword.Xword.XWORD pattern for module.class.constant is lame
+          elif tag == firecrest.Firecrest.FIRECREST.lower():
+            self.firecrest = firecrest.Firecrest(xml=element)
+          elif tag == bustard.Bustard.BUSTARD.lower():
+            self.bustard = bustard.Bustard(xml=element)
+          elif tag == gerald.Gerald.GERALD.lower():
+            self.gerald = gerald.Gerald(xml=element)
+          else:
+            logging.warn('PipelineRun unrecognized tag %s' % (tag,))
+
+    def _get_run_name(self):
+        """
+        Given a run tuple, find the latest date and use that as our name
+        """
+        if self._name is None:
+          tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
+          timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
+          self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
+        return self._name
+    name = property(_get_run_name)
+
+    def save(self):
+        logging.info("Saving run report "+ self.name)
+        xml = self.get_elements()
+        indent(xml)
+        ElementTree.ElementTree(xml).write(self.name)
+
+    def load(self, filename):
+        logging.info("Loading run report from " + filename)
+        tree = ElementTree.parse(filename).getroot()
+        self.set_elements(tree)
+
+def get_runs(runfolder):
+    """
+    Search through a run folder for all the various sub component runs
+    and then return a PipelineRun for each different combination.
+
+    For example if there are two different GERALD runs, this will
+    generate two different PipelineRun objects, that differ
+    in there gerald component.
+    """
+    from gaworkflow.pipeline import firecrest
+    from gaworkflow.pipeline import bustard
+    from gaworkflow.pipeline import gerald
+
+    datadir = os.path.join(runfolder, 'Data')
+
+    logging.info('Searching for runs in ' + datadir)
+    runs = []
+    for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
+        f = firecrest.firecrest(firecrest_pathname)
+        bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
+        for bustard_pathname in glob(bustard_glob):
+            b = bustard.bustard(bustard_pathname)
+            gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
+            for gerald_pathname in glob(gerald_glob):
+                try:
+                    g = gerald.gerald(gerald_pathname)
+                    runs.append(PipelineRun(runfolder, f, b, g))
+                except IOError, e:
+                    print "Ignoring", str(e)
+    return runs
+                
+    
+def extract_run_parameters(runs):
+    """
+    Search through runfolder_path for various runs and grab their parameters
+    """
+    for run in runs:
+      run.save()
+
+def summarize_mapped_reads(mapped_reads):
+    """
+    Summarize per chromosome reads into a genome count
+    But handle spike-in/contamination symlinks seperately.
+    """
+    summarized_reads = {}
+    genome_reads = 0
+    genome = 'unknown'
+    for k, v in mapped_reads.items():
+        path, k = os.path.split(k)
+        if len(path) > 0:
+            genome = path
+            genome_reads += v
+        else:
+            summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
+    summarized_reads[genome] = genome_reads
+    return summarized_reads
+
+def summary_report(runs):
+    """
+    Summarize cluster numbers and mapped read counts for a runfolder
+    """
+    report = []
+    for run in runs:
+        # print a run name?
+        report.append('Summary for %s' % (run.name,))
+       # sort the report
+       eland_keys = run.gerald.eland_results.results.keys()
+       eland_keys.sort(alphanum)
+
+        lane_results = run.gerald.summary.lane_results
+       for lane_id in eland_keys:
+           result = run.gerald.eland_results.results[lane_id]
+            report.append("Sample name %s" % (result.sample_name))
+            report.append("Lane id %s" % (result.lane_id,))
+            cluster = lane_results[result.lane_id].cluster
+            report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
+            report.append("Total Reads: %d" % (result.reads))
+            mc = result._match_codes
+           report.append("No Match: %d" % (mc['NM']))
+           report.append("QC Failed: %d" % (mc['QC']))
+            report.append('Unique (0,1,2 mismatches) %d %d %d' % \
+                          (mc['U0'], mc['U1'], mc['U2']))
+            report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
+                          (mc['R0'], mc['R1'], mc['R2']))
+            report.append("Mapped Reads")
+            mapped_reads = summarize_mapped_reads(result.mapped_reads)
+            for name, counts in mapped_reads.items():
+              report.append("  %s: %d" % (name, counts))
+            report.append('---')
+            report.append('')
+        return os.linesep.join(report)
diff --git a/gaworkflow/pipeline/test/test_runfolder.py b/gaworkflow/pipeline/test/test_runfolder.py
new file mode 100644 (file)
index 0000000..d4c2fd0
--- /dev/null
@@ -0,0 +1,597 @@
+#!/usr/bin/env python
+
+from datetime import datetime
+import os
+import tempfile
+import shutil
+import unittest
+
+from gaworkflow.pipeline import firecrest
+from gaworkflow.pipeline import bustard
+from gaworkflow.pipeline import gerald
+from gaworkflow.pipeline import runfolder
+from gaworkflow.pipeline.runfolder import ElementTree
+
+
+def make_flowcell_id(runfolder_dir, flowcell_id=None):
+    if flowcell_id is None:
+        flowcell_id = '207BTAAXY'
+
+    config = """<?xml version="1.0"?>
+<FlowcellId>
+  <Text>%s</Text>
+</FlowcellId>""" % (flowcell_id,)
+    config_dir = os.path.join(runfolder_dir, 'Config')
+    
+    if not os.path.exists(config_dir):
+        os.mkdir(config_dir)
+    pathname = os.path.join(config_dir, 'FlowcellId.xml')
+    f = open(pathname,'w')
+    f.write(config)
+    f.close()
+
+def make_matrix(matrix_dir):
+    contents = """# Auto-generated frequency response matrix
+> A
+> C
+> G
+> T
+0.77 0.15 -0.04 -0.04 
+0.76 1.02 -0.05 -0.06 
+-0.10 -0.10 1.17 -0.03 
+-0.13 -0.12 0.80 1.27 
+"""
+    s_matrix = os.path.join(matrix_dir, 's_matrix.txt')
+    f = open(s_matrix, 'w')
+    f.write(contents)
+    f.close()
+    
+def make_phasing_params(bustard_dir):
+    for lane in range(1,9):
+        pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane))
+        f = open(pathname, 'w')
+        f.write("""<Parameters>
+  <Phasing>0.009900</Phasing>
+  <Prephasing>0.003500</Prephasing>
+</Parameters>
+""")
+        f.close()
+
+def make_gerald_config(gerald_dir):
+    config_xml = """<RunParameters>
+<ChipWideRunParameters>
+  <ANALYSIS>default</ANALYSIS>
+  <BAD_LANES></BAD_LANES>
+  <BAD_TILES></BAD_TILES>
+  <CONTAM_DIR></CONTAM_DIR>
+  <CONTAM_FILE></CONTAM_FILE>
+  <ELAND_GENOME>Need_to_specify_ELAND_genome_directory</ELAND_GENOME>
+  <ELAND_MULTIPLE_INSTANCES>8</ELAND_MULTIPLE_INSTANCES>
+  <ELAND_REPEAT></ELAND_REPEAT>
+  <EMAIL_DOMAIN>domain.com</EMAIL_DOMAIN>
+  <EMAIL_LIST>diane</EMAIL_LIST>
+  <EMAIL_SERVER>localhost:25</EMAIL_SERVER>
+  <EXPT_DIR>/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane</EXPT_DIR>
+  <EXPT_DIR_ROOT>/home/diane/gec</EXPT_DIR_ROOT>
+  <FORCE>1</FORCE>
+  <GENOME_DIR>/home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes</GENOME_DIR>
+  <GENOME_FILE>Need_to_specify_genome_file_name</GENOME_FILE>
+  <HAMSTER_FLAG>genome</HAMSTER_FLAG>
+  <OUT_DIR>/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane</OUT_DIR>
+  <POST_RUN_COMMAND></POST_RUN_COMMAND>
+  <PRB_FILE_SUFFIX>_prb.txt</PRB_FILE_SUFFIX>
+  <PURE_BASES>12</PURE_BASES>
+  <QF_PARAMS>'((CHASTITY&gt;=0.6))'</QF_PARAMS>
+  <QHG_FILE_SUFFIX>_qhg.txt</QHG_FILE_SUFFIX>
+  <QUALITY_FORMAT>--symbolic</QUALITY_FORMAT>
+  <READ_LENGTH>32</READ_LENGTH>
+  <SEQUENCE_FORMAT>--scarf</SEQUENCE_FORMAT>
+  <SEQ_FILE_SUFFIX>_seq.txt</SEQ_FILE_SUFFIX>
+  <SIG_FILE_SUFFIX_DEPHASED>_sig2.txt</SIG_FILE_SUFFIX_DEPHASED>
+  <SIG_FILE_SUFFIX_NOT_DEPHASED>_sig.txt</SIG_FILE_SUFFIX_NOT_DEPHASED>
+  <SOFTWARE_VERSION>@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp</SOFTWARE_VERSION>
+  <TILE_REGEX>s_[1-8]_[0-9][0-9][0-9][0-9]</TILE_REGEX>
+  <TILE_ROOT>s</TILE_ROOT>
+  <TIME_STAMP>Sat Apr 19 19:08:30 2008</TIME_STAMP>
+  <TOOLS_DIR>/home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald</TOOLS_DIR>
+  <USE_BASES>all</USE_BASES>
+  <WEB_DIR_ROOT>http://host.domain.com/yourshare/</WEB_DIR_ROOT>
+</ChipWideRunParameters>
+<LaneSpecificRunParameters>
+  <ANALYSIS>
+    <s_1>eland</s_1>
+    <s_2>eland</s_2>
+    <s_3>eland</s_3>
+    <s_4>eland</s_4>
+    <s_5>eland</s_5>
+    <s_6>eland</s_6>
+    <s_7>eland</s_7>
+    <s_8>eland</s_8>
+  </ANALYSIS>
+  <ELAND_GENOME>
+    <s_1>/g/dm3</s_1>
+    <s_2>/g/equcab1</s_2>
+    <s_3>/g/equcab1</s_3>
+    <s_4>/g/canfam2</s_4>
+    <s_5>/g/hg18</s_5>
+    <s_6>/g/hg18</s_6>
+    <s_7>/g/hg18</s_7>
+    <s_8>/g/hg18</s_8>
+  </ELAND_GENOME>
+  <READ_LENGTH>
+    <s_1>32</s_1>
+    <s_2>32</s_2>
+    <s_3>32</s_3>
+    <s_4>32</s_4>
+    <s_5>32</s_5>
+    <s_6>32</s_6>
+    <s_7>32</s_7>
+    <s_8>32</s_8>
+  </READ_LENGTH>
+  <USE_BASES>
+    <s_1>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_1>
+    <s_2>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_2>
+    <s_3>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_3>
+    <s_4>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_4>
+    <s_5>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_5>
+    <s_6>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_6>
+    <s_7>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_7>
+    <s_8>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_8>
+  </USE_BASES>
+</LaneSpecificRunParameters>
+</RunParameters>
+"""
+    pathname = os.path.join(gerald_dir, 'config.xml')
+    f = open(pathname,'w')
+    f.write(config_xml)
+    f.close()
+    
+
+def make_summary_htm(gerald_dir):
+    summary_htm = """<!--RUN_TIME Mon Apr 21 11:52:25 2008 -->
+<!--SOFTWARE_VERSION @(#) $Id: jerboa.pl,v 1.31 2007/03/05 17:52:15 km Exp $-->
+<html>
+<body>
+
+<a name="Top"><h2><title>080416_HWI-EAS229_0024_207BTAAXX Summary</title></h2></a>
+<h1>Summary Information For Experiment 080416_HWI-EAS229_0024_207BTAAXX on Machine HWI-EAS229</h1>
+<h2><br></br>Chip Summary<br></br></h2>
+<table border="1" cellpadding="5">
+<tr><td>Machine</td><td>HWI-EAS229</td></tr>
+<tr><td>Run Folder</td><td>080416_HWI-EAS229_0024_207BTAAXX</td></tr>
+<tr><td>Chip ID</td><td>unknown</td></tr>
+</table>
+<h2><br></br>Lane Parameter Summary<br></br></h2>
+<table border="1" cellpadding="5">
+<tr>
+<td>Lane</td>
+<td>Sample ID</td>
+<td>Sample Target</td>
+<td>Sample Type</td>
+<td>Length</td>
+<td>Filter</td>
+<td>Tiles</td>
+</tr>
+<tr>
+<td>1</td>
+<td>unknown</td>
+<td>dm3</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane1">Lane 1</a></td>
+</tr>
+<tr>
+<td>2</td>
+<td>unknown</td>
+<td>equcab1</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane2">Lane 2</a></td>
+</tr>
+<tr>
+<td>3</td>
+<td>unknown</td>
+<td>equcab1</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane3">Lane 3</a></td>
+</tr>
+<tr>
+<td>4</td>
+<td>unknown</td>
+<td>canfam2</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane4">Lane 4</a></td>
+</tr>
+<tr>
+<td>5</td>
+<td>unknown</td>
+<td>hg18</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane5">Lane 5</a></td>
+</tr>
+<tr>
+<td>6</td>
+<td>unknown</td>
+<td>hg18</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane6">Lane 6</a></td>
+</tr>
+<tr>
+<td>7</td>
+<td>unknown</td>
+<td>hg18</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane7">Lane 7</a></td>
+</tr>
+<tr>
+<td>8</td>
+<td>unknown</td>
+<td>hg18</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane8">Lane 8</a></td>
+</tr>
+</table>
+<h2><br></br>Lane Results Summary<br></br></h2>
+<table border="1" cellpadding="5">
+<tr>
+
+<td>Lane </td>
+<td>Clusters </td>
+<td>Av 1st Cycle Int </td>
+<td>% intensity after 20 cycles </td>
+<td>% PF Clusters </td>
+<td>% Align (PF) </td>
+<td>Av Alignment Score (PF) </td>
+<td> % Error Rate (PF) </td>
+</tr>
+<tr>
+<td>1</td>
+<td>17421 +/- 2139</td>
+<td>7230 +/- 801</td>
+<td>23.73 +/- 10.79</td>
+<td>13.00 +/- 22.91</td>
+<td>32.03 +/- 18.45</td>
+<td>6703.57 +/- 3753.85</td>
+<td>4.55 +/- 4.81</td>
+</tr>
+<tr>
+<td>2</td>
+<td>20311 +/- 2402</td>
+<td>7660 +/- 678</td>
+<td>17.03 +/- 4.40</td>
+<td>40.74 +/- 30.33</td>
+<td>29.54 +/- 9.03</td>
+<td>5184.02 +/- 1631.54</td>
+<td>3.27 +/- 3.94</td>
+</tr>
+<tr>
+<td>3</td>
+<td>20193 +/- 2399</td>
+<td>7700 +/- 797</td>
+<td>15.75 +/- 3.30</td>
+<td>56.56 +/- 17.16</td>
+<td>27.33 +/- 7.48</td>
+<td>4803.49 +/- 1313.31</td>
+<td>3.07 +/- 2.86</td>
+</tr>
+<tr>
+<td>4</td>
+<td>15537 +/- 2531</td>
+<td>7620 +/- 1392</td>
+<td>15.37 +/- 3.79</td>
+<td>63.05 +/- 18.30</td>
+<td>15.88 +/- 4.99</td>
+<td>3162.13 +/- 962.59</td>
+<td>3.11 +/- 2.22</td>
+</tr>
+<tr>
+<td>5</td>
+<td>32047 +/- 3356</td>
+<td>8093 +/- 831</td>
+<td>23.79 +/- 6.18</td>
+<td>53.36 +/- 18.06</td>
+<td>48.04 +/- 13.77</td>
+<td>9866.23 +/- 2877.30</td>
+<td>2.26 +/- 1.16</td>
+</tr>
+<tr>
+<td>6</td>
+<td>32946 +/- 4753</td>
+<td>8227 +/- 736</td>
+<td>24.07 +/- 4.69</td>
+<td>54.65 +/- 12.57</td>
+<td>50.98 +/- 10.54</td>
+<td>10468.86 +/- 2228.53</td>
+<td>2.21 +/- 2.33</td>
+</tr>
+<tr>
+<td>7</td>
+<td>39504 +/- 4171</td>
+<td>8401 +/- 785</td>
+<td>22.55 +/- 4.56</td>
+<td>45.22 +/- 10.34</td>
+<td>48.41 +/- 9.67</td>
+<td>9829.40 +/- 1993.20</td>
+<td>2.26 +/- 1.11</td>
+</tr>
+<tr>
+<td>8</td>
+<td>37998 +/- 3792</td>
+<td>8443 +/- 1211</td>
+<td>39.03 +/- 7.52</td>
+<td>42.16 +/- 12.35</td>
+<td>40.98 +/- 14.89</td>
+<td>8128.87 +/- 3055.34</td>
+<td>3.57 +/- 2.77</td>
+</tr>
+</table>
+</body>
+</html>
+"""
+    pathname = os.path.join(gerald_dir, 'Summary.htm')
+    f = open(pathname, 'w')
+    f.write(summary_htm)
+    f.close()
+
+def make_eland_results(gerald_dir):
+    eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759    ACATAGNCACAGACATAAACATAGACATAGAC U0      1       1       3       chrUextra.fa    28189829        R       D.
+>HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA  U1      0       1       0       chr2L.fa        8796855 R       DD      24T
+>HWI-EAS229_24_207BTAAXX:1:7:776:582    AGCTCANCCGATCGAAAACCTCNCCAAGCAAT        NM      0       0       0
+>HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA        U1      0       1       0       Lambda.fa        8796855 R       DD      24T
+"""
+    for i in range(1,9):
+        pathname = os.path.join(gerald_dir, 
+                                's_%d_eland_result.txt' % (i,))
+        f = open(pathname, 'w')
+        f.write(eland_result)
+        f.close()
+                     
+class RunfolderTests(unittest.TestCase):
+    """
+    Test components of the runfolder processing code
+    which includes firecrest, bustard, and gerald
+    """
+    def setUp(self):
+        # make a fake runfolder directory
+        self.temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_')
+
+        self.runfolder_dir = os.path.join(self.temp_dir, 
+                                          '080102_HWI-EAS229_0010_207BTAAXX')
+        os.mkdir(self.runfolder_dir)
+
+        self.data_dir = os.path.join(self.runfolder_dir, 'Data')
+        os.mkdir(self.data_dir)
+
+        self.firecrest_dir = os.path.join(self.data_dir, 
+                               'C1-33_Firecrest1.8.28_12-04-2008_diane'
+                             )
+        os.mkdir(self.firecrest_dir)
+        self.matrix_dir = os.path.join(self.firecrest_dir, 'Matrix')
+        os.mkdir(self.matrix_dir)
+        make_matrix(self.matrix_dir)
+
+        self.bustard_dir = os.path.join(self.firecrest_dir, 
+                                        'Bustard1.8.28_12-04-2008_diane')
+        os.mkdir(self.bustard_dir)
+        make_phasing_params(self.bustard_dir)
+        
+        self.gerald_dir = os.path.join(self.bustard_dir,
+                                       'GERALD_12-04-2008_diane')
+        os.mkdir(self.gerald_dir)
+        make_gerald_config(self.gerald_dir)
+        make_summary_htm(self.gerald_dir)
+        make_eland_results(self.gerald_dir)
+
+    def tearDown(self):
+        shutil.rmtree(self.temp_dir)
+
+    def test_firecrest(self):
+        """
+        Construct a firecrest object
+        """
+        f = firecrest.firecrest(self.firecrest_dir)
+        self.failUnlessEqual(f.version, '1.8.28')
+        self.failUnlessEqual(f.start, 1)
+        self.failUnlessEqual(f.stop, 33)
+        self.failUnlessEqual(f.user, 'diane')
+        self.failUnlessEqual(f.date, datetime(2008,4,12))
+
+        xml = f.get_elements()
+        # just make sure that element tree can serialize the tree
+        xml_str = ElementTree.tostring(xml)
+
+        f2 = firecrest.Firecrest(xml=xml)
+        self.failUnlessEqual(f.version, f2.version)
+        self.failUnlessEqual(f.start,   f2.start)
+        self.failUnlessEqual(f.stop,    f2.stop)
+        self.failUnlessEqual(f.user,    f2.user)
+        self.failUnlessEqual(f.date,    f2.date)
+
+    def test_bustard(self):
+        """
+        construct a bustard object
+        """
+        b = bustard.bustard(self.bustard_dir)
+        self.failUnlessEqual(b.version, '1.8.28')
+        self.failUnlessEqual(b.date,    datetime(2008,4,12))
+        self.failUnlessEqual(b.user,    'diane')
+        self.failUnlessEqual(len(b.phasing), 8)
+        self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099)
+        
+        xml = b.get_elements()
+        b2 = bustard.Bustard(xml=xml)
+        self.failUnlessEqual(b.version, b2.version)
+        self.failUnlessEqual(b.date,    b2.date )
+        self.failUnlessEqual(b.user,    b2.user)
+        self.failUnlessEqual(len(b.phasing), len(b2.phasing))
+        for key in b.phasing.keys():
+            self.failUnlessEqual(b.phasing[key].lane, 
+                                 b2.phasing[key].lane)
+            self.failUnlessEqual(b.phasing[key].phasing, 
+                                 b2.phasing[key].phasing)
+            self.failUnlessEqual(b.phasing[key].prephasing, 
+                                 b2.phasing[key].prephasing)
+
+    def test_gerald(self):
+        # need to update gerald and make tests for it
+        g = gerald.gerald(self.gerald_dir) 
+
+        self.failUnlessEqual(g.version, 
+            '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp')
+        self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30))
+        self.failUnlessEqual(len(g.lanes), len(g.lanes.keys()))
+        self.failUnlessEqual(len(g.lanes), len(g.lanes.items()))
+
+        
+        # list of genomes, matches what was defined up in 
+        # make_gerald_config.
+        # the first None is to offset the genomes list to be 1..9
+        # instead of pythons default 0..8
+        genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2',
+                         '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ]
+
+        # test lane specific parameters from gerald config file
+        for i in range(1,9):
+            cur_lane = g.lanes[str(i)]
+            self.failUnlessEqual(cur_lane.analysis, 'eland')
+            self.failUnlessEqual(cur_lane.eland_genome, genomes[i])
+            self.failUnlessEqual(cur_lane.read_length, '32')
+            self.failUnlessEqual(cur_lane.use_bases, 'Y'*32)
+
+        # test data extracted from summary file
+        clusters = [None, 
+                    (17421, 2139), (20311, 2402), (20193, 2399), (15537, 2531),
+                    (32047, 3356), (32946, 4753), (39504, 4171), (37998, 3792)]
+
+        for i in range(1,9):
+            summary_lane = g.summary[str(i)]
+            self.failUnlessEqual(summary_lane.cluster, clusters[i])
+            self.failUnlessEqual(summary_lane.lane, str(i))
+
+        xml = g.get_elements()
+        # just make sure that element tree can serialize the tree
+        xml_str = ElementTree.tostring(xml)
+        g2 = gerald.Gerald(xml=xml)
+
+        # do it all again after extracting from the xml file
+        self.failUnlessEqual(g.version, g2.version)
+        self.failUnlessEqual(g.date, g2.date)
+        self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys()))
+        self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items()))
+
+        # test lane specific parameters from gerald config file
+        for i in range(1,9):
+            g_lane = g.lanes[str(i)]
+            g2_lane = g2.lanes[str(i)]
+            self.failUnlessEqual(g_lane.analysis, g2_lane.analysis)
+            self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome)
+            self.failUnlessEqual(g_lane.read_length, g2_lane.read_length)
+            self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases)
+
+        # test (some) summary elements
+        for i in range(1,9):
+            g_summary = g.summary[str(i)]
+            g2_summary = g2.summary[str(i)]
+            self.failUnlessEqual(g_summary.cluster, g2_summary.cluster)
+            self.failUnlessEqual(g_summary.lane, g2_summary.lane)
+
+            g_eland = g.eland_results
+            g2_eland = g2.eland_results
+            for lane in g_eland.keys():
+                self.failUnlessEqual(g_eland[lane].reads, 
+                                     g2_eland[lane].reads)
+                self.failUnlessEqual(len(g_eland[lane].mapped_reads), 
+                                     len(g2_eland[lane].mapped_reads))
+                for k in g_eland[lane].mapped_reads.keys():
+                    self.failUnlessEqual(g_eland[lane].mapped_reads[k],
+                                         g2_eland[lane].mapped_reads[k])
+
+                self.failUnlessEqual(len(g_eland[lane].match_codes), 
+                                     len(g2_eland[lane].match_codes))
+                for k in g_eland[lane].match_codes.keys():
+                    self.failUnlessEqual(g_eland[lane].match_codes[k],
+                                         g2_eland[lane].match_codes[k])
+
+
+    def test_eland(self):
+        dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa',
+                    'chr2L.fa': 'dm3/chr2L.fa',
+                    'Lambda.fa': 'Lambda.fa'}
+        genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map,
+                        '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map }
+        eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps)
+        
+        for i in range(1,9):
+            lane = eland[str(i)]
+            self.failUnlessEqual(lane.reads, 4)
+            self.failUnlessEqual(len(lane.mapped_reads), 3)
+            self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1)
+            self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1)
+            self.failUnlessEqual(lane.match_codes['U1'], 2)
+            self.failUnlessEqual(lane.match_codes['NM'], 1)
+
+        xml = eland.get_elements()
+        # just make sure that element tree can serialize the tree
+        xml_str = ElementTree.tostring(xml)
+        e2 = gerald.ELAND(xml=xml)
+
+        for i in range(1,9):
+            l1 = eland[str(i)]
+            l2 = e2[str(i)]
+            self.failUnlessEqual(l1.reads, l2.reads)
+            self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads))
+            self.failUnlessEqual(len(l1.mapped_reads), 3)
+            for k in l1.mapped_reads.keys():
+                self.failUnlessEqual(l1.mapped_reads[k],
+                                     l2.mapped_reads[k])
+
+            self.failUnlessEqual(len(l1.match_codes), 9)
+            self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes))
+            for k in l1.match_codes.keys():
+                self.failUnlessEqual(l1.match_codes[k], 
+                                     l2.match_codes[k])
+
+    def test_runfolder(self):
+        runs = runfolder.get_runs(self.runfolder_dir)
+        
+        # do we get the flowcell id from the filename?
+        self.failUnlessEqual(len(runs), 1)
+        self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml')
+
+        # do we get the flowcell id from the FlowcellId.xml file
+        make_flowcell_id(self.runfolder_dir, '207BTAAXY')
+        runs = runfolder.get_runs(self.runfolder_dir)
+        self.failUnlessEqual(len(runs), 1)
+        self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml')
+        
+        r1 = runs[0]
+        xml = r1.get_elements()
+        xml_str = ElementTree.tostring(xml)
+
+        r2 = runfolder.PipelineRun(xml=xml)
+        self.failUnlessEqual(r1.name, r2.name)
+        self.failIfEqual(r2.firecrest, None)
+        self.failIfEqual(r2.bustard, None)
+        self.failIfEqual(r2.gerald, None)
+        
+
+def suite():
+    return unittest.makeSuite(RunfolderTests,'test')
+
+if __name__ == "__main__":
+    unittest.main(defaultTest="suite")
+    
diff --git a/gaworkflow/util/ethelp.py b/gaworkflow/util/ethelp.py
new file mode 100644 (file)
index 0000000..19f6c9f
--- /dev/null
@@ -0,0 +1,32 @@
+"""
+ElementTree helper functions
+"""
+def indent(elem, level=0):
+    """
+    reformat an element tree to be 'pretty' (indented)
+    """
+    i = "\n" + level*"  "
+    if len(elem):
+        if not elem.text or not elem.text.strip():
+            elem.text = i + "  "
+        for child in elem:
+            indent(child, level+1)
+        # we don't want the closing tag indented too far
+        child.tail = i
+        if not elem.tail or not elem.tail.strip():
+            elem.tail = i
+    else:
+        if level and (not elem.tail or not elem.tail.strip()):
+            elem.tail = i
+
+def flatten(elem, include_tail=0):
+    """
+    Extract the text from an element tree 
+    (AKA extract the text that not part of XML tags)
+    """
+    text = elem.text or ""
+    for e in elem:
+        text += flatten(e, 1)
+    if include_tail and elem.tail: text += elem.tail
+    return text
+
diff --git a/gaworkflow/util/test/test_ethelp.py b/gaworkflow/util/test/test_ethelp.py
new file mode 100644 (file)
index 0000000..46f81a1
--- /dev/null
@@ -0,0 +1,31 @@
+import os
+import unittest
+
+from xml.etree import ElementTree
+from gaworkflow.util.ethelp import indent, flatten
+
+class testETHelper(unittest.TestCase):
+    def setUp(self):
+        self.foo = '<foo><bar>asdf</bar><br/></foo>'
+        self.foo_tree = ElementTree.fromstring(self.foo)
+
+    def test_indent(self):
+        flat_foo = ElementTree.tostring(self.foo_tree)
+        self.failUnlessEqual(len(flat_foo.split('\n')), 1)
+
+        indent(self.foo_tree)
+        pretty_foo = ElementTree.tostring(self.foo_tree)
+        self.failUnlessEqual(len(pretty_foo.split('\n')), 5)
+
+    def test_flatten(self):
+        self.failUnless(flatten(self.foo_tree), 'asdf')
+
+def suite():
+    return unittest.makeSuite(testETHelper, 'test')
+
+if __name__ == "__main__":
+    unittest.main(defaultTest='suite')
+
+
+
+
diff --git a/scripts/runfolder b/scripts/runfolder
new file mode 100644 (file)
index 0000000..13c5cad
--- /dev/null
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+"""
+Runfolder.py can generate a xml file capturing all the 'interesting' parameters from a finished pipeline run. (using the -a option). The information currently being captured includes:
+
+  * Flowcell ID
+  * run dates
+  * start/stop cycle numbers
+  * Firecrest, bustard, gerald version numbers
+  * Eland analysis types, and everything in the eland configuration file.
+  * cluster numbers and other values from the Summary.htm 
+    LaneSpecificParameters table. 
+  * How many reads mapped to a genome from an eland file
+
+The ELAND "mapped reads" counter will also check for eland squashed file
+that were symlinked from another directory. This is so I can track how 
+many reads landed on the genome of interest and on the spike ins. 
+
+Basically my subdirectories something like:
+
+genomes/hg18
+genomes/hg18/chr*.2bpb <- files for hg18 genome
+genomes/hg18/chr*.vld  
+genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
+genomes/spikein 
+
+runfolder.py can also spit out a simple summary report (-s option) 
+that contains the per lane post filter cluster numbers and the mapped 
+read counts. (The report isn't currently very pretty)
+"""
+import logging
+import optparse
+import sys
+
+from gaworkflow.pipeline import runfolder
+        
+def make_parser():
+    usage = 'usage: %prog [options] runfolder_root_dir'
+    parser = optparse.OptionParser(usage)
+    parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
+                      default=False,
+                      help='turn on verbose mode')
+    parser.add_option('-s', '--summary', dest='summary', action='store_true',
+                      default=False,
+                      help='produce summary report')
+    parser.add_option('-a', '--archive', dest='archive', action='store_true',
+                      default=False,
+                      help='generate run configuration archive')
+    return parser
+
+def main(cmdlist=None):
+    parser = make_parser()
+    opt, args = parser.parse_args(cmdlist)
+
+    if len(args) == 0:
+        parser.error('need path to a runfolder')
+    
+    logging.basicConfig()
+    if opt.verbose:
+        root_log = logging.getLogger()
+        root_log.setLevel(logging.INFO)
+
+    for run_dir in args:
+        runs = runfolder.get_runs(run_dir)
+        if opt.summary:
+            print runfolder.summary_report(runs)
+        if opt.archive:
+            runfolder.extract_run_parameters(runs)
+
+    return 0
+
+if __name__ == "__main__":
+  sys.exit(main(sys.argv[1:]))
diff --git a/scripts/runfolder.py b/scripts/runfolder.py
deleted file mode 100644 (file)
index ffc32a6..0000000
+++ /dev/null
@@ -1,663 +0,0 @@
-#!/usr/bin/env python
-"""
-Runfolder.py can generate a xml file capturing all the 'interesting' parameters from a finished pipeline run. (using the -a option). The information currently being captured includes:
-
-  * Flowcell ID
-  * run dates
-  * start/stop cycle numbers
-  * Firecrest, bustard, gerald version numbers
-  * Eland analysis types, and everything in the eland configuration file.
-  * cluster numbers and other values from the Summary.htm 
-    LaneSpecificParameters table. 
-  * How many reads mapped to a genome from an eland file
-
-The ELAND "mapped reads" counter will also check for eland squashed file
-that were symlinked from another directory. This is so I can track how 
-many reads landed on the genome of interest and on the spike ins. 
-
-Basically my subdirectories something like:
-
-genomes/hg18
-genomes/hg18/chr*.2bpb <- files for hg18 genome
-genomes/hg18/chr*.vld  
-genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
-genomes/spikein 
-
-runfolder.py can also spit out a simple summary report (-s option) 
-that contains the per lane post filter cluster numbers and the mapped 
-read counts. (The report isn't currently very pretty)
-"""
-import time
-import logging
-import os
-import re
-import stat
-import sys
-from glob import glob
-from pprint import pprint
-import optparse
-
-try:
-  from xml.etree import ElementTree
-except ImportError, e:
-  from elementtree import ElementTree
-
-from gaworkflow.util.alphanum import alphanum
-EUROPEAN_STRPTIME = "%d-%m-%Y"
-EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
-VERSION_RE = "([0-9\.]+)"
-USER_RE = "([a-zA-Z0-9]+)"
-LANES_PER_FLOWCELL = 8
-
-runfolder_path = '/home/diane/gec/080221_HWI-EAS229_0018_201BHAAXX'
-
-def indent(elem, level=0):
-    """
-    reformat an element tree to be 'pretty' (indented)
-    """
-    i = "\n" + level*"  "
-    if len(elem):
-        if not elem.text or not elem.text.strip():
-            elem.text = i + "  "
-        for child in elem:
-            indent(child, level+1)
-        # we don't want the closing tag indented too far
-        child.tail = i
-        if not elem.tail or not elem.tail.strip():
-            elem.tail = i
-    else:
-        if level and (not elem.tail or not elem.tail.strip()):
-            elem.tail = i
-
-def flatten(elem, include_tail=0):
-    """
-    Extract the text from an element tree 
-    (AKA extract the text that not part of XML tags)
-    """
-    text = elem.text or ""
-    for e in elem:
-        text += flatten(e, 1)
-    if include_tail and elem.tail: text += elem.tail
-    return text
-
-class Firecrest(object):
-    def __init__(self, pathname):
-        self.pathname = pathname
-
-        # parse firecrest directory name
-        path, name = os.path.split(pathname)
-        groups = name.split('_')
-        # grab the start/stop cycle information
-        cycle = re.match("C([0-9]+)-([0-9]+)", groups[0])
-        self.start = int(cycle.group(1))
-        self.stop = int(cycle.group(2))
-        # firecrest version
-        version = re.search(VERSION_RE, groups[1])
-        self.version = (version.group(1))
-        # datetime
-        self.date = time.strptime(groups[2], EUROPEAN_STRPTIME)
-        self.time = time.mktime(self.date)
-        # username
-        self.user = groups[3]
-
-        # should I parse this deeper than just stashing the 
-        # contents of the matrix file?
-        matrix_pathname = os.path.join(self.pathname, 'Matrix', 's_matrix.txt')
-        self.matrix = open(matrix_pathname, 'r').read()
-        
-    def dump(self):
-        print "Starting cycle:", self.start
-        print "Ending cycle:", self.stop
-        print "Firecrest version:", self.version
-        print "Run date:", self.date
-        print "user:", self.user
-
-    def _get_elements(self):
-        firecrest = ElementTree.Element('Firecrest')
-        version = ElementTree.SubElement(firecrest, 'version')
-        version.text = self.version
-        start_cycle = ElementTree.SubElement(firecrest, 'FirstCycle')
-        start_cycle.text = str(self.start)
-        stop_cycle = ElementTree.SubElement(firecrest, 'LastCycle')
-        stop_cycle.text = str(self.stop)
-        run_date = ElementTree.SubElement(firecrest, 'run_time')
-        run_date.text = str(self.time)
-        matrix = ElementTree.SubElement(firecrest, 'matrix')
-        matrix.text = self.matrix
-        return firecrest
-    elements=property(_get_elements)
-
-class Bustard(object):
-    def __init__(self, pathname):
-        self.pathname = pathname
-
-        path, name = os.path.split(pathname)
-        groups = name.split("_")
-        version = re.search(VERSION_RE, groups[0])
-        self.version = version.group(1)
-        self.date = time.strptime(groups[1], EUROPEAN_STRPTIME)
-        self.time = time.mktime(self.date)
-        self.user = groups[2]
-        self.phasing = {}
-        self._load_params()
-
-    def _load_params(self):
-        paramfiles = glob(os.path.join(self.pathname, "params*"))
-        for paramfile in paramfiles:
-            path, name = os.path.split(paramfile)
-            basename, ext = os.path.splitext(name)
-            # the last character of the param filename should be the
-            # lane number
-            lane = int(basename[-1])
-            # we want the whole tree, not just the stuff under
-            # the first tag
-            param_tree = ElementTree.parse(paramfile).getroot()
-            self.phasing[lane] = param_tree
-
-    def dump(self):
-        print "Bustard version:", self.version
-        print "Run date", self.date
-        print "user:", self.user
-        for lane, tree in self.phasing.items():
-            print lane
-            print tree
-
-    def _get_elements(self):
-        bustard = ElementTree.Element('Bustard')
-        version = ElementTree.SubElement(bustard, 'version')
-        version.text = self.version
-        run_date = ElementTree.SubElement(bustard, 'run_time')
-        run_date.text = str(self.time)
-        return bustard
-    elements=property(_get_elements)
-        
-class GERALD(object):
-    """
-    Capture meaning out of the GERALD directory
-    """
-    class LaneParameters(object):
-        """
-        Make it easy to access elements of LaneSpecificRunParameters from python
-        """
-        def __init__(self, tree, key):
-            self._tree = tree.find('LaneSpecificRunParameters')
-            self._key = key
-        
-        def __get_attribute(self, xml_tag):
-            container = self._tree.find(xml_tag)
-            if container is None or \
-               len(container.getchildren()) != LANES_PER_FLOWCELL:
-                raise RuntimeError('GERALD config.xml file changed')
-            lanes = [x.tag.split('_')[1] for x in container.getchildren()]
-            index = lanes.index(self._key)
-            #element = container.find(self._key)
-            element = container[index]
-            return element.text
-        def _get_analysis(self):
-            return self.__get_attribute('ANALYSIS')
-        analysis = property(_get_analysis)
-
-        def _get_eland_genome(self):
-            return self.__get_attribute('ELAND_GENOME')
-        eland_genome = property(_get_eland_genome)
-
-        def _get_read_length(self):
-            return self.__get_attribute('READ_LENGTH')
-        read_length = property(_get_read_length)
-
-        def _get_use_bases(self):
-            return self.__get_attribute('USE_BASES')
-        use_bases = property(_get_use_bases)
-
-    class LaneSpecificRunParameters(object):
-        """
-        Provide access to LaneSpecificRunParameters
-        """
-        def __init__(self, tree):
-            self._tree = tree
-            self._keys = None
-        def __getitem__(self, key):
-            return GERALD.LaneParameters(self._tree, key)
-        def keys(self):
-            if self._keys is None:
-                analysis = self._tree.find('LaneSpecificRunParameters/ANALYSIS')
-                # according to the pipeline specs I think their fields 
-                # are sampleName_laneID, with sampleName defaulting to s
-                # since laneIDs are constant lets just try using 
-                # those consistently.
-                self._keys = [ x.tag.split('_')[1] for x in analysis]
-            return self._keys
-        def values(self):
-            return [ self[x] for x in self.keys() ]
-        def items(self):
-            return zip(self.keys(), self.values())
-        def __len__(self):
-            return len(self.keys)
-
-    def __init__(self, pathname):
-        self.pathname = pathname
-        path, name = os.path.split(pathname)
-        config_pathname = os.path.join(self.pathname, 'config.xml')
-        self.tree = ElementTree.parse(config_pathname).getroot()
-        self.version = self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
-
-        date = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
-        self.date = time.strptime(date)
-        self.time = time.mktime(self.date)
-        
-        # parse Summary.htm file
-        summary_pathname = os.path.join(self.pathname, 'Summary.htm')
-        self.summary = Summary(summary_pathname)
-        self.lanes = GERALD.LaneSpecificRunParameters(self.tree)
-        self.eland_results = ELAND(self, self.pathname)
-
-    def dump(self):
-        """
-        Debugging function, report current object
-        """
-        print 'Gerald version:', self.version
-        print 'Gerald run date:', self.date
-        print 'Gerald config.xml:', self.tree
-        self.summary.dump()
-
-    def _get_elements(self):
-        gerald = ElementTree.Element('Gerald')
-        gerald.append(self.tree)
-        gerald.append(self.summary.elements)
-        return gerald
-    elements = property(_get_elements)
-
-def tonumber(v):
-    """
-    Convert a value to int if its an int otherwise a float.
-    """
-    try:
-        v = int(v)
-    except ValueError, e:
-        v = float(v)
-    return v
-
-def parse_mean_range(value):
-    """
-    Parse values like 123 +/- 4.5
-    """
-    if value.strip() == 'unknown':
-       return 0, 0
-
-    average, pm, deviation = value.split()
-    if pm != '+/-':
-        raise RuntimeError("Summary.htm file format changed")
-    return tonumber(average), tonumber(deviation)
-
-def mean_range_element(parent, name, mean, deviation):
-    """
-    Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
-    """
-    element = ElementTree.SubElement(parent, name,
-                                     { 'mean': str(mean),
-                                       'deviation': str(deviation)})
-    return element
-
-class LaneResultSummary(object):
-    """
-    Parse the LaneResultSummary table out of Summary.htm
-    Mostly for the cluster number
-    """
-    def __init__(self, row_element):
-        data = [ flatten(x) for x in row_element ]
-        if len(data) != 8:
-            raise RuntimeError("Summary.htm file format changed")
-
-        self.lane = data[0]
-        self.cluster = parse_mean_range(data[1])
-        self.average_first_cycle_intensity = parse_mean_range(data[2])
-        self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
-        self.percent_pass_filter_clusters = parse_mean_range(data[4])
-        self.percent_pass_filter_align = parse_mean_range(data[5])
-        self.average_alignment_score = parse_mean_range(data[6])
-        self.percent_error_rate = parse_mean_range(data[7])
-
-    def _get_elements(self):
-        lane_result = ElementTree.Element('LaneResultSummary', 
-                                          {'lane': self.lane})
-        cluster = mean_range_element(lane_result, 'Cluster', *self.cluster)
-        first_cycle = mean_range_element(lane_result, 
-                                         'AverageFirstCycleIntensity',
-                                         *self.average_first_cycle_intensity)
-        after_20 = mean_range_element(lane_result,
-                                      'PercentIntensityAfter20Cycles',
-                                      *self.percent_intensity_after_20_cycles)
-        pass_filter = mean_range_element(lane_result,
-                                         'PercentPassFilterClusters',
-                                         *self.percent_pass_filter_clusters)
-        alignment = mean_range_element(lane_result,
-                                       'AverageAlignmentScore',
-                                       *self.average_alignment_score)
-        error_rate = mean_range_element(lane_result,
-                                        'PercentErrorRate',
-                                        *self.percent_error_rate)
-        return lane_result
-    elements = property(_get_elements)
-
-class Summary(object):
-    """
-    Extract some useful information from the Summary.htm file
-    """
-    def __init__(self, pathname):
-        self.pathname = pathname
-        self.tree = ElementTree.parse(pathname).getroot()
-        self.lane_results = {}
-
-        self._extract_lane_results()
-
-    def _extract_lane_results(self):
-        """
-        extract the Lane Results Summary table
-        """
-        if flatten(self.tree.findall('*//h2')[3]) != 'Lane Results Summary':
-            raise RuntimeError("Summary.htm file format changed")
-
-        tables = self.tree.findall('*//table')
-
-        # parse lane result summary
-        lane_summary = tables[2]
-        rows = lane_summary.getchildren()
-        headers = rows[0].getchildren()
-        if flatten(headers[2]) != 'Av 1st Cycle Int ':
-            raise RuntimeError("Summary.htm file format changed")
-
-        for r in rows[1:]:
-            lrs = LaneResultSummary(r)
-            self.lane_results[lrs.lane] = lrs
-
-    def _get_elements(self):
-        summary = ElementTree.Element('Summary')
-        for lane in self.lane_results.values():
-            summary.append(lane.elements)
-        return summary
-    elements = property(_get_elements)
-
-    def dump(self):
-        """
-        Debugging function, report current object
-        """
-        pass
-
-    
-class ELAND(object):
-    """
-    Summarize information from eland files
-    """
-    class ElandResult(object):
-        """
-        Process an eland result file
-        """
-        def __init__(self, gerald, pathname):
-            self.gerald = gerald
-            self.pathname = pathname
-            # extract the sample name
-            path, name = os.path.split(self.pathname)
-            split_name = name.split('_')
-            self.sample_name = split_name[0]
-            self.lane_id = split_name[1]
-            self._reads = None
-            self._mapped_reads = None
-            self._fasta_map = {}
-
-        def _build_fasta_map(self, genome_dir):
-            # build fasta to fasta file map
-            genome = genome_dir.split(os.path.sep)[-1]
-            fasta_map = {}
-            for vld_file in glob(os.path.join(genome_dir, '*.vld')):
-                is_link = False
-                if os.path.islink(vld_file):
-                    is_link = True
-                vld_file = os.path.realpath(vld_file)
-               path, vld_name = os.path.split(vld_file)
-                name, ext = os.path.splitext(vld_name)
-                if is_link:
-                    fasta_map[name] = name
-                else:
-                    fasta_map[name] = os.path.join(genome, name)
-            self._fasta_map = fasta_map
-
-        def _update(self):
-            """
-            Actually read the file and actually count the reads
-            """
-            if os.stat(self.pathname)[stat.ST_SIZE] == 0:
-                raise RuntimeError("Eland isn't done, try again later.")
-            
-            reads = 0
-            mapped_reads = {}
-            genome_dir = self.gerald.lanes[self.lane_id].eland_genome
-            self._build_fasta_map(genome_dir)
-            match_codes = {'NM':0, 'QC':0, 'RM':0, 
-                           'U0':0, 'U1':0, 'U2':0,
-                           'R0':0, 'R1':0, 'R2':0,
-                          }
-            for line in open(self.pathname):
-                reads += 1
-                fields = line.split()
-               # code = fields[2]
-                # match_codes[code] = match_codes.setdefault(code, 0) + 1
-               # the QC/NM etc codes are in the 3rd field and always present
-                match_codes[fields[2]] += 1
-                # ignore lines that don't have a fasta filename
-                if len(fields) < 7:
-                    continue
-                fasta = self._fasta_map.get(fields[6], fields[6])
-                mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
-            self._match_codes = match_codes
-            self._mapped_reads = mapped_reads
-            self._reads = reads
-        
-        def _get_reads(self):
-            if self._reads is None:
-                self._update()
-            return self._reads
-        reads = property(_get_reads)
-
-        def _get_mapped_reads(self):
-            if self._mapped_reads is None:
-                self._update()
-            return self._mapped_reads
-        mapped_reads = property(_get_mapped_reads)
-            
-    def __init__(self, gerald, basedir):
-        # we need information from the gerald config.xml
-        self.gerald = gerald
-        self.results = {}
-        for f in glob(os.path.join(basedir, "*_eland_result.txt")):
-            eland_result = ELAND.ElandResult(gerald, f)
-            self.results[eland_result.lane_id] = eland_result
-
-class PipelineRun(object):
-    """
-    Capture "interesting" information about a pipeline run
-    """
-    def __init__(self, pathname, firecrest, bustard, gerald):
-        self.pathname = pathname
-        self._name = None
-        self._flowcell_id = None
-        self.firecrest = firecrest
-        self.bustard = bustard
-        self.gerald = gerald
-    
-    def _get_flowcell_id(self):
-        # extract flowcell ID
-        if self._flowcell_id is None:
-          config_dir = os.path.join(self.pathname, 'Config')
-          flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
-         if os.path.exists(flowcell_id_path):
-            flowcell_id_tree = ElementTree.parse(flowcell_id_path)
-            self._flowcell_id = flowcell_id_tree.findtext('Text')
-         else:
-           logging.warning(
-             "Unable to determine flowcell id as %s was not found" % (
-                flowcell_id_path))
-           self._flowcell_id = "unknown"
-        return self._flowcell_id
-    flowcell_id = property(_get_flowcell_id)
-
-    def _get_xml(self):
-        """
-        make one master xml file from all of our sub-components.
-        """
-        root = ElementTree.Element('PipelineRun')
-        flowcell = ElementTree.SubElement(root, 'FlowcellID')
-        flowcell.text = self.flowcell_id
-        root.append(self.firecrest.elements)
-        root.append(self.bustard.elements)
-        root.append(self.gerald.elements)
-        return root
-
-    def _get_pretty_xml(self):
-        """
-        Generate indented xml file
-        """
-        root = self._get_xml()
-        indent(root)
-        return root
-    xml = property(_get_pretty_xml)
-
-    def _get_run_name(self):
-        """
-        Given a run tuple, find the latest date and use that as our name
-        """
-        if self._name is None:
-          tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
-          timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
-          self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
-        return self._name
-    name = property(_get_run_name)
-
-    def save(self):
-        logging.info("Saving run report "+ self.name)
-        ElementTree.ElementTree(self.xml).write(self.name)
-
-def get_runs(runfolder):
-    """
-    Search through a run folder for all the various sub component runs
-    and then return a PipelineRun for each different combination.
-
-    For example if there are two different GERALD runs, this will
-    generate two different PipelineRun objects, that differ
-    in there gerald component.
-    """
-    datadir = os.path.join(runfolder, 'Data')
-
-    logging.info('Searching for runs in ' + datadir)
-    runs = []
-    for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
-        f = Firecrest(firecrest_pathname)
-        bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
-        for bustard_pathname in glob(bustard_glob):
-            b = Bustard(bustard_pathname)
-            gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
-            for gerald_pathname in glob(gerald_glob):
-                try:
-                    g = GERALD(gerald_pathname)
-                    runs.append(PipelineRun(runfolder, f, b, g))
-                except IOError, e:
-                    print "Ignoring", str(e)
-    return runs
-                
-    
-def extract_run_parameters(runs):
-    """
-    Search through runfolder_path for various runs and grab their parameters
-    """
-    for run in runs:
-      run.save()
-
-def summarize_mapped_reads(mapped_reads):
-    """
-    Summarize per chromosome reads into a genome count
-    But handle spike-in/contamination symlinks seperately.
-    """
-    summarized_reads = {}
-    genome_reads = 0
-    genome = 'unknown'
-    for k, v in mapped_reads.items():
-        path, k = os.path.split(k)
-        if len(path) > 0:
-            genome = path
-            genome_reads += v
-        else:
-            summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
-    summarized_reads[genome] = genome_reads
-    return summarized_reads
-
-def summary_report(runs):
-    """
-    Summarize cluster numbers and mapped read counts for a runfolder
-    """
-    report = []
-    for run in runs:
-        # print a run name?
-        report.append('Summary for %s' % (run.name,))
-       # sort the report
-       eland_keys = run.gerald.eland_results.results.keys()
-       eland_keys.sort(alphanum)
-
-        lane_results = run.gerald.summary.lane_results
-       for lane_id in eland_keys:
-           result = run.gerald.eland_results.results[lane_id]
-            report.append("Sample name %s" % (result.sample_name))
-            report.append("Lane id %s" % (result.lane_id,))
-            cluster = lane_results[result.lane_id].cluster
-            report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
-            report.append("Total Reads: %d" % (result.reads))
-            mc = result._match_codes
-           report.append("No Match: %d" % (mc['NM']))
-           report.append("QC Failed: %d" % (mc['QC']))
-            report.append('Unique (0,1,2 mismatches) %d %d %d' % \
-                          (mc['U0'], mc['U1'], mc['U2']))
-            report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
-                          (mc['R0'], mc['R1'], mc['R2']))
-            report.append("Mapped Reads")
-            mapped_reads = summarize_mapped_reads(result.mapped_reads)
-            for name, counts in mapped_reads.items():
-              report.append("  %s: %d" % (name, counts))
-            report.append('---')
-            report.append('')
-        return os.linesep.join(report)
-
-def make_parser():
-    usage = 'usage: %prog [options] runfolder_root_dir'
-    parser = optparse.OptionParser(usage)
-    parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
-                      default=False,
-                      help='turn on verbose mode')
-    parser.add_option('-s', '--summary', dest='summary', action='store_true',
-                      default=False,
-                      help='produce summary report')
-    parser.add_option('-a', '--archive', dest='archive', action='store_true',
-                      default=False,
-                      help='generate run configuration archive')
-    return parser
-
-def main(cmdlist=None):
-    parser = make_parser()
-    opt, args = parser.parse_args(cmdlist)
-
-    if len(args) == 0:
-        parser.error('need path to a runfolder')
-    
-    logging.basicConfig()
-    if opt.verbose:
-        root_log = logging.getLogger()
-        root_log.setLevel(logging.INFO)
-
-    for runfolder in args:
-        runs = get_runs(runfolder)
-        if opt.summary:
-            print summary_report(runs)
-        if opt.archive:
-            extract_run_parameters(runs)
-
-    return 0
-
-if __name__ == "__main__":
-  sys.exit(main(sys.argv[1:]))