Move runfolder analysis classes out of scripts/runfolder.py into seperate files

author Diane Trout <diane@caltech.edu>

Fri, 9 May 2008 03:21:16 +0000 (03:21 +0000)

committer Diane Trout <diane@caltech.edu>

Fri, 9 May 2008 03:21:16 +0000 (03:21 +0000)
author Diane Trout <diane@caltech.edu>
Fri, 9 May 2008 03:21:16 +0000 (03:21 +0000)
committer Diane Trout <diane@caltech.edu>
Fri, 9 May 2008 03:21:16 +0000 (03:21 +0000)
diff --git a/gaworkflow/pipeline/bustard.py b/gaworkflow/pipeline/bustard.py

new file mode 100644 (file)

index 0000000..221617e
--- /dev/null
+++ b/gaworkflow/pipeline/bustard.py
@@ -0,0 +1,145 @@
+
+from datetime import datetime
+from glob import glob
+import logging
+import os
+import time
+import re
+
+from gaworkflow.pipeline.runfolder import \
+   ElementTree, \
+   VERSION_RE, \
+   EUROPEAN_STRPTIME
+
+class Phasing(object):
+    PHASING = 'Phasing'
+    PREPHASING = 'Prephasing'
+
+    def __init__(self, fromfile=None, xml=None):
+        self.lane = None
+        self.phasing = None
+        self.prephasing = None
+
+        if fromfile is not None:
+            self._initialize_from_file(fromfile)
+        elif xml is not None:
+            self.set_elements(xml)
+
+    def _initialize_from_file(self, pathname):
+        path, name = os.path.split(pathname)
+        basename, ext = os.path.splitext(name)
+        # the last character of the param base filename should be the
+        # lane number
+        tree = ElementTree.parse(pathname).getroot()
+        self.set_elements(tree)
+        self.lane = int(basename[-1])
+
+    def get_elements(self):
+        root = ElementTree.Element(Phasing.PHASING, {'lane': str(self.lane)})
+        phasing = ElementTree.SubElement(root, Phasing.PHASING)
+        phasing.text = str(self.phasing)
+        prephasing = ElementTree.SubElement(root, Phasing.PREPHASING)
+        prephasing.text = str(self.prephasing)
+        return root
+
+    def set_elements(self, tree):
+        if tree.tag not in ('Phasing', 'Parameters'):
+            raise ValueError('exptected Phasing or Parameters')
+        lane = tree.attrib.get('lane', None)
+        if lane is not None:
+            self.lane = int(lane)
+        for element in list(tree):
+            if element.tag == Phasing.PHASING:
+                self.phasing = float(element.text)
+            elif element.tag == Phasing.PREPHASING:
+                self.prephasing = float(element.text)
+
+class Bustard(object):
+    XML_VERSION = 1
+
+    # Xml Tags
+    BUSTARD = 'Bustard'
+    SOFTWARE_VERSION = 'version'
+    DATE = 'run_time'
+    USER = 'user'
+    PARAMETERS = 'Parameters'
+
+    def __init__(self, xml=None):
+        self.version = None
+        self.date = datetime.now()
+        self.user = None
+        self.phasing = {}
+
+        if xml is not None:
+            self.set_elements(xml)
+
+    def _get_time(self):
+        return time.mktime(self.date.timetuple())
+    time = property(_get_time, doc='return run time as seconds since epoch')
+
+    def dump(self):
+        print "Bustard version:", self.version
+        print "Run date", self.date
+        print "user:", self.user
+        for lane, tree in self.phasing.items():
+            print lane
+            print tree
+
+    def get_elements(self):
+        root = ElementTree.Element('Bustard', 
+                                   {'version': str(Bustard.XML_VERSION)})
+        version = ElementTree.SubElement(root, Bustard.SOFTWARE_VERSION)
+        version.text = self.version
+        run_date = ElementTree.SubElement(root, Bustard.DATE)
+        run_date.text = str(self.time)
+        user = ElementTree.SubElement(root, Bustard.USER)
+        user.text = self.user
+        params = ElementTree.SubElement(root, Bustard.PARAMETERS)
+        for p in self.phasing.values():
+            params.append(p.get_elements())
+        return root
+
+    def set_elements(self, tree):
+        if tree.tag != Bustard.BUSTARD:
+            raise ValueError('Expected "Bustard" SubElements')
+        xml_version = int(tree.attrib.get('version', 0))
+        if xml_version > Bustard.XML_VERSION:
+            logging.warn('Bustard XML tree is a higher version than this class')
+        for element in list(tree):
+            if element.tag == Bustard.SOFTWARE_VERSION:
+                self.version = element.text
+            elif element.tag == Bustard.DATE:
+                self.date = datetime.fromtimestamp(float(element.text))
+            elif element.tag == Bustard.USER:
+                self.user = element.text
+            elif element.tag == Bustard.PARAMETERS:
+                for param in element:
+                    p = Phasing(xml=param)
+                    self.phasing[p.lane] = p
+            else:
+                raise ValueError("Unrecognized tag: %s" % (element.tag,))
+        
+
+
+def bustard(pathname):
+    """
+    Construct a Bustard object from pathname
+    """
+    b = Bustard()
+    path, name = os.path.split(pathname)
+    groups = name.split("_")
+    version = re.search(VERSION_RE, groups[0])
+    b.version = version.group(1)
+    b.date = datetime.strptime(groups[1], EUROPEAN_STRPTIME)
+    b.user = groups[2]
+    paramfiles = glob(os.path.join(pathname, "params?.xml"))
+    for paramfile in paramfiles:
+        phasing = Phasing(paramfile)
+        assert (phasing.lane >= 1 and phasing.lane <= 8)
+        b.phasing[phasing.lane] = phasing
+    return b
+
+def fromxml(tree):
+    b = Bustard()
+    b.set_elements(tree)
+    return b
diff --git a/gaworkflow/pipeline/firecrest.py b/gaworkflow/pipeline/firecrest.py

new file mode 100644 (file)

index 0000000..a0a30b3
--- /dev/null
+++ b/gaworkflow/pipeline/firecrest.py
@@ -0,0 +1,126 @@
+"""
+Extract information about the Firecrest run
+
+Firecrest - class holding the properties we found
+firecrest - Firecrest factory function initalized from a directory name
+fromxml - Firecrest factory function initalized from an xml dump from
+          the Firecrest object.
+"""
+
+from datetime import datetime
+import os
+import re
+import time
+
+from gaworkflow.pipeline.runfolder import \
+   ElementTree, \
+   VERSION_RE, \
+   EUROPEAN_STRPTIME
+
+class Firecrest(object):
+    XML_VERSION=1
+
+    # xml tag names
+    FIRECREST = 'Firecrest'
+    SOFTWARE_VERSION = 'version'
+    START = 'FirstCycle'
+    STOP = 'LastCycle'
+    DATE = 'run_time'
+    USER = 'user'
+    MATRIX = 'matrix'
+
+    def __init__(self, xml=None):
+        self.start = None
+        self.stop = None
+        self.version = None
+        self.date = datetime.now()
+        self.user = None
+        self.matrix = None
+
+        if xml is not None:
+            self.set_elements(xml)
+        
+    def _get_time(self):
+        return time.mktime(self.date.timetuple())
+    time = property(_get_time, doc='return run time as seconds since epoch')
+
+    def dump(self):
+        print "Starting cycle:", self.start
+        print "Ending cycle:", self.stop
+        print "Firecrest version:", self.version
+        print "Run date:", self.date
+        print "user:", self.user
+
+    def get_elements(self):
+        attribs = {'version': str(Firecrest.XML_VERSION) }
+        root = ElementTree.Element(Firecrest.FIRECREST, attrib=attribs)
+        version = ElementTree.SubElement(root, Firecrest.SOFTWARE_VERSION)
+        version.text = self.version
+        start_cycle = ElementTree.SubElement(root, Firecrest.START)
+        start_cycle.text = str(self.start)
+        stop_cycle = ElementTree.SubElement(root, Firecrest.STOP)
+        stop_cycle.text = str(self.stop)
+        run_date = ElementTree.SubElement(root, Firecrest.DATE)
+        run_date.text = str(self.time)
+        user = ElementTree.SubElement(root, Firecrest.USER)
+        user.text = self.user
+        matrix = ElementTree.SubElement(root, Firecrest.MATRIX)
+        matrix.text = self.matrix
+        return root
+
+    def set_elements(self, tree):
+        if tree.tag != Firecrest.FIRECREST:
+            raise ValueError('Expected "Firecrest" SubElements')
+        xml_version = int(tree.attrib.get('version', 0))
+        if xml_version > Firecrest.XML_VERSION:
+            logging.warn('Firecrest XML tree is a higher version than this class')
+        for element in list(tree):
+            if element.tag == Firecrest.SOFTWARE_VERSION:
+                self.version = element.text
+            elif element.tag == Firecrest.START:
+                self.start = int(element.text)
+            elif element.tag == Firecrest.STOP:
+                self.stop = int(element.text)
+            elif element.tag == Firecrest.DATE:
+                self.date = datetime.fromtimestamp(float(element.text))
+            elif element.tag == Firecrest.USER:
+                self.user = element.text
+            elif element.tag == Firecrest.MATRIX:
+                self.matrix = element.text
+            else:
+                raise ValueError("Unrecognized tag: %s" % (element.tag,))
+
+def firecrest(pathname):
+    """
+    Examine the directory at pathname and initalize a Firecrest object
+    """
+    f = Firecrest()
+
+    # parse firecrest directory name
+    path, name = os.path.split(pathname)
+    groups = name.split('_')
+    # grab the start/stop cycle information
+    cycle = re.match("C([0-9]+)-([0-9]+)", groups[0])
+    f.start = int(cycle.group(1))
+    f.stop = int(cycle.group(2))
+    # firecrest version
+    version = re.search(VERSION_RE, groups[1])
+    f.version = (version.group(1))
+    # datetime
+    f.date = datetime.strptime(groups[2], EUROPEAN_STRPTIME)
+    # username
+    f.user = groups[3]
+
+    # should I parse this deeper than just stashing the 
+    # contents of the matrix file?
+    matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
+    f.matrix = open(matrix_pathname, 'r').read()
+    return f
+
+def fromxml(tree):
+    """
+    Initialize a Firecrest object from an element tree node
+    """
+    f = Firecrest()
+    f.set_elements(tree)
+    return f
diff --git a/gaworkflow/pipeline/gerald.py b/gaworkflow/pipeline/gerald.py

new file mode 100644 (file)

index 0000000..1f15eea
--- /dev/null
+++ b/gaworkflow/pipeline/gerald.py
@@ -0,0 +1,571 @@
+"""
+Provide access to information stored in the GERALD directory.
+"""
+from datetime import datetime, date
+from glob import glob
+import logging
+import os
+import stat
+import time
+
+from gaworkflow.pipeline.runfolder import \
+   ElementTree, \
+   EUROPEAN_STRPTIME, \
+   LANES_PER_FLOWCELL, \
+   VERSION_RE
+from gaworkflow.util.ethelp import indent, flatten
+
+class Gerald(object):
+    """
+    Capture meaning out of the GERALD directory
+    """
+    XML_VERSION = 1
+    GERALD='Gerald'
+    RUN_PARAMETERS='RunParameters'
+    SUMMARY='Summary'
+
+    class LaneParameters(object):
+        """
+        Make it easy to access elements of LaneSpecificRunParameters from python
+        """
+        def __init__(self, gerald, key):
+            self._gerald = gerald
+            self._key = key
+        
+        def __get_attribute(self, xml_tag):
+            subtree = self._gerald.tree.find('LaneSpecificRunParameters')
+            container = subtree.find(xml_tag)
+            if container is None or \
+               len(container.getchildren()) != LANES_PER_FLOWCELL:
+                raise RuntimeError('GERALD config.xml file changed')
+            lanes = [x.tag.split('_')[1] for x in container.getchildren()]
+            index = lanes.index(self._key)
+            #element = container.find(self._key)
+            element = container[index]
+            return element.text
+        def _get_analysis(self):
+            return self.__get_attribute('ANALYSIS')
+        analysis = property(_get_analysis)
+
+        def _get_eland_genome(self):
+            return self.__get_attribute('ELAND_GENOME')
+        eland_genome = property(_get_eland_genome)
+
+        def _get_read_length(self):
+            return self.__get_attribute('READ_LENGTH')
+        read_length = property(_get_read_length)
+
+        def _get_use_bases(self):
+            return self.__get_attribute('USE_BASES')
+        use_bases = property(_get_use_bases)
+
+    class LaneSpecificRunParameters(object):
+        """
+        Provide access to LaneSpecificRunParameters
+        """
+        def __init__(self, gerald):
+            self._gerald = gerald
+            self._keys = None
+        def __getitem__(self, key):
+            return Gerald.LaneParameters(self._gerald, key)
+        def keys(self):
+            if self._keys is None:
+                tree = self._gerald.tree
+                analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
+                # according to the pipeline specs I think their fields 
+                # are sampleName_laneID, with sampleName defaulting to s
+                # since laneIDs are constant lets just try using 
+                # those consistently.
+                self._keys = [ x.tag.split('_')[1] for x in analysis]
+            return self._keys
+        def values(self):
+            return [ self[x] for x in self.keys() ]
+        def items(self):
+            return zip(self.keys(), self.values())
+        def __len__(self):
+            return len(self.keys())
+
+    def __init__(self, xml=None):
+        self.pathname = None
+        self.tree = None
+
+        # parse lane parameters out of the config.xml file
+        self.lanes = Gerald.LaneSpecificRunParameters(self)
+
+        self.summary = None
+        self.eland_results = None
+
+        if xml is not None:
+            self.set_elements(xml)
+
+    def _get_date(self):
+        if self.tree is None:
+            return datetime.today()
+        timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
+        return datetime.strptime(timestamp, '%c')
+    date = property(_get_date)
+
+    def _get_time(self):
+        return time.mktime(self.date.timetuple())
+    time = property(_get_time, doc='return run time as seconds since epoch')
+
+    def _get_version(self):
+        if self.tree is None:
+            return None
+        return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
+    version = property(_get_version)
+
+    def dump(self):
+        """
+        Debugging function, report current object
+        """
+        print 'Gerald version:', self.version
+        print 'Gerald run date:', self.date
+        print 'Gerald config.xml:', self.tree
+        self.summary.dump()
+
+    def get_elements(self):
+        if self.tree is None or self.summary is None:
+            return None
+
+        gerald = ElementTree.Element(Gerald.GERALD, 
+                                     {'version': unicode(Gerald.XML_VERSION)})
+        gerald.append(self.tree)
+        gerald.append(self.summary.get_elements())
+        if self.eland_results:
+            gerald.append(self.eland_results.get_elements())
+        return gerald
+
+    def set_elements(self, tree):
+        if tree.tag !=  Gerald.GERALD:
+            raise ValueError('exptected GERALD')
+        xml_version = int(tree.attrib.get('version', 0))
+        if xml_version > Gerald.XML_VERSION:
+            logging.warn('XML tree is a higher version than this class')
+        for element in list(tree):
+            tag = element.tag.lower()
+            if tag == Gerald.RUN_PARAMETERS.lower():
+                self.tree = element
+            elif tag == Gerald.SUMMARY.lower():
+                self.summary = Summary(xml=element)
+            elif tag == ELAND.ELAND.lower():
+                self.eland_results = ELAND(xml=element)
+            else:
+                logging.warn("Unrecognized tag %s" % (element.tag,))
+        
+
+def gerald(pathname):
+    g = Gerald()
+    g.pathname = pathname
+    path, name = os.path.split(pathname)
+    config_pathname = os.path.join(pathname, 'config.xml')
+    g.tree = ElementTree.parse(config_pathname).getroot()
+
+    # parse Summary.htm file
+    summary_pathname = os.path.join(pathname, 'Summary.htm')
+    g.summary = Summary(summary_pathname)
+    # parse eland files
+    g.eland_results = eland(g.pathname, g)
+    return g
+
+def tonumber(v):
+    """
+    Convert a value to int if its an int otherwise a float.
+    """
+    try:
+        v = int(v)
+    except ValueError, e:
+        v = float(v)
+    return v
+
+def parse_mean_range(value):
+    """
+    Parse values like 123 +/- 4.5
+    """
+    if value.strip() == 'unknown':
+       return 0, 0
+
+    average, pm, deviation = value.split()
+    if pm != '+/-':
+        raise RuntimeError("Summary.htm file format changed")
+    return tonumber(average), tonumber(deviation)
+
+def make_mean_range_element(parent, name, mean, deviation):
+    """
+    Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
+    """
+    element = ElementTree.SubElement(parent, name,
+                                     { 'mean': unicode(mean),
+                                       'deviation': unicode(deviation)})
+    return element
+
+def parse_mean_range_element(element):
+    """
+    Grab mean/deviation out of element
+    """
+    return (tonumber(element.attrib['mean']), 
+            tonumber(element.attrib['deviation']))
+
+class Summary(object):
+    """
+    Extract some useful information from the Summary.htm file
+    """
+    XML_VERSION = 1
+    SUMMARY = 'Summary'
+
+    class LaneResultSummary(object):
+        """
+        Parse the LaneResultSummary table out of Summary.htm
+        Mostly for the cluster number
+        """
+        LANE_RESULT_SUMMARY = 'LaneResultSummary'
+        TAGS = { 
+          'Cluster': 'cluster',
+          'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
+          'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
+          'PercentPassFilterClusters': 'percent_pass_filter_clusters',
+          'PercentPassFilterAlign': 'percent_pass_filter_align',
+          'AverageAlignmentScore': 'average_alignment_score',
+          'PercentErrorRate': 'percent_error_rate'
+        }
+                 
+        def __init__(self, html=None, xml=None):
+            self.lane = None
+            self.cluster = None
+            self.average_first_cycle_intensity = None
+            self.percent_intensity_after_20_cycles = None
+            self.percent_pass_filter_clusters = None
+            self.percent_pass_filter_align = None
+            self.average_alignment_score = None
+            self.percent_error_rate = None
+
+            if html is not None:
+                self.set_elements_from_html(html)
+            if xml is not None:
+                self.set_elements(xml)
+
+        def set_elements_from_html(self, row_element):
+            data = [ flatten(x) for x in row_element ]
+            if len(data) != 8:
+                raise RuntimeError("Summary.htm file format changed")
+
+            self.lane = data[0]
+            self.cluster = parse_mean_range(data[1])
+            self.average_first_cycle_intensity = parse_mean_range(data[2])
+            self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
+            self.percent_pass_filter_clusters = parse_mean_range(data[4])
+            self.percent_pass_filter_align = parse_mean_range(data[5])
+            self.average_alignment_score = parse_mean_range(data[6])
+            self.percent_error_rate = parse_mean_range(data[7])
+
+        def get_elements(self):
+            lane_result = ElementTree.Element(
+                            Summary.LaneResultSummary.LANE_RESULT_SUMMARY, 
+                            {'lane': self.lane})
+            for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
+                element = make_mean_range_element(
+                    lane_result,
+                    tag,
+                    *getattr(self, variable_name)
+                )
+            return lane_result
+
+        def set_elements(self, tree):
+            if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
+                raise ValueError('Expected %s' % (
+                        Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
+            self.lane = tree.attrib['lane']
+            tags = Summary.LaneResultSummary.TAGS
+            for element in list(tree):
+                try:
+                    variable_name = tags[element.tag]
+                    setattr(self, variable_name, 
+                            parse_mean_range_element(element))
+                except KeyError, e:
+                    logging.warn('Unrecognized tag %s' % (element.tag,))
+
+    def __init__(self, filename=None, xml=None):
+        self.lane_results = {}
+
+        if filename is not None:
+            self._extract_lane_results(filename)
+        if xml is not None:
+            self.set_elements(xml)
+
+    def __getitem__(self, key):
+        return self.lane_results[key]
+
+    def __len__(self):
+        return len(self.lane_results)
+
+    def keys(self):
+        return self.lane_results.keys()
+
+    def values(self):
+        return self.lane_results.values()
+
+    def items(self):
+        return self.lane_results.items()
+
+    def _extract_lane_results(self, pathname):
+        """
+        extract the Lane Results Summary table
+        """
+        tree = ElementTree.parse(pathname).getroot()
+        if flatten(tree.findall('*//h2')[3]) != 'Lane Results Summary':
+            raise RuntimeError("Summary.htm file format changed")
+
+        tables = tree.findall('*//table')
+
+        # parse lane result summary
+        lane_summary = tables[2]
+        rows = lane_summary.getchildren()
+        headers = rows[0].getchildren()
+        if flatten(headers[2]) != 'Av 1st Cycle Int ':
+            raise RuntimeError("Summary.htm file format changed")
+
+        for r in rows[1:]:
+            lrs = Summary.LaneResultSummary(html=r)
+            self.lane_results[lrs.lane] = lrs
+
+    def get_elements(self):
+        summary = ElementTree.Element(Summary.SUMMARY, 
+                                      {'version': unicode(Summary.XML_VERSION)})
+        for lane in self.lane_results.values():
+            summary.append(lane.get_elements())
+        return summary
+
+    def set_elements(self, tree):
+        if tree.tag != Summary.SUMMARY:
+            return ValueError("Expected %s" % (Summary.SUMMARY,))
+        xml_version = int(tree.attrib.get('version', 0))
+        if xml_version > Summary.XML_VERSION:
+            logging.warn('Summary XML tree is a higher version than this class')
+        for element in list(tree):
+            lrs = Summary.LaneResultSummary()
+            lrs.set_elements(element)
+            self.lane_results[lrs.lane] = lrs
+
+    def dump(self):
+        """
+        Debugging function, report current object
+        """
+        pass
+
+
+def build_genome_fasta_map(genome_dir):
+    # build fasta to fasta file map
+    genome = genome_dir.split(os.path.sep)[-1]
+    fasta_map = {}
+    for vld_file in glob(os.path.join(genome_dir, '*.vld')):
+        is_link = False
+        if os.path.islink(vld_file):
+            is_link = True
+        vld_file = os.path.realpath(vld_file)
+        path, vld_name = os.path.split(vld_file)
+        name, ext = os.path.splitext(vld_name)
+        if is_link:
+            fasta_map[name] = name
+        else:
+            fasta_map[name] = os.path.join(genome, name)
+    return fasta_map
+    
+class ElandLane(object):
+    """
+    Process an eland result file
+    """
+    XML_VERSION = 1
+    LANE = 'ElandLane'
+    GENOME_MAP = 'GenomeMap'
+    GENOME_ITEM = 'GenomeItem'
+    MAPPED_READS = 'MappedReads'
+    MAPPED_ITEM = 'MappedItem'
+    MATCH_CODES = 'MatchCodes'
+    MATCH_ITEM = 'Code'
+    READS = 'Reads'
+
+    def __init__(self, pathname=None, genome_map=None, xml=None):
+        self.pathname = pathname
+        self._reads = None
+        self._mapped_reads = {}
+        self._match_codes = {}
+        if genome_map is None:
+            genome_map = {}
+        self.genome_map = genome_map
+        
+        if pathname is not None:
+            self._update()
+        if xml is not None:
+            self.set_elements(xml)
+
+    def _update(self):
+        """
+        Actually read the file and actually count the reads
+        """
+        # can't do anything if we don't have a file to process
+        if self.pathname is None:
+            return
+
+        if os.stat(self.pathname)[stat.ST_SIZE] == 0:
+            raise RuntimeError("Eland isn't done, try again later.")
+
+        reads = 0
+        mapped_reads = {}
+
+        match_codes = {'NM':0, 'QC':0, 'RM':0, 
+                       'U0':0, 'U1':0, 'U2':0,
+                       'R0':0, 'R1':0, 'R2':0,
+                      }
+        for line in open(self.pathname):
+            reads += 1
+            fields = line.split()
+            # code = fields[2]
+            # match_codes[code] = match_codes.setdefault(code, 0) + 1
+            # the QC/NM etc codes are in the 3rd field and always present
+            match_codes[fields[2]] += 1
+            # ignore lines that don't have a fasta filename
+            if len(fields) < 7:
+                continue
+            fasta = self.genome_map.get(fields[6], fields[6])
+            mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
+        self._match_codes = match_codes
+        self._mapped_reads = mapped_reads
+        self._reads = reads
+
+    def _get_reads(self):
+        if self._reads is None:
+            self._update()
+        return self._reads
+    reads = property(_get_reads)
+
+    def _get_mapped_reads(self):
+        if self._mapped_reads is None:
+            self._update()
+        return self._mapped_reads
+    mapped_reads = property(_get_mapped_reads)
+
+    def _get_match_codes(self):
+        if self._match_codes is None:
+            self._update()
+        return self._match_codes
+    match_codes = property(_get_match_codes)
+
+    def get_elements(self):
+        lane = ElementTree.Element(ElandLane.LANE, 
+                                   {'version': 
+                                    unicode(ElandLane.XML_VERSION)})
+        genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
+        for k, v in self.genome_map.items():
+            item = ElementTree.SubElement(
+                genome_map, ElandLane.GENOME_ITEM, 
+                {'name':k, 'value':unicode(v)})
+        mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
+        for k, v in self.mapped_reads.items():
+            item = ElementTree.SubElement(
+                mapped_reads, ElandLane.MAPPED_ITEM, 
+                {'name':k, 'value':unicode(v)})
+        match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
+        for k, v in self.match_codes.items():
+            item = ElementTree.SubElement(
+                match_codes, ElandLane.MATCH_ITEM, 
+                {'name':k, 'value':unicode(v)})
+        reads = ElementTree.SubElement(lane, ElandLane.READS)
+        reads.text = unicode(self.reads)
+
+        return lane
+
+    def set_elements(self, tree):
+        if tree.tag != ElandLane.LANE:
+            raise ValueError('Exptecting %s' % (ElandLane.LANE,))
+        for element in tree:
+            tag = element.tag.lower()
+            if tag == ElandLane.GENOME_MAP.lower():
+                for child in element:
+                    name = child.attrib['name']
+                    value = child.attrib['value']
+                    self.genome_map[name] = value
+            elif tag == ElandLane.MAPPED_READS.lower():
+                for child in element:
+                    name = child.attrib['name']
+                    value = child.attrib['value']
+                    self._mapped_reads[name] = int(value)
+            elif tag == ElandLane.MATCH_CODES.lower():
+                for child in element:
+                    name = child.attrib['name']
+                    value = int(child.attrib['value'])
+                    self._match_codes[name] = value
+            elif tag == ElandLane.READS.lower():
+                self._reads = int(element.text)
+            else:
+                logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
+
+
+class ELAND(object):
+    """
+    Summarize information from eland files
+    """
+    XML_VERSION = 1
+
+    ELAND = 'ElandCollection'
+    LANE = 'Lane'
+    LANE_ID = 'id'
+
+    def __init__(self, xml=None):
+        # we need information from the gerald config.xml
+        self.results = {}
+        
+        if xml is not None:
+            self.set_elements(xml)
+
+    def __len__(self):
+        return len(self.results)
+
+    def keys(self):
+        return self.results.keys()
+    
+    def values(self):
+        return self.results.values()
+
+    def items(self):
+        return self.results.items()
+
+    def __getitem__(self, key):
+        return self.results[key]
+
+    def get_elements(self):
+        root = ElementTree.Element(ELAND.ELAND, 
+                                   {'version': unicode(ELAND.XML_VERSION)})
+        for lane_id, lane in self.results.items():
+            eland_lane = lane.get_elements()
+            eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
+            root.append(eland_lane)
+        return root
+
+    def set_elements(self, tree):
+        if tree.tag.lower() != ELAND.ELAND.lower():
+            raise ValueError('Expecting %s', ELAND.ELAND)
+        for element in list(tree):
+            lane_id = element.attrib[ELAND.LANE_ID]
+            lane = ElandLane(xml=element)
+            self.results[lane_id] = lane
+
+def eland(basedir, gerald=None, genome_maps=None):
+    e = ELAND()
+    for pathname in glob(os.path.join(basedir, "*_eland_result.txt")):
+        # extract the sample name
+        path, name = os.path.split(pathname)
+        split_name = name.split('_')
+        sample_name = split_name[0]
+        lane_id = split_name[1]
+
+        if genome_maps is not None:
+            genome_map = genome_maps[lane_id]
+        elif gerald is not None:
+            genome_dir = gerald.lanes[lane_id].eland_genome
+            genome_map = build_genome_fasta_map(genome_dir)
+        else:
+            genome_map = {}
+
+        eland_result = ElandLane(pathname, genome_map)
+        e.results[lane_id] = eland_result
+    return e
diff --git a/gaworkflow/pipeline/runfolder.py b/gaworkflow/pipeline/runfolder.py

new file mode 100644 (file)

index 0000000..1219627
--- /dev/null
+++ b/gaworkflow/pipeline/runfolder.py
@@ -0,0 +1,218 @@
+"""
+Core information needed to inspect a runfolder.
+"""
+from glob import glob
+import logging
+import os
+import re
+import stat
+import sys
+import time
+
+try:
+  from xml.etree import ElementTree
+except ImportError, e:
+  from elementtree import ElementTree
+
+EUROPEAN_STRPTIME = "%d-%m-%Y"
+EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
+VERSION_RE = "([0-9\.]+)"
+USER_RE = "([a-zA-Z0-9]+)"
+LANES_PER_FLOWCELL = 8
+
+from gaworkflow.util.alphanum import alphanum
+from gaworkflow.util.ethelp import indent, flatten
+
+
+class PipelineRun(object):
+    """
+    Capture "interesting" information about a pipeline run
+    """
+    XML_VERSION = 1
+    PIPELINE_RUN = 'PipelineRun'
+    FLOWCELL_ID = 'FlowcellID'
+
+    def __init__(self, pathname=None, firecrest=None, bustard=None, gerald=None, xml=None):
+        self.pathname = pathname
+        self._name = None
+        self._flowcell_id = None
+        self.firecrest = firecrest
+        self.bustard = bustard
+        self.gerald = gerald
+
+        if xml is not None:
+          self.set_elements(xml)
+    
+    def _get_flowcell_id(self):
+        # extract flowcell ID
+        if self._flowcell_id is None:
+          config_dir = os.path.join(self.pathname, 'Config')
+          flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
+         if os.path.exists(flowcell_id_path):
+            flowcell_id_tree = ElementTree.parse(flowcell_id_path)
+            self._flowcell_id = flowcell_id_tree.findtext('Text')
+         else:
+            path_fields = self.pathname.split('_')
+            if len(path_fields) > 0:
+              # guessing last element of filename
+              flowcell_id = path_fields[-1]
+            else:
+              flowcell_id = 'unknown'
+              
+           logging.warning(
+             "Flowcell idwas not found, guessing %s" % (
+                flowcell_id))
+           self._flowcell_id = flowcell_id
+        return self._flowcell_id
+    flowcell_id = property(_get_flowcell_id)
+
+    def get_elements(self):
+        """
+        make one master xml file from all of our sub-components.
+        """
+        root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
+        flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
+        flowcell.text = self.flowcell_id
+        root.append(self.firecrest.get_elements())
+        root.append(self.bustard.get_elements())
+        root.append(self.gerald.get_elements())
+        return root
+
+    def set_elements(self, tree):
+        # this file gets imported by all the others,
+        # so we need to hide the imports to avoid a cyclic imports
+        from gaworkflow.pipeline import firecrest
+        from gaworkflow.pipeline import bustard
+        from gaworkflow.pipeline import gerald
+
+        tag = tree.tag.lower()
+        if tag != PipelineRun.PIPELINE_RUN.lower():
+          raise ValueError('Pipeline Run Expecting %s got %s' % (
+              PipelineRun.PIPELINE_RUN, tag))
+        for element in tree:
+          tag = element.tag.lower()
+          if tag == PipelineRun.FLOWCELL_ID.lower():
+            self._flowcell_id = element.text
+          #ok the xword.Xword.XWORD pattern for module.class.constant is lame
+          elif tag == firecrest.Firecrest.FIRECREST.lower():
+            self.firecrest = firecrest.Firecrest(xml=element)
+          elif tag == bustard.Bustard.BUSTARD.lower():
+            self.bustard = bustard.Bustard(xml=element)
+          elif tag == gerald.Gerald.GERALD.lower():
+            self.gerald = gerald.Gerald(xml=element)
+          else:
+            logging.warn('PipelineRun unrecognized tag %s' % (tag,))
+
+    def _get_run_name(self):
+        """
+        Given a run tuple, find the latest date and use that as our name
+        """
+        if self._name is None:
+          tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
+          timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
+          self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
+        return self._name
+    name = property(_get_run_name)
+
+    def save(self):
+        logging.info("Saving run report "+ self.name)
+        xml = self.get_elements()
+        indent(xml)
+        ElementTree.ElementTree(xml).write(self.name)
+
+    def load(self, filename):
+        logging.info("Loading run report from " + filename)
+        tree = ElementTree.parse(filename).getroot()
+        self.set_elements(tree)
+
+def get_runs(runfolder):
+    """
+    Search through a run folder for all the various sub component runs
+    and then return a PipelineRun for each different combination.
+
+    For example if there are two different GERALD runs, this will
+    generate two different PipelineRun objects, that differ
+    in there gerald component.
+    """
+    from gaworkflow.pipeline import firecrest
+    from gaworkflow.pipeline import bustard
+    from gaworkflow.pipeline import gerald
+
+    datadir = os.path.join(runfolder, 'Data')
+
+    logging.info('Searching for runs in ' + datadir)
+    runs = []
+    for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
+        f = firecrest.firecrest(firecrest_pathname)
+        bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
+        for bustard_pathname in glob(bustard_glob):
+            b = bustard.bustard(bustard_pathname)
+            gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
+            for gerald_pathname in glob(gerald_glob):
+                try:
+                    g = gerald.gerald(gerald_pathname)
+                    runs.append(PipelineRun(runfolder, f, b, g))
+                except IOError, e:
+                    print "Ignoring", str(e)
+    return runs
+                
+    
+def extract_run_parameters(runs):
+    """
+    Search through runfolder_path for various runs and grab their parameters
+    """
+    for run in runs:
+      run.save()
+
+def summarize_mapped_reads(mapped_reads):
+    """
+    Summarize per chromosome reads into a genome count
+    But handle spike-in/contamination symlinks seperately.
+    """
+    summarized_reads = {}
+    genome_reads = 0
+    genome = 'unknown'
+    for k, v in mapped_reads.items():
+        path, k = os.path.split(k)
+        if len(path) > 0:
+            genome = path
+            genome_reads += v
+        else:
+            summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
+    summarized_reads[genome] = genome_reads
+    return summarized_reads
+
+def summary_report(runs):
+    """
+    Summarize cluster numbers and mapped read counts for a runfolder
+    """
+    report = []
+    for run in runs:
+        # print a run name?
+        report.append('Summary for %s' % (run.name,))
+       # sort the report
+       eland_keys = run.gerald.eland_results.results.keys()
+       eland_keys.sort(alphanum)
+
+        lane_results = run.gerald.summary.lane_results
+       for lane_id in eland_keys:
+           result = run.gerald.eland_results.results[lane_id]
+            report.append("Sample name %s" % (result.sample_name))
+            report.append("Lane id %s" % (result.lane_id,))
+            cluster = lane_results[result.lane_id].cluster
+            report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
+            report.append("Total Reads: %d" % (result.reads))
+            mc = result._match_codes
+           report.append("No Match: %d" % (mc['NM']))
+           report.append("QC Failed: %d" % (mc['QC']))
+            report.append('Unique (0,1,2 mismatches) %d %d %d' % \
+                          (mc['U0'], mc['U1'], mc['U2']))
+            report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
+                          (mc['R0'], mc['R1'], mc['R2']))
+            report.append("Mapped Reads")
+            mapped_reads = summarize_mapped_reads(result.mapped_reads)
+            for name, counts in mapped_reads.items():
+              report.append("  %s: %d" % (name, counts))
+            report.append('---')
+            report.append('')
+        return os.linesep.join(report)
diff --git a/gaworkflow/pipeline/test/test_runfolder.py b/gaworkflow/pipeline/test/test_runfolder.py

new file mode 100644 (file)

index 0000000..d4c2fd0
--- /dev/null
+++ b/gaworkflow/pipeline/test/test_runfolder.py
@@ -0,0 +1,597 @@
+#!/usr/bin/env python
+
+from datetime import datetime
+import os
+import tempfile
+import shutil
+import unittest
+
+from gaworkflow.pipeline import firecrest
+from gaworkflow.pipeline import bustard
+from gaworkflow.pipeline import gerald
+from gaworkflow.pipeline import runfolder
+from gaworkflow.pipeline.runfolder import ElementTree
+
+
+def make_flowcell_id(runfolder_dir, flowcell_id=None):
+    if flowcell_id is None:
+        flowcell_id = '207BTAAXY'
+
+    config = """<?xml version="1.0"?>
+<FlowcellId>
+  <Text>%s</Text>
+</FlowcellId>""" % (flowcell_id,)
+    config_dir = os.path.join(runfolder_dir, 'Config')
+    
+    if not os.path.exists(config_dir):
+        os.mkdir(config_dir)
+    pathname = os.path.join(config_dir, 'FlowcellId.xml')
+    f = open(pathname,'w')
+    f.write(config)
+    f.close()
+
+def make_matrix(matrix_dir):
+    contents = """# Auto-generated frequency response matrix
+> A
+> C
+> G
+> T
+0.77 0.15 -0.04 -0.04 
+0.76 1.02 -0.05 -0.06 
+-0.10 -0.10 1.17 -0.03 
+-0.13 -0.12 0.80 1.27 
+"""
+    s_matrix = os.path.join(matrix_dir, 's_matrix.txt')
+    f = open(s_matrix, 'w')
+    f.write(contents)
+    f.close()
+    
+def make_phasing_params(bustard_dir):
+    for lane in range(1,9):
+        pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane))
+        f = open(pathname, 'w')
+        f.write("""<Parameters>
+  <Phasing>0.009900</Phasing>
+  <Prephasing>0.003500</Prephasing>
+</Parameters>
+""")
+        f.close()
+
+def make_gerald_config(gerald_dir):
+    config_xml = """<RunParameters>
+<ChipWideRunParameters>
+  <ANALYSIS>default</ANALYSIS>
+  <BAD_LANES></BAD_LANES>
+  <BAD_TILES></BAD_TILES>
+  <CONTAM_DIR></CONTAM_DIR>
+  <CONTAM_FILE></CONTAM_FILE>
+  <ELAND_GENOME>Need_to_specify_ELAND_genome_directory</ELAND_GENOME>
+  <ELAND_MULTIPLE_INSTANCES>8</ELAND_MULTIPLE_INSTANCES>
+  <ELAND_REPEAT></ELAND_REPEAT>
+  <EMAIL_DOMAIN>domain.com</EMAIL_DOMAIN>
+  <EMAIL_LIST>diane</EMAIL_LIST>
+  <EMAIL_SERVER>localhost:25</EMAIL_SERVER>
+  <EXPT_DIR>/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane</EXPT_DIR>
+  <EXPT_DIR_ROOT>/home/diane/gec</EXPT_DIR_ROOT>
+  <FORCE>1</FORCE>
+  <GENOME_DIR>/home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes</GENOME_DIR>
+  <GENOME_FILE>Need_to_specify_genome_file_name</GENOME_FILE>
+  <HAMSTER_FLAG>genome</HAMSTER_FLAG>
+  <OUT_DIR>/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane</OUT_DIR>
+  <POST_RUN_COMMAND></POST_RUN_COMMAND>
+  <PRB_FILE_SUFFIX>_prb.txt</PRB_FILE_SUFFIX>
+  <PURE_BASES>12</PURE_BASES>
+  <QF_PARAMS>'((CHASTITY&gt;=0.6))'</QF_PARAMS>
+  <QHG_FILE_SUFFIX>_qhg.txt</QHG_FILE_SUFFIX>
+  <QUALITY_FORMAT>--symbolic</QUALITY_FORMAT>
+  <READ_LENGTH>32</READ_LENGTH>
+  <SEQUENCE_FORMAT>--scarf</SEQUENCE_FORMAT>
+  <SEQ_FILE_SUFFIX>_seq.txt</SEQ_FILE_SUFFIX>
+  <SIG_FILE_SUFFIX_DEPHASED>_sig2.txt</SIG_FILE_SUFFIX_DEPHASED>
+  <SIG_FILE_SUFFIX_NOT_DEPHASED>_sig.txt</SIG_FILE_SUFFIX_NOT_DEPHASED>
+  <SOFTWARE_VERSION>@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp</SOFTWARE_VERSION>
+  <TILE_REGEX>s_[1-8]_[0-9][0-9][0-9][0-9]</TILE_REGEX>
+  <TILE_ROOT>s</TILE_ROOT>
+  <TIME_STAMP>Sat Apr 19 19:08:30 2008</TIME_STAMP>
+  <TOOLS_DIR>/home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald</TOOLS_DIR>
+  <USE_BASES>all</USE_BASES>
+  <WEB_DIR_ROOT>http://host.domain.com/yourshare/</WEB_DIR_ROOT>
+</ChipWideRunParameters>
+<LaneSpecificRunParameters>
+  <ANALYSIS>
+    <s_1>eland</s_1>
+    <s_2>eland</s_2>
+    <s_3>eland</s_3>
+    <s_4>eland</s_4>
+    <s_5>eland</s_5>
+    <s_6>eland</s_6>
+    <s_7>eland</s_7>
+    <s_8>eland</s_8>
+  </ANALYSIS>
+  <ELAND_GENOME>
+    <s_1>/g/dm3</s_1>
+    <s_2>/g/equcab1</s_2>
+    <s_3>/g/equcab1</s_3>
+    <s_4>/g/canfam2</s_4>
+    <s_5>/g/hg18</s_5>
+    <s_6>/g/hg18</s_6>
+    <s_7>/g/hg18</s_7>
+    <s_8>/g/hg18</s_8>
+  </ELAND_GENOME>
+  <READ_LENGTH>
+    <s_1>32</s_1>
+    <s_2>32</s_2>
+    <s_3>32</s_3>
+    <s_4>32</s_4>
+    <s_5>32</s_5>
+    <s_6>32</s_6>
+    <s_7>32</s_7>
+    <s_8>32</s_8>
+  </READ_LENGTH>
+  <USE_BASES>
+    <s_1>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_1>
+    <s_2>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_2>
+    <s_3>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_3>
+    <s_4>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_4>
+    <s_5>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_5>
+    <s_6>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_6>
+    <s_7>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_7>
+    <s_8>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_8>
+  </USE_BASES>
+</LaneSpecificRunParameters>
+</RunParameters>
+"""
+    pathname = os.path.join(gerald_dir, 'config.xml')
+    f = open(pathname,'w')
+    f.write(config_xml)
+    f.close()
+    
+
+def make_summary_htm(gerald_dir):
+    summary_htm = """<!--RUN_TIME Mon Apr 21 11:52:25 2008 -->
+<!--SOFTWARE_VERSION @(#) $Id: jerboa.pl,v 1.31 2007/03/05 17:52:15 km Exp $-->
+<html>
+<body>
+
+<a name="Top"><h2><title>080416_HWI-EAS229_0024_207BTAAXX Summary</title></h2></a>
+<h1>Summary Information For Experiment 080416_HWI-EAS229_0024_207BTAAXX on Machine HWI-EAS229</h1>
+<h2><br></br>Chip Summary<br></br></h2>
+<table border="1" cellpadding="5">
+<tr><td>Machine</td><td>HWI-EAS229</td></tr>
+<tr><td>Run Folder</td><td>080416_HWI-EAS229_0024_207BTAAXX</td></tr>
+<tr><td>Chip ID</td><td>unknown</td></tr>
+</table>
+<h2><br></br>Lane Parameter Summary<br></br></h2>
+<table border="1" cellpadding="5">
+<tr>
+<td>Lane</td>
+<td>Sample ID</td>
+<td>Sample Target</td>
+<td>Sample Type</td>
+<td>Length</td>
+<td>Filter</td>
+<td>Tiles</td>
+</tr>
+<tr>
+<td>1</td>
+<td>unknown</td>
+<td>dm3</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane1">Lane 1</a></td>
+</tr>
+<tr>
+<td>2</td>
+<td>unknown</td>
+<td>equcab1</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane2">Lane 2</a></td>
+</tr>
+<tr>
+<td>3</td>
+<td>unknown</td>
+<td>equcab1</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane3">Lane 3</a></td>
+</tr>
+<tr>
+<td>4</td>
+<td>unknown</td>
+<td>canfam2</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane4">Lane 4</a></td>
+</tr>
+<tr>
+<td>5</td>
+<td>unknown</td>
+<td>hg18</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane5">Lane 5</a></td>
+</tr>
+<tr>
+<td>6</td>
+<td>unknown</td>
+<td>hg18</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane6">Lane 6</a></td>
+</tr>
+<tr>
+<td>7</td>
+<td>unknown</td>
+<td>hg18</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane7">Lane 7</a></td>
+</tr>
+<tr>
+<td>8</td>
+<td>unknown</td>
+<td>hg18</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane8">Lane 8</a></td>
+</tr>
+</table>
+<h2><br></br>Lane Results Summary<br></br></h2>
+<table border="1" cellpadding="5">
+<tr>
+
+<td>Lane </td>
+<td>Clusters </td>
+<td>Av 1st Cycle Int </td>
+<td>% intensity after 20 cycles </td>
+<td>% PF Clusters </td>
+<td>% Align (PF) </td>
+<td>Av Alignment Score (PF) </td>
+<td> % Error Rate (PF) </td>
+</tr>
+<tr>
+<td>1</td>
+<td>17421 +/- 2139</td>
+<td>7230 +/- 801</td>
+<td>23.73 +/- 10.79</td>
+<td>13.00 +/- 22.91</td>
+<td>32.03 +/- 18.45</td>
+<td>6703.57 +/- 3753.85</td>
+<td>4.55 +/- 4.81</td>
+</tr>
+<tr>
+<td>2</td>
+<td>20311 +/- 2402</td>
+<td>7660 +/- 678</td>
+<td>17.03 +/- 4.40</td>
+<td>40.74 +/- 30.33</td>
+<td>29.54 +/- 9.03</td>
+<td>5184.02 +/- 1631.54</td>
+<td>3.27 +/- 3.94</td>
+</tr>
+<tr>
+<td>3</td>
+<td>20193 +/- 2399</td>
+<td>7700 +/- 797</td>
+<td>15.75 +/- 3.30</td>
+<td>56.56 +/- 17.16</td>
+<td>27.33 +/- 7.48</td>
+<td>4803.49 +/- 1313.31</td>
+<td>3.07 +/- 2.86</td>
+</tr>
+<tr>
+<td>4</td>
+<td>15537 +/- 2531</td>
+<td>7620 +/- 1392</td>
+<td>15.37 +/- 3.79</td>
+<td>63.05 +/- 18.30</td>
+<td>15.88 +/- 4.99</td>
+<td>3162.13 +/- 962.59</td>
+<td>3.11 +/- 2.22</td>
+</tr>
+<tr>
+<td>5</td>
+<td>32047 +/- 3356</td>
+<td>8093 +/- 831</td>
+<td>23.79 +/- 6.18</td>
+<td>53.36 +/- 18.06</td>
+<td>48.04 +/- 13.77</td>
+<td>9866.23 +/- 2877.30</td>
+<td>2.26 +/- 1.16</td>
+</tr>
+<tr>
+<td>6</td>
+<td>32946 +/- 4753</td>
+<td>8227 +/- 736</td>
+<td>24.07 +/- 4.69</td>
+<td>54.65 +/- 12.57</td>
+<td>50.98 +/- 10.54</td>
+<td>10468.86 +/- 2228.53</td>
+<td>2.21 +/- 2.33</td>
+</tr>
+<tr>
+<td>7</td>
+<td>39504 +/- 4171</td>
+<td>8401 +/- 785</td>
+<td>22.55 +/- 4.56</td>
+<td>45.22 +/- 10.34</td>
+<td>48.41 +/- 9.67</td>
+<td>9829.40 +/- 1993.20</td>
+<td>2.26 +/- 1.11</td>
+</tr>
+<tr>
+<td>8</td>
+<td>37998 +/- 3792</td>
+<td>8443 +/- 1211</td>
+<td>39.03 +/- 7.52</td>
+<td>42.16 +/- 12.35</td>
+<td>40.98 +/- 14.89</td>
+<td>8128.87 +/- 3055.34</td>
+<td>3.57 +/- 2.77</td>
+</tr>
+</table>
+</body>
+</html>
+"""
+    pathname = os.path.join(gerald_dir, 'Summary.htm')
+    f = open(pathname, 'w')
+    f.write(summary_htm)
+    f.close()
+
+def make_eland_results(gerald_dir):
+    eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759    ACATAGNCACAGACATAAACATAGACATAGAC U0      1       1       3       chrUextra.fa    28189829        R       D.
+>HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA  U1      0       1       0       chr2L.fa        8796855 R       DD      24T
+>HWI-EAS229_24_207BTAAXX:1:7:776:582    AGCTCANCCGATCGAAAACCTCNCCAAGCAAT        NM      0       0       0
+>HWI-EAS229_24_207BTAAXX:1:7:205:842    AAACAANNCTCCCAAACACGTAAACTGGAAAA        U1      0       1       0       Lambda.fa        8796855 R       DD      24T
+"""
+    for i in range(1,9):
+        pathname = os.path.join(gerald_dir, 
+                                's_%d_eland_result.txt' % (i,))
+        f = open(pathname, 'w')
+        f.write(eland_result)
+        f.close()
+                     
+class RunfolderTests(unittest.TestCase):
+    """
+    Test components of the runfolder processing code
+    which includes firecrest, bustard, and gerald
+    """
+    def setUp(self):
+        # make a fake runfolder directory
+        self.temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_')
+
+        self.runfolder_dir = os.path.join(self.temp_dir, 
+                                          '080102_HWI-EAS229_0010_207BTAAXX')
+        os.mkdir(self.runfolder_dir)
+
+        self.data_dir = os.path.join(self.runfolder_dir, 'Data')
+        os.mkdir(self.data_dir)
+
+        self.firecrest_dir = os.path.join(self.data_dir, 
+                               'C1-33_Firecrest1.8.28_12-04-2008_diane'
+                             )
+        os.mkdir(self.firecrest_dir)
+        self.matrix_dir = os.path.join(self.firecrest_dir, 'Matrix')
+        os.mkdir(self.matrix_dir)
+        make_matrix(self.matrix_dir)
+
+        self.bustard_dir = os.path.join(self.firecrest_dir, 
+                                        'Bustard1.8.28_12-04-2008_diane')
+        os.mkdir(self.bustard_dir)
+        make_phasing_params(self.bustard_dir)
+        
+        self.gerald_dir = os.path.join(self.bustard_dir,
+                                       'GERALD_12-04-2008_diane')
+        os.mkdir(self.gerald_dir)
+        make_gerald_config(self.gerald_dir)
+        make_summary_htm(self.gerald_dir)
+        make_eland_results(self.gerald_dir)
+
+    def tearDown(self):
+        shutil.rmtree(self.temp_dir)
+
+    def test_firecrest(self):
+        """
+        Construct a firecrest object
+        """
+        f = firecrest.firecrest(self.firecrest_dir)
+        self.failUnlessEqual(f.version, '1.8.28')
+        self.failUnlessEqual(f.start, 1)
+        self.failUnlessEqual(f.stop, 33)
+        self.failUnlessEqual(f.user, 'diane')
+        self.failUnlessEqual(f.date, datetime(2008,4,12))
+
+        xml = f.get_elements()
+        # just make sure that element tree can serialize the tree
+        xml_str = ElementTree.tostring(xml)
+
+        f2 = firecrest.Firecrest(xml=xml)
+        self.failUnlessEqual(f.version, f2.version)
+        self.failUnlessEqual(f.start,   f2.start)
+        self.failUnlessEqual(f.stop,    f2.stop)
+        self.failUnlessEqual(f.user,    f2.user)
+        self.failUnlessEqual(f.date,    f2.date)
+
+    def test_bustard(self):
+        """
+        construct a bustard object
+        """
+        b = bustard.bustard(self.bustard_dir)
+        self.failUnlessEqual(b.version, '1.8.28')
+        self.failUnlessEqual(b.date,    datetime(2008,4,12))
+        self.failUnlessEqual(b.user,    'diane')
+        self.failUnlessEqual(len(b.phasing), 8)
+        self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099)
+        
+        xml = b.get_elements()
+        b2 = bustard.Bustard(xml=xml)
+        self.failUnlessEqual(b.version, b2.version)
+        self.failUnlessEqual(b.date,    b2.date )
+        self.failUnlessEqual(b.user,    b2.user)
+        self.failUnlessEqual(len(b.phasing), len(b2.phasing))
+        for key in b.phasing.keys():
+            self.failUnlessEqual(b.phasing[key].lane, 
+                                 b2.phasing[key].lane)
+            self.failUnlessEqual(b.phasing[key].phasing, 
+                                 b2.phasing[key].phasing)
+            self.failUnlessEqual(b.phasing[key].prephasing, 
+                                 b2.phasing[key].prephasing)
+
+    def test_gerald(self):
+        # need to update gerald and make tests for it
+        g = gerald.gerald(self.gerald_dir) 
+
+        self.failUnlessEqual(g.version, 
+            '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp')
+        self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30))
+        self.failUnlessEqual(len(g.lanes), len(g.lanes.keys()))
+        self.failUnlessEqual(len(g.lanes), len(g.lanes.items()))
+
+        
+        # list of genomes, matches what was defined up in 
+        # make_gerald_config.
+        # the first None is to offset the genomes list to be 1..9
+        # instead of pythons default 0..8
+        genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2',
+                         '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ]
+
+        # test lane specific parameters from gerald config file
+        for i in range(1,9):
+            cur_lane = g.lanes[str(i)]
+            self.failUnlessEqual(cur_lane.analysis, 'eland')
+            self.failUnlessEqual(cur_lane.eland_genome, genomes[i])
+            self.failUnlessEqual(cur_lane.read_length, '32')
+            self.failUnlessEqual(cur_lane.use_bases, 'Y'*32)
+
+        # test data extracted from summary file
+        clusters = [None, 
+                    (17421, 2139), (20311, 2402), (20193, 2399), (15537, 2531),
+                    (32047, 3356), (32946, 4753), (39504, 4171), (37998, 3792)]
+
+        for i in range(1,9):
+            summary_lane = g.summary[str(i)]
+            self.failUnlessEqual(summary_lane.cluster, clusters[i])
+            self.failUnlessEqual(summary_lane.lane, str(i))
+
+        xml = g.get_elements()
+        # just make sure that element tree can serialize the tree
+        xml_str = ElementTree.tostring(xml)
+        g2 = gerald.Gerald(xml=xml)
+
+        # do it all again after extracting from the xml file
+        self.failUnlessEqual(g.version, g2.version)
+        self.failUnlessEqual(g.date, g2.date)
+        self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys()))
+        self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items()))
+
+        # test lane specific parameters from gerald config file
+        for i in range(1,9):
+            g_lane = g.lanes[str(i)]
+            g2_lane = g2.lanes[str(i)]
+            self.failUnlessEqual(g_lane.analysis, g2_lane.analysis)
+            self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome)
+            self.failUnlessEqual(g_lane.read_length, g2_lane.read_length)
+            self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases)
+
+        # test (some) summary elements
+        for i in range(1,9):
+            g_summary = g.summary[str(i)]
+            g2_summary = g2.summary[str(i)]
+            self.failUnlessEqual(g_summary.cluster, g2_summary.cluster)
+            self.failUnlessEqual(g_summary.lane, g2_summary.lane)
+
+            g_eland = g.eland_results
+            g2_eland = g2.eland_results
+            for lane in g_eland.keys():
+                self.failUnlessEqual(g_eland[lane].reads, 
+                                     g2_eland[lane].reads)
+                self.failUnlessEqual(len(g_eland[lane].mapped_reads), 
+                                     len(g2_eland[lane].mapped_reads))
+                for k in g_eland[lane].mapped_reads.keys():
+                    self.failUnlessEqual(g_eland[lane].mapped_reads[k],
+                                         g2_eland[lane].mapped_reads[k])
+
+                self.failUnlessEqual(len(g_eland[lane].match_codes), 
+                                     len(g2_eland[lane].match_codes))
+                for k in g_eland[lane].match_codes.keys():
+                    self.failUnlessEqual(g_eland[lane].match_codes[k],
+                                         g2_eland[lane].match_codes[k])
+
+
+    def test_eland(self):
+        dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa',
+                    'chr2L.fa': 'dm3/chr2L.fa',
+                    'Lambda.fa': 'Lambda.fa'}
+        genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map,
+                        '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map }
+        eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps)
+        
+        for i in range(1,9):
+            lane = eland[str(i)]
+            self.failUnlessEqual(lane.reads, 4)
+            self.failUnlessEqual(len(lane.mapped_reads), 3)
+            self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1)
+            self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1)
+            self.failUnlessEqual(lane.match_codes['U1'], 2)
+            self.failUnlessEqual(lane.match_codes['NM'], 1)
+
+        xml = eland.get_elements()
+        # just make sure that element tree can serialize the tree
+        xml_str = ElementTree.tostring(xml)
+        e2 = gerald.ELAND(xml=xml)
+
+        for i in range(1,9):
+            l1 = eland[str(i)]
+            l2 = e2[str(i)]
+            self.failUnlessEqual(l1.reads, l2.reads)
+            self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads))
+            self.failUnlessEqual(len(l1.mapped_reads), 3)
+            for k in l1.mapped_reads.keys():
+                self.failUnlessEqual(l1.mapped_reads[k],
+                                     l2.mapped_reads[k])
+
+            self.failUnlessEqual(len(l1.match_codes), 9)
+            self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes))
+            for k in l1.match_codes.keys():
+                self.failUnlessEqual(l1.match_codes[k], 
+                                     l2.match_codes[k])
+
+    def test_runfolder(self):
+        runs = runfolder.get_runs(self.runfolder_dir)
+        
+        # do we get the flowcell id from the filename?
+        self.failUnlessEqual(len(runs), 1)
+        self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml')
+
+        # do we get the flowcell id from the FlowcellId.xml file
+        make_flowcell_id(self.runfolder_dir, '207BTAAXY')
+        runs = runfolder.get_runs(self.runfolder_dir)
+        self.failUnlessEqual(len(runs), 1)
+        self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml')
+        
+        r1 = runs[0]
+        xml = r1.get_elements()
+        xml_str = ElementTree.tostring(xml)
+
+        r2 = runfolder.PipelineRun(xml=xml)
+        self.failUnlessEqual(r1.name, r2.name)
+        self.failIfEqual(r2.firecrest, None)
+        self.failIfEqual(r2.bustard, None)
+        self.failIfEqual(r2.gerald, None)
+        
+
+def suite():
+    return unittest.makeSuite(RunfolderTests,'test')
+
+if __name__ == "__main__":
+    unittest.main(defaultTest="suite")
+    
diff --git a/gaworkflow/util/ethelp.py b/gaworkflow/util/ethelp.py

new file mode 100644 (file)

index 0000000..19f6c9f
--- /dev/null
+++ b/gaworkflow/util/ethelp.py
@@ -0,0 +1,32 @@
+"""
+ElementTree helper functions
+"""
+def indent(elem, level=0):
+    """
+    reformat an element tree to be 'pretty' (indented)
+    """
+    i = "\n" + level*"  "
+    if len(elem):
+        if not elem.text or not elem.text.strip():
+            elem.text = i + "  "
+        for child in elem:
+            indent(child, level+1)
+        # we don't want the closing tag indented too far
+        child.tail = i
+        if not elem.tail or not elem.tail.strip():
+            elem.tail = i
+    else:
+        if level and (not elem.tail or not elem.tail.strip()):
+            elem.tail = i
+
+def flatten(elem, include_tail=0):
+    """
+    Extract the text from an element tree 
+    (AKA extract the text that not part of XML tags)
+    """
+    text = elem.text or ""
+    for e in elem:
+        text += flatten(e, 1)
+    if include_tail and elem.tail: text += elem.tail
+    return text
+
diff --git a/gaworkflow/util/test/test_ethelp.py b/gaworkflow/util/test/test_ethelp.py

new file mode 100644 (file)

index 0000000..46f81a1
--- /dev/null
+++ b/gaworkflow/util/test/test_ethelp.py
@@ -0,0 +1,31 @@
+import os
+import unittest
+
+from xml.etree import ElementTree
+from gaworkflow.util.ethelp import indent, flatten
+
+class testETHelper(unittest.TestCase):
+    def setUp(self):
+        self.foo = '<foo><bar>asdf</bar><br/></foo>'
+        self.foo_tree = ElementTree.fromstring(self.foo)
+
+    def test_indent(self):
+        flat_foo = ElementTree.tostring(self.foo_tree)
+        self.failUnlessEqual(len(flat_foo.split('\n')), 1)
+
+        indent(self.foo_tree)
+        pretty_foo = ElementTree.tostring(self.foo_tree)
+        self.failUnlessEqual(len(pretty_foo.split('\n')), 5)
+
+    def test_flatten(self):
+        self.failUnless(flatten(self.foo_tree), 'asdf')
+
+def suite():
+    return unittest.makeSuite(testETHelper, 'test')
+
+if __name__ == "__main__":
+    unittest.main(defaultTest='suite')
+
+
+
+
diff --git a/scripts/runfolder b/scripts/runfolder

new file mode 100644 (file)

index 0000000..13c5cad
--- /dev/null
+++ b/scripts/runfolder
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+"""
+Runfolder.py can generate a xml file capturing all the 'interesting' parameters from a finished pipeline run. (using the -a option). The information currently being captured includes:
+
+  * Flowcell ID
+  * run dates
+  * start/stop cycle numbers
+  * Firecrest, bustard, gerald version numbers
+  * Eland analysis types, and everything in the eland configuration file.
+  * cluster numbers and other values from the Summary.htm 
+    LaneSpecificParameters table. 
+  * How many reads mapped to a genome from an eland file
+
+The ELAND "mapped reads" counter will also check for eland squashed file
+that were symlinked from another directory. This is so I can track how 
+many reads landed on the genome of interest and on the spike ins. 
+
+Basically my subdirectories something like:
+
+genomes/hg18
+genomes/hg18/chr*.2bpb <- files for hg18 genome
+genomes/hg18/chr*.vld  
+genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
+genomes/spikein 
+
+runfolder.py can also spit out a simple summary report (-s option) 
+that contains the per lane post filter cluster numbers and the mapped 
+read counts. (The report isn't currently very pretty)
+"""
+import logging
+import optparse
+import sys
+
+from gaworkflow.pipeline import runfolder
+        
+def make_parser():
+    usage = 'usage: %prog [options] runfolder_root_dir'
+    parser = optparse.OptionParser(usage)
+    parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
+                      default=False,
+                      help='turn on verbose mode')
+    parser.add_option('-s', '--summary', dest='summary', action='store_true',
+                      default=False,
+                      help='produce summary report')
+    parser.add_option('-a', '--archive', dest='archive', action='store_true',
+                      default=False,
+                      help='generate run configuration archive')
+    return parser
+
+def main(cmdlist=None):
+    parser = make_parser()
+    opt, args = parser.parse_args(cmdlist)
+
+    if len(args) == 0:
+        parser.error('need path to a runfolder')
+    
+    logging.basicConfig()
+    if opt.verbose:
+        root_log = logging.getLogger()
+        root_log.setLevel(logging.INFO)
+
+    for run_dir in args:
+        runs = runfolder.get_runs(run_dir)
+        if opt.summary:
+            print runfolder.summary_report(runs)
+        if opt.archive:
+            runfolder.extract_run_parameters(runs)
+
+    return 0
+
+if __name__ == "__main__":
+  sys.exit(main(sys.argv[1:]))
diff --git a/scripts/runfolder.py b/scripts/runfolder.py

deleted file mode 100644 (file)

index ffc32a6..0000000
--- a/scripts/runfolder.py
+++ /dev/null
@@ -1,663 +0,0 @@
-#!/usr/bin/env python
-"""
-Runfolder.py can generate a xml file capturing all the 'interesting' parameters from a finished pipeline run. (using the -a option). The information currently being captured includes:
-
-  * Flowcell ID
-  * run dates
-  * start/stop cycle numbers
-  * Firecrest, bustard, gerald version numbers
-  * Eland analysis types, and everything in the eland configuration file.
-  * cluster numbers and other values from the Summary.htm 
-    LaneSpecificParameters table. 
-  * How many reads mapped to a genome from an eland file
-
-The ELAND "mapped reads" counter will also check for eland squashed file
-that were symlinked from another directory. This is so I can track how 
-many reads landed on the genome of interest and on the spike ins. 
-
-Basically my subdirectories something like:
-
-genomes/hg18
-genomes/hg18/chr*.2bpb <- files for hg18 genome
-genomes/hg18/chr*.vld  
-genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
-genomes/spikein 
-
-runfolder.py can also spit out a simple summary report (-s option) 
-that contains the per lane post filter cluster numbers and the mapped 
-read counts. (The report isn't currently very pretty)
-"""
-import time
-import logging
-import os
-import re
-import stat
-import sys
-from glob import glob
-from pprint import pprint
-import optparse
-
-try:
-  from xml.etree import ElementTree
-except ImportError, e:
-  from elementtree import ElementTree
-
-from gaworkflow.util.alphanum import alphanum
-EUROPEAN_STRPTIME = "%d-%m-%Y"
-EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
-VERSION_RE = "([0-9\.]+)"
-USER_RE = "([a-zA-Z0-9]+)"
-LANES_PER_FLOWCELL = 8
-
-runfolder_path = '/home/diane/gec/080221_HWI-EAS229_0018_201BHAAXX'
-
-def indent(elem, level=0):
-    """
-    reformat an element tree to be 'pretty' (indented)
-    """
-    i = "\n" + level*"  "
-    if len(elem):
-        if not elem.text or not elem.text.strip():
-            elem.text = i + "  "
-        for child in elem:
-            indent(child, level+1)
-        # we don't want the closing tag indented too far
-        child.tail = i
-        if not elem.tail or not elem.tail.strip():
-            elem.tail = i
-    else:
-        if level and (not elem.tail or not elem.tail.strip()):
-            elem.tail = i
-
-def flatten(elem, include_tail=0):
-    """
-    Extract the text from an element tree 
-    (AKA extract the text that not part of XML tags)
-    """
-    text = elem.text or ""
-    for e in elem:
-        text += flatten(e, 1)
-    if include_tail and elem.tail: text += elem.tail
-    return text
-
-class Firecrest(object):
-    def __init__(self, pathname):
-        self.pathname = pathname
-
-        # parse firecrest directory name
-        path, name = os.path.split(pathname)
-        groups = name.split('_')
-        # grab the start/stop cycle information
-        cycle = re.match("C([0-9]+)-([0-9]+)", groups[0])
-        self.start = int(cycle.group(1))
-        self.stop = int(cycle.group(2))
-        # firecrest version
-        version = re.search(VERSION_RE, groups[1])
-        self.version = (version.group(1))
-        # datetime
-        self.date = time.strptime(groups[2], EUROPEAN_STRPTIME)
-        self.time = time.mktime(self.date)
-        # username
-        self.user = groups[3]
-
-        # should I parse this deeper than just stashing the 
-        # contents of the matrix file?
-        matrix_pathname = os.path.join(self.pathname, 'Matrix', 's_matrix.txt')
-        self.matrix = open(matrix_pathname, 'r').read()
-        
-    def dump(self):
-        print "Starting cycle:", self.start
-        print "Ending cycle:", self.stop
-        print "Firecrest version:", self.version
-        print "Run date:", self.date
-        print "user:", self.user
-
-    def _get_elements(self):
-        firecrest = ElementTree.Element('Firecrest')
-        version = ElementTree.SubElement(firecrest, 'version')
-        version.text = self.version
-        start_cycle = ElementTree.SubElement(firecrest, 'FirstCycle')
-        start_cycle.text = str(self.start)
-        stop_cycle = ElementTree.SubElement(firecrest, 'LastCycle')
-        stop_cycle.text = str(self.stop)
-        run_date = ElementTree.SubElement(firecrest, 'run_time')
-        run_date.text = str(self.time)
-        matrix = ElementTree.SubElement(firecrest, 'matrix')
-        matrix.text = self.matrix
-        return firecrest
-    elements=property(_get_elements)
-
-class Bustard(object):
-    def __init__(self, pathname):
-        self.pathname = pathname
-
-        path, name = os.path.split(pathname)
-        groups = name.split("_")
-        version = re.search(VERSION_RE, groups[0])
-        self.version = version.group(1)
-        self.date = time.strptime(groups[1], EUROPEAN_STRPTIME)
-        self.time = time.mktime(self.date)
-        self.user = groups[2]
-        self.phasing = {}
-        self._load_params()
-
-    def _load_params(self):
-        paramfiles = glob(os.path.join(self.pathname, "params*"))
-        for paramfile in paramfiles:
-            path, name = os.path.split(paramfile)
-            basename, ext = os.path.splitext(name)
-            # the last character of the param filename should be the
-            # lane number
-            lane = int(basename[-1])
-            # we want the whole tree, not just the stuff under
-            # the first tag
-            param_tree = ElementTree.parse(paramfile).getroot()
-            self.phasing[lane] = param_tree
-
-    def dump(self):
-        print "Bustard version:", self.version
-        print "Run date", self.date
-        print "user:", self.user
-        for lane, tree in self.phasing.items():
-            print lane
-            print tree
-
-    def _get_elements(self):
-        bustard = ElementTree.Element('Bustard')
-        version = ElementTree.SubElement(bustard, 'version')
-        version.text = self.version
-        run_date = ElementTree.SubElement(bustard, 'run_time')
-        run_date.text = str(self.time)
-        return bustard
-    elements=property(_get_elements)
-        
-class GERALD(object):
-    """
-    Capture meaning out of the GERALD directory
-    """
-    class LaneParameters(object):
-        """
-        Make it easy to access elements of LaneSpecificRunParameters from python
-        """
-        def __init__(self, tree, key):
-            self._tree = tree.find('LaneSpecificRunParameters')
-            self._key = key
-        
-        def __get_attribute(self, xml_tag):
-            container = self._tree.find(xml_tag)
-            if container is None or \
-               len(container.getchildren()) != LANES_PER_FLOWCELL:
-                raise RuntimeError('GERALD config.xml file changed')
-            lanes = [x.tag.split('_')[1] for x in container.getchildren()]
-            index = lanes.index(self._key)
-            #element = container.find(self._key)
-            element = container[index]
-            return element.text
-        def _get_analysis(self):
-            return self.__get_attribute('ANALYSIS')
-        analysis = property(_get_analysis)
-
-        def _get_eland_genome(self):
-            return self.__get_attribute('ELAND_GENOME')
-        eland_genome = property(_get_eland_genome)
-
-        def _get_read_length(self):
-            return self.__get_attribute('READ_LENGTH')
-        read_length = property(_get_read_length)
-
-        def _get_use_bases(self):
-            return self.__get_attribute('USE_BASES')
-        use_bases = property(_get_use_bases)
-
-    class LaneSpecificRunParameters(object):
-        """
-        Provide access to LaneSpecificRunParameters
-        """
-        def __init__(self, tree):
-            self._tree = tree
-            self._keys = None
-        def __getitem__(self, key):
-            return GERALD.LaneParameters(self._tree, key)
-        def keys(self):
-            if self._keys is None:
-                analysis = self._tree.find('LaneSpecificRunParameters/ANALYSIS')
-                # according to the pipeline specs I think their fields 
-                # are sampleName_laneID, with sampleName defaulting to s
-                # since laneIDs are constant lets just try using 
-                # those consistently.
-                self._keys = [ x.tag.split('_')[1] for x in analysis]
-            return self._keys
-        def values(self):
-            return [ self[x] for x in self.keys() ]
-        def items(self):
-            return zip(self.keys(), self.values())
-        def __len__(self):
-            return len(self.keys)
-
-    def __init__(self, pathname):
-        self.pathname = pathname
-        path, name = os.path.split(pathname)
-        config_pathname = os.path.join(self.pathname, 'config.xml')
-        self.tree = ElementTree.parse(config_pathname).getroot()
-        self.version = self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
-
-        date = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
-        self.date = time.strptime(date)
-        self.time = time.mktime(self.date)
-        
-        # parse Summary.htm file
-        summary_pathname = os.path.join(self.pathname, 'Summary.htm')
-        self.summary = Summary(summary_pathname)
-        self.lanes = GERALD.LaneSpecificRunParameters(self.tree)
-        self.eland_results = ELAND(self, self.pathname)
-
-    def dump(self):
-        """
-        Debugging function, report current object
-        """
-        print 'Gerald version:', self.version
-        print 'Gerald run date:', self.date
-        print 'Gerald config.xml:', self.tree
-        self.summary.dump()
-
-    def _get_elements(self):
-        gerald = ElementTree.Element('Gerald')
-        gerald.append(self.tree)
-        gerald.append(self.summary.elements)
-        return gerald
-    elements = property(_get_elements)
-
-def tonumber(v):
-    """
-    Convert a value to int if its an int otherwise a float.
-    """
-    try:
-        v = int(v)
-    except ValueError, e:
-        v = float(v)
-    return v
-
-def parse_mean_range(value):
-    """
-    Parse values like 123 +/- 4.5
-    """
-    if value.strip() == 'unknown':
-       return 0, 0
-
-    average, pm, deviation = value.split()
-    if pm != '+/-':
-        raise RuntimeError("Summary.htm file format changed")
-    return tonumber(average), tonumber(deviation)
-
-def mean_range_element(parent, name, mean, deviation):
-    """
-    Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
-    """
-    element = ElementTree.SubElement(parent, name,
-                                     { 'mean': str(mean),
-                                       'deviation': str(deviation)})
-    return element
-
-class LaneResultSummary(object):
-    """
-    Parse the LaneResultSummary table out of Summary.htm
-    Mostly for the cluster number
-    """
-    def __init__(self, row_element):
-        data = [ flatten(x) for x in row_element ]
-        if len(data) != 8:
-            raise RuntimeError("Summary.htm file format changed")
-
-        self.lane = data[0]
-        self.cluster = parse_mean_range(data[1])
-        self.average_first_cycle_intensity = parse_mean_range(data[2])
-        self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
-        self.percent_pass_filter_clusters = parse_mean_range(data[4])
-        self.percent_pass_filter_align = parse_mean_range(data[5])
-        self.average_alignment_score = parse_mean_range(data[6])
-        self.percent_error_rate = parse_mean_range(data[7])
-
-    def _get_elements(self):
-        lane_result = ElementTree.Element('LaneResultSummary', 
-                                          {'lane': self.lane})
-        cluster = mean_range_element(lane_result, 'Cluster', *self.cluster)
-        first_cycle = mean_range_element(lane_result, 
-                                         'AverageFirstCycleIntensity',
-                                         *self.average_first_cycle_intensity)
-        after_20 = mean_range_element(lane_result,
-                                      'PercentIntensityAfter20Cycles',
-                                      *self.percent_intensity_after_20_cycles)
-        pass_filter = mean_range_element(lane_result,
-                                         'PercentPassFilterClusters',
-                                         *self.percent_pass_filter_clusters)
-        alignment = mean_range_element(lane_result,
-                                       'AverageAlignmentScore',
-                                       *self.average_alignment_score)
-        error_rate = mean_range_element(lane_result,
-                                        'PercentErrorRate',
-                                        *self.percent_error_rate)
-        return lane_result
-    elements = property(_get_elements)
-
-class Summary(object):
-    """
-    Extract some useful information from the Summary.htm file
-    """
-    def __init__(self, pathname):
-        self.pathname = pathname
-        self.tree = ElementTree.parse(pathname).getroot()
-        self.lane_results = {}
-
-        self._extract_lane_results()
-
-    def _extract_lane_results(self):
-        """
-        extract the Lane Results Summary table
-        """
-        if flatten(self.tree.findall('*//h2')[3]) != 'Lane Results Summary':
-            raise RuntimeError("Summary.htm file format changed")
-
-        tables = self.tree.findall('*//table')
-
-        # parse lane result summary
-        lane_summary = tables[2]
-        rows = lane_summary.getchildren()
-        headers = rows[0].getchildren()
-        if flatten(headers[2]) != 'Av 1st Cycle Int ':
-            raise RuntimeError("Summary.htm file format changed")
-
-        for r in rows[1:]:
-            lrs = LaneResultSummary(r)
-            self.lane_results[lrs.lane] = lrs
-
-    def _get_elements(self):
-        summary = ElementTree.Element('Summary')
-        for lane in self.lane_results.values():
-            summary.append(lane.elements)
-        return summary
-    elements = property(_get_elements)
-
-    def dump(self):
-        """
-        Debugging function, report current object
-        """
-        pass
-
-    
-class ELAND(object):
-    """
-    Summarize information from eland files
-    """
-    class ElandResult(object):
-        """
-        Process an eland result file
-        """
-        def __init__(self, gerald, pathname):
-            self.gerald = gerald
-            self.pathname = pathname
-            # extract the sample name
-            path, name = os.path.split(self.pathname)
-            split_name = name.split('_')
-            self.sample_name = split_name[0]
-            self.lane_id = split_name[1]
-            self._reads = None
-            self._mapped_reads = None
-            self._fasta_map = {}
-
-        def _build_fasta_map(self, genome_dir):
-            # build fasta to fasta file map
-            genome = genome_dir.split(os.path.sep)[-1]
-            fasta_map = {}
-            for vld_file in glob(os.path.join(genome_dir, '*.vld')):
-                is_link = False
-                if os.path.islink(vld_file):
-                    is_link = True
-                vld_file = os.path.realpath(vld_file)
-               path, vld_name = os.path.split(vld_file)
-                name, ext = os.path.splitext(vld_name)
-                if is_link:
-                    fasta_map[name] = name
-                else:
-                    fasta_map[name] = os.path.join(genome, name)
-            self._fasta_map = fasta_map
-
-        def _update(self):
-            """
-            Actually read the file and actually count the reads
-            """
-            if os.stat(self.pathname)[stat.ST_SIZE] == 0:
-                raise RuntimeError("Eland isn't done, try again later.")
-            
-            reads = 0
-            mapped_reads = {}
-            genome_dir = self.gerald.lanes[self.lane_id].eland_genome
-            self._build_fasta_map(genome_dir)
-            match_codes = {'NM':0, 'QC':0, 'RM':0, 
-                           'U0':0, 'U1':0, 'U2':0,
-                           'R0':0, 'R1':0, 'R2':0,
-                          }
-            for line in open(self.pathname):
-                reads += 1
-                fields = line.split()
-               # code = fields[2]
-                # match_codes[code] = match_codes.setdefault(code, 0) + 1
-               # the QC/NM etc codes are in the 3rd field and always present
-                match_codes[fields[2]] += 1
-                # ignore lines that don't have a fasta filename
-                if len(fields) < 7:
-                    continue
-                fasta = self._fasta_map.get(fields[6], fields[6])
-                mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
-            self._match_codes = match_codes
-            self._mapped_reads = mapped_reads
-            self._reads = reads
-        
-        def _get_reads(self):
-            if self._reads is None:
-                self._update()
-            return self._reads
-        reads = property(_get_reads)
-
-        def _get_mapped_reads(self):
-            if self._mapped_reads is None:
-                self._update()
-            return self._mapped_reads
-        mapped_reads = property(_get_mapped_reads)
-            
-    def __init__(self, gerald, basedir):
-        # we need information from the gerald config.xml
-        self.gerald = gerald
-        self.results = {}
-        for f in glob(os.path.join(basedir, "*_eland_result.txt")):
-            eland_result = ELAND.ElandResult(gerald, f)
-            self.results[eland_result.lane_id] = eland_result
-
-class PipelineRun(object):
-    """
-    Capture "interesting" information about a pipeline run
-    """
-    def __init__(self, pathname, firecrest, bustard, gerald):
-        self.pathname = pathname
-        self._name = None
-        self._flowcell_id = None
-        self.firecrest = firecrest
-        self.bustard = bustard
-        self.gerald = gerald
-    
-    def _get_flowcell_id(self):
-        # extract flowcell ID
-        if self._flowcell_id is None:
-          config_dir = os.path.join(self.pathname, 'Config')
-          flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
-         if os.path.exists(flowcell_id_path):
-            flowcell_id_tree = ElementTree.parse(flowcell_id_path)
-            self._flowcell_id = flowcell_id_tree.findtext('Text')
-         else:
-           logging.warning(
-             "Unable to determine flowcell id as %s was not found" % (
-                flowcell_id_path))
-           self._flowcell_id = "unknown"
-        return self._flowcell_id
-    flowcell_id = property(_get_flowcell_id)
-
-    def _get_xml(self):
-        """
-        make one master xml file from all of our sub-components.
-        """
-        root = ElementTree.Element('PipelineRun')
-        flowcell = ElementTree.SubElement(root, 'FlowcellID')
-        flowcell.text = self.flowcell_id
-        root.append(self.firecrest.elements)
-        root.append(self.bustard.elements)
-        root.append(self.gerald.elements)
-        return root
-
-    def _get_pretty_xml(self):
-        """
-        Generate indented xml file
-        """
-        root = self._get_xml()
-        indent(root)
-        return root
-    xml = property(_get_pretty_xml)
-
-    def _get_run_name(self):
-        """
-        Given a run tuple, find the latest date and use that as our name
-        """
-        if self._name is None:
-          tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
-          timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
-          self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
-        return self._name
-    name = property(_get_run_name)
-
-    def save(self):
-        logging.info("Saving run report "+ self.name)
-        ElementTree.ElementTree(self.xml).write(self.name)
-
-def get_runs(runfolder):
-    """
-    Search through a run folder for all the various sub component runs
-    and then return a PipelineRun for each different combination.
-
-    For example if there are two different GERALD runs, this will
-    generate two different PipelineRun objects, that differ
-    in there gerald component.
-    """
-    datadir = os.path.join(runfolder, 'Data')
-
-    logging.info('Searching for runs in ' + datadir)
-    runs = []
-    for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
-        f = Firecrest(firecrest_pathname)
-        bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
-        for bustard_pathname in glob(bustard_glob):
-            b = Bustard(bustard_pathname)
-            gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
-            for gerald_pathname in glob(gerald_glob):
-                try:
-                    g = GERALD(gerald_pathname)
-                    runs.append(PipelineRun(runfolder, f, b, g))
-                except IOError, e:
-                    print "Ignoring", str(e)
-    return runs
-                
-    
-def extract_run_parameters(runs):
-    """
-    Search through runfolder_path for various runs and grab their parameters
-    """
-    for run in runs:
-      run.save()
-
-def summarize_mapped_reads(mapped_reads):
-    """
-    Summarize per chromosome reads into a genome count
-    But handle spike-in/contamination symlinks seperately.
-    """
-    summarized_reads = {}
-    genome_reads = 0
-    genome = 'unknown'
-    for k, v in mapped_reads.items():
-        path, k = os.path.split(k)
-        if len(path) > 0:
-            genome = path
-            genome_reads += v
-        else:
-            summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
-    summarized_reads[genome] = genome_reads
-    return summarized_reads
-
-def summary_report(runs):
-    """
-    Summarize cluster numbers and mapped read counts for a runfolder
-    """
-    report = []
-    for run in runs:
-        # print a run name?
-        report.append('Summary for %s' % (run.name,))
-       # sort the report
-       eland_keys = run.gerald.eland_results.results.keys()
-       eland_keys.sort(alphanum)
-
-        lane_results = run.gerald.summary.lane_results
-       for lane_id in eland_keys:
-           result = run.gerald.eland_results.results[lane_id]
-            report.append("Sample name %s" % (result.sample_name))
-            report.append("Lane id %s" % (result.lane_id,))
-            cluster = lane_results[result.lane_id].cluster
-            report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
-            report.append("Total Reads: %d" % (result.reads))
-            mc = result._match_codes
-           report.append("No Match: %d" % (mc['NM']))
-           report.append("QC Failed: %d" % (mc['QC']))
-            report.append('Unique (0,1,2 mismatches) %d %d %d' % \
-                          (mc['U0'], mc['U1'], mc['U2']))
-            report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
-                          (mc['R0'], mc['R1'], mc['R2']))
-            report.append("Mapped Reads")
-            mapped_reads = summarize_mapped_reads(result.mapped_reads)
-            for name, counts in mapped_reads.items():
-              report.append("  %s: %d" % (name, counts))
-            report.append('---')
-            report.append('')
-        return os.linesep.join(report)
-
-def make_parser():
-    usage = 'usage: %prog [options] runfolder_root_dir'
-    parser = optparse.OptionParser(usage)
-    parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
-                      default=False,
-                      help='turn on verbose mode')
-    parser.add_option('-s', '--summary', dest='summary', action='store_true',
-                      default=False,
-                      help='produce summary report')
-    parser.add_option('-a', '--archive', dest='archive', action='store_true',
-                      default=False,
-                      help='generate run configuration archive')
-    return parser
-
-def main(cmdlist=None):
-    parser = make_parser()
-    opt, args = parser.parse_args(cmdlist)
-
-    if len(args) == 0:
-        parser.error('need path to a runfolder')
-    
-    logging.basicConfig()
-    if opt.verbose:
-        root_log = logging.getLogger()
-        root_log.setLevel(logging.INFO)
-
-    for runfolder in args:
-        runs = get_runs(runfolder)
-        if opt.summary:
-            print summary_report(runs)
-        if opt.archive:
-            extract_run_parameters(runs)
-
-    return 0
-
-if __name__ == "__main__":
-  sys.exit(main(sys.argv[1:]))
author	Diane Trout <diane@caltech.edu>
	Fri, 9 May 2008 03:21:16 +0000 (03:21 +0000)
committer	Diane Trout <diane@caltech.edu>
	Fri, 9 May 2008 03:21:16 +0000 (03:21 +0000)
gaworkflow/pipeline/bustard.py	[new file with mode: 0644]	patch \| blob
gaworkflow/pipeline/firecrest.py	[new file with mode: 0644]	patch \| blob
gaworkflow/pipeline/gerald.py	[new file with mode: 0644]	patch \| blob
gaworkflow/pipeline/runfolder.py	[new file with mode: 0644]	patch \| blob
gaworkflow/pipeline/test/test_runfolder.py	[new file with mode: 0644]	patch \| blob
gaworkflow/util/ethelp.py	[new file with mode: 0644]	patch \| blob
gaworkflow/util/test/test_ethelp.py	[new file with mode: 0644]	patch \| blob
scripts/runfolder	[new file with mode: 0644]	patch \| blob
scripts/runfolder.py	[deleted file]	patch \| blob \| history