From 439054b7acd1620616d20cbc20dfa5c243f1eb0b Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Thu, 30 Oct 2008 22:28:01 +0000 Subject: [PATCH] The htsworkflow.pipelines.gerald module was getting to large so I broke the portion that analyzed the Summary.htm file and the eland_result files into seperate modules in anticipation of extending the eland code to handle some of the newer eland result file types. --- htsworkflow/pipelines/eland.py | 291 ++++++++++++++++ htsworkflow/pipelines/gerald.py | 549 +------------------------------ htsworkflow/pipelines/summary.py | 272 +++++++++++++++ scripts/elandseq | 2 +- scripts/rerun_eland.py | 3 +- 5 files changed, 569 insertions(+), 548 deletions(-) create mode 100644 htsworkflow/pipelines/eland.py create mode 100644 htsworkflow/pipelines/summary.py diff --git a/htsworkflow/pipelines/eland.py b/htsworkflow/pipelines/eland.py new file mode 100644 index 0000000..43062e1 --- /dev/null +++ b/htsworkflow/pipelines/eland.py @@ -0,0 +1,291 @@ +""" +Analyze ELAND files +""" + +from glob import glob +import logging +import os +import stat + +from htsworkflow.pipelines.runfolder import ElementTree +from htsworkflow.util.ethelp import indent, flatten +from htsworkflow.util.opener import autoopen + +class ElandLane(object): + """ + Process an eland result file + """ + XML_VERSION = 1 + LANE = 'ElandLane' + SAMPLE_NAME = 'SampleName' + LANE_ID = 'LaneID' + GENOME_MAP = 'GenomeMap' + GENOME_ITEM = 'GenomeItem' + MAPPED_READS = 'MappedReads' + MAPPED_ITEM = 'MappedItem' + MATCH_CODES = 'MatchCodes' + MATCH_ITEM = 'Code' + READS = 'Reads' + + def __init__(self, pathname=None, genome_map=None, xml=None): + self.pathname = pathname + self._sample_name = None + self._lane_id = None + self._reads = None + self._mapped_reads = None + self._match_codes = None + if genome_map is None: + genome_map = {} + self.genome_map = genome_map + + if xml is not None: + self.set_elements(xml) + + def _update(self): + """ + Actually read the file and actually count the reads + """ + # can't do anything if we don't have a file to process + if self.pathname is None: + return + + if os.stat(self.pathname)[stat.ST_SIZE] == 0: + raise RuntimeError("Eland isn't done, try again later.") + + logging.info("summarizing results for %s" % (self.pathname)) + reads = 0 + mapped_reads = {} + + match_codes = {'NM':0, 'QC':0, 'RM':0, + 'U0':0, 'U1':0, 'U2':0, + 'R0':0, 'R1':0, 'R2':0, + } + for line in autoopen(self.pathname,'r'): + reads += 1 + fields = line.split() + # code = fields[2] + # match_codes[code] = match_codes.setdefault(code, 0) + 1 + # the QC/NM etc codes are in the 3rd field and always present + match_codes[fields[2]] += 1 + # ignore lines that don't have a fasta filename + if len(fields) < 7: + continue + fasta = self.genome_map.get(fields[6], fields[6]) + mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1 + self._match_codes = match_codes + self._mapped_reads = mapped_reads + self._reads = reads + + def _update_name(self): + # extract the sample name + if self.pathname is None: + return + + path, name = os.path.split(self.pathname) + split_name = name.split('_') + self._sample_name = split_name[0] + self._lane_id = split_name[1] + + def _get_sample_name(self): + if self._sample_name is None: + self._update_name() + return self._sample_name + sample_name = property(_get_sample_name) + + def _get_lane_id(self): + if self._lane_id is None: + self._update_name() + return self._lane_id + lane_id = property(_get_lane_id) + + def _get_reads(self): + if self._reads is None: + self._update() + return self._reads + reads = property(_get_reads) + + def _get_mapped_reads(self): + if self._mapped_reads is None: + self._update() + return self._mapped_reads + mapped_reads = property(_get_mapped_reads) + + def _get_match_codes(self): + if self._match_codes is None: + self._update() + return self._match_codes + match_codes = property(_get_match_codes) + + def get_elements(self): + lane = ElementTree.Element(ElandLane.LANE, + {'version': + unicode(ElandLane.XML_VERSION)}) + sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME) + sample_tag.text = self.sample_name + lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID) + lane_tag.text = self.lane_id + genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP) + for k, v in self.genome_map.items(): + item = ElementTree.SubElement( + genome_map, ElandLane.GENOME_ITEM, + {'name':k, 'value':unicode(v)}) + mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS) + for k, v in self.mapped_reads.items(): + item = ElementTree.SubElement( + mapped_reads, ElandLane.MAPPED_ITEM, + {'name':k, 'value':unicode(v)}) + match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES) + for k, v in self.match_codes.items(): + item = ElementTree.SubElement( + match_codes, ElandLane.MATCH_ITEM, + {'name':k, 'value':unicode(v)}) + reads = ElementTree.SubElement(lane, ElandLane.READS) + reads.text = unicode(self.reads) + + return lane + + def set_elements(self, tree): + if tree.tag != ElandLane.LANE: + raise ValueError('Exptecting %s' % (ElandLane.LANE,)) + + # reset dictionaries + self._mapped_reads = {} + self._match_codes = {} + + for element in tree: + tag = element.tag.lower() + if tag == ElandLane.SAMPLE_NAME.lower(): + self._sample_name = element.text + elif tag == ElandLane.LANE_ID.lower(): + self._lane_id = element.text + elif tag == ElandLane.GENOME_MAP.lower(): + for child in element: + name = child.attrib['name'] + value = child.attrib['value'] + self.genome_map[name] = value + elif tag == ElandLane.MAPPED_READS.lower(): + for child in element: + name = child.attrib['name'] + value = child.attrib['value'] + self._mapped_reads[name] = int(value) + elif tag == ElandLane.MATCH_CODES.lower(): + for child in element: + name = child.attrib['name'] + value = int(child.attrib['value']) + self._match_codes[name] = value + elif tag == ElandLane.READS.lower(): + self._reads = int(element.text) + else: + logging.warn("ElandLane unrecognized tag %s" % (element.tag,)) + +class ELAND(object): + """ + Summarize information from eland files + """ + XML_VERSION = 1 + + ELAND = 'ElandCollection' + LANE = 'Lane' + LANE_ID = 'id' + + def __init__(self, xml=None): + # we need information from the gerald config.xml + self.results = {} + + if xml is not None: + self.set_elements(xml) + + def __len__(self): + return len(self.results) + + def keys(self): + return self.results.keys() + + def values(self): + return self.results.values() + + def items(self): + return self.results.items() + + def __getitem__(self, key): + return self.results[key] + + def get_elements(self): + root = ElementTree.Element(ELAND.ELAND, + {'version': unicode(ELAND.XML_VERSION)}) + for lane_id, lane in self.results.items(): + eland_lane = lane.get_elements() + eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id) + root.append(eland_lane) + return root + + def set_elements(self, tree): + if tree.tag.lower() != ELAND.ELAND.lower(): + raise ValueError('Expecting %s', ELAND.ELAND) + for element in list(tree): + lane_id = element.attrib[ELAND.LANE_ID] + lane = ElandLane(xml=element) + self.results[lane_id] = lane + +def eland(basedir, gerald=None, genome_maps=None): + e = ELAND() + + file_list = glob(os.path.join(basedir, "*_eland_result.txt")) + if len(file_list) == 0: + # lets handle compressed eland files too + file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2")) + + for pathname in file_list: + # yes the lane_id is also being computed in ElandLane._update + # I didn't want to clutter up my constructor + # but I needed to persist the sample_name/lane_id for + # runfolder summary_report + path, name = os.path.split(pathname) + logging.info("Adding eland file %s" %(name,)) + split_name = name.split('_') + lane_id = split_name[1] + + if genome_maps is not None: + genome_map = genome_maps[lane_id] + elif gerald is not None: + genome_dir = gerald.lanes[lane_id].eland_genome + genome_map = build_genome_fasta_map(genome_dir) + else: + genome_map = {} + + eland_result = ElandLane(pathname, genome_map) + e.results[lane_id] = eland_result + return e + +def build_genome_fasta_map(genome_dir): + # build fasta to fasta file map + logging.info("Building genome map") + genome = genome_dir.split(os.path.sep)[-1] + fasta_map = {} + for vld_file in glob(os.path.join(genome_dir, '*.vld')): + is_link = False + if os.path.islink(vld_file): + is_link = True + vld_file = os.path.realpath(vld_file) + path, vld_name = os.path.split(vld_file) + name, ext = os.path.splitext(vld_name) + if is_link: + fasta_map[name] = name + else: + fasta_map[name] = os.path.join(genome, name) + return fasta_map + + +def extract_eland_sequence(instream, outstream, start, end): + """ + Extract a chunk of sequence out of an eland file + """ + for line in instream: + record = line.split() + if len(record) > 1: + result = [record[0], record[1][start:end]] + else: + result = [record[0][start:end]] + outstream.write("\t".join(result)) + outstream.write(os.linesep) + diff --git a/htsworkflow/pipelines/gerald.py b/htsworkflow/pipelines/gerald.py index 59608da..34f4f30 100644 --- a/htsworkflow/pipelines/gerald.py +++ b/htsworkflow/pipelines/gerald.py @@ -2,12 +2,12 @@ Provide access to information stored in the GERALD directory. """ from datetime import datetime, date -from glob import glob import logging import os -import stat import time -import types + +from htsworkflow.pipelines.summary import Summary +from htsworkflow.pipelines.eland import eland from htsworkflow.pipelines.runfolder import \ ElementTree, \ @@ -15,7 +15,6 @@ from htsworkflow.pipelines.runfolder import \ LANES_PER_FLOWCELL, \ VERSION_RE from htsworkflow.util.ethelp import indent, flatten -from htsworkflow.util.opener import autoopen class Gerald(object): """ @@ -180,545 +179,3 @@ def gerald(pathname): g.eland_results = eland(g.pathname, g) return g -def tonumber(v): - """ - Convert a value to int if its an int otherwise a float. - """ - try: - v = int(v) - except ValueError, e: - v = float(v) - return v - -def parse_mean_range(value): - """ - Parse values like 123 +/- 4.5 - """ - if value.strip() == 'unknown': - return 0, 0 - - average, pm, deviation = value.split() - if pm != '+/-': - raise RuntimeError("Summary.htm file format changed") - return tonumber(average), tonumber(deviation) - -def make_mean_range_element(parent, name, mean, deviation): - """ - Make an ElementTree subelement - """ - element = ElementTree.SubElement(parent, name, - { 'mean': unicode(mean), - 'deviation': unicode(deviation)}) - return element - -def parse_mean_range_element(element): - """ - Grab mean/deviation out of element - """ - return (tonumber(element.attrib['mean']), - tonumber(element.attrib['deviation'])) - -def parse_summary_element(element): - """ - Determine if we have a simple element or a mean/deviation element - """ - if len(element.attrib) > 0: - return parse_mean_range_element(element) - else: - return element.text - -class Summary(object): - """ - Extract some useful information from the Summary.htm file - """ - XML_VERSION = 2 - SUMMARY = 'Summary' - - class LaneResultSummary(object): - """ - Parse the LaneResultSummary table out of Summary.htm - Mostly for the cluster number - """ - LANE_RESULT_SUMMARY = 'LaneResultSummary' - TAGS = { - 'LaneYield': 'lane_yield', - 'Cluster': 'cluster', # Raw - 'ClusterPF': 'cluster_pass_filter', - 'AverageFirstCycleIntensity': 'average_first_cycle_intensity', - 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles', - 'PercentPassFilterClusters': 'percent_pass_filter_clusters', - 'PercentPassFilterAlign': 'percent_pass_filter_align', - 'AverageAlignmentScore': 'average_alignment_score', - 'PercentErrorRate': 'percent_error_rate' - } - - def __init__(self, html=None, xml=None): - self.lane = None - self.lane_yield = None - self.cluster = None - self.cluster_pass_filter = None - self.average_first_cycle_intensity = None - self.percent_intensity_after_20_cycles = None - self.percent_pass_filter_clusters = None - self.percent_pass_filter_align = None - self.average_alignment_score = None - self.percent_error_rate = None - - if html is not None: - self.set_elements_from_html(html) - if xml is not None: - self.set_elements(xml) - - def set_elements_from_html(self, data): - if not len(data) in (8,10): - raise RuntimeError("Summary.htm file format changed") - - # same in pre-0.3.0 Summary file and 0.3 summary file - self.lane = data[0] - - if len(data) == 8: - parsed_data = [ parse_mean_range(x) for x in data[1:] ] - # this is the < 0.3 Pipeline version - self.cluster = parsed_data[0] - self.average_first_cycle_intensity = parsed_data[1] - self.percent_intensity_after_20_cycles = parsed_data[2] - self.percent_pass_filter_clusters = parsed_data[3] - self.percent_pass_filter_align = parsed_data[4] - self.average_alignment_score = parsed_data[5] - self.percent_error_rate = parsed_data[6] - elif len(data) == 10: - parsed_data = [ parse_mean_range(x) for x in data[2:] ] - # this is the >= 0.3 summary file - self.lane_yield = data[1] - self.cluster = parsed_data[0] - self.cluster_pass_filter = parsed_data[1] - self.average_first_cycle_intensity = parsed_data[2] - self.percent_intensity_after_20_cycles = parsed_data[3] - self.percent_pass_filter_clusters = parsed_data[4] - self.percent_pass_filter_align = parsed_data[5] - self.average_alignment_score = parsed_data[6] - self.percent_error_rate = parsed_data[7] - - def get_elements(self): - lane_result = ElementTree.Element( - Summary.LaneResultSummary.LANE_RESULT_SUMMARY, - {'lane': self.lane}) - for tag, variable_name in Summary.LaneResultSummary.TAGS.items(): - value = getattr(self, variable_name) - if value is None: - continue - # it looks like a sequence - elif type(value) in (types.TupleType, types.ListType): - element = make_mean_range_element( - lane_result, - tag, - *value - ) - else: - element = ElementTree.SubElement(lane_result, tag) - element.text = value - return lane_result - - def set_elements(self, tree): - if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY: - raise ValueError('Expected %s' % ( - Summary.LaneResultSummary.LANE_RESULT_SUMMARY)) - self.lane = tree.attrib['lane'] - tags = Summary.LaneResultSummary.TAGS - for element in list(tree): - try: - variable_name = tags[element.tag] - setattr(self, variable_name, - parse_summary_element(element)) - except KeyError, e: - logging.warn('Unrecognized tag %s' % (element.tag,)) - - def __init__(self, filename=None, xml=None): - self.lane_results = {} - - if filename is not None: - self._extract_lane_results(filename) - if xml is not None: - self.set_elements(xml) - - def __getitem__(self, key): - return self.lane_results[key] - - def __len__(self): - return len(self.lane_results) - - def keys(self): - return self.lane_results.keys() - - def values(self): - return self.lane_results.values() - - def items(self): - return self.lane_results.items() - - def _flattened_row(self, row): - """ - flatten the children of a ... - """ - return [flatten(x) for x in row.getchildren() ] - - def _parse_table(self, table): - """ - assumes the first line is the header of a table, - and that the remaining rows are data - """ - rows = table.getchildren() - data = [] - for r in rows: - data.append(self._flattened_row(r)) - return data - - def _extract_named_tables(self, pathname): - """ - extract all the 'named' tables from a Summary.htm file - and return as a dictionary - - Named tables are

...

...
pairs - The contents of the h2 tag is considered to the name - of the table. - """ - tree = ElementTree.parse(pathname).getroot() - body = tree.find('body') - tables = {} - for i in range(len(body)): - if body[i].tag == 'h2' and body[i+1].tag == 'table': - # we have an interesting table - name = flatten(body[i]) - table = body[i+1] - data = self._parse_table(table) - tables[name] = data - return tables - - def _extract_lane_results(self, pathname): - """ - extract the Lane Results Summary table - """ - - tables = self._extract_named_tables(pathname) - - # parse lane result summary - lane_summary = tables['Lane Results Summary'] - # this is version 1 of the summary file - if len(lane_summary[-1]) == 8: - # strip header - headers = lane_summary[0] - # grab the lane by lane data - lane_summary = lane_summary[1:] - - # this is version 2 of the summary file - if len(lane_summary[-1]) == 10: - # lane_summary[0] is a different less specific header row - headers = lane_summary[1] - lane_summary = lane_summary[2:10] - # after the last lane, there's a set of chip wide averages - - for r in lane_summary: - lrs = Summary.LaneResultSummary(html=r) - self.lane_results[lrs.lane] = lrs - - def get_elements(self): - summary = ElementTree.Element(Summary.SUMMARY, - {'version': unicode(Summary.XML_VERSION)}) - for lane in self.lane_results.values(): - summary.append(lane.get_elements()) - return summary - - def set_elements(self, tree): - if tree.tag != Summary.SUMMARY: - return ValueError("Expected %s" % (Summary.SUMMARY,)) - xml_version = int(tree.attrib.get('version', 0)) - if xml_version > Summary.XML_VERSION: - logging.warn('Summary XML tree is a higher version than this class') - for element in list(tree): - lrs = Summary.LaneResultSummary() - lrs.set_elements(element) - self.lane_results[lrs.lane] = lrs - - def dump(self): - """ - Debugging function, report current object - """ - pass - - -def build_genome_fasta_map(genome_dir): - # build fasta to fasta file map - logging.info("Building genome map") - genome = genome_dir.split(os.path.sep)[-1] - fasta_map = {} - for vld_file in glob(os.path.join(genome_dir, '*.vld')): - is_link = False - if os.path.islink(vld_file): - is_link = True - vld_file = os.path.realpath(vld_file) - path, vld_name = os.path.split(vld_file) - name, ext = os.path.splitext(vld_name) - if is_link: - fasta_map[name] = name - else: - fasta_map[name] = os.path.join(genome, name) - return fasta_map - -class ElandLane(object): - """ - Process an eland result file - """ - XML_VERSION = 1 - LANE = 'ElandLane' - SAMPLE_NAME = 'SampleName' - LANE_ID = 'LaneID' - GENOME_MAP = 'GenomeMap' - GENOME_ITEM = 'GenomeItem' - MAPPED_READS = 'MappedReads' - MAPPED_ITEM = 'MappedItem' - MATCH_CODES = 'MatchCodes' - MATCH_ITEM = 'Code' - READS = 'Reads' - - def __init__(self, pathname=None, genome_map=None, xml=None): - self.pathname = pathname - self._sample_name = None - self._lane_id = None - self._reads = None - self._mapped_reads = None - self._match_codes = None - if genome_map is None: - genome_map = {} - self.genome_map = genome_map - - if xml is not None: - self.set_elements(xml) - - def _update(self): - """ - Actually read the file and actually count the reads - """ - # can't do anything if we don't have a file to process - if self.pathname is None: - return - - if os.stat(self.pathname)[stat.ST_SIZE] == 0: - raise RuntimeError("Eland isn't done, try again later.") - - logging.info("summarizing results for %s" % (self.pathname)) - reads = 0 - mapped_reads = {} - - match_codes = {'NM':0, 'QC':0, 'RM':0, - 'U0':0, 'U1':0, 'U2':0, - 'R0':0, 'R1':0, 'R2':0, - } - for line in autoopen(self.pathname,'r'): - reads += 1 - fields = line.split() - # code = fields[2] - # match_codes[code] = match_codes.setdefault(code, 0) + 1 - # the QC/NM etc codes are in the 3rd field and always present - match_codes[fields[2]] += 1 - # ignore lines that don't have a fasta filename - if len(fields) < 7: - continue - fasta = self.genome_map.get(fields[6], fields[6]) - mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1 - self._match_codes = match_codes - self._mapped_reads = mapped_reads - self._reads = reads - - def _update_name(self): - # extract the sample name - if self.pathname is None: - return - - path, name = os.path.split(self.pathname) - split_name = name.split('_') - self._sample_name = split_name[0] - self._lane_id = split_name[1] - - def _get_sample_name(self): - if self._sample_name is None: - self._update_name() - return self._sample_name - sample_name = property(_get_sample_name) - - def _get_lane_id(self): - if self._lane_id is None: - self._update_name() - return self._lane_id - lane_id = property(_get_lane_id) - - def _get_reads(self): - if self._reads is None: - self._update() - return self._reads - reads = property(_get_reads) - - def _get_mapped_reads(self): - if self._mapped_reads is None: - self._update() - return self._mapped_reads - mapped_reads = property(_get_mapped_reads) - - def _get_match_codes(self): - if self._match_codes is None: - self._update() - return self._match_codes - match_codes = property(_get_match_codes) - - def get_elements(self): - lane = ElementTree.Element(ElandLane.LANE, - {'version': - unicode(ElandLane.XML_VERSION)}) - sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME) - sample_tag.text = self.sample_name - lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID) - lane_tag.text = self.lane_id - genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP) - for k, v in self.genome_map.items(): - item = ElementTree.SubElement( - genome_map, ElandLane.GENOME_ITEM, - {'name':k, 'value':unicode(v)}) - mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS) - for k, v in self.mapped_reads.items(): - item = ElementTree.SubElement( - mapped_reads, ElandLane.MAPPED_ITEM, - {'name':k, 'value':unicode(v)}) - match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES) - for k, v in self.match_codes.items(): - item = ElementTree.SubElement( - match_codes, ElandLane.MATCH_ITEM, - {'name':k, 'value':unicode(v)}) - reads = ElementTree.SubElement(lane, ElandLane.READS) - reads.text = unicode(self.reads) - - return lane - - def set_elements(self, tree): - if tree.tag != ElandLane.LANE: - raise ValueError('Exptecting %s' % (ElandLane.LANE,)) - - # reset dictionaries - self._mapped_reads = {} - self._match_codes = {} - - for element in tree: - tag = element.tag.lower() - if tag == ElandLane.SAMPLE_NAME.lower(): - self._sample_name = element.text - elif tag == ElandLane.LANE_ID.lower(): - self._lane_id = element.text - elif tag == ElandLane.GENOME_MAP.lower(): - for child in element: - name = child.attrib['name'] - value = child.attrib['value'] - self.genome_map[name] = value - elif tag == ElandLane.MAPPED_READS.lower(): - for child in element: - name = child.attrib['name'] - value = child.attrib['value'] - self._mapped_reads[name] = int(value) - elif tag == ElandLane.MATCH_CODES.lower(): - for child in element: - name = child.attrib['name'] - value = int(child.attrib['value']) - self._match_codes[name] = value - elif tag == ElandLane.READS.lower(): - self._reads = int(element.text) - else: - logging.warn("ElandLane unrecognized tag %s" % (element.tag,)) - -def extract_eland_sequence(instream, outstream, start, end): - """ - Extract a chunk of sequence out of an eland file - """ - for line in instream: - record = line.split() - if len(record) > 1: - result = [record[0], record[1][start:end]] - else: - result = [record[0][start:end]] - outstream.write("\t".join(result)) - outstream.write(os.linesep) - -class ELAND(object): - """ - Summarize information from eland files - """ - XML_VERSION = 1 - - ELAND = 'ElandCollection' - LANE = 'Lane' - LANE_ID = 'id' - - def __init__(self, xml=None): - # we need information from the gerald config.xml - self.results = {} - - if xml is not None: - self.set_elements(xml) - - def __len__(self): - return len(self.results) - - def keys(self): - return self.results.keys() - - def values(self): - return self.results.values() - - def items(self): - return self.results.items() - - def __getitem__(self, key): - return self.results[key] - - def get_elements(self): - root = ElementTree.Element(ELAND.ELAND, - {'version': unicode(ELAND.XML_VERSION)}) - for lane_id, lane in self.results.items(): - eland_lane = lane.get_elements() - eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id) - root.append(eland_lane) - return root - - def set_elements(self, tree): - if tree.tag.lower() != ELAND.ELAND.lower(): - raise ValueError('Expecting %s', ELAND.ELAND) - for element in list(tree): - lane_id = element.attrib[ELAND.LANE_ID] - lane = ElandLane(xml=element) - self.results[lane_id] = lane - -def eland(basedir, gerald=None, genome_maps=None): - e = ELAND() - - file_list = glob(os.path.join(basedir, "*_eland_result.txt")) - if len(file_list) == 0: - # lets handle compressed eland files too - file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2")) - - for pathname in file_list: - # yes the lane_id is also being computed in ElandLane._update - # I didn't want to clutter up my constructor - # but I needed to persist the sample_name/lane_id for - # runfolder summary_report - path, name = os.path.split(pathname) - logging.info("Adding eland file %s" %(name,)) - split_name = name.split('_') - lane_id = split_name[1] - - if genome_maps is not None: - genome_map = genome_maps[lane_id] - elif gerald is not None: - genome_dir = gerald.lanes[lane_id].eland_genome - genome_map = build_genome_fasta_map(genome_dir) - else: - genome_map = {} - - eland_result = ElandLane(pathname, genome_map) - e.results[lane_id] = eland_result - return e diff --git a/htsworkflow/pipelines/summary.py b/htsworkflow/pipelines/summary.py new file mode 100644 index 0000000..77160d1 --- /dev/null +++ b/htsworkflow/pipelines/summary.py @@ -0,0 +1,272 @@ +""" +Analyze the Summary.htm file produced by GERALD +""" + + +from htsworkflow.pipelines.runfolder import ElementTree +from htsworkflow.util.ethelp import indent, flatten + +class Summary(object): + """ + Extract some useful information from the Summary.htm file + """ + XML_VERSION = 2 + SUMMARY = 'Summary' + + class LaneResultSummary(object): + """ + Parse the LaneResultSummary table out of Summary.htm + Mostly for the cluster number + """ + LANE_RESULT_SUMMARY = 'LaneResultSummary' + TAGS = { + 'LaneYield': 'lane_yield', + 'Cluster': 'cluster', # Raw + 'ClusterPF': 'cluster_pass_filter', + 'AverageFirstCycleIntensity': 'average_first_cycle_intensity', + 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles', + 'PercentPassFilterClusters': 'percent_pass_filter_clusters', + 'PercentPassFilterAlign': 'percent_pass_filter_align', + 'AverageAlignmentScore': 'average_alignment_score', + 'PercentErrorRate': 'percent_error_rate' + } + + def __init__(self, html=None, xml=None): + self.lane = None + self.lane_yield = None + self.cluster = None + self.cluster_pass_filter = None + self.average_first_cycle_intensity = None + self.percent_intensity_after_20_cycles = None + self.percent_pass_filter_clusters = None + self.percent_pass_filter_align = None + self.average_alignment_score = None + self.percent_error_rate = None + + if html is not None: + self.set_elements_from_html(html) + if xml is not None: + self.set_elements(xml) + + def set_elements_from_html(self, data): + if not len(data) in (8,10): + raise RuntimeError("Summary.htm file format changed") + + # same in pre-0.3.0 Summary file and 0.3 summary file + self.lane = data[0] + + if len(data) == 8: + parsed_data = [ parse_mean_range(x) for x in data[1:] ] + # this is the < 0.3 Pipeline version + self.cluster = parsed_data[0] + self.average_first_cycle_intensity = parsed_data[1] + self.percent_intensity_after_20_cycles = parsed_data[2] + self.percent_pass_filter_clusters = parsed_data[3] + self.percent_pass_filter_align = parsed_data[4] + self.average_alignment_score = parsed_data[5] + self.percent_error_rate = parsed_data[6] + elif len(data) == 10: + parsed_data = [ parse_mean_range(x) for x in data[2:] ] + # this is the >= 0.3 summary file + self.lane_yield = data[1] + self.cluster = parsed_data[0] + self.cluster_pass_filter = parsed_data[1] + self.average_first_cycle_intensity = parsed_data[2] + self.percent_intensity_after_20_cycles = parsed_data[3] + self.percent_pass_filter_clusters = parsed_data[4] + self.percent_pass_filter_align = parsed_data[5] + self.average_alignment_score = parsed_data[6] + self.percent_error_rate = parsed_data[7] + + def get_elements(self): + lane_result = ElementTree.Element( + Summary.LaneResultSummary.LANE_RESULT_SUMMARY, + {'lane': self.lane}) + for tag, variable_name in Summary.LaneResultSummary.TAGS.items(): + value = getattr(self, variable_name) + if value is None: + continue + # it looks like a sequence + elif type(value) in (types.TupleType, types.ListType): + element = make_mean_range_element( + lane_result, + tag, + *value + ) + else: + element = ElementTree.SubElement(lane_result, tag) + element.text = value + return lane_result + + def set_elements(self, tree): + if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY: + raise ValueError('Expected %s' % ( + Summary.LaneResultSummary.LANE_RESULT_SUMMARY)) + self.lane = tree.attrib['lane'] + tags = Summary.LaneResultSummary.TAGS + for element in list(tree): + try: + variable_name = tags[element.tag] + setattr(self, variable_name, + parse_summary_element(element)) + except KeyError, e: + logging.warn('Unrecognized tag %s' % (element.tag,)) + + def __init__(self, filename=None, xml=None): + self.lane_results = {} + + if filename is not None: + self._extract_lane_results(filename) + if xml is not None: + self.set_elements(xml) + + def __getitem__(self, key): + return self.lane_results[key] + + def __len__(self): + return len(self.lane_results) + + def keys(self): + return self.lane_results.keys() + + def values(self): + return self.lane_results.values() + + def items(self): + return self.lane_results.items() + + def _flattened_row(self, row): + """ + flatten the children of a ... + """ + return [flatten(x) for x in row.getchildren() ] + + def _parse_table(self, table): + """ + assumes the first line is the header of a table, + and that the remaining rows are data + """ + rows = table.getchildren() + data = [] + for r in rows: + data.append(self._flattened_row(r)) + return data + + def _extract_named_tables(self, pathname): + """ + extract all the 'named' tables from a Summary.htm file + and return as a dictionary + + Named tables are

...

...
pairs + The contents of the h2 tag is considered to the name + of the table. + """ + tree = ElementTree.parse(pathname).getroot() + body = tree.find('body') + tables = {} + for i in range(len(body)): + if body[i].tag == 'h2' and body[i+1].tag == 'table': + # we have an interesting table + name = flatten(body[i]) + table = body[i+1] + data = self._parse_table(table) + tables[name] = data + return tables + + def _extract_lane_results(self, pathname): + """ + extract the Lane Results Summary table + """ + + tables = self._extract_named_tables(pathname) + + # parse lane result summary + lane_summary = tables['Lane Results Summary'] + # this is version 1 of the summary file + if len(lane_summary[-1]) == 8: + # strip header + headers = lane_summary[0] + # grab the lane by lane data + lane_summary = lane_summary[1:] + + # this is version 2 of the summary file + if len(lane_summary[-1]) == 10: + # lane_summary[0] is a different less specific header row + headers = lane_summary[1] + lane_summary = lane_summary[2:10] + # after the last lane, there's a set of chip wide averages + + for r in lane_summary: + lrs = Summary.LaneResultSummary(html=r) + self.lane_results[lrs.lane] = lrs + + def get_elements(self): + summary = ElementTree.Element(Summary.SUMMARY, + {'version': unicode(Summary.XML_VERSION)}) + for lane in self.lane_results.values(): + summary.append(lane.get_elements()) + return summary + + def set_elements(self, tree): + if tree.tag != Summary.SUMMARY: + return ValueError("Expected %s" % (Summary.SUMMARY,)) + xml_version = int(tree.attrib.get('version', 0)) + if xml_version > Summary.XML_VERSION: + logging.warn('Summary XML tree is a higher version than this class') + for element in list(tree): + lrs = Summary.LaneResultSummary() + lrs.set_elements(element) + self.lane_results[lrs.lane] = lrs + + def dump(self): + """ + Debugging function, report current object + """ + pass + +def tonumber(v): + """ + Convert a value to int if its an int otherwise a float. + """ + try: + v = int(v) + except ValueError, e: + v = float(v) + return v + +def parse_mean_range(value): + """ + Parse values like 123 +/- 4.5 + """ + if value.strip() == 'unknown': + return 0, 0 + + average, pm, deviation = value.split() + if pm != '+/-': + raise RuntimeError("Summary.htm file format changed") + return tonumber(average), tonumber(deviation) + +def make_mean_range_element(parent, name, mean, deviation): + """ + Make an ElementTree subelement + """ + element = ElementTree.SubElement(parent, name, + { 'mean': unicode(mean), + 'deviation': unicode(deviation)}) + return element + +def parse_mean_range_element(element): + """ + Grab mean/deviation out of element + """ + return (tonumber(element.attrib['mean']), + tonumber(element.attrib['deviation'])) + +def parse_summary_element(element): + """ + Determine if we have a simple element or a mean/deviation element + """ + if len(element.attrib) > 0: + return parse_mean_range_element(element) + else: + return element.text diff --git a/scripts/elandseq b/scripts/elandseq index 2e456c1..6a5178c 100755 --- a/scripts/elandseq +++ b/scripts/elandseq @@ -3,7 +3,7 @@ import optparse import os import sys -from htsworkflow.pipelines.gerald import extract_eland_sequence +from htsworkflow.pipelines.eland import extract_eland_sequence def make_parser(): usage = "usage: %prog [options] infile [outfile]" diff --git a/scripts/rerun_eland.py b/scripts/rerun_eland.py index 6ab9abe..af06cdd 100644 --- a/scripts/rerun_eland.py +++ b/scripts/rerun_eland.py @@ -7,6 +7,7 @@ import subprocess import sys from htsworkflow.pipelines import gerald +from htsworkflow.pipelines.eland import extract_eland_sequence from htsworkflow.pipelines import runfolder def make_query_filename(eland_obj, output_dir): @@ -40,7 +41,7 @@ def extract_sequence(inpathname, query_pathname, length, dry_run=False): try: instream = open(inpathname, 'r') outstream = open(query_pathname, 'w') - gerald.extract_eland_sequence(instream, outstream, 0, length) + extract_eland_sequence(instream, outstream, 0, length) finally: outstream.close() instream.close() -- 2.30.2