--- /dev/null
+
+from datetime import datetime
+from glob import glob
+import logging
+import os
+import time
+import re
+
+from gaworkflow.pipeline.runfolder import \
+ ElementTree, \
+ VERSION_RE, \
+ EUROPEAN_STRPTIME
+
+class Phasing(object):
+ PHASING = 'Phasing'
+ PREPHASING = 'Prephasing'
+
+ def __init__(self, fromfile=None, xml=None):
+ self.lane = None
+ self.phasing = None
+ self.prephasing = None
+
+ if fromfile is not None:
+ self._initialize_from_file(fromfile)
+ elif xml is not None:
+ self.set_elements(xml)
+
+ def _initialize_from_file(self, pathname):
+ path, name = os.path.split(pathname)
+ basename, ext = os.path.splitext(name)
+ # the last character of the param base filename should be the
+ # lane number
+ tree = ElementTree.parse(pathname).getroot()
+ self.set_elements(tree)
+ self.lane = int(basename[-1])
+
+ def get_elements(self):
+ root = ElementTree.Element(Phasing.PHASING, {'lane': str(self.lane)})
+ phasing = ElementTree.SubElement(root, Phasing.PHASING)
+ phasing.text = str(self.phasing)
+ prephasing = ElementTree.SubElement(root, Phasing.PREPHASING)
+ prephasing.text = str(self.prephasing)
+ return root
+
+ def set_elements(self, tree):
+ if tree.tag not in ('Phasing', 'Parameters'):
+ raise ValueError('exptected Phasing or Parameters')
+ lane = tree.attrib.get('lane', None)
+ if lane is not None:
+ self.lane = int(lane)
+ for element in list(tree):
+ if element.tag == Phasing.PHASING:
+ self.phasing = float(element.text)
+ elif element.tag == Phasing.PREPHASING:
+ self.prephasing = float(element.text)
+
+class Bustard(object):
+ XML_VERSION = 1
+
+ # Xml Tags
+ BUSTARD = 'Bustard'
+ SOFTWARE_VERSION = 'version'
+ DATE = 'run_time'
+ USER = 'user'
+ PARAMETERS = 'Parameters'
+
+ def __init__(self, xml=None):
+ self.version = None
+ self.date = datetime.now()
+ self.user = None
+ self.phasing = {}
+
+ if xml is not None:
+ self.set_elements(xml)
+
+ def _get_time(self):
+ return time.mktime(self.date.timetuple())
+ time = property(_get_time, doc='return run time as seconds since epoch')
+
+ def dump(self):
+ print "Bustard version:", self.version
+ print "Run date", self.date
+ print "user:", self.user
+ for lane, tree in self.phasing.items():
+ print lane
+ print tree
+
+ def get_elements(self):
+ root = ElementTree.Element('Bustard',
+ {'version': str(Bustard.XML_VERSION)})
+ version = ElementTree.SubElement(root, Bustard.SOFTWARE_VERSION)
+ version.text = self.version
+ run_date = ElementTree.SubElement(root, Bustard.DATE)
+ run_date.text = str(self.time)
+ user = ElementTree.SubElement(root, Bustard.USER)
+ user.text = self.user
+ params = ElementTree.SubElement(root, Bustard.PARAMETERS)
+ for p in self.phasing.values():
+ params.append(p.get_elements())
+ return root
+
+ def set_elements(self, tree):
+ if tree.tag != Bustard.BUSTARD:
+ raise ValueError('Expected "Bustard" SubElements')
+ xml_version = int(tree.attrib.get('version', 0))
+ if xml_version > Bustard.XML_VERSION:
+ logging.warn('Bustard XML tree is a higher version than this class')
+ for element in list(tree):
+ if element.tag == Bustard.SOFTWARE_VERSION:
+ self.version = element.text
+ elif element.tag == Bustard.DATE:
+ self.date = datetime.fromtimestamp(float(element.text))
+ elif element.tag == Bustard.USER:
+ self.user = element.text
+ elif element.tag == Bustard.PARAMETERS:
+ for param in element:
+ p = Phasing(xml=param)
+ self.phasing[p.lane] = p
+ else:
+ raise ValueError("Unrecognized tag: %s" % (element.tag,))
+
+
+
+def bustard(pathname):
+ """
+ Construct a Bustard object from pathname
+ """
+ b = Bustard()
+ path, name = os.path.split(pathname)
+ groups = name.split("_")
+ version = re.search(VERSION_RE, groups[0])
+ b.version = version.group(1)
+ b.date = datetime.strptime(groups[1], EUROPEAN_STRPTIME)
+ b.user = groups[2]
+ paramfiles = glob(os.path.join(pathname, "params?.xml"))
+ for paramfile in paramfiles:
+ phasing = Phasing(paramfile)
+ assert (phasing.lane >= 1 and phasing.lane <= 8)
+ b.phasing[phasing.lane] = phasing
+ return b
+
+def fromxml(tree):
+ b = Bustard()
+ b.set_elements(tree)
+ return b
--- /dev/null
+"""
+Extract information about the Firecrest run
+
+Firecrest - class holding the properties we found
+firecrest - Firecrest factory function initalized from a directory name
+fromxml - Firecrest factory function initalized from an xml dump from
+ the Firecrest object.
+"""
+
+from datetime import datetime
+import os
+import re
+import time
+
+from gaworkflow.pipeline.runfolder import \
+ ElementTree, \
+ VERSION_RE, \
+ EUROPEAN_STRPTIME
+
+class Firecrest(object):
+ XML_VERSION=1
+
+ # xml tag names
+ FIRECREST = 'Firecrest'
+ SOFTWARE_VERSION = 'version'
+ START = 'FirstCycle'
+ STOP = 'LastCycle'
+ DATE = 'run_time'
+ USER = 'user'
+ MATRIX = 'matrix'
+
+ def __init__(self, xml=None):
+ self.start = None
+ self.stop = None
+ self.version = None
+ self.date = datetime.now()
+ self.user = None
+ self.matrix = None
+
+ if xml is not None:
+ self.set_elements(xml)
+
+ def _get_time(self):
+ return time.mktime(self.date.timetuple())
+ time = property(_get_time, doc='return run time as seconds since epoch')
+
+ def dump(self):
+ print "Starting cycle:", self.start
+ print "Ending cycle:", self.stop
+ print "Firecrest version:", self.version
+ print "Run date:", self.date
+ print "user:", self.user
+
+ def get_elements(self):
+ attribs = {'version': str(Firecrest.XML_VERSION) }
+ root = ElementTree.Element(Firecrest.FIRECREST, attrib=attribs)
+ version = ElementTree.SubElement(root, Firecrest.SOFTWARE_VERSION)
+ version.text = self.version
+ start_cycle = ElementTree.SubElement(root, Firecrest.START)
+ start_cycle.text = str(self.start)
+ stop_cycle = ElementTree.SubElement(root, Firecrest.STOP)
+ stop_cycle.text = str(self.stop)
+ run_date = ElementTree.SubElement(root, Firecrest.DATE)
+ run_date.text = str(self.time)
+ user = ElementTree.SubElement(root, Firecrest.USER)
+ user.text = self.user
+ matrix = ElementTree.SubElement(root, Firecrest.MATRIX)
+ matrix.text = self.matrix
+ return root
+
+ def set_elements(self, tree):
+ if tree.tag != Firecrest.FIRECREST:
+ raise ValueError('Expected "Firecrest" SubElements')
+ xml_version = int(tree.attrib.get('version', 0))
+ if xml_version > Firecrest.XML_VERSION:
+ logging.warn('Firecrest XML tree is a higher version than this class')
+ for element in list(tree):
+ if element.tag == Firecrest.SOFTWARE_VERSION:
+ self.version = element.text
+ elif element.tag == Firecrest.START:
+ self.start = int(element.text)
+ elif element.tag == Firecrest.STOP:
+ self.stop = int(element.text)
+ elif element.tag == Firecrest.DATE:
+ self.date = datetime.fromtimestamp(float(element.text))
+ elif element.tag == Firecrest.USER:
+ self.user = element.text
+ elif element.tag == Firecrest.MATRIX:
+ self.matrix = element.text
+ else:
+ raise ValueError("Unrecognized tag: %s" % (element.tag,))
+
+def firecrest(pathname):
+ """
+ Examine the directory at pathname and initalize a Firecrest object
+ """
+ f = Firecrest()
+
+ # parse firecrest directory name
+ path, name = os.path.split(pathname)
+ groups = name.split('_')
+ # grab the start/stop cycle information
+ cycle = re.match("C([0-9]+)-([0-9]+)", groups[0])
+ f.start = int(cycle.group(1))
+ f.stop = int(cycle.group(2))
+ # firecrest version
+ version = re.search(VERSION_RE, groups[1])
+ f.version = (version.group(1))
+ # datetime
+ f.date = datetime.strptime(groups[2], EUROPEAN_STRPTIME)
+ # username
+ f.user = groups[3]
+
+ # should I parse this deeper than just stashing the
+ # contents of the matrix file?
+ matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
+ f.matrix = open(matrix_pathname, 'r').read()
+ return f
+
+def fromxml(tree):
+ """
+ Initialize a Firecrest object from an element tree node
+ """
+ f = Firecrest()
+ f.set_elements(tree)
+ return f
--- /dev/null
+"""
+Provide access to information stored in the GERALD directory.
+"""
+from datetime import datetime, date
+from glob import glob
+import logging
+import os
+import stat
+import time
+
+from gaworkflow.pipeline.runfolder import \
+ ElementTree, \
+ EUROPEAN_STRPTIME, \
+ LANES_PER_FLOWCELL, \
+ VERSION_RE
+from gaworkflow.util.ethelp import indent, flatten
+
+class Gerald(object):
+ """
+ Capture meaning out of the GERALD directory
+ """
+ XML_VERSION = 1
+ GERALD='Gerald'
+ RUN_PARAMETERS='RunParameters'
+ SUMMARY='Summary'
+
+ class LaneParameters(object):
+ """
+ Make it easy to access elements of LaneSpecificRunParameters from python
+ """
+ def __init__(self, gerald, key):
+ self._gerald = gerald
+ self._key = key
+
+ def __get_attribute(self, xml_tag):
+ subtree = self._gerald.tree.find('LaneSpecificRunParameters')
+ container = subtree.find(xml_tag)
+ if container is None or \
+ len(container.getchildren()) != LANES_PER_FLOWCELL:
+ raise RuntimeError('GERALD config.xml file changed')
+ lanes = [x.tag.split('_')[1] for x in container.getchildren()]
+ index = lanes.index(self._key)
+ #element = container.find(self._key)
+ element = container[index]
+ return element.text
+ def _get_analysis(self):
+ return self.__get_attribute('ANALYSIS')
+ analysis = property(_get_analysis)
+
+ def _get_eland_genome(self):
+ return self.__get_attribute('ELAND_GENOME')
+ eland_genome = property(_get_eland_genome)
+
+ def _get_read_length(self):
+ return self.__get_attribute('READ_LENGTH')
+ read_length = property(_get_read_length)
+
+ def _get_use_bases(self):
+ return self.__get_attribute('USE_BASES')
+ use_bases = property(_get_use_bases)
+
+ class LaneSpecificRunParameters(object):
+ """
+ Provide access to LaneSpecificRunParameters
+ """
+ def __init__(self, gerald):
+ self._gerald = gerald
+ self._keys = None
+ def __getitem__(self, key):
+ return Gerald.LaneParameters(self._gerald, key)
+ def keys(self):
+ if self._keys is None:
+ tree = self._gerald.tree
+ analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
+ # according to the pipeline specs I think their fields
+ # are sampleName_laneID, with sampleName defaulting to s
+ # since laneIDs are constant lets just try using
+ # those consistently.
+ self._keys = [ x.tag.split('_')[1] for x in analysis]
+ return self._keys
+ def values(self):
+ return [ self[x] for x in self.keys() ]
+ def items(self):
+ return zip(self.keys(), self.values())
+ def __len__(self):
+ return len(self.keys())
+
+ def __init__(self, xml=None):
+ self.pathname = None
+ self.tree = None
+
+ # parse lane parameters out of the config.xml file
+ self.lanes = Gerald.LaneSpecificRunParameters(self)
+
+ self.summary = None
+ self.eland_results = None
+
+ if xml is not None:
+ self.set_elements(xml)
+
+ def _get_date(self):
+ if self.tree is None:
+ return datetime.today()
+ timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
+ return datetime.strptime(timestamp, '%c')
+ date = property(_get_date)
+
+ def _get_time(self):
+ return time.mktime(self.date.timetuple())
+ time = property(_get_time, doc='return run time as seconds since epoch')
+
+ def _get_version(self):
+ if self.tree is None:
+ return None
+ return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
+ version = property(_get_version)
+
+ def dump(self):
+ """
+ Debugging function, report current object
+ """
+ print 'Gerald version:', self.version
+ print 'Gerald run date:', self.date
+ print 'Gerald config.xml:', self.tree
+ self.summary.dump()
+
+ def get_elements(self):
+ if self.tree is None or self.summary is None:
+ return None
+
+ gerald = ElementTree.Element(Gerald.GERALD,
+ {'version': unicode(Gerald.XML_VERSION)})
+ gerald.append(self.tree)
+ gerald.append(self.summary.get_elements())
+ if self.eland_results:
+ gerald.append(self.eland_results.get_elements())
+ return gerald
+
+ def set_elements(self, tree):
+ if tree.tag != Gerald.GERALD:
+ raise ValueError('exptected GERALD')
+ xml_version = int(tree.attrib.get('version', 0))
+ if xml_version > Gerald.XML_VERSION:
+ logging.warn('XML tree is a higher version than this class')
+ for element in list(tree):
+ tag = element.tag.lower()
+ if tag == Gerald.RUN_PARAMETERS.lower():
+ self.tree = element
+ elif tag == Gerald.SUMMARY.lower():
+ self.summary = Summary(xml=element)
+ elif tag == ELAND.ELAND.lower():
+ self.eland_results = ELAND(xml=element)
+ else:
+ logging.warn("Unrecognized tag %s" % (element.tag,))
+
+
+def gerald(pathname):
+ g = Gerald()
+ g.pathname = pathname
+ path, name = os.path.split(pathname)
+ config_pathname = os.path.join(pathname, 'config.xml')
+ g.tree = ElementTree.parse(config_pathname).getroot()
+
+ # parse Summary.htm file
+ summary_pathname = os.path.join(pathname, 'Summary.htm')
+ g.summary = Summary(summary_pathname)
+ # parse eland files
+ g.eland_results = eland(g.pathname, g)
+ return g
+
+def tonumber(v):
+ """
+ Convert a value to int if its an int otherwise a float.
+ """
+ try:
+ v = int(v)
+ except ValueError, e:
+ v = float(v)
+ return v
+
+def parse_mean_range(value):
+ """
+ Parse values like 123 +/- 4.5
+ """
+ if value.strip() == 'unknown':
+ return 0, 0
+
+ average, pm, deviation = value.split()
+ if pm != '+/-':
+ raise RuntimeError("Summary.htm file format changed")
+ return tonumber(average), tonumber(deviation)
+
+def make_mean_range_element(parent, name, mean, deviation):
+ """
+ Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
+ """
+ element = ElementTree.SubElement(parent, name,
+ { 'mean': unicode(mean),
+ 'deviation': unicode(deviation)})
+ return element
+
+def parse_mean_range_element(element):
+ """
+ Grab mean/deviation out of element
+ """
+ return (tonumber(element.attrib['mean']),
+ tonumber(element.attrib['deviation']))
+
+class Summary(object):
+ """
+ Extract some useful information from the Summary.htm file
+ """
+ XML_VERSION = 1
+ SUMMARY = 'Summary'
+
+ class LaneResultSummary(object):
+ """
+ Parse the LaneResultSummary table out of Summary.htm
+ Mostly for the cluster number
+ """
+ LANE_RESULT_SUMMARY = 'LaneResultSummary'
+ TAGS = {
+ 'Cluster': 'cluster',
+ 'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
+ 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
+ 'PercentPassFilterClusters': 'percent_pass_filter_clusters',
+ 'PercentPassFilterAlign': 'percent_pass_filter_align',
+ 'AverageAlignmentScore': 'average_alignment_score',
+ 'PercentErrorRate': 'percent_error_rate'
+ }
+
+ def __init__(self, html=None, xml=None):
+ self.lane = None
+ self.cluster = None
+ self.average_first_cycle_intensity = None
+ self.percent_intensity_after_20_cycles = None
+ self.percent_pass_filter_clusters = None
+ self.percent_pass_filter_align = None
+ self.average_alignment_score = None
+ self.percent_error_rate = None
+
+ if html is not None:
+ self.set_elements_from_html(html)
+ if xml is not None:
+ self.set_elements(xml)
+
+ def set_elements_from_html(self, row_element):
+ data = [ flatten(x) for x in row_element ]
+ if len(data) != 8:
+ raise RuntimeError("Summary.htm file format changed")
+
+ self.lane = data[0]
+ self.cluster = parse_mean_range(data[1])
+ self.average_first_cycle_intensity = parse_mean_range(data[2])
+ self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
+ self.percent_pass_filter_clusters = parse_mean_range(data[4])
+ self.percent_pass_filter_align = parse_mean_range(data[5])
+ self.average_alignment_score = parse_mean_range(data[6])
+ self.percent_error_rate = parse_mean_range(data[7])
+
+ def get_elements(self):
+ lane_result = ElementTree.Element(
+ Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
+ {'lane': self.lane})
+ for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
+ element = make_mean_range_element(
+ lane_result,
+ tag,
+ *getattr(self, variable_name)
+ )
+ return lane_result
+
+ def set_elements(self, tree):
+ if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
+ raise ValueError('Expected %s' % (
+ Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
+ self.lane = tree.attrib['lane']
+ tags = Summary.LaneResultSummary.TAGS
+ for element in list(tree):
+ try:
+ variable_name = tags[element.tag]
+ setattr(self, variable_name,
+ parse_mean_range_element(element))
+ except KeyError, e:
+ logging.warn('Unrecognized tag %s' % (element.tag,))
+
+ def __init__(self, filename=None, xml=None):
+ self.lane_results = {}
+
+ if filename is not None:
+ self._extract_lane_results(filename)
+ if xml is not None:
+ self.set_elements(xml)
+
+ def __getitem__(self, key):
+ return self.lane_results[key]
+
+ def __len__(self):
+ return len(self.lane_results)
+
+ def keys(self):
+ return self.lane_results.keys()
+
+ def values(self):
+ return self.lane_results.values()
+
+ def items(self):
+ return self.lane_results.items()
+
+ def _extract_lane_results(self, pathname):
+ """
+ extract the Lane Results Summary table
+ """
+ tree = ElementTree.parse(pathname).getroot()
+ if flatten(tree.findall('*//h2')[3]) != 'Lane Results Summary':
+ raise RuntimeError("Summary.htm file format changed")
+
+ tables = tree.findall('*//table')
+
+ # parse lane result summary
+ lane_summary = tables[2]
+ rows = lane_summary.getchildren()
+ headers = rows[0].getchildren()
+ if flatten(headers[2]) != 'Av 1st Cycle Int ':
+ raise RuntimeError("Summary.htm file format changed")
+
+ for r in rows[1:]:
+ lrs = Summary.LaneResultSummary(html=r)
+ self.lane_results[lrs.lane] = lrs
+
+ def get_elements(self):
+ summary = ElementTree.Element(Summary.SUMMARY,
+ {'version': unicode(Summary.XML_VERSION)})
+ for lane in self.lane_results.values():
+ summary.append(lane.get_elements())
+ return summary
+
+ def set_elements(self, tree):
+ if tree.tag != Summary.SUMMARY:
+ return ValueError("Expected %s" % (Summary.SUMMARY,))
+ xml_version = int(tree.attrib.get('version', 0))
+ if xml_version > Summary.XML_VERSION:
+ logging.warn('Summary XML tree is a higher version than this class')
+ for element in list(tree):
+ lrs = Summary.LaneResultSummary()
+ lrs.set_elements(element)
+ self.lane_results[lrs.lane] = lrs
+
+ def dump(self):
+ """
+ Debugging function, report current object
+ """
+ pass
+
+
+def build_genome_fasta_map(genome_dir):
+ # build fasta to fasta file map
+ genome = genome_dir.split(os.path.sep)[-1]
+ fasta_map = {}
+ for vld_file in glob(os.path.join(genome_dir, '*.vld')):
+ is_link = False
+ if os.path.islink(vld_file):
+ is_link = True
+ vld_file = os.path.realpath(vld_file)
+ path, vld_name = os.path.split(vld_file)
+ name, ext = os.path.splitext(vld_name)
+ if is_link:
+ fasta_map[name] = name
+ else:
+ fasta_map[name] = os.path.join(genome, name)
+ return fasta_map
+
+class ElandLane(object):
+ """
+ Process an eland result file
+ """
+ XML_VERSION = 1
+ LANE = 'ElandLane'
+ GENOME_MAP = 'GenomeMap'
+ GENOME_ITEM = 'GenomeItem'
+ MAPPED_READS = 'MappedReads'
+ MAPPED_ITEM = 'MappedItem'
+ MATCH_CODES = 'MatchCodes'
+ MATCH_ITEM = 'Code'
+ READS = 'Reads'
+
+ def __init__(self, pathname=None, genome_map=None, xml=None):
+ self.pathname = pathname
+ self._reads = None
+ self._mapped_reads = {}
+ self._match_codes = {}
+ if genome_map is None:
+ genome_map = {}
+ self.genome_map = genome_map
+
+ if pathname is not None:
+ self._update()
+ if xml is not None:
+ self.set_elements(xml)
+
+ def _update(self):
+ """
+ Actually read the file and actually count the reads
+ """
+ # can't do anything if we don't have a file to process
+ if self.pathname is None:
+ return
+
+ if os.stat(self.pathname)[stat.ST_SIZE] == 0:
+ raise RuntimeError("Eland isn't done, try again later.")
+
+ reads = 0
+ mapped_reads = {}
+
+ match_codes = {'NM':0, 'QC':0, 'RM':0,
+ 'U0':0, 'U1':0, 'U2':0,
+ 'R0':0, 'R1':0, 'R2':0,
+ }
+ for line in open(self.pathname):
+ reads += 1
+ fields = line.split()
+ # code = fields[2]
+ # match_codes[code] = match_codes.setdefault(code, 0) + 1
+ # the QC/NM etc codes are in the 3rd field and always present
+ match_codes[fields[2]] += 1
+ # ignore lines that don't have a fasta filename
+ if len(fields) < 7:
+ continue
+ fasta = self.genome_map.get(fields[6], fields[6])
+ mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
+ self._match_codes = match_codes
+ self._mapped_reads = mapped_reads
+ self._reads = reads
+
+ def _get_reads(self):
+ if self._reads is None:
+ self._update()
+ return self._reads
+ reads = property(_get_reads)
+
+ def _get_mapped_reads(self):
+ if self._mapped_reads is None:
+ self._update()
+ return self._mapped_reads
+ mapped_reads = property(_get_mapped_reads)
+
+ def _get_match_codes(self):
+ if self._match_codes is None:
+ self._update()
+ return self._match_codes
+ match_codes = property(_get_match_codes)
+
+ def get_elements(self):
+ lane = ElementTree.Element(ElandLane.LANE,
+ {'version':
+ unicode(ElandLane.XML_VERSION)})
+ genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
+ for k, v in self.genome_map.items():
+ item = ElementTree.SubElement(
+ genome_map, ElandLane.GENOME_ITEM,
+ {'name':k, 'value':unicode(v)})
+ mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
+ for k, v in self.mapped_reads.items():
+ item = ElementTree.SubElement(
+ mapped_reads, ElandLane.MAPPED_ITEM,
+ {'name':k, 'value':unicode(v)})
+ match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
+ for k, v in self.match_codes.items():
+ item = ElementTree.SubElement(
+ match_codes, ElandLane.MATCH_ITEM,
+ {'name':k, 'value':unicode(v)})
+ reads = ElementTree.SubElement(lane, ElandLane.READS)
+ reads.text = unicode(self.reads)
+
+ return lane
+
+ def set_elements(self, tree):
+ if tree.tag != ElandLane.LANE:
+ raise ValueError('Exptecting %s' % (ElandLane.LANE,))
+ for element in tree:
+ tag = element.tag.lower()
+ if tag == ElandLane.GENOME_MAP.lower():
+ for child in element:
+ name = child.attrib['name']
+ value = child.attrib['value']
+ self.genome_map[name] = value
+ elif tag == ElandLane.MAPPED_READS.lower():
+ for child in element:
+ name = child.attrib['name']
+ value = child.attrib['value']
+ self._mapped_reads[name] = int(value)
+ elif tag == ElandLane.MATCH_CODES.lower():
+ for child in element:
+ name = child.attrib['name']
+ value = int(child.attrib['value'])
+ self._match_codes[name] = value
+ elif tag == ElandLane.READS.lower():
+ self._reads = int(element.text)
+ else:
+ logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
+
+
+class ELAND(object):
+ """
+ Summarize information from eland files
+ """
+ XML_VERSION = 1
+
+ ELAND = 'ElandCollection'
+ LANE = 'Lane'
+ LANE_ID = 'id'
+
+ def __init__(self, xml=None):
+ # we need information from the gerald config.xml
+ self.results = {}
+
+ if xml is not None:
+ self.set_elements(xml)
+
+ def __len__(self):
+ return len(self.results)
+
+ def keys(self):
+ return self.results.keys()
+
+ def values(self):
+ return self.results.values()
+
+ def items(self):
+ return self.results.items()
+
+ def __getitem__(self, key):
+ return self.results[key]
+
+ def get_elements(self):
+ root = ElementTree.Element(ELAND.ELAND,
+ {'version': unicode(ELAND.XML_VERSION)})
+ for lane_id, lane in self.results.items():
+ eland_lane = lane.get_elements()
+ eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
+ root.append(eland_lane)
+ return root
+
+ def set_elements(self, tree):
+ if tree.tag.lower() != ELAND.ELAND.lower():
+ raise ValueError('Expecting %s', ELAND.ELAND)
+ for element in list(tree):
+ lane_id = element.attrib[ELAND.LANE_ID]
+ lane = ElandLane(xml=element)
+ self.results[lane_id] = lane
+
+def eland(basedir, gerald=None, genome_maps=None):
+ e = ELAND()
+ for pathname in glob(os.path.join(basedir, "*_eland_result.txt")):
+ # extract the sample name
+ path, name = os.path.split(pathname)
+ split_name = name.split('_')
+ sample_name = split_name[0]
+ lane_id = split_name[1]
+
+ if genome_maps is not None:
+ genome_map = genome_maps[lane_id]
+ elif gerald is not None:
+ genome_dir = gerald.lanes[lane_id].eland_genome
+ genome_map = build_genome_fasta_map(genome_dir)
+ else:
+ genome_map = {}
+
+ eland_result = ElandLane(pathname, genome_map)
+ e.results[lane_id] = eland_result
+ return e
--- /dev/null
+"""
+Core information needed to inspect a runfolder.
+"""
+from glob import glob
+import logging
+import os
+import re
+import stat
+import sys
+import time
+
+try:
+ from xml.etree import ElementTree
+except ImportError, e:
+ from elementtree import ElementTree
+
+EUROPEAN_STRPTIME = "%d-%m-%Y"
+EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
+VERSION_RE = "([0-9\.]+)"
+USER_RE = "([a-zA-Z0-9]+)"
+LANES_PER_FLOWCELL = 8
+
+from gaworkflow.util.alphanum import alphanum
+from gaworkflow.util.ethelp import indent, flatten
+
+
+class PipelineRun(object):
+ """
+ Capture "interesting" information about a pipeline run
+ """
+ XML_VERSION = 1
+ PIPELINE_RUN = 'PipelineRun'
+ FLOWCELL_ID = 'FlowcellID'
+
+ def __init__(self, pathname=None, firecrest=None, bustard=None, gerald=None, xml=None):
+ self.pathname = pathname
+ self._name = None
+ self._flowcell_id = None
+ self.firecrest = firecrest
+ self.bustard = bustard
+ self.gerald = gerald
+
+ if xml is not None:
+ self.set_elements(xml)
+
+ def _get_flowcell_id(self):
+ # extract flowcell ID
+ if self._flowcell_id is None:
+ config_dir = os.path.join(self.pathname, 'Config')
+ flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
+ if os.path.exists(flowcell_id_path):
+ flowcell_id_tree = ElementTree.parse(flowcell_id_path)
+ self._flowcell_id = flowcell_id_tree.findtext('Text')
+ else:
+ path_fields = self.pathname.split('_')
+ if len(path_fields) > 0:
+ # guessing last element of filename
+ flowcell_id = path_fields[-1]
+ else:
+ flowcell_id = 'unknown'
+
+ logging.warning(
+ "Flowcell idwas not found, guessing %s" % (
+ flowcell_id))
+ self._flowcell_id = flowcell_id
+ return self._flowcell_id
+ flowcell_id = property(_get_flowcell_id)
+
+ def get_elements(self):
+ """
+ make one master xml file from all of our sub-components.
+ """
+ root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
+ flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
+ flowcell.text = self.flowcell_id
+ root.append(self.firecrest.get_elements())
+ root.append(self.bustard.get_elements())
+ root.append(self.gerald.get_elements())
+ return root
+
+ def set_elements(self, tree):
+ # this file gets imported by all the others,
+ # so we need to hide the imports to avoid a cyclic imports
+ from gaworkflow.pipeline import firecrest
+ from gaworkflow.pipeline import bustard
+ from gaworkflow.pipeline import gerald
+
+ tag = tree.tag.lower()
+ if tag != PipelineRun.PIPELINE_RUN.lower():
+ raise ValueError('Pipeline Run Expecting %s got %s' % (
+ PipelineRun.PIPELINE_RUN, tag))
+ for element in tree:
+ tag = element.tag.lower()
+ if tag == PipelineRun.FLOWCELL_ID.lower():
+ self._flowcell_id = element.text
+ #ok the xword.Xword.XWORD pattern for module.class.constant is lame
+ elif tag == firecrest.Firecrest.FIRECREST.lower():
+ self.firecrest = firecrest.Firecrest(xml=element)
+ elif tag == bustard.Bustard.BUSTARD.lower():
+ self.bustard = bustard.Bustard(xml=element)
+ elif tag == gerald.Gerald.GERALD.lower():
+ self.gerald = gerald.Gerald(xml=element)
+ else:
+ logging.warn('PipelineRun unrecognized tag %s' % (tag,))
+
+ def _get_run_name(self):
+ """
+ Given a run tuple, find the latest date and use that as our name
+ """
+ if self._name is None:
+ tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
+ timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
+ self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
+ return self._name
+ name = property(_get_run_name)
+
+ def save(self):
+ logging.info("Saving run report "+ self.name)
+ xml = self.get_elements()
+ indent(xml)
+ ElementTree.ElementTree(xml).write(self.name)
+
+ def load(self, filename):
+ logging.info("Loading run report from " + filename)
+ tree = ElementTree.parse(filename).getroot()
+ self.set_elements(tree)
+
+def get_runs(runfolder):
+ """
+ Search through a run folder for all the various sub component runs
+ and then return a PipelineRun for each different combination.
+
+ For example if there are two different GERALD runs, this will
+ generate two different PipelineRun objects, that differ
+ in there gerald component.
+ """
+ from gaworkflow.pipeline import firecrest
+ from gaworkflow.pipeline import bustard
+ from gaworkflow.pipeline import gerald
+
+ datadir = os.path.join(runfolder, 'Data')
+
+ logging.info('Searching for runs in ' + datadir)
+ runs = []
+ for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
+ f = firecrest.firecrest(firecrest_pathname)
+ bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
+ for bustard_pathname in glob(bustard_glob):
+ b = bustard.bustard(bustard_pathname)
+ gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
+ for gerald_pathname in glob(gerald_glob):
+ try:
+ g = gerald.gerald(gerald_pathname)
+ runs.append(PipelineRun(runfolder, f, b, g))
+ except IOError, e:
+ print "Ignoring", str(e)
+ return runs
+
+
+def extract_run_parameters(runs):
+ """
+ Search through runfolder_path for various runs and grab their parameters
+ """
+ for run in runs:
+ run.save()
+
+def summarize_mapped_reads(mapped_reads):
+ """
+ Summarize per chromosome reads into a genome count
+ But handle spike-in/contamination symlinks seperately.
+ """
+ summarized_reads = {}
+ genome_reads = 0
+ genome = 'unknown'
+ for k, v in mapped_reads.items():
+ path, k = os.path.split(k)
+ if len(path) > 0:
+ genome = path
+ genome_reads += v
+ else:
+ summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
+ summarized_reads[genome] = genome_reads
+ return summarized_reads
+
+def summary_report(runs):
+ """
+ Summarize cluster numbers and mapped read counts for a runfolder
+ """
+ report = []
+ for run in runs:
+ # print a run name?
+ report.append('Summary for %s' % (run.name,))
+ # sort the report
+ eland_keys = run.gerald.eland_results.results.keys()
+ eland_keys.sort(alphanum)
+
+ lane_results = run.gerald.summary.lane_results
+ for lane_id in eland_keys:
+ result = run.gerald.eland_results.results[lane_id]
+ report.append("Sample name %s" % (result.sample_name))
+ report.append("Lane id %s" % (result.lane_id,))
+ cluster = lane_results[result.lane_id].cluster
+ report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
+ report.append("Total Reads: %d" % (result.reads))
+ mc = result._match_codes
+ report.append("No Match: %d" % (mc['NM']))
+ report.append("QC Failed: %d" % (mc['QC']))
+ report.append('Unique (0,1,2 mismatches) %d %d %d' % \
+ (mc['U0'], mc['U1'], mc['U2']))
+ report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
+ (mc['R0'], mc['R1'], mc['R2']))
+ report.append("Mapped Reads")
+ mapped_reads = summarize_mapped_reads(result.mapped_reads)
+ for name, counts in mapped_reads.items():
+ report.append(" %s: %d" % (name, counts))
+ report.append('---')
+ report.append('')
+ return os.linesep.join(report)
--- /dev/null
+#!/usr/bin/env python
+
+from datetime import datetime
+import os
+import tempfile
+import shutil
+import unittest
+
+from gaworkflow.pipeline import firecrest
+from gaworkflow.pipeline import bustard
+from gaworkflow.pipeline import gerald
+from gaworkflow.pipeline import runfolder
+from gaworkflow.pipeline.runfolder import ElementTree
+
+
+def make_flowcell_id(runfolder_dir, flowcell_id=None):
+ if flowcell_id is None:
+ flowcell_id = '207BTAAXY'
+
+ config = """<?xml version="1.0"?>
+<FlowcellId>
+ <Text>%s</Text>
+</FlowcellId>""" % (flowcell_id,)
+ config_dir = os.path.join(runfolder_dir, 'Config')
+
+ if not os.path.exists(config_dir):
+ os.mkdir(config_dir)
+ pathname = os.path.join(config_dir, 'FlowcellId.xml')
+ f = open(pathname,'w')
+ f.write(config)
+ f.close()
+
+def make_matrix(matrix_dir):
+ contents = """# Auto-generated frequency response matrix
+> A
+> C
+> G
+> T
+0.77 0.15 -0.04 -0.04
+0.76 1.02 -0.05 -0.06
+-0.10 -0.10 1.17 -0.03
+-0.13 -0.12 0.80 1.27
+"""
+ s_matrix = os.path.join(matrix_dir, 's_matrix.txt')
+ f = open(s_matrix, 'w')
+ f.write(contents)
+ f.close()
+
+def make_phasing_params(bustard_dir):
+ for lane in range(1,9):
+ pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane))
+ f = open(pathname, 'w')
+ f.write("""<Parameters>
+ <Phasing>0.009900</Phasing>
+ <Prephasing>0.003500</Prephasing>
+</Parameters>
+""")
+ f.close()
+
+def make_gerald_config(gerald_dir):
+ config_xml = """<RunParameters>
+<ChipWideRunParameters>
+ <ANALYSIS>default</ANALYSIS>
+ <BAD_LANES></BAD_LANES>
+ <BAD_TILES></BAD_TILES>
+ <CONTAM_DIR></CONTAM_DIR>
+ <CONTAM_FILE></CONTAM_FILE>
+ <ELAND_GENOME>Need_to_specify_ELAND_genome_directory</ELAND_GENOME>
+ <ELAND_MULTIPLE_INSTANCES>8</ELAND_MULTIPLE_INSTANCES>
+ <ELAND_REPEAT></ELAND_REPEAT>
+ <EMAIL_DOMAIN>domain.com</EMAIL_DOMAIN>
+ <EMAIL_LIST>diane</EMAIL_LIST>
+ <EMAIL_SERVER>localhost:25</EMAIL_SERVER>
+ <EXPT_DIR>/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane</EXPT_DIR>
+ <EXPT_DIR_ROOT>/home/diane/gec</EXPT_DIR_ROOT>
+ <FORCE>1</FORCE>
+ <GENOME_DIR>/home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes</GENOME_DIR>
+ <GENOME_FILE>Need_to_specify_genome_file_name</GENOME_FILE>
+ <HAMSTER_FLAG>genome</HAMSTER_FLAG>
+ <OUT_DIR>/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane</OUT_DIR>
+ <POST_RUN_COMMAND></POST_RUN_COMMAND>
+ <PRB_FILE_SUFFIX>_prb.txt</PRB_FILE_SUFFIX>
+ <PURE_BASES>12</PURE_BASES>
+ <QF_PARAMS>'((CHASTITY>=0.6))'</QF_PARAMS>
+ <QHG_FILE_SUFFIX>_qhg.txt</QHG_FILE_SUFFIX>
+ <QUALITY_FORMAT>--symbolic</QUALITY_FORMAT>
+ <READ_LENGTH>32</READ_LENGTH>
+ <SEQUENCE_FORMAT>--scarf</SEQUENCE_FORMAT>
+ <SEQ_FILE_SUFFIX>_seq.txt</SEQ_FILE_SUFFIX>
+ <SIG_FILE_SUFFIX_DEPHASED>_sig2.txt</SIG_FILE_SUFFIX_DEPHASED>
+ <SIG_FILE_SUFFIX_NOT_DEPHASED>_sig.txt</SIG_FILE_SUFFIX_NOT_DEPHASED>
+ <SOFTWARE_VERSION>@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp</SOFTWARE_VERSION>
+ <TILE_REGEX>s_[1-8]_[0-9][0-9][0-9][0-9]</TILE_REGEX>
+ <TILE_ROOT>s</TILE_ROOT>
+ <TIME_STAMP>Sat Apr 19 19:08:30 2008</TIME_STAMP>
+ <TOOLS_DIR>/home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald</TOOLS_DIR>
+ <USE_BASES>all</USE_BASES>
+ <WEB_DIR_ROOT>http://host.domain.com/yourshare/</WEB_DIR_ROOT>
+</ChipWideRunParameters>
+<LaneSpecificRunParameters>
+ <ANALYSIS>
+ <s_1>eland</s_1>
+ <s_2>eland</s_2>
+ <s_3>eland</s_3>
+ <s_4>eland</s_4>
+ <s_5>eland</s_5>
+ <s_6>eland</s_6>
+ <s_7>eland</s_7>
+ <s_8>eland</s_8>
+ </ANALYSIS>
+ <ELAND_GENOME>
+ <s_1>/g/dm3</s_1>
+ <s_2>/g/equcab1</s_2>
+ <s_3>/g/equcab1</s_3>
+ <s_4>/g/canfam2</s_4>
+ <s_5>/g/hg18</s_5>
+ <s_6>/g/hg18</s_6>
+ <s_7>/g/hg18</s_7>
+ <s_8>/g/hg18</s_8>
+ </ELAND_GENOME>
+ <READ_LENGTH>
+ <s_1>32</s_1>
+ <s_2>32</s_2>
+ <s_3>32</s_3>
+ <s_4>32</s_4>
+ <s_5>32</s_5>
+ <s_6>32</s_6>
+ <s_7>32</s_7>
+ <s_8>32</s_8>
+ </READ_LENGTH>
+ <USE_BASES>
+ <s_1>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_1>
+ <s_2>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_2>
+ <s_3>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_3>
+ <s_4>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_4>
+ <s_5>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_5>
+ <s_6>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_6>
+ <s_7>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_7>
+ <s_8>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_8>
+ </USE_BASES>
+</LaneSpecificRunParameters>
+</RunParameters>
+"""
+ pathname = os.path.join(gerald_dir, 'config.xml')
+ f = open(pathname,'w')
+ f.write(config_xml)
+ f.close()
+
+
+def make_summary_htm(gerald_dir):
+ summary_htm = """<!--RUN_TIME Mon Apr 21 11:52:25 2008 -->
+<!--SOFTWARE_VERSION @(#) $Id: jerboa.pl,v 1.31 2007/03/05 17:52:15 km Exp $-->
+<html>
+<body>
+
+<a name="Top"><h2><title>080416_HWI-EAS229_0024_207BTAAXX Summary</title></h2></a>
+<h1>Summary Information For Experiment 080416_HWI-EAS229_0024_207BTAAXX on Machine HWI-EAS229</h1>
+<h2><br></br>Chip Summary<br></br></h2>
+<table border="1" cellpadding="5">
+<tr><td>Machine</td><td>HWI-EAS229</td></tr>
+<tr><td>Run Folder</td><td>080416_HWI-EAS229_0024_207BTAAXX</td></tr>
+<tr><td>Chip ID</td><td>unknown</td></tr>
+</table>
+<h2><br></br>Lane Parameter Summary<br></br></h2>
+<table border="1" cellpadding="5">
+<tr>
+<td>Lane</td>
+<td>Sample ID</td>
+<td>Sample Target</td>
+<td>Sample Type</td>
+<td>Length</td>
+<td>Filter</td>
+<td>Tiles</td>
+</tr>
+<tr>
+<td>1</td>
+<td>unknown</td>
+<td>dm3</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane1">Lane 1</a></td>
+</tr>
+<tr>
+<td>2</td>
+<td>unknown</td>
+<td>equcab1</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane2">Lane 2</a></td>
+</tr>
+<tr>
+<td>3</td>
+<td>unknown</td>
+<td>equcab1</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane3">Lane 3</a></td>
+</tr>
+<tr>
+<td>4</td>
+<td>unknown</td>
+<td>canfam2</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane4">Lane 4</a></td>
+</tr>
+<tr>
+<td>5</td>
+<td>unknown</td>
+<td>hg18</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane5">Lane 5</a></td>
+</tr>
+<tr>
+<td>6</td>
+<td>unknown</td>
+<td>hg18</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane6">Lane 6</a></td>
+</tr>
+<tr>
+<td>7</td>
+<td>unknown</td>
+<td>hg18</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane7">Lane 7</a></td>
+</tr>
+<tr>
+<td>8</td>
+<td>unknown</td>
+<td>hg18</td>
+<td>ELAND</td>
+<td>32</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td><a href="#Lane8">Lane 8</a></td>
+</tr>
+</table>
+<h2><br></br>Lane Results Summary<br></br></h2>
+<table border="1" cellpadding="5">
+<tr>
+
+<td>Lane </td>
+<td>Clusters </td>
+<td>Av 1st Cycle Int </td>
+<td>% intensity after 20 cycles </td>
+<td>% PF Clusters </td>
+<td>% Align (PF) </td>
+<td>Av Alignment Score (PF) </td>
+<td> % Error Rate (PF) </td>
+</tr>
+<tr>
+<td>1</td>
+<td>17421 +/- 2139</td>
+<td>7230 +/- 801</td>
+<td>23.73 +/- 10.79</td>
+<td>13.00 +/- 22.91</td>
+<td>32.03 +/- 18.45</td>
+<td>6703.57 +/- 3753.85</td>
+<td>4.55 +/- 4.81</td>
+</tr>
+<tr>
+<td>2</td>
+<td>20311 +/- 2402</td>
+<td>7660 +/- 678</td>
+<td>17.03 +/- 4.40</td>
+<td>40.74 +/- 30.33</td>
+<td>29.54 +/- 9.03</td>
+<td>5184.02 +/- 1631.54</td>
+<td>3.27 +/- 3.94</td>
+</tr>
+<tr>
+<td>3</td>
+<td>20193 +/- 2399</td>
+<td>7700 +/- 797</td>
+<td>15.75 +/- 3.30</td>
+<td>56.56 +/- 17.16</td>
+<td>27.33 +/- 7.48</td>
+<td>4803.49 +/- 1313.31</td>
+<td>3.07 +/- 2.86</td>
+</tr>
+<tr>
+<td>4</td>
+<td>15537 +/- 2531</td>
+<td>7620 +/- 1392</td>
+<td>15.37 +/- 3.79</td>
+<td>63.05 +/- 18.30</td>
+<td>15.88 +/- 4.99</td>
+<td>3162.13 +/- 962.59</td>
+<td>3.11 +/- 2.22</td>
+</tr>
+<tr>
+<td>5</td>
+<td>32047 +/- 3356</td>
+<td>8093 +/- 831</td>
+<td>23.79 +/- 6.18</td>
+<td>53.36 +/- 18.06</td>
+<td>48.04 +/- 13.77</td>
+<td>9866.23 +/- 2877.30</td>
+<td>2.26 +/- 1.16</td>
+</tr>
+<tr>
+<td>6</td>
+<td>32946 +/- 4753</td>
+<td>8227 +/- 736</td>
+<td>24.07 +/- 4.69</td>
+<td>54.65 +/- 12.57</td>
+<td>50.98 +/- 10.54</td>
+<td>10468.86 +/- 2228.53</td>
+<td>2.21 +/- 2.33</td>
+</tr>
+<tr>
+<td>7</td>
+<td>39504 +/- 4171</td>
+<td>8401 +/- 785</td>
+<td>22.55 +/- 4.56</td>
+<td>45.22 +/- 10.34</td>
+<td>48.41 +/- 9.67</td>
+<td>9829.40 +/- 1993.20</td>
+<td>2.26 +/- 1.11</td>
+</tr>
+<tr>
+<td>8</td>
+<td>37998 +/- 3792</td>
+<td>8443 +/- 1211</td>
+<td>39.03 +/- 7.52</td>
+<td>42.16 +/- 12.35</td>
+<td>40.98 +/- 14.89</td>
+<td>8128.87 +/- 3055.34</td>
+<td>3.57 +/- 2.77</td>
+</tr>
+</table>
+</body>
+</html>
+"""
+ pathname = os.path.join(gerald_dir, 'Summary.htm')
+ f = open(pathname, 'w')
+ f.write(summary_htm)
+ f.close()
+
+def make_eland_results(gerald_dir):
+ eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D.
+>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T
+>HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0
+>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T
+"""
+ for i in range(1,9):
+ pathname = os.path.join(gerald_dir,
+ 's_%d_eland_result.txt' % (i,))
+ f = open(pathname, 'w')
+ f.write(eland_result)
+ f.close()
+
+class RunfolderTests(unittest.TestCase):
+ """
+ Test components of the runfolder processing code
+ which includes firecrest, bustard, and gerald
+ """
+ def setUp(self):
+ # make a fake runfolder directory
+ self.temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_')
+
+ self.runfolder_dir = os.path.join(self.temp_dir,
+ '080102_HWI-EAS229_0010_207BTAAXX')
+ os.mkdir(self.runfolder_dir)
+
+ self.data_dir = os.path.join(self.runfolder_dir, 'Data')
+ os.mkdir(self.data_dir)
+
+ self.firecrest_dir = os.path.join(self.data_dir,
+ 'C1-33_Firecrest1.8.28_12-04-2008_diane'
+ )
+ os.mkdir(self.firecrest_dir)
+ self.matrix_dir = os.path.join(self.firecrest_dir, 'Matrix')
+ os.mkdir(self.matrix_dir)
+ make_matrix(self.matrix_dir)
+
+ self.bustard_dir = os.path.join(self.firecrest_dir,
+ 'Bustard1.8.28_12-04-2008_diane')
+ os.mkdir(self.bustard_dir)
+ make_phasing_params(self.bustard_dir)
+
+ self.gerald_dir = os.path.join(self.bustard_dir,
+ 'GERALD_12-04-2008_diane')
+ os.mkdir(self.gerald_dir)
+ make_gerald_config(self.gerald_dir)
+ make_summary_htm(self.gerald_dir)
+ make_eland_results(self.gerald_dir)
+
+ def tearDown(self):
+ shutil.rmtree(self.temp_dir)
+
+ def test_firecrest(self):
+ """
+ Construct a firecrest object
+ """
+ f = firecrest.firecrest(self.firecrest_dir)
+ self.failUnlessEqual(f.version, '1.8.28')
+ self.failUnlessEqual(f.start, 1)
+ self.failUnlessEqual(f.stop, 33)
+ self.failUnlessEqual(f.user, 'diane')
+ self.failUnlessEqual(f.date, datetime(2008,4,12))
+
+ xml = f.get_elements()
+ # just make sure that element tree can serialize the tree
+ xml_str = ElementTree.tostring(xml)
+
+ f2 = firecrest.Firecrest(xml=xml)
+ self.failUnlessEqual(f.version, f2.version)
+ self.failUnlessEqual(f.start, f2.start)
+ self.failUnlessEqual(f.stop, f2.stop)
+ self.failUnlessEqual(f.user, f2.user)
+ self.failUnlessEqual(f.date, f2.date)
+
+ def test_bustard(self):
+ """
+ construct a bustard object
+ """
+ b = bustard.bustard(self.bustard_dir)
+ self.failUnlessEqual(b.version, '1.8.28')
+ self.failUnlessEqual(b.date, datetime(2008,4,12))
+ self.failUnlessEqual(b.user, 'diane')
+ self.failUnlessEqual(len(b.phasing), 8)
+ self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099)
+
+ xml = b.get_elements()
+ b2 = bustard.Bustard(xml=xml)
+ self.failUnlessEqual(b.version, b2.version)
+ self.failUnlessEqual(b.date, b2.date )
+ self.failUnlessEqual(b.user, b2.user)
+ self.failUnlessEqual(len(b.phasing), len(b2.phasing))
+ for key in b.phasing.keys():
+ self.failUnlessEqual(b.phasing[key].lane,
+ b2.phasing[key].lane)
+ self.failUnlessEqual(b.phasing[key].phasing,
+ b2.phasing[key].phasing)
+ self.failUnlessEqual(b.phasing[key].prephasing,
+ b2.phasing[key].prephasing)
+
+ def test_gerald(self):
+ # need to update gerald and make tests for it
+ g = gerald.gerald(self.gerald_dir)
+
+ self.failUnlessEqual(g.version,
+ '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp')
+ self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30))
+ self.failUnlessEqual(len(g.lanes), len(g.lanes.keys()))
+ self.failUnlessEqual(len(g.lanes), len(g.lanes.items()))
+
+
+ # list of genomes, matches what was defined up in
+ # make_gerald_config.
+ # the first None is to offset the genomes list to be 1..9
+ # instead of pythons default 0..8
+ genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2',
+ '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ]
+
+ # test lane specific parameters from gerald config file
+ for i in range(1,9):
+ cur_lane = g.lanes[str(i)]
+ self.failUnlessEqual(cur_lane.analysis, 'eland')
+ self.failUnlessEqual(cur_lane.eland_genome, genomes[i])
+ self.failUnlessEqual(cur_lane.read_length, '32')
+ self.failUnlessEqual(cur_lane.use_bases, 'Y'*32)
+
+ # test data extracted from summary file
+ clusters = [None,
+ (17421, 2139), (20311, 2402), (20193, 2399), (15537, 2531),
+ (32047, 3356), (32946, 4753), (39504, 4171), (37998, 3792)]
+
+ for i in range(1,9):
+ summary_lane = g.summary[str(i)]
+ self.failUnlessEqual(summary_lane.cluster, clusters[i])
+ self.failUnlessEqual(summary_lane.lane, str(i))
+
+ xml = g.get_elements()
+ # just make sure that element tree can serialize the tree
+ xml_str = ElementTree.tostring(xml)
+ g2 = gerald.Gerald(xml=xml)
+
+ # do it all again after extracting from the xml file
+ self.failUnlessEqual(g.version, g2.version)
+ self.failUnlessEqual(g.date, g2.date)
+ self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys()))
+ self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items()))
+
+ # test lane specific parameters from gerald config file
+ for i in range(1,9):
+ g_lane = g.lanes[str(i)]
+ g2_lane = g2.lanes[str(i)]
+ self.failUnlessEqual(g_lane.analysis, g2_lane.analysis)
+ self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome)
+ self.failUnlessEqual(g_lane.read_length, g2_lane.read_length)
+ self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases)
+
+ # test (some) summary elements
+ for i in range(1,9):
+ g_summary = g.summary[str(i)]
+ g2_summary = g2.summary[str(i)]
+ self.failUnlessEqual(g_summary.cluster, g2_summary.cluster)
+ self.failUnlessEqual(g_summary.lane, g2_summary.lane)
+
+ g_eland = g.eland_results
+ g2_eland = g2.eland_results
+ for lane in g_eland.keys():
+ self.failUnlessEqual(g_eland[lane].reads,
+ g2_eland[lane].reads)
+ self.failUnlessEqual(len(g_eland[lane].mapped_reads),
+ len(g2_eland[lane].mapped_reads))
+ for k in g_eland[lane].mapped_reads.keys():
+ self.failUnlessEqual(g_eland[lane].mapped_reads[k],
+ g2_eland[lane].mapped_reads[k])
+
+ self.failUnlessEqual(len(g_eland[lane].match_codes),
+ len(g2_eland[lane].match_codes))
+ for k in g_eland[lane].match_codes.keys():
+ self.failUnlessEqual(g_eland[lane].match_codes[k],
+ g2_eland[lane].match_codes[k])
+
+
+ def test_eland(self):
+ dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa',
+ 'chr2L.fa': 'dm3/chr2L.fa',
+ 'Lambda.fa': 'Lambda.fa'}
+ genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map,
+ '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map }
+ eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps)
+
+ for i in range(1,9):
+ lane = eland[str(i)]
+ self.failUnlessEqual(lane.reads, 4)
+ self.failUnlessEqual(len(lane.mapped_reads), 3)
+ self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1)
+ self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1)
+ self.failUnlessEqual(lane.match_codes['U1'], 2)
+ self.failUnlessEqual(lane.match_codes['NM'], 1)
+
+ xml = eland.get_elements()
+ # just make sure that element tree can serialize the tree
+ xml_str = ElementTree.tostring(xml)
+ e2 = gerald.ELAND(xml=xml)
+
+ for i in range(1,9):
+ l1 = eland[str(i)]
+ l2 = e2[str(i)]
+ self.failUnlessEqual(l1.reads, l2.reads)
+ self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads))
+ self.failUnlessEqual(len(l1.mapped_reads), 3)
+ for k in l1.mapped_reads.keys():
+ self.failUnlessEqual(l1.mapped_reads[k],
+ l2.mapped_reads[k])
+
+ self.failUnlessEqual(len(l1.match_codes), 9)
+ self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes))
+ for k in l1.match_codes.keys():
+ self.failUnlessEqual(l1.match_codes[k],
+ l2.match_codes[k])
+
+ def test_runfolder(self):
+ runs = runfolder.get_runs(self.runfolder_dir)
+
+ # do we get the flowcell id from the filename?
+ self.failUnlessEqual(len(runs), 1)
+ self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml')
+
+ # do we get the flowcell id from the FlowcellId.xml file
+ make_flowcell_id(self.runfolder_dir, '207BTAAXY')
+ runs = runfolder.get_runs(self.runfolder_dir)
+ self.failUnlessEqual(len(runs), 1)
+ self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml')
+
+ r1 = runs[0]
+ xml = r1.get_elements()
+ xml_str = ElementTree.tostring(xml)
+
+ r2 = runfolder.PipelineRun(xml=xml)
+ self.failUnlessEqual(r1.name, r2.name)
+ self.failIfEqual(r2.firecrest, None)
+ self.failIfEqual(r2.bustard, None)
+ self.failIfEqual(r2.gerald, None)
+
+
+def suite():
+ return unittest.makeSuite(RunfolderTests,'test')
+
+if __name__ == "__main__":
+ unittest.main(defaultTest="suite")
+
--- /dev/null
+"""
+ElementTree helper functions
+"""
+def indent(elem, level=0):
+ """
+ reformat an element tree to be 'pretty' (indented)
+ """
+ i = "\n" + level*" "
+ if len(elem):
+ if not elem.text or not elem.text.strip():
+ elem.text = i + " "
+ for child in elem:
+ indent(child, level+1)
+ # we don't want the closing tag indented too far
+ child.tail = i
+ if not elem.tail or not elem.tail.strip():
+ elem.tail = i
+ else:
+ if level and (not elem.tail or not elem.tail.strip()):
+ elem.tail = i
+
+def flatten(elem, include_tail=0):
+ """
+ Extract the text from an element tree
+ (AKA extract the text that not part of XML tags)
+ """
+ text = elem.text or ""
+ for e in elem:
+ text += flatten(e, 1)
+ if include_tail and elem.tail: text += elem.tail
+ return text
+
--- /dev/null
+import os
+import unittest
+
+from xml.etree import ElementTree
+from gaworkflow.util.ethelp import indent, flatten
+
+class testETHelper(unittest.TestCase):
+ def setUp(self):
+ self.foo = '<foo><bar>asdf</bar><br/></foo>'
+ self.foo_tree = ElementTree.fromstring(self.foo)
+
+ def test_indent(self):
+ flat_foo = ElementTree.tostring(self.foo_tree)
+ self.failUnlessEqual(len(flat_foo.split('\n')), 1)
+
+ indent(self.foo_tree)
+ pretty_foo = ElementTree.tostring(self.foo_tree)
+ self.failUnlessEqual(len(pretty_foo.split('\n')), 5)
+
+ def test_flatten(self):
+ self.failUnless(flatten(self.foo_tree), 'asdf')
+
+def suite():
+ return unittest.makeSuite(testETHelper, 'test')
+
+if __name__ == "__main__":
+ unittest.main(defaultTest='suite')
+
+
+
+
--- /dev/null
+#!/usr/bin/env python
+"""
+Runfolder.py can generate a xml file capturing all the 'interesting' parameters from a finished pipeline run. (using the -a option). The information currently being captured includes:
+
+ * Flowcell ID
+ * run dates
+ * start/stop cycle numbers
+ * Firecrest, bustard, gerald version numbers
+ * Eland analysis types, and everything in the eland configuration file.
+ * cluster numbers and other values from the Summary.htm
+ LaneSpecificParameters table.
+ * How many reads mapped to a genome from an eland file
+
+The ELAND "mapped reads" counter will also check for eland squashed file
+that were symlinked from another directory. This is so I can track how
+many reads landed on the genome of interest and on the spike ins.
+
+Basically my subdirectories something like:
+
+genomes/hg18
+genomes/hg18/chr*.2bpb <- files for hg18 genome
+genomes/hg18/chr*.vld
+genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
+genomes/spikein
+
+runfolder.py can also spit out a simple summary report (-s option)
+that contains the per lane post filter cluster numbers and the mapped
+read counts. (The report isn't currently very pretty)
+"""
+import logging
+import optparse
+import sys
+
+from gaworkflow.pipeline import runfolder
+
+def make_parser():
+ usage = 'usage: %prog [options] runfolder_root_dir'
+ parser = optparse.OptionParser(usage)
+ parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
+ default=False,
+ help='turn on verbose mode')
+ parser.add_option('-s', '--summary', dest='summary', action='store_true',
+ default=False,
+ help='produce summary report')
+ parser.add_option('-a', '--archive', dest='archive', action='store_true',
+ default=False,
+ help='generate run configuration archive')
+ return parser
+
+def main(cmdlist=None):
+ parser = make_parser()
+ opt, args = parser.parse_args(cmdlist)
+
+ if len(args) == 0:
+ parser.error('need path to a runfolder')
+
+ logging.basicConfig()
+ if opt.verbose:
+ root_log = logging.getLogger()
+ root_log.setLevel(logging.INFO)
+
+ for run_dir in args:
+ runs = runfolder.get_runs(run_dir)
+ if opt.summary:
+ print runfolder.summary_report(runs)
+ if opt.archive:
+ runfolder.extract_run_parameters(runs)
+
+ return 0
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
+++ /dev/null
-#!/usr/bin/env python
-"""
-Runfolder.py can generate a xml file capturing all the 'interesting' parameters from a finished pipeline run. (using the -a option). The information currently being captured includes:
-
- * Flowcell ID
- * run dates
- * start/stop cycle numbers
- * Firecrest, bustard, gerald version numbers
- * Eland analysis types, and everything in the eland configuration file.
- * cluster numbers and other values from the Summary.htm
- LaneSpecificParameters table.
- * How many reads mapped to a genome from an eland file
-
-The ELAND "mapped reads" counter will also check for eland squashed file
-that were symlinked from another directory. This is so I can track how
-many reads landed on the genome of interest and on the spike ins.
-
-Basically my subdirectories something like:
-
-genomes/hg18
-genomes/hg18/chr*.2bpb <- files for hg18 genome
-genomes/hg18/chr*.vld
-genomes/hg18/VATG.fa.2bp <- symlink to genomes/spikeins
-genomes/spikein
-
-runfolder.py can also spit out a simple summary report (-s option)
-that contains the per lane post filter cluster numbers and the mapped
-read counts. (The report isn't currently very pretty)
-"""
-import time
-import logging
-import os
-import re
-import stat
-import sys
-from glob import glob
-from pprint import pprint
-import optparse
-
-try:
- from xml.etree import ElementTree
-except ImportError, e:
- from elementtree import ElementTree
-
-from gaworkflow.util.alphanum import alphanum
-EUROPEAN_STRPTIME = "%d-%m-%Y"
-EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
-VERSION_RE = "([0-9\.]+)"
-USER_RE = "([a-zA-Z0-9]+)"
-LANES_PER_FLOWCELL = 8
-
-runfolder_path = '/home/diane/gec/080221_HWI-EAS229_0018_201BHAAXX'
-
-def indent(elem, level=0):
- """
- reformat an element tree to be 'pretty' (indented)
- """
- i = "\n" + level*" "
- if len(elem):
- if not elem.text or not elem.text.strip():
- elem.text = i + " "
- for child in elem:
- indent(child, level+1)
- # we don't want the closing tag indented too far
- child.tail = i
- if not elem.tail or not elem.tail.strip():
- elem.tail = i
- else:
- if level and (not elem.tail or not elem.tail.strip()):
- elem.tail = i
-
-def flatten(elem, include_tail=0):
- """
- Extract the text from an element tree
- (AKA extract the text that not part of XML tags)
- """
- text = elem.text or ""
- for e in elem:
- text += flatten(e, 1)
- if include_tail and elem.tail: text += elem.tail
- return text
-
-class Firecrest(object):
- def __init__(self, pathname):
- self.pathname = pathname
-
- # parse firecrest directory name
- path, name = os.path.split(pathname)
- groups = name.split('_')
- # grab the start/stop cycle information
- cycle = re.match("C([0-9]+)-([0-9]+)", groups[0])
- self.start = int(cycle.group(1))
- self.stop = int(cycle.group(2))
- # firecrest version
- version = re.search(VERSION_RE, groups[1])
- self.version = (version.group(1))
- # datetime
- self.date = time.strptime(groups[2], EUROPEAN_STRPTIME)
- self.time = time.mktime(self.date)
- # username
- self.user = groups[3]
-
- # should I parse this deeper than just stashing the
- # contents of the matrix file?
- matrix_pathname = os.path.join(self.pathname, 'Matrix', 's_matrix.txt')
- self.matrix = open(matrix_pathname, 'r').read()
-
- def dump(self):
- print "Starting cycle:", self.start
- print "Ending cycle:", self.stop
- print "Firecrest version:", self.version
- print "Run date:", self.date
- print "user:", self.user
-
- def _get_elements(self):
- firecrest = ElementTree.Element('Firecrest')
- version = ElementTree.SubElement(firecrest, 'version')
- version.text = self.version
- start_cycle = ElementTree.SubElement(firecrest, 'FirstCycle')
- start_cycle.text = str(self.start)
- stop_cycle = ElementTree.SubElement(firecrest, 'LastCycle')
- stop_cycle.text = str(self.stop)
- run_date = ElementTree.SubElement(firecrest, 'run_time')
- run_date.text = str(self.time)
- matrix = ElementTree.SubElement(firecrest, 'matrix')
- matrix.text = self.matrix
- return firecrest
- elements=property(_get_elements)
-
-class Bustard(object):
- def __init__(self, pathname):
- self.pathname = pathname
-
- path, name = os.path.split(pathname)
- groups = name.split("_")
- version = re.search(VERSION_RE, groups[0])
- self.version = version.group(1)
- self.date = time.strptime(groups[1], EUROPEAN_STRPTIME)
- self.time = time.mktime(self.date)
- self.user = groups[2]
- self.phasing = {}
- self._load_params()
-
- def _load_params(self):
- paramfiles = glob(os.path.join(self.pathname, "params*"))
- for paramfile in paramfiles:
- path, name = os.path.split(paramfile)
- basename, ext = os.path.splitext(name)
- # the last character of the param filename should be the
- # lane number
- lane = int(basename[-1])
- # we want the whole tree, not just the stuff under
- # the first tag
- param_tree = ElementTree.parse(paramfile).getroot()
- self.phasing[lane] = param_tree
-
- def dump(self):
- print "Bustard version:", self.version
- print "Run date", self.date
- print "user:", self.user
- for lane, tree in self.phasing.items():
- print lane
- print tree
-
- def _get_elements(self):
- bustard = ElementTree.Element('Bustard')
- version = ElementTree.SubElement(bustard, 'version')
- version.text = self.version
- run_date = ElementTree.SubElement(bustard, 'run_time')
- run_date.text = str(self.time)
- return bustard
- elements=property(_get_elements)
-
-class GERALD(object):
- """
- Capture meaning out of the GERALD directory
- """
- class LaneParameters(object):
- """
- Make it easy to access elements of LaneSpecificRunParameters from python
- """
- def __init__(self, tree, key):
- self._tree = tree.find('LaneSpecificRunParameters')
- self._key = key
-
- def __get_attribute(self, xml_tag):
- container = self._tree.find(xml_tag)
- if container is None or \
- len(container.getchildren()) != LANES_PER_FLOWCELL:
- raise RuntimeError('GERALD config.xml file changed')
- lanes = [x.tag.split('_')[1] for x in container.getchildren()]
- index = lanes.index(self._key)
- #element = container.find(self._key)
- element = container[index]
- return element.text
- def _get_analysis(self):
- return self.__get_attribute('ANALYSIS')
- analysis = property(_get_analysis)
-
- def _get_eland_genome(self):
- return self.__get_attribute('ELAND_GENOME')
- eland_genome = property(_get_eland_genome)
-
- def _get_read_length(self):
- return self.__get_attribute('READ_LENGTH')
- read_length = property(_get_read_length)
-
- def _get_use_bases(self):
- return self.__get_attribute('USE_BASES')
- use_bases = property(_get_use_bases)
-
- class LaneSpecificRunParameters(object):
- """
- Provide access to LaneSpecificRunParameters
- """
- def __init__(self, tree):
- self._tree = tree
- self._keys = None
- def __getitem__(self, key):
- return GERALD.LaneParameters(self._tree, key)
- def keys(self):
- if self._keys is None:
- analysis = self._tree.find('LaneSpecificRunParameters/ANALYSIS')
- # according to the pipeline specs I think their fields
- # are sampleName_laneID, with sampleName defaulting to s
- # since laneIDs are constant lets just try using
- # those consistently.
- self._keys = [ x.tag.split('_')[1] for x in analysis]
- return self._keys
- def values(self):
- return [ self[x] for x in self.keys() ]
- def items(self):
- return zip(self.keys(), self.values())
- def __len__(self):
- return len(self.keys)
-
- def __init__(self, pathname):
- self.pathname = pathname
- path, name = os.path.split(pathname)
- config_pathname = os.path.join(self.pathname, 'config.xml')
- self.tree = ElementTree.parse(config_pathname).getroot()
- self.version = self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
-
- date = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
- self.date = time.strptime(date)
- self.time = time.mktime(self.date)
-
- # parse Summary.htm file
- summary_pathname = os.path.join(self.pathname, 'Summary.htm')
- self.summary = Summary(summary_pathname)
- self.lanes = GERALD.LaneSpecificRunParameters(self.tree)
- self.eland_results = ELAND(self, self.pathname)
-
- def dump(self):
- """
- Debugging function, report current object
- """
- print 'Gerald version:', self.version
- print 'Gerald run date:', self.date
- print 'Gerald config.xml:', self.tree
- self.summary.dump()
-
- def _get_elements(self):
- gerald = ElementTree.Element('Gerald')
- gerald.append(self.tree)
- gerald.append(self.summary.elements)
- return gerald
- elements = property(_get_elements)
-
-def tonumber(v):
- """
- Convert a value to int if its an int otherwise a float.
- """
- try:
- v = int(v)
- except ValueError, e:
- v = float(v)
- return v
-
-def parse_mean_range(value):
- """
- Parse values like 123 +/- 4.5
- """
- if value.strip() == 'unknown':
- return 0, 0
-
- average, pm, deviation = value.split()
- if pm != '+/-':
- raise RuntimeError("Summary.htm file format changed")
- return tonumber(average), tonumber(deviation)
-
-def mean_range_element(parent, name, mean, deviation):
- """
- Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
- """
- element = ElementTree.SubElement(parent, name,
- { 'mean': str(mean),
- 'deviation': str(deviation)})
- return element
-
-class LaneResultSummary(object):
- """
- Parse the LaneResultSummary table out of Summary.htm
- Mostly for the cluster number
- """
- def __init__(self, row_element):
- data = [ flatten(x) for x in row_element ]
- if len(data) != 8:
- raise RuntimeError("Summary.htm file format changed")
-
- self.lane = data[0]
- self.cluster = parse_mean_range(data[1])
- self.average_first_cycle_intensity = parse_mean_range(data[2])
- self.percent_intensity_after_20_cycles = parse_mean_range(data[3])
- self.percent_pass_filter_clusters = parse_mean_range(data[4])
- self.percent_pass_filter_align = parse_mean_range(data[5])
- self.average_alignment_score = parse_mean_range(data[6])
- self.percent_error_rate = parse_mean_range(data[7])
-
- def _get_elements(self):
- lane_result = ElementTree.Element('LaneResultSummary',
- {'lane': self.lane})
- cluster = mean_range_element(lane_result, 'Cluster', *self.cluster)
- first_cycle = mean_range_element(lane_result,
- 'AverageFirstCycleIntensity',
- *self.average_first_cycle_intensity)
- after_20 = mean_range_element(lane_result,
- 'PercentIntensityAfter20Cycles',
- *self.percent_intensity_after_20_cycles)
- pass_filter = mean_range_element(lane_result,
- 'PercentPassFilterClusters',
- *self.percent_pass_filter_clusters)
- alignment = mean_range_element(lane_result,
- 'AverageAlignmentScore',
- *self.average_alignment_score)
- error_rate = mean_range_element(lane_result,
- 'PercentErrorRate',
- *self.percent_error_rate)
- return lane_result
- elements = property(_get_elements)
-
-class Summary(object):
- """
- Extract some useful information from the Summary.htm file
- """
- def __init__(self, pathname):
- self.pathname = pathname
- self.tree = ElementTree.parse(pathname).getroot()
- self.lane_results = {}
-
- self._extract_lane_results()
-
- def _extract_lane_results(self):
- """
- extract the Lane Results Summary table
- """
- if flatten(self.tree.findall('*//h2')[3]) != 'Lane Results Summary':
- raise RuntimeError("Summary.htm file format changed")
-
- tables = self.tree.findall('*//table')
-
- # parse lane result summary
- lane_summary = tables[2]
- rows = lane_summary.getchildren()
- headers = rows[0].getchildren()
- if flatten(headers[2]) != 'Av 1st Cycle Int ':
- raise RuntimeError("Summary.htm file format changed")
-
- for r in rows[1:]:
- lrs = LaneResultSummary(r)
- self.lane_results[lrs.lane] = lrs
-
- def _get_elements(self):
- summary = ElementTree.Element('Summary')
- for lane in self.lane_results.values():
- summary.append(lane.elements)
- return summary
- elements = property(_get_elements)
-
- def dump(self):
- """
- Debugging function, report current object
- """
- pass
-
-
-class ELAND(object):
- """
- Summarize information from eland files
- """
- class ElandResult(object):
- """
- Process an eland result file
- """
- def __init__(self, gerald, pathname):
- self.gerald = gerald
- self.pathname = pathname
- # extract the sample name
- path, name = os.path.split(self.pathname)
- split_name = name.split('_')
- self.sample_name = split_name[0]
- self.lane_id = split_name[1]
- self._reads = None
- self._mapped_reads = None
- self._fasta_map = {}
-
- def _build_fasta_map(self, genome_dir):
- # build fasta to fasta file map
- genome = genome_dir.split(os.path.sep)[-1]
- fasta_map = {}
- for vld_file in glob(os.path.join(genome_dir, '*.vld')):
- is_link = False
- if os.path.islink(vld_file):
- is_link = True
- vld_file = os.path.realpath(vld_file)
- path, vld_name = os.path.split(vld_file)
- name, ext = os.path.splitext(vld_name)
- if is_link:
- fasta_map[name] = name
- else:
- fasta_map[name] = os.path.join(genome, name)
- self._fasta_map = fasta_map
-
- def _update(self):
- """
- Actually read the file and actually count the reads
- """
- if os.stat(self.pathname)[stat.ST_SIZE] == 0:
- raise RuntimeError("Eland isn't done, try again later.")
-
- reads = 0
- mapped_reads = {}
- genome_dir = self.gerald.lanes[self.lane_id].eland_genome
- self._build_fasta_map(genome_dir)
- match_codes = {'NM':0, 'QC':0, 'RM':0,
- 'U0':0, 'U1':0, 'U2':0,
- 'R0':0, 'R1':0, 'R2':0,
- }
- for line in open(self.pathname):
- reads += 1
- fields = line.split()
- # code = fields[2]
- # match_codes[code] = match_codes.setdefault(code, 0) + 1
- # the QC/NM etc codes are in the 3rd field and always present
- match_codes[fields[2]] += 1
- # ignore lines that don't have a fasta filename
- if len(fields) < 7:
- continue
- fasta = self._fasta_map.get(fields[6], fields[6])
- mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
- self._match_codes = match_codes
- self._mapped_reads = mapped_reads
- self._reads = reads
-
- def _get_reads(self):
- if self._reads is None:
- self._update()
- return self._reads
- reads = property(_get_reads)
-
- def _get_mapped_reads(self):
- if self._mapped_reads is None:
- self._update()
- return self._mapped_reads
- mapped_reads = property(_get_mapped_reads)
-
- def __init__(self, gerald, basedir):
- # we need information from the gerald config.xml
- self.gerald = gerald
- self.results = {}
- for f in glob(os.path.join(basedir, "*_eland_result.txt")):
- eland_result = ELAND.ElandResult(gerald, f)
- self.results[eland_result.lane_id] = eland_result
-
-class PipelineRun(object):
- """
- Capture "interesting" information about a pipeline run
- """
- def __init__(self, pathname, firecrest, bustard, gerald):
- self.pathname = pathname
- self._name = None
- self._flowcell_id = None
- self.firecrest = firecrest
- self.bustard = bustard
- self.gerald = gerald
-
- def _get_flowcell_id(self):
- # extract flowcell ID
- if self._flowcell_id is None:
- config_dir = os.path.join(self.pathname, 'Config')
- flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
- if os.path.exists(flowcell_id_path):
- flowcell_id_tree = ElementTree.parse(flowcell_id_path)
- self._flowcell_id = flowcell_id_tree.findtext('Text')
- else:
- logging.warning(
- "Unable to determine flowcell id as %s was not found" % (
- flowcell_id_path))
- self._flowcell_id = "unknown"
- return self._flowcell_id
- flowcell_id = property(_get_flowcell_id)
-
- def _get_xml(self):
- """
- make one master xml file from all of our sub-components.
- """
- root = ElementTree.Element('PipelineRun')
- flowcell = ElementTree.SubElement(root, 'FlowcellID')
- flowcell.text = self.flowcell_id
- root.append(self.firecrest.elements)
- root.append(self.bustard.elements)
- root.append(self.gerald.elements)
- return root
-
- def _get_pretty_xml(self):
- """
- Generate indented xml file
- """
- root = self._get_xml()
- indent(root)
- return root
- xml = property(_get_pretty_xml)
-
- def _get_run_name(self):
- """
- Given a run tuple, find the latest date and use that as our name
- """
- if self._name is None:
- tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
- timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
- self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
- return self._name
- name = property(_get_run_name)
-
- def save(self):
- logging.info("Saving run report "+ self.name)
- ElementTree.ElementTree(self.xml).write(self.name)
-
-def get_runs(runfolder):
- """
- Search through a run folder for all the various sub component runs
- and then return a PipelineRun for each different combination.
-
- For example if there are two different GERALD runs, this will
- generate two different PipelineRun objects, that differ
- in there gerald component.
- """
- datadir = os.path.join(runfolder, 'Data')
-
- logging.info('Searching for runs in ' + datadir)
- runs = []
- for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
- f = Firecrest(firecrest_pathname)
- bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
- for bustard_pathname in glob(bustard_glob):
- b = Bustard(bustard_pathname)
- gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
- for gerald_pathname in glob(gerald_glob):
- try:
- g = GERALD(gerald_pathname)
- runs.append(PipelineRun(runfolder, f, b, g))
- except IOError, e:
- print "Ignoring", str(e)
- return runs
-
-
-def extract_run_parameters(runs):
- """
- Search through runfolder_path for various runs and grab their parameters
- """
- for run in runs:
- run.save()
-
-def summarize_mapped_reads(mapped_reads):
- """
- Summarize per chromosome reads into a genome count
- But handle spike-in/contamination symlinks seperately.
- """
- summarized_reads = {}
- genome_reads = 0
- genome = 'unknown'
- for k, v in mapped_reads.items():
- path, k = os.path.split(k)
- if len(path) > 0:
- genome = path
- genome_reads += v
- else:
- summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
- summarized_reads[genome] = genome_reads
- return summarized_reads
-
-def summary_report(runs):
- """
- Summarize cluster numbers and mapped read counts for a runfolder
- """
- report = []
- for run in runs:
- # print a run name?
- report.append('Summary for %s' % (run.name,))
- # sort the report
- eland_keys = run.gerald.eland_results.results.keys()
- eland_keys.sort(alphanum)
-
- lane_results = run.gerald.summary.lane_results
- for lane_id in eland_keys:
- result = run.gerald.eland_results.results[lane_id]
- report.append("Sample name %s" % (result.sample_name))
- report.append("Lane id %s" % (result.lane_id,))
- cluster = lane_results[result.lane_id].cluster
- report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
- report.append("Total Reads: %d" % (result.reads))
- mc = result._match_codes
- report.append("No Match: %d" % (mc['NM']))
- report.append("QC Failed: %d" % (mc['QC']))
- report.append('Unique (0,1,2 mismatches) %d %d %d' % \
- (mc['U0'], mc['U1'], mc['U2']))
- report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
- (mc['R0'], mc['R1'], mc['R2']))
- report.append("Mapped Reads")
- mapped_reads = summarize_mapped_reads(result.mapped_reads)
- for name, counts in mapped_reads.items():
- report.append(" %s: %d" % (name, counts))
- report.append('---')
- report.append('')
- return os.linesep.join(report)
-
-def make_parser():
- usage = 'usage: %prog [options] runfolder_root_dir'
- parser = optparse.OptionParser(usage)
- parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
- default=False,
- help='turn on verbose mode')
- parser.add_option('-s', '--summary', dest='summary', action='store_true',
- default=False,
- help='produce summary report')
- parser.add_option('-a', '--archive', dest='archive', action='store_true',
- default=False,
- help='generate run configuration archive')
- return parser
-
-def main(cmdlist=None):
- parser = make_parser()
- opt, args = parser.parse_args(cmdlist)
-
- if len(args) == 0:
- parser.error('need path to a runfolder')
-
- logging.basicConfig()
- if opt.verbose:
- root_log = logging.getLogger()
- root_log.setLevel(logging.INFO)
-
- for runfolder in args:
- runs = get_runs(runfolder)
- if opt.summary:
- print summary_report(runs)
- if opt.archive:
- extract_run_parameters(runs)
-
- return 0
-
-if __name__ == "__main__":
- sys.exit(main(sys.argv[1:]))