From: Diane Trout Date: Wed, 15 Oct 2008 19:49:34 +0000 (+0000) Subject: rename pipeline to pipelines to imply that we can process more than just illumina. X-Git-Tag: stanford.caltech-merged-database-2009-jan-15~32 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=fecaccc112d39cd8e7c1e6fad2593179a52cdac4 rename pipeline to pipelines to imply that we can process more than just illumina. --- diff --git a/htsworkflow/pipeline/__init__.py b/htsworkflow/pipeline/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/htsworkflow/pipeline/bustard.py b/htsworkflow/pipeline/bustard.py deleted file mode 100644 index 4e268f2..0000000 --- a/htsworkflow/pipeline/bustard.py +++ /dev/null @@ -1,146 +0,0 @@ - -from datetime import date -from glob import glob -import logging -import os -import time -import re - -from htsworkflow.pipeline.runfolder import \ - ElementTree, \ - VERSION_RE, \ - EUROPEAN_STRPTIME - -class Phasing(object): - PHASING = 'Phasing' - PREPHASING = 'Prephasing' - - def __init__(self, fromfile=None, xml=None): - self.lane = None - self.phasing = None - self.prephasing = None - - if fromfile is not None: - self._initialize_from_file(fromfile) - elif xml is not None: - self.set_elements(xml) - - def _initialize_from_file(self, pathname): - path, name = os.path.split(pathname) - basename, ext = os.path.splitext(name) - # the last character of the param base filename should be the - # lane number - tree = ElementTree.parse(pathname).getroot() - self.set_elements(tree) - self.lane = int(basename[-1]) - - def get_elements(self): - root = ElementTree.Element(Phasing.PHASING, {'lane': str(self.lane)}) - phasing = ElementTree.SubElement(root, Phasing.PHASING) - phasing.text = str(self.phasing) - prephasing = ElementTree.SubElement(root, Phasing.PREPHASING) - prephasing.text = str(self.prephasing) - return root - - def set_elements(self, tree): - if tree.tag not in ('Phasing', 'Parameters'): - raise ValueError('exptected Phasing or Parameters') - lane = tree.attrib.get('lane', None) - if lane is not None: - self.lane = int(lane) - for element in list(tree): - if element.tag == Phasing.PHASING: - self.phasing = float(element.text) - elif element.tag == Phasing.PREPHASING: - self.prephasing = float(element.text) - -class Bustard(object): - XML_VERSION = 1 - - # Xml Tags - BUSTARD = 'Bustard' - SOFTWARE_VERSION = 'version' - DATE = 'run_time' - USER = 'user' - PARAMETERS = 'Parameters' - - def __init__(self, xml=None): - self.version = None - self.date = date.today() - self.user = None - self.phasing = {} - - if xml is not None: - self.set_elements(xml) - - def _get_time(self): - return time.mktime(self.date.timetuple()) - time = property(_get_time, doc='return run time as seconds since epoch') - - def dump(self): - print "Bustard version:", self.version - print "Run date", self.date - print "user:", self.user - for lane, tree in self.phasing.items(): - print lane - print tree - - def get_elements(self): - root = ElementTree.Element('Bustard', - {'version': str(Bustard.XML_VERSION)}) - version = ElementTree.SubElement(root, Bustard.SOFTWARE_VERSION) - version.text = self.version - run_date = ElementTree.SubElement(root, Bustard.DATE) - run_date.text = str(self.time) - user = ElementTree.SubElement(root, Bustard.USER) - user.text = self.user - params = ElementTree.SubElement(root, Bustard.PARAMETERS) - for p in self.phasing.values(): - params.append(p.get_elements()) - return root - - def set_elements(self, tree): - if tree.tag != Bustard.BUSTARD: - raise ValueError('Expected "Bustard" SubElements') - xml_version = int(tree.attrib.get('version', 0)) - if xml_version > Bustard.XML_VERSION: - logging.warn('Bustard XML tree is a higher version than this class') - for element in list(tree): - if element.tag == Bustard.SOFTWARE_VERSION: - self.version = element.text - elif element.tag == Bustard.DATE: - self.date = date.fromtimestamp(float(element.text)) - elif element.tag == Bustard.USER: - self.user = element.text - elif element.tag == Bustard.PARAMETERS: - for param in element: - p = Phasing(xml=param) - self.phasing[p.lane] = p - else: - raise ValueError("Unrecognized tag: %s" % (element.tag,)) - - - -def bustard(pathname): - """ - Construct a Bustard object from pathname - """ - b = Bustard() - path, name = os.path.split(pathname) - groups = name.split("_") - version = re.search(VERSION_RE, groups[0]) - b.version = version.group(1) - t = time.strptime(groups[1], EUROPEAN_STRPTIME) - b.date = date(*t[0:3]) - b.user = groups[2] - paramfiles = glob(os.path.join(pathname, "params?.xml")) - for paramfile in paramfiles: - phasing = Phasing(paramfile) - assert (phasing.lane >= 1 and phasing.lane <= 8) - b.phasing[phasing.lane] = phasing - return b - -def fromxml(tree): - b = Bustard() - b.set_elements(tree) - return b diff --git a/htsworkflow/pipeline/configure_run.py b/htsworkflow/pipeline/configure_run.py deleted file mode 100644 index d541de3..0000000 --- a/htsworkflow/pipeline/configure_run.py +++ /dev/null @@ -1,606 +0,0 @@ -#!/usr/bin/python -import subprocess -import logging -import time -import re -import os - -from htsworkflow.pipeline.retrieve_config import getCombinedOptions, saveConfigFile -from htsworkflow.pipeline.retrieve_config import FlowCellNotFound, WebError404 -from htsworkflow.pipeline.genome_mapper import DuplicateGenome, getAvailableGenomes, constructMapperDict -from htsworkflow.pipeline.run_status import GARunStatus - -from pyinotify import WatchManager, ThreadedNotifier -from pyinotify import EventsCodes, ProcessEvent - -class ConfigInfo: - - def __init__(self): - #run_path = firecrest analysis directory to run analysis from - self.run_path = None - self.bustard_path = None - self.config_filepath = None - self.status = None - - #top level directory where all analyses are placed - self.base_analysis_dir = None - #analysis_dir, top level analysis dir... - # base_analysis_dir + '/070924_USI-EAS44_0022_FC12150' - self.analysis_dir = None - - - def createStatusObject(self): - """ - Creates a status object which can be queried for - status of running the pipeline - - returns True if object created - returns False if object cannot be created - """ - if self.config_filepath is None: - return False - - self.status = GARunStatus(self.config_filepath) - return True - - - -#################################### -# inotify event processor - -s_firecrest_finished = re.compile('Firecrest[0-9\._\-A-Za-z]+/finished.txt') -s_bustard_finished = re.compile('Bustard[0-9\._\-A-Za-z]+/finished.txt') -s_gerald_finished = re.compile('GERALD[0-9\._\-A-Za-z]+/finished.txt') - -s_gerald_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/Bustard[0-9\._\-A-Za-z]+/GERALD[0-9\._\-A-Za-z]+/') -s_bustard_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/Bustard[0-9\._\-A-Za-z]+/') -s_firecrest_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/') - -class RunEvent(ProcessEvent): - - def __init__(self, conf_info): - - self.run_status_dict = {'firecrest': False, - 'bustard': False, - 'gerald': False} - - self._ci = conf_info - - ProcessEvent.__init__(self) - - - def process_IN_CREATE(self, event): - fullpath = os.path.join(event.path, event.name) - if s_finished.search(fullpath): - logging.info("File Found: %s" % (fullpath)) - - if s_firecrest_finished.search(fullpath): - self.run_status_dict['firecrest'] = True - self._ci.status.updateFirecrest(event.name) - elif s_bustard_finished.search(fullpath): - self.run_status_dict['bustard'] = True - self._ci.status.updateBustard(event.name) - elif s_gerald_finished.search(fullpath): - self.run_status_dict['gerald'] = True - self._ci.status.updateGerald(event.name) - - #WARNING: The following order is important!! - # Firecrest regex will catch all gerald, bustard, and firecrest - # Bustard regex will catch all gerald and bustard - # Gerald regex will catch all gerald - # So, order needs to be Gerald, Bustard, Firecrest, or this - # won't work properly. - elif s_gerald_all.search(fullpath): - self._ci.status.updateGerald(event.name) - elif s_bustard_all.search(fullpath): - self._ci.status.updateBustard(event.name) - elif s_firecrest_all.search(fullpath): - self._ci.status.updateFirecrest(event.name) - - #print "Create: %s" % (os.path.join(event.path, event.name)) - - def process_IN_DELETE(self, event): - #print "Remove %s" % (os.path.join(event.path, event.name)) - pass - - - - -#FLAGS -# Config Step Error -RUN_ABORT = 'abort' -# Run Step Error -RUN_FAILED = 'failed' - - -##################################### -# Configure Step (goat_pipeline.py) -#Info -s_start = re.compile('Starting Genome Analyzer Pipeline') -s_gerald = re.compile("[\S\s]+--GERALD[\S\s]+--make[\S\s]+") -s_generating = re.compile('^Generating journals, Makefiles') -s_seq_folder = re.compile('^Sequence folder: ') -s_seq_folder_sub = re.compile('want to make ') -s_stderr_taskcomplete = re.compile('^Task complete, exiting') - -#Errors -s_invalid_cmdline = re.compile('Usage:[\S\s]*goat_pipeline.py') -s_species_dir_err = re.compile('Error: Lane [1-8]:') -s_goat_traceb = re.compile("^Traceback \(most recent call last\):") -s_missing_cycles = re.compile('^Error: Tile s_[1-8]_[0-9]+: Different number of cycles: [0-9]+ instead of [0-9]+') - -SUPPRESS_MISSING_CYCLES = False - - -##Ignore - Example of out above each ignore regex. -#NOTE: Commenting out an ignore will cause it to be -# logged as DEBUG with the logging module. -#CF_STDERR_IGNORE_LIST = [] -s_skip = re.compile('s_[0-8]_[0-9]+') - - -########################################## -# Pipeline Run Step (make -j8 recursive) - -##Info -s_finished = re.compile('finished') - -##Errors -s_make_error = re.compile('^make[\S\s]+Error') -s_no_gnuplot = re.compile('gnuplot: command not found') -s_no_convert = re.compile('^Can\'t exec "convert"') -s_no_ghostscript = re.compile('gs: command not found') - -##Ignore - Example of out above each ignore regex. -#NOTE: Commenting out an ignore will cause it to be -# logged as DEBUG with the logging module. -# -PL_STDERR_IGNORE_LIST = [] -# Info: PF 11802 -PL_STDERR_IGNORE_LIST.append( re.compile('^Info: PF') ) -# About to analyse intensity file s_4_0101_sig2.txt -PL_STDERR_IGNORE_LIST.append( re.compile('^About to analyse intensity file') ) -# Will send output to standard output -PL_STDERR_IGNORE_LIST.append( re.compile('^Will send output to standard output') ) -# Found 31877 clusters -PL_STDERR_IGNORE_LIST.append( re.compile('^Found [0-9]+ clusters') ) -# Will use quality criterion ((CHASTITY>=0.6) -PL_STDERR_IGNORE_LIST.append( re.compile('^Will use quality criterion') ) -# Quality criterion translated to (($F[5]>=0.6)) -PL_STDERR_IGNORE_LIST.append( re.compile('^Quality criterion translated to') ) -# opened /woldlab/trog/data1/king/070924_USI-EAS44_0022_FC12150/Data/C1-36_Firecrest1.9.1_14-11-2007_king.4/Bustard1.9.1_14-11-2007_king/s_4_0101_qhg.txt -# AND -# opened s_4_0103_qhg.txt -PL_STDERR_IGNORE_LIST.append( re.compile('^opened[\S\s]+qhg.txt') ) -# 81129 sequences out of 157651 passed filter criteria -PL_STDERR_IGNORE_LIST.append( re.compile('^[0-9]+ sequences out of [0-9]+ passed filter criteria') ) - - -def pl_stderr_ignore(line): - """ - Searches lines for lines to ignore (i.e. not to log) - - returns True if line should be ignored - returns False if line should NOT be ignored - """ - for s in PL_STDERR_IGNORE_LIST: - if s.search(line): - return True - return False - - -def config_stdout_handler(line, conf_info): - """ - Processes each line of output from GOAT - and stores useful information using the logging module - - Loads useful information into conf_info as well, for future - use outside the function. - - returns True if found condition that signifies success. - """ - - # Skip irrelevant line (without logging) - if s_skip.search(line): - pass - - # Detect invalid command-line arguments - elif s_invalid_cmdline.search(line): - logging.error("Invalid commandline options!") - - # Detect starting of configuration - elif s_start.search(line): - logging.info('START: Configuring pipeline') - - # Detect it made it past invalid arguments - elif s_gerald.search(line): - logging.info('Running make now') - - # Detect that make files have been generated (based on output) - elif s_generating.search(line): - logging.info('Make files generted') - return True - - # Capture run directory - elif s_seq_folder.search(line): - mo = s_seq_folder_sub.search(line) - #Output changed when using --tiles= - # at least in pipeline v0.3.0b2 - if mo: - firecrest_bustard_gerald_makefile = line[mo.end():] - firecrest_bustard_gerald, junk = \ - os.path.split(firecrest_bustard_gerald_makefile) - firecrest_bustard, junk = os.path.split(firecrest_bustard_gerald) - firecrest, junk = os.path.split(firecrest_bustard) - - conf_info.bustard_path = firecrest_bustard - conf_info.run_path = firecrest - - #Standard output handling - else: - print 'Sequence line:', line - mo = s_seq_folder.search(line) - conf_info.bustard_path = line[mo.end():] - conf_info.run_path, temp = os.path.split(conf_info.bustard_path) - - # Log all other output for debugging purposes - else: - logging.warning('CONF:?: %s' % (line)) - - return False - - - -def config_stderr_handler(line, conf_info): - """ - Processes each line of output from GOAT - and stores useful information using the logging module - - Loads useful information into conf_info as well, for future - use outside the function. - - returns RUN_ABORT upon detecting failure; - True on success message; - False if neutral message - (i.e. doesn't signify failure or success) - """ - global SUPPRESS_MISSING_CYCLES - - # Detect invalid species directory error - if s_species_dir_err.search(line): - logging.error(line) - return RUN_ABORT - # Detect goat_pipeline.py traceback - elif s_goat_traceb.search(line): - logging.error("Goat config script died, traceback in debug output") - return RUN_ABORT - # Detect indication of successful configuration (from stderr; odd, but ok) - elif s_stderr_taskcomplete.search(line): - logging.info('Configure step successful (from: stderr)') - return True - # Detect missing cycles - elif s_missing_cycles.search(line): - - # Only display error once - if not SUPPRESS_MISSING_CYCLES: - logging.error("Missing cycles detected; Not all cycles copied?") - logging.debug("CONF:STDERR:MISSING_CYCLES: %s" % (line)) - SUPPRESS_MISSING_CYCLES = True - return RUN_ABORT - - # Log all other output as debug output - else: - logging.debug('CONF:STDERR:?: %s' % (line)) - - # Neutral (not failure; nor success) - return False - - -#def pipeline_stdout_handler(line, conf_info): -# """ -# Processes each line of output from running the pipeline -# and stores useful information using the logging module -# -# Loads useful information into conf_info as well, for future -# use outside the function. -# -# returns True if found condition that signifies success. -# """ -# -# #f.write(line + '\n') -# -# return True - - - -def pipeline_stderr_handler(line, conf_info): - """ - Processes each line of stderr from pipelien run - and stores useful information using the logging module - - ##FIXME: Future feature (doesn't actually do this yet) - #Loads useful information into conf_info as well, for future - #use outside the function. - - returns RUN_FAILED upon detecting failure; - #True on success message; (no clear success state) - False if neutral message - (i.e. doesn't signify failure or success) - """ - - if pl_stderr_ignore(line): - pass - elif s_make_error.search(line): - logging.error("make error detected; run failed") - return RUN_FAILED - elif s_no_gnuplot.search(line): - logging.error("gnuplot not found") - return RUN_FAILED - elif s_no_convert.search(line): - logging.error("imagemagick's convert command not found") - return RUN_FAILED - elif s_no_ghostscript.search(line): - logging.error("ghostscript not found") - return RUN_FAILED - else: - logging.debug('PIPE:STDERR:?: %s' % (line)) - - return False - - -def retrieve_config(conf_info, flowcell, cfg_filepath, genome_dir): - """ - Gets the config file from server... - requires config file in: - /etc/ga_frontend/ga_frontend.conf - or - ~/.ga_frontend.conf - - with: - [config_file_server] - base_host_url: http://host:port - - return True if successful, False is failure - """ - options = getCombinedOptions() - - if options.url is None: - logging.error("~/.ga_frontend.conf or /etc/ga_frontend/ga_frontend.conf" \ - " missing base_host_url option") - return False - - try: - saveConfigFile(flowcell, options.url, cfg_filepath) - conf_info.config_filepath = cfg_filepath - except FlowCellNotFound, e: - logging.error(e) - return False - except WebError404, e: - logging.error(e) - return False - except IOError, e: - logging.error(e) - return False - except Exception, e: - logging.error(e) - return False - - f = open(cfg_filepath, 'r') - data = f.read() - f.close() - - genome_dict = getAvailableGenomes(genome_dir) - mapper_dict = constructMapperDict(genome_dict) - - logging.debug(data) - - f = open(cfg_filepath, 'w') - f.write(data % (mapper_dict)) - f.close() - - return True - - - -def configure(conf_info): - """ - Attempts to configure the GA pipeline using goat. - - Uses logging module to store information about status. - - returns True if configuration successful, otherwise False. - """ - #ERROR Test: - #pipe = subprocess.Popen(['goat_pipeline.py', - # '--GERALD=config32bk.txt', - # '--make .',], - # #'.'], - # stdout=subprocess.PIPE, - # stderr=subprocess.PIPE) - - #ERROR Test (2), causes goat_pipeline.py traceback - #pipe = subprocess.Popen(['goat_pipeline.py', - # '--GERALD=%s' % (conf_info.config_filepath), - # '--tiles=s_4_100,s_4_101,s_4_102,s_4_103,s_4_104', - # '--make', - # '.'], - # stdout=subprocess.PIPE, - # stderr=subprocess.PIPE) - - ########################## - # Run configuration step - # Not a test; actual configure attempt. - #pipe = subprocess.Popen(['goat_pipeline.py', - # '--GERALD=%s' % (conf_info.config_filepath), - # '--make', - # '.'], - # stdout=subprocess.PIPE, - # stderr=subprocess.PIPE) - - - stdout_filepath = os.path.join(conf_info.analysis_dir, - "pipeline_configure_stdout.txt") - stderr_filepath = os.path.join(conf_info.analysis_dir, - "pipeline_configure_stderr.txt") - - fout = open(stdout_filepath, 'w') - ferr = open(stderr_filepath, 'w') - - pipe = subprocess.Popen(['goat_pipeline.py', - '--GERALD=%s' % (conf_info.config_filepath), - #'--tiles=s_4_0100,s_4_0101,s_4_0102,s_4_0103,s_4_0104', - '--make', - conf_info.analysis_dir], - stdout=fout, - stderr=ferr) - - print "Configuring pipeline: %s" % (time.ctime()) - error_code = pipe.wait() - - # Clean up - fout.close() - ferr.close() - - - ################## - # Process stdout - fout = open(stdout_filepath, 'r') - - stdout_line = fout.readline() - - complete = False - while stdout_line != '': - # Handle stdout - if config_stdout_handler(stdout_line, conf_info): - complete = True - stdout_line = fout.readline() - - fout.close() - - - #error_code = pipe.wait() - if error_code: - logging.error('Recieved error_code: %s' % (error_code)) - else: - logging.info('We are go for launch!') - - #Process stderr - ferr = open(stderr_filepath, 'r') - stderr_line = ferr.readline() - - abort = 'NO!' - stderr_success = False - while stderr_line != '': - stderr_status = config_stderr_handler(stderr_line, conf_info) - if stderr_status == RUN_ABORT: - abort = RUN_ABORT - elif stderr_status is True: - stderr_success = True - stderr_line = ferr.readline() - - ferr.close() - - - #Success requirements: - # 1) The stdout completed without error - # 2) The program exited with status 0 - # 3) No errors found in stdout - print '#Expect: True, False, True, True' - print complete, bool(error_code), abort != RUN_ABORT, stderr_success is True - status = complete is True and \ - bool(error_code) is False and \ - abort != RUN_ABORT and \ - stderr_success is True - - # If everything was successful, but for some reason - # we didn't retrieve the path info, log it. - if status is True: - if conf_info.bustard_path is None or conf_info.run_path is None: - logging.error("Failed to retrieve run_path") - return False - - return status - - -def run_pipeline(conf_info): - """ - Run the pipeline and monitor status. - """ - # Fail if the run_path doesn't actually exist - if not os.path.exists(conf_info.run_path): - logging.error('Run path does not exist: %s' \ - % (conf_info.run_path)) - return False - - # Change cwd to run_path - stdout_filepath = os.path.join(conf_info.analysis_dir, 'pipeline_run_stdout.txt') - stderr_filepath = os.path.join(conf_info.analysis_dir, 'pipeline_run_stderr.txt') - - # Create status object - conf_info.createStatusObject() - - # Monitor file creation - wm = WatchManager() - mask = EventsCodes.IN_DELETE | EventsCodes.IN_CREATE - event = RunEvent(conf_info) - notifier = ThreadedNotifier(wm, event) - notifier.start() - wdd = wm.add_watch(conf_info.run_path, mask, rec=True) - - # Log pipeline starting - logging.info('STARTING PIPELINE @ %s' % (time.ctime())) - - # Start the pipeline (and hide!) - #pipe = subprocess.Popen(['make', - # '-j8', - # 'recursive'], - # stdout=subprocess.PIPE, - # stderr=subprocess.PIPE) - - fout = open(stdout_filepath, 'w') - ferr = open(stderr_filepath, 'w') - - pipe = subprocess.Popen(['make', - '--directory=%s' % (conf_info.run_path), - '-j8', - 'recursive'], - stdout=fout, - stderr=ferr) - #shell=True) - # Wait for run to finish - retcode = pipe.wait() - - - # Clean up - notifier.stop() - fout.close() - ferr.close() - - # Process stderr - ferr = open(stderr_filepath, 'r') - - run_failed_stderr = False - for line in ferr: - err_status = pipeline_stderr_handler(line, conf_info) - if err_status == RUN_FAILED: - run_failed_stderr = True - - ferr.close() - - # Finished file check! - print 'RUN SUCCESS CHECK:' - for key, value in event.run_status_dict.items(): - print ' %s: %s' % (key, value) - - dstatus = event.run_status_dict - - # Success or failure check - status = (retcode == 0) and \ - run_failed_stderr is False and \ - dstatus['firecrest'] is True and \ - dstatus['bustard'] is True and \ - dstatus['gerald'] is True - - return status - - diff --git a/htsworkflow/pipeline/firecrest.py b/htsworkflow/pipeline/firecrest.py deleted file mode 100644 index 89ea598..0000000 --- a/htsworkflow/pipeline/firecrest.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -Extract information about the Firecrest run - -Firecrest - class holding the properties we found -firecrest - Firecrest factory function initalized from a directory name -fromxml - Firecrest factory function initalized from an xml dump from - the Firecrest object. -""" - -from datetime import date -import os -import re -import time - -from htsworkflow.pipeline.runfolder import \ - ElementTree, \ - VERSION_RE, \ - EUROPEAN_STRPTIME - -class Firecrest(object): - XML_VERSION=1 - - # xml tag names - FIRECREST = 'Firecrest' - SOFTWARE_VERSION = 'version' - START = 'FirstCycle' - STOP = 'LastCycle' - DATE = 'run_time' - USER = 'user' - MATRIX = 'matrix' - - def __init__(self, xml=None): - self.start = None - self.stop = None - self.version = None - self.date = date.today() - self.user = None - self.matrix = None - - if xml is not None: - self.set_elements(xml) - - def _get_time(self): - return time.mktime(self.date.timetuple()) - time = property(_get_time, doc='return run time as seconds since epoch') - - def dump(self): - print "Starting cycle:", self.start - print "Ending cycle:", self.stop - print "Firecrest version:", self.version - print "Run date:", self.date - print "user:", self.user - - def get_elements(self): - attribs = {'version': str(Firecrest.XML_VERSION) } - root = ElementTree.Element(Firecrest.FIRECREST, attrib=attribs) - version = ElementTree.SubElement(root, Firecrest.SOFTWARE_VERSION) - version.text = self.version - start_cycle = ElementTree.SubElement(root, Firecrest.START) - start_cycle.text = str(self.start) - stop_cycle = ElementTree.SubElement(root, Firecrest.STOP) - stop_cycle.text = str(self.stop) - run_date = ElementTree.SubElement(root, Firecrest.DATE) - run_date.text = str(self.time) - user = ElementTree.SubElement(root, Firecrest.USER) - user.text = self.user - matrix = ElementTree.SubElement(root, Firecrest.MATRIX) - matrix.text = self.matrix - return root - - def set_elements(self, tree): - if tree.tag != Firecrest.FIRECREST: - raise ValueError('Expected "Firecrest" SubElements') - xml_version = int(tree.attrib.get('version', 0)) - if xml_version > Firecrest.XML_VERSION: - logging.warn('Firecrest XML tree is a higher version than this class') - for element in list(tree): - if element.tag == Firecrest.SOFTWARE_VERSION: - self.version = element.text - elif element.tag == Firecrest.START: - self.start = int(element.text) - elif element.tag == Firecrest.STOP: - self.stop = int(element.text) - elif element.tag == Firecrest.DATE: - self.date = date.fromtimestamp(float(element.text)) - elif element.tag == Firecrest.USER: - self.user = element.text - elif element.tag == Firecrest.MATRIX: - self.matrix = element.text - else: - raise ValueError("Unrecognized tag: %s" % (element.tag,)) - -def firecrest(pathname): - """ - Examine the directory at pathname and initalize a Firecrest object - """ - f = Firecrest() - - # parse firecrest directory name - path, name = os.path.split(pathname) - groups = name.split('_') - # grab the start/stop cycle information - cycle = re.match("C([0-9]+)-([0-9]+)", groups[0]) - f.start = int(cycle.group(1)) - f.stop = int(cycle.group(2)) - # firecrest version - version = re.search(VERSION_RE, groups[1]) - f.version = (version.group(1)) - # datetime - t = time.strptime(groups[2], EUROPEAN_STRPTIME) - f.date = date(*t[0:3]) - # username - f.user = groups[3] - - # should I parse this deeper than just stashing the - # contents of the matrix file? - matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt') - f.matrix = open(matrix_pathname, 'r').read() - return f - -def fromxml(tree): - """ - Initialize a Firecrest object from an element tree node - """ - f = Firecrest() - f.set_elements(tree) - return f diff --git a/htsworkflow/pipeline/genome_mapper.py b/htsworkflow/pipeline/genome_mapper.py deleted file mode 100644 index a8ea871..0000000 --- a/htsworkflow/pipeline/genome_mapper.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/python -import glob -import sys -import os -import re - -import logging - -from htsworkflow.util.alphanum import alphanum - -class DuplicateGenome(Exception): pass - - -def _has_metainfo(genome_dir): - metapath = os.path.join(genome_dir, '_metainfo_') - if os.path.isfile(metapath): - return True - else: - return False - -def getAvailableGenomes(genome_base_dir): - """ - raises IOError (on genome_base_dir not found) - raises DuplicateGenome on duplicate genomes found. - - returns a double dictionary (i.e. d[species][build] = path) - """ - - # Need valid directory - if not os.path.exists(genome_base_dir): - msg = "Directory does not exist: %s" % (genome_base_dir) - raise IOError, msg - - # Find all subdirectories - filepath_list = glob.glob(os.path.join(genome_base_dir, '*')) - potential_genome_dirs = \ - [ filepath for filepath in filepath_list if os.path.isdir(filepath)] - - # Get list of metadata files - genome_dir_list = \ - [ dirpath \ - for dirpath in potential_genome_dirs \ - if _has_metainfo(dirpath) ] - - # Genome double dictionary - d = {} - - for genome_dir in genome_dir_list: - line = open(os.path.join(genome_dir, '_metainfo_'), 'r').readline().strip() - - # Get species, build... log and skip on failure - try: - species, build = line.split('|') - except: - logging.warning('Skipping: Invalid metafile (%s) line: %s' \ - % (metafile, line)) - continue - - build_dict = d.setdefault(species, {}) - if build in build_dict: - msg = "Duplicate genome for %s|%s" % (species, build) - raise DuplicateGenome, msg - - build_dict[build] = genome_dir - - return d - - -class constructMapperDict(object): - """ - Emulate a dictionary to map genome|build names to paths. - - It uses the dictionary generated by getAvailableGenomes. - """ - def __init__(self, genome_dict): - self.genome_dict = genome_dict - - def __getitem__(self, key): - """ - Return the best match for key - """ - elements = re.split("\|", key) - - if len(elements) == 1: - # we just the species name - # get the set of builds - builds = self.genome_dict[elements[0]] - - # sort build names the way humans would - keys = builds.keys() - keys.sort(cmp=alphanum) - - # return the path from the 'last' build name - return builds[keys[-1]] - - elif len(elements) == 2: - # we have species, and build name - return self.genome_dict[elements[0]][elements[1]] - else: - raise KeyError("Unrecognized key") - - def keys(self): - keys = [] - for species in self.genome_dict.keys(): - for build in self.genome_dict[species]: - keys.append([species+'|'+build]) - return keys - - def values(self): - values = [] - for species in self.genome_dict.keys(): - for build in self.genome_dict[species]: - values.append(self.genome_dict[species][build]) - return values - - def items(self): - items = [] - for species in self.genome_dict.keys(): - for build in self.genome_dict[species]: - key = [species+'|'+build] - value = self.genome_dict[species][build] - items.append((key, value)) - return items - -if __name__ == '__main__': - - if len(sys.argv) != 2: - print 'useage: %s ' % (sys.argv[0]) - sys.exit(1) - - d = getAvailableGenomes(sys.argv[1]) - d2 = constructMapperDict(d) - - for k,v in d2.items(): - print '%s: %s' % (k,v) - - diff --git a/htsworkflow/pipeline/gerald.py b/htsworkflow/pipeline/gerald.py deleted file mode 100644 index 07ca009..0000000 --- a/htsworkflow/pipeline/gerald.py +++ /dev/null @@ -1,719 +0,0 @@ -""" -Provide access to information stored in the GERALD directory. -""" -from datetime import datetime, date -from glob import glob -import logging -import os -import stat -import time -import types - -from htsworkflow.pipeline.runfolder import \ - ElementTree, \ - EUROPEAN_STRPTIME, \ - LANES_PER_FLOWCELL, \ - VERSION_RE -from htsworkflow.util.ethelp import indent, flatten -from htsworkflow.util.opener import autoopen - -class Gerald(object): - """ - Capture meaning out of the GERALD directory - """ - XML_VERSION = 1 - GERALD='Gerald' - RUN_PARAMETERS='RunParameters' - SUMMARY='Summary' - - class LaneParameters(object): - """ - Make it easy to access elements of LaneSpecificRunParameters from python - """ - def __init__(self, gerald, key): - self._gerald = gerald - self._key = key - - def __get_attribute(self, xml_tag): - subtree = self._gerald.tree.find('LaneSpecificRunParameters') - container = subtree.find(xml_tag) - if container is None: - return None - if len(container.getchildren()) > LANES_PER_FLOWCELL: - raise RuntimeError('GERALD config.xml file changed') - lanes = [x.tag.split('_')[1] for x in container.getchildren()] - index = lanes.index(self._key) - element = container[index] - return element.text - def _get_analysis(self): - return self.__get_attribute('ANALYSIS') - analysis = property(_get_analysis) - - def _get_eland_genome(self): - genome = self.__get_attribute('ELAND_GENOME') - # default to the chipwide parameters if there isn't an - # entry in the lane specific paramaters - if genome is None: - subtree = self._gerald.tree.find('ChipWideRunParameters') - container = subtree.find('ELAND_GENOME') - genome = container.text - return genome - eland_genome = property(_get_eland_genome) - - def _get_read_length(self): - return self.__get_attribute('READ_LENGTH') - read_length = property(_get_read_length) - - def _get_use_bases(self): - return self.__get_attribute('USE_BASES') - use_bases = property(_get_use_bases) - - class LaneSpecificRunParameters(object): - """ - Provide access to LaneSpecificRunParameters - """ - def __init__(self, gerald): - self._gerald = gerald - self._keys = None - def __getitem__(self, key): - return Gerald.LaneParameters(self._gerald, key) - def keys(self): - if self._keys is None: - tree = self._gerald.tree - analysis = tree.find('LaneSpecificRunParameters/ANALYSIS') - # according to the pipeline specs I think their fields - # are sampleName_laneID, with sampleName defaulting to s - # since laneIDs are constant lets just try using - # those consistently. - self._keys = [ x.tag.split('_')[1] for x in analysis] - return self._keys - def values(self): - return [ self[x] for x in self.keys() ] - def items(self): - return zip(self.keys(), self.values()) - def __len__(self): - return len(self.keys()) - - def __init__(self, xml=None): - self.pathname = None - self.tree = None - - # parse lane parameters out of the config.xml file - self.lanes = Gerald.LaneSpecificRunParameters(self) - - self.summary = None - self.eland_results = None - - if xml is not None: - self.set_elements(xml) - - def _get_date(self): - if self.tree is None: - return datetime.today() - timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP') - epochstamp = time.mktime(time.strptime(timestamp, '%c')) - return datetime.fromtimestamp(epochstamp) - date = property(_get_date) - - def _get_time(self): - return time.mktime(self.date.timetuple()) - time = property(_get_time, doc='return run time as seconds since epoch') - - def _get_version(self): - if self.tree is None: - return None - return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION') - version = property(_get_version) - - def dump(self): - """ - Debugging function, report current object - """ - print 'Gerald version:', self.version - print 'Gerald run date:', self.date - print 'Gerald config.xml:', self.tree - self.summary.dump() - - def get_elements(self): - if self.tree is None or self.summary is None: - return None - - gerald = ElementTree.Element(Gerald.GERALD, - {'version': unicode(Gerald.XML_VERSION)}) - gerald.append(self.tree) - gerald.append(self.summary.get_elements()) - if self.eland_results: - gerald.append(self.eland_results.get_elements()) - return gerald - - def set_elements(self, tree): - if tree.tag != Gerald.GERALD: - raise ValueError('exptected GERALD') - xml_version = int(tree.attrib.get('version', 0)) - if xml_version > Gerald.XML_VERSION: - logging.warn('XML tree is a higher version than this class') - for element in list(tree): - tag = element.tag.lower() - if tag == Gerald.RUN_PARAMETERS.lower(): - self.tree = element - elif tag == Gerald.SUMMARY.lower(): - self.summary = Summary(xml=element) - elif tag == ELAND.ELAND.lower(): - self.eland_results = ELAND(xml=element) - else: - logging.warn("Unrecognized tag %s" % (element.tag,)) - - -def gerald(pathname): - g = Gerald() - g.pathname = pathname - path, name = os.path.split(pathname) - config_pathname = os.path.join(pathname, 'config.xml') - g.tree = ElementTree.parse(config_pathname).getroot() - - # parse Summary.htm file - summary_pathname = os.path.join(pathname, 'Summary.htm') - g.summary = Summary(summary_pathname) - # parse eland files - g.eland_results = eland(g.pathname, g) - return g - -def tonumber(v): - """ - Convert a value to int if its an int otherwise a float. - """ - try: - v = int(v) - except ValueError, e: - v = float(v) - return v - -def parse_mean_range(value): - """ - Parse values like 123 +/- 4.5 - """ - if value.strip() == 'unknown': - return 0, 0 - - average, pm, deviation = value.split() - if pm != '+/-': - raise RuntimeError("Summary.htm file format changed") - return tonumber(average), tonumber(deviation) - -def make_mean_range_element(parent, name, mean, deviation): - """ - Make an ElementTree subelement - """ - element = ElementTree.SubElement(parent, name, - { 'mean': unicode(mean), - 'deviation': unicode(deviation)}) - return element - -def parse_mean_range_element(element): - """ - Grab mean/deviation out of element - """ - return (tonumber(element.attrib['mean']), - tonumber(element.attrib['deviation'])) - -def parse_summary_element(element): - """ - Determine if we have a simple element or a mean/deviation element - """ - if len(element.attrib) > 0: - return parse_mean_range_element(element) - else: - return element.text - -class Summary(object): - """ - Extract some useful information from the Summary.htm file - """ - XML_VERSION = 2 - SUMMARY = 'Summary' - - class LaneResultSummary(object): - """ - Parse the LaneResultSummary table out of Summary.htm - Mostly for the cluster number - """ - LANE_RESULT_SUMMARY = 'LaneResultSummary' - TAGS = { - 'LaneYield': 'lane_yield', - 'Cluster': 'cluster', # Raw - 'ClusterPF': 'cluster_pass_filter', - 'AverageFirstCycleIntensity': 'average_first_cycle_intensity', - 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles', - 'PercentPassFilterClusters': 'percent_pass_filter_clusters', - 'PercentPassFilterAlign': 'percent_pass_filter_align', - 'AverageAlignmentScore': 'average_alignment_score', - 'PercentErrorRate': 'percent_error_rate' - } - - def __init__(self, html=None, xml=None): - self.lane = None - self.lane_yield = None - self.cluster = None - self.cluster_pass_filter = None - self.average_first_cycle_intensity = None - self.percent_intensity_after_20_cycles = None - self.percent_pass_filter_clusters = None - self.percent_pass_filter_align = None - self.average_alignment_score = None - self.percent_error_rate = None - - if html is not None: - self.set_elements_from_html(html) - if xml is not None: - self.set_elements(xml) - - def set_elements_from_html(self, data): - if not len(data) in (8,10): - raise RuntimeError("Summary.htm file format changed") - - # same in pre-0.3.0 Summary file and 0.3 summary file - self.lane = data[0] - - if len(data) == 8: - parsed_data = [ parse_mean_range(x) for x in data[1:] ] - # this is the < 0.3 Pipeline version - self.cluster = parsed_data[0] - self.average_first_cycle_intensity = parsed_data[1] - self.percent_intensity_after_20_cycles = parsed_data[2] - self.percent_pass_filter_clusters = parsed_data[3] - self.percent_pass_filter_align = parsed_data[4] - self.average_alignment_score = parsed_data[5] - self.percent_error_rate = parsed_data[6] - elif len(data) == 10: - parsed_data = [ parse_mean_range(x) for x in data[2:] ] - # this is the >= 0.3 summary file - self.lane_yield = data[1] - self.cluster = parsed_data[0] - self.cluster_pass_filter = parsed_data[1] - self.average_first_cycle_intensity = parsed_data[2] - self.percent_intensity_after_20_cycles = parsed_data[3] - self.percent_pass_filter_clusters = parsed_data[4] - self.percent_pass_filter_align = parsed_data[5] - self.average_alignment_score = parsed_data[6] - self.percent_error_rate = parsed_data[7] - - def get_elements(self): - lane_result = ElementTree.Element( - Summary.LaneResultSummary.LANE_RESULT_SUMMARY, - {'lane': self.lane}) - for tag, variable_name in Summary.LaneResultSummary.TAGS.items(): - value = getattr(self, variable_name) - if value is None: - continue - # it looks like a sequence - elif type(value) in (types.TupleType, types.ListType): - element = make_mean_range_element( - lane_result, - tag, - *value - ) - else: - element = ElementTree.SubElement(lane_result, tag) - element.text = value - return lane_result - - def set_elements(self, tree): - if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY: - raise ValueError('Expected %s' % ( - Summary.LaneResultSummary.LANE_RESULT_SUMMARY)) - self.lane = tree.attrib['lane'] - tags = Summary.LaneResultSummary.TAGS - for element in list(tree): - try: - variable_name = tags[element.tag] - setattr(self, variable_name, - parse_summary_element(element)) - except KeyError, e: - logging.warn('Unrecognized tag %s' % (element.tag,)) - - def __init__(self, filename=None, xml=None): - self.lane_results = {} - - if filename is not None: - self._extract_lane_results(filename) - if xml is not None: - self.set_elements(xml) - - def __getitem__(self, key): - return self.lane_results[key] - - def __len__(self): - return len(self.lane_results) - - def keys(self): - return self.lane_results.keys() - - def values(self): - return self.lane_results.values() - - def items(self): - return self.lane_results.items() - - def _flattened_row(self, row): - """ - flatten the children of a ... - """ - return [flatten(x) for x in row.getchildren() ] - - def _parse_table(self, table): - """ - assumes the first line is the header of a table, - and that the remaining rows are data - """ - rows = table.getchildren() - data = [] - for r in rows: - data.append(self._flattened_row(r)) - return data - - def _extract_named_tables(self, pathname): - """ - extract all the 'named' tables from a Summary.htm file - and return as a dictionary - - Named tables are

...

...
pairs - The contents of the h2 tag is considered to the name - of the table. - """ - tree = ElementTree.parse(pathname).getroot() - body = tree.find('body') - tables = {} - for i in range(len(body)): - if body[i].tag == 'h2' and body[i+1].tag == 'table': - # we have an interesting table - name = flatten(body[i]) - table = body[i+1] - data = self._parse_table(table) - tables[name] = data - return tables - - def _extract_lane_results(self, pathname): - """ - extract the Lane Results Summary table - """ - - tables = self._extract_named_tables(pathname) - - # parse lane result summary - lane_summary = tables['Lane Results Summary'] - # this is version 1 of the summary file - if len(lane_summary[-1]) == 8: - # strip header - headers = lane_summary[0] - # grab the lane by lane data - lane_summary = lane_summary[1:] - - # this is version 2 of the summary file - if len(lane_summary[-1]) == 10: - # lane_summary[0] is a different less specific header row - headers = lane_summary[1] - lane_summary = lane_summary[2:10] - # after the last lane, there's a set of chip wide averages - - for r in lane_summary: - lrs = Summary.LaneResultSummary(html=r) - self.lane_results[lrs.lane] = lrs - - def get_elements(self): - summary = ElementTree.Element(Summary.SUMMARY, - {'version': unicode(Summary.XML_VERSION)}) - for lane in self.lane_results.values(): - summary.append(lane.get_elements()) - return summary - - def set_elements(self, tree): - if tree.tag != Summary.SUMMARY: - return ValueError("Expected %s" % (Summary.SUMMARY,)) - xml_version = int(tree.attrib.get('version', 0)) - if xml_version > Summary.XML_VERSION: - logging.warn('Summary XML tree is a higher version than this class') - for element in list(tree): - lrs = Summary.LaneResultSummary() - lrs.set_elements(element) - self.lane_results[lrs.lane] = lrs - - def dump(self): - """ - Debugging function, report current object - """ - pass - - -def build_genome_fasta_map(genome_dir): - # build fasta to fasta file map - genome = genome_dir.split(os.path.sep)[-1] - fasta_map = {} - for vld_file in glob(os.path.join(genome_dir, '*.vld')): - is_link = False - if os.path.islink(vld_file): - is_link = True - vld_file = os.path.realpath(vld_file) - path, vld_name = os.path.split(vld_file) - name, ext = os.path.splitext(vld_name) - if is_link: - fasta_map[name] = name - else: - fasta_map[name] = os.path.join(genome, name) - return fasta_map - -class ElandLane(object): - """ - Process an eland result file - """ - XML_VERSION = 1 - LANE = 'ElandLane' - SAMPLE_NAME = 'SampleName' - LANE_ID = 'LaneID' - GENOME_MAP = 'GenomeMap' - GENOME_ITEM = 'GenomeItem' - MAPPED_READS = 'MappedReads' - MAPPED_ITEM = 'MappedItem' - MATCH_CODES = 'MatchCodes' - MATCH_ITEM = 'Code' - READS = 'Reads' - - def __init__(self, pathname=None, genome_map=None, xml=None): - self.pathname = pathname - self._sample_name = None - self._lane_id = None - self._reads = None - self._mapped_reads = None - self._match_codes = None - if genome_map is None: - genome_map = {} - self.genome_map = genome_map - - if xml is not None: - self.set_elements(xml) - - def _update(self): - """ - Actually read the file and actually count the reads - """ - # can't do anything if we don't have a file to process - if self.pathname is None: - return - - if os.stat(self.pathname)[stat.ST_SIZE] == 0: - raise RuntimeError("Eland isn't done, try again later.") - - reads = 0 - mapped_reads = {} - - match_codes = {'NM':0, 'QC':0, 'RM':0, - 'U0':0, 'U1':0, 'U2':0, - 'R0':0, 'R1':0, 'R2':0, - } - for line in autoopen(self.pathname,'r'): - reads += 1 - fields = line.split() - # code = fields[2] - # match_codes[code] = match_codes.setdefault(code, 0) + 1 - # the QC/NM etc codes are in the 3rd field and always present - match_codes[fields[2]] += 1 - # ignore lines that don't have a fasta filename - if len(fields) < 7: - continue - fasta = self.genome_map.get(fields[6], fields[6]) - mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1 - self._match_codes = match_codes - self._mapped_reads = mapped_reads - self._reads = reads - - def _update_name(self): - # extract the sample name - if self.pathname is None: - return - - path, name = os.path.split(self.pathname) - split_name = name.split('_') - self._sample_name = split_name[0] - self._lane_id = split_name[1] - - def _get_sample_name(self): - if self._sample_name is None: - self._update_name() - return self._sample_name - sample_name = property(_get_sample_name) - - def _get_lane_id(self): - if self._lane_id is None: - self._update_name() - return self._lane_id - lane_id = property(_get_lane_id) - - def _get_reads(self): - if self._reads is None: - self._update() - return self._reads - reads = property(_get_reads) - - def _get_mapped_reads(self): - if self._mapped_reads is None: - self._update() - return self._mapped_reads - mapped_reads = property(_get_mapped_reads) - - def _get_match_codes(self): - if self._match_codes is None: - self._update() - return self._match_codes - match_codes = property(_get_match_codes) - - def get_elements(self): - lane = ElementTree.Element(ElandLane.LANE, - {'version': - unicode(ElandLane.XML_VERSION)}) - sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME) - sample_tag.text = self.sample_name - lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID) - lane_tag.text = self.lane_id - genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP) - for k, v in self.genome_map.items(): - item = ElementTree.SubElement( - genome_map, ElandLane.GENOME_ITEM, - {'name':k, 'value':unicode(v)}) - mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS) - for k, v in self.mapped_reads.items(): - item = ElementTree.SubElement( - mapped_reads, ElandLane.MAPPED_ITEM, - {'name':k, 'value':unicode(v)}) - match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES) - for k, v in self.match_codes.items(): - item = ElementTree.SubElement( - match_codes, ElandLane.MATCH_ITEM, - {'name':k, 'value':unicode(v)}) - reads = ElementTree.SubElement(lane, ElandLane.READS) - reads.text = unicode(self.reads) - - return lane - - def set_elements(self, tree): - if tree.tag != ElandLane.LANE: - raise ValueError('Exptecting %s' % (ElandLane.LANE,)) - - # reset dictionaries - self._mapped_reads = {} - self._match_codes = {} - - for element in tree: - tag = element.tag.lower() - if tag == ElandLane.SAMPLE_NAME.lower(): - self._sample_name = element.text - elif tag == ElandLane.LANE_ID.lower(): - self._lane_id = element.text - elif tag == ElandLane.GENOME_MAP.lower(): - for child in element: - name = child.attrib['name'] - value = child.attrib['value'] - self.genome_map[name] = value - elif tag == ElandLane.MAPPED_READS.lower(): - for child in element: - name = child.attrib['name'] - value = child.attrib['value'] - self._mapped_reads[name] = int(value) - elif tag == ElandLane.MATCH_CODES.lower(): - for child in element: - name = child.attrib['name'] - value = int(child.attrib['value']) - self._match_codes[name] = value - elif tag == ElandLane.READS.lower(): - self._reads = int(element.text) - else: - logging.warn("ElandLane unrecognized tag %s" % (element.tag,)) - -def extract_eland_sequence(instream, outstream, start, end): - """ - Extract a chunk of sequence out of an eland file - """ - for line in instream: - record = line.split() - if len(record) > 1: - result = [record[0], record[1][start:end]] - else: - result = [record[0][start:end]] - outstream.write("\t".join(result)) - outstream.write(os.linesep) - -class ELAND(object): - """ - Summarize information from eland files - """ - XML_VERSION = 1 - - ELAND = 'ElandCollection' - LANE = 'Lane' - LANE_ID = 'id' - - def __init__(self, xml=None): - # we need information from the gerald config.xml - self.results = {} - - if xml is not None: - self.set_elements(xml) - - def __len__(self): - return len(self.results) - - def keys(self): - return self.results.keys() - - def values(self): - return self.results.values() - - def items(self): - return self.results.items() - - def __getitem__(self, key): - return self.results[key] - - def get_elements(self): - root = ElementTree.Element(ELAND.ELAND, - {'version': unicode(ELAND.XML_VERSION)}) - for lane_id, lane in self.results.items(): - eland_lane = lane.get_elements() - eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id) - root.append(eland_lane) - return root - - def set_elements(self, tree): - if tree.tag.lower() != ELAND.ELAND.lower(): - raise ValueError('Expecting %s', ELAND.ELAND) - for element in list(tree): - lane_id = element.attrib[ELAND.LANE_ID] - lane = ElandLane(xml=element) - self.results[lane_id] = lane - -def eland(basedir, gerald=None, genome_maps=None): - e = ELAND() - - file_list = glob(os.path.join(basedir, "*_eland_result.txt")) - if len(file_list) == 0: - # lets handle compressed eland files too - file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2")) - - for pathname in file_list: - # yes the lane_id is also being computed in ElandLane._update - # I didn't want to clutter up my constructor - # but I needed to persist the sample_name/lane_id for - # runfolder summary_report - path, name = os.path.split(pathname) - split_name = name.split('_') - lane_id = split_name[1] - - if genome_maps is not None: - genome_map = genome_maps[lane_id] - elif gerald is not None: - genome_dir = gerald.lanes[lane_id].eland_genome - genome_map = build_genome_fasta_map(genome_dir) - else: - genome_map = {} - - eland_result = ElandLane(pathname, genome_map) - e.results[lane_id] = eland_result - return e diff --git a/htsworkflow/pipeline/recipe_parser.py b/htsworkflow/pipeline/recipe_parser.py deleted file mode 100644 index 7f5ced6..0000000 --- a/htsworkflow/pipeline/recipe_parser.py +++ /dev/null @@ -1,48 +0,0 @@ -from xml import sax - - -def get_cycles(recipe_xml_filepath): - """ - returns the number of cycles found in Recipe*.xml - """ - handler = CycleXmlHandler() - sax.parse(recipe_xml_filepath, handler) - return handler.cycle_count - - - -class CycleXmlHandler(sax.ContentHandler): - - def __init__(self): - self.cycle_count = 0 - self.in_protocol = False - sax.ContentHandler.__init__(self) - - - def startDocument(self): - self.cycle_count = 0 - self.in_protocol = False - - - def startElement(self, name, attrs): - - #Only count Incorporations as cycles if within - # the protocol section of the xml document. - if name == "Incorporation" and self.in_protocol: - #print 'Found a cycle!' - self.cycle_count += 1 - return - - elif name == 'Protocol': - #print 'In protocol' - self.in_protocol = True - return - - #print 'Skipping: %s' % (name) - - - def endElement(self, name): - - if name == 'Protocol': - #print 'End protocol' - self.in_protocol = False diff --git a/htsworkflow/pipeline/retrieve_config.py b/htsworkflow/pipeline/retrieve_config.py deleted file mode 100644 index 72cff17..0000000 --- a/htsworkflow/pipeline/retrieve_config.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python - -from optparse import OptionParser, IndentedHelpFormatter -from ConfigParser import SafeConfigParser - -import logging -import os -import sys -import urllib2 - -CONFIG_SYSTEM = '/etc/ga_frontend/ga_frontend.conf' -CONFIG_USER = os.path.expanduser('~/.ga_frontend.conf') - -#Disable or enable commandline arg parsing; disabled by default. -DISABLE_CMDLINE = True - -class FlowCellNotFound(Exception): pass -class WebError404(Exception): pass - -class DummyOptions: - """ - Used when command line parsing is disabled; default - """ - def __init__(self): - self.url = None - self.output_filepath = None - self.flowcell = None - self.genome_dir = None - -class PreformattedDescriptionFormatter(IndentedHelpFormatter): - - #def format_description(self, description): - # - # if description: - # return description + "\n" - # else: - # return "" - - def format_epilog(self, epilog): - """ - It was removing my preformated epilog, so this should override - that behavior! Muhahaha! - """ - if epilog: - return "\n" + epilog + "\n" - else: - return "" - - -def constructOptionParser(): - """ - returns a pre-setup optparser - """ - global DISABLE_CMDLINE - - if DISABLE_CMDLINE: - return None - - parser = OptionParser(formatter=PreformattedDescriptionFormatter()) - - parser.set_description('Retrieves eland config file from ga_frontend web frontend.') - - parser.epilog = """ -Config File: - * %s (System wide) - * %s (User specific; overrides system) - * command line overrides all config file options - - Example Config File: - - [config_file_server] - base_host_url=http://somewhere.domain:port -""" % (CONFIG_SYSTEM, CONFIG_USER) - - #Special formatter for allowing preformatted description. - ##parser.format_epilog(PreformattedDescriptionFormatter()) - - parser.add_option("-u", "--url", - action="store", type="string", dest="url") - - parser.add_option("-o", "--output", - action="store", type="string", dest="output_filepath") - - parser.add_option("-f", "--flowcell", - action="store", type="string", dest="flowcell") - - parser.add_option("-g", "--genome_dir", - action="store", type="string", dest="genome_dir") - - #parser.set_default("url", "default") - - return parser - -def constructConfigParser(): - """ - returns a pre-setup config parser - """ - parser = SafeConfigParser() - parser.read([CONFIG_SYSTEM, CONFIG_USER]) - if not parser.has_section('config_file_server'): - parser.add_section('config_file_server') - if not parser.has_section('local_setup'): - parser.add_section('local_setup') - - return parser - - -def getCombinedOptions(): - """ - Returns optparse options after it has be updated with ConfigParser - config files and merged with parsed commandline options. - """ - cl_parser = constructOptionParser() - conf_parser = constructConfigParser() - - if cl_parser is None: - options = DummyOptions() - else: - options, args = cl_parser.parse_args() - - if options.url is None: - if conf_parser.has_option('config_file_server', 'base_host_url'): - options.url = conf_parser.get('config_file_server', 'base_host_url') - - if options.genome_dir is None: - if conf_parser.has_option('local_setup', 'genome_dir'): - options.genome_dir = conf_parser.get('local_setup', 'genome_dir') - - print 'USING OPTIONS:' - print ' URL:', options.url - print ' OUT:', options.output_filepath - print ' FC:', options.flowcell - print 'GDIR:', options.genome_dir - print '' - - return options - - -def saveConfigFile(flowcell, base_host_url, output_filepath): - """ - retrieves the flowcell eland config file, give the base_host_url - (i.e. http://sub.domain.edu:port) - """ - url = base_host_url + '/eland_config/%s/' % (flowcell) - - f = open(output_filepath, 'w') - #try: - try: - web = urllib2.urlopen(url) - except urllib2.URLError, e: - errmsg = 'URLError: %d' % (e.code,) - logging.error(errmsg) - logging.error('opened %s' % (url,)) - logging.error('%s' % ( e.read(),)) - raise IOError(errmsg) - - #except IOError, msg: - # if str(msg).find("Connection refused") >= 0: - # print 'Error: Connection refused for: %s' % (url) - # f.close() - # sys.exit(1) - # elif str(msg).find("Name or service not known") >= 0: - # print 'Error: Invalid domain or ip address for: %s' % (url) - # f.close() - # sys.exit(2) - # else: - # raise IOError, msg - - data = web.read() - - if data.find('Hmm, config file for') >= 0: - msg = "Flowcell (%s) not found in DB; full url(%s)" % (flowcell, url) - raise FlowCellNotFound, msg - - if data.find('404 - Not Found') >= 0: - msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \ - "Did you get right port #?" % (flowcell, base_host_url, url) - raise FlowCellNotFound, msg - - f.write(data) - web.close() - f.close() - logging.info('Wrote config file to %s' % (output_filepath,)) - - diff --git a/htsworkflow/pipeline/run_status.py b/htsworkflow/pipeline/run_status.py deleted file mode 100644 index 39dc54c..0000000 --- a/htsworkflow/pipeline/run_status.py +++ /dev/null @@ -1,478 +0,0 @@ -import glob -import re -import os -import sys -import time -import threading - -s_comment = re.compile('^#') -s_general_read_len = re.compile('^READ_LENGTH ') -s_read_len = re.compile('^[1-8]+:READ_LENGTH ') - -s_firecrest = None - -def _four_digit_num_in_string(num): - if num < 0: - pass - elif num < 10: - return '000' + str(num) - elif num < 100: - return '00' + str(num) - elif num < 1000: - return '0' + str(num) - elif num < 10000: - return str(num) - - msg = 'Invalid number: %s' % (num) - raise ValueError, msg - -def _two_digit_num_in_string(num): - if num < 0: - pass - elif num < 10: - return '0' + str(num) - elif num < 100: - return str(num) - - msg = 'Invalid number: %s' % (num) - raise ValueError, msg - - -# FIRECREST PATTERNS -# _p2f(, lane, tile, cycle) -PATTERN_FIRECREST_QCM = 's_%s_%s_%s_qcm.xml' - -# _p2f(, lane, tile) -PATTERN_FIRECREST_INT = 's_%s_%s_02_int.txt' -PATTERN_FIRECREST_NSE = 's_%s_%s_nse.txt.gz' -PATTERN_FIRECREST_POS = 's_%s_%s_pos.txt' -PATTERN_FIRECREST_IDX = 's_%s_%s_idx.txt' -PATTERN_FIRECREST_CLU1 = 's_%s_%s_01_1_clu.txt' -PATTERN_FIRECREST_CLU2 = 's_%s_%s_01_2_clu.txt' -PATTERN_FIRECREST_CLU3 = 's_%s_%s_01_3_clu.txt' -PATTERN_FIRECREST_CLU4 = 's_%s_%s_01_4_clu.txt' - - -# BUSTARD PATTERNS -# _p2f(, lane, tile) -PATTERN_BUSTARD_SIG2 = 's_%s_%s_sig2.txt' -PATTERN_BUSTARD_PRB = 's_%s_%s_prb.txt' - - - -# GERALD PATTERNS -# _p2f(, lane, tile) -PATTERN_GERALD_ALLTMP = 's_%s_%s_all.txt.tmp' -PATTERN_GERALD_QRAWTMP = 's_%s_%s_qraw.txt.tmp' -PATTERN_GERALD_ALLPNGTMP = 's_%s_%s_all.tmp.png' -PATTERN_GERALD_ALIGNTMP = 's_%s_%s_align.txt.tmp' -PATTERN_GERALD_QVALTMP = 's_%s_%s_qval.txt.tmp' -PATTERN_GERALD_SCORETMP = 's_%s_%s_score.txt.tmp' -PATTERN_GERALD_PREALIGNTMP = 's_%s_%s_prealign.txt.tmp' -PATTERN_GERALD_REALIGNTMP = 's_%s_%s_realign.txt.tmp' -PATTERN_GERALD_RESCORETMP = 's_%s_%s_rescore.txt.tmp' -PATTERN_GERALD_RESCOREPNG = 's_%s_%s_rescore.png' -PATTERN_GERALD_ERRORSTMPPNG = 's_%s_%s_errors.tmp.png' -PATTERN_GERALD_QCALTMP = 's_%s_%s_qcal.txt.tmp' -PATTERN_GERALD_QVAL = 's_%s_%s_qval.txt' - -# _p2f(, lane) -PATTERN_GERALD_SEQPRETMP = 's_%s_seqpre.txt.tmp' -PATTERN_GERALD_RESULTTMP = 's_%s_eland_result.txt.tmp' -PATTERN_GERALD_SIGMEANSTMP = 's_%s_Signal_Means.txt.tmp' -PATTERN_GERALD_CALLPNG = 's_%s_call.png' -PATTERN_GERALD_ALLPNG = 's_%s_all.png' -PATTERN_GERALD_PERCENTALLPNG = 's_%s_percent_all.png' -PATTERN_GERALD_PERCENTCALLPNG = 's_%s_percent_call.png' -PATTERN_GERALD_PERCENTBASEPNG = 's_%s_percent_base.png' -PATTERN_GERALD_FILTTMP = 's_%s_filt.txt.tmp' -PATTERN_GERALD_FRAGTMP = 's_%s_frag.txt.tmp' -PATTERN_GERALD_QREPORTTMP = 's_%s_qreport.txt.tmp' -PATTERN_GERALD_QTABLETMP = 's_%s_qtable.txt.tmp' -PATTERN_GERALD_QCALREPORTTMP = 's_%s_qcalreport.txt.tmp' -PATTERN_GERALD_SEQUENCETMP = 's_%s_sequence.txt.tmp' -PATTERN_GERALD_LANEFINISHED = 's_%s_finished.txt' - - - -def _p2f(pattern, lane, tile=None, cycle=None): - """ - Converts a pattern plus info into file names - """ - - # lane, and cycle provided (INVALID) - if tile is None and cycle is not None: - msg = "Handling of cycle without tile is not currently implemented." - raise ValueError, msg - - # lane, tile, cycle provided - elif cycle: - return pattern % (lane, - _four_digit_num_in_string(tile), - _two_digit_num_in_string(cycle)) - - # lane, tile provided - elif tile: - return pattern % (lane, _four_digit_num_in_string(tile)) - - # lane provided - else: - return pattern % (lane) - - -class GARunStatus(object): - - def __init__(self, conf_filepath): - """ - Given an eland config file in the top level directory - of a run, predicts the files that will be generated - during a run and provides methods for retrieving - (completed, total) for each step or entire run. - """ - #print 'self._conf_filepath = %s' % (conf_filepath) - self._conf_filepath = conf_filepath - self._base_dir, junk = os.path.split(conf_filepath) - self._image_dir = os.path.join(self._base_dir, 'Images') - - self.lanes = [] - self.lane_read_length = {} - self.tiles = None - self.cycles = None - - self.status = {} - self.status['firecrest'] = {} - self.status['bustard'] = {} - self.status['gerald'] = {} - - self._process_config() - self._count_tiles() - self._count_cycles() - self._generate_expected() - - - def _process_config(self): - """ - Grabs info from self._conf_filepath - """ - f = open(self._conf_filepath, 'r') - - for line in f: - - #Skip comment lines for now. - if s_comment.search(line): - continue - - mo = s_general_read_len.search(line) - if mo: - read_length = int(line[mo.end():]) - #Handle general READ_LENGTH - for i in range(1,9): - self.lane_read_length[i] = read_length - - mo = s_read_len.search(line) - if mo: - read_length = int(line[mo.end():]) - lanes, junk = line.split(':') - - #Convert lanes from string of lanes to list of lane #s. - lanes = [ int(i) for i in lanes ] - - - for lane in lanes: - - #Keep track of which lanes are being run. - if lane not in self.lanes: - self.lanes.append(lane) - - #Update with lane specific read lengths - self.lane_read_length[lane] = read_length - - self.lanes.sort() - - - def _count_tiles(self): - """ - Count the number of tiles being used - """ - self.tiles = len(glob.glob(os.path.join(self._image_dir, - 'L001', - 'C1.1', - 's_1_*_a.tif'))) - - def _count_cycles(self): - """ - Figures out the number of cycles that are available - """ - #print 'self._image_dir = %s' % (self._image_dir) - cycle_dirs = glob.glob(os.path.join(self._image_dir, 'L001', 'C*.1')) - #print 'cycle_dirs = %s' % (cycle_dirs) - cycle_list = [] - for cycle_dir in cycle_dirs: - junk, c = os.path.split(cycle_dir) - cycle_list.append(int(c[1:c.find('.')])) - - self.cycles = max(cycle_list) - - - - - def _generate_expected(self): - """ - generates a list of files we expect to find. - """ - - firecrest = self.status['firecrest'] - bustard = self.status['bustard'] - gerald = self.status['gerald'] - - - for lane in self.lanes: - for tile in range(1,self.tiles+1): - for cycle in range(1, self.cycles+1): - - ########################## - # LANE, TILE, CYCLE LAYER - - # FIRECREST - firecrest[_p2f(PATTERN_FIRECREST_QCM, lane, tile, cycle)] = False - - - ################### - # LANE, TILE LAYER - - # FIRECREST - firecrest[_p2f(PATTERN_FIRECREST_INT, lane, tile)] = False - firecrest[_p2f(PATTERN_FIRECREST_NSE, lane, tile)] = False - firecrest[_p2f(PATTERN_FIRECREST_POS, lane, tile)] = False - firecrest[_p2f(PATTERN_FIRECREST_IDX, lane, tile)] = False - firecrest[_p2f(PATTERN_FIRECREST_CLU1, lane, tile)] = False - firecrest[_p2f(PATTERN_FIRECREST_CLU2, lane, tile)] = False - firecrest[_p2f(PATTERN_FIRECREST_CLU3, lane, tile)] = False - firecrest[_p2f(PATTERN_FIRECREST_CLU4, lane, tile)] = False - - - # BUSTARD - bustard[_p2f(PATTERN_BUSTARD_SIG2, lane, tile)] = False - bustard[_p2f(PATTERN_BUSTARD_PRB, lane, tile)] = False - - - # GERALD - #gerald[_p2f(PATTERN_GERALD_ALLTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_QRAWTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_ALLPNGTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_ALIGNTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_QVALTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_SCORETMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_PREALIGNTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_REALIGNTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_RESCORETMP, lane, tile)] = False - gerald[_p2f(PATTERN_GERALD_RESCOREPNG, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_ERRORSTMPPNG, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_QCALTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_QVAL, lane, tile)] = False - - ################### - # LANE LAYER - - # GERALD - #gerald[_p2f(PATTERN_GERALD_SEQPRETMP, lane)] = False - #gerald[_p2f(PATTERN_GERALD_RESULTTMP, lane)] = False - #gerald[_p2f(PATTERN_GERALD_SIGMEANSTMP, lane)] = False - gerald[_p2f(PATTERN_GERALD_CALLPNG, lane)] = False - gerald[_p2f(PATTERN_GERALD_ALLPNG, lane)] = False - gerald[_p2f(PATTERN_GERALD_PERCENTALLPNG, lane)] = False - gerald[_p2f(PATTERN_GERALD_PERCENTCALLPNG, lane)] = False - gerald[_p2f(PATTERN_GERALD_PERCENTBASEPNG, lane)] = False - #gerald[_p2f(PATTERN_GERALD_FILTTMP, lane)] = False - #gerald[_p2f(PATTERN_GERALD_FRAGTMP, lane)] = False - #gerald[_p2f(PATTERN_GERALD_QREPORTTMP, lane)] = False - #gerald[_p2f(PATTERN_GERALD_QTABLETMP, lane)] = False - #gerald[_p2f(PATTERN_GERALD_QCALREPORTTMP, lane)] = False - #gerald[_p2f(PATTERN_GERALD_SEQUENCETMP, lane)] = False - gerald[_p2f(PATTERN_GERALD_LANEFINISHED, lane)] = False - - - - ################# - # LOOPS FINISHED - - # FIRECREST - firecrest['offsets_finished.txt'] = False - firecrest['finished.txt'] = False - - # BUSTARD - bustard['finished.txt'] = False - - # GERALD - gerald['tiles.txt'] = False - gerald['FullAll.htm'] = False - #gerald['All.htm.tmp'] = False - #gerald['Signal_Means.txt.tmp'] = False - #gerald['plotIntensity_for_IVC'] = False - #gerald['IVC.htm.tmp'] = False - gerald['FullError.htm'] = False - gerald['FullPerfect.htm'] = False - #gerald['Error.htm.tmp'] = False - #gerald['Perfect.htm.tmp'] = False - #gerald['Summary.htm.tmp'] = False - #gerald['Tile.htm.tmp'] = False - gerald['finished.txt'] = False - - def statusFirecrest(self): - """ - returns (, ) - """ - firecrest = self.status['firecrest'] - total = len(firecrest) - completed = firecrest.values().count(True) - - return (completed, total) - - - def statusBustard(self): - """ - returns (, ) - """ - bustard = self.status['bustard'] - total = len(bustard) - completed = bustard.values().count(True) - - return (completed, total) - - - def statusGerald(self): - """ - returns (, ) - """ - gerald = self.status['gerald'] - total = len(gerald) - completed = gerald.values().count(True) - - return (completed, total) - - - def statusTotal(self): - """ - returns (, ) - """ - #f = firecrest c = completed - #b = bustard t = total - #g = gerald - fc, ft = self.statusFirecrest() - bc, bt = self.statusBustard() - gc, gt = self.statusGerald() - - return (fc+bc+gc, ft+bt+gt) - - - def statusReport(self): - """ - Generate the basic percent complete report - """ - def _percentCompleted(completed, total): - """ - Returns precent completed as float - """ - return (completed / float(total)) * 100 - - fc, ft = self.statusFirecrest() - bc, bt = self.statusBustard() - gc, gt = self.statusGerald() - tc, tt = self.statusTotal() - - fp = _percentCompleted(fc, ft) - bp = _percentCompleted(bc, bt) - gp = _percentCompleted(gc, gt) - tp = _percentCompleted(tc, tt) - - report = ['Firecrest: %s%% (%s/%s)' % (fp, fc, ft), - ' Bustard: %s%% (%s/%s)' % (bp, bc, bt), - ' Gerald: %s%% (%s/%s)' % (gp, gc, gt), - '-----------------------', - ' Total: %s%% (%s/%s)' % (tp, tc, tt), - ] - return report - - def updateFirecrest(self, filename): - """ - Marks firecrest filename as being completed. - """ - self.status['firecrest'][filename] = True - - - def updateBustard(self, filename): - """ - Marks bustard filename as being completed. - """ - self.status['bustard'][filename] = True - - - def updateGerald(self, filename): - """ - Marks gerald filename as being completed. - """ - self.status['gerald'][filename] = True - - - -################################################## -# Functions to be called by Thread(target=) -def _cmdLineStatusMonitorFunc(conf_info): - """ - Given a ConfigInfo object, provides status to stdout. - - You should probably use startCmdLineStatusMonitor() - instead of ths function. - - Use with: - t = threading.Thread(target=_cmdLineStatusMonitorFunc, - args=[conf_info]) - t.setDaemon(True) - t.start() - """ - SLEEP_AMOUNT = 30 - - while 1: - if conf_info.status is None: - print "No status object yet." - time.sleep(SLEEP_AMOUNT) - continue - - report = conf_info.status.statusReport() - print os.linesep.join(report) - print - - time.sleep(SLEEP_AMOUNT) - - -############################################# -# Start monitor thread convenience functions -def startCmdLineStatusMonitor(conf_info): - """ - Starts a command line status monitor given a conf_info object. - """ - t = threading.Thread(target=_cmdLineStatusMonitorFunc, args=[conf_info]) - t.setDaemon(True) - t.start() - -from optparse import OptionParser -def make_parser(): - usage = "%prog: config file" - - parser = OptionParser() - return parser - -def main(cmdline=None): - parser = make_parser() - opt, args = parser.parse_args(cmdline) - - if len(args) != 1: - parser.error("need name of configuration file") - - status = GARunStatus(args[0]) - print os.linesep.join(status.statusReport()) - return 0 - -if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) - diff --git a/htsworkflow/pipeline/runfolder.py b/htsworkflow/pipeline/runfolder.py deleted file mode 100644 index 492d103..0000000 --- a/htsworkflow/pipeline/runfolder.py +++ /dev/null @@ -1,313 +0,0 @@ -""" -Core information needed to inspect a runfolder. -""" -from glob import glob -import logging -import os -import re -import shutil -import stat -import subprocess -import sys -import time - -try: - from xml.etree import ElementTree -except ImportError, e: - from elementtree import ElementTree - -EUROPEAN_STRPTIME = "%d-%m-%Y" -EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})" -VERSION_RE = "([0-9\.]+)" -USER_RE = "([a-zA-Z0-9]+)" -LANES_PER_FLOWCELL = 8 - -from htsworkflow.util.alphanum import alphanum -from htsworkflow.util.ethelp import indent, flatten - - -class PipelineRun(object): - """ - Capture "interesting" information about a pipeline run - """ - XML_VERSION = 1 - PIPELINE_RUN = 'PipelineRun' - FLOWCELL_ID = 'FlowcellID' - - def __init__(self, pathname=None, firecrest=None, bustard=None, gerald=None, xml=None): - if pathname is not None: - self.pathname = os.path.normpath(pathname) - else: - self.pathname = None - self._name = None - self._flowcell_id = None - self.firecrest = firecrest - self.bustard = bustard - self.gerald = gerald - - if xml is not None: - self.set_elements(xml) - - def _get_flowcell_id(self): - # extract flowcell ID - if self._flowcell_id is None: - config_dir = os.path.join(self.pathname, 'Config') - flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml') - if os.path.exists(flowcell_id_path): - flowcell_id_tree = ElementTree.parse(flowcell_id_path) - self._flowcell_id = flowcell_id_tree.findtext('Text') - else: - path_fields = self.pathname.split('_') - if len(path_fields) > 0: - # guessing last element of filename - flowcell_id = path_fields[-1] - else: - flowcell_id = 'unknown' - - logging.warning( - "Flowcell id was not found, guessing %s" % ( - flowcell_id)) - self._flowcell_id = flowcell_id - return self._flowcell_id - flowcell_id = property(_get_flowcell_id) - - def get_elements(self): - """ - make one master xml file from all of our sub-components. - """ - root = ElementTree.Element(PipelineRun.PIPELINE_RUN) - flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID) - flowcell.text = self.flowcell_id - root.append(self.firecrest.get_elements()) - root.append(self.bustard.get_elements()) - root.append(self.gerald.get_elements()) - return root - - def set_elements(self, tree): - # this file gets imported by all the others, - # so we need to hide the imports to avoid a cyclic imports - from htsworkflow.pipeline import firecrest - from htsworkflow.pipeline import bustard - from htsworkflow.pipeline import gerald - - tag = tree.tag.lower() - if tag != PipelineRun.PIPELINE_RUN.lower(): - raise ValueError('Pipeline Run Expecting %s got %s' % ( - PipelineRun.PIPELINE_RUN, tag)) - for element in tree: - tag = element.tag.lower() - if tag == PipelineRun.FLOWCELL_ID.lower(): - self._flowcell_id = element.text - #ok the xword.Xword.XWORD pattern for module.class.constant is lame - elif tag == firecrest.Firecrest.FIRECREST.lower(): - self.firecrest = firecrest.Firecrest(xml=element) - elif tag == bustard.Bustard.BUSTARD.lower(): - self.bustard = bustard.Bustard(xml=element) - elif tag == gerald.Gerald.GERALD.lower(): - self.gerald = gerald.Gerald(xml=element) - else: - logging.warn('PipelineRun unrecognized tag %s' % (tag,)) - - def _get_run_name(self): - """ - Given a run tuple, find the latest date and use that as our name - """ - if self._name is None: - tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time) - timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax)) - self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml' - return self._name - name = property(_get_run_name) - - def save(self, destdir=None): - if destdir is None: - destdir = '' - logging.info("Saving run report "+ self.name) - xml = self.get_elements() - indent(xml) - dest_pathname = os.path.join(destdir, self.name) - ElementTree.ElementTree(xml).write(dest_pathname) - - def load(self, filename): - logging.info("Loading run report from " + filename) - tree = ElementTree.parse(filename).getroot() - self.set_elements(tree) - -def get_runs(runfolder): - """ - Search through a run folder for all the various sub component runs - and then return a PipelineRun for each different combination. - - For example if there are two different GERALD runs, this will - generate two different PipelineRun objects, that differ - in there gerald component. - """ - from htsworkflow.pipeline import firecrest - from htsworkflow.pipeline import bustard - from htsworkflow.pipeline import gerald - - datadir = os.path.join(runfolder, 'Data') - - logging.info('Searching for runs in ' + datadir) - runs = [] - for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")): - f = firecrest.firecrest(firecrest_pathname) - bustard_glob = os.path.join(firecrest_pathname, "Bustard*") - for bustard_pathname in glob(bustard_glob): - b = bustard.bustard(bustard_pathname) - gerald_glob = os.path.join(bustard_pathname, 'GERALD*') - for gerald_pathname in glob(gerald_glob): - try: - g = gerald.gerald(gerald_pathname) - runs.append(PipelineRun(runfolder, f, b, g)) - except IOError, e: - print "Ignoring", str(e) - return runs - - -def extract_run_parameters(runs): - """ - Search through runfolder_path for various runs and grab their parameters - """ - for run in runs: - run.save() - -def summarize_mapped_reads(mapped_reads): - """ - Summarize per chromosome reads into a genome count - But handle spike-in/contamination symlinks seperately. - """ - summarized_reads = {} - genome_reads = 0 - genome = 'unknown' - for k, v in mapped_reads.items(): - path, k = os.path.split(k) - if len(path) > 0: - genome = path - genome_reads += v - else: - summarized_reads[k] = summarized_reads.setdefault(k, 0) + v - summarized_reads[genome] = genome_reads - return summarized_reads - -def summary_report(runs): - """ - Summarize cluster numbers and mapped read counts for a runfolder - """ - report = [] - for run in runs: - # print a run name? - report.append('Summary for %s' % (run.name,)) - # sort the report - eland_keys = run.gerald.eland_results.results.keys() - eland_keys.sort(alphanum) - - lane_results = run.gerald.summary.lane_results - for lane_id in eland_keys: - result = run.gerald.eland_results.results[lane_id] - report.append("Sample name %s" % (result.sample_name)) - report.append("Lane id %s" % (result.lane_id,)) - cluster = lane_results[result.lane_id].cluster - report.append("Clusters %d +/- %d" % (cluster[0], cluster[1])) - report.append("Total Reads: %d" % (result.reads)) - mc = result._match_codes - nm = mc['NM'] - nm_percent = float(nm)/result.reads * 100 - qc = mc['QC'] - qc_percent = float(qc)/result.reads * 100 - - report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent)) - report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent)) - report.append('Unique (0,1,2 mismatches) %d %d %d' % \ - (mc['U0'], mc['U1'], mc['U2'])) - report.append('Repeat (0,1,2 mismatches) %d %d %d' % \ - (mc['R0'], mc['R1'], mc['R2'])) - report.append("Mapped Reads") - mapped_reads = summarize_mapped_reads(result.mapped_reads) - for name, counts in mapped_reads.items(): - report.append(" %s: %d" % (name, counts)) - report.append('---') - report.append('') - return os.linesep.join(report) - -def extract_results(runs, output_base_dir=None): - if output_base_dir is None: - output_base_dir = os.getcwd() - - for r in runs: - result_dir = os.path.join(output_base_dir, r.flowcell_id) - logging.info("Using %s as result directory" % (result_dir,)) - if not os.path.exists(result_dir): - os.mkdir(result_dir) - - # create cycle_dir - cycle = "C%d-%d" % (r.firecrest.start, r.firecrest.stop) - logging.info("Filling in %s" % (cycle,)) - cycle_dir = os.path.join(result_dir, cycle) - if os.path.exists(cycle_dir): - logging.error("%s already exists, not overwriting" % (cycle_dir,)) - continue - else: - os.mkdir(cycle_dir) - - # copy stuff out of the main run - g = r.gerald - - # save run file - r.save(cycle_dir) - - # Copy Summary.htm - summary_path = os.path.join(r.gerald.pathname, 'Summary.htm') - if os.path.exists(summary_path): - logging.info('Copying %s to %s' % (summary_path, cycle_dir)) - shutil.copy(summary_path, cycle_dir) - else: - logging.info('Summary file %s was not found' % (summary_path,)) - - # tar score files - score_files = [] - for f in os.listdir(g.pathname): - if re.match('.*_score.txt', f): - score_files.append(f) - - tar_cmd = ['/bin/tar', 'c'] + score_files - bzip_cmd = [ 'bzip2', '-9', '-c' ] - tar_dest_name =os.path.join(cycle_dir, 'scores.tar.bz2') - tar_dest = open(tar_dest_name, 'w') - logging.info("Compressing score files in %s" % (g.pathname,)) - logging.info("Running tar: " + " ".join(tar_cmd[:10])) - logging.info("Running bzip2: " + " ".join(bzip_cmd)) - logging.info("Writing to %s" %(tar_dest_name)) - - tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, cwd=g.pathname) - bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest) - tar.wait() - - # copy & bzip eland files - for eland_lane in g.eland_results.values(): - source_name = eland_lane.pathname - path, name = os.path.split(eland_lane.pathname) - dest_name = os.path.join(cycle_dir, name+'.bz2') - - args = ['bzip2', '-9', '-c', source_name] - logging.info('Running: %s' % ( " ".join(args) )) - bzip_dest = open(dest_name, 'w') - bzip = subprocess.Popen(args, stdout=bzip_dest) - logging.info('Saving to %s' % (dest_name, )) - bzip.wait() - -def clean_runs(runs): - """ - Clean up run folders to optimize for compression. - """ - # TODO: implement this. - # rm RunLog*.xml - # rm pipeline_*.txt - # rm gclog.txt - # rm NetCopy.log - # rm nfn.log - # rm Images/L* - # cd Data/C1-*_Firecrest* - # make clean_intermediate - - pass diff --git a/htsworkflow/pipeline/test/test_genome_mapper.py b/htsworkflow/pipeline/test/test_genome_mapper.py deleted file mode 100644 index 8916965..0000000 --- a/htsworkflow/pipeline/test/test_genome_mapper.py +++ /dev/null @@ -1,33 +0,0 @@ -import unittest - -from StringIO import StringIO -from htsworkflow.pipeline import genome_mapper - -class testGenomeMapper(unittest.TestCase): - def test_construct_mapper(self): - genomes = { - 'Arabidopsis thaliana': {'v01212004': '/arabidopsis'}, - 'Homo sapiens': {'hg18': '/hg18'}, - 'Mus musculus': {'mm8': '/mm8', - 'mm9': '/mm9', - 'mm10': '/mm10'}, - 'Phage': {'174': '/phi'}, - } - genome_map = genome_mapper.constructMapperDict(genomes) - - self.failUnlessEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8") - self.failUnlessEqual("%(Phage|174)s" % (genome_map), "/phi") - self.failUnlessEqual("%(Mus musculus)s" % (genome_map), "/mm10") - self.failUnlessEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8") - self.failUnlessEqual("%(Mus musculus|mm10)s" % (genome_map), "/mm10") - - self.failUnlessEqual(len(genome_map.keys()), 6) - self.failUnlessEqual(len(genome_map.values()), 6) - self.failUnlessEqual(len(genome_map.items()), 6) - - -def suite(): - return unittest.makeSuite(testGenomeMapper,'test') - -if __name__ == "__main__": - unittest.main(defaultTest="suite") diff --git a/htsworkflow/pipeline/test/test_runfolder026.py b/htsworkflow/pipeline/test/test_runfolder026.py deleted file mode 100644 index 74247a6..0000000 --- a/htsworkflow/pipeline/test/test_runfolder026.py +++ /dev/null @@ -1,601 +0,0 @@ -#!/usr/bin/env python - -from datetime import datetime, date -import os -import tempfile -import shutil -import unittest - -from htsworkflow.pipeline import firecrest -from htsworkflow.pipeline import bustard -from htsworkflow.pipeline import gerald -from htsworkflow.pipeline import runfolder -from htsworkflow.pipeline.runfolder import ElementTree - - -def make_flowcell_id(runfolder_dir, flowcell_id=None): - if flowcell_id is None: - flowcell_id = '207BTAAXY' - - config = """ - - %s -""" % (flowcell_id,) - config_dir = os.path.join(runfolder_dir, 'Config') - - if not os.path.exists(config_dir): - os.mkdir(config_dir) - pathname = os.path.join(config_dir, 'FlowcellId.xml') - f = open(pathname,'w') - f.write(config) - f.close() - -def make_matrix(matrix_dir): - contents = """# Auto-generated frequency response matrix -> A -> C -> G -> T -0.77 0.15 -0.04 -0.04 -0.76 1.02 -0.05 -0.06 --0.10 -0.10 1.17 -0.03 --0.13 -0.12 0.80 1.27 -""" - s_matrix = os.path.join(matrix_dir, 's_matrix.txt') - f = open(s_matrix, 'w') - f.write(contents) - f.close() - -def make_phasing_params(bustard_dir): - for lane in range(1,9): - pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane)) - f = open(pathname, 'w') - f.write(""" - 0.009900 - 0.003500 - -""") - f.close() - -def make_gerald_config(gerald_dir): - config_xml = """ - - default - - - - - Need_to_specify_ELAND_genome_directory - 8 - - domain.com - diane - localhost:25 - /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane - /home/diane/gec - 1 - /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes - Need_to_specify_genome_file_name - genome - /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane - - _prb.txt - 12 - '((CHASTITY>=0.6))' - _qhg.txt - --symbolic - 32 - --scarf - _seq.txt - _sig2.txt - _sig.txt - @(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp - s_[1-8]_[0-9][0-9][0-9][0-9] - s - Sat Apr 19 19:08:30 2008 - /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald - all - http://host.domain.com/yourshare/ - - - - eland - eland - eland - eland - eland - eland - eland - eland - - - /g/dm3 - /g/equcab1 - /g/equcab1 - /g/canfam2 - /g/hg18 - /g/hg18 - /g/hg18 - /g/hg18 - - - 32 - 32 - 32 - 32 - 32 - 32 - 32 - 32 - - - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - - - -""" - pathname = os.path.join(gerald_dir, 'config.xml') - f = open(pathname,'w') - f.write(config_xml) - f.close() - - -def make_summary_htm(gerald_dir): - summary_htm = """ - - - - -

080416_HWI-EAS229_0024_207BTAAXX Summary

-

Summary Information For Experiment 080416_HWI-EAS229_0024_207BTAAXX on Machine HWI-EAS229

-



Chip Summary

- - - - -
MachineHWI-EAS229
Run Folder080416_HWI-EAS229_0024_207BTAAXX
Chip IDunknown
-



Lane Parameter Summary

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
LaneSample IDSample TargetSample TypeLengthFilterTiles
1unknowndm3ELAND32'((CHASTITY>=0.6))'Lane 1
2unknownequcab1ELAND32'((CHASTITY>=0.6))'Lane 2
3unknownequcab1ELAND32'((CHASTITY>=0.6))'Lane 3
4unknowncanfam2ELAND32'((CHASTITY>=0.6))'Lane 4
5unknownhg18ELAND32'((CHASTITY>=0.6))'Lane 5
6unknownhg18ELAND32'((CHASTITY>=0.6))'Lane 6
7unknownhg18ELAND32'((CHASTITY>=0.6))'Lane 7
8unknownhg18ELAND32'((CHASTITY>=0.6))'Lane 8
-



Lane Results Summary

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Lane Clusters Av 1st Cycle Int % intensity after 20 cycles % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
117421 +/- 21397230 +/- 80123.73 +/- 10.7913.00 +/- 22.9132.03 +/- 18.456703.57 +/- 3753.854.55 +/- 4.81
220311 +/- 24027660 +/- 67817.03 +/- 4.4040.74 +/- 30.3329.54 +/- 9.035184.02 +/- 1631.543.27 +/- 3.94
320193 +/- 23997700 +/- 79715.75 +/- 3.3056.56 +/- 17.1627.33 +/- 7.484803.49 +/- 1313.313.07 +/- 2.86
415537 +/- 25317620 +/- 139215.37 +/- 3.7963.05 +/- 18.3015.88 +/- 4.993162.13 +/- 962.593.11 +/- 2.22
532047 +/- 33568093 +/- 83123.79 +/- 6.1853.36 +/- 18.0648.04 +/- 13.779866.23 +/- 2877.302.26 +/- 1.16
632946 +/- 47538227 +/- 73624.07 +/- 4.6954.65 +/- 12.5750.98 +/- 10.5410468.86 +/- 2228.532.21 +/- 2.33
739504 +/- 41718401 +/- 78522.55 +/- 4.5645.22 +/- 10.3448.41 +/- 9.679829.40 +/- 1993.202.26 +/- 1.11
837998 +/- 37928443 +/- 121139.03 +/- 7.5242.16 +/- 12.3540.98 +/- 14.898128.87 +/- 3055.343.57 +/- 2.77
- - -""" - pathname = os.path.join(gerald_dir, 'Summary.htm') - f = open(pathname, 'w') - f.write(summary_htm) - f.close() - -def make_eland_results(gerald_dir): - eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D. ->HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T ->HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0 ->HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T -""" - for i in range(1,9): - pathname = os.path.join(gerald_dir, - 's_%d_eland_result.txt' % (i,)) - f = open(pathname, 'w') - f.write(eland_result) - f.close() - -class RunfolderTests(unittest.TestCase): - """ - Test components of the runfolder processing code - which includes firecrest, bustard, and gerald - """ - def setUp(self): - # make a fake runfolder directory - self.temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_') - - self.runfolder_dir = os.path.join(self.temp_dir, - '080102_HWI-EAS229_0010_207BTAAXX') - os.mkdir(self.runfolder_dir) - - self.data_dir = os.path.join(self.runfolder_dir, 'Data') - os.mkdir(self.data_dir) - - self.firecrest_dir = os.path.join(self.data_dir, - 'C1-33_Firecrest1.8.28_12-04-2008_diane' - ) - os.mkdir(self.firecrest_dir) - self.matrix_dir = os.path.join(self.firecrest_dir, 'Matrix') - os.mkdir(self.matrix_dir) - make_matrix(self.matrix_dir) - - self.bustard_dir = os.path.join(self.firecrest_dir, - 'Bustard1.8.28_12-04-2008_diane') - os.mkdir(self.bustard_dir) - make_phasing_params(self.bustard_dir) - - self.gerald_dir = os.path.join(self.bustard_dir, - 'GERALD_12-04-2008_diane') - os.mkdir(self.gerald_dir) - make_gerald_config(self.gerald_dir) - make_summary_htm(self.gerald_dir) - make_eland_results(self.gerald_dir) - - def tearDown(self): - shutil.rmtree(self.temp_dir) - - def test_firecrest(self): - """ - Construct a firecrest object - """ - f = firecrest.firecrest(self.firecrest_dir) - self.failUnlessEqual(f.version, '1.8.28') - self.failUnlessEqual(f.start, 1) - self.failUnlessEqual(f.stop, 33) - self.failUnlessEqual(f.user, 'diane') - self.failUnlessEqual(f.date, date(2008,4,12)) - - xml = f.get_elements() - # just make sure that element tree can serialize the tree - xml_str = ElementTree.tostring(xml) - - f2 = firecrest.Firecrest(xml=xml) - self.failUnlessEqual(f.version, f2.version) - self.failUnlessEqual(f.start, f2.start) - self.failUnlessEqual(f.stop, f2.stop) - self.failUnlessEqual(f.user, f2.user) - self.failUnlessEqual(f.date, f2.date) - - def test_bustard(self): - """ - construct a bustard object - """ - b = bustard.bustard(self.bustard_dir) - self.failUnlessEqual(b.version, '1.8.28') - self.failUnlessEqual(b.date, date(2008,4,12)) - self.failUnlessEqual(b.user, 'diane') - self.failUnlessEqual(len(b.phasing), 8) - self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099) - - xml = b.get_elements() - b2 = bustard.Bustard(xml=xml) - self.failUnlessEqual(b.version, b2.version) - self.failUnlessEqual(b.date, b2.date ) - self.failUnlessEqual(b.user, b2.user) - self.failUnlessEqual(len(b.phasing), len(b2.phasing)) - for key in b.phasing.keys(): - self.failUnlessEqual(b.phasing[key].lane, - b2.phasing[key].lane) - self.failUnlessEqual(b.phasing[key].phasing, - b2.phasing[key].phasing) - self.failUnlessEqual(b.phasing[key].prephasing, - b2.phasing[key].prephasing) - - def test_gerald(self): - # need to update gerald and make tests for it - g = gerald.gerald(self.gerald_dir) - - self.failUnlessEqual(g.version, - '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp') - self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30)) - self.failUnlessEqual(len(g.lanes), len(g.lanes.keys())) - self.failUnlessEqual(len(g.lanes), len(g.lanes.items())) - - - # list of genomes, matches what was defined up in - # make_gerald_config. - # the first None is to offset the genomes list to be 1..9 - # instead of pythons default 0..8 - genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2', - '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ] - - # test lane specific parameters from gerald config file - for i in range(1,9): - cur_lane = g.lanes[str(i)] - self.failUnlessEqual(cur_lane.analysis, 'eland') - self.failUnlessEqual(cur_lane.eland_genome, genomes[i]) - self.failUnlessEqual(cur_lane.read_length, '32') - self.failUnlessEqual(cur_lane.use_bases, 'Y'*32) - - # test data extracted from summary file - clusters = [None, - (17421, 2139), (20311, 2402), (20193, 2399), (15537, 2531), - (32047, 3356), (32946, 4753), (39504, 4171), (37998, 3792)] - - for i in range(1,9): - summary_lane = g.summary[str(i)] - self.failUnlessEqual(summary_lane.cluster, clusters[i]) - self.failUnlessEqual(summary_lane.lane, str(i)) - - xml = g.get_elements() - # just make sure that element tree can serialize the tree - xml_str = ElementTree.tostring(xml) - g2 = gerald.Gerald(xml=xml) - - # do it all again after extracting from the xml file - self.failUnlessEqual(g.version, g2.version) - self.failUnlessEqual(g.date, g2.date) - self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys())) - self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items())) - - # test lane specific parameters from gerald config file - for i in range(1,9): - g_lane = g.lanes[str(i)] - g2_lane = g2.lanes[str(i)] - self.failUnlessEqual(g_lane.analysis, g2_lane.analysis) - self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome) - self.failUnlessEqual(g_lane.read_length, g2_lane.read_length) - self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases) - - # test (some) summary elements - for i in range(1,9): - g_summary = g.summary[str(i)] - g2_summary = g2.summary[str(i)] - self.failUnlessEqual(g_summary.cluster, g2_summary.cluster) - self.failUnlessEqual(g_summary.lane, g2_summary.lane) - - g_eland = g.eland_results - g2_eland = g2.eland_results - for lane in g_eland.keys(): - self.failUnlessEqual(g_eland[lane].reads, - g2_eland[lane].reads) - self.failUnlessEqual(len(g_eland[lane].mapped_reads), - len(g2_eland[lane].mapped_reads)) - for k in g_eland[lane].mapped_reads.keys(): - self.failUnlessEqual(g_eland[lane].mapped_reads[k], - g2_eland[lane].mapped_reads[k]) - - self.failUnlessEqual(len(g_eland[lane].match_codes), - len(g2_eland[lane].match_codes)) - for k in g_eland[lane].match_codes.keys(): - self.failUnlessEqual(g_eland[lane].match_codes[k], - g2_eland[lane].match_codes[k]) - - - def test_eland(self): - dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa', - 'chr2L.fa': 'dm3/chr2L.fa', - 'Lambda.fa': 'Lambda.fa'} - genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map, - '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map } - eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps) - - for i in range(1,9): - lane = eland[str(i)] - self.failUnlessEqual(lane.reads, 4) - self.failUnlessEqual(lane.sample_name, "s") - self.failUnlessEqual(lane.lane_id, unicode(i)) - self.failUnlessEqual(len(lane.mapped_reads), 3) - self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1) - self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1) - self.failUnlessEqual(lane.match_codes['U1'], 2) - self.failUnlessEqual(lane.match_codes['NM'], 1) - - xml = eland.get_elements() - # just make sure that element tree can serialize the tree - xml_str = ElementTree.tostring(xml) - e2 = gerald.ELAND(xml=xml) - - for i in range(1,9): - l1 = eland[str(i)] - l2 = e2[str(i)] - self.failUnlessEqual(l1.reads, l2.reads) - self.failUnlessEqual(l1.sample_name, l2.sample_name) - self.failUnlessEqual(l1.lane_id, l2.lane_id) - self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) - self.failUnlessEqual(len(l1.mapped_reads), 3) - for k in l1.mapped_reads.keys(): - self.failUnlessEqual(l1.mapped_reads[k], - l2.mapped_reads[k]) - - self.failUnlessEqual(len(l1.match_codes), 9) - self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes)) - for k in l1.match_codes.keys(): - self.failUnlessEqual(l1.match_codes[k], - l2.match_codes[k]) - - def test_runfolder(self): - runs = runfolder.get_runs(self.runfolder_dir) - - # do we get the flowcell id from the filename? - self.failUnlessEqual(len(runs), 1) - self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml') - - # do we get the flowcell id from the FlowcellId.xml file - make_flowcell_id(self.runfolder_dir, '207BTAAXY') - runs = runfolder.get_runs(self.runfolder_dir) - self.failUnlessEqual(len(runs), 1) - self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml') - - r1 = runs[0] - xml = r1.get_elements() - xml_str = ElementTree.tostring(xml) - - r2 = runfolder.PipelineRun(xml=xml) - self.failUnlessEqual(r1.name, r2.name) - self.failIfEqual(r2.firecrest, None) - self.failIfEqual(r2.bustard, None) - self.failIfEqual(r2.gerald, None) - - -def suite(): - return unittest.makeSuite(RunfolderTests,'test') - -if __name__ == "__main__": - unittest.main(defaultTest="suite") - diff --git a/htsworkflow/pipeline/test/test_runfolder030.py b/htsworkflow/pipeline/test/test_runfolder030.py deleted file mode 100644 index e5d1ea1..0000000 --- a/htsworkflow/pipeline/test/test_runfolder030.py +++ /dev/null @@ -1,1024 +0,0 @@ -#!/usr/bin/env python - -from datetime import datetime, date -import os -import tempfile -import shutil -import unittest - -from htsworkflow.pipeline import firecrest -from htsworkflow.pipeline import bustard -from htsworkflow.pipeline import gerald -from htsworkflow.pipeline import runfolder -from htsworkflow.pipeline.runfolder import ElementTree - - -def make_flowcell_id(runfolder_dir, flowcell_id=None): - if flowcell_id is None: - flowcell_id = '207BTAAXY' - - config = """ - - %s -""" % (flowcell_id,) - config_dir = os.path.join(runfolder_dir, 'Config') - - if not os.path.exists(config_dir): - os.mkdir(config_dir) - pathname = os.path.join(config_dir, 'FlowcellId.xml') - f = open(pathname,'w') - f.write(config) - f.close() - -def make_matrix(matrix_dir): - contents = """# Auto-generated frequency response matrix -> A -> C -> G -> T -0.77 0.15 -0.04 -0.04 -0.76 1.02 -0.05 -0.06 --0.10 -0.10 1.17 -0.03 --0.13 -0.12 0.80 1.27 -""" - s_matrix = os.path.join(matrix_dir, 's_matrix.txt') - f = open(s_matrix, 'w') - f.write(contents) - f.close() - -def make_phasing_params(bustard_dir): - for lane in range(1,9): - pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane)) - f = open(pathname, 'w') - f.write(""" - 0.009900 - 0.003500 - -""") - f.close() - -def make_gerald_config(gerald_dir): - config_xml = """ - - default - - - - - Need_to_specify_ELAND_genome_directory - 8 - - domain.com - diane - localhost:25 - /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane - /home/diane/gec - 1 - /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes - Need_to_specify_genome_file_name - genome - /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane - - _prb.txt - 12 - '((CHASTITY>=0.6))' - _qhg.txt - --symbolic - 32 - --scarf - _seq.txt - _sig2.txt - _sig.txt - @(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp - s_[1-8]_[0-9][0-9][0-9][0-9] - s - Sat Apr 19 19:08:30 2008 - /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald - all - http://host.domain.com/yourshare/ - - - - eland - eland - eland - eland - eland - eland - eland - eland - - - /g/dm3 - /g/equcab1 - /g/equcab1 - /g/canfam2 - /g/hg18 - /g/hg18 - /g/hg18 - /g/hg18 - - - 32 - 32 - 32 - 32 - 32 - 32 - 32 - 32 - - - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - - - -""" - pathname = os.path.join(gerald_dir, 'config.xml') - f = open(pathname,'w') - f.write(config_xml) - f.close() - -def make_summary_htm(gerald_dir): - summary_htm=""" - - - - -

080627_HWI-EAS229_0036_3055HAXX Summary

-

Summary Information For Experiment 080627_HWI-EAS229_0036_3055HAXX on Machine HWI-EAS229

-



Chip Summary

- - - - -
MachineHWI-EAS229
Run Folder080627_HWI-EAS229_0036_3055HAXX
Chip IDunknown
-



Chip Results Summary

- - - - - - - - - - -
ClustersClusters (PF)Yield (kbases)
80933224435778031133022
-



Lane Parameter Summary

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
LaneSample IDSample TargetSample TypeLengthFilterNum TilesTiles
1unknownmm9ELAND26'((CHASTITY>=0.6))'100Lane 1
2unknownmm9ELAND26'((CHASTITY>=0.6))'100Lane 2
3unknownmm9ELAND26'((CHASTITY>=0.6))'100Lane 3
4unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 4
5unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 5
6unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 6
7unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 7
8unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 8
-



Lane Results Summary

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Lane InfoTile Mean +/- SD for Lane
Lane Lane Yield (kbases) Clusters (raw)Clusters (PF) 1st Cycle Int (PF) % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Alignment Score (PF) % Error Rate (PF)
115804696483 +/- 907460787 +/- 4240329 +/- 35101.88 +/- 6.0363.21 +/- 3.2970.33 +/- 0.249054.08 +/- 59.160.46 +/- 0.18
2156564133738 +/- 793860217 +/- 1926444 +/- 3992.62 +/- 7.5845.20 +/- 3.3151.98 +/- 0.746692.04 +/- 92.490.46 +/- 0.09
3185818152142 +/- 1000271468 +/- 2827366 +/- 3691.53 +/- 8.6647.19 +/- 3.8082.24 +/- 0.4410598.68 +/- 64.130.41 +/- 0.04
43495315784 +/- 216213443 +/- 1728328 +/- 4097.53 +/- 9.8785.29 +/- 1.9180.02 +/- 0.5310368.82 +/- 71.080.15 +/- 0.05
5167936119735 +/- 846564590 +/- 2529417 +/- 3788.69 +/- 14.7954.10 +/- 2.5976.95 +/- 0.329936.47 +/- 65.750.28 +/- 0.02
6173463152177 +/- 814666716 +/- 2493372 +/- 3987.06 +/- 9.8643.98 +/- 3.1278.80 +/- 0.4310162.28 +/- 49.650.38 +/- 0.03
714928784649 +/- 732557418 +/- 3617295 +/- 2889.40 +/- 8.2367.97 +/- 1.8233.38 +/- 0.254247.92 +/- 32.371.00 +/- 0.03
810695354622 +/- 481241136 +/- 3309284 +/- 3790.21 +/- 9.1075.39 +/- 2.2748.33 +/- 0.296169.21 +/- 169.500.86 +/- 1.22
Tile mean across chip
Av.1011665447235492.3660.2965.258403.690.50
-



Expanded Lane Summary

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Lane InfoPhasing InfoRaw Data (tile mean)Filtered Data (tile mean)
Lane Clusters (tile mean) (raw)% Phasing % Prephasing % Error Rate (raw) Equiv Perfect Clusters (raw) % retained Cycle 2-4 Av Int (PF) Cycle 2-10 Av % Loss (PF) Cycle 10-20 Av % Loss (PF) % Align (PF) % Error Rate (PF) Equiv Perfect Clusters (PF)
1964830.77000.31001.004967663.21317 +/- 320.13 +/- 0.44-1.14 +/- 0.3470.330.4641758
21337380.77000.31001.224046745.20415 +/- 330.29 +/- 0.40-0.79 +/- 0.3551.980.4630615
31521420.77000.31001.307858847.19344 +/- 260.68 +/- 0.51-0.77 +/- 0.4282.240.4157552
4157840.77000.31000.291109585.29306 +/- 340.20 +/- 0.69-1.28 +/- 0.6680.020.1510671
51197350.77000.31000.856033554.10380 +/- 320.34 +/- 0.49-1.55 +/- 4.6976.950.2849015
61521770.77000.31001.217090543.98333 +/- 270.57 +/- 0.50-0.91 +/- 0.3978.800.3851663
7846490.77000.31001.382106967.97272 +/- 201.15 +/- 0.52-0.84 +/- 0.5833.381.0018265
8546220.77000.31001.172133575.39262 +/- 311.10 +/- 0.59-1.01 +/- 0.4748.330.8619104
-

IVC Plots
-

IVC.htm -

-

All Intensity Plots
-

All.htm -

-

Error graphs:
-

Error.htm -

-Back to top -



Lane 1

- - - - - - - - - - - - - - - - - - - - - - - -
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
10001114972326.4894.3957.4470.29038.60.44
-Back to top -



Lane 2

- - - - - - - - - - - - - - - - - - - - - - - -
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
20001147793448.1283.6838.5753.76905.40.54
-Back to top -



Lane 3

- - - - - - - - - - - - - - - - - - - - - - - -
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
30001167904374.0586.9140.3681.310465.00.47
-Back to top -



Lane 4

- - - - - - - - - - - - - - - - - - - - - - - -
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
4000120308276.8592.8784.2680.410413.80.16
-Back to top -



Lane 5

- - - - - - - - - - - - -
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
-Back to top -



Lane 6

- - - - - - - - - - - - - - - - - - - - - - - -
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
60001166844348.1277.5938.1379.710264.40.44
-Back to top -



Lane 7

- - - - - - - - - - - - - - - - - - - - - - - -
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
7000198913269.9086.6664.5533.24217.51.02
-Back to top -



Lane 8

- - - - - - - - - - - - - - - - - - - - - - - -
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
8000164972243.6089.4073.1748.36182.80.71
-Back to top - - -""" - pathname = os.path.join(gerald_dir, 'Summary.htm') - f = open(pathname, 'w') - f.write(summary_htm) - f.close() - -def make_eland_results(gerald_dir): - eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D. ->HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T ->HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0 ->HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T -""" - for i in range(1,9): - pathname = os.path.join(gerald_dir, - 's_%d_eland_result.txt' % (i,)) - f = open(pathname, 'w') - f.write(eland_result) - f.close() - -def make_runfolder(obj=None): - """ - Make a fake runfolder, attach all the directories to obj if defined - """ - # make a fake runfolder directory - temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_') - - runfolder_dir = os.path.join(temp_dir, - '080102_HWI-EAS229_0010_207BTAAXX') - os.mkdir(runfolder_dir) - - data_dir = os.path.join(runfolder_dir, 'Data') - os.mkdir(data_dir) - - firecrest_dir = os.path.join(data_dir, - 'C1-33_Firecrest1.8.28_12-04-2008_diane' - ) - os.mkdir(firecrest_dir) - matrix_dir = os.path.join(firecrest_dir, 'Matrix') - os.mkdir(matrix_dir) - make_matrix(matrix_dir) - - bustard_dir = os.path.join(firecrest_dir, - 'Bustard1.8.28_12-04-2008_diane') - os.mkdir(bustard_dir) - make_phasing_params(bustard_dir) - - gerald_dir = os.path.join(bustard_dir, - 'GERALD_12-04-2008_diane') - os.mkdir(gerald_dir) - make_gerald_config(gerald_dir) - make_summary_htm(gerald_dir) - make_eland_results(gerald_dir) - - if obj is not None: - obj.temp_dir = temp_dir - obj.runfolder_dir = runfolder_dir - obj.data_dir = data_dir - obj.firecrest_dir = firecrest_dir - obj.matrix_dir = matrix_dir - obj.bustard_dir = bustard_dir - obj.gerald_dir = gerald_dir - - -class RunfolderTests(unittest.TestCase): - """ - Test components of the runfolder processing code - which includes firecrest, bustard, and gerald - """ - def setUp(self): - # attaches all the directories to the object passed in - make_runfolder(self) - - def tearDown(self): - shutil.rmtree(self.temp_dir) - - def test_firecrest(self): - """ - Construct a firecrest object - """ - f = firecrest.firecrest(self.firecrest_dir) - self.failUnlessEqual(f.version, '1.8.28') - self.failUnlessEqual(f.start, 1) - self.failUnlessEqual(f.stop, 33) - self.failUnlessEqual(f.user, 'diane') - self.failUnlessEqual(f.date, date(2008,4,12)) - - xml = f.get_elements() - # just make sure that element tree can serialize the tree - xml_str = ElementTree.tostring(xml) - - f2 = firecrest.Firecrest(xml=xml) - self.failUnlessEqual(f.version, f2.version) - self.failUnlessEqual(f.start, f2.start) - self.failUnlessEqual(f.stop, f2.stop) - self.failUnlessEqual(f.user, f2.user) - self.failUnlessEqual(f.date, f2.date) - - def test_bustard(self): - """ - construct a bustard object - """ - b = bustard.bustard(self.bustard_dir) - self.failUnlessEqual(b.version, '1.8.28') - self.failUnlessEqual(b.date, date(2008,4,12)) - self.failUnlessEqual(b.user, 'diane') - self.failUnlessEqual(len(b.phasing), 8) - self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099) - - xml = b.get_elements() - b2 = bustard.Bustard(xml=xml) - self.failUnlessEqual(b.version, b2.version) - self.failUnlessEqual(b.date, b2.date ) - self.failUnlessEqual(b.user, b2.user) - self.failUnlessEqual(len(b.phasing), len(b2.phasing)) - for key in b.phasing.keys(): - self.failUnlessEqual(b.phasing[key].lane, - b2.phasing[key].lane) - self.failUnlessEqual(b.phasing[key].phasing, - b2.phasing[key].phasing) - self.failUnlessEqual(b.phasing[key].prephasing, - b2.phasing[key].prephasing) - - def test_gerald(self): - # need to update gerald and make tests for it - g = gerald.gerald(self.gerald_dir) - - self.failUnlessEqual(g.version, - '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp') - self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30)) - self.failUnlessEqual(len(g.lanes), len(g.lanes.keys())) - self.failUnlessEqual(len(g.lanes), len(g.lanes.items())) - - - # list of genomes, matches what was defined up in - # make_gerald_config. - # the first None is to offset the genomes list to be 1..9 - # instead of pythons default 0..8 - genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2', - '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ] - - # test lane specific parameters from gerald config file - for i in range(1,9): - cur_lane = g.lanes[str(i)] - self.failUnlessEqual(cur_lane.analysis, 'eland') - self.failUnlessEqual(cur_lane.eland_genome, genomes[i]) - self.failUnlessEqual(cur_lane.read_length, '32') - self.failUnlessEqual(cur_lane.use_bases, 'Y'*32) - - # test data extracted from summary file - clusters = [None, - (96483, 9074), (133738, 7938), - (152142, 10002), (15784, 2162), - (119735, 8465), (152177, 8146), - (84649, 7325), (54622, 4812),] - - for i in range(1,9): - summary_lane = g.summary[str(i)] - self.failUnlessEqual(summary_lane.cluster, clusters[i]) - self.failUnlessEqual(summary_lane.lane, str(i)) - - xml = g.get_elements() - # just make sure that element tree can serialize the tree - xml_str = ElementTree.tostring(xml) - g2 = gerald.Gerald(xml=xml) - - # do it all again after extracting from the xml file - self.failUnlessEqual(g.version, g2.version) - self.failUnlessEqual(g.date, g2.date) - self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys())) - self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items())) - - # test lane specific parameters from gerald config file - for i in range(1,9): - g_lane = g.lanes[str(i)] - g2_lane = g2.lanes[str(i)] - self.failUnlessEqual(g_lane.analysis, g2_lane.analysis) - self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome) - self.failUnlessEqual(g_lane.read_length, g2_lane.read_length) - self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases) - - # test (some) summary elements - for i in range(1,9): - g_summary = g.summary[str(i)] - g2_summary = g2.summary[str(i)] - self.failUnlessEqual(g_summary.cluster, g2_summary.cluster) - self.failUnlessEqual(g_summary.lane, g2_summary.lane) - - g_eland = g.eland_results - g2_eland = g2.eland_results - for lane in g_eland.keys(): - self.failUnlessEqual(g_eland[lane].reads, - g2_eland[lane].reads) - self.failUnlessEqual(len(g_eland[lane].mapped_reads), - len(g2_eland[lane].mapped_reads)) - for k in g_eland[lane].mapped_reads.keys(): - self.failUnlessEqual(g_eland[lane].mapped_reads[k], - g2_eland[lane].mapped_reads[k]) - - self.failUnlessEqual(len(g_eland[lane].match_codes), - len(g2_eland[lane].match_codes)) - for k in g_eland[lane].match_codes.keys(): - self.failUnlessEqual(g_eland[lane].match_codes[k], - g2_eland[lane].match_codes[k]) - - - def test_eland(self): - dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa', - 'chr2L.fa': 'dm3/chr2L.fa', - 'Lambda.fa': 'Lambda.fa'} - genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map, - '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map } - eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps) - - for i in range(1,9): - lane = eland[str(i)] - self.failUnlessEqual(lane.reads, 4) - self.failUnlessEqual(lane.sample_name, "s") - self.failUnlessEqual(lane.lane_id, unicode(i)) - self.failUnlessEqual(len(lane.mapped_reads), 3) - self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1) - self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1) - self.failUnlessEqual(lane.match_codes['U1'], 2) - self.failUnlessEqual(lane.match_codes['NM'], 1) - - xml = eland.get_elements() - # just make sure that element tree can serialize the tree - xml_str = ElementTree.tostring(xml) - e2 = gerald.ELAND(xml=xml) - - for i in range(1,9): - l1 = eland[str(i)] - l2 = e2[str(i)] - self.failUnlessEqual(l1.reads, l2.reads) - self.failUnlessEqual(l1.sample_name, l2.sample_name) - self.failUnlessEqual(l1.lane_id, l2.lane_id) - self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) - self.failUnlessEqual(len(l1.mapped_reads), 3) - for k in l1.mapped_reads.keys(): - self.failUnlessEqual(l1.mapped_reads[k], - l2.mapped_reads[k]) - - self.failUnlessEqual(len(l1.match_codes), 9) - self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes)) - for k in l1.match_codes.keys(): - self.failUnlessEqual(l1.match_codes[k], - l2.match_codes[k]) - - def test_runfolder(self): - runs = runfolder.get_runs(self.runfolder_dir) - - # do we get the flowcell id from the filename? - self.failUnlessEqual(len(runs), 1) - self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml') - - # do we get the flowcell id from the FlowcellId.xml file - make_flowcell_id(self.runfolder_dir, '207BTAAXY') - runs = runfolder.get_runs(self.runfolder_dir) - self.failUnlessEqual(len(runs), 1) - self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml') - - r1 = runs[0] - xml = r1.get_elements() - xml_str = ElementTree.tostring(xml) - - r2 = runfolder.PipelineRun(xml=xml) - self.failUnlessEqual(r1.name, r2.name) - self.failIfEqual(r2.firecrest, None) - self.failIfEqual(r2.bustard, None) - self.failIfEqual(r2.gerald, None) - - -def suite(): - return unittest.makeSuite(RunfolderTests,'test') - -if __name__ == "__main__": - unittest.main(defaultTest="suite") - diff --git a/htsworkflow/pipelines/__init__.py b/htsworkflow/pipelines/__init__.py new file mode 100644 index 0000000..beabfd1 --- /dev/null +++ b/htsworkflow/pipelines/__init__.py @@ -0,0 +1,6 @@ +""" +Provide code to interact with the vendor tools to produce useable "raw" data. + +the illumina sub-package contains components to interact with the Illumina provided +GAPipeline +""" diff --git a/htsworkflow/pipelines/bustard.py b/htsworkflow/pipelines/bustard.py new file mode 100644 index 0000000..1658b17 --- /dev/null +++ b/htsworkflow/pipelines/bustard.py @@ -0,0 +1,146 @@ + +from datetime import date +from glob import glob +import logging +import os +import time +import re + +from htsworkflow.pipelines.runfolder import \ + ElementTree, \ + VERSION_RE, \ + EUROPEAN_STRPTIME + +class Phasing(object): + PHASING = 'Phasing' + PREPHASING = 'Prephasing' + + def __init__(self, fromfile=None, xml=None): + self.lane = None + self.phasing = None + self.prephasing = None + + if fromfile is not None: + self._initialize_from_file(fromfile) + elif xml is not None: + self.set_elements(xml) + + def _initialize_from_file(self, pathname): + path, name = os.path.split(pathname) + basename, ext = os.path.splitext(name) + # the last character of the param base filename should be the + # lane number + tree = ElementTree.parse(pathname).getroot() + self.set_elements(tree) + self.lane = int(basename[-1]) + + def get_elements(self): + root = ElementTree.Element(Phasing.PHASING, {'lane': str(self.lane)}) + phasing = ElementTree.SubElement(root, Phasing.PHASING) + phasing.text = str(self.phasing) + prephasing = ElementTree.SubElement(root, Phasing.PREPHASING) + prephasing.text = str(self.prephasing) + return root + + def set_elements(self, tree): + if tree.tag not in ('Phasing', 'Parameters'): + raise ValueError('exptected Phasing or Parameters') + lane = tree.attrib.get('lane', None) + if lane is not None: + self.lane = int(lane) + for element in list(tree): + if element.tag == Phasing.PHASING: + self.phasing = float(element.text) + elif element.tag == Phasing.PREPHASING: + self.prephasing = float(element.text) + +class Bustard(object): + XML_VERSION = 1 + + # Xml Tags + BUSTARD = 'Bustard' + SOFTWARE_VERSION = 'version' + DATE = 'run_time' + USER = 'user' + PARAMETERS = 'Parameters' + + def __init__(self, xml=None): + self.version = None + self.date = date.today() + self.user = None + self.phasing = {} + + if xml is not None: + self.set_elements(xml) + + def _get_time(self): + return time.mktime(self.date.timetuple()) + time = property(_get_time, doc='return run time as seconds since epoch') + + def dump(self): + print "Bustard version:", self.version + print "Run date", self.date + print "user:", self.user + for lane, tree in self.phasing.items(): + print lane + print tree + + def get_elements(self): + root = ElementTree.Element('Bustard', + {'version': str(Bustard.XML_VERSION)}) + version = ElementTree.SubElement(root, Bustard.SOFTWARE_VERSION) + version.text = self.version + run_date = ElementTree.SubElement(root, Bustard.DATE) + run_date.text = str(self.time) + user = ElementTree.SubElement(root, Bustard.USER) + user.text = self.user + params = ElementTree.SubElement(root, Bustard.PARAMETERS) + for p in self.phasing.values(): + params.append(p.get_elements()) + return root + + def set_elements(self, tree): + if tree.tag != Bustard.BUSTARD: + raise ValueError('Expected "Bustard" SubElements') + xml_version = int(tree.attrib.get('version', 0)) + if xml_version > Bustard.XML_VERSION: + logging.warn('Bustard XML tree is a higher version than this class') + for element in list(tree): + if element.tag == Bustard.SOFTWARE_VERSION: + self.version = element.text + elif element.tag == Bustard.DATE: + self.date = date.fromtimestamp(float(element.text)) + elif element.tag == Bustard.USER: + self.user = element.text + elif element.tag == Bustard.PARAMETERS: + for param in element: + p = Phasing(xml=param) + self.phasing[p.lane] = p + else: + raise ValueError("Unrecognized tag: %s" % (element.tag,)) + + + +def bustard(pathname): + """ + Construct a Bustard object from pathname + """ + b = Bustard() + path, name = os.path.split(pathname) + groups = name.split("_") + version = re.search(VERSION_RE, groups[0]) + b.version = version.group(1) + t = time.strptime(groups[1], EUROPEAN_STRPTIME) + b.date = date(*t[0:3]) + b.user = groups[2] + paramfiles = glob(os.path.join(pathname, "params?.xml")) + for paramfile in paramfiles: + phasing = Phasing(paramfile) + assert (phasing.lane >= 1 and phasing.lane <= 8) + b.phasing[phasing.lane] = phasing + return b + +def fromxml(tree): + b = Bustard() + b.set_elements(tree) + return b diff --git a/htsworkflow/pipelines/configure_run.py b/htsworkflow/pipelines/configure_run.py new file mode 100644 index 0000000..0c1dc8b --- /dev/null +++ b/htsworkflow/pipelines/configure_run.py @@ -0,0 +1,606 @@ +#!/usr/bin/python +import subprocess +import logging +import time +import re +import os + +from htsworkflow.pipelines.retrieve_config import getCombinedOptions, saveConfigFile +from htsworkflow.pipelines.retrieve_config import FlowCellNotFound, WebError404 +from htsworkflow.pipelines.genome_mapper import DuplicateGenome, getAvailableGenomes, constructMapperDict +from htsworkflow.pipelines.run_status import GARunStatus + +from pyinotify import WatchManager, ThreadedNotifier +from pyinotify import EventsCodes, ProcessEvent + +class ConfigInfo: + + def __init__(self): + #run_path = firecrest analysis directory to run analysis from + self.run_path = None + self.bustard_path = None + self.config_filepath = None + self.status = None + + #top level directory where all analyses are placed + self.base_analysis_dir = None + #analysis_dir, top level analysis dir... + # base_analysis_dir + '/070924_USI-EAS44_0022_FC12150' + self.analysis_dir = None + + + def createStatusObject(self): + """ + Creates a status object which can be queried for + status of running the pipeline + + returns True if object created + returns False if object cannot be created + """ + if self.config_filepath is None: + return False + + self.status = GARunStatus(self.config_filepath) + return True + + + +#################################### +# inotify event processor + +s_firecrest_finished = re.compile('Firecrest[0-9\._\-A-Za-z]+/finished.txt') +s_bustard_finished = re.compile('Bustard[0-9\._\-A-Za-z]+/finished.txt') +s_gerald_finished = re.compile('GERALD[0-9\._\-A-Za-z]+/finished.txt') + +s_gerald_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/Bustard[0-9\._\-A-Za-z]+/GERALD[0-9\._\-A-Za-z]+/') +s_bustard_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/Bustard[0-9\._\-A-Za-z]+/') +s_firecrest_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/') + +class RunEvent(ProcessEvent): + + def __init__(self, conf_info): + + self.run_status_dict = {'firecrest': False, + 'bustard': False, + 'gerald': False} + + self._ci = conf_info + + ProcessEvent.__init__(self) + + + def process_IN_CREATE(self, event): + fullpath = os.path.join(event.path, event.name) + if s_finished.search(fullpath): + logging.info("File Found: %s" % (fullpath)) + + if s_firecrest_finished.search(fullpath): + self.run_status_dict['firecrest'] = True + self._ci.status.updateFirecrest(event.name) + elif s_bustard_finished.search(fullpath): + self.run_status_dict['bustard'] = True + self._ci.status.updateBustard(event.name) + elif s_gerald_finished.search(fullpath): + self.run_status_dict['gerald'] = True + self._ci.status.updateGerald(event.name) + + #WARNING: The following order is important!! + # Firecrest regex will catch all gerald, bustard, and firecrest + # Bustard regex will catch all gerald and bustard + # Gerald regex will catch all gerald + # So, order needs to be Gerald, Bustard, Firecrest, or this + # won't work properly. + elif s_gerald_all.search(fullpath): + self._ci.status.updateGerald(event.name) + elif s_bustard_all.search(fullpath): + self._ci.status.updateBustard(event.name) + elif s_firecrest_all.search(fullpath): + self._ci.status.updateFirecrest(event.name) + + #print "Create: %s" % (os.path.join(event.path, event.name)) + + def process_IN_DELETE(self, event): + #print "Remove %s" % (os.path.join(event.path, event.name)) + pass + + + + +#FLAGS +# Config Step Error +RUN_ABORT = 'abort' +# Run Step Error +RUN_FAILED = 'failed' + + +##################################### +# Configure Step (goat_pipeline.py) +#Info +s_start = re.compile('Starting Genome Analyzer Pipeline') +s_gerald = re.compile("[\S\s]+--GERALD[\S\s]+--make[\S\s]+") +s_generating = re.compile('^Generating journals, Makefiles') +s_seq_folder = re.compile('^Sequence folder: ') +s_seq_folder_sub = re.compile('want to make ') +s_stderr_taskcomplete = re.compile('^Task complete, exiting') + +#Errors +s_invalid_cmdline = re.compile('Usage:[\S\s]*goat_pipeline.py') +s_species_dir_err = re.compile('Error: Lane [1-8]:') +s_goat_traceb = re.compile("^Traceback \(most recent call last\):") +s_missing_cycles = re.compile('^Error: Tile s_[1-8]_[0-9]+: Different number of cycles: [0-9]+ instead of [0-9]+') + +SUPPRESS_MISSING_CYCLES = False + + +##Ignore - Example of out above each ignore regex. +#NOTE: Commenting out an ignore will cause it to be +# logged as DEBUG with the logging module. +#CF_STDERR_IGNORE_LIST = [] +s_skip = re.compile('s_[0-8]_[0-9]+') + + +########################################## +# Pipeline Run Step (make -j8 recursive) + +##Info +s_finished = re.compile('finished') + +##Errors +s_make_error = re.compile('^make[\S\s]+Error') +s_no_gnuplot = re.compile('gnuplot: command not found') +s_no_convert = re.compile('^Can\'t exec "convert"') +s_no_ghostscript = re.compile('gs: command not found') + +##Ignore - Example of out above each ignore regex. +#NOTE: Commenting out an ignore will cause it to be +# logged as DEBUG with the logging module. +# +PL_STDERR_IGNORE_LIST = [] +# Info: PF 11802 +PL_STDERR_IGNORE_LIST.append( re.compile('^Info: PF') ) +# About to analyse intensity file s_4_0101_sig2.txt +PL_STDERR_IGNORE_LIST.append( re.compile('^About to analyse intensity file') ) +# Will send output to standard output +PL_STDERR_IGNORE_LIST.append( re.compile('^Will send output to standard output') ) +# Found 31877 clusters +PL_STDERR_IGNORE_LIST.append( re.compile('^Found [0-9]+ clusters') ) +# Will use quality criterion ((CHASTITY>=0.6) +PL_STDERR_IGNORE_LIST.append( re.compile('^Will use quality criterion') ) +# Quality criterion translated to (($F[5]>=0.6)) +PL_STDERR_IGNORE_LIST.append( re.compile('^Quality criterion translated to') ) +# opened /woldlab/trog/data1/king/070924_USI-EAS44_0022_FC12150/Data/C1-36_Firecrest1.9.1_14-11-2007_king.4/Bustard1.9.1_14-11-2007_king/s_4_0101_qhg.txt +# AND +# opened s_4_0103_qhg.txt +PL_STDERR_IGNORE_LIST.append( re.compile('^opened[\S\s]+qhg.txt') ) +# 81129 sequences out of 157651 passed filter criteria +PL_STDERR_IGNORE_LIST.append( re.compile('^[0-9]+ sequences out of [0-9]+ passed filter criteria') ) + + +def pl_stderr_ignore(line): + """ + Searches lines for lines to ignore (i.e. not to log) + + returns True if line should be ignored + returns False if line should NOT be ignored + """ + for s in PL_STDERR_IGNORE_LIST: + if s.search(line): + return True + return False + + +def config_stdout_handler(line, conf_info): + """ + Processes each line of output from GOAT + and stores useful information using the logging module + + Loads useful information into conf_info as well, for future + use outside the function. + + returns True if found condition that signifies success. + """ + + # Skip irrelevant line (without logging) + if s_skip.search(line): + pass + + # Detect invalid command-line arguments + elif s_invalid_cmdline.search(line): + logging.error("Invalid commandline options!") + + # Detect starting of configuration + elif s_start.search(line): + logging.info('START: Configuring pipeline') + + # Detect it made it past invalid arguments + elif s_gerald.search(line): + logging.info('Running make now') + + # Detect that make files have been generated (based on output) + elif s_generating.search(line): + logging.info('Make files generted') + return True + + # Capture run directory + elif s_seq_folder.search(line): + mo = s_seq_folder_sub.search(line) + #Output changed when using --tiles= + # at least in pipeline v0.3.0b2 + if mo: + firecrest_bustard_gerald_makefile = line[mo.end():] + firecrest_bustard_gerald, junk = \ + os.path.split(firecrest_bustard_gerald_makefile) + firecrest_bustard, junk = os.path.split(firecrest_bustard_gerald) + firecrest, junk = os.path.split(firecrest_bustard) + + conf_info.bustard_path = firecrest_bustard + conf_info.run_path = firecrest + + #Standard output handling + else: + print 'Sequence line:', line + mo = s_seq_folder.search(line) + conf_info.bustard_path = line[mo.end():] + conf_info.run_path, temp = os.path.split(conf_info.bustard_path) + + # Log all other output for debugging purposes + else: + logging.warning('CONF:?: %s' % (line)) + + return False + + + +def config_stderr_handler(line, conf_info): + """ + Processes each line of output from GOAT + and stores useful information using the logging module + + Loads useful information into conf_info as well, for future + use outside the function. + + returns RUN_ABORT upon detecting failure; + True on success message; + False if neutral message + (i.e. doesn't signify failure or success) + """ + global SUPPRESS_MISSING_CYCLES + + # Detect invalid species directory error + if s_species_dir_err.search(line): + logging.error(line) + return RUN_ABORT + # Detect goat_pipeline.py traceback + elif s_goat_traceb.search(line): + logging.error("Goat config script died, traceback in debug output") + return RUN_ABORT + # Detect indication of successful configuration (from stderr; odd, but ok) + elif s_stderr_taskcomplete.search(line): + logging.info('Configure step successful (from: stderr)') + return True + # Detect missing cycles + elif s_missing_cycles.search(line): + + # Only display error once + if not SUPPRESS_MISSING_CYCLES: + logging.error("Missing cycles detected; Not all cycles copied?") + logging.debug("CONF:STDERR:MISSING_CYCLES: %s" % (line)) + SUPPRESS_MISSING_CYCLES = True + return RUN_ABORT + + # Log all other output as debug output + else: + logging.debug('CONF:STDERR:?: %s' % (line)) + + # Neutral (not failure; nor success) + return False + + +#def pipeline_stdout_handler(line, conf_info): +# """ +# Processes each line of output from running the pipeline +# and stores useful information using the logging module +# +# Loads useful information into conf_info as well, for future +# use outside the function. +# +# returns True if found condition that signifies success. +# """ +# +# #f.write(line + '\n') +# +# return True + + + +def pipeline_stderr_handler(line, conf_info): + """ + Processes each line of stderr from pipelien run + and stores useful information using the logging module + + ##FIXME: Future feature (doesn't actually do this yet) + #Loads useful information into conf_info as well, for future + #use outside the function. + + returns RUN_FAILED upon detecting failure; + #True on success message; (no clear success state) + False if neutral message + (i.e. doesn't signify failure or success) + """ + + if pl_stderr_ignore(line): + pass + elif s_make_error.search(line): + logging.error("make error detected; run failed") + return RUN_FAILED + elif s_no_gnuplot.search(line): + logging.error("gnuplot not found") + return RUN_FAILED + elif s_no_convert.search(line): + logging.error("imagemagick's convert command not found") + return RUN_FAILED + elif s_no_ghostscript.search(line): + logging.error("ghostscript not found") + return RUN_FAILED + else: + logging.debug('PIPE:STDERR:?: %s' % (line)) + + return False + + +def retrieve_config(conf_info, flowcell, cfg_filepath, genome_dir): + """ + Gets the config file from server... + requires config file in: + /etc/ga_frontend/ga_frontend.conf + or + ~/.ga_frontend.conf + + with: + [config_file_server] + base_host_url: http://host:port + + return True if successful, False is failure + """ + options = getCombinedOptions() + + if options.url is None: + logging.error("~/.ga_frontend.conf or /etc/ga_frontend/ga_frontend.conf" \ + " missing base_host_url option") + return False + + try: + saveConfigFile(flowcell, options.url, cfg_filepath) + conf_info.config_filepath = cfg_filepath + except FlowCellNotFound, e: + logging.error(e) + return False + except WebError404, e: + logging.error(e) + return False + except IOError, e: + logging.error(e) + return False + except Exception, e: + logging.error(e) + return False + + f = open(cfg_filepath, 'r') + data = f.read() + f.close() + + genome_dict = getAvailableGenomes(genome_dir) + mapper_dict = constructMapperDict(genome_dict) + + logging.debug(data) + + f = open(cfg_filepath, 'w') + f.write(data % (mapper_dict)) + f.close() + + return True + + + +def configure(conf_info): + """ + Attempts to configure the GA pipeline using goat. + + Uses logging module to store information about status. + + returns True if configuration successful, otherwise False. + """ + #ERROR Test: + #pipe = subprocess.Popen(['goat_pipeline.py', + # '--GERALD=config32bk.txt', + # '--make .',], + # #'.'], + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE) + + #ERROR Test (2), causes goat_pipeline.py traceback + #pipe = subprocess.Popen(['goat_pipeline.py', + # '--GERALD=%s' % (conf_info.config_filepath), + # '--tiles=s_4_100,s_4_101,s_4_102,s_4_103,s_4_104', + # '--make', + # '.'], + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE) + + ########################## + # Run configuration step + # Not a test; actual configure attempt. + #pipe = subprocess.Popen(['goat_pipeline.py', + # '--GERALD=%s' % (conf_info.config_filepath), + # '--make', + # '.'], + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE) + + + stdout_filepath = os.path.join(conf_info.analysis_dir, + "pipeline_configure_stdout.txt") + stderr_filepath = os.path.join(conf_info.analysis_dir, + "pipeline_configure_stderr.txt") + + fout = open(stdout_filepath, 'w') + ferr = open(stderr_filepath, 'w') + + pipe = subprocess.Popen(['goat_pipeline.py', + '--GERALD=%s' % (conf_info.config_filepath), + #'--tiles=s_4_0100,s_4_0101,s_4_0102,s_4_0103,s_4_0104', + '--make', + conf_info.analysis_dir], + stdout=fout, + stderr=ferr) + + print "Configuring pipeline: %s" % (time.ctime()) + error_code = pipe.wait() + + # Clean up + fout.close() + ferr.close() + + + ################## + # Process stdout + fout = open(stdout_filepath, 'r') + + stdout_line = fout.readline() + + complete = False + while stdout_line != '': + # Handle stdout + if config_stdout_handler(stdout_line, conf_info): + complete = True + stdout_line = fout.readline() + + fout.close() + + + #error_code = pipe.wait() + if error_code: + logging.error('Recieved error_code: %s' % (error_code)) + else: + logging.info('We are go for launch!') + + #Process stderr + ferr = open(stderr_filepath, 'r') + stderr_line = ferr.readline() + + abort = 'NO!' + stderr_success = False + while stderr_line != '': + stderr_status = config_stderr_handler(stderr_line, conf_info) + if stderr_status == RUN_ABORT: + abort = RUN_ABORT + elif stderr_status is True: + stderr_success = True + stderr_line = ferr.readline() + + ferr.close() + + + #Success requirements: + # 1) The stdout completed without error + # 2) The program exited with status 0 + # 3) No errors found in stdout + print '#Expect: True, False, True, True' + print complete, bool(error_code), abort != RUN_ABORT, stderr_success is True + status = complete is True and \ + bool(error_code) is False and \ + abort != RUN_ABORT and \ + stderr_success is True + + # If everything was successful, but for some reason + # we didn't retrieve the path info, log it. + if status is True: + if conf_info.bustard_path is None or conf_info.run_path is None: + logging.error("Failed to retrieve run_path") + return False + + return status + + +def run_pipeline(conf_info): + """ + Run the pipeline and monitor status. + """ + # Fail if the run_path doesn't actually exist + if not os.path.exists(conf_info.run_path): + logging.error('Run path does not exist: %s' \ + % (conf_info.run_path)) + return False + + # Change cwd to run_path + stdout_filepath = os.path.join(conf_info.analysis_dir, 'pipeline_run_stdout.txt') + stderr_filepath = os.path.join(conf_info.analysis_dir, 'pipeline_run_stderr.txt') + + # Create status object + conf_info.createStatusObject() + + # Monitor file creation + wm = WatchManager() + mask = EventsCodes.IN_DELETE | EventsCodes.IN_CREATE + event = RunEvent(conf_info) + notifier = ThreadedNotifier(wm, event) + notifier.start() + wdd = wm.add_watch(conf_info.run_path, mask, rec=True) + + # Log pipeline starting + logging.info('STARTING PIPELINE @ %s' % (time.ctime())) + + # Start the pipeline (and hide!) + #pipe = subprocess.Popen(['make', + # '-j8', + # 'recursive'], + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE) + + fout = open(stdout_filepath, 'w') + ferr = open(stderr_filepath, 'w') + + pipe = subprocess.Popen(['make', + '--directory=%s' % (conf_info.run_path), + '-j8', + 'recursive'], + stdout=fout, + stderr=ferr) + #shell=True) + # Wait for run to finish + retcode = pipe.wait() + + + # Clean up + notifier.stop() + fout.close() + ferr.close() + + # Process stderr + ferr = open(stderr_filepath, 'r') + + run_failed_stderr = False + for line in ferr: + err_status = pipeline_stderr_handler(line, conf_info) + if err_status == RUN_FAILED: + run_failed_stderr = True + + ferr.close() + + # Finished file check! + print 'RUN SUCCESS CHECK:' + for key, value in event.run_status_dict.items(): + print ' %s: %s' % (key, value) + + dstatus = event.run_status_dict + + # Success or failure check + status = (retcode == 0) and \ + run_failed_stderr is False and \ + dstatus['firecrest'] is True and \ + dstatus['bustard'] is True and \ + dstatus['gerald'] is True + + return status + + diff --git a/htsworkflow/pipelines/firecrest.py b/htsworkflow/pipelines/firecrest.py new file mode 100644 index 0000000..ee6fded --- /dev/null +++ b/htsworkflow/pipelines/firecrest.py @@ -0,0 +1,127 @@ +""" +Extract information about the Firecrest run + +Firecrest - class holding the properties we found +firecrest - Firecrest factory function initalized from a directory name +fromxml - Firecrest factory function initalized from an xml dump from + the Firecrest object. +""" + +from datetime import date +import os +import re +import time + +from htsworkflow.pipelines.runfolder import \ + ElementTree, \ + VERSION_RE, \ + EUROPEAN_STRPTIME + +class Firecrest(object): + XML_VERSION=1 + + # xml tag names + FIRECREST = 'Firecrest' + SOFTWARE_VERSION = 'version' + START = 'FirstCycle' + STOP = 'LastCycle' + DATE = 'run_time' + USER = 'user' + MATRIX = 'matrix' + + def __init__(self, xml=None): + self.start = None + self.stop = None + self.version = None + self.date = date.today() + self.user = None + self.matrix = None + + if xml is not None: + self.set_elements(xml) + + def _get_time(self): + return time.mktime(self.date.timetuple()) + time = property(_get_time, doc='return run time as seconds since epoch') + + def dump(self): + print "Starting cycle:", self.start + print "Ending cycle:", self.stop + print "Firecrest version:", self.version + print "Run date:", self.date + print "user:", self.user + + def get_elements(self): + attribs = {'version': str(Firecrest.XML_VERSION) } + root = ElementTree.Element(Firecrest.FIRECREST, attrib=attribs) + version = ElementTree.SubElement(root, Firecrest.SOFTWARE_VERSION) + version.text = self.version + start_cycle = ElementTree.SubElement(root, Firecrest.START) + start_cycle.text = str(self.start) + stop_cycle = ElementTree.SubElement(root, Firecrest.STOP) + stop_cycle.text = str(self.stop) + run_date = ElementTree.SubElement(root, Firecrest.DATE) + run_date.text = str(self.time) + user = ElementTree.SubElement(root, Firecrest.USER) + user.text = self.user + matrix = ElementTree.SubElement(root, Firecrest.MATRIX) + matrix.text = self.matrix + return root + + def set_elements(self, tree): + if tree.tag != Firecrest.FIRECREST: + raise ValueError('Expected "Firecrest" SubElements') + xml_version = int(tree.attrib.get('version', 0)) + if xml_version > Firecrest.XML_VERSION: + logging.warn('Firecrest XML tree is a higher version than this class') + for element in list(tree): + if element.tag == Firecrest.SOFTWARE_VERSION: + self.version = element.text + elif element.tag == Firecrest.START: + self.start = int(element.text) + elif element.tag == Firecrest.STOP: + self.stop = int(element.text) + elif element.tag == Firecrest.DATE: + self.date = date.fromtimestamp(float(element.text)) + elif element.tag == Firecrest.USER: + self.user = element.text + elif element.tag == Firecrest.MATRIX: + self.matrix = element.text + else: + raise ValueError("Unrecognized tag: %s" % (element.tag,)) + +def firecrest(pathname): + """ + Examine the directory at pathname and initalize a Firecrest object + """ + f = Firecrest() + + # parse firecrest directory name + path, name = os.path.split(pathname) + groups = name.split('_') + # grab the start/stop cycle information + cycle = re.match("C([0-9]+)-([0-9]+)", groups[0]) + f.start = int(cycle.group(1)) + f.stop = int(cycle.group(2)) + # firecrest version + version = re.search(VERSION_RE, groups[1]) + f.version = (version.group(1)) + # datetime + t = time.strptime(groups[2], EUROPEAN_STRPTIME) + f.date = date(*t[0:3]) + # username + f.user = groups[3] + + # should I parse this deeper than just stashing the + # contents of the matrix file? + matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt') + f.matrix = open(matrix_pathname, 'r').read() + return f + +def fromxml(tree): + """ + Initialize a Firecrest object from an element tree node + """ + f = Firecrest() + f.set_elements(tree) + return f diff --git a/htsworkflow/pipelines/genome_mapper.py b/htsworkflow/pipelines/genome_mapper.py new file mode 100644 index 0000000..a8ea871 --- /dev/null +++ b/htsworkflow/pipelines/genome_mapper.py @@ -0,0 +1,137 @@ +#!/usr/bin/python +import glob +import sys +import os +import re + +import logging + +from htsworkflow.util.alphanum import alphanum + +class DuplicateGenome(Exception): pass + + +def _has_metainfo(genome_dir): + metapath = os.path.join(genome_dir, '_metainfo_') + if os.path.isfile(metapath): + return True + else: + return False + +def getAvailableGenomes(genome_base_dir): + """ + raises IOError (on genome_base_dir not found) + raises DuplicateGenome on duplicate genomes found. + + returns a double dictionary (i.e. d[species][build] = path) + """ + + # Need valid directory + if not os.path.exists(genome_base_dir): + msg = "Directory does not exist: %s" % (genome_base_dir) + raise IOError, msg + + # Find all subdirectories + filepath_list = glob.glob(os.path.join(genome_base_dir, '*')) + potential_genome_dirs = \ + [ filepath for filepath in filepath_list if os.path.isdir(filepath)] + + # Get list of metadata files + genome_dir_list = \ + [ dirpath \ + for dirpath in potential_genome_dirs \ + if _has_metainfo(dirpath) ] + + # Genome double dictionary + d = {} + + for genome_dir in genome_dir_list: + line = open(os.path.join(genome_dir, '_metainfo_'), 'r').readline().strip() + + # Get species, build... log and skip on failure + try: + species, build = line.split('|') + except: + logging.warning('Skipping: Invalid metafile (%s) line: %s' \ + % (metafile, line)) + continue + + build_dict = d.setdefault(species, {}) + if build in build_dict: + msg = "Duplicate genome for %s|%s" % (species, build) + raise DuplicateGenome, msg + + build_dict[build] = genome_dir + + return d + + +class constructMapperDict(object): + """ + Emulate a dictionary to map genome|build names to paths. + + It uses the dictionary generated by getAvailableGenomes. + """ + def __init__(self, genome_dict): + self.genome_dict = genome_dict + + def __getitem__(self, key): + """ + Return the best match for key + """ + elements = re.split("\|", key) + + if len(elements) == 1: + # we just the species name + # get the set of builds + builds = self.genome_dict[elements[0]] + + # sort build names the way humans would + keys = builds.keys() + keys.sort(cmp=alphanum) + + # return the path from the 'last' build name + return builds[keys[-1]] + + elif len(elements) == 2: + # we have species, and build name + return self.genome_dict[elements[0]][elements[1]] + else: + raise KeyError("Unrecognized key") + + def keys(self): + keys = [] + for species in self.genome_dict.keys(): + for build in self.genome_dict[species]: + keys.append([species+'|'+build]) + return keys + + def values(self): + values = [] + for species in self.genome_dict.keys(): + for build in self.genome_dict[species]: + values.append(self.genome_dict[species][build]) + return values + + def items(self): + items = [] + for species in self.genome_dict.keys(): + for build in self.genome_dict[species]: + key = [species+'|'+build] + value = self.genome_dict[species][build] + items.append((key, value)) + return items + +if __name__ == '__main__': + + if len(sys.argv) != 2: + print 'useage: %s ' % (sys.argv[0]) + sys.exit(1) + + d = getAvailableGenomes(sys.argv[1]) + d2 = constructMapperDict(d) + + for k,v in d2.items(): + print '%s: %s' % (k,v) + + diff --git a/htsworkflow/pipelines/gerald.py b/htsworkflow/pipelines/gerald.py new file mode 100644 index 0000000..109c6a8 --- /dev/null +++ b/htsworkflow/pipelines/gerald.py @@ -0,0 +1,719 @@ +""" +Provide access to information stored in the GERALD directory. +""" +from datetime import datetime, date +from glob import glob +import logging +import os +import stat +import time +import types + +from htsworkflow.pipelines.runfolder import \ + ElementTree, \ + EUROPEAN_STRPTIME, \ + LANES_PER_FLOWCELL, \ + VERSION_RE +from htsworkflow.util.ethelp import indent, flatten +from htsworkflow.util.opener import autoopen + +class Gerald(object): + """ + Capture meaning out of the GERALD directory + """ + XML_VERSION = 1 + GERALD='Gerald' + RUN_PARAMETERS='RunParameters' + SUMMARY='Summary' + + class LaneParameters(object): + """ + Make it easy to access elements of LaneSpecificRunParameters from python + """ + def __init__(self, gerald, key): + self._gerald = gerald + self._key = key + + def __get_attribute(self, xml_tag): + subtree = self._gerald.tree.find('LaneSpecificRunParameters') + container = subtree.find(xml_tag) + if container is None: + return None + if len(container.getchildren()) > LANES_PER_FLOWCELL: + raise RuntimeError('GERALD config.xml file changed') + lanes = [x.tag.split('_')[1] for x in container.getchildren()] + index = lanes.index(self._key) + element = container[index] + return element.text + def _get_analysis(self): + return self.__get_attribute('ANALYSIS') + analysis = property(_get_analysis) + + def _get_eland_genome(self): + genome = self.__get_attribute('ELAND_GENOME') + # default to the chipwide parameters if there isn't an + # entry in the lane specific paramaters + if genome is None: + subtree = self._gerald.tree.find('ChipWideRunParameters') + container = subtree.find('ELAND_GENOME') + genome = container.text + return genome + eland_genome = property(_get_eland_genome) + + def _get_read_length(self): + return self.__get_attribute('READ_LENGTH') + read_length = property(_get_read_length) + + def _get_use_bases(self): + return self.__get_attribute('USE_BASES') + use_bases = property(_get_use_bases) + + class LaneSpecificRunParameters(object): + """ + Provide access to LaneSpecificRunParameters + """ + def __init__(self, gerald): + self._gerald = gerald + self._keys = None + def __getitem__(self, key): + return Gerald.LaneParameters(self._gerald, key) + def keys(self): + if self._keys is None: + tree = self._gerald.tree + analysis = tree.find('LaneSpecificRunParameters/ANALYSIS') + # according to the pipeline specs I think their fields + # are sampleName_laneID, with sampleName defaulting to s + # since laneIDs are constant lets just try using + # those consistently. + self._keys = [ x.tag.split('_')[1] for x in analysis] + return self._keys + def values(self): + return [ self[x] for x in self.keys() ] + def items(self): + return zip(self.keys(), self.values()) + def __len__(self): + return len(self.keys()) + + def __init__(self, xml=None): + self.pathname = None + self.tree = None + + # parse lane parameters out of the config.xml file + self.lanes = Gerald.LaneSpecificRunParameters(self) + + self.summary = None + self.eland_results = None + + if xml is not None: + self.set_elements(xml) + + def _get_date(self): + if self.tree is None: + return datetime.today() + timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP') + epochstamp = time.mktime(time.strptime(timestamp, '%c')) + return datetime.fromtimestamp(epochstamp) + date = property(_get_date) + + def _get_time(self): + return time.mktime(self.date.timetuple()) + time = property(_get_time, doc='return run time as seconds since epoch') + + def _get_version(self): + if self.tree is None: + return None + return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION') + version = property(_get_version) + + def dump(self): + """ + Debugging function, report current object + """ + print 'Gerald version:', self.version + print 'Gerald run date:', self.date + print 'Gerald config.xml:', self.tree + self.summary.dump() + + def get_elements(self): + if self.tree is None or self.summary is None: + return None + + gerald = ElementTree.Element(Gerald.GERALD, + {'version': unicode(Gerald.XML_VERSION)}) + gerald.append(self.tree) + gerald.append(self.summary.get_elements()) + if self.eland_results: + gerald.append(self.eland_results.get_elements()) + return gerald + + def set_elements(self, tree): + if tree.tag != Gerald.GERALD: + raise ValueError('exptected GERALD') + xml_version = int(tree.attrib.get('version', 0)) + if xml_version > Gerald.XML_VERSION: + logging.warn('XML tree is a higher version than this class') + for element in list(tree): + tag = element.tag.lower() + if tag == Gerald.RUN_PARAMETERS.lower(): + self.tree = element + elif tag == Gerald.SUMMARY.lower(): + self.summary = Summary(xml=element) + elif tag == ELAND.ELAND.lower(): + self.eland_results = ELAND(xml=element) + else: + logging.warn("Unrecognized tag %s" % (element.tag,)) + + +def gerald(pathname): + g = Gerald() + g.pathname = pathname + path, name = os.path.split(pathname) + config_pathname = os.path.join(pathname, 'config.xml') + g.tree = ElementTree.parse(config_pathname).getroot() + + # parse Summary.htm file + summary_pathname = os.path.join(pathname, 'Summary.htm') + g.summary = Summary(summary_pathname) + # parse eland files + g.eland_results = eland(g.pathname, g) + return g + +def tonumber(v): + """ + Convert a value to int if its an int otherwise a float. + """ + try: + v = int(v) + except ValueError, e: + v = float(v) + return v + +def parse_mean_range(value): + """ + Parse values like 123 +/- 4.5 + """ + if value.strip() == 'unknown': + return 0, 0 + + average, pm, deviation = value.split() + if pm != '+/-': + raise RuntimeError("Summary.htm file format changed") + return tonumber(average), tonumber(deviation) + +def make_mean_range_element(parent, name, mean, deviation): + """ + Make an ElementTree subelement + """ + element = ElementTree.SubElement(parent, name, + { 'mean': unicode(mean), + 'deviation': unicode(deviation)}) + return element + +def parse_mean_range_element(element): + """ + Grab mean/deviation out of element + """ + return (tonumber(element.attrib['mean']), + tonumber(element.attrib['deviation'])) + +def parse_summary_element(element): + """ + Determine if we have a simple element or a mean/deviation element + """ + if len(element.attrib) > 0: + return parse_mean_range_element(element) + else: + return element.text + +class Summary(object): + """ + Extract some useful information from the Summary.htm file + """ + XML_VERSION = 2 + SUMMARY = 'Summary' + + class LaneResultSummary(object): + """ + Parse the LaneResultSummary table out of Summary.htm + Mostly for the cluster number + """ + LANE_RESULT_SUMMARY = 'LaneResultSummary' + TAGS = { + 'LaneYield': 'lane_yield', + 'Cluster': 'cluster', # Raw + 'ClusterPF': 'cluster_pass_filter', + 'AverageFirstCycleIntensity': 'average_first_cycle_intensity', + 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles', + 'PercentPassFilterClusters': 'percent_pass_filter_clusters', + 'PercentPassFilterAlign': 'percent_pass_filter_align', + 'AverageAlignmentScore': 'average_alignment_score', + 'PercentErrorRate': 'percent_error_rate' + } + + def __init__(self, html=None, xml=None): + self.lane = None + self.lane_yield = None + self.cluster = None + self.cluster_pass_filter = None + self.average_first_cycle_intensity = None + self.percent_intensity_after_20_cycles = None + self.percent_pass_filter_clusters = None + self.percent_pass_filter_align = None + self.average_alignment_score = None + self.percent_error_rate = None + + if html is not None: + self.set_elements_from_html(html) + if xml is not None: + self.set_elements(xml) + + def set_elements_from_html(self, data): + if not len(data) in (8,10): + raise RuntimeError("Summary.htm file format changed") + + # same in pre-0.3.0 Summary file and 0.3 summary file + self.lane = data[0] + + if len(data) == 8: + parsed_data = [ parse_mean_range(x) for x in data[1:] ] + # this is the < 0.3 Pipeline version + self.cluster = parsed_data[0] + self.average_first_cycle_intensity = parsed_data[1] + self.percent_intensity_after_20_cycles = parsed_data[2] + self.percent_pass_filter_clusters = parsed_data[3] + self.percent_pass_filter_align = parsed_data[4] + self.average_alignment_score = parsed_data[5] + self.percent_error_rate = parsed_data[6] + elif len(data) == 10: + parsed_data = [ parse_mean_range(x) for x in data[2:] ] + # this is the >= 0.3 summary file + self.lane_yield = data[1] + self.cluster = parsed_data[0] + self.cluster_pass_filter = parsed_data[1] + self.average_first_cycle_intensity = parsed_data[2] + self.percent_intensity_after_20_cycles = parsed_data[3] + self.percent_pass_filter_clusters = parsed_data[4] + self.percent_pass_filter_align = parsed_data[5] + self.average_alignment_score = parsed_data[6] + self.percent_error_rate = parsed_data[7] + + def get_elements(self): + lane_result = ElementTree.Element( + Summary.LaneResultSummary.LANE_RESULT_SUMMARY, + {'lane': self.lane}) + for tag, variable_name in Summary.LaneResultSummary.TAGS.items(): + value = getattr(self, variable_name) + if value is None: + continue + # it looks like a sequence + elif type(value) in (types.TupleType, types.ListType): + element = make_mean_range_element( + lane_result, + tag, + *value + ) + else: + element = ElementTree.SubElement(lane_result, tag) + element.text = value + return lane_result + + def set_elements(self, tree): + if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY: + raise ValueError('Expected %s' % ( + Summary.LaneResultSummary.LANE_RESULT_SUMMARY)) + self.lane = tree.attrib['lane'] + tags = Summary.LaneResultSummary.TAGS + for element in list(tree): + try: + variable_name = tags[element.tag] + setattr(self, variable_name, + parse_summary_element(element)) + except KeyError, e: + logging.warn('Unrecognized tag %s' % (element.tag,)) + + def __init__(self, filename=None, xml=None): + self.lane_results = {} + + if filename is not None: + self._extract_lane_results(filename) + if xml is not None: + self.set_elements(xml) + + def __getitem__(self, key): + return self.lane_results[key] + + def __len__(self): + return len(self.lane_results) + + def keys(self): + return self.lane_results.keys() + + def values(self): + return self.lane_results.values() + + def items(self): + return self.lane_results.items() + + def _flattened_row(self, row): + """ + flatten the children of a ... + """ + return [flatten(x) for x in row.getchildren() ] + + def _parse_table(self, table): + """ + assumes the first line is the header of a table, + and that the remaining rows are data + """ + rows = table.getchildren() + data = [] + for r in rows: + data.append(self._flattened_row(r)) + return data + + def _extract_named_tables(self, pathname): + """ + extract all the 'named' tables from a Summary.htm file + and return as a dictionary + + Named tables are

...

...
pairs + The contents of the h2 tag is considered to the name + of the table. + """ + tree = ElementTree.parse(pathname).getroot() + body = tree.find('body') + tables = {} + for i in range(len(body)): + if body[i].tag == 'h2' and body[i+1].tag == 'table': + # we have an interesting table + name = flatten(body[i]) + table = body[i+1] + data = self._parse_table(table) + tables[name] = data + return tables + + def _extract_lane_results(self, pathname): + """ + extract the Lane Results Summary table + """ + + tables = self._extract_named_tables(pathname) + + # parse lane result summary + lane_summary = tables['Lane Results Summary'] + # this is version 1 of the summary file + if len(lane_summary[-1]) == 8: + # strip header + headers = lane_summary[0] + # grab the lane by lane data + lane_summary = lane_summary[1:] + + # this is version 2 of the summary file + if len(lane_summary[-1]) == 10: + # lane_summary[0] is a different less specific header row + headers = lane_summary[1] + lane_summary = lane_summary[2:10] + # after the last lane, there's a set of chip wide averages + + for r in lane_summary: + lrs = Summary.LaneResultSummary(html=r) + self.lane_results[lrs.lane] = lrs + + def get_elements(self): + summary = ElementTree.Element(Summary.SUMMARY, + {'version': unicode(Summary.XML_VERSION)}) + for lane in self.lane_results.values(): + summary.append(lane.get_elements()) + return summary + + def set_elements(self, tree): + if tree.tag != Summary.SUMMARY: + return ValueError("Expected %s" % (Summary.SUMMARY,)) + xml_version = int(tree.attrib.get('version', 0)) + if xml_version > Summary.XML_VERSION: + logging.warn('Summary XML tree is a higher version than this class') + for element in list(tree): + lrs = Summary.LaneResultSummary() + lrs.set_elements(element) + self.lane_results[lrs.lane] = lrs + + def dump(self): + """ + Debugging function, report current object + """ + pass + + +def build_genome_fasta_map(genome_dir): + # build fasta to fasta file map + genome = genome_dir.split(os.path.sep)[-1] + fasta_map = {} + for vld_file in glob(os.path.join(genome_dir, '*.vld')): + is_link = False + if os.path.islink(vld_file): + is_link = True + vld_file = os.path.realpath(vld_file) + path, vld_name = os.path.split(vld_file) + name, ext = os.path.splitext(vld_name) + if is_link: + fasta_map[name] = name + else: + fasta_map[name] = os.path.join(genome, name) + return fasta_map + +class ElandLane(object): + """ + Process an eland result file + """ + XML_VERSION = 1 + LANE = 'ElandLane' + SAMPLE_NAME = 'SampleName' + LANE_ID = 'LaneID' + GENOME_MAP = 'GenomeMap' + GENOME_ITEM = 'GenomeItem' + MAPPED_READS = 'MappedReads' + MAPPED_ITEM = 'MappedItem' + MATCH_CODES = 'MatchCodes' + MATCH_ITEM = 'Code' + READS = 'Reads' + + def __init__(self, pathname=None, genome_map=None, xml=None): + self.pathname = pathname + self._sample_name = None + self._lane_id = None + self._reads = None + self._mapped_reads = None + self._match_codes = None + if genome_map is None: + genome_map = {} + self.genome_map = genome_map + + if xml is not None: + self.set_elements(xml) + + def _update(self): + """ + Actually read the file and actually count the reads + """ + # can't do anything if we don't have a file to process + if self.pathname is None: + return + + if os.stat(self.pathname)[stat.ST_SIZE] == 0: + raise RuntimeError("Eland isn't done, try again later.") + + reads = 0 + mapped_reads = {} + + match_codes = {'NM':0, 'QC':0, 'RM':0, + 'U0':0, 'U1':0, 'U2':0, + 'R0':0, 'R1':0, 'R2':0, + } + for line in autoopen(self.pathname,'r'): + reads += 1 + fields = line.split() + # code = fields[2] + # match_codes[code] = match_codes.setdefault(code, 0) + 1 + # the QC/NM etc codes are in the 3rd field and always present + match_codes[fields[2]] += 1 + # ignore lines that don't have a fasta filename + if len(fields) < 7: + continue + fasta = self.genome_map.get(fields[6], fields[6]) + mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1 + self._match_codes = match_codes + self._mapped_reads = mapped_reads + self._reads = reads + + def _update_name(self): + # extract the sample name + if self.pathname is None: + return + + path, name = os.path.split(self.pathname) + split_name = name.split('_') + self._sample_name = split_name[0] + self._lane_id = split_name[1] + + def _get_sample_name(self): + if self._sample_name is None: + self._update_name() + return self._sample_name + sample_name = property(_get_sample_name) + + def _get_lane_id(self): + if self._lane_id is None: + self._update_name() + return self._lane_id + lane_id = property(_get_lane_id) + + def _get_reads(self): + if self._reads is None: + self._update() + return self._reads + reads = property(_get_reads) + + def _get_mapped_reads(self): + if self._mapped_reads is None: + self._update() + return self._mapped_reads + mapped_reads = property(_get_mapped_reads) + + def _get_match_codes(self): + if self._match_codes is None: + self._update() + return self._match_codes + match_codes = property(_get_match_codes) + + def get_elements(self): + lane = ElementTree.Element(ElandLane.LANE, + {'version': + unicode(ElandLane.XML_VERSION)}) + sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME) + sample_tag.text = self.sample_name + lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID) + lane_tag.text = self.lane_id + genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP) + for k, v in self.genome_map.items(): + item = ElementTree.SubElement( + genome_map, ElandLane.GENOME_ITEM, + {'name':k, 'value':unicode(v)}) + mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS) + for k, v in self.mapped_reads.items(): + item = ElementTree.SubElement( + mapped_reads, ElandLane.MAPPED_ITEM, + {'name':k, 'value':unicode(v)}) + match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES) + for k, v in self.match_codes.items(): + item = ElementTree.SubElement( + match_codes, ElandLane.MATCH_ITEM, + {'name':k, 'value':unicode(v)}) + reads = ElementTree.SubElement(lane, ElandLane.READS) + reads.text = unicode(self.reads) + + return lane + + def set_elements(self, tree): + if tree.tag != ElandLane.LANE: + raise ValueError('Exptecting %s' % (ElandLane.LANE,)) + + # reset dictionaries + self._mapped_reads = {} + self._match_codes = {} + + for element in tree: + tag = element.tag.lower() + if tag == ElandLane.SAMPLE_NAME.lower(): + self._sample_name = element.text + elif tag == ElandLane.LANE_ID.lower(): + self._lane_id = element.text + elif tag == ElandLane.GENOME_MAP.lower(): + for child in element: + name = child.attrib['name'] + value = child.attrib['value'] + self.genome_map[name] = value + elif tag == ElandLane.MAPPED_READS.lower(): + for child in element: + name = child.attrib['name'] + value = child.attrib['value'] + self._mapped_reads[name] = int(value) + elif tag == ElandLane.MATCH_CODES.lower(): + for child in element: + name = child.attrib['name'] + value = int(child.attrib['value']) + self._match_codes[name] = value + elif tag == ElandLane.READS.lower(): + self._reads = int(element.text) + else: + logging.warn("ElandLane unrecognized tag %s" % (element.tag,)) + +def extract_eland_sequence(instream, outstream, start, end): + """ + Extract a chunk of sequence out of an eland file + """ + for line in instream: + record = line.split() + if len(record) > 1: + result = [record[0], record[1][start:end]] + else: + result = [record[0][start:end]] + outstream.write("\t".join(result)) + outstream.write(os.linesep) + +class ELAND(object): + """ + Summarize information from eland files + """ + XML_VERSION = 1 + + ELAND = 'ElandCollection' + LANE = 'Lane' + LANE_ID = 'id' + + def __init__(self, xml=None): + # we need information from the gerald config.xml + self.results = {} + + if xml is not None: + self.set_elements(xml) + + def __len__(self): + return len(self.results) + + def keys(self): + return self.results.keys() + + def values(self): + return self.results.values() + + def items(self): + return self.results.items() + + def __getitem__(self, key): + return self.results[key] + + def get_elements(self): + root = ElementTree.Element(ELAND.ELAND, + {'version': unicode(ELAND.XML_VERSION)}) + for lane_id, lane in self.results.items(): + eland_lane = lane.get_elements() + eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id) + root.append(eland_lane) + return root + + def set_elements(self, tree): + if tree.tag.lower() != ELAND.ELAND.lower(): + raise ValueError('Expecting %s', ELAND.ELAND) + for element in list(tree): + lane_id = element.attrib[ELAND.LANE_ID] + lane = ElandLane(xml=element) + self.results[lane_id] = lane + +def eland(basedir, gerald=None, genome_maps=None): + e = ELAND() + + file_list = glob(os.path.join(basedir, "*_eland_result.txt")) + if len(file_list) == 0: + # lets handle compressed eland files too + file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2")) + + for pathname in file_list: + # yes the lane_id is also being computed in ElandLane._update + # I didn't want to clutter up my constructor + # but I needed to persist the sample_name/lane_id for + # runfolder summary_report + path, name = os.path.split(pathname) + split_name = name.split('_') + lane_id = split_name[1] + + if genome_maps is not None: + genome_map = genome_maps[lane_id] + elif gerald is not None: + genome_dir = gerald.lanes[lane_id].eland_genome + genome_map = build_genome_fasta_map(genome_dir) + else: + genome_map = {} + + eland_result = ElandLane(pathname, genome_map) + e.results[lane_id] = eland_result + return e diff --git a/htsworkflow/pipelines/recipe_parser.py b/htsworkflow/pipelines/recipe_parser.py new file mode 100644 index 0000000..7f5ced6 --- /dev/null +++ b/htsworkflow/pipelines/recipe_parser.py @@ -0,0 +1,48 @@ +from xml import sax + + +def get_cycles(recipe_xml_filepath): + """ + returns the number of cycles found in Recipe*.xml + """ + handler = CycleXmlHandler() + sax.parse(recipe_xml_filepath, handler) + return handler.cycle_count + + + +class CycleXmlHandler(sax.ContentHandler): + + def __init__(self): + self.cycle_count = 0 + self.in_protocol = False + sax.ContentHandler.__init__(self) + + + def startDocument(self): + self.cycle_count = 0 + self.in_protocol = False + + + def startElement(self, name, attrs): + + #Only count Incorporations as cycles if within + # the protocol section of the xml document. + if name == "Incorporation" and self.in_protocol: + #print 'Found a cycle!' + self.cycle_count += 1 + return + + elif name == 'Protocol': + #print 'In protocol' + self.in_protocol = True + return + + #print 'Skipping: %s' % (name) + + + def endElement(self, name): + + if name == 'Protocol': + #print 'End protocol' + self.in_protocol = False diff --git a/htsworkflow/pipelines/retrieve_config.py b/htsworkflow/pipelines/retrieve_config.py new file mode 100644 index 0000000..380c0de --- /dev/null +++ b/htsworkflow/pipelines/retrieve_config.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python + +from optparse import OptionParser, IndentedHelpFormatter +from ConfigParser import SafeConfigParser + +import logging +import os +import sys +import urllib2 + +CONFIG_SYSTEM = '/etc/hts_frontend/hts_frontend.conf' +CONFIG_USER = os.path.expanduser('~/.hts_frontend.conf') + +#Disable or enable commandline arg parsing; disabled by default. +DISABLE_CMDLINE = True + +class FlowCellNotFound(Exception): pass +class WebError404(Exception): pass + +class DummyOptions: + """ + Used when command line parsing is disabled; default + """ + def __init__(self): + self.url = None + self.output_filepath = None + self.flowcell = None + self.genome_dir = None + +class PreformattedDescriptionFormatter(IndentedHelpFormatter): + + #def format_description(self, description): + # + # if description: + # return description + "\n" + # else: + # return "" + + def format_epilog(self, epilog): + """ + It was removing my preformated epilog, so this should override + that behavior! Muhahaha! + """ + if epilog: + return "\n" + epilog + "\n" + else: + return "" + + +def constructOptionParser(): + """ + returns a pre-setup optparser + """ + global DISABLE_CMDLINE + + if DISABLE_CMDLINE: + return None + + parser = OptionParser(formatter=PreformattedDescriptionFormatter()) + + parser.set_description('Retrieves eland config file from hts_frontend web frontend.') + + parser.epilog = """ +Config File: + * %s (System wide) + * %s (User specific; overrides system) + * command line overrides all config file options + + Example Config File: + + [config_file_server] + base_host_url=http://somewhere.domain:port +""" % (CONFIG_SYSTEM, CONFIG_USER) + + #Special formatter for allowing preformatted description. + ##parser.format_epilog(PreformattedDescriptionFormatter()) + + parser.add_option("-u", "--url", + action="store", type="string", dest="url") + + parser.add_option("-o", "--output", + action="store", type="string", dest="output_filepath") + + parser.add_option("-f", "--flowcell", + action="store", type="string", dest="flowcell") + + parser.add_option("-g", "--genome_dir", + action="store", type="string", dest="genome_dir") + + #parser.set_default("url", "default") + + return parser + +def constructConfigParser(): + """ + returns a pre-setup config parser + """ + parser = SafeConfigParser() + parser.read([CONFIG_SYSTEM, CONFIG_USER]) + if not parser.has_section('config_file_server'): + parser.add_section('config_file_server') + if not parser.has_section('local_setup'): + parser.add_section('local_setup') + + return parser + + +def getCombinedOptions(): + """ + Returns optparse options after it has be updated with ConfigParser + config files and merged with parsed commandline options. + """ + cl_parser = constructOptionParser() + conf_parser = constructConfigParser() + + if cl_parser is None: + options = DummyOptions() + else: + options, args = cl_parser.parse_args() + + if options.url is None: + if conf_parser.has_option('config_file_server', 'base_host_url'): + options.url = conf_parser.get('config_file_server', 'base_host_url') + + if options.genome_dir is None: + if conf_parser.has_option('local_setup', 'genome_dir'): + options.genome_dir = conf_parser.get('local_setup', 'genome_dir') + + print 'USING OPTIONS:' + print ' URL:', options.url + print ' OUT:', options.output_filepath + print ' FC:', options.flowcell + print 'GDIR:', options.genome_dir + print '' + + return options + + +def saveConfigFile(flowcell, base_host_url, output_filepath): + """ + retrieves the flowcell eland config file, give the base_host_url + (i.e. http://sub.domain.edu:port) + """ + url = base_host_url + '/eland_config/%s/' % (flowcell) + + f = open(output_filepath, 'w') + #try: + try: + web = urllib2.urlopen(url) + except urllib2.URLError, e: + errmsg = 'URLError: %d' % (e.code,) + logging.error(errmsg) + logging.error('opened %s' % (url,)) + logging.error('%s' % ( e.read(),)) + raise IOError(errmsg) + + #except IOError, msg: + # if str(msg).find("Connection refused") >= 0: + # print 'Error: Connection refused for: %s' % (url) + # f.close() + # sys.exit(1) + # elif str(msg).find("Name or service not known") >= 0: + # print 'Error: Invalid domain or ip address for: %s' % (url) + # f.close() + # sys.exit(2) + # else: + # raise IOError, msg + + data = web.read() + + if data.find('Hmm, config file for') >= 0: + msg = "Flowcell (%s) not found in DB; full url(%s)" % (flowcell, url) + raise FlowCellNotFound, msg + + if data.find('404 - Not Found') >= 0: + msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \ + "Did you get right port #?" % (flowcell, base_host_url, url) + raise FlowCellNotFound, msg + + f.write(data) + web.close() + f.close() + logging.info('Wrote config file to %s' % (output_filepath,)) + + diff --git a/htsworkflow/pipelines/run_status.py b/htsworkflow/pipelines/run_status.py new file mode 100644 index 0000000..39dc54c --- /dev/null +++ b/htsworkflow/pipelines/run_status.py @@ -0,0 +1,478 @@ +import glob +import re +import os +import sys +import time +import threading + +s_comment = re.compile('^#') +s_general_read_len = re.compile('^READ_LENGTH ') +s_read_len = re.compile('^[1-8]+:READ_LENGTH ') + +s_firecrest = None + +def _four_digit_num_in_string(num): + if num < 0: + pass + elif num < 10: + return '000' + str(num) + elif num < 100: + return '00' + str(num) + elif num < 1000: + return '0' + str(num) + elif num < 10000: + return str(num) + + msg = 'Invalid number: %s' % (num) + raise ValueError, msg + +def _two_digit_num_in_string(num): + if num < 0: + pass + elif num < 10: + return '0' + str(num) + elif num < 100: + return str(num) + + msg = 'Invalid number: %s' % (num) + raise ValueError, msg + + +# FIRECREST PATTERNS +# _p2f(, lane, tile, cycle) +PATTERN_FIRECREST_QCM = 's_%s_%s_%s_qcm.xml' + +# _p2f(, lane, tile) +PATTERN_FIRECREST_INT = 's_%s_%s_02_int.txt' +PATTERN_FIRECREST_NSE = 's_%s_%s_nse.txt.gz' +PATTERN_FIRECREST_POS = 's_%s_%s_pos.txt' +PATTERN_FIRECREST_IDX = 's_%s_%s_idx.txt' +PATTERN_FIRECREST_CLU1 = 's_%s_%s_01_1_clu.txt' +PATTERN_FIRECREST_CLU2 = 's_%s_%s_01_2_clu.txt' +PATTERN_FIRECREST_CLU3 = 's_%s_%s_01_3_clu.txt' +PATTERN_FIRECREST_CLU4 = 's_%s_%s_01_4_clu.txt' + + +# BUSTARD PATTERNS +# _p2f(, lane, tile) +PATTERN_BUSTARD_SIG2 = 's_%s_%s_sig2.txt' +PATTERN_BUSTARD_PRB = 's_%s_%s_prb.txt' + + + +# GERALD PATTERNS +# _p2f(, lane, tile) +PATTERN_GERALD_ALLTMP = 's_%s_%s_all.txt.tmp' +PATTERN_GERALD_QRAWTMP = 's_%s_%s_qraw.txt.tmp' +PATTERN_GERALD_ALLPNGTMP = 's_%s_%s_all.tmp.png' +PATTERN_GERALD_ALIGNTMP = 's_%s_%s_align.txt.tmp' +PATTERN_GERALD_QVALTMP = 's_%s_%s_qval.txt.tmp' +PATTERN_GERALD_SCORETMP = 's_%s_%s_score.txt.tmp' +PATTERN_GERALD_PREALIGNTMP = 's_%s_%s_prealign.txt.tmp' +PATTERN_GERALD_REALIGNTMP = 's_%s_%s_realign.txt.tmp' +PATTERN_GERALD_RESCORETMP = 's_%s_%s_rescore.txt.tmp' +PATTERN_GERALD_RESCOREPNG = 's_%s_%s_rescore.png' +PATTERN_GERALD_ERRORSTMPPNG = 's_%s_%s_errors.tmp.png' +PATTERN_GERALD_QCALTMP = 's_%s_%s_qcal.txt.tmp' +PATTERN_GERALD_QVAL = 's_%s_%s_qval.txt' + +# _p2f(, lane) +PATTERN_GERALD_SEQPRETMP = 's_%s_seqpre.txt.tmp' +PATTERN_GERALD_RESULTTMP = 's_%s_eland_result.txt.tmp' +PATTERN_GERALD_SIGMEANSTMP = 's_%s_Signal_Means.txt.tmp' +PATTERN_GERALD_CALLPNG = 's_%s_call.png' +PATTERN_GERALD_ALLPNG = 's_%s_all.png' +PATTERN_GERALD_PERCENTALLPNG = 's_%s_percent_all.png' +PATTERN_GERALD_PERCENTCALLPNG = 's_%s_percent_call.png' +PATTERN_GERALD_PERCENTBASEPNG = 's_%s_percent_base.png' +PATTERN_GERALD_FILTTMP = 's_%s_filt.txt.tmp' +PATTERN_GERALD_FRAGTMP = 's_%s_frag.txt.tmp' +PATTERN_GERALD_QREPORTTMP = 's_%s_qreport.txt.tmp' +PATTERN_GERALD_QTABLETMP = 's_%s_qtable.txt.tmp' +PATTERN_GERALD_QCALREPORTTMP = 's_%s_qcalreport.txt.tmp' +PATTERN_GERALD_SEQUENCETMP = 's_%s_sequence.txt.tmp' +PATTERN_GERALD_LANEFINISHED = 's_%s_finished.txt' + + + +def _p2f(pattern, lane, tile=None, cycle=None): + """ + Converts a pattern plus info into file names + """ + + # lane, and cycle provided (INVALID) + if tile is None and cycle is not None: + msg = "Handling of cycle without tile is not currently implemented." + raise ValueError, msg + + # lane, tile, cycle provided + elif cycle: + return pattern % (lane, + _four_digit_num_in_string(tile), + _two_digit_num_in_string(cycle)) + + # lane, tile provided + elif tile: + return pattern % (lane, _four_digit_num_in_string(tile)) + + # lane provided + else: + return pattern % (lane) + + +class GARunStatus(object): + + def __init__(self, conf_filepath): + """ + Given an eland config file in the top level directory + of a run, predicts the files that will be generated + during a run and provides methods for retrieving + (completed, total) for each step or entire run. + """ + #print 'self._conf_filepath = %s' % (conf_filepath) + self._conf_filepath = conf_filepath + self._base_dir, junk = os.path.split(conf_filepath) + self._image_dir = os.path.join(self._base_dir, 'Images') + + self.lanes = [] + self.lane_read_length = {} + self.tiles = None + self.cycles = None + + self.status = {} + self.status['firecrest'] = {} + self.status['bustard'] = {} + self.status['gerald'] = {} + + self._process_config() + self._count_tiles() + self._count_cycles() + self._generate_expected() + + + def _process_config(self): + """ + Grabs info from self._conf_filepath + """ + f = open(self._conf_filepath, 'r') + + for line in f: + + #Skip comment lines for now. + if s_comment.search(line): + continue + + mo = s_general_read_len.search(line) + if mo: + read_length = int(line[mo.end():]) + #Handle general READ_LENGTH + for i in range(1,9): + self.lane_read_length[i] = read_length + + mo = s_read_len.search(line) + if mo: + read_length = int(line[mo.end():]) + lanes, junk = line.split(':') + + #Convert lanes from string of lanes to list of lane #s. + lanes = [ int(i) for i in lanes ] + + + for lane in lanes: + + #Keep track of which lanes are being run. + if lane not in self.lanes: + self.lanes.append(lane) + + #Update with lane specific read lengths + self.lane_read_length[lane] = read_length + + self.lanes.sort() + + + def _count_tiles(self): + """ + Count the number of tiles being used + """ + self.tiles = len(glob.glob(os.path.join(self._image_dir, + 'L001', + 'C1.1', + 's_1_*_a.tif'))) + + def _count_cycles(self): + """ + Figures out the number of cycles that are available + """ + #print 'self._image_dir = %s' % (self._image_dir) + cycle_dirs = glob.glob(os.path.join(self._image_dir, 'L001', 'C*.1')) + #print 'cycle_dirs = %s' % (cycle_dirs) + cycle_list = [] + for cycle_dir in cycle_dirs: + junk, c = os.path.split(cycle_dir) + cycle_list.append(int(c[1:c.find('.')])) + + self.cycles = max(cycle_list) + + + + + def _generate_expected(self): + """ + generates a list of files we expect to find. + """ + + firecrest = self.status['firecrest'] + bustard = self.status['bustard'] + gerald = self.status['gerald'] + + + for lane in self.lanes: + for tile in range(1,self.tiles+1): + for cycle in range(1, self.cycles+1): + + ########################## + # LANE, TILE, CYCLE LAYER + + # FIRECREST + firecrest[_p2f(PATTERN_FIRECREST_QCM, lane, tile, cycle)] = False + + + ################### + # LANE, TILE LAYER + + # FIRECREST + firecrest[_p2f(PATTERN_FIRECREST_INT, lane, tile)] = False + firecrest[_p2f(PATTERN_FIRECREST_NSE, lane, tile)] = False + firecrest[_p2f(PATTERN_FIRECREST_POS, lane, tile)] = False + firecrest[_p2f(PATTERN_FIRECREST_IDX, lane, tile)] = False + firecrest[_p2f(PATTERN_FIRECREST_CLU1, lane, tile)] = False + firecrest[_p2f(PATTERN_FIRECREST_CLU2, lane, tile)] = False + firecrest[_p2f(PATTERN_FIRECREST_CLU3, lane, tile)] = False + firecrest[_p2f(PATTERN_FIRECREST_CLU4, lane, tile)] = False + + + # BUSTARD + bustard[_p2f(PATTERN_BUSTARD_SIG2, lane, tile)] = False + bustard[_p2f(PATTERN_BUSTARD_PRB, lane, tile)] = False + + + # GERALD + #gerald[_p2f(PATTERN_GERALD_ALLTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_QRAWTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_ALLPNGTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_ALIGNTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_QVALTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_SCORETMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_PREALIGNTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_REALIGNTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_RESCORETMP, lane, tile)] = False + gerald[_p2f(PATTERN_GERALD_RESCOREPNG, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_ERRORSTMPPNG, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_QCALTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_QVAL, lane, tile)] = False + + ################### + # LANE LAYER + + # GERALD + #gerald[_p2f(PATTERN_GERALD_SEQPRETMP, lane)] = False + #gerald[_p2f(PATTERN_GERALD_RESULTTMP, lane)] = False + #gerald[_p2f(PATTERN_GERALD_SIGMEANSTMP, lane)] = False + gerald[_p2f(PATTERN_GERALD_CALLPNG, lane)] = False + gerald[_p2f(PATTERN_GERALD_ALLPNG, lane)] = False + gerald[_p2f(PATTERN_GERALD_PERCENTALLPNG, lane)] = False + gerald[_p2f(PATTERN_GERALD_PERCENTCALLPNG, lane)] = False + gerald[_p2f(PATTERN_GERALD_PERCENTBASEPNG, lane)] = False + #gerald[_p2f(PATTERN_GERALD_FILTTMP, lane)] = False + #gerald[_p2f(PATTERN_GERALD_FRAGTMP, lane)] = False + #gerald[_p2f(PATTERN_GERALD_QREPORTTMP, lane)] = False + #gerald[_p2f(PATTERN_GERALD_QTABLETMP, lane)] = False + #gerald[_p2f(PATTERN_GERALD_QCALREPORTTMP, lane)] = False + #gerald[_p2f(PATTERN_GERALD_SEQUENCETMP, lane)] = False + gerald[_p2f(PATTERN_GERALD_LANEFINISHED, lane)] = False + + + + ################# + # LOOPS FINISHED + + # FIRECREST + firecrest['offsets_finished.txt'] = False + firecrest['finished.txt'] = False + + # BUSTARD + bustard['finished.txt'] = False + + # GERALD + gerald['tiles.txt'] = False + gerald['FullAll.htm'] = False + #gerald['All.htm.tmp'] = False + #gerald['Signal_Means.txt.tmp'] = False + #gerald['plotIntensity_for_IVC'] = False + #gerald['IVC.htm.tmp'] = False + gerald['FullError.htm'] = False + gerald['FullPerfect.htm'] = False + #gerald['Error.htm.tmp'] = False + #gerald['Perfect.htm.tmp'] = False + #gerald['Summary.htm.tmp'] = False + #gerald['Tile.htm.tmp'] = False + gerald['finished.txt'] = False + + def statusFirecrest(self): + """ + returns (, ) + """ + firecrest = self.status['firecrest'] + total = len(firecrest) + completed = firecrest.values().count(True) + + return (completed, total) + + + def statusBustard(self): + """ + returns (, ) + """ + bustard = self.status['bustard'] + total = len(bustard) + completed = bustard.values().count(True) + + return (completed, total) + + + def statusGerald(self): + """ + returns (, ) + """ + gerald = self.status['gerald'] + total = len(gerald) + completed = gerald.values().count(True) + + return (completed, total) + + + def statusTotal(self): + """ + returns (, ) + """ + #f = firecrest c = completed + #b = bustard t = total + #g = gerald + fc, ft = self.statusFirecrest() + bc, bt = self.statusBustard() + gc, gt = self.statusGerald() + + return (fc+bc+gc, ft+bt+gt) + + + def statusReport(self): + """ + Generate the basic percent complete report + """ + def _percentCompleted(completed, total): + """ + Returns precent completed as float + """ + return (completed / float(total)) * 100 + + fc, ft = self.statusFirecrest() + bc, bt = self.statusBustard() + gc, gt = self.statusGerald() + tc, tt = self.statusTotal() + + fp = _percentCompleted(fc, ft) + bp = _percentCompleted(bc, bt) + gp = _percentCompleted(gc, gt) + tp = _percentCompleted(tc, tt) + + report = ['Firecrest: %s%% (%s/%s)' % (fp, fc, ft), + ' Bustard: %s%% (%s/%s)' % (bp, bc, bt), + ' Gerald: %s%% (%s/%s)' % (gp, gc, gt), + '-----------------------', + ' Total: %s%% (%s/%s)' % (tp, tc, tt), + ] + return report + + def updateFirecrest(self, filename): + """ + Marks firecrest filename as being completed. + """ + self.status['firecrest'][filename] = True + + + def updateBustard(self, filename): + """ + Marks bustard filename as being completed. + """ + self.status['bustard'][filename] = True + + + def updateGerald(self, filename): + """ + Marks gerald filename as being completed. + """ + self.status['gerald'][filename] = True + + + +################################################## +# Functions to be called by Thread(target=) +def _cmdLineStatusMonitorFunc(conf_info): + """ + Given a ConfigInfo object, provides status to stdout. + + You should probably use startCmdLineStatusMonitor() + instead of ths function. + + Use with: + t = threading.Thread(target=_cmdLineStatusMonitorFunc, + args=[conf_info]) + t.setDaemon(True) + t.start() + """ + SLEEP_AMOUNT = 30 + + while 1: + if conf_info.status is None: + print "No status object yet." + time.sleep(SLEEP_AMOUNT) + continue + + report = conf_info.status.statusReport() + print os.linesep.join(report) + print + + time.sleep(SLEEP_AMOUNT) + + +############################################# +# Start monitor thread convenience functions +def startCmdLineStatusMonitor(conf_info): + """ + Starts a command line status monitor given a conf_info object. + """ + t = threading.Thread(target=_cmdLineStatusMonitorFunc, args=[conf_info]) + t.setDaemon(True) + t.start() + +from optparse import OptionParser +def make_parser(): + usage = "%prog: config file" + + parser = OptionParser() + return parser + +def main(cmdline=None): + parser = make_parser() + opt, args = parser.parse_args(cmdline) + + if len(args) != 1: + parser.error("need name of configuration file") + + status = GARunStatus(args[0]) + print os.linesep.join(status.statusReport()) + return 0 + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) + diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py new file mode 100644 index 0000000..dee3231 --- /dev/null +++ b/htsworkflow/pipelines/runfolder.py @@ -0,0 +1,313 @@ +""" +Core information needed to inspect a runfolder. +""" +from glob import glob +import logging +import os +import re +import shutil +import stat +import subprocess +import sys +import time + +try: + from xml.etree import ElementTree +except ImportError, e: + from elementtree import ElementTree + +EUROPEAN_STRPTIME = "%d-%m-%Y" +EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})" +VERSION_RE = "([0-9\.]+)" +USER_RE = "([a-zA-Z0-9]+)" +LANES_PER_FLOWCELL = 8 + +from htsworkflow.util.alphanum import alphanum +from htsworkflow.util.ethelp import indent, flatten + + +class PipelineRun(object): + """ + Capture "interesting" information about a pipeline run + """ + XML_VERSION = 1 + PIPELINE_RUN = 'PipelineRun' + FLOWCELL_ID = 'FlowcellID' + + def __init__(self, pathname=None, firecrest=None, bustard=None, gerald=None, xml=None): + if pathname is not None: + self.pathname = os.path.normpath(pathname) + else: + self.pathname = None + self._name = None + self._flowcell_id = None + self.firecrest = firecrest + self.bustard = bustard + self.gerald = gerald + + if xml is not None: + self.set_elements(xml) + + def _get_flowcell_id(self): + # extract flowcell ID + if self._flowcell_id is None: + config_dir = os.path.join(self.pathname, 'Config') + flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml') + if os.path.exists(flowcell_id_path): + flowcell_id_tree = ElementTree.parse(flowcell_id_path) + self._flowcell_id = flowcell_id_tree.findtext('Text') + else: + path_fields = self.pathname.split('_') + if len(path_fields) > 0: + # guessing last element of filename + flowcell_id = path_fields[-1] + else: + flowcell_id = 'unknown' + + logging.warning( + "Flowcell id was not found, guessing %s" % ( + flowcell_id)) + self._flowcell_id = flowcell_id + return self._flowcell_id + flowcell_id = property(_get_flowcell_id) + + def get_elements(self): + """ + make one master xml file from all of our sub-components. + """ + root = ElementTree.Element(PipelineRun.PIPELINE_RUN) + flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID) + flowcell.text = self.flowcell_id + root.append(self.firecrest.get_elements()) + root.append(self.bustard.get_elements()) + root.append(self.gerald.get_elements()) + return root + + def set_elements(self, tree): + # this file gets imported by all the others, + # so we need to hide the imports to avoid a cyclic imports + from htsworkflow.pipelines import firecrest + from htsworkflow.pipelines import bustard + from htsworkflow.pipelines import gerald + + tag = tree.tag.lower() + if tag != PipelineRun.PIPELINE_RUN.lower(): + raise ValueError('Pipeline Run Expecting %s got %s' % ( + PipelineRun.PIPELINE_RUN, tag)) + for element in tree: + tag = element.tag.lower() + if tag == PipelineRun.FLOWCELL_ID.lower(): + self._flowcell_id = element.text + #ok the xword.Xword.XWORD pattern for module.class.constant is lame + elif tag == firecrest.Firecrest.FIRECREST.lower(): + self.firecrest = firecrest.Firecrest(xml=element) + elif tag == bustard.Bustard.BUSTARD.lower(): + self.bustard = bustard.Bustard(xml=element) + elif tag == gerald.Gerald.GERALD.lower(): + self.gerald = gerald.Gerald(xml=element) + else: + logging.warn('PipelineRun unrecognized tag %s' % (tag,)) + + def _get_run_name(self): + """ + Given a run tuple, find the latest date and use that as our name + """ + if self._name is None: + tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time) + timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax)) + self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml' + return self._name + name = property(_get_run_name) + + def save(self, destdir=None): + if destdir is None: + destdir = '' + logging.info("Saving run report "+ self.name) + xml = self.get_elements() + indent(xml) + dest_pathname = os.path.join(destdir, self.name) + ElementTree.ElementTree(xml).write(dest_pathname) + + def load(self, filename): + logging.info("Loading run report from " + filename) + tree = ElementTree.parse(filename).getroot() + self.set_elements(tree) + +def get_runs(runfolder): + """ + Search through a run folder for all the various sub component runs + and then return a PipelineRun for each different combination. + + For example if there are two different GERALD runs, this will + generate two different PipelineRun objects, that differ + in there gerald component. + """ + from htsworkflow.pipelines import firecrest + from htsworkflow.pipelines import bustard + from htsworkflow.pipelines import gerald + + datadir = os.path.join(runfolder, 'Data') + + logging.info('Searching for runs in ' + datadir) + runs = [] + for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")): + f = firecrest.firecrest(firecrest_pathname) + bustard_glob = os.path.join(firecrest_pathname, "Bustard*") + for bustard_pathname in glob(bustard_glob): + b = bustard.bustard(bustard_pathname) + gerald_glob = os.path.join(bustard_pathname, 'GERALD*') + for gerald_pathname in glob(gerald_glob): + try: + g = gerald.gerald(gerald_pathname) + runs.append(PipelineRun(runfolder, f, b, g)) + except IOError, e: + print "Ignoring", str(e) + return runs + + +def extract_run_parameters(runs): + """ + Search through runfolder_path for various runs and grab their parameters + """ + for run in runs: + run.save() + +def summarize_mapped_reads(mapped_reads): + """ + Summarize per chromosome reads into a genome count + But handle spike-in/contamination symlinks seperately. + """ + summarized_reads = {} + genome_reads = 0 + genome = 'unknown' + for k, v in mapped_reads.items(): + path, k = os.path.split(k) + if len(path) > 0: + genome = path + genome_reads += v + else: + summarized_reads[k] = summarized_reads.setdefault(k, 0) + v + summarized_reads[genome] = genome_reads + return summarized_reads + +def summary_report(runs): + """ + Summarize cluster numbers and mapped read counts for a runfolder + """ + report = [] + for run in runs: + # print a run name? + report.append('Summary for %s' % (run.name,)) + # sort the report + eland_keys = run.gerald.eland_results.results.keys() + eland_keys.sort(alphanum) + + lane_results = run.gerald.summary.lane_results + for lane_id in eland_keys: + result = run.gerald.eland_results.results[lane_id] + report.append("Sample name %s" % (result.sample_name)) + report.append("Lane id %s" % (result.lane_id,)) + cluster = lane_results[result.lane_id].cluster + report.append("Clusters %d +/- %d" % (cluster[0], cluster[1])) + report.append("Total Reads: %d" % (result.reads)) + mc = result._match_codes + nm = mc['NM'] + nm_percent = float(nm)/result.reads * 100 + qc = mc['QC'] + qc_percent = float(qc)/result.reads * 100 + + report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent)) + report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent)) + report.append('Unique (0,1,2 mismatches) %d %d %d' % \ + (mc['U0'], mc['U1'], mc['U2'])) + report.append('Repeat (0,1,2 mismatches) %d %d %d' % \ + (mc['R0'], mc['R1'], mc['R2'])) + report.append("Mapped Reads") + mapped_reads = summarize_mapped_reads(result.mapped_reads) + for name, counts in mapped_reads.items(): + report.append(" %s: %d" % (name, counts)) + report.append('---') + report.append('') + return os.linesep.join(report) + +def extract_results(runs, output_base_dir=None): + if output_base_dir is None: + output_base_dir = os.getcwd() + + for r in runs: + result_dir = os.path.join(output_base_dir, r.flowcell_id) + logging.info("Using %s as result directory" % (result_dir,)) + if not os.path.exists(result_dir): + os.mkdir(result_dir) + + # create cycle_dir + cycle = "C%d-%d" % (r.firecrest.start, r.firecrest.stop) + logging.info("Filling in %s" % (cycle,)) + cycle_dir = os.path.join(result_dir, cycle) + if os.path.exists(cycle_dir): + logging.error("%s already exists, not overwriting" % (cycle_dir,)) + continue + else: + os.mkdir(cycle_dir) + + # copy stuff out of the main run + g = r.gerald + + # save run file + r.save(cycle_dir) + + # Copy Summary.htm + summary_path = os.path.join(r.gerald.pathname, 'Summary.htm') + if os.path.exists(summary_path): + logging.info('Copying %s to %s' % (summary_path, cycle_dir)) + shutil.copy(summary_path, cycle_dir) + else: + logging.info('Summary file %s was not found' % (summary_path,)) + + # tar score files + score_files = [] + for f in os.listdir(g.pathname): + if re.match('.*_score.txt', f): + score_files.append(f) + + tar_cmd = ['/bin/tar', 'c'] + score_files + bzip_cmd = [ 'bzip2', '-9', '-c' ] + tar_dest_name =os.path.join(cycle_dir, 'scores.tar.bz2') + tar_dest = open(tar_dest_name, 'w') + logging.info("Compressing score files in %s" % (g.pathname,)) + logging.info("Running tar: " + " ".join(tar_cmd[:10])) + logging.info("Running bzip2: " + " ".join(bzip_cmd)) + logging.info("Writing to %s" %(tar_dest_name)) + + tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, cwd=g.pathname) + bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest) + tar.wait() + + # copy & bzip eland files + for eland_lane in g.eland_results.values(): + source_name = eland_lane.pathname + path, name = os.path.split(eland_lane.pathname) + dest_name = os.path.join(cycle_dir, name+'.bz2') + + args = ['bzip2', '-9', '-c', source_name] + logging.info('Running: %s' % ( " ".join(args) )) + bzip_dest = open(dest_name, 'w') + bzip = subprocess.Popen(args, stdout=bzip_dest) + logging.info('Saving to %s' % (dest_name, )) + bzip.wait() + +def clean_runs(runs): + """ + Clean up run folders to optimize for compression. + """ + # TODO: implement this. + # rm RunLog*.xml + # rm pipeline_*.txt + # rm gclog.txt + # rm NetCopy.log + # rm nfn.log + # rm Images/L* + # cd Data/C1-*_Firecrest* + # make clean_intermediate + + pass diff --git a/htsworkflow/pipelines/test/test_genome_mapper.py b/htsworkflow/pipelines/test/test_genome_mapper.py new file mode 100644 index 0000000..8ba1ba5 --- /dev/null +++ b/htsworkflow/pipelines/test/test_genome_mapper.py @@ -0,0 +1,33 @@ +import unittest + +from StringIO import StringIO +from htsworkflow.pipelines import genome_mapper + +class testGenomeMapper(unittest.TestCase): + def test_construct_mapper(self): + genomes = { + 'Arabidopsis thaliana': {'v01212004': '/arabidopsis'}, + 'Homo sapiens': {'hg18': '/hg18'}, + 'Mus musculus': {'mm8': '/mm8', + 'mm9': '/mm9', + 'mm10': '/mm10'}, + 'Phage': {'174': '/phi'}, + } + genome_map = genome_mapper.constructMapperDict(genomes) + + self.failUnlessEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8") + self.failUnlessEqual("%(Phage|174)s" % (genome_map), "/phi") + self.failUnlessEqual("%(Mus musculus)s" % (genome_map), "/mm10") + self.failUnlessEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8") + self.failUnlessEqual("%(Mus musculus|mm10)s" % (genome_map), "/mm10") + + self.failUnlessEqual(len(genome_map.keys()), 6) + self.failUnlessEqual(len(genome_map.values()), 6) + self.failUnlessEqual(len(genome_map.items()), 6) + + +def suite(): + return unittest.makeSuite(testGenomeMapper,'test') + +if __name__ == "__main__": + unittest.main(defaultTest="suite") diff --git a/htsworkflow/pipelines/test/test_runfolder026.py b/htsworkflow/pipelines/test/test_runfolder026.py new file mode 100644 index 0000000..40d29cf --- /dev/null +++ b/htsworkflow/pipelines/test/test_runfolder026.py @@ -0,0 +1,601 @@ +#!/usr/bin/env python + +from datetime import datetime, date +import os +import tempfile +import shutil +import unittest + +from htsworkflow.pipelines import firecrest +from htsworkflow.pipelines import bustard +from htsworkflow.pipelines import gerald +from htsworkflow.pipelines import runfolder +from htsworkflow.pipelines.runfolder import ElementTree + + +def make_flowcell_id(runfolder_dir, flowcell_id=None): + if flowcell_id is None: + flowcell_id = '207BTAAXY' + + config = """ + + %s +""" % (flowcell_id,) + config_dir = os.path.join(runfolder_dir, 'Config') + + if not os.path.exists(config_dir): + os.mkdir(config_dir) + pathname = os.path.join(config_dir, 'FlowcellId.xml') + f = open(pathname,'w') + f.write(config) + f.close() + +def make_matrix(matrix_dir): + contents = """# Auto-generated frequency response matrix +> A +> C +> G +> T +0.77 0.15 -0.04 -0.04 +0.76 1.02 -0.05 -0.06 +-0.10 -0.10 1.17 -0.03 +-0.13 -0.12 0.80 1.27 +""" + s_matrix = os.path.join(matrix_dir, 's_matrix.txt') + f = open(s_matrix, 'w') + f.write(contents) + f.close() + +def make_phasing_params(bustard_dir): + for lane in range(1,9): + pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane)) + f = open(pathname, 'w') + f.write(""" + 0.009900 + 0.003500 + +""") + f.close() + +def make_gerald_config(gerald_dir): + config_xml = """ + + default + + + + + Need_to_specify_ELAND_genome_directory + 8 + + domain.com + diane + localhost:25 + /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane + /home/diane/gec + 1 + /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes + Need_to_specify_genome_file_name + genome + /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane + + _prb.txt + 12 + '((CHASTITY>=0.6))' + _qhg.txt + --symbolic + 32 + --scarf + _seq.txt + _sig2.txt + _sig.txt + @(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp + s_[1-8]_[0-9][0-9][0-9][0-9] + s + Sat Apr 19 19:08:30 2008 + /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald + all + http://host.domain.com/yourshare/ + + + + eland + eland + eland + eland + eland + eland + eland + eland + + + /g/dm3 + /g/equcab1 + /g/equcab1 + /g/canfam2 + /g/hg18 + /g/hg18 + /g/hg18 + /g/hg18 + + + 32 + 32 + 32 + 32 + 32 + 32 + 32 + 32 + + + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + + + +""" + pathname = os.path.join(gerald_dir, 'config.xml') + f = open(pathname,'w') + f.write(config_xml) + f.close() + + +def make_summary_htm(gerald_dir): + summary_htm = """ + + + + +

080416_HWI-EAS229_0024_207BTAAXX Summary

+

Summary Information For Experiment 080416_HWI-EAS229_0024_207BTAAXX on Machine HWI-EAS229

+



Chip Summary

+ + + + +
MachineHWI-EAS229
Run Folder080416_HWI-EAS229_0024_207BTAAXX
Chip IDunknown
+



Lane Parameter Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LaneSample IDSample TargetSample TypeLengthFilterTiles
1unknowndm3ELAND32'((CHASTITY>=0.6))'Lane 1
2unknownequcab1ELAND32'((CHASTITY>=0.6))'Lane 2
3unknownequcab1ELAND32'((CHASTITY>=0.6))'Lane 3
4unknowncanfam2ELAND32'((CHASTITY>=0.6))'Lane 4
5unknownhg18ELAND32'((CHASTITY>=0.6))'Lane 5
6unknownhg18ELAND32'((CHASTITY>=0.6))'Lane 6
7unknownhg18ELAND32'((CHASTITY>=0.6))'Lane 7
8unknownhg18ELAND32'((CHASTITY>=0.6))'Lane 8
+



Lane Results Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Lane Clusters Av 1st Cycle Int % intensity after 20 cycles % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
117421 +/- 21397230 +/- 80123.73 +/- 10.7913.00 +/- 22.9132.03 +/- 18.456703.57 +/- 3753.854.55 +/- 4.81
220311 +/- 24027660 +/- 67817.03 +/- 4.4040.74 +/- 30.3329.54 +/- 9.035184.02 +/- 1631.543.27 +/- 3.94
320193 +/- 23997700 +/- 79715.75 +/- 3.3056.56 +/- 17.1627.33 +/- 7.484803.49 +/- 1313.313.07 +/- 2.86
415537 +/- 25317620 +/- 139215.37 +/- 3.7963.05 +/- 18.3015.88 +/- 4.993162.13 +/- 962.593.11 +/- 2.22
532047 +/- 33568093 +/- 83123.79 +/- 6.1853.36 +/- 18.0648.04 +/- 13.779866.23 +/- 2877.302.26 +/- 1.16
632946 +/- 47538227 +/- 73624.07 +/- 4.6954.65 +/- 12.5750.98 +/- 10.5410468.86 +/- 2228.532.21 +/- 2.33
739504 +/- 41718401 +/- 78522.55 +/- 4.5645.22 +/- 10.3448.41 +/- 9.679829.40 +/- 1993.202.26 +/- 1.11
837998 +/- 37928443 +/- 121139.03 +/- 7.5242.16 +/- 12.3540.98 +/- 14.898128.87 +/- 3055.343.57 +/- 2.77
+ + +""" + pathname = os.path.join(gerald_dir, 'Summary.htm') + f = open(pathname, 'w') + f.write(summary_htm) + f.close() + +def make_eland_results(gerald_dir): + eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D. +>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T +>HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0 +>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T +""" + for i in range(1,9): + pathname = os.path.join(gerald_dir, + 's_%d_eland_result.txt' % (i,)) + f = open(pathname, 'w') + f.write(eland_result) + f.close() + +class RunfolderTests(unittest.TestCase): + """ + Test components of the runfolder processing code + which includes firecrest, bustard, and gerald + """ + def setUp(self): + # make a fake runfolder directory + self.temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_') + + self.runfolder_dir = os.path.join(self.temp_dir, + '080102_HWI-EAS229_0010_207BTAAXX') + os.mkdir(self.runfolder_dir) + + self.data_dir = os.path.join(self.runfolder_dir, 'Data') + os.mkdir(self.data_dir) + + self.firecrest_dir = os.path.join(self.data_dir, + 'C1-33_Firecrest1.8.28_12-04-2008_diane' + ) + os.mkdir(self.firecrest_dir) + self.matrix_dir = os.path.join(self.firecrest_dir, 'Matrix') + os.mkdir(self.matrix_dir) + make_matrix(self.matrix_dir) + + self.bustard_dir = os.path.join(self.firecrest_dir, + 'Bustard1.8.28_12-04-2008_diane') + os.mkdir(self.bustard_dir) + make_phasing_params(self.bustard_dir) + + self.gerald_dir = os.path.join(self.bustard_dir, + 'GERALD_12-04-2008_diane') + os.mkdir(self.gerald_dir) + make_gerald_config(self.gerald_dir) + make_summary_htm(self.gerald_dir) + make_eland_results(self.gerald_dir) + + def tearDown(self): + shutil.rmtree(self.temp_dir) + + def test_firecrest(self): + """ + Construct a firecrest object + """ + f = firecrest.firecrest(self.firecrest_dir) + self.failUnlessEqual(f.version, '1.8.28') + self.failUnlessEqual(f.start, 1) + self.failUnlessEqual(f.stop, 33) + self.failUnlessEqual(f.user, 'diane') + self.failUnlessEqual(f.date, date(2008,4,12)) + + xml = f.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + + f2 = firecrest.Firecrest(xml=xml) + self.failUnlessEqual(f.version, f2.version) + self.failUnlessEqual(f.start, f2.start) + self.failUnlessEqual(f.stop, f2.stop) + self.failUnlessEqual(f.user, f2.user) + self.failUnlessEqual(f.date, f2.date) + + def test_bustard(self): + """ + construct a bustard object + """ + b = bustard.bustard(self.bustard_dir) + self.failUnlessEqual(b.version, '1.8.28') + self.failUnlessEqual(b.date, date(2008,4,12)) + self.failUnlessEqual(b.user, 'diane') + self.failUnlessEqual(len(b.phasing), 8) + self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099) + + xml = b.get_elements() + b2 = bustard.Bustard(xml=xml) + self.failUnlessEqual(b.version, b2.version) + self.failUnlessEqual(b.date, b2.date ) + self.failUnlessEqual(b.user, b2.user) + self.failUnlessEqual(len(b.phasing), len(b2.phasing)) + for key in b.phasing.keys(): + self.failUnlessEqual(b.phasing[key].lane, + b2.phasing[key].lane) + self.failUnlessEqual(b.phasing[key].phasing, + b2.phasing[key].phasing) + self.failUnlessEqual(b.phasing[key].prephasing, + b2.phasing[key].prephasing) + + def test_gerald(self): + # need to update gerald and make tests for it + g = gerald.gerald(self.gerald_dir) + + self.failUnlessEqual(g.version, + '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp') + self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30)) + self.failUnlessEqual(len(g.lanes), len(g.lanes.keys())) + self.failUnlessEqual(len(g.lanes), len(g.lanes.items())) + + + # list of genomes, matches what was defined up in + # make_gerald_config. + # the first None is to offset the genomes list to be 1..9 + # instead of pythons default 0..8 + genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2', + '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ] + + # test lane specific parameters from gerald config file + for i in range(1,9): + cur_lane = g.lanes[str(i)] + self.failUnlessEqual(cur_lane.analysis, 'eland') + self.failUnlessEqual(cur_lane.eland_genome, genomes[i]) + self.failUnlessEqual(cur_lane.read_length, '32') + self.failUnlessEqual(cur_lane.use_bases, 'Y'*32) + + # test data extracted from summary file + clusters = [None, + (17421, 2139), (20311, 2402), (20193, 2399), (15537, 2531), + (32047, 3356), (32946, 4753), (39504, 4171), (37998, 3792)] + + for i in range(1,9): + summary_lane = g.summary[str(i)] + self.failUnlessEqual(summary_lane.cluster, clusters[i]) + self.failUnlessEqual(summary_lane.lane, str(i)) + + xml = g.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + g2 = gerald.Gerald(xml=xml) + + # do it all again after extracting from the xml file + self.failUnlessEqual(g.version, g2.version) + self.failUnlessEqual(g.date, g2.date) + self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys())) + self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items())) + + # test lane specific parameters from gerald config file + for i in range(1,9): + g_lane = g.lanes[str(i)] + g2_lane = g2.lanes[str(i)] + self.failUnlessEqual(g_lane.analysis, g2_lane.analysis) + self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome) + self.failUnlessEqual(g_lane.read_length, g2_lane.read_length) + self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases) + + # test (some) summary elements + for i in range(1,9): + g_summary = g.summary[str(i)] + g2_summary = g2.summary[str(i)] + self.failUnlessEqual(g_summary.cluster, g2_summary.cluster) + self.failUnlessEqual(g_summary.lane, g2_summary.lane) + + g_eland = g.eland_results + g2_eland = g2.eland_results + for lane in g_eland.keys(): + self.failUnlessEqual(g_eland[lane].reads, + g2_eland[lane].reads) + self.failUnlessEqual(len(g_eland[lane].mapped_reads), + len(g2_eland[lane].mapped_reads)) + for k in g_eland[lane].mapped_reads.keys(): + self.failUnlessEqual(g_eland[lane].mapped_reads[k], + g2_eland[lane].mapped_reads[k]) + + self.failUnlessEqual(len(g_eland[lane].match_codes), + len(g2_eland[lane].match_codes)) + for k in g_eland[lane].match_codes.keys(): + self.failUnlessEqual(g_eland[lane].match_codes[k], + g2_eland[lane].match_codes[k]) + + + def test_eland(self): + dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa', + 'chr2L.fa': 'dm3/chr2L.fa', + 'Lambda.fa': 'Lambda.fa'} + genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map, + '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map } + eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps) + + for i in range(1,9): + lane = eland[str(i)] + self.failUnlessEqual(lane.reads, 4) + self.failUnlessEqual(lane.sample_name, "s") + self.failUnlessEqual(lane.lane_id, unicode(i)) + self.failUnlessEqual(len(lane.mapped_reads), 3) + self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1) + self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1) + self.failUnlessEqual(lane.match_codes['U1'], 2) + self.failUnlessEqual(lane.match_codes['NM'], 1) + + xml = eland.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + e2 = gerald.ELAND(xml=xml) + + for i in range(1,9): + l1 = eland[str(i)] + l2 = e2[str(i)] + self.failUnlessEqual(l1.reads, l2.reads) + self.failUnlessEqual(l1.sample_name, l2.sample_name) + self.failUnlessEqual(l1.lane_id, l2.lane_id) + self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) + self.failUnlessEqual(len(l1.mapped_reads), 3) + for k in l1.mapped_reads.keys(): + self.failUnlessEqual(l1.mapped_reads[k], + l2.mapped_reads[k]) + + self.failUnlessEqual(len(l1.match_codes), 9) + self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes)) + for k in l1.match_codes.keys(): + self.failUnlessEqual(l1.match_codes[k], + l2.match_codes[k]) + + def test_runfolder(self): + runs = runfolder.get_runs(self.runfolder_dir) + + # do we get the flowcell id from the filename? + self.failUnlessEqual(len(runs), 1) + self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml') + + # do we get the flowcell id from the FlowcellId.xml file + make_flowcell_id(self.runfolder_dir, '207BTAAXY') + runs = runfolder.get_runs(self.runfolder_dir) + self.failUnlessEqual(len(runs), 1) + self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml') + + r1 = runs[0] + xml = r1.get_elements() + xml_str = ElementTree.tostring(xml) + + r2 = runfolder.PipelineRun(xml=xml) + self.failUnlessEqual(r1.name, r2.name) + self.failIfEqual(r2.firecrest, None) + self.failIfEqual(r2.bustard, None) + self.failIfEqual(r2.gerald, None) + + +def suite(): + return unittest.makeSuite(RunfolderTests,'test') + +if __name__ == "__main__": + unittest.main(defaultTest="suite") + diff --git a/htsworkflow/pipelines/test/test_runfolder030.py b/htsworkflow/pipelines/test/test_runfolder030.py new file mode 100644 index 0000000..7457b12 --- /dev/null +++ b/htsworkflow/pipelines/test/test_runfolder030.py @@ -0,0 +1,1024 @@ +#!/usr/bin/env python + +from datetime import datetime, date +import os +import tempfile +import shutil +import unittest + +from htsworkflow.pipelines import firecrest +from htsworkflow.pipelines import bustard +from htsworkflow.pipelines import gerald +from htsworkflow.pipelines import runfolder +from htsworkflow.pipelines.runfolder import ElementTree + + +def make_flowcell_id(runfolder_dir, flowcell_id=None): + if flowcell_id is None: + flowcell_id = '207BTAAXY' + + config = """ + + %s +""" % (flowcell_id,) + config_dir = os.path.join(runfolder_dir, 'Config') + + if not os.path.exists(config_dir): + os.mkdir(config_dir) + pathname = os.path.join(config_dir, 'FlowcellId.xml') + f = open(pathname,'w') + f.write(config) + f.close() + +def make_matrix(matrix_dir): + contents = """# Auto-generated frequency response matrix +> A +> C +> G +> T +0.77 0.15 -0.04 -0.04 +0.76 1.02 -0.05 -0.06 +-0.10 -0.10 1.17 -0.03 +-0.13 -0.12 0.80 1.27 +""" + s_matrix = os.path.join(matrix_dir, 's_matrix.txt') + f = open(s_matrix, 'w') + f.write(contents) + f.close() + +def make_phasing_params(bustard_dir): + for lane in range(1,9): + pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane)) + f = open(pathname, 'w') + f.write(""" + 0.009900 + 0.003500 + +""") + f.close() + +def make_gerald_config(gerald_dir): + config_xml = """ + + default + + + + + Need_to_specify_ELAND_genome_directory + 8 + + domain.com + diane + localhost:25 + /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane + /home/diane/gec + 1 + /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes + Need_to_specify_genome_file_name + genome + /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane + + _prb.txt + 12 + '((CHASTITY>=0.6))' + _qhg.txt + --symbolic + 32 + --scarf + _seq.txt + _sig2.txt + _sig.txt + @(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp + s_[1-8]_[0-9][0-9][0-9][0-9] + s + Sat Apr 19 19:08:30 2008 + /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald + all + http://host.domain.com/yourshare/ + + + + eland + eland + eland + eland + eland + eland + eland + eland + + + /g/dm3 + /g/equcab1 + /g/equcab1 + /g/canfam2 + /g/hg18 + /g/hg18 + /g/hg18 + /g/hg18 + + + 32 + 32 + 32 + 32 + 32 + 32 + 32 + 32 + + + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + + + +""" + pathname = os.path.join(gerald_dir, 'config.xml') + f = open(pathname,'w') + f.write(config_xml) + f.close() + +def make_summary_htm(gerald_dir): + summary_htm=""" + + + + +

080627_HWI-EAS229_0036_3055HAXX Summary

+

Summary Information For Experiment 080627_HWI-EAS229_0036_3055HAXX on Machine HWI-EAS229

+



Chip Summary

+ + + + +
MachineHWI-EAS229
Run Folder080627_HWI-EAS229_0036_3055HAXX
Chip IDunknown
+



Chip Results Summary

+ + + + + + + + + + +
ClustersClusters (PF)Yield (kbases)
80933224435778031133022
+



Lane Parameter Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LaneSample IDSample TargetSample TypeLengthFilterNum TilesTiles
1unknownmm9ELAND26'((CHASTITY>=0.6))'100Lane 1
2unknownmm9ELAND26'((CHASTITY>=0.6))'100Lane 2
3unknownmm9ELAND26'((CHASTITY>=0.6))'100Lane 3
4unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 4
5unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 5
6unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 6
7unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 7
8unknownelegans170ELAND26'((CHASTITY>=0.6))'100Lane 8
+



Lane Results Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Lane InfoTile Mean +/- SD for Lane
Lane Lane Yield (kbases) Clusters (raw)Clusters (PF) 1st Cycle Int (PF) % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Alignment Score (PF) % Error Rate (PF)
115804696483 +/- 907460787 +/- 4240329 +/- 35101.88 +/- 6.0363.21 +/- 3.2970.33 +/- 0.249054.08 +/- 59.160.46 +/- 0.18
2156564133738 +/- 793860217 +/- 1926444 +/- 3992.62 +/- 7.5845.20 +/- 3.3151.98 +/- 0.746692.04 +/- 92.490.46 +/- 0.09
3185818152142 +/- 1000271468 +/- 2827366 +/- 3691.53 +/- 8.6647.19 +/- 3.8082.24 +/- 0.4410598.68 +/- 64.130.41 +/- 0.04
43495315784 +/- 216213443 +/- 1728328 +/- 4097.53 +/- 9.8785.29 +/- 1.9180.02 +/- 0.5310368.82 +/- 71.080.15 +/- 0.05
5167936119735 +/- 846564590 +/- 2529417 +/- 3788.69 +/- 14.7954.10 +/- 2.5976.95 +/- 0.329936.47 +/- 65.750.28 +/- 0.02
6173463152177 +/- 814666716 +/- 2493372 +/- 3987.06 +/- 9.8643.98 +/- 3.1278.80 +/- 0.4310162.28 +/- 49.650.38 +/- 0.03
714928784649 +/- 732557418 +/- 3617295 +/- 2889.40 +/- 8.2367.97 +/- 1.8233.38 +/- 0.254247.92 +/- 32.371.00 +/- 0.03
810695354622 +/- 481241136 +/- 3309284 +/- 3790.21 +/- 9.1075.39 +/- 2.2748.33 +/- 0.296169.21 +/- 169.500.86 +/- 1.22
Tile mean across chip
Av.1011665447235492.3660.2965.258403.690.50
+



Expanded Lane Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Lane InfoPhasing InfoRaw Data (tile mean)Filtered Data (tile mean)
Lane Clusters (tile mean) (raw)% Phasing % Prephasing % Error Rate (raw) Equiv Perfect Clusters (raw) % retained Cycle 2-4 Av Int (PF) Cycle 2-10 Av % Loss (PF) Cycle 10-20 Av % Loss (PF) % Align (PF) % Error Rate (PF) Equiv Perfect Clusters (PF)
1964830.77000.31001.004967663.21317 +/- 320.13 +/- 0.44-1.14 +/- 0.3470.330.4641758
21337380.77000.31001.224046745.20415 +/- 330.29 +/- 0.40-0.79 +/- 0.3551.980.4630615
31521420.77000.31001.307858847.19344 +/- 260.68 +/- 0.51-0.77 +/- 0.4282.240.4157552
4157840.77000.31000.291109585.29306 +/- 340.20 +/- 0.69-1.28 +/- 0.6680.020.1510671
51197350.77000.31000.856033554.10380 +/- 320.34 +/- 0.49-1.55 +/- 4.6976.950.2849015
61521770.77000.31001.217090543.98333 +/- 270.57 +/- 0.50-0.91 +/- 0.3978.800.3851663
7846490.77000.31001.382106967.97272 +/- 201.15 +/- 0.52-0.84 +/- 0.5833.381.0018265
8546220.77000.31001.172133575.39262 +/- 311.10 +/- 0.59-1.01 +/- 0.4748.330.8619104
+

IVC Plots
+

IVC.htm +

+

All Intensity Plots
+

All.htm +

+

Error graphs:
+

Error.htm +

+Back to top +



Lane 1

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
10001114972326.4894.3957.4470.29038.60.44
+Back to top +



Lane 2

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
20001147793448.1283.6838.5753.76905.40.54
+Back to top +



Lane 3

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
30001167904374.0586.9140.3681.310465.00.47
+Back to top +



Lane 4

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
4000120308276.8592.8784.2680.410413.80.16
+Back to top +



Lane 5

+ + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
+Back to top +



Lane 6

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
60001166844348.1277.5938.1379.710264.40.44
+Back to top +



Lane 7

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
7000198913269.9086.6664.5533.24217.51.02
+Back to top +



Lane 8

+ + + + + + + + + + + + + + + + + + + + + + + +
Lane Tile Clusters (raw)Av 1st Cycle Int (PF) Av % intensity after 20 cycles (PF) % PF Clusters % Align (PF) Av Alignment Score (PF) % Error Rate (PF)
8000164972243.6089.4073.1748.36182.80.71
+Back to top + + +""" + pathname = os.path.join(gerald_dir, 'Summary.htm') + f = open(pathname, 'w') + f.write(summary_htm) + f.close() + +def make_eland_results(gerald_dir): + eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D. +>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T +>HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0 +>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T +""" + for i in range(1,9): + pathname = os.path.join(gerald_dir, + 's_%d_eland_result.txt' % (i,)) + f = open(pathname, 'w') + f.write(eland_result) + f.close() + +def make_runfolder(obj=None): + """ + Make a fake runfolder, attach all the directories to obj if defined + """ + # make a fake runfolder directory + temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_') + + runfolder_dir = os.path.join(temp_dir, + '080102_HWI-EAS229_0010_207BTAAXX') + os.mkdir(runfolder_dir) + + data_dir = os.path.join(runfolder_dir, 'Data') + os.mkdir(data_dir) + + firecrest_dir = os.path.join(data_dir, + 'C1-33_Firecrest1.8.28_12-04-2008_diane' + ) + os.mkdir(firecrest_dir) + matrix_dir = os.path.join(firecrest_dir, 'Matrix') + os.mkdir(matrix_dir) + make_matrix(matrix_dir) + + bustard_dir = os.path.join(firecrest_dir, + 'Bustard1.8.28_12-04-2008_diane') + os.mkdir(bustard_dir) + make_phasing_params(bustard_dir) + + gerald_dir = os.path.join(bustard_dir, + 'GERALD_12-04-2008_diane') + os.mkdir(gerald_dir) + make_gerald_config(gerald_dir) + make_summary_htm(gerald_dir) + make_eland_results(gerald_dir) + + if obj is not None: + obj.temp_dir = temp_dir + obj.runfolder_dir = runfolder_dir + obj.data_dir = data_dir + obj.firecrest_dir = firecrest_dir + obj.matrix_dir = matrix_dir + obj.bustard_dir = bustard_dir + obj.gerald_dir = gerald_dir + + +class RunfolderTests(unittest.TestCase): + """ + Test components of the runfolder processing code + which includes firecrest, bustard, and gerald + """ + def setUp(self): + # attaches all the directories to the object passed in + make_runfolder(self) + + def tearDown(self): + shutil.rmtree(self.temp_dir) + + def test_firecrest(self): + """ + Construct a firecrest object + """ + f = firecrest.firecrest(self.firecrest_dir) + self.failUnlessEqual(f.version, '1.8.28') + self.failUnlessEqual(f.start, 1) + self.failUnlessEqual(f.stop, 33) + self.failUnlessEqual(f.user, 'diane') + self.failUnlessEqual(f.date, date(2008,4,12)) + + xml = f.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + + f2 = firecrest.Firecrest(xml=xml) + self.failUnlessEqual(f.version, f2.version) + self.failUnlessEqual(f.start, f2.start) + self.failUnlessEqual(f.stop, f2.stop) + self.failUnlessEqual(f.user, f2.user) + self.failUnlessEqual(f.date, f2.date) + + def test_bustard(self): + """ + construct a bustard object + """ + b = bustard.bustard(self.bustard_dir) + self.failUnlessEqual(b.version, '1.8.28') + self.failUnlessEqual(b.date, date(2008,4,12)) + self.failUnlessEqual(b.user, 'diane') + self.failUnlessEqual(len(b.phasing), 8) + self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099) + + xml = b.get_elements() + b2 = bustard.Bustard(xml=xml) + self.failUnlessEqual(b.version, b2.version) + self.failUnlessEqual(b.date, b2.date ) + self.failUnlessEqual(b.user, b2.user) + self.failUnlessEqual(len(b.phasing), len(b2.phasing)) + for key in b.phasing.keys(): + self.failUnlessEqual(b.phasing[key].lane, + b2.phasing[key].lane) + self.failUnlessEqual(b.phasing[key].phasing, + b2.phasing[key].phasing) + self.failUnlessEqual(b.phasing[key].prephasing, + b2.phasing[key].prephasing) + + def test_gerald(self): + # need to update gerald and make tests for it + g = gerald.gerald(self.gerald_dir) + + self.failUnlessEqual(g.version, + '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp') + self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30)) + self.failUnlessEqual(len(g.lanes), len(g.lanes.keys())) + self.failUnlessEqual(len(g.lanes), len(g.lanes.items())) + + + # list of genomes, matches what was defined up in + # make_gerald_config. + # the first None is to offset the genomes list to be 1..9 + # instead of pythons default 0..8 + genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2', + '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ] + + # test lane specific parameters from gerald config file + for i in range(1,9): + cur_lane = g.lanes[str(i)] + self.failUnlessEqual(cur_lane.analysis, 'eland') + self.failUnlessEqual(cur_lane.eland_genome, genomes[i]) + self.failUnlessEqual(cur_lane.read_length, '32') + self.failUnlessEqual(cur_lane.use_bases, 'Y'*32) + + # test data extracted from summary file + clusters = [None, + (96483, 9074), (133738, 7938), + (152142, 10002), (15784, 2162), + (119735, 8465), (152177, 8146), + (84649, 7325), (54622, 4812),] + + for i in range(1,9): + summary_lane = g.summary[str(i)] + self.failUnlessEqual(summary_lane.cluster, clusters[i]) + self.failUnlessEqual(summary_lane.lane, str(i)) + + xml = g.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + g2 = gerald.Gerald(xml=xml) + + # do it all again after extracting from the xml file + self.failUnlessEqual(g.version, g2.version) + self.failUnlessEqual(g.date, g2.date) + self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys())) + self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items())) + + # test lane specific parameters from gerald config file + for i in range(1,9): + g_lane = g.lanes[str(i)] + g2_lane = g2.lanes[str(i)] + self.failUnlessEqual(g_lane.analysis, g2_lane.analysis) + self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome) + self.failUnlessEqual(g_lane.read_length, g2_lane.read_length) + self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases) + + # test (some) summary elements + for i in range(1,9): + g_summary = g.summary[str(i)] + g2_summary = g2.summary[str(i)] + self.failUnlessEqual(g_summary.cluster, g2_summary.cluster) + self.failUnlessEqual(g_summary.lane, g2_summary.lane) + + g_eland = g.eland_results + g2_eland = g2.eland_results + for lane in g_eland.keys(): + self.failUnlessEqual(g_eland[lane].reads, + g2_eland[lane].reads) + self.failUnlessEqual(len(g_eland[lane].mapped_reads), + len(g2_eland[lane].mapped_reads)) + for k in g_eland[lane].mapped_reads.keys(): + self.failUnlessEqual(g_eland[lane].mapped_reads[k], + g2_eland[lane].mapped_reads[k]) + + self.failUnlessEqual(len(g_eland[lane].match_codes), + len(g2_eland[lane].match_codes)) + for k in g_eland[lane].match_codes.keys(): + self.failUnlessEqual(g_eland[lane].match_codes[k], + g2_eland[lane].match_codes[k]) + + + def test_eland(self): + dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa', + 'chr2L.fa': 'dm3/chr2L.fa', + 'Lambda.fa': 'Lambda.fa'} + genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map, + '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map } + eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps) + + for i in range(1,9): + lane = eland[str(i)] + self.failUnlessEqual(lane.reads, 4) + self.failUnlessEqual(lane.sample_name, "s") + self.failUnlessEqual(lane.lane_id, unicode(i)) + self.failUnlessEqual(len(lane.mapped_reads), 3) + self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1) + self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1) + self.failUnlessEqual(lane.match_codes['U1'], 2) + self.failUnlessEqual(lane.match_codes['NM'], 1) + + xml = eland.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + e2 = gerald.ELAND(xml=xml) + + for i in range(1,9): + l1 = eland[str(i)] + l2 = e2[str(i)] + self.failUnlessEqual(l1.reads, l2.reads) + self.failUnlessEqual(l1.sample_name, l2.sample_name) + self.failUnlessEqual(l1.lane_id, l2.lane_id) + self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) + self.failUnlessEqual(len(l1.mapped_reads), 3) + for k in l1.mapped_reads.keys(): + self.failUnlessEqual(l1.mapped_reads[k], + l2.mapped_reads[k]) + + self.failUnlessEqual(len(l1.match_codes), 9) + self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes)) + for k in l1.match_codes.keys(): + self.failUnlessEqual(l1.match_codes[k], + l2.match_codes[k]) + + def test_runfolder(self): + runs = runfolder.get_runs(self.runfolder_dir) + + # do we get the flowcell id from the filename? + self.failUnlessEqual(len(runs), 1) + self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml') + + # do we get the flowcell id from the FlowcellId.xml file + make_flowcell_id(self.runfolder_dir, '207BTAAXY') + runs = runfolder.get_runs(self.runfolder_dir) + self.failUnlessEqual(len(runs), 1) + self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml') + + r1 = runs[0] + xml = r1.get_elements() + xml_str = ElementTree.tostring(xml) + + r2 = runfolder.PipelineRun(xml=xml) + self.failUnlessEqual(r1.name, r2.name) + self.failIfEqual(r2.firecrest, None) + self.failIfEqual(r2.bustard, None) + self.failIfEqual(r2.gerald, None) + + +def suite(): + return unittest.makeSuite(RunfolderTests,'test') + +if __name__ == "__main__": + unittest.main(defaultTest="suite") +