+++ /dev/null
-
-from datetime import date
-from glob import glob
-import logging
-import os
-import time
-import re
-
-from htsworkflow.pipeline.runfolder import \
- ElementTree, \
- VERSION_RE, \
- EUROPEAN_STRPTIME
-
-class Phasing(object):
- PHASING = 'Phasing'
- PREPHASING = 'Prephasing'
-
- def __init__(self, fromfile=None, xml=None):
- self.lane = None
- self.phasing = None
- self.prephasing = None
-
- if fromfile is not None:
- self._initialize_from_file(fromfile)
- elif xml is not None:
- self.set_elements(xml)
-
- def _initialize_from_file(self, pathname):
- path, name = os.path.split(pathname)
- basename, ext = os.path.splitext(name)
- # the last character of the param base filename should be the
- # lane number
- tree = ElementTree.parse(pathname).getroot()
- self.set_elements(tree)
- self.lane = int(basename[-1])
-
- def get_elements(self):
- root = ElementTree.Element(Phasing.PHASING, {'lane': str(self.lane)})
- phasing = ElementTree.SubElement(root, Phasing.PHASING)
- phasing.text = str(self.phasing)
- prephasing = ElementTree.SubElement(root, Phasing.PREPHASING)
- prephasing.text = str(self.prephasing)
- return root
-
- def set_elements(self, tree):
- if tree.tag not in ('Phasing', 'Parameters'):
- raise ValueError('exptected Phasing or Parameters')
- lane = tree.attrib.get('lane', None)
- if lane is not None:
- self.lane = int(lane)
- for element in list(tree):
- if element.tag == Phasing.PHASING:
- self.phasing = float(element.text)
- elif element.tag == Phasing.PREPHASING:
- self.prephasing = float(element.text)
-
-class Bustard(object):
- XML_VERSION = 1
-
- # Xml Tags
- BUSTARD = 'Bustard'
- SOFTWARE_VERSION = 'version'
- DATE = 'run_time'
- USER = 'user'
- PARAMETERS = 'Parameters'
-
- def __init__(self, xml=None):
- self.version = None
- self.date = date.today()
- self.user = None
- self.phasing = {}
-
- if xml is not None:
- self.set_elements(xml)
-
- def _get_time(self):
- return time.mktime(self.date.timetuple())
- time = property(_get_time, doc='return run time as seconds since epoch')
-
- def dump(self):
- print "Bustard version:", self.version
- print "Run date", self.date
- print "user:", self.user
- for lane, tree in self.phasing.items():
- print lane
- print tree
-
- def get_elements(self):
- root = ElementTree.Element('Bustard',
- {'version': str(Bustard.XML_VERSION)})
- version = ElementTree.SubElement(root, Bustard.SOFTWARE_VERSION)
- version.text = self.version
- run_date = ElementTree.SubElement(root, Bustard.DATE)
- run_date.text = str(self.time)
- user = ElementTree.SubElement(root, Bustard.USER)
- user.text = self.user
- params = ElementTree.SubElement(root, Bustard.PARAMETERS)
- for p in self.phasing.values():
- params.append(p.get_elements())
- return root
-
- def set_elements(self, tree):
- if tree.tag != Bustard.BUSTARD:
- raise ValueError('Expected "Bustard" SubElements')
- xml_version = int(tree.attrib.get('version', 0))
- if xml_version > Bustard.XML_VERSION:
- logging.warn('Bustard XML tree is a higher version than this class')
- for element in list(tree):
- if element.tag == Bustard.SOFTWARE_VERSION:
- self.version = element.text
- elif element.tag == Bustard.DATE:
- self.date = date.fromtimestamp(float(element.text))
- elif element.tag == Bustard.USER:
- self.user = element.text
- elif element.tag == Bustard.PARAMETERS:
- for param in element:
- p = Phasing(xml=param)
- self.phasing[p.lane] = p
- else:
- raise ValueError("Unrecognized tag: %s" % (element.tag,))
-
-
-
-def bustard(pathname):
- """
- Construct a Bustard object from pathname
- """
- b = Bustard()
- path, name = os.path.split(pathname)
- groups = name.split("_")
- version = re.search(VERSION_RE, groups[0])
- b.version = version.group(1)
- t = time.strptime(groups[1], EUROPEAN_STRPTIME)
- b.date = date(*t[0:3])
- b.user = groups[2]
- paramfiles = glob(os.path.join(pathname, "params?.xml"))
- for paramfile in paramfiles:
- phasing = Phasing(paramfile)
- assert (phasing.lane >= 1 and phasing.lane <= 8)
- b.phasing[phasing.lane] = phasing
- return b
-
-def fromxml(tree):
- b = Bustard()
- b.set_elements(tree)
- return b
+++ /dev/null
-#!/usr/bin/python
-import subprocess
-import logging
-import time
-import re
-import os
-
-from htsworkflow.pipeline.retrieve_config import getCombinedOptions, saveConfigFile
-from htsworkflow.pipeline.retrieve_config import FlowCellNotFound, WebError404
-from htsworkflow.pipeline.genome_mapper import DuplicateGenome, getAvailableGenomes, constructMapperDict
-from htsworkflow.pipeline.run_status import GARunStatus
-
-from pyinotify import WatchManager, ThreadedNotifier
-from pyinotify import EventsCodes, ProcessEvent
-
-class ConfigInfo:
-
- def __init__(self):
- #run_path = firecrest analysis directory to run analysis from
- self.run_path = None
- self.bustard_path = None
- self.config_filepath = None
- self.status = None
-
- #top level directory where all analyses are placed
- self.base_analysis_dir = None
- #analysis_dir, top level analysis dir...
- # base_analysis_dir + '/070924_USI-EAS44_0022_FC12150'
- self.analysis_dir = None
-
-
- def createStatusObject(self):
- """
- Creates a status object which can be queried for
- status of running the pipeline
-
- returns True if object created
- returns False if object cannot be created
- """
- if self.config_filepath is None:
- return False
-
- self.status = GARunStatus(self.config_filepath)
- return True
-
-
-
-####################################
-# inotify event processor
-
-s_firecrest_finished = re.compile('Firecrest[0-9\._\-A-Za-z]+/finished.txt')
-s_bustard_finished = re.compile('Bustard[0-9\._\-A-Za-z]+/finished.txt')
-s_gerald_finished = re.compile('GERALD[0-9\._\-A-Za-z]+/finished.txt')
-
-s_gerald_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/Bustard[0-9\._\-A-Za-z]+/GERALD[0-9\._\-A-Za-z]+/')
-s_bustard_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/Bustard[0-9\._\-A-Za-z]+/')
-s_firecrest_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/')
-
-class RunEvent(ProcessEvent):
-
- def __init__(self, conf_info):
-
- self.run_status_dict = {'firecrest': False,
- 'bustard': False,
- 'gerald': False}
-
- self._ci = conf_info
-
- ProcessEvent.__init__(self)
-
-
- def process_IN_CREATE(self, event):
- fullpath = os.path.join(event.path, event.name)
- if s_finished.search(fullpath):
- logging.info("File Found: %s" % (fullpath))
-
- if s_firecrest_finished.search(fullpath):
- self.run_status_dict['firecrest'] = True
- self._ci.status.updateFirecrest(event.name)
- elif s_bustard_finished.search(fullpath):
- self.run_status_dict['bustard'] = True
- self._ci.status.updateBustard(event.name)
- elif s_gerald_finished.search(fullpath):
- self.run_status_dict['gerald'] = True
- self._ci.status.updateGerald(event.name)
-
- #WARNING: The following order is important!!
- # Firecrest regex will catch all gerald, bustard, and firecrest
- # Bustard regex will catch all gerald and bustard
- # Gerald regex will catch all gerald
- # So, order needs to be Gerald, Bustard, Firecrest, or this
- # won't work properly.
- elif s_gerald_all.search(fullpath):
- self._ci.status.updateGerald(event.name)
- elif s_bustard_all.search(fullpath):
- self._ci.status.updateBustard(event.name)
- elif s_firecrest_all.search(fullpath):
- self._ci.status.updateFirecrest(event.name)
-
- #print "Create: %s" % (os.path.join(event.path, event.name))
-
- def process_IN_DELETE(self, event):
- #print "Remove %s" % (os.path.join(event.path, event.name))
- pass
-
-
-
-
-#FLAGS
-# Config Step Error
-RUN_ABORT = 'abort'
-# Run Step Error
-RUN_FAILED = 'failed'
-
-
-#####################################
-# Configure Step (goat_pipeline.py)
-#Info
-s_start = re.compile('Starting Genome Analyzer Pipeline')
-s_gerald = re.compile("[\S\s]+--GERALD[\S\s]+--make[\S\s]+")
-s_generating = re.compile('^Generating journals, Makefiles')
-s_seq_folder = re.compile('^Sequence folder: ')
-s_seq_folder_sub = re.compile('want to make ')
-s_stderr_taskcomplete = re.compile('^Task complete, exiting')
-
-#Errors
-s_invalid_cmdline = re.compile('Usage:[\S\s]*goat_pipeline.py')
-s_species_dir_err = re.compile('Error: Lane [1-8]:')
-s_goat_traceb = re.compile("^Traceback \(most recent call last\):")
-s_missing_cycles = re.compile('^Error: Tile s_[1-8]_[0-9]+: Different number of cycles: [0-9]+ instead of [0-9]+')
-
-SUPPRESS_MISSING_CYCLES = False
-
-
-##Ignore - Example of out above each ignore regex.
-#NOTE: Commenting out an ignore will cause it to be
-# logged as DEBUG with the logging module.
-#CF_STDERR_IGNORE_LIST = []
-s_skip = re.compile('s_[0-8]_[0-9]+')
-
-
-##########################################
-# Pipeline Run Step (make -j8 recursive)
-
-##Info
-s_finished = re.compile('finished')
-
-##Errors
-s_make_error = re.compile('^make[\S\s]+Error')
-s_no_gnuplot = re.compile('gnuplot: command not found')
-s_no_convert = re.compile('^Can\'t exec "convert"')
-s_no_ghostscript = re.compile('gs: command not found')
-
-##Ignore - Example of out above each ignore regex.
-#NOTE: Commenting out an ignore will cause it to be
-# logged as DEBUG with the logging module.
-#
-PL_STDERR_IGNORE_LIST = []
-# Info: PF 11802
-PL_STDERR_IGNORE_LIST.append( re.compile('^Info: PF') )
-# About to analyse intensity file s_4_0101_sig2.txt
-PL_STDERR_IGNORE_LIST.append( re.compile('^About to analyse intensity file') )
-# Will send output to standard output
-PL_STDERR_IGNORE_LIST.append( re.compile('^Will send output to standard output') )
-# Found 31877 clusters
-PL_STDERR_IGNORE_LIST.append( re.compile('^Found [0-9]+ clusters') )
-# Will use quality criterion ((CHASTITY>=0.6)
-PL_STDERR_IGNORE_LIST.append( re.compile('^Will use quality criterion') )
-# Quality criterion translated to (($F[5]>=0.6))
-PL_STDERR_IGNORE_LIST.append( re.compile('^Quality criterion translated to') )
-# opened /woldlab/trog/data1/king/070924_USI-EAS44_0022_FC12150/Data/C1-36_Firecrest1.9.1_14-11-2007_king.4/Bustard1.9.1_14-11-2007_king/s_4_0101_qhg.txt
-# AND
-# opened s_4_0103_qhg.txt
-PL_STDERR_IGNORE_LIST.append( re.compile('^opened[\S\s]+qhg.txt') )
-# 81129 sequences out of 157651 passed filter criteria
-PL_STDERR_IGNORE_LIST.append( re.compile('^[0-9]+ sequences out of [0-9]+ passed filter criteria') )
-
-
-def pl_stderr_ignore(line):
- """
- Searches lines for lines to ignore (i.e. not to log)
-
- returns True if line should be ignored
- returns False if line should NOT be ignored
- """
- for s in PL_STDERR_IGNORE_LIST:
- if s.search(line):
- return True
- return False
-
-
-def config_stdout_handler(line, conf_info):
- """
- Processes each line of output from GOAT
- and stores useful information using the logging module
-
- Loads useful information into conf_info as well, for future
- use outside the function.
-
- returns True if found condition that signifies success.
- """
-
- # Skip irrelevant line (without logging)
- if s_skip.search(line):
- pass
-
- # Detect invalid command-line arguments
- elif s_invalid_cmdline.search(line):
- logging.error("Invalid commandline options!")
-
- # Detect starting of configuration
- elif s_start.search(line):
- logging.info('START: Configuring pipeline')
-
- # Detect it made it past invalid arguments
- elif s_gerald.search(line):
- logging.info('Running make now')
-
- # Detect that make files have been generated (based on output)
- elif s_generating.search(line):
- logging.info('Make files generted')
- return True
-
- # Capture run directory
- elif s_seq_folder.search(line):
- mo = s_seq_folder_sub.search(line)
- #Output changed when using --tiles=<tiles>
- # at least in pipeline v0.3.0b2
- if mo:
- firecrest_bustard_gerald_makefile = line[mo.end():]
- firecrest_bustard_gerald, junk = \
- os.path.split(firecrest_bustard_gerald_makefile)
- firecrest_bustard, junk = os.path.split(firecrest_bustard_gerald)
- firecrest, junk = os.path.split(firecrest_bustard)
-
- conf_info.bustard_path = firecrest_bustard
- conf_info.run_path = firecrest
-
- #Standard output handling
- else:
- print 'Sequence line:', line
- mo = s_seq_folder.search(line)
- conf_info.bustard_path = line[mo.end():]
- conf_info.run_path, temp = os.path.split(conf_info.bustard_path)
-
- # Log all other output for debugging purposes
- else:
- logging.warning('CONF:?: %s' % (line))
-
- return False
-
-
-
-def config_stderr_handler(line, conf_info):
- """
- Processes each line of output from GOAT
- and stores useful information using the logging module
-
- Loads useful information into conf_info as well, for future
- use outside the function.
-
- returns RUN_ABORT upon detecting failure;
- True on success message;
- False if neutral message
- (i.e. doesn't signify failure or success)
- """
- global SUPPRESS_MISSING_CYCLES
-
- # Detect invalid species directory error
- if s_species_dir_err.search(line):
- logging.error(line)
- return RUN_ABORT
- # Detect goat_pipeline.py traceback
- elif s_goat_traceb.search(line):
- logging.error("Goat config script died, traceback in debug output")
- return RUN_ABORT
- # Detect indication of successful configuration (from stderr; odd, but ok)
- elif s_stderr_taskcomplete.search(line):
- logging.info('Configure step successful (from: stderr)')
- return True
- # Detect missing cycles
- elif s_missing_cycles.search(line):
-
- # Only display error once
- if not SUPPRESS_MISSING_CYCLES:
- logging.error("Missing cycles detected; Not all cycles copied?")
- logging.debug("CONF:STDERR:MISSING_CYCLES: %s" % (line))
- SUPPRESS_MISSING_CYCLES = True
- return RUN_ABORT
-
- # Log all other output as debug output
- else:
- logging.debug('CONF:STDERR:?: %s' % (line))
-
- # Neutral (not failure; nor success)
- return False
-
-
-#def pipeline_stdout_handler(line, conf_info):
-# """
-# Processes each line of output from running the pipeline
-# and stores useful information using the logging module
-#
-# Loads useful information into conf_info as well, for future
-# use outside the function.
-#
-# returns True if found condition that signifies success.
-# """
-#
-# #f.write(line + '\n')
-#
-# return True
-
-
-
-def pipeline_stderr_handler(line, conf_info):
- """
- Processes each line of stderr from pipelien run
- and stores useful information using the logging module
-
- ##FIXME: Future feature (doesn't actually do this yet)
- #Loads useful information into conf_info as well, for future
- #use outside the function.
-
- returns RUN_FAILED upon detecting failure;
- #True on success message; (no clear success state)
- False if neutral message
- (i.e. doesn't signify failure or success)
- """
-
- if pl_stderr_ignore(line):
- pass
- elif s_make_error.search(line):
- logging.error("make error detected; run failed")
- return RUN_FAILED
- elif s_no_gnuplot.search(line):
- logging.error("gnuplot not found")
- return RUN_FAILED
- elif s_no_convert.search(line):
- logging.error("imagemagick's convert command not found")
- return RUN_FAILED
- elif s_no_ghostscript.search(line):
- logging.error("ghostscript not found")
- return RUN_FAILED
- else:
- logging.debug('PIPE:STDERR:?: %s' % (line))
-
- return False
-
-
-def retrieve_config(conf_info, flowcell, cfg_filepath, genome_dir):
- """
- Gets the config file from server...
- requires config file in:
- /etc/ga_frontend/ga_frontend.conf
- or
- ~/.ga_frontend.conf
-
- with:
- [config_file_server]
- base_host_url: http://host:port
-
- return True if successful, False is failure
- """
- options = getCombinedOptions()
-
- if options.url is None:
- logging.error("~/.ga_frontend.conf or /etc/ga_frontend/ga_frontend.conf" \
- " missing base_host_url option")
- return False
-
- try:
- saveConfigFile(flowcell, options.url, cfg_filepath)
- conf_info.config_filepath = cfg_filepath
- except FlowCellNotFound, e:
- logging.error(e)
- return False
- except WebError404, e:
- logging.error(e)
- return False
- except IOError, e:
- logging.error(e)
- return False
- except Exception, e:
- logging.error(e)
- return False
-
- f = open(cfg_filepath, 'r')
- data = f.read()
- f.close()
-
- genome_dict = getAvailableGenomes(genome_dir)
- mapper_dict = constructMapperDict(genome_dict)
-
- logging.debug(data)
-
- f = open(cfg_filepath, 'w')
- f.write(data % (mapper_dict))
- f.close()
-
- return True
-
-
-
-def configure(conf_info):
- """
- Attempts to configure the GA pipeline using goat.
-
- Uses logging module to store information about status.
-
- returns True if configuration successful, otherwise False.
- """
- #ERROR Test:
- #pipe = subprocess.Popen(['goat_pipeline.py',
- # '--GERALD=config32bk.txt',
- # '--make .',],
- # #'.'],
- # stdout=subprocess.PIPE,
- # stderr=subprocess.PIPE)
-
- #ERROR Test (2), causes goat_pipeline.py traceback
- #pipe = subprocess.Popen(['goat_pipeline.py',
- # '--GERALD=%s' % (conf_info.config_filepath),
- # '--tiles=s_4_100,s_4_101,s_4_102,s_4_103,s_4_104',
- # '--make',
- # '.'],
- # stdout=subprocess.PIPE,
- # stderr=subprocess.PIPE)
-
- ##########################
- # Run configuration step
- # Not a test; actual configure attempt.
- #pipe = subprocess.Popen(['goat_pipeline.py',
- # '--GERALD=%s' % (conf_info.config_filepath),
- # '--make',
- # '.'],
- # stdout=subprocess.PIPE,
- # stderr=subprocess.PIPE)
-
-
- stdout_filepath = os.path.join(conf_info.analysis_dir,
- "pipeline_configure_stdout.txt")
- stderr_filepath = os.path.join(conf_info.analysis_dir,
- "pipeline_configure_stderr.txt")
-
- fout = open(stdout_filepath, 'w')
- ferr = open(stderr_filepath, 'w')
-
- pipe = subprocess.Popen(['goat_pipeline.py',
- '--GERALD=%s' % (conf_info.config_filepath),
- #'--tiles=s_4_0100,s_4_0101,s_4_0102,s_4_0103,s_4_0104',
- '--make',
- conf_info.analysis_dir],
- stdout=fout,
- stderr=ferr)
-
- print "Configuring pipeline: %s" % (time.ctime())
- error_code = pipe.wait()
-
- # Clean up
- fout.close()
- ferr.close()
-
-
- ##################
- # Process stdout
- fout = open(stdout_filepath, 'r')
-
- stdout_line = fout.readline()
-
- complete = False
- while stdout_line != '':
- # Handle stdout
- if config_stdout_handler(stdout_line, conf_info):
- complete = True
- stdout_line = fout.readline()
-
- fout.close()
-
-
- #error_code = pipe.wait()
- if error_code:
- logging.error('Recieved error_code: %s' % (error_code))
- else:
- logging.info('We are go for launch!')
-
- #Process stderr
- ferr = open(stderr_filepath, 'r')
- stderr_line = ferr.readline()
-
- abort = 'NO!'
- stderr_success = False
- while stderr_line != '':
- stderr_status = config_stderr_handler(stderr_line, conf_info)
- if stderr_status == RUN_ABORT:
- abort = RUN_ABORT
- elif stderr_status is True:
- stderr_success = True
- stderr_line = ferr.readline()
-
- ferr.close()
-
-
- #Success requirements:
- # 1) The stdout completed without error
- # 2) The program exited with status 0
- # 3) No errors found in stdout
- print '#Expect: True, False, True, True'
- print complete, bool(error_code), abort != RUN_ABORT, stderr_success is True
- status = complete is True and \
- bool(error_code) is False and \
- abort != RUN_ABORT and \
- stderr_success is True
-
- # If everything was successful, but for some reason
- # we didn't retrieve the path info, log it.
- if status is True:
- if conf_info.bustard_path is None or conf_info.run_path is None:
- logging.error("Failed to retrieve run_path")
- return False
-
- return status
-
-
-def run_pipeline(conf_info):
- """
- Run the pipeline and monitor status.
- """
- # Fail if the run_path doesn't actually exist
- if not os.path.exists(conf_info.run_path):
- logging.error('Run path does not exist: %s' \
- % (conf_info.run_path))
- return False
-
- # Change cwd to run_path
- stdout_filepath = os.path.join(conf_info.analysis_dir, 'pipeline_run_stdout.txt')
- stderr_filepath = os.path.join(conf_info.analysis_dir, 'pipeline_run_stderr.txt')
-
- # Create status object
- conf_info.createStatusObject()
-
- # Monitor file creation
- wm = WatchManager()
- mask = EventsCodes.IN_DELETE | EventsCodes.IN_CREATE
- event = RunEvent(conf_info)
- notifier = ThreadedNotifier(wm, event)
- notifier.start()
- wdd = wm.add_watch(conf_info.run_path, mask, rec=True)
-
- # Log pipeline starting
- logging.info('STARTING PIPELINE @ %s' % (time.ctime()))
-
- # Start the pipeline (and hide!)
- #pipe = subprocess.Popen(['make',
- # '-j8',
- # 'recursive'],
- # stdout=subprocess.PIPE,
- # stderr=subprocess.PIPE)
-
- fout = open(stdout_filepath, 'w')
- ferr = open(stderr_filepath, 'w')
-
- pipe = subprocess.Popen(['make',
- '--directory=%s' % (conf_info.run_path),
- '-j8',
- 'recursive'],
- stdout=fout,
- stderr=ferr)
- #shell=True)
- # Wait for run to finish
- retcode = pipe.wait()
-
-
- # Clean up
- notifier.stop()
- fout.close()
- ferr.close()
-
- # Process stderr
- ferr = open(stderr_filepath, 'r')
-
- run_failed_stderr = False
- for line in ferr:
- err_status = pipeline_stderr_handler(line, conf_info)
- if err_status == RUN_FAILED:
- run_failed_stderr = True
-
- ferr.close()
-
- # Finished file check!
- print 'RUN SUCCESS CHECK:'
- for key, value in event.run_status_dict.items():
- print ' %s: %s' % (key, value)
-
- dstatus = event.run_status_dict
-
- # Success or failure check
- status = (retcode == 0) and \
- run_failed_stderr is False and \
- dstatus['firecrest'] is True and \
- dstatus['bustard'] is True and \
- dstatus['gerald'] is True
-
- return status
-
-
+++ /dev/null
-"""
-Extract information about the Firecrest run
-
-Firecrest - class holding the properties we found
-firecrest - Firecrest factory function initalized from a directory name
-fromxml - Firecrest factory function initalized from an xml dump from
- the Firecrest object.
-"""
-
-from datetime import date
-import os
-import re
-import time
-
-from htsworkflow.pipeline.runfolder import \
- ElementTree, \
- VERSION_RE, \
- EUROPEAN_STRPTIME
-
-class Firecrest(object):
- XML_VERSION=1
-
- # xml tag names
- FIRECREST = 'Firecrest'
- SOFTWARE_VERSION = 'version'
- START = 'FirstCycle'
- STOP = 'LastCycle'
- DATE = 'run_time'
- USER = 'user'
- MATRIX = 'matrix'
-
- def __init__(self, xml=None):
- self.start = None
- self.stop = None
- self.version = None
- self.date = date.today()
- self.user = None
- self.matrix = None
-
- if xml is not None:
- self.set_elements(xml)
-
- def _get_time(self):
- return time.mktime(self.date.timetuple())
- time = property(_get_time, doc='return run time as seconds since epoch')
-
- def dump(self):
- print "Starting cycle:", self.start
- print "Ending cycle:", self.stop
- print "Firecrest version:", self.version
- print "Run date:", self.date
- print "user:", self.user
-
- def get_elements(self):
- attribs = {'version': str(Firecrest.XML_VERSION) }
- root = ElementTree.Element(Firecrest.FIRECREST, attrib=attribs)
- version = ElementTree.SubElement(root, Firecrest.SOFTWARE_VERSION)
- version.text = self.version
- start_cycle = ElementTree.SubElement(root, Firecrest.START)
- start_cycle.text = str(self.start)
- stop_cycle = ElementTree.SubElement(root, Firecrest.STOP)
- stop_cycle.text = str(self.stop)
- run_date = ElementTree.SubElement(root, Firecrest.DATE)
- run_date.text = str(self.time)
- user = ElementTree.SubElement(root, Firecrest.USER)
- user.text = self.user
- matrix = ElementTree.SubElement(root, Firecrest.MATRIX)
- matrix.text = self.matrix
- return root
-
- def set_elements(self, tree):
- if tree.tag != Firecrest.FIRECREST:
- raise ValueError('Expected "Firecrest" SubElements')
- xml_version = int(tree.attrib.get('version', 0))
- if xml_version > Firecrest.XML_VERSION:
- logging.warn('Firecrest XML tree is a higher version than this class')
- for element in list(tree):
- if element.tag == Firecrest.SOFTWARE_VERSION:
- self.version = element.text
- elif element.tag == Firecrest.START:
- self.start = int(element.text)
- elif element.tag == Firecrest.STOP:
- self.stop = int(element.text)
- elif element.tag == Firecrest.DATE:
- self.date = date.fromtimestamp(float(element.text))
- elif element.tag == Firecrest.USER:
- self.user = element.text
- elif element.tag == Firecrest.MATRIX:
- self.matrix = element.text
- else:
- raise ValueError("Unrecognized tag: %s" % (element.tag,))
-
-def firecrest(pathname):
- """
- Examine the directory at pathname and initalize a Firecrest object
- """
- f = Firecrest()
-
- # parse firecrest directory name
- path, name = os.path.split(pathname)
- groups = name.split('_')
- # grab the start/stop cycle information
- cycle = re.match("C([0-9]+)-([0-9]+)", groups[0])
- f.start = int(cycle.group(1))
- f.stop = int(cycle.group(2))
- # firecrest version
- version = re.search(VERSION_RE, groups[1])
- f.version = (version.group(1))
- # datetime
- t = time.strptime(groups[2], EUROPEAN_STRPTIME)
- f.date = date(*t[0:3])
- # username
- f.user = groups[3]
-
- # should I parse this deeper than just stashing the
- # contents of the matrix file?
- matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
- f.matrix = open(matrix_pathname, 'r').read()
- return f
-
-def fromxml(tree):
- """
- Initialize a Firecrest object from an element tree node
- """
- f = Firecrest()
- f.set_elements(tree)
- return f
+++ /dev/null
-#!/usr/bin/python
-import glob
-import sys
-import os
-import re
-
-import logging
-
-from htsworkflow.util.alphanum import alphanum
-
-class DuplicateGenome(Exception): pass
-
-
-def _has_metainfo(genome_dir):
- metapath = os.path.join(genome_dir, '_metainfo_')
- if os.path.isfile(metapath):
- return True
- else:
- return False
-
-def getAvailableGenomes(genome_base_dir):
- """
- raises IOError (on genome_base_dir not found)
- raises DuplicateGenome on duplicate genomes found.
-
- returns a double dictionary (i.e. d[species][build] = path)
- """
-
- # Need valid directory
- if not os.path.exists(genome_base_dir):
- msg = "Directory does not exist: %s" % (genome_base_dir)
- raise IOError, msg
-
- # Find all subdirectories
- filepath_list = glob.glob(os.path.join(genome_base_dir, '*'))
- potential_genome_dirs = \
- [ filepath for filepath in filepath_list if os.path.isdir(filepath)]
-
- # Get list of metadata files
- genome_dir_list = \
- [ dirpath \
- for dirpath in potential_genome_dirs \
- if _has_metainfo(dirpath) ]
-
- # Genome double dictionary
- d = {}
-
- for genome_dir in genome_dir_list:
- line = open(os.path.join(genome_dir, '_metainfo_'), 'r').readline().strip()
-
- # Get species, build... log and skip on failure
- try:
- species, build = line.split('|')
- except:
- logging.warning('Skipping: Invalid metafile (%s) line: %s' \
- % (metafile, line))
- continue
-
- build_dict = d.setdefault(species, {})
- if build in build_dict:
- msg = "Duplicate genome for %s|%s" % (species, build)
- raise DuplicateGenome, msg
-
- build_dict[build] = genome_dir
-
- return d
-
-
-class constructMapperDict(object):
- """
- Emulate a dictionary to map genome|build names to paths.
-
- It uses the dictionary generated by getAvailableGenomes.
- """
- def __init__(self, genome_dict):
- self.genome_dict = genome_dict
-
- def __getitem__(self, key):
- """
- Return the best match for key
- """
- elements = re.split("\|", key)
-
- if len(elements) == 1:
- # we just the species name
- # get the set of builds
- builds = self.genome_dict[elements[0]]
-
- # sort build names the way humans would
- keys = builds.keys()
- keys.sort(cmp=alphanum)
-
- # return the path from the 'last' build name
- return builds[keys[-1]]
-
- elif len(elements) == 2:
- # we have species, and build name
- return self.genome_dict[elements[0]][elements[1]]
- else:
- raise KeyError("Unrecognized key")
-
- def keys(self):
- keys = []
- for species in self.genome_dict.keys():
- for build in self.genome_dict[species]:
- keys.append([species+'|'+build])
- return keys
-
- def values(self):
- values = []
- for species in self.genome_dict.keys():
- for build in self.genome_dict[species]:
- values.append(self.genome_dict[species][build])
- return values
-
- def items(self):
- items = []
- for species in self.genome_dict.keys():
- for build in self.genome_dict[species]:
- key = [species+'|'+build]
- value = self.genome_dict[species][build]
- items.append((key, value))
- return items
-
-if __name__ == '__main__':
-
- if len(sys.argv) != 2:
- print 'useage: %s <base_genome_dir>' % (sys.argv[0])
- sys.exit(1)
-
- d = getAvailableGenomes(sys.argv[1])
- d2 = constructMapperDict(d)
-
- for k,v in d2.items():
- print '%s: %s' % (k,v)
-
-
+++ /dev/null
-"""
-Provide access to information stored in the GERALD directory.
-"""
-from datetime import datetime, date
-from glob import glob
-import logging
-import os
-import stat
-import time
-import types
-
-from htsworkflow.pipeline.runfolder import \
- ElementTree, \
- EUROPEAN_STRPTIME, \
- LANES_PER_FLOWCELL, \
- VERSION_RE
-from htsworkflow.util.ethelp import indent, flatten
-from htsworkflow.util.opener import autoopen
-
-class Gerald(object):
- """
- Capture meaning out of the GERALD directory
- """
- XML_VERSION = 1
- GERALD='Gerald'
- RUN_PARAMETERS='RunParameters'
- SUMMARY='Summary'
-
- class LaneParameters(object):
- """
- Make it easy to access elements of LaneSpecificRunParameters from python
- """
- def __init__(self, gerald, key):
- self._gerald = gerald
- self._key = key
-
- def __get_attribute(self, xml_tag):
- subtree = self._gerald.tree.find('LaneSpecificRunParameters')
- container = subtree.find(xml_tag)
- if container is None:
- return None
- if len(container.getchildren()) > LANES_PER_FLOWCELL:
- raise RuntimeError('GERALD config.xml file changed')
- lanes = [x.tag.split('_')[1] for x in container.getchildren()]
- index = lanes.index(self._key)
- element = container[index]
- return element.text
- def _get_analysis(self):
- return self.__get_attribute('ANALYSIS')
- analysis = property(_get_analysis)
-
- def _get_eland_genome(self):
- genome = self.__get_attribute('ELAND_GENOME')
- # default to the chipwide parameters if there isn't an
- # entry in the lane specific paramaters
- if genome is None:
- subtree = self._gerald.tree.find('ChipWideRunParameters')
- container = subtree.find('ELAND_GENOME')
- genome = container.text
- return genome
- eland_genome = property(_get_eland_genome)
-
- def _get_read_length(self):
- return self.__get_attribute('READ_LENGTH')
- read_length = property(_get_read_length)
-
- def _get_use_bases(self):
- return self.__get_attribute('USE_BASES')
- use_bases = property(_get_use_bases)
-
- class LaneSpecificRunParameters(object):
- """
- Provide access to LaneSpecificRunParameters
- """
- def __init__(self, gerald):
- self._gerald = gerald
- self._keys = None
- def __getitem__(self, key):
- return Gerald.LaneParameters(self._gerald, key)
- def keys(self):
- if self._keys is None:
- tree = self._gerald.tree
- analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
- # according to the pipeline specs I think their fields
- # are sampleName_laneID, with sampleName defaulting to s
- # since laneIDs are constant lets just try using
- # those consistently.
- self._keys = [ x.tag.split('_')[1] for x in analysis]
- return self._keys
- def values(self):
- return [ self[x] for x in self.keys() ]
- def items(self):
- return zip(self.keys(), self.values())
- def __len__(self):
- return len(self.keys())
-
- def __init__(self, xml=None):
- self.pathname = None
- self.tree = None
-
- # parse lane parameters out of the config.xml file
- self.lanes = Gerald.LaneSpecificRunParameters(self)
-
- self.summary = None
- self.eland_results = None
-
- if xml is not None:
- self.set_elements(xml)
-
- def _get_date(self):
- if self.tree is None:
- return datetime.today()
- timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
- epochstamp = time.mktime(time.strptime(timestamp, '%c'))
- return datetime.fromtimestamp(epochstamp)
- date = property(_get_date)
-
- def _get_time(self):
- return time.mktime(self.date.timetuple())
- time = property(_get_time, doc='return run time as seconds since epoch')
-
- def _get_version(self):
- if self.tree is None:
- return None
- return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
- version = property(_get_version)
-
- def dump(self):
- """
- Debugging function, report current object
- """
- print 'Gerald version:', self.version
- print 'Gerald run date:', self.date
- print 'Gerald config.xml:', self.tree
- self.summary.dump()
-
- def get_elements(self):
- if self.tree is None or self.summary is None:
- return None
-
- gerald = ElementTree.Element(Gerald.GERALD,
- {'version': unicode(Gerald.XML_VERSION)})
- gerald.append(self.tree)
- gerald.append(self.summary.get_elements())
- if self.eland_results:
- gerald.append(self.eland_results.get_elements())
- return gerald
-
- def set_elements(self, tree):
- if tree.tag != Gerald.GERALD:
- raise ValueError('exptected GERALD')
- xml_version = int(tree.attrib.get('version', 0))
- if xml_version > Gerald.XML_VERSION:
- logging.warn('XML tree is a higher version than this class')
- for element in list(tree):
- tag = element.tag.lower()
- if tag == Gerald.RUN_PARAMETERS.lower():
- self.tree = element
- elif tag == Gerald.SUMMARY.lower():
- self.summary = Summary(xml=element)
- elif tag == ELAND.ELAND.lower():
- self.eland_results = ELAND(xml=element)
- else:
- logging.warn("Unrecognized tag %s" % (element.tag,))
-
-
-def gerald(pathname):
- g = Gerald()
- g.pathname = pathname
- path, name = os.path.split(pathname)
- config_pathname = os.path.join(pathname, 'config.xml')
- g.tree = ElementTree.parse(config_pathname).getroot()
-
- # parse Summary.htm file
- summary_pathname = os.path.join(pathname, 'Summary.htm')
- g.summary = Summary(summary_pathname)
- # parse eland files
- g.eland_results = eland(g.pathname, g)
- return g
-
-def tonumber(v):
- """
- Convert a value to int if its an int otherwise a float.
- """
- try:
- v = int(v)
- except ValueError, e:
- v = float(v)
- return v
-
-def parse_mean_range(value):
- """
- Parse values like 123 +/- 4.5
- """
- if value.strip() == 'unknown':
- return 0, 0
-
- average, pm, deviation = value.split()
- if pm != '+/-':
- raise RuntimeError("Summary.htm file format changed")
- return tonumber(average), tonumber(deviation)
-
-def make_mean_range_element(parent, name, mean, deviation):
- """
- Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
- """
- element = ElementTree.SubElement(parent, name,
- { 'mean': unicode(mean),
- 'deviation': unicode(deviation)})
- return element
-
-def parse_mean_range_element(element):
- """
- Grab mean/deviation out of element
- """
- return (tonumber(element.attrib['mean']),
- tonumber(element.attrib['deviation']))
-
-def parse_summary_element(element):
- """
- Determine if we have a simple element or a mean/deviation element
- """
- if len(element.attrib) > 0:
- return parse_mean_range_element(element)
- else:
- return element.text
-
-class Summary(object):
- """
- Extract some useful information from the Summary.htm file
- """
- XML_VERSION = 2
- SUMMARY = 'Summary'
-
- class LaneResultSummary(object):
- """
- Parse the LaneResultSummary table out of Summary.htm
- Mostly for the cluster number
- """
- LANE_RESULT_SUMMARY = 'LaneResultSummary'
- TAGS = {
- 'LaneYield': 'lane_yield',
- 'Cluster': 'cluster', # Raw
- 'ClusterPF': 'cluster_pass_filter',
- 'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
- 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
- 'PercentPassFilterClusters': 'percent_pass_filter_clusters',
- 'PercentPassFilterAlign': 'percent_pass_filter_align',
- 'AverageAlignmentScore': 'average_alignment_score',
- 'PercentErrorRate': 'percent_error_rate'
- }
-
- def __init__(self, html=None, xml=None):
- self.lane = None
- self.lane_yield = None
- self.cluster = None
- self.cluster_pass_filter = None
- self.average_first_cycle_intensity = None
- self.percent_intensity_after_20_cycles = None
- self.percent_pass_filter_clusters = None
- self.percent_pass_filter_align = None
- self.average_alignment_score = None
- self.percent_error_rate = None
-
- if html is not None:
- self.set_elements_from_html(html)
- if xml is not None:
- self.set_elements(xml)
-
- def set_elements_from_html(self, data):
- if not len(data) in (8,10):
- raise RuntimeError("Summary.htm file format changed")
-
- # same in pre-0.3.0 Summary file and 0.3 summary file
- self.lane = data[0]
-
- if len(data) == 8:
- parsed_data = [ parse_mean_range(x) for x in data[1:] ]
- # this is the < 0.3 Pipeline version
- self.cluster = parsed_data[0]
- self.average_first_cycle_intensity = parsed_data[1]
- self.percent_intensity_after_20_cycles = parsed_data[2]
- self.percent_pass_filter_clusters = parsed_data[3]
- self.percent_pass_filter_align = parsed_data[4]
- self.average_alignment_score = parsed_data[5]
- self.percent_error_rate = parsed_data[6]
- elif len(data) == 10:
- parsed_data = [ parse_mean_range(x) for x in data[2:] ]
- # this is the >= 0.3 summary file
- self.lane_yield = data[1]
- self.cluster = parsed_data[0]
- self.cluster_pass_filter = parsed_data[1]
- self.average_first_cycle_intensity = parsed_data[2]
- self.percent_intensity_after_20_cycles = parsed_data[3]
- self.percent_pass_filter_clusters = parsed_data[4]
- self.percent_pass_filter_align = parsed_data[5]
- self.average_alignment_score = parsed_data[6]
- self.percent_error_rate = parsed_data[7]
-
- def get_elements(self):
- lane_result = ElementTree.Element(
- Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
- {'lane': self.lane})
- for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
- value = getattr(self, variable_name)
- if value is None:
- continue
- # it looks like a sequence
- elif type(value) in (types.TupleType, types.ListType):
- element = make_mean_range_element(
- lane_result,
- tag,
- *value
- )
- else:
- element = ElementTree.SubElement(lane_result, tag)
- element.text = value
- return lane_result
-
- def set_elements(self, tree):
- if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
- raise ValueError('Expected %s' % (
- Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
- self.lane = tree.attrib['lane']
- tags = Summary.LaneResultSummary.TAGS
- for element in list(tree):
- try:
- variable_name = tags[element.tag]
- setattr(self, variable_name,
- parse_summary_element(element))
- except KeyError, e:
- logging.warn('Unrecognized tag %s' % (element.tag,))
-
- def __init__(self, filename=None, xml=None):
- self.lane_results = {}
-
- if filename is not None:
- self._extract_lane_results(filename)
- if xml is not None:
- self.set_elements(xml)
-
- def __getitem__(self, key):
- return self.lane_results[key]
-
- def __len__(self):
- return len(self.lane_results)
-
- def keys(self):
- return self.lane_results.keys()
-
- def values(self):
- return self.lane_results.values()
-
- def items(self):
- return self.lane_results.items()
-
- def _flattened_row(self, row):
- """
- flatten the children of a <tr>...</tr>
- """
- return [flatten(x) for x in row.getchildren() ]
-
- def _parse_table(self, table):
- """
- assumes the first line is the header of a table,
- and that the remaining rows are data
- """
- rows = table.getchildren()
- data = []
- for r in rows:
- data.append(self._flattened_row(r))
- return data
-
- def _extract_named_tables(self, pathname):
- """
- extract all the 'named' tables from a Summary.htm file
- and return as a dictionary
-
- Named tables are <h2>...</h2><table>...</table> pairs
- The contents of the h2 tag is considered to the name
- of the table.
- """
- tree = ElementTree.parse(pathname).getroot()
- body = tree.find('body')
- tables = {}
- for i in range(len(body)):
- if body[i].tag == 'h2' and body[i+1].tag == 'table':
- # we have an interesting table
- name = flatten(body[i])
- table = body[i+1]
- data = self._parse_table(table)
- tables[name] = data
- return tables
-
- def _extract_lane_results(self, pathname):
- """
- extract the Lane Results Summary table
- """
-
- tables = self._extract_named_tables(pathname)
-
- # parse lane result summary
- lane_summary = tables['Lane Results Summary']
- # this is version 1 of the summary file
- if len(lane_summary[-1]) == 8:
- # strip header
- headers = lane_summary[0]
- # grab the lane by lane data
- lane_summary = lane_summary[1:]
-
- # this is version 2 of the summary file
- if len(lane_summary[-1]) == 10:
- # lane_summary[0] is a different less specific header row
- headers = lane_summary[1]
- lane_summary = lane_summary[2:10]
- # after the last lane, there's a set of chip wide averages
-
- for r in lane_summary:
- lrs = Summary.LaneResultSummary(html=r)
- self.lane_results[lrs.lane] = lrs
-
- def get_elements(self):
- summary = ElementTree.Element(Summary.SUMMARY,
- {'version': unicode(Summary.XML_VERSION)})
- for lane in self.lane_results.values():
- summary.append(lane.get_elements())
- return summary
-
- def set_elements(self, tree):
- if tree.tag != Summary.SUMMARY:
- return ValueError("Expected %s" % (Summary.SUMMARY,))
- xml_version = int(tree.attrib.get('version', 0))
- if xml_version > Summary.XML_VERSION:
- logging.warn('Summary XML tree is a higher version than this class')
- for element in list(tree):
- lrs = Summary.LaneResultSummary()
- lrs.set_elements(element)
- self.lane_results[lrs.lane] = lrs
-
- def dump(self):
- """
- Debugging function, report current object
- """
- pass
-
-
-def build_genome_fasta_map(genome_dir):
- # build fasta to fasta file map
- genome = genome_dir.split(os.path.sep)[-1]
- fasta_map = {}
- for vld_file in glob(os.path.join(genome_dir, '*.vld')):
- is_link = False
- if os.path.islink(vld_file):
- is_link = True
- vld_file = os.path.realpath(vld_file)
- path, vld_name = os.path.split(vld_file)
- name, ext = os.path.splitext(vld_name)
- if is_link:
- fasta_map[name] = name
- else:
- fasta_map[name] = os.path.join(genome, name)
- return fasta_map
-
-class ElandLane(object):
- """
- Process an eland result file
- """
- XML_VERSION = 1
- LANE = 'ElandLane'
- SAMPLE_NAME = 'SampleName'
- LANE_ID = 'LaneID'
- GENOME_MAP = 'GenomeMap'
- GENOME_ITEM = 'GenomeItem'
- MAPPED_READS = 'MappedReads'
- MAPPED_ITEM = 'MappedItem'
- MATCH_CODES = 'MatchCodes'
- MATCH_ITEM = 'Code'
- READS = 'Reads'
-
- def __init__(self, pathname=None, genome_map=None, xml=None):
- self.pathname = pathname
- self._sample_name = None
- self._lane_id = None
- self._reads = None
- self._mapped_reads = None
- self._match_codes = None
- if genome_map is None:
- genome_map = {}
- self.genome_map = genome_map
-
- if xml is not None:
- self.set_elements(xml)
-
- def _update(self):
- """
- Actually read the file and actually count the reads
- """
- # can't do anything if we don't have a file to process
- if self.pathname is None:
- return
-
- if os.stat(self.pathname)[stat.ST_SIZE] == 0:
- raise RuntimeError("Eland isn't done, try again later.")
-
- reads = 0
- mapped_reads = {}
-
- match_codes = {'NM':0, 'QC':0, 'RM':0,
- 'U0':0, 'U1':0, 'U2':0,
- 'R0':0, 'R1':0, 'R2':0,
- }
- for line in autoopen(self.pathname,'r'):
- reads += 1
- fields = line.split()
- # code = fields[2]
- # match_codes[code] = match_codes.setdefault(code, 0) + 1
- # the QC/NM etc codes are in the 3rd field and always present
- match_codes[fields[2]] += 1
- # ignore lines that don't have a fasta filename
- if len(fields) < 7:
- continue
- fasta = self.genome_map.get(fields[6], fields[6])
- mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
- self._match_codes = match_codes
- self._mapped_reads = mapped_reads
- self._reads = reads
-
- def _update_name(self):
- # extract the sample name
- if self.pathname is None:
- return
-
- path, name = os.path.split(self.pathname)
- split_name = name.split('_')
- self._sample_name = split_name[0]
- self._lane_id = split_name[1]
-
- def _get_sample_name(self):
- if self._sample_name is None:
- self._update_name()
- return self._sample_name
- sample_name = property(_get_sample_name)
-
- def _get_lane_id(self):
- if self._lane_id is None:
- self._update_name()
- return self._lane_id
- lane_id = property(_get_lane_id)
-
- def _get_reads(self):
- if self._reads is None:
- self._update()
- return self._reads
- reads = property(_get_reads)
-
- def _get_mapped_reads(self):
- if self._mapped_reads is None:
- self._update()
- return self._mapped_reads
- mapped_reads = property(_get_mapped_reads)
-
- def _get_match_codes(self):
- if self._match_codes is None:
- self._update()
- return self._match_codes
- match_codes = property(_get_match_codes)
-
- def get_elements(self):
- lane = ElementTree.Element(ElandLane.LANE,
- {'version':
- unicode(ElandLane.XML_VERSION)})
- sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
- sample_tag.text = self.sample_name
- lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
- lane_tag.text = self.lane_id
- genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
- for k, v in self.genome_map.items():
- item = ElementTree.SubElement(
- genome_map, ElandLane.GENOME_ITEM,
- {'name':k, 'value':unicode(v)})
- mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
- for k, v in self.mapped_reads.items():
- item = ElementTree.SubElement(
- mapped_reads, ElandLane.MAPPED_ITEM,
- {'name':k, 'value':unicode(v)})
- match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
- for k, v in self.match_codes.items():
- item = ElementTree.SubElement(
- match_codes, ElandLane.MATCH_ITEM,
- {'name':k, 'value':unicode(v)})
- reads = ElementTree.SubElement(lane, ElandLane.READS)
- reads.text = unicode(self.reads)
-
- return lane
-
- def set_elements(self, tree):
- if tree.tag != ElandLane.LANE:
- raise ValueError('Exptecting %s' % (ElandLane.LANE,))
-
- # reset dictionaries
- self._mapped_reads = {}
- self._match_codes = {}
-
- for element in tree:
- tag = element.tag.lower()
- if tag == ElandLane.SAMPLE_NAME.lower():
- self._sample_name = element.text
- elif tag == ElandLane.LANE_ID.lower():
- self._lane_id = element.text
- elif tag == ElandLane.GENOME_MAP.lower():
- for child in element:
- name = child.attrib['name']
- value = child.attrib['value']
- self.genome_map[name] = value
- elif tag == ElandLane.MAPPED_READS.lower():
- for child in element:
- name = child.attrib['name']
- value = child.attrib['value']
- self._mapped_reads[name] = int(value)
- elif tag == ElandLane.MATCH_CODES.lower():
- for child in element:
- name = child.attrib['name']
- value = int(child.attrib['value'])
- self._match_codes[name] = value
- elif tag == ElandLane.READS.lower():
- self._reads = int(element.text)
- else:
- logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
-
-def extract_eland_sequence(instream, outstream, start, end):
- """
- Extract a chunk of sequence out of an eland file
- """
- for line in instream:
- record = line.split()
- if len(record) > 1:
- result = [record[0], record[1][start:end]]
- else:
- result = [record[0][start:end]]
- outstream.write("\t".join(result))
- outstream.write(os.linesep)
-
-class ELAND(object):
- """
- Summarize information from eland files
- """
- XML_VERSION = 1
-
- ELAND = 'ElandCollection'
- LANE = 'Lane'
- LANE_ID = 'id'
-
- def __init__(self, xml=None):
- # we need information from the gerald config.xml
- self.results = {}
-
- if xml is not None:
- self.set_elements(xml)
-
- def __len__(self):
- return len(self.results)
-
- def keys(self):
- return self.results.keys()
-
- def values(self):
- return self.results.values()
-
- def items(self):
- return self.results.items()
-
- def __getitem__(self, key):
- return self.results[key]
-
- def get_elements(self):
- root = ElementTree.Element(ELAND.ELAND,
- {'version': unicode(ELAND.XML_VERSION)})
- for lane_id, lane in self.results.items():
- eland_lane = lane.get_elements()
- eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
- root.append(eland_lane)
- return root
-
- def set_elements(self, tree):
- if tree.tag.lower() != ELAND.ELAND.lower():
- raise ValueError('Expecting %s', ELAND.ELAND)
- for element in list(tree):
- lane_id = element.attrib[ELAND.LANE_ID]
- lane = ElandLane(xml=element)
- self.results[lane_id] = lane
-
-def eland(basedir, gerald=None, genome_maps=None):
- e = ELAND()
-
- file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
- if len(file_list) == 0:
- # lets handle compressed eland files too
- file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
-
- for pathname in file_list:
- # yes the lane_id is also being computed in ElandLane._update
- # I didn't want to clutter up my constructor
- # but I needed to persist the sample_name/lane_id for
- # runfolder summary_report
- path, name = os.path.split(pathname)
- split_name = name.split('_')
- lane_id = split_name[1]
-
- if genome_maps is not None:
- genome_map = genome_maps[lane_id]
- elif gerald is not None:
- genome_dir = gerald.lanes[lane_id].eland_genome
- genome_map = build_genome_fasta_map(genome_dir)
- else:
- genome_map = {}
-
- eland_result = ElandLane(pathname, genome_map)
- e.results[lane_id] = eland_result
- return e
+++ /dev/null
-from xml import sax
-
-
-def get_cycles(recipe_xml_filepath):
- """
- returns the number of cycles found in Recipe*.xml
- """
- handler = CycleXmlHandler()
- sax.parse(recipe_xml_filepath, handler)
- return handler.cycle_count
-
-
-
-class CycleXmlHandler(sax.ContentHandler):
-
- def __init__(self):
- self.cycle_count = 0
- self.in_protocol = False
- sax.ContentHandler.__init__(self)
-
-
- def startDocument(self):
- self.cycle_count = 0
- self.in_protocol = False
-
-
- def startElement(self, name, attrs):
-
- #Only count Incorporations as cycles if within
- # the protocol section of the xml document.
- if name == "Incorporation" and self.in_protocol:
- #print 'Found a cycle!'
- self.cycle_count += 1
- return
-
- elif name == 'Protocol':
- #print 'In protocol'
- self.in_protocol = True
- return
-
- #print 'Skipping: %s' % (name)
-
-
- def endElement(self, name):
-
- if name == 'Protocol':
- #print 'End protocol'
- self.in_protocol = False
+++ /dev/null
-#!/usr/bin/env python
-
-from optparse import OptionParser, IndentedHelpFormatter
-from ConfigParser import SafeConfigParser
-
-import logging
-import os
-import sys
-import urllib2
-
-CONFIG_SYSTEM = '/etc/ga_frontend/ga_frontend.conf'
-CONFIG_USER = os.path.expanduser('~/.ga_frontend.conf')
-
-#Disable or enable commandline arg parsing; disabled by default.
-DISABLE_CMDLINE = True
-
-class FlowCellNotFound(Exception): pass
-class WebError404(Exception): pass
-
-class DummyOptions:
- """
- Used when command line parsing is disabled; default
- """
- def __init__(self):
- self.url = None
- self.output_filepath = None
- self.flowcell = None
- self.genome_dir = None
-
-class PreformattedDescriptionFormatter(IndentedHelpFormatter):
-
- #def format_description(self, description):
- #
- # if description:
- # return description + "\n"
- # else:
- # return ""
-
- def format_epilog(self, epilog):
- """
- It was removing my preformated epilog, so this should override
- that behavior! Muhahaha!
- """
- if epilog:
- return "\n" + epilog + "\n"
- else:
- return ""
-
-
-def constructOptionParser():
- """
- returns a pre-setup optparser
- """
- global DISABLE_CMDLINE
-
- if DISABLE_CMDLINE:
- return None
-
- parser = OptionParser(formatter=PreformattedDescriptionFormatter())
-
- parser.set_description('Retrieves eland config file from ga_frontend web frontend.')
-
- parser.epilog = """
-Config File:
- * %s (System wide)
- * %s (User specific; overrides system)
- * command line overrides all config file options
-
- Example Config File:
-
- [config_file_server]
- base_host_url=http://somewhere.domain:port
-""" % (CONFIG_SYSTEM, CONFIG_USER)
-
- #Special formatter for allowing preformatted description.
- ##parser.format_epilog(PreformattedDescriptionFormatter())
-
- parser.add_option("-u", "--url",
- action="store", type="string", dest="url")
-
- parser.add_option("-o", "--output",
- action="store", type="string", dest="output_filepath")
-
- parser.add_option("-f", "--flowcell",
- action="store", type="string", dest="flowcell")
-
- parser.add_option("-g", "--genome_dir",
- action="store", type="string", dest="genome_dir")
-
- #parser.set_default("url", "default")
-
- return parser
-
-def constructConfigParser():
- """
- returns a pre-setup config parser
- """
- parser = SafeConfigParser()
- parser.read([CONFIG_SYSTEM, CONFIG_USER])
- if not parser.has_section('config_file_server'):
- parser.add_section('config_file_server')
- if not parser.has_section('local_setup'):
- parser.add_section('local_setup')
-
- return parser
-
-
-def getCombinedOptions():
- """
- Returns optparse options after it has be updated with ConfigParser
- config files and merged with parsed commandline options.
- """
- cl_parser = constructOptionParser()
- conf_parser = constructConfigParser()
-
- if cl_parser is None:
- options = DummyOptions()
- else:
- options, args = cl_parser.parse_args()
-
- if options.url is None:
- if conf_parser.has_option('config_file_server', 'base_host_url'):
- options.url = conf_parser.get('config_file_server', 'base_host_url')
-
- if options.genome_dir is None:
- if conf_parser.has_option('local_setup', 'genome_dir'):
- options.genome_dir = conf_parser.get('local_setup', 'genome_dir')
-
- print 'USING OPTIONS:'
- print ' URL:', options.url
- print ' OUT:', options.output_filepath
- print ' FC:', options.flowcell
- print 'GDIR:', options.genome_dir
- print ''
-
- return options
-
-
-def saveConfigFile(flowcell, base_host_url, output_filepath):
- """
- retrieves the flowcell eland config file, give the base_host_url
- (i.e. http://sub.domain.edu:port)
- """
- url = base_host_url + '/eland_config/%s/' % (flowcell)
-
- f = open(output_filepath, 'w')
- #try:
- try:
- web = urllib2.urlopen(url)
- except urllib2.URLError, e:
- errmsg = 'URLError: %d' % (e.code,)
- logging.error(errmsg)
- logging.error('opened %s' % (url,))
- logging.error('%s' % ( e.read(),))
- raise IOError(errmsg)
-
- #except IOError, msg:
- # if str(msg).find("Connection refused") >= 0:
- # print 'Error: Connection refused for: %s' % (url)
- # f.close()
- # sys.exit(1)
- # elif str(msg).find("Name or service not known") >= 0:
- # print 'Error: Invalid domain or ip address for: %s' % (url)
- # f.close()
- # sys.exit(2)
- # else:
- # raise IOError, msg
-
- data = web.read()
-
- if data.find('Hmm, config file for') >= 0:
- msg = "Flowcell (%s) not found in DB; full url(%s)" % (flowcell, url)
- raise FlowCellNotFound, msg
-
- if data.find('404 - Not Found') >= 0:
- msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
- "Did you get right port #?" % (flowcell, base_host_url, url)
- raise FlowCellNotFound, msg
-
- f.write(data)
- web.close()
- f.close()
- logging.info('Wrote config file to %s' % (output_filepath,))
-
-
+++ /dev/null
-import glob
-import re
-import os
-import sys
-import time
-import threading
-
-s_comment = re.compile('^#')
-s_general_read_len = re.compile('^READ_LENGTH ')
-s_read_len = re.compile('^[1-8]+:READ_LENGTH ')
-
-s_firecrest = None
-
-def _four_digit_num_in_string(num):
- if num < 0:
- pass
- elif num < 10:
- return '000' + str(num)
- elif num < 100:
- return '00' + str(num)
- elif num < 1000:
- return '0' + str(num)
- elif num < 10000:
- return str(num)
-
- msg = 'Invalid number: %s' % (num)
- raise ValueError, msg
-
-def _two_digit_num_in_string(num):
- if num < 0:
- pass
- elif num < 10:
- return '0' + str(num)
- elif num < 100:
- return str(num)
-
- msg = 'Invalid number: %s' % (num)
- raise ValueError, msg
-
-
-# FIRECREST PATTERNS
-# _p2f(<pattern>, lane, tile, cycle)
-PATTERN_FIRECREST_QCM = 's_%s_%s_%s_qcm.xml'
-
-# _p2f(<pattern>, lane, tile)
-PATTERN_FIRECREST_INT = 's_%s_%s_02_int.txt'
-PATTERN_FIRECREST_NSE = 's_%s_%s_nse.txt.gz'
-PATTERN_FIRECREST_POS = 's_%s_%s_pos.txt'
-PATTERN_FIRECREST_IDX = 's_%s_%s_idx.txt'
-PATTERN_FIRECREST_CLU1 = 's_%s_%s_01_1_clu.txt'
-PATTERN_FIRECREST_CLU2 = 's_%s_%s_01_2_clu.txt'
-PATTERN_FIRECREST_CLU3 = 's_%s_%s_01_3_clu.txt'
-PATTERN_FIRECREST_CLU4 = 's_%s_%s_01_4_clu.txt'
-
-
-# BUSTARD PATTERNS
-# _p2f(<pattern>, lane, tile)
-PATTERN_BUSTARD_SIG2 = 's_%s_%s_sig2.txt'
-PATTERN_BUSTARD_PRB = 's_%s_%s_prb.txt'
-
-
-
-# GERALD PATTERNS
-# _p2f(<pattern>, lane, tile)
-PATTERN_GERALD_ALLTMP = 's_%s_%s_all.txt.tmp'
-PATTERN_GERALD_QRAWTMP = 's_%s_%s_qraw.txt.tmp'
-PATTERN_GERALD_ALLPNGTMP = 's_%s_%s_all.tmp.png'
-PATTERN_GERALD_ALIGNTMP = 's_%s_%s_align.txt.tmp'
-PATTERN_GERALD_QVALTMP = 's_%s_%s_qval.txt.tmp'
-PATTERN_GERALD_SCORETMP = 's_%s_%s_score.txt.tmp'
-PATTERN_GERALD_PREALIGNTMP = 's_%s_%s_prealign.txt.tmp'
-PATTERN_GERALD_REALIGNTMP = 's_%s_%s_realign.txt.tmp'
-PATTERN_GERALD_RESCORETMP = 's_%s_%s_rescore.txt.tmp'
-PATTERN_GERALD_RESCOREPNG = 's_%s_%s_rescore.png'
-PATTERN_GERALD_ERRORSTMPPNG = 's_%s_%s_errors.tmp.png'
-PATTERN_GERALD_QCALTMP = 's_%s_%s_qcal.txt.tmp'
-PATTERN_GERALD_QVAL = 's_%s_%s_qval.txt'
-
-# _p2f(<pattern>, lane)
-PATTERN_GERALD_SEQPRETMP = 's_%s_seqpre.txt.tmp'
-PATTERN_GERALD_RESULTTMP = 's_%s_eland_result.txt.tmp'
-PATTERN_GERALD_SIGMEANSTMP = 's_%s_Signal_Means.txt.tmp'
-PATTERN_GERALD_CALLPNG = 's_%s_call.png'
-PATTERN_GERALD_ALLPNG = 's_%s_all.png'
-PATTERN_GERALD_PERCENTALLPNG = 's_%s_percent_all.png'
-PATTERN_GERALD_PERCENTCALLPNG = 's_%s_percent_call.png'
-PATTERN_GERALD_PERCENTBASEPNG = 's_%s_percent_base.png'
-PATTERN_GERALD_FILTTMP = 's_%s_filt.txt.tmp'
-PATTERN_GERALD_FRAGTMP = 's_%s_frag.txt.tmp'
-PATTERN_GERALD_QREPORTTMP = 's_%s_qreport.txt.tmp'
-PATTERN_GERALD_QTABLETMP = 's_%s_qtable.txt.tmp'
-PATTERN_GERALD_QCALREPORTTMP = 's_%s_qcalreport.txt.tmp'
-PATTERN_GERALD_SEQUENCETMP = 's_%s_sequence.txt.tmp'
-PATTERN_GERALD_LANEFINISHED = 's_%s_finished.txt'
-
-
-
-def _p2f(pattern, lane, tile=None, cycle=None):
- """
- Converts a pattern plus info into file names
- """
-
- # lane, and cycle provided (INVALID)
- if tile is None and cycle is not None:
- msg = "Handling of cycle without tile is not currently implemented."
- raise ValueError, msg
-
- # lane, tile, cycle provided
- elif cycle:
- return pattern % (lane,
- _four_digit_num_in_string(tile),
- _two_digit_num_in_string(cycle))
-
- # lane, tile provided
- elif tile:
- return pattern % (lane, _four_digit_num_in_string(tile))
-
- # lane provided
- else:
- return pattern % (lane)
-
-
-class GARunStatus(object):
-
- def __init__(self, conf_filepath):
- """
- Given an eland config file in the top level directory
- of a run, predicts the files that will be generated
- during a run and provides methods for retrieving
- (completed, total) for each step or entire run.
- """
- #print 'self._conf_filepath = %s' % (conf_filepath)
- self._conf_filepath = conf_filepath
- self._base_dir, junk = os.path.split(conf_filepath)
- self._image_dir = os.path.join(self._base_dir, 'Images')
-
- self.lanes = []
- self.lane_read_length = {}
- self.tiles = None
- self.cycles = None
-
- self.status = {}
- self.status['firecrest'] = {}
- self.status['bustard'] = {}
- self.status['gerald'] = {}
-
- self._process_config()
- self._count_tiles()
- self._count_cycles()
- self._generate_expected()
-
-
- def _process_config(self):
- """
- Grabs info from self._conf_filepath
- """
- f = open(self._conf_filepath, 'r')
-
- for line in f:
-
- #Skip comment lines for now.
- if s_comment.search(line):
- continue
-
- mo = s_general_read_len.search(line)
- if mo:
- read_length = int(line[mo.end():])
- #Handle general READ_LENGTH
- for i in range(1,9):
- self.lane_read_length[i] = read_length
-
- mo = s_read_len.search(line)
- if mo:
- read_length = int(line[mo.end():])
- lanes, junk = line.split(':')
-
- #Convert lanes from string of lanes to list of lane #s.
- lanes = [ int(i) for i in lanes ]
-
-
- for lane in lanes:
-
- #Keep track of which lanes are being run.
- if lane not in self.lanes:
- self.lanes.append(lane)
-
- #Update with lane specific read lengths
- self.lane_read_length[lane] = read_length
-
- self.lanes.sort()
-
-
- def _count_tiles(self):
- """
- Count the number of tiles being used
- """
- self.tiles = len(glob.glob(os.path.join(self._image_dir,
- 'L001',
- 'C1.1',
- 's_1_*_a.tif')))
-
- def _count_cycles(self):
- """
- Figures out the number of cycles that are available
- """
- #print 'self._image_dir = %s' % (self._image_dir)
- cycle_dirs = glob.glob(os.path.join(self._image_dir, 'L001', 'C*.1'))
- #print 'cycle_dirs = %s' % (cycle_dirs)
- cycle_list = []
- for cycle_dir in cycle_dirs:
- junk, c = os.path.split(cycle_dir)
- cycle_list.append(int(c[1:c.find('.')]))
-
- self.cycles = max(cycle_list)
-
-
-
-
- def _generate_expected(self):
- """
- generates a list of files we expect to find.
- """
-
- firecrest = self.status['firecrest']
- bustard = self.status['bustard']
- gerald = self.status['gerald']
-
-
- for lane in self.lanes:
- for tile in range(1,self.tiles+1):
- for cycle in range(1, self.cycles+1):
-
- ##########################
- # LANE, TILE, CYCLE LAYER
-
- # FIRECREST
- firecrest[_p2f(PATTERN_FIRECREST_QCM, lane, tile, cycle)] = False
-
-
- ###################
- # LANE, TILE LAYER
-
- # FIRECREST
- firecrest[_p2f(PATTERN_FIRECREST_INT, lane, tile)] = False
- firecrest[_p2f(PATTERN_FIRECREST_NSE, lane, tile)] = False
- firecrest[_p2f(PATTERN_FIRECREST_POS, lane, tile)] = False
- firecrest[_p2f(PATTERN_FIRECREST_IDX, lane, tile)] = False
- firecrest[_p2f(PATTERN_FIRECREST_CLU1, lane, tile)] = False
- firecrest[_p2f(PATTERN_FIRECREST_CLU2, lane, tile)] = False
- firecrest[_p2f(PATTERN_FIRECREST_CLU3, lane, tile)] = False
- firecrest[_p2f(PATTERN_FIRECREST_CLU4, lane, tile)] = False
-
-
- # BUSTARD
- bustard[_p2f(PATTERN_BUSTARD_SIG2, lane, tile)] = False
- bustard[_p2f(PATTERN_BUSTARD_PRB, lane, tile)] = False
-
-
- # GERALD
- #gerald[_p2f(PATTERN_GERALD_ALLTMP, lane, tile)] = False
- #gerald[_p2f(PATTERN_GERALD_QRAWTMP, lane, tile)] = False
- #gerald[_p2f(PATTERN_GERALD_ALLPNGTMP, lane, tile)] = False
- #gerald[_p2f(PATTERN_GERALD_ALIGNTMP, lane, tile)] = False
- #gerald[_p2f(PATTERN_GERALD_QVALTMP, lane, tile)] = False
- #gerald[_p2f(PATTERN_GERALD_SCORETMP, lane, tile)] = False
- #gerald[_p2f(PATTERN_GERALD_PREALIGNTMP, lane, tile)] = False
- #gerald[_p2f(PATTERN_GERALD_REALIGNTMP, lane, tile)] = False
- #gerald[_p2f(PATTERN_GERALD_RESCORETMP, lane, tile)] = False
- gerald[_p2f(PATTERN_GERALD_RESCOREPNG, lane, tile)] = False
- #gerald[_p2f(PATTERN_GERALD_ERRORSTMPPNG, lane, tile)] = False
- #gerald[_p2f(PATTERN_GERALD_QCALTMP, lane, tile)] = False
- #gerald[_p2f(PATTERN_GERALD_QVAL, lane, tile)] = False
-
- ###################
- # LANE LAYER
-
- # GERALD
- #gerald[_p2f(PATTERN_GERALD_SEQPRETMP, lane)] = False
- #gerald[_p2f(PATTERN_GERALD_RESULTTMP, lane)] = False
- #gerald[_p2f(PATTERN_GERALD_SIGMEANSTMP, lane)] = False
- gerald[_p2f(PATTERN_GERALD_CALLPNG, lane)] = False
- gerald[_p2f(PATTERN_GERALD_ALLPNG, lane)] = False
- gerald[_p2f(PATTERN_GERALD_PERCENTALLPNG, lane)] = False
- gerald[_p2f(PATTERN_GERALD_PERCENTCALLPNG, lane)] = False
- gerald[_p2f(PATTERN_GERALD_PERCENTBASEPNG, lane)] = False
- #gerald[_p2f(PATTERN_GERALD_FILTTMP, lane)] = False
- #gerald[_p2f(PATTERN_GERALD_FRAGTMP, lane)] = False
- #gerald[_p2f(PATTERN_GERALD_QREPORTTMP, lane)] = False
- #gerald[_p2f(PATTERN_GERALD_QTABLETMP, lane)] = False
- #gerald[_p2f(PATTERN_GERALD_QCALREPORTTMP, lane)] = False
- #gerald[_p2f(PATTERN_GERALD_SEQUENCETMP, lane)] = False
- gerald[_p2f(PATTERN_GERALD_LANEFINISHED, lane)] = False
-
-
-
- #################
- # LOOPS FINISHED
-
- # FIRECREST
- firecrest['offsets_finished.txt'] = False
- firecrest['finished.txt'] = False
-
- # BUSTARD
- bustard['finished.txt'] = False
-
- # GERALD
- gerald['tiles.txt'] = False
- gerald['FullAll.htm'] = False
- #gerald['All.htm.tmp'] = False
- #gerald['Signal_Means.txt.tmp'] = False
- #gerald['plotIntensity_for_IVC'] = False
- #gerald['IVC.htm.tmp'] = False
- gerald['FullError.htm'] = False
- gerald['FullPerfect.htm'] = False
- #gerald['Error.htm.tmp'] = False
- #gerald['Perfect.htm.tmp'] = False
- #gerald['Summary.htm.tmp'] = False
- #gerald['Tile.htm.tmp'] = False
- gerald['finished.txt'] = False
-
- def statusFirecrest(self):
- """
- returns (<completed>, <total>)
- """
- firecrest = self.status['firecrest']
- total = len(firecrest)
- completed = firecrest.values().count(True)
-
- return (completed, total)
-
-
- def statusBustard(self):
- """
- returns (<completed>, <total>)
- """
- bustard = self.status['bustard']
- total = len(bustard)
- completed = bustard.values().count(True)
-
- return (completed, total)
-
-
- def statusGerald(self):
- """
- returns (<completed>, <total>)
- """
- gerald = self.status['gerald']
- total = len(gerald)
- completed = gerald.values().count(True)
-
- return (completed, total)
-
-
- def statusTotal(self):
- """
- returns (<completed>, <total>)
- """
- #f = firecrest c = completed
- #b = bustard t = total
- #g = gerald
- fc, ft = self.statusFirecrest()
- bc, bt = self.statusBustard()
- gc, gt = self.statusGerald()
-
- return (fc+bc+gc, ft+bt+gt)
-
-
- def statusReport(self):
- """
- Generate the basic percent complete report
- """
- def _percentCompleted(completed, total):
- """
- Returns precent completed as float
- """
- return (completed / float(total)) * 100
-
- fc, ft = self.statusFirecrest()
- bc, bt = self.statusBustard()
- gc, gt = self.statusGerald()
- tc, tt = self.statusTotal()
-
- fp = _percentCompleted(fc, ft)
- bp = _percentCompleted(bc, bt)
- gp = _percentCompleted(gc, gt)
- tp = _percentCompleted(tc, tt)
-
- report = ['Firecrest: %s%% (%s/%s)' % (fp, fc, ft),
- ' Bustard: %s%% (%s/%s)' % (bp, bc, bt),
- ' Gerald: %s%% (%s/%s)' % (gp, gc, gt),
- '-----------------------',
- ' Total: %s%% (%s/%s)' % (tp, tc, tt),
- ]
- return report
-
- def updateFirecrest(self, filename):
- """
- Marks firecrest filename as being completed.
- """
- self.status['firecrest'][filename] = True
-
-
- def updateBustard(self, filename):
- """
- Marks bustard filename as being completed.
- """
- self.status['bustard'][filename] = True
-
-
- def updateGerald(self, filename):
- """
- Marks gerald filename as being completed.
- """
- self.status['gerald'][filename] = True
-
-
-
-##################################################
-# Functions to be called by Thread(target=<func>)
-def _cmdLineStatusMonitorFunc(conf_info):
- """
- Given a ConfigInfo object, provides status to stdout.
-
- You should probably use startCmdLineStatusMonitor()
- instead of ths function.
-
- Use with:
- t = threading.Thread(target=_cmdLineStatusMonitorFunc,
- args=[conf_info])
- t.setDaemon(True)
- t.start()
- """
- SLEEP_AMOUNT = 30
-
- while 1:
- if conf_info.status is None:
- print "No status object yet."
- time.sleep(SLEEP_AMOUNT)
- continue
-
- report = conf_info.status.statusReport()
- print os.linesep.join(report)
- print
-
- time.sleep(SLEEP_AMOUNT)
-
-
-#############################################
-# Start monitor thread convenience functions
-def startCmdLineStatusMonitor(conf_info):
- """
- Starts a command line status monitor given a conf_info object.
- """
- t = threading.Thread(target=_cmdLineStatusMonitorFunc, args=[conf_info])
- t.setDaemon(True)
- t.start()
-
-from optparse import OptionParser
-def make_parser():
- usage = "%prog: config file"
-
- parser = OptionParser()
- return parser
-
-def main(cmdline=None):
- parser = make_parser()
- opt, args = parser.parse_args(cmdline)
-
- if len(args) != 1:
- parser.error("need name of configuration file")
-
- status = GARunStatus(args[0])
- print os.linesep.join(status.statusReport())
- return 0
-
-if __name__ == "__main__":
- sys.exit(main(sys.argv[1:]))
-
+++ /dev/null
-"""
-Core information needed to inspect a runfolder.
-"""
-from glob import glob
-import logging
-import os
-import re
-import shutil
-import stat
-import subprocess
-import sys
-import time
-
-try:
- from xml.etree import ElementTree
-except ImportError, e:
- from elementtree import ElementTree
-
-EUROPEAN_STRPTIME = "%d-%m-%Y"
-EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
-VERSION_RE = "([0-9\.]+)"
-USER_RE = "([a-zA-Z0-9]+)"
-LANES_PER_FLOWCELL = 8
-
-from htsworkflow.util.alphanum import alphanum
-from htsworkflow.util.ethelp import indent, flatten
-
-
-class PipelineRun(object):
- """
- Capture "interesting" information about a pipeline run
- """
- XML_VERSION = 1
- PIPELINE_RUN = 'PipelineRun'
- FLOWCELL_ID = 'FlowcellID'
-
- def __init__(self, pathname=None, firecrest=None, bustard=None, gerald=None, xml=None):
- if pathname is not None:
- self.pathname = os.path.normpath(pathname)
- else:
- self.pathname = None
- self._name = None
- self._flowcell_id = None
- self.firecrest = firecrest
- self.bustard = bustard
- self.gerald = gerald
-
- if xml is not None:
- self.set_elements(xml)
-
- def _get_flowcell_id(self):
- # extract flowcell ID
- if self._flowcell_id is None:
- config_dir = os.path.join(self.pathname, 'Config')
- flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
- if os.path.exists(flowcell_id_path):
- flowcell_id_tree = ElementTree.parse(flowcell_id_path)
- self._flowcell_id = flowcell_id_tree.findtext('Text')
- else:
- path_fields = self.pathname.split('_')
- if len(path_fields) > 0:
- # guessing last element of filename
- flowcell_id = path_fields[-1]
- else:
- flowcell_id = 'unknown'
-
- logging.warning(
- "Flowcell id was not found, guessing %s" % (
- flowcell_id))
- self._flowcell_id = flowcell_id
- return self._flowcell_id
- flowcell_id = property(_get_flowcell_id)
-
- def get_elements(self):
- """
- make one master xml file from all of our sub-components.
- """
- root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
- flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
- flowcell.text = self.flowcell_id
- root.append(self.firecrest.get_elements())
- root.append(self.bustard.get_elements())
- root.append(self.gerald.get_elements())
- return root
-
- def set_elements(self, tree):
- # this file gets imported by all the others,
- # so we need to hide the imports to avoid a cyclic imports
- from htsworkflow.pipeline import firecrest
- from htsworkflow.pipeline import bustard
- from htsworkflow.pipeline import gerald
-
- tag = tree.tag.lower()
- if tag != PipelineRun.PIPELINE_RUN.lower():
- raise ValueError('Pipeline Run Expecting %s got %s' % (
- PipelineRun.PIPELINE_RUN, tag))
- for element in tree:
- tag = element.tag.lower()
- if tag == PipelineRun.FLOWCELL_ID.lower():
- self._flowcell_id = element.text
- #ok the xword.Xword.XWORD pattern for module.class.constant is lame
- elif tag == firecrest.Firecrest.FIRECREST.lower():
- self.firecrest = firecrest.Firecrest(xml=element)
- elif tag == bustard.Bustard.BUSTARD.lower():
- self.bustard = bustard.Bustard(xml=element)
- elif tag == gerald.Gerald.GERALD.lower():
- self.gerald = gerald.Gerald(xml=element)
- else:
- logging.warn('PipelineRun unrecognized tag %s' % (tag,))
-
- def _get_run_name(self):
- """
- Given a run tuple, find the latest date and use that as our name
- """
- if self._name is None:
- tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
- timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
- self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
- return self._name
- name = property(_get_run_name)
-
- def save(self, destdir=None):
- if destdir is None:
- destdir = ''
- logging.info("Saving run report "+ self.name)
- xml = self.get_elements()
- indent(xml)
- dest_pathname = os.path.join(destdir, self.name)
- ElementTree.ElementTree(xml).write(dest_pathname)
-
- def load(self, filename):
- logging.info("Loading run report from " + filename)
- tree = ElementTree.parse(filename).getroot()
- self.set_elements(tree)
-
-def get_runs(runfolder):
- """
- Search through a run folder for all the various sub component runs
- and then return a PipelineRun for each different combination.
-
- For example if there are two different GERALD runs, this will
- generate two different PipelineRun objects, that differ
- in there gerald component.
- """
- from htsworkflow.pipeline import firecrest
- from htsworkflow.pipeline import bustard
- from htsworkflow.pipeline import gerald
-
- datadir = os.path.join(runfolder, 'Data')
-
- logging.info('Searching for runs in ' + datadir)
- runs = []
- for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
- f = firecrest.firecrest(firecrest_pathname)
- bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
- for bustard_pathname in glob(bustard_glob):
- b = bustard.bustard(bustard_pathname)
- gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
- for gerald_pathname in glob(gerald_glob):
- try:
- g = gerald.gerald(gerald_pathname)
- runs.append(PipelineRun(runfolder, f, b, g))
- except IOError, e:
- print "Ignoring", str(e)
- return runs
-
-
-def extract_run_parameters(runs):
- """
- Search through runfolder_path for various runs and grab their parameters
- """
- for run in runs:
- run.save()
-
-def summarize_mapped_reads(mapped_reads):
- """
- Summarize per chromosome reads into a genome count
- But handle spike-in/contamination symlinks seperately.
- """
- summarized_reads = {}
- genome_reads = 0
- genome = 'unknown'
- for k, v in mapped_reads.items():
- path, k = os.path.split(k)
- if len(path) > 0:
- genome = path
- genome_reads += v
- else:
- summarized_reads[k] = summarized_reads.setdefault(k, 0) + v
- summarized_reads[genome] = genome_reads
- return summarized_reads
-
-def summary_report(runs):
- """
- Summarize cluster numbers and mapped read counts for a runfolder
- """
- report = []
- for run in runs:
- # print a run name?
- report.append('Summary for %s' % (run.name,))
- # sort the report
- eland_keys = run.gerald.eland_results.results.keys()
- eland_keys.sort(alphanum)
-
- lane_results = run.gerald.summary.lane_results
- for lane_id in eland_keys:
- result = run.gerald.eland_results.results[lane_id]
- report.append("Sample name %s" % (result.sample_name))
- report.append("Lane id %s" % (result.lane_id,))
- cluster = lane_results[result.lane_id].cluster
- report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
- report.append("Total Reads: %d" % (result.reads))
- mc = result._match_codes
- nm = mc['NM']
- nm_percent = float(nm)/result.reads * 100
- qc = mc['QC']
- qc_percent = float(qc)/result.reads * 100
-
- report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent))
- report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent))
- report.append('Unique (0,1,2 mismatches) %d %d %d' % \
- (mc['U0'], mc['U1'], mc['U2']))
- report.append('Repeat (0,1,2 mismatches) %d %d %d' % \
- (mc['R0'], mc['R1'], mc['R2']))
- report.append("Mapped Reads")
- mapped_reads = summarize_mapped_reads(result.mapped_reads)
- for name, counts in mapped_reads.items():
- report.append(" %s: %d" % (name, counts))
- report.append('---')
- report.append('')
- return os.linesep.join(report)
-
-def extract_results(runs, output_base_dir=None):
- if output_base_dir is None:
- output_base_dir = os.getcwd()
-
- for r in runs:
- result_dir = os.path.join(output_base_dir, r.flowcell_id)
- logging.info("Using %s as result directory" % (result_dir,))
- if not os.path.exists(result_dir):
- os.mkdir(result_dir)
-
- # create cycle_dir
- cycle = "C%d-%d" % (r.firecrest.start, r.firecrest.stop)
- logging.info("Filling in %s" % (cycle,))
- cycle_dir = os.path.join(result_dir, cycle)
- if os.path.exists(cycle_dir):
- logging.error("%s already exists, not overwriting" % (cycle_dir,))
- continue
- else:
- os.mkdir(cycle_dir)
-
- # copy stuff out of the main run
- g = r.gerald
-
- # save run file
- r.save(cycle_dir)
-
- # Copy Summary.htm
- summary_path = os.path.join(r.gerald.pathname, 'Summary.htm')
- if os.path.exists(summary_path):
- logging.info('Copying %s to %s' % (summary_path, cycle_dir))
- shutil.copy(summary_path, cycle_dir)
- else:
- logging.info('Summary file %s was not found' % (summary_path,))
-
- # tar score files
- score_files = []
- for f in os.listdir(g.pathname):
- if re.match('.*_score.txt', f):
- score_files.append(f)
-
- tar_cmd = ['/bin/tar', 'c'] + score_files
- bzip_cmd = [ 'bzip2', '-9', '-c' ]
- tar_dest_name =os.path.join(cycle_dir, 'scores.tar.bz2')
- tar_dest = open(tar_dest_name, 'w')
- logging.info("Compressing score files in %s" % (g.pathname,))
- logging.info("Running tar: " + " ".join(tar_cmd[:10]))
- logging.info("Running bzip2: " + " ".join(bzip_cmd))
- logging.info("Writing to %s" %(tar_dest_name))
-
- tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, cwd=g.pathname)
- bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest)
- tar.wait()
-
- # copy & bzip eland files
- for eland_lane in g.eland_results.values():
- source_name = eland_lane.pathname
- path, name = os.path.split(eland_lane.pathname)
- dest_name = os.path.join(cycle_dir, name+'.bz2')
-
- args = ['bzip2', '-9', '-c', source_name]
- logging.info('Running: %s' % ( " ".join(args) ))
- bzip_dest = open(dest_name, 'w')
- bzip = subprocess.Popen(args, stdout=bzip_dest)
- logging.info('Saving to %s' % (dest_name, ))
- bzip.wait()
-
-def clean_runs(runs):
- """
- Clean up run folders to optimize for compression.
- """
- # TODO: implement this.
- # rm RunLog*.xml
- # rm pipeline_*.txt
- # rm gclog.txt
- # rm NetCopy.log
- # rm nfn.log
- # rm Images/L*
- # cd Data/C1-*_Firecrest*
- # make clean_intermediate
-
- pass
+++ /dev/null
-import unittest
-
-from StringIO import StringIO
-from htsworkflow.pipeline import genome_mapper
-
-class testGenomeMapper(unittest.TestCase):
- def test_construct_mapper(self):
- genomes = {
- 'Arabidopsis thaliana': {'v01212004': '/arabidopsis'},
- 'Homo sapiens': {'hg18': '/hg18'},
- 'Mus musculus': {'mm8': '/mm8',
- 'mm9': '/mm9',
- 'mm10': '/mm10'},
- 'Phage': {'174': '/phi'},
- }
- genome_map = genome_mapper.constructMapperDict(genomes)
-
- self.failUnlessEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8")
- self.failUnlessEqual("%(Phage|174)s" % (genome_map), "/phi")
- self.failUnlessEqual("%(Mus musculus)s" % (genome_map), "/mm10")
- self.failUnlessEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8")
- self.failUnlessEqual("%(Mus musculus|mm10)s" % (genome_map), "/mm10")
-
- self.failUnlessEqual(len(genome_map.keys()), 6)
- self.failUnlessEqual(len(genome_map.values()), 6)
- self.failUnlessEqual(len(genome_map.items()), 6)
-
-
-def suite():
- return unittest.makeSuite(testGenomeMapper,'test')
-
-if __name__ == "__main__":
- unittest.main(defaultTest="suite")
+++ /dev/null
-#!/usr/bin/env python
-
-from datetime import datetime, date
-import os
-import tempfile
-import shutil
-import unittest
-
-from htsworkflow.pipeline import firecrest
-from htsworkflow.pipeline import bustard
-from htsworkflow.pipeline import gerald
-from htsworkflow.pipeline import runfolder
-from htsworkflow.pipeline.runfolder import ElementTree
-
-
-def make_flowcell_id(runfolder_dir, flowcell_id=None):
- if flowcell_id is None:
- flowcell_id = '207BTAAXY'
-
- config = """<?xml version="1.0"?>
-<FlowcellId>
- <Text>%s</Text>
-</FlowcellId>""" % (flowcell_id,)
- config_dir = os.path.join(runfolder_dir, 'Config')
-
- if not os.path.exists(config_dir):
- os.mkdir(config_dir)
- pathname = os.path.join(config_dir, 'FlowcellId.xml')
- f = open(pathname,'w')
- f.write(config)
- f.close()
-
-def make_matrix(matrix_dir):
- contents = """# Auto-generated frequency response matrix
-> A
-> C
-> G
-> T
-0.77 0.15 -0.04 -0.04
-0.76 1.02 -0.05 -0.06
--0.10 -0.10 1.17 -0.03
--0.13 -0.12 0.80 1.27
-"""
- s_matrix = os.path.join(matrix_dir, 's_matrix.txt')
- f = open(s_matrix, 'w')
- f.write(contents)
- f.close()
-
-def make_phasing_params(bustard_dir):
- for lane in range(1,9):
- pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane))
- f = open(pathname, 'w')
- f.write("""<Parameters>
- <Phasing>0.009900</Phasing>
- <Prephasing>0.003500</Prephasing>
-</Parameters>
-""")
- f.close()
-
-def make_gerald_config(gerald_dir):
- config_xml = """<RunParameters>
-<ChipWideRunParameters>
- <ANALYSIS>default</ANALYSIS>
- <BAD_LANES></BAD_LANES>
- <BAD_TILES></BAD_TILES>
- <CONTAM_DIR></CONTAM_DIR>
- <CONTAM_FILE></CONTAM_FILE>
- <ELAND_GENOME>Need_to_specify_ELAND_genome_directory</ELAND_GENOME>
- <ELAND_MULTIPLE_INSTANCES>8</ELAND_MULTIPLE_INSTANCES>
- <ELAND_REPEAT></ELAND_REPEAT>
- <EMAIL_DOMAIN>domain.com</EMAIL_DOMAIN>
- <EMAIL_LIST>diane</EMAIL_LIST>
- <EMAIL_SERVER>localhost:25</EMAIL_SERVER>
- <EXPT_DIR>/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane</EXPT_DIR>
- <EXPT_DIR_ROOT>/home/diane/gec</EXPT_DIR_ROOT>
- <FORCE>1</FORCE>
- <GENOME_DIR>/home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes</GENOME_DIR>
- <GENOME_FILE>Need_to_specify_genome_file_name</GENOME_FILE>
- <HAMSTER_FLAG>genome</HAMSTER_FLAG>
- <OUT_DIR>/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane</OUT_DIR>
- <POST_RUN_COMMAND></POST_RUN_COMMAND>
- <PRB_FILE_SUFFIX>_prb.txt</PRB_FILE_SUFFIX>
- <PURE_BASES>12</PURE_BASES>
- <QF_PARAMS>'((CHASTITY>=0.6))'</QF_PARAMS>
- <QHG_FILE_SUFFIX>_qhg.txt</QHG_FILE_SUFFIX>
- <QUALITY_FORMAT>--symbolic</QUALITY_FORMAT>
- <READ_LENGTH>32</READ_LENGTH>
- <SEQUENCE_FORMAT>--scarf</SEQUENCE_FORMAT>
- <SEQ_FILE_SUFFIX>_seq.txt</SEQ_FILE_SUFFIX>
- <SIG_FILE_SUFFIX_DEPHASED>_sig2.txt</SIG_FILE_SUFFIX_DEPHASED>
- <SIG_FILE_SUFFIX_NOT_DEPHASED>_sig.txt</SIG_FILE_SUFFIX_NOT_DEPHASED>
- <SOFTWARE_VERSION>@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp</SOFTWARE_VERSION>
- <TILE_REGEX>s_[1-8]_[0-9][0-9][0-9][0-9]</TILE_REGEX>
- <TILE_ROOT>s</TILE_ROOT>
- <TIME_STAMP>Sat Apr 19 19:08:30 2008</TIME_STAMP>
- <TOOLS_DIR>/home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald</TOOLS_DIR>
- <USE_BASES>all</USE_BASES>
- <WEB_DIR_ROOT>http://host.domain.com/yourshare/</WEB_DIR_ROOT>
-</ChipWideRunParameters>
-<LaneSpecificRunParameters>
- <ANALYSIS>
- <s_1>eland</s_1>
- <s_2>eland</s_2>
- <s_3>eland</s_3>
- <s_4>eland</s_4>
- <s_5>eland</s_5>
- <s_6>eland</s_6>
- <s_7>eland</s_7>
- <s_8>eland</s_8>
- </ANALYSIS>
- <ELAND_GENOME>
- <s_1>/g/dm3</s_1>
- <s_2>/g/equcab1</s_2>
- <s_3>/g/equcab1</s_3>
- <s_4>/g/canfam2</s_4>
- <s_5>/g/hg18</s_5>
- <s_6>/g/hg18</s_6>
- <s_7>/g/hg18</s_7>
- <s_8>/g/hg18</s_8>
- </ELAND_GENOME>
- <READ_LENGTH>
- <s_1>32</s_1>
- <s_2>32</s_2>
- <s_3>32</s_3>
- <s_4>32</s_4>
- <s_5>32</s_5>
- <s_6>32</s_6>
- <s_7>32</s_7>
- <s_8>32</s_8>
- </READ_LENGTH>
- <USE_BASES>
- <s_1>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_1>
- <s_2>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_2>
- <s_3>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_3>
- <s_4>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_4>
- <s_5>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_5>
- <s_6>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_6>
- <s_7>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_7>
- <s_8>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_8>
- </USE_BASES>
-</LaneSpecificRunParameters>
-</RunParameters>
-"""
- pathname = os.path.join(gerald_dir, 'config.xml')
- f = open(pathname,'w')
- f.write(config_xml)
- f.close()
-
-
-def make_summary_htm(gerald_dir):
- summary_htm = """<!--RUN_TIME Mon Apr 21 11:52:25 2008 -->
-<!--SOFTWARE_VERSION @(#) $Id: jerboa.pl,v 1.31 2007/03/05 17:52:15 km Exp $-->
-<html>
-<body>
-
-<a name="Top"><h2><title>080416_HWI-EAS229_0024_207BTAAXX Summary</title></h2></a>
-<h1>Summary Information For Experiment 080416_HWI-EAS229_0024_207BTAAXX on Machine HWI-EAS229</h1>
-<h2><br></br>Chip Summary<br></br></h2>
-<table border="1" cellpadding="5">
-<tr><td>Machine</td><td>HWI-EAS229</td></tr>
-<tr><td>Run Folder</td><td>080416_HWI-EAS229_0024_207BTAAXX</td></tr>
-<tr><td>Chip ID</td><td>unknown</td></tr>
-</table>
-<h2><br></br>Lane Parameter Summary<br></br></h2>
-<table border="1" cellpadding="5">
-<tr>
-<td>Lane</td>
-<td>Sample ID</td>
-<td>Sample Target</td>
-<td>Sample Type</td>
-<td>Length</td>
-<td>Filter</td>
-<td>Tiles</td>
-</tr>
-<tr>
-<td>1</td>
-<td>unknown</td>
-<td>dm3</td>
-<td>ELAND</td>
-<td>32</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td><a href="#Lane1">Lane 1</a></td>
-</tr>
-<tr>
-<td>2</td>
-<td>unknown</td>
-<td>equcab1</td>
-<td>ELAND</td>
-<td>32</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td><a href="#Lane2">Lane 2</a></td>
-</tr>
-<tr>
-<td>3</td>
-<td>unknown</td>
-<td>equcab1</td>
-<td>ELAND</td>
-<td>32</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td><a href="#Lane3">Lane 3</a></td>
-</tr>
-<tr>
-<td>4</td>
-<td>unknown</td>
-<td>canfam2</td>
-<td>ELAND</td>
-<td>32</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td><a href="#Lane4">Lane 4</a></td>
-</tr>
-<tr>
-<td>5</td>
-<td>unknown</td>
-<td>hg18</td>
-<td>ELAND</td>
-<td>32</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td><a href="#Lane5">Lane 5</a></td>
-</tr>
-<tr>
-<td>6</td>
-<td>unknown</td>
-<td>hg18</td>
-<td>ELAND</td>
-<td>32</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td><a href="#Lane6">Lane 6</a></td>
-</tr>
-<tr>
-<td>7</td>
-<td>unknown</td>
-<td>hg18</td>
-<td>ELAND</td>
-<td>32</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td><a href="#Lane7">Lane 7</a></td>
-</tr>
-<tr>
-<td>8</td>
-<td>unknown</td>
-<td>hg18</td>
-<td>ELAND</td>
-<td>32</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td><a href="#Lane8">Lane 8</a></td>
-</tr>
-</table>
-<h2><br></br>Lane Results Summary<br></br></h2>
-<table border="1" cellpadding="5">
-<tr>
-
-<td>Lane </td>
-<td>Clusters </td>
-<td>Av 1st Cycle Int </td>
-<td>% intensity after 20 cycles </td>
-<td>% PF Clusters </td>
-<td>% Align (PF) </td>
-<td>Av Alignment Score (PF) </td>
-<td> % Error Rate (PF) </td>
-</tr>
-<tr>
-<td>1</td>
-<td>17421 +/- 2139</td>
-<td>7230 +/- 801</td>
-<td>23.73 +/- 10.79</td>
-<td>13.00 +/- 22.91</td>
-<td>32.03 +/- 18.45</td>
-<td>6703.57 +/- 3753.85</td>
-<td>4.55 +/- 4.81</td>
-</tr>
-<tr>
-<td>2</td>
-<td>20311 +/- 2402</td>
-<td>7660 +/- 678</td>
-<td>17.03 +/- 4.40</td>
-<td>40.74 +/- 30.33</td>
-<td>29.54 +/- 9.03</td>
-<td>5184.02 +/- 1631.54</td>
-<td>3.27 +/- 3.94</td>
-</tr>
-<tr>
-<td>3</td>
-<td>20193 +/- 2399</td>
-<td>7700 +/- 797</td>
-<td>15.75 +/- 3.30</td>
-<td>56.56 +/- 17.16</td>
-<td>27.33 +/- 7.48</td>
-<td>4803.49 +/- 1313.31</td>
-<td>3.07 +/- 2.86</td>
-</tr>
-<tr>
-<td>4</td>
-<td>15537 +/- 2531</td>
-<td>7620 +/- 1392</td>
-<td>15.37 +/- 3.79</td>
-<td>63.05 +/- 18.30</td>
-<td>15.88 +/- 4.99</td>
-<td>3162.13 +/- 962.59</td>
-<td>3.11 +/- 2.22</td>
-</tr>
-<tr>
-<td>5</td>
-<td>32047 +/- 3356</td>
-<td>8093 +/- 831</td>
-<td>23.79 +/- 6.18</td>
-<td>53.36 +/- 18.06</td>
-<td>48.04 +/- 13.77</td>
-<td>9866.23 +/- 2877.30</td>
-<td>2.26 +/- 1.16</td>
-</tr>
-<tr>
-<td>6</td>
-<td>32946 +/- 4753</td>
-<td>8227 +/- 736</td>
-<td>24.07 +/- 4.69</td>
-<td>54.65 +/- 12.57</td>
-<td>50.98 +/- 10.54</td>
-<td>10468.86 +/- 2228.53</td>
-<td>2.21 +/- 2.33</td>
-</tr>
-<tr>
-<td>7</td>
-<td>39504 +/- 4171</td>
-<td>8401 +/- 785</td>
-<td>22.55 +/- 4.56</td>
-<td>45.22 +/- 10.34</td>
-<td>48.41 +/- 9.67</td>
-<td>9829.40 +/- 1993.20</td>
-<td>2.26 +/- 1.11</td>
-</tr>
-<tr>
-<td>8</td>
-<td>37998 +/- 3792</td>
-<td>8443 +/- 1211</td>
-<td>39.03 +/- 7.52</td>
-<td>42.16 +/- 12.35</td>
-<td>40.98 +/- 14.89</td>
-<td>8128.87 +/- 3055.34</td>
-<td>3.57 +/- 2.77</td>
-</tr>
-</table>
-</body>
-</html>
-"""
- pathname = os.path.join(gerald_dir, 'Summary.htm')
- f = open(pathname, 'w')
- f.write(summary_htm)
- f.close()
-
-def make_eland_results(gerald_dir):
- eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D.
->HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T
->HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0
->HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T
-"""
- for i in range(1,9):
- pathname = os.path.join(gerald_dir,
- 's_%d_eland_result.txt' % (i,))
- f = open(pathname, 'w')
- f.write(eland_result)
- f.close()
-
-class RunfolderTests(unittest.TestCase):
- """
- Test components of the runfolder processing code
- which includes firecrest, bustard, and gerald
- """
- def setUp(self):
- # make a fake runfolder directory
- self.temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_')
-
- self.runfolder_dir = os.path.join(self.temp_dir,
- '080102_HWI-EAS229_0010_207BTAAXX')
- os.mkdir(self.runfolder_dir)
-
- self.data_dir = os.path.join(self.runfolder_dir, 'Data')
- os.mkdir(self.data_dir)
-
- self.firecrest_dir = os.path.join(self.data_dir,
- 'C1-33_Firecrest1.8.28_12-04-2008_diane'
- )
- os.mkdir(self.firecrest_dir)
- self.matrix_dir = os.path.join(self.firecrest_dir, 'Matrix')
- os.mkdir(self.matrix_dir)
- make_matrix(self.matrix_dir)
-
- self.bustard_dir = os.path.join(self.firecrest_dir,
- 'Bustard1.8.28_12-04-2008_diane')
- os.mkdir(self.bustard_dir)
- make_phasing_params(self.bustard_dir)
-
- self.gerald_dir = os.path.join(self.bustard_dir,
- 'GERALD_12-04-2008_diane')
- os.mkdir(self.gerald_dir)
- make_gerald_config(self.gerald_dir)
- make_summary_htm(self.gerald_dir)
- make_eland_results(self.gerald_dir)
-
- def tearDown(self):
- shutil.rmtree(self.temp_dir)
-
- def test_firecrest(self):
- """
- Construct a firecrest object
- """
- f = firecrest.firecrest(self.firecrest_dir)
- self.failUnlessEqual(f.version, '1.8.28')
- self.failUnlessEqual(f.start, 1)
- self.failUnlessEqual(f.stop, 33)
- self.failUnlessEqual(f.user, 'diane')
- self.failUnlessEqual(f.date, date(2008,4,12))
-
- xml = f.get_elements()
- # just make sure that element tree can serialize the tree
- xml_str = ElementTree.tostring(xml)
-
- f2 = firecrest.Firecrest(xml=xml)
- self.failUnlessEqual(f.version, f2.version)
- self.failUnlessEqual(f.start, f2.start)
- self.failUnlessEqual(f.stop, f2.stop)
- self.failUnlessEqual(f.user, f2.user)
- self.failUnlessEqual(f.date, f2.date)
-
- def test_bustard(self):
- """
- construct a bustard object
- """
- b = bustard.bustard(self.bustard_dir)
- self.failUnlessEqual(b.version, '1.8.28')
- self.failUnlessEqual(b.date, date(2008,4,12))
- self.failUnlessEqual(b.user, 'diane')
- self.failUnlessEqual(len(b.phasing), 8)
- self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099)
-
- xml = b.get_elements()
- b2 = bustard.Bustard(xml=xml)
- self.failUnlessEqual(b.version, b2.version)
- self.failUnlessEqual(b.date, b2.date )
- self.failUnlessEqual(b.user, b2.user)
- self.failUnlessEqual(len(b.phasing), len(b2.phasing))
- for key in b.phasing.keys():
- self.failUnlessEqual(b.phasing[key].lane,
- b2.phasing[key].lane)
- self.failUnlessEqual(b.phasing[key].phasing,
- b2.phasing[key].phasing)
- self.failUnlessEqual(b.phasing[key].prephasing,
- b2.phasing[key].prephasing)
-
- def test_gerald(self):
- # need to update gerald and make tests for it
- g = gerald.gerald(self.gerald_dir)
-
- self.failUnlessEqual(g.version,
- '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp')
- self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30))
- self.failUnlessEqual(len(g.lanes), len(g.lanes.keys()))
- self.failUnlessEqual(len(g.lanes), len(g.lanes.items()))
-
-
- # list of genomes, matches what was defined up in
- # make_gerald_config.
- # the first None is to offset the genomes list to be 1..9
- # instead of pythons default 0..8
- genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2',
- '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ]
-
- # test lane specific parameters from gerald config file
- for i in range(1,9):
- cur_lane = g.lanes[str(i)]
- self.failUnlessEqual(cur_lane.analysis, 'eland')
- self.failUnlessEqual(cur_lane.eland_genome, genomes[i])
- self.failUnlessEqual(cur_lane.read_length, '32')
- self.failUnlessEqual(cur_lane.use_bases, 'Y'*32)
-
- # test data extracted from summary file
- clusters = [None,
- (17421, 2139), (20311, 2402), (20193, 2399), (15537, 2531),
- (32047, 3356), (32946, 4753), (39504, 4171), (37998, 3792)]
-
- for i in range(1,9):
- summary_lane = g.summary[str(i)]
- self.failUnlessEqual(summary_lane.cluster, clusters[i])
- self.failUnlessEqual(summary_lane.lane, str(i))
-
- xml = g.get_elements()
- # just make sure that element tree can serialize the tree
- xml_str = ElementTree.tostring(xml)
- g2 = gerald.Gerald(xml=xml)
-
- # do it all again after extracting from the xml file
- self.failUnlessEqual(g.version, g2.version)
- self.failUnlessEqual(g.date, g2.date)
- self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys()))
- self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items()))
-
- # test lane specific parameters from gerald config file
- for i in range(1,9):
- g_lane = g.lanes[str(i)]
- g2_lane = g2.lanes[str(i)]
- self.failUnlessEqual(g_lane.analysis, g2_lane.analysis)
- self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome)
- self.failUnlessEqual(g_lane.read_length, g2_lane.read_length)
- self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases)
-
- # test (some) summary elements
- for i in range(1,9):
- g_summary = g.summary[str(i)]
- g2_summary = g2.summary[str(i)]
- self.failUnlessEqual(g_summary.cluster, g2_summary.cluster)
- self.failUnlessEqual(g_summary.lane, g2_summary.lane)
-
- g_eland = g.eland_results
- g2_eland = g2.eland_results
- for lane in g_eland.keys():
- self.failUnlessEqual(g_eland[lane].reads,
- g2_eland[lane].reads)
- self.failUnlessEqual(len(g_eland[lane].mapped_reads),
- len(g2_eland[lane].mapped_reads))
- for k in g_eland[lane].mapped_reads.keys():
- self.failUnlessEqual(g_eland[lane].mapped_reads[k],
- g2_eland[lane].mapped_reads[k])
-
- self.failUnlessEqual(len(g_eland[lane].match_codes),
- len(g2_eland[lane].match_codes))
- for k in g_eland[lane].match_codes.keys():
- self.failUnlessEqual(g_eland[lane].match_codes[k],
- g2_eland[lane].match_codes[k])
-
-
- def test_eland(self):
- dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa',
- 'chr2L.fa': 'dm3/chr2L.fa',
- 'Lambda.fa': 'Lambda.fa'}
- genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map,
- '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map }
- eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps)
-
- for i in range(1,9):
- lane = eland[str(i)]
- self.failUnlessEqual(lane.reads, 4)
- self.failUnlessEqual(lane.sample_name, "s")
- self.failUnlessEqual(lane.lane_id, unicode(i))
- self.failUnlessEqual(len(lane.mapped_reads), 3)
- self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1)
- self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1)
- self.failUnlessEqual(lane.match_codes['U1'], 2)
- self.failUnlessEqual(lane.match_codes['NM'], 1)
-
- xml = eland.get_elements()
- # just make sure that element tree can serialize the tree
- xml_str = ElementTree.tostring(xml)
- e2 = gerald.ELAND(xml=xml)
-
- for i in range(1,9):
- l1 = eland[str(i)]
- l2 = e2[str(i)]
- self.failUnlessEqual(l1.reads, l2.reads)
- self.failUnlessEqual(l1.sample_name, l2.sample_name)
- self.failUnlessEqual(l1.lane_id, l2.lane_id)
- self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads))
- self.failUnlessEqual(len(l1.mapped_reads), 3)
- for k in l1.mapped_reads.keys():
- self.failUnlessEqual(l1.mapped_reads[k],
- l2.mapped_reads[k])
-
- self.failUnlessEqual(len(l1.match_codes), 9)
- self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes))
- for k in l1.match_codes.keys():
- self.failUnlessEqual(l1.match_codes[k],
- l2.match_codes[k])
-
- def test_runfolder(self):
- runs = runfolder.get_runs(self.runfolder_dir)
-
- # do we get the flowcell id from the filename?
- self.failUnlessEqual(len(runs), 1)
- self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml')
-
- # do we get the flowcell id from the FlowcellId.xml file
- make_flowcell_id(self.runfolder_dir, '207BTAAXY')
- runs = runfolder.get_runs(self.runfolder_dir)
- self.failUnlessEqual(len(runs), 1)
- self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml')
-
- r1 = runs[0]
- xml = r1.get_elements()
- xml_str = ElementTree.tostring(xml)
-
- r2 = runfolder.PipelineRun(xml=xml)
- self.failUnlessEqual(r1.name, r2.name)
- self.failIfEqual(r2.firecrest, None)
- self.failIfEqual(r2.bustard, None)
- self.failIfEqual(r2.gerald, None)
-
-
-def suite():
- return unittest.makeSuite(RunfolderTests,'test')
-
-if __name__ == "__main__":
- unittest.main(defaultTest="suite")
-
+++ /dev/null
-#!/usr/bin/env python
-
-from datetime import datetime, date
-import os
-import tempfile
-import shutil
-import unittest
-
-from htsworkflow.pipeline import firecrest
-from htsworkflow.pipeline import bustard
-from htsworkflow.pipeline import gerald
-from htsworkflow.pipeline import runfolder
-from htsworkflow.pipeline.runfolder import ElementTree
-
-
-def make_flowcell_id(runfolder_dir, flowcell_id=None):
- if flowcell_id is None:
- flowcell_id = '207BTAAXY'
-
- config = """<?xml version="1.0"?>
-<FlowcellId>
- <Text>%s</Text>
-</FlowcellId>""" % (flowcell_id,)
- config_dir = os.path.join(runfolder_dir, 'Config')
-
- if not os.path.exists(config_dir):
- os.mkdir(config_dir)
- pathname = os.path.join(config_dir, 'FlowcellId.xml')
- f = open(pathname,'w')
- f.write(config)
- f.close()
-
-def make_matrix(matrix_dir):
- contents = """# Auto-generated frequency response matrix
-> A
-> C
-> G
-> T
-0.77 0.15 -0.04 -0.04
-0.76 1.02 -0.05 -0.06
--0.10 -0.10 1.17 -0.03
--0.13 -0.12 0.80 1.27
-"""
- s_matrix = os.path.join(matrix_dir, 's_matrix.txt')
- f = open(s_matrix, 'w')
- f.write(contents)
- f.close()
-
-def make_phasing_params(bustard_dir):
- for lane in range(1,9):
- pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane))
- f = open(pathname, 'w')
- f.write("""<Parameters>
- <Phasing>0.009900</Phasing>
- <Prephasing>0.003500</Prephasing>
-</Parameters>
-""")
- f.close()
-
-def make_gerald_config(gerald_dir):
- config_xml = """<RunParameters>
-<ChipWideRunParameters>
- <ANALYSIS>default</ANALYSIS>
- <BAD_LANES></BAD_LANES>
- <BAD_TILES></BAD_TILES>
- <CONTAM_DIR></CONTAM_DIR>
- <CONTAM_FILE></CONTAM_FILE>
- <ELAND_GENOME>Need_to_specify_ELAND_genome_directory</ELAND_GENOME>
- <ELAND_MULTIPLE_INSTANCES>8</ELAND_MULTIPLE_INSTANCES>
- <ELAND_REPEAT></ELAND_REPEAT>
- <EMAIL_DOMAIN>domain.com</EMAIL_DOMAIN>
- <EMAIL_LIST>diane</EMAIL_LIST>
- <EMAIL_SERVER>localhost:25</EMAIL_SERVER>
- <EXPT_DIR>/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane</EXPT_DIR>
- <EXPT_DIR_ROOT>/home/diane/gec</EXPT_DIR_ROOT>
- <FORCE>1</FORCE>
- <GENOME_DIR>/home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes</GENOME_DIR>
- <GENOME_FILE>Need_to_specify_genome_file_name</GENOME_FILE>
- <HAMSTER_FLAG>genome</HAMSTER_FLAG>
- <OUT_DIR>/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane</OUT_DIR>
- <POST_RUN_COMMAND></POST_RUN_COMMAND>
- <PRB_FILE_SUFFIX>_prb.txt</PRB_FILE_SUFFIX>
- <PURE_BASES>12</PURE_BASES>
- <QF_PARAMS>'((CHASTITY>=0.6))'</QF_PARAMS>
- <QHG_FILE_SUFFIX>_qhg.txt</QHG_FILE_SUFFIX>
- <QUALITY_FORMAT>--symbolic</QUALITY_FORMAT>
- <READ_LENGTH>32</READ_LENGTH>
- <SEQUENCE_FORMAT>--scarf</SEQUENCE_FORMAT>
- <SEQ_FILE_SUFFIX>_seq.txt</SEQ_FILE_SUFFIX>
- <SIG_FILE_SUFFIX_DEPHASED>_sig2.txt</SIG_FILE_SUFFIX_DEPHASED>
- <SIG_FILE_SUFFIX_NOT_DEPHASED>_sig.txt</SIG_FILE_SUFFIX_NOT_DEPHASED>
- <SOFTWARE_VERSION>@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp</SOFTWARE_VERSION>
- <TILE_REGEX>s_[1-8]_[0-9][0-9][0-9][0-9]</TILE_REGEX>
- <TILE_ROOT>s</TILE_ROOT>
- <TIME_STAMP>Sat Apr 19 19:08:30 2008</TIME_STAMP>
- <TOOLS_DIR>/home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald</TOOLS_DIR>
- <USE_BASES>all</USE_BASES>
- <WEB_DIR_ROOT>http://host.domain.com/yourshare/</WEB_DIR_ROOT>
-</ChipWideRunParameters>
-<LaneSpecificRunParameters>
- <ANALYSIS>
- <s_1>eland</s_1>
- <s_2>eland</s_2>
- <s_3>eland</s_3>
- <s_4>eland</s_4>
- <s_5>eland</s_5>
- <s_6>eland</s_6>
- <s_7>eland</s_7>
- <s_8>eland</s_8>
- </ANALYSIS>
- <ELAND_GENOME>
- <s_1>/g/dm3</s_1>
- <s_2>/g/equcab1</s_2>
- <s_3>/g/equcab1</s_3>
- <s_4>/g/canfam2</s_4>
- <s_5>/g/hg18</s_5>
- <s_6>/g/hg18</s_6>
- <s_7>/g/hg18</s_7>
- <s_8>/g/hg18</s_8>
- </ELAND_GENOME>
- <READ_LENGTH>
- <s_1>32</s_1>
- <s_2>32</s_2>
- <s_3>32</s_3>
- <s_4>32</s_4>
- <s_5>32</s_5>
- <s_6>32</s_6>
- <s_7>32</s_7>
- <s_8>32</s_8>
- </READ_LENGTH>
- <USE_BASES>
- <s_1>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_1>
- <s_2>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_2>
- <s_3>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_3>
- <s_4>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_4>
- <s_5>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_5>
- <s_6>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_6>
- <s_7>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_7>
- <s_8>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_8>
- </USE_BASES>
-</LaneSpecificRunParameters>
-</RunParameters>
-"""
- pathname = os.path.join(gerald_dir, 'config.xml')
- f = open(pathname,'w')
- f.write(config_xml)
- f.close()
-
-def make_summary_htm(gerald_dir):
- summary_htm="""<!--RUN_TIME Wed Jul 2 06:47:44 2008 -->
-<!--SOFTWARE_VERSION @(#) $Id: jerboa.pl,v 1.94 2007/12/04 09:59:07 rshaw Exp $-->
-<html>
-<body>
-
-<a name="Top"><h2><title>080627_HWI-EAS229_0036_3055HAXX Summary</title></h2></a>
-<h1>Summary Information For Experiment 080627_HWI-EAS229_0036_3055HAXX on Machine HWI-EAS229</h1>
-<h2><br></br>Chip Summary<br></br></h2>
-<table border="1" cellpadding="5">
-<tr><td>Machine</td><td>HWI-EAS229</td></tr>
-<tr><td>Run Folder</td><td>080627_HWI-EAS229_0036_3055HAXX</td></tr>
-<tr><td>Chip ID</td><td>unknown</td></tr>
-</table>
-<h2><br></br>Chip Results Summary<br></br></h2>
-<table border="1" cellpadding="5">
-<tr>
-<td>Clusters</td>
-<td>Clusters (PF)</td>
-<td>Yield (kbases)</td>
-</tr>
-<tr><td>80933224</td>
-<td>43577803</td>
-<td>1133022</td>
-</tr>
-</table>
-<h2><br></br>Lane Parameter Summary<br></br></h2>
-<table border="1" cellpadding="5">
-<tr>
-<td>Lane</td>
-<td>Sample ID</td>
-<td>Sample Target</td>
-<td>Sample Type</td>
-<td>Length</td>
-<td>Filter</td>
-<td>Num Tiles</td>
-<td>Tiles</td>
-</tr>
-<tr>
-<td>1</td>
-<td>unknown</td>
-<td>mm9</td>
-<td>ELAND</td>
-<td>26</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td>100</td>
-<td><a href="#Lane1">Lane 1</a></td>
-</tr>
-<tr>
-<td>2</td>
-<td>unknown</td>
-<td>mm9</td>
-<td>ELAND</td>
-<td>26</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td>100</td>
-<td><a href="#Lane2">Lane 2</a></td>
-</tr>
-<tr>
-<td>3</td>
-<td>unknown</td>
-<td>mm9</td>
-<td>ELAND</td>
-<td>26</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td>100</td>
-<td><a href="#Lane3">Lane 3</a></td>
-</tr>
-<tr>
-<td>4</td>
-<td>unknown</td>
-<td>elegans170</td>
-<td>ELAND</td>
-<td>26</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td>100</td>
-<td><a href="#Lane4">Lane 4</a></td>
-</tr>
-<tr>
-<td>5</td>
-<td>unknown</td>
-<td>elegans170</td>
-<td>ELAND</td>
-<td>26</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td>100</td>
-<td><a href="#Lane5">Lane 5</a></td>
-</tr>
-<tr>
-<td>6</td>
-<td>unknown</td>
-<td>elegans170</td>
-<td>ELAND</td>
-<td>26</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td>100</td>
-<td><a href="#Lane6">Lane 6</a></td>
-</tr>
-<tr>
-<td>7</td>
-<td>unknown</td>
-<td>elegans170</td>
-<td>ELAND</td>
-<td>26</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td>100</td>
-<td><a href="#Lane7">Lane 7</a></td>
-</tr>
-<tr>
-<td>8</td>
-<td>unknown</td>
-<td>elegans170</td>
-<td>ELAND</td>
-<td>26</td>
-<td>'((CHASTITY>=0.6))'</td>
-<td>100</td>
-<td><a href="#Lane8">Lane 8</a></td>
-</tr>
-</table>
-<h2><br></br>Lane Results Summary<br></br></h2>
-<table border="1" cellpadding="5">
-<tr>
-<td colspan="2">Lane Info</td>
-<td colspan="8">Tile Mean +/- SD for Lane</td>
-</tr>
-<tr>
-<td>Lane </td>
-<td>Lane Yield (kbases) </td>
-<td>Clusters (raw)</td>
-<td>Clusters (PF) </td>
-<td>1st Cycle Int (PF) </td>
-<td>% intensity after 20 cycles (PF) </td>
-<td>% PF Clusters </td>
-<td>% Align (PF) </td>
-<td>Alignment Score (PF) </td>
-<td> % Error Rate (PF) </td>
-</tr>
-<tr>
-<td>1</td>
-<td>158046</td>
-<td>96483 +/- 9074</td>
-<td>60787 +/- 4240</td>
-<td>329 +/- 35</td>
-<td>101.88 +/- 6.03</td>
-<td>63.21 +/- 3.29</td>
-<td>70.33 +/- 0.24</td>
-<td>9054.08 +/- 59.16</td>
-<td>0.46 +/- 0.18</td>
-</tr>
-<tr>
-<td>2</td>
-<td>156564</td>
-<td>133738 +/- 7938</td>
-<td>60217 +/- 1926</td>
-<td>444 +/- 39</td>
-<td>92.62 +/- 7.58</td>
-<td>45.20 +/- 3.31</td>
-<td>51.98 +/- 0.74</td>
-<td>6692.04 +/- 92.49</td>
-<td>0.46 +/- 0.09</td>
-</tr>
-<tr>
-<td>3</td>
-<td>185818</td>
-<td>152142 +/- 10002</td>
-<td>71468 +/- 2827</td>
-<td>366 +/- 36</td>
-<td>91.53 +/- 8.66</td>
-<td>47.19 +/- 3.80</td>
-<td>82.24 +/- 0.44</td>
-<td>10598.68 +/- 64.13</td>
-<td>0.41 +/- 0.04</td>
-</tr>
-<tr>
-<td>4</td>
-<td>34953</td>
-<td>15784 +/- 2162</td>
-<td>13443 +/- 1728</td>
-<td>328 +/- 40</td>
-<td>97.53 +/- 9.87</td>
-<td>85.29 +/- 1.91</td>
-<td>80.02 +/- 0.53</td>
-<td>10368.82 +/- 71.08</td>
-<td>0.15 +/- 0.05</td>
-</tr>
-<tr>
-<td>5</td>
-<td>167936</td>
-<td>119735 +/- 8465</td>
-<td>64590 +/- 2529</td>
-<td>417 +/- 37</td>
-<td>88.69 +/- 14.79</td>
-<td>54.10 +/- 2.59</td>
-<td>76.95 +/- 0.32</td>
-<td>9936.47 +/- 65.75</td>
-<td>0.28 +/- 0.02</td>
-</tr>
-<tr>
-<td>6</td>
-<td>173463</td>
-<td>152177 +/- 8146</td>
-<td>66716 +/- 2493</td>
-<td>372 +/- 39</td>
-<td>87.06 +/- 9.86</td>
-<td>43.98 +/- 3.12</td>
-<td>78.80 +/- 0.43</td>
-<td>10162.28 +/- 49.65</td>
-<td>0.38 +/- 0.03</td>
-</tr>
-<tr>
-<td>7</td>
-<td>149287</td>
-<td>84649 +/- 7325</td>
-<td>57418 +/- 3617</td>
-<td>295 +/- 28</td>
-<td>89.40 +/- 8.23</td>
-<td>67.97 +/- 1.82</td>
-<td>33.38 +/- 0.25</td>
-<td>4247.92 +/- 32.37</td>
-<td>1.00 +/- 0.03</td>
-</tr>
-<tr>
-<td>8</td>
-<td>106953</td>
-<td>54622 +/- 4812</td>
-<td>41136 +/- 3309</td>
-<td>284 +/- 37</td>
-<td>90.21 +/- 9.10</td>
-<td>75.39 +/- 2.27</td>
-<td>48.33 +/- 0.29</td>
-<td>6169.21 +/- 169.50</td>
-<td>0.86 +/- 1.22</td>
-</tr>
-<tr><td colspan="13">Tile mean across chip</td></tr>
-<tr>
-<td>Av.</td>
-<td></td>
-<td>101166</td>
-<td>54472</td>
-<td>354</td>
-<td>92.36</td>
-<td>60.29</td>
-<td>65.25</td>
-<td>8403.69</td>
-<td>0.50</td>
-</tr>
-</table>
-<h2><br></br>Expanded Lane Summary<br></br></h2>
-<table border="1" cellpadding="5">
-<tr>
-
-<tr><td colspan="2">Lane Info</td>
-<td colspan="2">Phasing Info</td>
-<td colspan="2">Raw Data (tile mean)</td>
-<td colspan="7">Filtered Data (tile mean)</td></tr>
-<td>Lane </td>
-<td>Clusters (tile mean) (raw)</td>
-<td>% Phasing </td>
-<td>% Prephasing </td>
-<td>% Error Rate (raw) </td>
-<td> Equiv Perfect Clusters (raw) </td>
-<td>% retained </td>
-<td>Cycle 2-4 Av Int (PF) </td>
-<td>Cycle 2-10 Av % Loss (PF) </td>
-<td>Cycle 10-20 Av % Loss (PF) </td>
-<td>% Align (PF) </td>
-<td>% Error Rate (PF) </td>
-<td> Equiv Perfect Clusters (PF) </td>
-</tr>
-<tr>
-<td>1</td>
-<td>96483</td>
-<td>0.7700</td>
-<td>0.3100</td>
-<td>1.00</td>
-<td>49676</td>
-<td>63.21</td>
-<td>317 +/- 32</td>
-<td>0.13 +/- 0.44</td>
-<td>-1.14 +/- 0.34</td>
-<td>70.33</td>
-<td>0.46</td>
-<td>41758</td>
-</tr>
-<tr>
-<td>2</td>
-<td>133738</td>
-<td>0.7700</td>
-<td>0.3100</td>
-<td>1.22</td>
-<td>40467</td>
-<td>45.20</td>
-<td>415 +/- 33</td>
-<td>0.29 +/- 0.40</td>
-<td>-0.79 +/- 0.35</td>
-<td>51.98</td>
-<td>0.46</td>
-<td>30615</td>
-</tr>
-<tr>
-<td>3</td>
-<td>152142</td>
-<td>0.7700</td>
-<td>0.3100</td>
-<td>1.30</td>
-<td>78588</td>
-<td>47.19</td>
-<td>344 +/- 26</td>
-<td>0.68 +/- 0.51</td>
-<td>-0.77 +/- 0.42</td>
-<td>82.24</td>
-<td>0.41</td>
-<td>57552</td>
-</tr>
-<tr>
-<td>4</td>
-<td>15784</td>
-<td>0.7700</td>
-<td>0.3100</td>
-<td>0.29</td>
-<td>11095</td>
-<td>85.29</td>
-<td>306 +/- 34</td>
-<td>0.20 +/- 0.69</td>
-<td>-1.28 +/- 0.66</td>
-<td>80.02</td>
-<td>0.15</td>
-<td>10671</td>
-</tr>
-<tr>
-<td>5</td>
-<td>119735</td>
-<td>0.7700</td>
-<td>0.3100</td>
-<td>0.85</td>
-<td>60335</td>
-<td>54.10</td>
-<td>380 +/- 32</td>
-<td>0.34 +/- 0.49</td>
-<td>-1.55 +/- 4.69</td>
-<td>76.95</td>
-<td>0.28</td>
-<td>49015</td>
-</tr>
-<tr>
-<td>6</td>
-<td>152177</td>
-<td>0.7700</td>
-<td>0.3100</td>
-<td>1.21</td>
-<td>70905</td>
-<td>43.98</td>
-<td>333 +/- 27</td>
-<td>0.57 +/- 0.50</td>
-<td>-0.91 +/- 0.39</td>
-<td>78.80</td>
-<td>0.38</td>
-<td>51663</td>
-</tr>
-<tr>
-<td>7</td>
-<td>84649</td>
-<td>0.7700</td>
-<td>0.3100</td>
-<td>1.38</td>
-<td>21069</td>
-<td>67.97</td>
-<td>272 +/- 20</td>
-<td>1.15 +/- 0.52</td>
-<td>-0.84 +/- 0.58</td>
-<td>33.38</td>
-<td>1.00</td>
-<td>18265</td>
-</tr>
-<tr>
-<td>8</td>
-<td>54622</td>
-<td>0.7700</td>
-<td>0.3100</td>
-<td>1.17</td>
-<td>21335</td>
-<td>75.39</td>
-<td>262 +/- 31</td>
-<td>1.10 +/- 0.59</td>
-<td>-1.01 +/- 0.47</td>
-<td>48.33</td>
-<td>0.86</td>
-<td>19104</td>
-</tr>
-</table>
-<b><br></br>IVC Plots</b>
-<p> <a href='IVC.htm' target="_blank"> IVC.htm
- </a></p>
-<b><br></br>All Intensity Plots</b>
-<p> <a href='All.htm' target="_blank"> All.htm
- </a></p>
-<b><br></br>Error graphs: </b>
-<p> <a href='Error.htm' target="_blank"> Error.htm
- </a></p>
-<td><a href="#Top">Back to top</a></td>
-<a name="Lane1"><h2><br></br>Lane 1<br></br></h2></a>
-<table border="1" cellpadding="5">
-<tr>
-<td>Lane </td>
-<td>Tile </td>
-<td>Clusters (raw)</td>
-<td>Av 1st Cycle Int (PF) </td>
-<td>Av % intensity after 20 cycles (PF) </td>
-<td>% PF Clusters </td>
-<td>% Align (PF) </td>
-<td>Av Alignment Score (PF) </td>
-<td>% Error Rate (PF) </td>
-</tr>
-<tr>
-<td>1</td>
-<td>0001</td>
-<td>114972</td>
-<td>326.48</td>
-<td>94.39</td>
-<td>57.44</td>
-<td>70.2</td>
-<td>9038.6</td>
-<td>0.44</td>
-</tr>
-</table>
-<td><a href="#Top">Back to top</a></td>
-<a name="Lane2"><h2><br></br>Lane 2<br></br></h2></a>
-<table border="1" cellpadding="5">
-<tr>
-<td>Lane </td>
-<td>Tile </td>
-<td>Clusters (raw)</td>
-<td>Av 1st Cycle Int (PF) </td>
-<td>Av % intensity after 20 cycles (PF) </td>
-<td>% PF Clusters </td>
-<td>% Align (PF) </td>
-<td>Av Alignment Score (PF) </td>
-<td>% Error Rate (PF) </td>
-</tr>
-<tr>
-<td>2</td>
-<td>0001</td>
-<td>147793</td>
-<td>448.12</td>
-<td>83.68</td>
-<td>38.57</td>
-<td>53.7</td>
-<td>6905.4</td>
-<td>0.54</td>
-</tr>
-</table>
-<td><a href="#Top">Back to top</a></td>
-<a name="Lane3"><h2><br></br>Lane 3<br></br></h2></a>
-<table border="1" cellpadding="5">
-<tr>
-<td>Lane </td>
-<td>Tile </td>
-<td>Clusters (raw)</td>
-<td>Av 1st Cycle Int (PF) </td>
-<td>Av % intensity after 20 cycles (PF) </td>
-<td>% PF Clusters </td>
-<td>% Align (PF) </td>
-<td>Av Alignment Score (PF) </td>
-<td>% Error Rate (PF) </td>
-</tr>
-<tr>
-<td>3</td>
-<td>0001</td>
-<td>167904</td>
-<td>374.05</td>
-<td>86.91</td>
-<td>40.36</td>
-<td>81.3</td>
-<td>10465.0</td>
-<td>0.47</td>
-</tr>
-</table>
-<td><a href="#Top">Back to top</a></td>
-<a name="Lane4"><h2><br></br>Lane 4<br></br></h2></a>
-<table border="1" cellpadding="5">
-<tr>
-<td>Lane </td>
-<td>Tile </td>
-<td>Clusters (raw)</td>
-<td>Av 1st Cycle Int (PF) </td>
-<td>Av % intensity after 20 cycles (PF) </td>
-<td>% PF Clusters </td>
-<td>% Align (PF) </td>
-<td>Av Alignment Score (PF) </td>
-<td>% Error Rate (PF) </td>
-</tr>
-<tr>
-<td>4</td>
-<td>0001</td>
-<td>20308</td>
-<td>276.85</td>
-<td>92.87</td>
-<td>84.26</td>
-<td>80.4</td>
-<td>10413.8</td>
-<td>0.16</td>
-</tr>
-</table>
-<td><a href="#Top">Back to top</a></td>
-<a name="Lane5"><h2><br></br>Lane 5<br></br></h2></a>
-<table border="1" cellpadding="5">
-<tr>
-<td>Lane </td>
-<td>Tile </td>
-<td>Clusters (raw)</td>
-<td>Av 1st Cycle Int (PF) </td>
-<td>Av % intensity after 20 cycles (PF) </td>
-<td>% PF Clusters </td>
-<td>% Align (PF) </td>
-<td>Av Alignment Score (PF) </td>
-<td>% Error Rate (PF) </td>
-</tr>
-</table>
-<td><a href="#Top">Back to top</a></td>
-<a name="Lane6"><h2><br></br>Lane 6<br></br></h2></a>
-<table border="1" cellpadding="5">
-<tr>
-<td>Lane </td>
-<td>Tile </td>
-<td>Clusters (raw)</td>
-<td>Av 1st Cycle Int (PF) </td>
-<td>Av % intensity after 20 cycles (PF) </td>
-<td>% PF Clusters </td>
-<td>% Align (PF) </td>
-<td>Av Alignment Score (PF) </td>
-<td>% Error Rate (PF) </td>
-</tr>
-<tr>
-<td>6</td>
-<td>0001</td>
-<td>166844</td>
-<td>348.12</td>
-<td>77.59</td>
-<td>38.13</td>
-<td>79.7</td>
-<td>10264.4</td>
-<td>0.44</td>
-</tr>
-</table>
-<td><a href="#Top">Back to top</a></td>
-<a name="Lane7"><h2><br></br>Lane 7<br></br></h2></a>
-<table border="1" cellpadding="5">
-<tr>
-<td>Lane </td>
-<td>Tile </td>
-<td>Clusters (raw)</td>
-<td>Av 1st Cycle Int (PF) </td>
-<td>Av % intensity after 20 cycles (PF) </td>
-<td>% PF Clusters </td>
-<td>% Align (PF) </td>
-<td>Av Alignment Score (PF) </td>
-<td>% Error Rate (PF) </td>
-</tr>
-<tr>
-<td>7</td>
-<td>0001</td>
-<td>98913</td>
-<td>269.90</td>
-<td>86.66</td>
-<td>64.55</td>
-<td>33.2</td>
-<td>4217.5</td>
-<td>1.02</td>
-</tr>
-</table>
-<td><a href="#Top">Back to top</a></td>
-<a name="Lane8"><h2><br></br>Lane 8<br></br></h2></a>
-<table border="1" cellpadding="5">
-<tr>
-<td>Lane </td>
-<td>Tile </td>
-<td>Clusters (raw)</td>
-<td>Av 1st Cycle Int (PF) </td>
-<td>Av % intensity after 20 cycles (PF) </td>
-<td>% PF Clusters </td>
-<td>% Align (PF) </td>
-<td>Av Alignment Score (PF) </td>
-<td>% Error Rate (PF) </td>
-</tr>
-<tr>
-<td>8</td>
-<td>0001</td>
-<td>64972</td>
-<td>243.60</td>
-<td>89.40</td>
-<td>73.17</td>
-<td>48.3</td>
-<td>6182.8</td>
-<td>0.71</td>
-</tr>
-</table>
-<td><a href="#Top">Back to top</a></td>
-</body>
-</html>
-"""
- pathname = os.path.join(gerald_dir, 'Summary.htm')
- f = open(pathname, 'w')
- f.write(summary_htm)
- f.close()
-
-def make_eland_results(gerald_dir):
- eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D.
->HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T
->HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0
->HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T
-"""
- for i in range(1,9):
- pathname = os.path.join(gerald_dir,
- 's_%d_eland_result.txt' % (i,))
- f = open(pathname, 'w')
- f.write(eland_result)
- f.close()
-
-def make_runfolder(obj=None):
- """
- Make a fake runfolder, attach all the directories to obj if defined
- """
- # make a fake runfolder directory
- temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_')
-
- runfolder_dir = os.path.join(temp_dir,
- '080102_HWI-EAS229_0010_207BTAAXX')
- os.mkdir(runfolder_dir)
-
- data_dir = os.path.join(runfolder_dir, 'Data')
- os.mkdir(data_dir)
-
- firecrest_dir = os.path.join(data_dir,
- 'C1-33_Firecrest1.8.28_12-04-2008_diane'
- )
- os.mkdir(firecrest_dir)
- matrix_dir = os.path.join(firecrest_dir, 'Matrix')
- os.mkdir(matrix_dir)
- make_matrix(matrix_dir)
-
- bustard_dir = os.path.join(firecrest_dir,
- 'Bustard1.8.28_12-04-2008_diane')
- os.mkdir(bustard_dir)
- make_phasing_params(bustard_dir)
-
- gerald_dir = os.path.join(bustard_dir,
- 'GERALD_12-04-2008_diane')
- os.mkdir(gerald_dir)
- make_gerald_config(gerald_dir)
- make_summary_htm(gerald_dir)
- make_eland_results(gerald_dir)
-
- if obj is not None:
- obj.temp_dir = temp_dir
- obj.runfolder_dir = runfolder_dir
- obj.data_dir = data_dir
- obj.firecrest_dir = firecrest_dir
- obj.matrix_dir = matrix_dir
- obj.bustard_dir = bustard_dir
- obj.gerald_dir = gerald_dir
-
-
-class RunfolderTests(unittest.TestCase):
- """
- Test components of the runfolder processing code
- which includes firecrest, bustard, and gerald
- """
- def setUp(self):
- # attaches all the directories to the object passed in
- make_runfolder(self)
-
- def tearDown(self):
- shutil.rmtree(self.temp_dir)
-
- def test_firecrest(self):
- """
- Construct a firecrest object
- """
- f = firecrest.firecrest(self.firecrest_dir)
- self.failUnlessEqual(f.version, '1.8.28')
- self.failUnlessEqual(f.start, 1)
- self.failUnlessEqual(f.stop, 33)
- self.failUnlessEqual(f.user, 'diane')
- self.failUnlessEqual(f.date, date(2008,4,12))
-
- xml = f.get_elements()
- # just make sure that element tree can serialize the tree
- xml_str = ElementTree.tostring(xml)
-
- f2 = firecrest.Firecrest(xml=xml)
- self.failUnlessEqual(f.version, f2.version)
- self.failUnlessEqual(f.start, f2.start)
- self.failUnlessEqual(f.stop, f2.stop)
- self.failUnlessEqual(f.user, f2.user)
- self.failUnlessEqual(f.date, f2.date)
-
- def test_bustard(self):
- """
- construct a bustard object
- """
- b = bustard.bustard(self.bustard_dir)
- self.failUnlessEqual(b.version, '1.8.28')
- self.failUnlessEqual(b.date, date(2008,4,12))
- self.failUnlessEqual(b.user, 'diane')
- self.failUnlessEqual(len(b.phasing), 8)
- self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099)
-
- xml = b.get_elements()
- b2 = bustard.Bustard(xml=xml)
- self.failUnlessEqual(b.version, b2.version)
- self.failUnlessEqual(b.date, b2.date )
- self.failUnlessEqual(b.user, b2.user)
- self.failUnlessEqual(len(b.phasing), len(b2.phasing))
- for key in b.phasing.keys():
- self.failUnlessEqual(b.phasing[key].lane,
- b2.phasing[key].lane)
- self.failUnlessEqual(b.phasing[key].phasing,
- b2.phasing[key].phasing)
- self.failUnlessEqual(b.phasing[key].prephasing,
- b2.phasing[key].prephasing)
-
- def test_gerald(self):
- # need to update gerald and make tests for it
- g = gerald.gerald(self.gerald_dir)
-
- self.failUnlessEqual(g.version,
- '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp')
- self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30))
- self.failUnlessEqual(len(g.lanes), len(g.lanes.keys()))
- self.failUnlessEqual(len(g.lanes), len(g.lanes.items()))
-
-
- # list of genomes, matches what was defined up in
- # make_gerald_config.
- # the first None is to offset the genomes list to be 1..9
- # instead of pythons default 0..8
- genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2',
- '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ]
-
- # test lane specific parameters from gerald config file
- for i in range(1,9):
- cur_lane = g.lanes[str(i)]
- self.failUnlessEqual(cur_lane.analysis, 'eland')
- self.failUnlessEqual(cur_lane.eland_genome, genomes[i])
- self.failUnlessEqual(cur_lane.read_length, '32')
- self.failUnlessEqual(cur_lane.use_bases, 'Y'*32)
-
- # test data extracted from summary file
- clusters = [None,
- (96483, 9074), (133738, 7938),
- (152142, 10002), (15784, 2162),
- (119735, 8465), (152177, 8146),
- (84649, 7325), (54622, 4812),]
-
- for i in range(1,9):
- summary_lane = g.summary[str(i)]
- self.failUnlessEqual(summary_lane.cluster, clusters[i])
- self.failUnlessEqual(summary_lane.lane, str(i))
-
- xml = g.get_elements()
- # just make sure that element tree can serialize the tree
- xml_str = ElementTree.tostring(xml)
- g2 = gerald.Gerald(xml=xml)
-
- # do it all again after extracting from the xml file
- self.failUnlessEqual(g.version, g2.version)
- self.failUnlessEqual(g.date, g2.date)
- self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys()))
- self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items()))
-
- # test lane specific parameters from gerald config file
- for i in range(1,9):
- g_lane = g.lanes[str(i)]
- g2_lane = g2.lanes[str(i)]
- self.failUnlessEqual(g_lane.analysis, g2_lane.analysis)
- self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome)
- self.failUnlessEqual(g_lane.read_length, g2_lane.read_length)
- self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases)
-
- # test (some) summary elements
- for i in range(1,9):
- g_summary = g.summary[str(i)]
- g2_summary = g2.summary[str(i)]
- self.failUnlessEqual(g_summary.cluster, g2_summary.cluster)
- self.failUnlessEqual(g_summary.lane, g2_summary.lane)
-
- g_eland = g.eland_results
- g2_eland = g2.eland_results
- for lane in g_eland.keys():
- self.failUnlessEqual(g_eland[lane].reads,
- g2_eland[lane].reads)
- self.failUnlessEqual(len(g_eland[lane].mapped_reads),
- len(g2_eland[lane].mapped_reads))
- for k in g_eland[lane].mapped_reads.keys():
- self.failUnlessEqual(g_eland[lane].mapped_reads[k],
- g2_eland[lane].mapped_reads[k])
-
- self.failUnlessEqual(len(g_eland[lane].match_codes),
- len(g2_eland[lane].match_codes))
- for k in g_eland[lane].match_codes.keys():
- self.failUnlessEqual(g_eland[lane].match_codes[k],
- g2_eland[lane].match_codes[k])
-
-
- def test_eland(self):
- dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa',
- 'chr2L.fa': 'dm3/chr2L.fa',
- 'Lambda.fa': 'Lambda.fa'}
- genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map,
- '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map }
- eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps)
-
- for i in range(1,9):
- lane = eland[str(i)]
- self.failUnlessEqual(lane.reads, 4)
- self.failUnlessEqual(lane.sample_name, "s")
- self.failUnlessEqual(lane.lane_id, unicode(i))
- self.failUnlessEqual(len(lane.mapped_reads), 3)
- self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1)
- self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1)
- self.failUnlessEqual(lane.match_codes['U1'], 2)
- self.failUnlessEqual(lane.match_codes['NM'], 1)
-
- xml = eland.get_elements()
- # just make sure that element tree can serialize the tree
- xml_str = ElementTree.tostring(xml)
- e2 = gerald.ELAND(xml=xml)
-
- for i in range(1,9):
- l1 = eland[str(i)]
- l2 = e2[str(i)]
- self.failUnlessEqual(l1.reads, l2.reads)
- self.failUnlessEqual(l1.sample_name, l2.sample_name)
- self.failUnlessEqual(l1.lane_id, l2.lane_id)
- self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads))
- self.failUnlessEqual(len(l1.mapped_reads), 3)
- for k in l1.mapped_reads.keys():
- self.failUnlessEqual(l1.mapped_reads[k],
- l2.mapped_reads[k])
-
- self.failUnlessEqual(len(l1.match_codes), 9)
- self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes))
- for k in l1.match_codes.keys():
- self.failUnlessEqual(l1.match_codes[k],
- l2.match_codes[k])
-
- def test_runfolder(self):
- runs = runfolder.get_runs(self.runfolder_dir)
-
- # do we get the flowcell id from the filename?
- self.failUnlessEqual(len(runs), 1)
- self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml')
-
- # do we get the flowcell id from the FlowcellId.xml file
- make_flowcell_id(self.runfolder_dir, '207BTAAXY')
- runs = runfolder.get_runs(self.runfolder_dir)
- self.failUnlessEqual(len(runs), 1)
- self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml')
-
- r1 = runs[0]
- xml = r1.get_elements()
- xml_str = ElementTree.tostring(xml)
-
- r2 = runfolder.PipelineRun(xml=xml)
- self.failUnlessEqual(r1.name, r2.name)
- self.failIfEqual(r2.firecrest, None)
- self.failIfEqual(r2.bustard, None)
- self.failIfEqual(r2.gerald, None)
-
-
-def suite():
- return unittest.makeSuite(RunfolderTests,'test')
-
-if __name__ == "__main__":
- unittest.main(defaultTest="suite")
-
--- /dev/null
+"""
+Provide code to interact with the vendor tools to produce useable "raw" data.
+
+The illumina sub-package contains components to interact with the Illumina-provided
+GAPipeline.
+"""
--- /dev/null
+
+from datetime import date
+from glob import glob
+import logging
+import os
+import time
+import re
+
+from htsworkflow.pipelines.runfolder import \
+ ElementTree, \
+ VERSION_RE, \
+ EUROPEAN_STRPTIME
+
+class Phasing(object):  # Per-lane phasing/prephasing values, loadable from a params file or an XML element
+ PHASING = 'Phasing'  # tag of the phasing child; also the root tag written by get_elements()
+ PREPHASING = 'Prephasing'  # tag of the prephasing child
+
+ def __init__(self, fromfile=None, xml=None):  # fromfile: path to parse; xml: element to load; fromfile takes precedence when both are given
+ self.lane = None
+ self.phasing = None
+ self.prephasing = None
+
+ if fromfile is not None:
+ self._initialize_from_file(fromfile)
+ elif xml is not None:
+ self.set_elements(xml)
+
+ def _initialize_from_file(self, pathname):  # Parse pathname as XML, then take the lane number from the file name
+ path, name = os.path.split(pathname)
+ basename, ext = os.path.splitext(name)
+ # the last character of the param base filename should be the
+ # lane number
+ tree = ElementTree.parse(pathname).getroot()
+ self.set_elements(tree)
+ self.lane = int(basename[-1])  # runs after set_elements(), so it overrides any 'lane' attribute; only one digit is read
+
+ def get_elements(self):  # Return a 'Phasing' element with a 'lane' attribute and Phasing/Prephasing children
+ root = ElementTree.Element(Phasing.PHASING, {'lane': str(self.lane)})
+ phasing = ElementTree.SubElement(root, Phasing.PHASING)
+ phasing.text = str(self.phasing)  # str() of the stored value; an unset value is written as 'None'
+ prephasing = ElementTree.SubElement(root, Phasing.PREPHASING)
+ prephasing.text = str(self.prephasing)
+ return root
+
+ def set_elements(self, tree):  # Load lane/phasing/prephasing from a 'Phasing' or 'Parameters' element; ValueError on any other root tag
+ if tree.tag not in ('Phasing', 'Parameters'):
+ raise ValueError('exptected Phasing or Parameters')  # NOTE(review): message contains a typo ("exptected")
+ lane = tree.attrib.get('lane', None)  # 'lane' attribute is optional
+ if lane is not None:
+ self.lane = int(lane)
+ for element in list(tree):  # direct children; tags other than Phasing/Prephasing are ignored
+ if element.tag == Phasing.PHASING:
+ self.phasing = float(element.text)
+ elif element.tag == Phasing.PREPHASING:
+ self.prephasing = float(element.text)
+
+class Bustard(object):  # Bustard run metadata (version, date, user, per-lane Phasing) with XML save/load
+ XML_VERSION = 1  # written as the root 'version' attribute by get_elements() and compared on load
+
+ # Xml Tags
+ BUSTARD = 'Bustard'
+ SOFTWARE_VERSION = 'version'
+ DATE = 'run_time'
+ USER = 'user'
+ PARAMETERS = 'Parameters'
+
+ def __init__(self, xml=None):  # xml: optional 'Bustard' element passed to set_elements()
+ self.version = None
+ self.date = date.today()  # default until replaced by the caller or by set_elements()
+ self.user = None
+ self.phasing = {}  # Phasing objects keyed by their lane value
+
+ if xml is not None:
+ self.set_elements(xml)
+
+ def _get_time(self):  # Convert self.date to seconds since the epoch with time.mktime (local time)
+ return time.mktime(self.date.timetuple())
+ time = property(_get_time, doc='return run time as seconds since epoch')  # class attribute only; method bodies still resolve 'time' to the module
+
+ def dump(self):  # Print version, run date, user and every lane's Phasing object to stdout (Python 2 print statements)
+ print "Bustard version:", self.version
+ print "Run date", self.date
+ print "user:", self.user
+ for lane, tree in self.phasing.items():
+ print lane
+ print tree
+
+ def get_elements(self):  # Return a 'Bustard' element holding version, run_time, user and a Parameters child
+ root = ElementTree.Element('Bustard',
+ {'version': str(Bustard.XML_VERSION)})
+ version = ElementTree.SubElement(root, Bustard.SOFTWARE_VERSION)
+ version.text = self.version
+ run_date = ElementTree.SubElement(root, Bustard.DATE)
+ run_date.text = str(self.time)  # epoch seconds from the time property, stored as text
+ user = ElementTree.SubElement(root, Bustard.USER)
+ user.text = self.user
+ params = ElementTree.SubElement(root, Bustard.PARAMETERS)
+ for p in self.phasing.values():
+ params.append(p.get_elements())  # one 'Phasing' element per stored lane
+ return root
+
+ def set_elements(self, tree):  # Load fields from a 'Bustard' element; ValueError when the root tag is not 'Bustard'
+ if tree.tag != Bustard.BUSTARD:
+ raise ValueError('Expected "Bustard" SubElements')
+ xml_version = int(tree.attrib.get('version', 0))  # a missing attribute is treated as version 0
+ if xml_version > Bustard.XML_VERSION:
+ logging.warn('Bustard XML tree is a higher version than this class')  # warning only; nothing is raised here
+ for element in list(tree):
+ if element.tag == Bustard.SOFTWARE_VERSION:
+ self.version = element.text
+ elif element.tag == Bustard.DATE:
+ self.date = date.fromtimestamp(float(element.text))  # reverses the str(self.time) written by get_elements()
+ elif element.tag == Bustard.USER:
+ self.user = element.text
+ elif element.tag == Bustard.PARAMETERS:
+ for param in element:
+ p = Phasing(xml=param)
+ self.phasing[p.lane] = p  # lane comes from the child's 'lane' attribute (None when absent)
+ else:
+ raise ValueError("Unrecognized tag: %s" % (element.tag,))
+
+
+
+def bustard(pathname):  # pathname: a Bustard output directory; its final path component encodes version, date and user
+ """
+ Construct a Bustard object from pathname
+ """
+ b = Bustard()
+ path, name = os.path.split(pathname)
+ groups = name.split("_")  # fields are read positionally: [0] version, [1] date, [2] user
+ version = re.search(VERSION_RE, groups[0])  # NOTE(review): re.search returns None on no match, which would make the next line raise AttributeError
+ b.version = version.group(1)
+ t = time.strptime(groups[1], EUROPEAN_STRPTIME)  # format string is imported from runfolder; presumably day-month-year — confirm
+ b.date = date(*t[0:3])  # first three struct_time fields: year, month, day
+ b.user = groups[2]
+ paramfiles = glob(os.path.join(pathname, "params?.xml"))  # '?' matches exactly one character
+ for paramfile in paramfiles:
+ phasing = Phasing(paramfile)  # positional argument binds to fromfile
+ assert (phasing.lane >= 1 and phasing.lane <= 8)  # NOTE(review): assert is skipped under python -O, so this range check is not guaranteed
+ b.phasing[phasing.lane] = phasing
+ return b
+
+def fromxml(tree):  # Build a Bustard from a parsed 'Bustard' element; same result as Bustard(xml=tree) for a non-None tree
+ b = Bustard()
+ b.set_elements(tree)  # raises ValueError if tree is not a 'Bustard' element or has an unrecognized child tag
+ return b
--- /dev/null
+#!/usr/bin/python
+import subprocess
+import logging
+import time
+import re
+import os
+
+from htsworkflow.pipelines.retrieve_config import getCombinedOptions, saveConfigFile
+from htsworkflow.pipelines.retrieve_config import FlowCellNotFound, WebError404
+from htsworkflow.pipelines.genome_mapper import DuplicateGenome, getAvailableGenomes, constructMapperDict
+from htsworkflow.pipelines.run_status import GARunStatus
+
+from pyinotify import WatchManager, ThreadedNotifier
+from pyinotify import EventsCodes, ProcessEvent
+
+class ConfigInfo:  # Plain holder for the paths and status object of one pipeline run; every field starts as None
+
+ def __init__(self):
+ #run_path = firecrest analysis directory to run analysis from
+ self.run_path = None
+ self.bustard_path = None
+ self.config_filepath = None  # must be set before createStatusObject() can succeed
+ self.status = None  # replaced by a GARunStatus in createStatusObject()
+
+ #top level directory where all analyses are placed
+ self.base_analysis_dir = None
+ #analysis_dir, top level analysis dir...
+ # base_analysis_dir + '/070924_USI-EAS44_0022_FC12150'
+ self.analysis_dir = None
+
+
+ def createStatusObject(self):  # Build self.status from self.config_filepath; returns a bool instead of raising when the path is unset
+ """
+ Creates a status object which can be queried for
+ status of running the pipeline
+
+ returns True if object created
+ returns False if object cannot be created
+ """
+ if self.config_filepath is None:
+ return False
+
+ self.status = GARunStatus(self.config_filepath)  # any exception raised by GARunStatus propagates to the caller
+ return True
+
+
+
+####################################
+# inotify event processor
+
+s_firecrest_finished = re.compile('Firecrest[0-9\._\-A-Za-z]+/finished.txt')  # finished marker directly inside a Firecrest* directory; NOTE(review): the '.' in 'finished.txt' is unescaped and matches any character
+s_bustard_finished = re.compile('Bustard[0-9\._\-A-Za-z]+/finished.txt')  # same pattern for a Bustard* directory
+s_gerald_finished = re.compile('GERALD[0-9\._\-A-Za-z]+/finished.txt')  # same pattern for a GERALD* directory
+
+s_gerald_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/Bustard[0-9\._\-A-Za-z]+/GERALD[0-9\._\-A-Za-z]+/')  # any path below Firecrest*/Bustard*/GERALD*/
+s_bustard_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/Bustard[0-9\._\-A-Za-z]+/')  # any path below Firecrest*/Bustard*/ (GERALD paths match too)
+s_firecrest_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/')  # any path below Firecrest*/ (Bustard and GERALD paths match too)
+
+class RunEvent(ProcessEvent):  # pyinotify event handler that records pipeline stage progress from newly created paths
+
+ def __init__(self, conf_info):  # conf_info: object whose .status receives the update* calls (presumably a ConfigInfo — confirm)
+
+ self.run_status_dict = {'firecrest': False,  # each flag is set True when that stage's finished.txt is seen
+ 'bustard': False,
+ 'gerald': False}
+
+ self._ci = conf_info
+
+ ProcessEvent.__init__(self)
+
+
+ def process_IN_CREATE(self, event):  # Classify a created path by stage and forward the bare file name to the status object
+ fullpath = os.path.join(event.path, event.name)
+ if s_finished.search(fullpath):  # NOTE(review): s_finished is not defined in the visible part of this module — confirm it exists
+ logging.info("File Found: %s" % (fullpath))
+
+ if s_firecrest_finished.search(fullpath):
+ self.run_status_dict['firecrest'] = True
+ self._ci.status.updateFirecrest(event.name)  # passes event.name, not fullpath; requires conf_info.status to be set already
+ elif s_bustard_finished.search(fullpath):
+ self.run_status_dict['bustard'] = True
+ self._ci.status.updateBustard(event.name)
+ elif s_gerald_finished.search(fullpath):
+ self.run_status_dict['gerald'] = True
+ self._ci.status.updateGerald(event.name)
+
+ #WARNING: The following order is important!!
+ # Firecrest regex will catch all gerald, bustard, and firecrest
+ # Bustard regex will catch all gerald and bustard
+ # Gerald regex will catch all gerald
+ # So, order needs to be Gerald, Bustard, Firecrest, or this
+ # won't work properly.
+ elif s_gerald_all.search(fullpath):
+ self._ci.status.updateGerald(event.name)
+ elif s_bustard_all.search(fullpath):
+ self._ci.status.updateBustard(event.name)
+ elif s_firecrest_all.search(fullpath):
+ self._ci.status.updateFirecrest(event.name)
+
+ #print "Create: %s" % (os.path.join(event.path, event.name))
+
+ def process_IN_DELETE(self, event):  # Deletions are deliberately ignored
+ #print "Remove %s" % (os.path.join(event.path, event.name))
+ pass
+
+
+
+
+#FLAGS
+# Returned when the configure step detects an error
+RUN_ABORT = 'abort'
+# Returned when the run step detects an error
+RUN_FAILED = 'failed'
+
+
+#####################################
+# Configure Step (goat_pipeline.py)
+# Informational output
+s_start = re.compile('Starting Genome Analyzer Pipeline')
+s_gerald = re.compile("[\S\s]+--GERALD[\S\s]+--make[\S\s]+")
+s_generating = re.compile('^Generating journals, Makefiles')
+s_seq_folder = re.compile('^Sequence folder: ')
+# Text that precedes the Makefile path on the 'Sequence folder' line
+s_seq_folder_sub = re.compile('want to make ')
+s_stderr_taskcomplete = re.compile('^Task complete, exiting')
+
+# Error output
+s_invalid_cmdline = re.compile('Usage:[\S\s]*goat_pipeline.py')
+s_species_dir_err = re.compile('Error: Lane [1-8]:')
+s_goat_traceb = re.compile("^Traceback \(most recent call last\):")
+s_missing_cycles = re.compile('^Error: Tile s_[1-8]_[0-9]+: Different number of cycles: [0-9]+ instead of [0-9]+')
+
+# Set to True after the first missing-cycles error has been logged
+SUPPRESS_MISSING_CYCLES = False
+
+
+##Ignore - an example of the output is shown above each ignore regex.
+#NOTE: Commenting out an ignore will cause the matching lines to be
+# logged as DEBUG with the logging module.
+#CF_STDERR_IGNORE_LIST = []
+# Lines mentioning a tile name (s_<lane>_<tile>) are skipped without logging
+s_skip = re.compile('s_[0-8]_[0-9]+')
+
+
+##########################################
+# Pipeline Run Step (make -j8 recursive)
+
+##Info
+s_finished = re.compile('finished')
+
+##Errors
+s_make_error = re.compile('^make[\S\s]+Error')
+s_no_gnuplot = re.compile('gnuplot: command not found')
+s_no_convert = re.compile('^Can\'t exec "convert"')
+s_no_ghostscript = re.compile('gs: command not found')
+
+##Ignore - an example of the output is shown above each ignore regex.
+#NOTE: Commenting out an ignore will cause the matching lines to be
+# logged as DEBUG with the logging module.
+#
+PL_STDERR_IGNORE_LIST = []
+# Info: PF 11802
+PL_STDERR_IGNORE_LIST.append( re.compile('^Info: PF') )
+# About to analyse intensity file s_4_0101_sig2.txt
+PL_STDERR_IGNORE_LIST.append( re.compile('^About to analyse intensity file') )
+# Will send output to standard output
+PL_STDERR_IGNORE_LIST.append( re.compile('^Will send output to standard output') )
+# Found 31877 clusters
+PL_STDERR_IGNORE_LIST.append( re.compile('^Found [0-9]+ clusters') )
+# Will use quality criterion ((CHASTITY>=0.6)
+PL_STDERR_IGNORE_LIST.append( re.compile('^Will use quality criterion') )
+# Quality criterion translated to (($F[5]>=0.6))
+PL_STDERR_IGNORE_LIST.append( re.compile('^Quality criterion translated to') )
+# opened /woldlab/trog/data1/king/070924_USI-EAS44_0022_FC12150/Data/C1-36_Firecrest1.9.1_14-11-2007_king.4/Bustard1.9.1_14-11-2007_king/s_4_0101_qhg.txt
+# AND
+# opened s_4_0103_qhg.txt
+PL_STDERR_IGNORE_LIST.append( re.compile('^opened[\S\s]+qhg.txt') )
+# 81129 sequences out of 157651 passed filter criteria
+PL_STDERR_IGNORE_LIST.append( re.compile('^[0-9]+ sequences out of [0-9]+ passed filter criteria') )
+
+
+def pl_stderr_ignore(line):
+ """
+ Searches lines for lines to ignore (i.e. not to log)
+
+ returns True if line should be ignored
+ returns False if line should NOT be ignored
+ """
+ for s in PL_STDERR_IGNORE_LIST:
+ if s.search(line):
+ return True
+ return False
+
+
+def config_stdout_handler(line, conf_info):
+ """
+ Processes each line of output from GOAT
+ and stores useful information using the logging module
+
+ Loads useful information into conf_info as well, for future
+ use outside the function.
+
+ returns True if found condition that signifies success.
+ """
+
+ # Skip irrelevant line (without logging)
+ if s_skip.search(line):
+ pass
+
+ # Detect invalid command-line arguments
+ elif s_invalid_cmdline.search(line):
+ logging.error("Invalid commandline options!")
+
+ # Detect starting of configuration
+ elif s_start.search(line):
+ logging.info('START: Configuring pipeline')
+
+ # Detect it made it past invalid arguments
+ elif s_gerald.search(line):
+ logging.info('Running make now')
+
+ # Detect that make files have been generated (based on output)
+ elif s_generating.search(line):
+ logging.info('Make files generted')
+ return True
+
+ # Capture run directory
+ elif s_seq_folder.search(line):
+ mo = s_seq_folder_sub.search(line)
+ #Output changed when using --tiles=<tiles>
+ # at least in pipeline v0.3.0b2
+ if mo:
+ firecrest_bustard_gerald_makefile = line[mo.end():]
+ firecrest_bustard_gerald, junk = \
+ os.path.split(firecrest_bustard_gerald_makefile)
+ firecrest_bustard, junk = os.path.split(firecrest_bustard_gerald)
+ firecrest, junk = os.path.split(firecrest_bustard)
+
+ conf_info.bustard_path = firecrest_bustard
+ conf_info.run_path = firecrest
+
+ #Standard output handling
+ else:
+ print 'Sequence line:', line
+ mo = s_seq_folder.search(line)
+ conf_info.bustard_path = line[mo.end():]
+ conf_info.run_path, temp = os.path.split(conf_info.bustard_path)
+
+ # Log all other output for debugging purposes
+ else:
+ logging.warning('CONF:?: %s' % (line))
+
+ return False
+
+
+
+def config_stderr_handler(line, conf_info):
+ """
+ Processes one line of stderr from GOAT (the configure step)
+ and reports useful information using the logging module
+
+ conf_info is accepted so the signature matches the other handlers;
+ this function does not read or modify it.
+
+ returns RUN_ABORT upon detecting failure;
+ True on success message;
+ False if neutral message
+ (i.e. doesn't signify failure or success)
+ """
+ # Module-level flag so the missing-cycles error is only reported once
+ global SUPPRESS_MISSING_CYCLES
+
+ # Detect invalid species directory error
+ if s_species_dir_err.search(line):
+ logging.error(line)
+ return RUN_ABORT
+ # Detect goat_pipeline.py traceback
+ elif s_goat_traceb.search(line):
+ logging.error("Goat config script died, traceback in debug output")
+ return RUN_ABORT
+ # Detect indication of successful configuration (from stderr; odd, but ok)
+ elif s_stderr_taskcomplete.search(line):
+ logging.info('Configure step successful (from: stderr)')
+ return True
+ # Detect missing cycles
+ elif s_missing_cycles.search(line):
+
+ # Only display error once
+ # NOTE(review): the indentation is not recoverable from this view;
+ # confirm which of the statements below sit inside this 'if'
+ # (in particular whether RUN_ABORT is returned for every match).
+ if not SUPPRESS_MISSING_CYCLES:
+ logging.error("Missing cycles detected; Not all cycles copied?")
+ logging.debug("CONF:STDERR:MISSING_CYCLES: %s" % (line))
+ SUPPRESS_MISSING_CYCLES = True
+ return RUN_ABORT
+
+ # Log all other output as debug output
+ else:
+ logging.debug('CONF:STDERR:?: %s' % (line))
+
+ # Neutral (not failure; nor success)
+ return False
+
+
+#def pipeline_stdout_handler(line, conf_info):
+# """
+# Processes each line of output from running the pipeline
+# and stores useful information using the logging module
+#
+# Loads useful information into conf_info as well, for future
+# use outside the function.
+#
+# returns True if found condition that signifies success.
+# """
+#
+# #f.write(line + '\n')
+#
+# return True
+
+
+
+def pipeline_stderr_handler(line, conf_info):
+ """
+ Processes each line of stderr from pipelien run
+ and stores useful information using the logging module
+
+ ##FIXME: Future feature (doesn't actually do this yet)
+ #Loads useful information into conf_info as well, for future
+ #use outside the function.
+
+ returns RUN_FAILED upon detecting failure;
+ #True on success message; (no clear success state)
+ False if neutral message
+ (i.e. doesn't signify failure or success)
+ """
+
+ if pl_stderr_ignore(line):
+ pass
+ elif s_make_error.search(line):
+ logging.error("make error detected; run failed")
+ return RUN_FAILED
+ elif s_no_gnuplot.search(line):
+ logging.error("gnuplot not found")
+ return RUN_FAILED
+ elif s_no_convert.search(line):
+ logging.error("imagemagick's convert command not found")
+ return RUN_FAILED
+ elif s_no_ghostscript.search(line):
+ logging.error("ghostscript not found")
+ return RUN_FAILED
+ else:
+ logging.debug('PIPE:STDERR:?: %s' % (line))
+
+ return False
+
+
+def retrieve_config(conf_info, flowcell, cfg_filepath, genome_dir):
+ """
+ Gets the config file from server...
+ requires config file in:
+ /etc/ga_frontend/ga_frontend.conf
+ or
+ ~/.ga_frontend.conf
+
+ with:
+ [config_file_server]
+ base_host_url: http://host:port
+
+ return True if successful, False is failure
+ """
+ options = getCombinedOptions()
+
+ if options.url is None:
+ logging.error("~/.ga_frontend.conf or /etc/ga_frontend/ga_frontend.conf" \
+ " missing base_host_url option")
+ return False
+
+ try:
+ saveConfigFile(flowcell, options.url, cfg_filepath)
+ conf_info.config_filepath = cfg_filepath
+ except FlowCellNotFound, e:
+ logging.error(e)
+ return False
+ except WebError404, e:
+ logging.error(e)
+ return False
+ except IOError, e:
+ logging.error(e)
+ return False
+ except Exception, e:
+ logging.error(e)
+ return False
+
+ f = open(cfg_filepath, 'r')
+ data = f.read()
+ f.close()
+
+ genome_dict = getAvailableGenomes(genome_dir)
+ mapper_dict = constructMapperDict(genome_dict)
+
+ logging.debug(data)
+
+ f = open(cfg_filepath, 'w')
+ f.write(data % (mapper_dict))
+ f.close()
+
+ return True
+
+
+
+def configure(conf_info):
+ """
+ Attempts to configure the GA pipeline using goat.
+
+ Runs goat_pipeline.py on conf_info.analysis_dir with
+ conf_info.config_filepath as the GERALD config, capturing stdout and
+ stderr in pipeline_configure_stdout.txt / pipeline_configure_stderr.txt
+ inside conf_info.analysis_dir, then scans both files line by line.
+
+ Uses logging module to store information about status.
+
+ returns True if configuration successful, otherwise False.
+ """
+ #ERROR Test:
+ #pipe = subprocess.Popen(['goat_pipeline.py',
+ # '--GERALD=config32bk.txt',
+ # '--make .',],
+ # #'.'],
+ # stdout=subprocess.PIPE,
+ # stderr=subprocess.PIPE)
+
+ #ERROR Test (2), causes goat_pipeline.py traceback
+ #pipe = subprocess.Popen(['goat_pipeline.py',
+ # '--GERALD=%s' % (conf_info.config_filepath),
+ # '--tiles=s_4_100,s_4_101,s_4_102,s_4_103,s_4_104',
+ # '--make',
+ # '.'],
+ # stdout=subprocess.PIPE,
+ # stderr=subprocess.PIPE)
+
+ ##########################
+ # Run configuration step
+ # Not a test; actual configure attempt.
+ #pipe = subprocess.Popen(['goat_pipeline.py',
+ # '--GERALD=%s' % (conf_info.config_filepath),
+ # '--make',
+ # '.'],
+ # stdout=subprocess.PIPE,
+ # stderr=subprocess.PIPE)
+
+
+ # The output of the configure step is captured in files and parsed
+ # after the process has exited.
+ stdout_filepath = os.path.join(conf_info.analysis_dir,
+ "pipeline_configure_stdout.txt")
+ stderr_filepath = os.path.join(conf_info.analysis_dir,
+ "pipeline_configure_stderr.txt")
+
+ fout = open(stdout_filepath, 'w')
+ ferr = open(stderr_filepath, 'w')
+
+ pipe = subprocess.Popen(['goat_pipeline.py',
+ '--GERALD=%s' % (conf_info.config_filepath),
+ #'--tiles=s_4_0100,s_4_0101,s_4_0102,s_4_0103,s_4_0104',
+ '--make',
+ conf_info.analysis_dir],
+ stdout=fout,
+ stderr=ferr)
+
+ print "Configuring pipeline: %s" % (time.ctime())
+ error_code = pipe.wait()
+
+ # Clean up
+ fout.close()
+ ferr.close()
+
+
+ ##################
+ # Process stdout
+ fout = open(stdout_filepath, 'r')
+
+ stdout_line = fout.readline()
+
+ # 'complete' becomes True once any stdout line signals success
+ complete = False
+ while stdout_line != '':
+ # Handle stdout
+ if config_stdout_handler(stdout_line, conf_info):
+ complete = True
+ stdout_line = fout.readline()
+
+ fout.close()
+
+
+ #error_code = pipe.wait()
+ if error_code:
+ logging.error('Recieved error_code: %s' % (error_code))
+ else:
+ logging.info('We are go for launch!')
+
+ #Process stderr
+ ferr = open(stderr_filepath, 'r')
+ stderr_line = ferr.readline()
+
+ # 'abort' holds a placeholder until a stderr line reports RUN_ABORT
+ abort = 'NO!'
+ stderr_success = False
+ while stderr_line != '':
+ stderr_status = config_stderr_handler(stderr_line, conf_info)
+ if stderr_status == RUN_ABORT:
+ abort = RUN_ABORT
+ elif stderr_status is True:
+ stderr_success = True
+ stderr_line = ferr.readline()
+
+ ferr.close()
+
+
+ #Success requirements:
+ # 1) A stdout line signalled that the Makefiles were generated
+ # 2) The program exited with status 0
+ # 3) No abort condition found in stderr
+ # 4) A stderr line signalled that the task completed
+ print '#Expect: True, False, True, True'
+ print complete, bool(error_code), abort != RUN_ABORT, stderr_success is True
+ status = complete is True and \
+ bool(error_code) is False and \
+ abort != RUN_ABORT and \
+ stderr_success is True
+
+ # If everything was successful, but for some reason
+ # we didn't retrieve the path info, log it.
+ if status is True:
+ if conf_info.bustard_path is None or conf_info.run_path is None:
+ logging.error("Failed to retrieve run_path")
+ return False
+
+ return status
+
+
+def run_pipeline(conf_info):
+ """
+ Run the pipeline and monitor status.
+
+ Starts 'make -j8 recursive' in conf_info.run_path, watches the run
+ directory with inotify while make is running, and afterwards scans
+ the captured stderr for known failures.
+
+ returns True only if make exited with status 0, no failure was found
+ in stderr, and finished.txt was seen for firecrest, bustard and gerald;
+ otherwise False.
+ """
+ # Fail if the run_path doesn't actually exist
+ if not os.path.exists(conf_info.run_path):
+ logging.error('Run path does not exist: %s' \
+ % (conf_info.run_path))
+ return False
+
+ # Files that capture stdout/stderr of the make run. The working
+ # directory is not changed; make is pointed at run_path with
+ # --directory below.
+ stdout_filepath = os.path.join(conf_info.analysis_dir, 'pipeline_run_stdout.txt')
+ stderr_filepath = os.path.join(conf_info.analysis_dir, 'pipeline_run_stderr.txt')
+
+ # Create status object
+ # NOTE(review): the return value is ignored; if config_filepath is unset
+ # conf_info.status stays None and the RunEvent handlers, which call
+ # methods on it, would fail. Confirm callers always set it first.
+ conf_info.createStatusObject()
+
+ # Monitor file creation
+ wm = WatchManager()
+ mask = EventsCodes.IN_DELETE | EventsCodes.IN_CREATE
+ event = RunEvent(conf_info)
+ notifier = ThreadedNotifier(wm, event)
+ notifier.start()
+ wdd = wm.add_watch(conf_info.run_path, mask, rec=True)
+
+ # Log pipeline starting
+ logging.info('STARTING PIPELINE @ %s' % (time.ctime()))
+
+ # Start the pipeline (and hide!)
+ #pipe = subprocess.Popen(['make',
+ # '-j8',
+ # 'recursive'],
+ # stdout=subprocess.PIPE,
+ # stderr=subprocess.PIPE)
+
+ fout = open(stdout_filepath, 'w')
+ ferr = open(stderr_filepath, 'w')
+
+ pipe = subprocess.Popen(['make',
+ '--directory=%s' % (conf_info.run_path),
+ '-j8',
+ 'recursive'],
+ stdout=fout,
+ stderr=ferr)
+ #shell=True)
+ # Wait for run to finish
+ retcode = pipe.wait()
+
+
+ # Clean up
+ notifier.stop()
+ fout.close()
+ ferr.close()
+
+ # Process stderr
+ ferr = open(stderr_filepath, 'r')
+
+ # Becomes True if any stderr line reports RUN_FAILED
+ run_failed_stderr = False
+ for line in ferr:
+ err_status = pipeline_stderr_handler(line, conf_info)
+ if err_status == RUN_FAILED:
+ run_failed_stderr = True
+
+ ferr.close()
+
+ # Finished file check!
+ print 'RUN SUCCESS CHECK:'
+ for key, value in event.run_status_dict.items():
+ print ' %s: %s' % (key, value)
+
+ dstatus = event.run_status_dict
+
+ # Success or failure check
+ status = (retcode == 0) and \
+ run_failed_stderr is False and \
+ dstatus['firecrest'] is True and \
+ dstatus['bustard'] is True and \
+ dstatus['gerald'] is True
+
+ return status
+
+
--- /dev/null
+"""
+Extract information about the Firecrest run
+
+Firecrest - class holding the properties we found
+firecrest - Firecrest factory function initialized from a directory name
+fromxml - Firecrest factory function initialized from an xml dump from
+ the Firecrest object.
+"""
+
+from datetime import date
+import os
+import re
+import time
+
+from htsworkflow.pipelines.runfolder import \
+ ElementTree, \
+ VERSION_RE, \
+ EUROPEAN_STRPTIME
+
+class Firecrest(object):
+ XML_VERSION=1
+
+ # xml tag names
+ FIRECREST = 'Firecrest'
+ SOFTWARE_VERSION = 'version'
+ START = 'FirstCycle'
+ STOP = 'LastCycle'
+ DATE = 'run_time'
+ USER = 'user'
+ MATRIX = 'matrix'
+
+ def __init__(self, xml=None):
+ self.start = None
+ self.stop = None
+ self.version = None
+ self.date = date.today()
+ self.user = None
+ self.matrix = None
+
+ if xml is not None:
+ self.set_elements(xml)
+
+ def _get_time(self):
+ return time.mktime(self.date.timetuple())
+ time = property(_get_time, doc='return run time as seconds since epoch')
+
+ def dump(self):
+ print "Starting cycle:", self.start
+ print "Ending cycle:", self.stop
+ print "Firecrest version:", self.version
+ print "Run date:", self.date
+ print "user:", self.user
+
+ def get_elements(self):
+ attribs = {'version': str(Firecrest.XML_VERSION) }
+ root = ElementTree.Element(Firecrest.FIRECREST, attrib=attribs)
+ version = ElementTree.SubElement(root, Firecrest.SOFTWARE_VERSION)
+ version.text = self.version
+ start_cycle = ElementTree.SubElement(root, Firecrest.START)
+ start_cycle.text = str(self.start)
+ stop_cycle = ElementTree.SubElement(root, Firecrest.STOP)
+ stop_cycle.text = str(self.stop)
+ run_date = ElementTree.SubElement(root, Firecrest.DATE)
+ run_date.text = str(self.time)
+ user = ElementTree.SubElement(root, Firecrest.USER)
+ user.text = self.user
+ matrix = ElementTree.SubElement(root, Firecrest.MATRIX)
+ matrix.text = self.matrix
+ return root
+
+ def set_elements(self, tree):
+ if tree.tag != Firecrest.FIRECREST:
+ raise ValueError('Expected "Firecrest" SubElements')
+ xml_version = int(tree.attrib.get('version', 0))
+ if xml_version > Firecrest.XML_VERSION:
+ logging.warn('Firecrest XML tree is a higher version than this class')
+ for element in list(tree):
+ if element.tag == Firecrest.SOFTWARE_VERSION:
+ self.version = element.text
+ elif element.tag == Firecrest.START:
+ self.start = int(element.text)
+ elif element.tag == Firecrest.STOP:
+ self.stop = int(element.text)
+ elif element.tag == Firecrest.DATE:
+ self.date = date.fromtimestamp(float(element.text))
+ elif element.tag == Firecrest.USER:
+ self.user = element.text
+ elif element.tag == Firecrest.MATRIX:
+ self.matrix = element.text
+ else:
+ raise ValueError("Unrecognized tag: %s" % (element.tag,))
+
+def firecrest(pathname):
+ """
+ Examine the directory at pathname and initalize a Firecrest object
+ """
+ f = Firecrest()
+
+ # parse firecrest directory name
+ path, name = os.path.split(pathname)
+ groups = name.split('_')
+ # grab the start/stop cycle information
+ cycle = re.match("C([0-9]+)-([0-9]+)", groups[0])
+ f.start = int(cycle.group(1))
+ f.stop = int(cycle.group(2))
+ # firecrest version
+ version = re.search(VERSION_RE, groups[1])
+ f.version = (version.group(1))
+ # datetime
+ t = time.strptime(groups[2], EUROPEAN_STRPTIME)
+ f.date = date(*t[0:3])
+ # username
+ f.user = groups[3]
+
+ # should I parse this deeper than just stashing the
+ # contents of the matrix file?
+ matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
+ f.matrix = open(matrix_pathname, 'r').read()
+ return f
+
+def fromxml(tree):
+ """
+ Initialize a Firecrest object from an element tree node
+ """
+ f = Firecrest()
+ f.set_elements(tree)
+ return f
--- /dev/null
+#!/usr/bin/python
+import glob
+import sys
+import os
+import re
+
+import logging
+
+from htsworkflow.util.alphanum import alphanum
+
+class DuplicateGenome(Exception): pass
+
+
+def _has_metainfo(genome_dir):
+ metapath = os.path.join(genome_dir, '_metainfo_')
+ if os.path.isfile(metapath):
+ return True
+ else:
+ return False
+
+def getAvailableGenomes(genome_base_dir):
+ """
+ raises IOError (on genome_base_dir not found)
+ raises DuplicateGenome on duplicate genomes found.
+
+ returns a double dictionary (i.e. d[species][build] = path)
+ """
+
+ # Need valid directory
+ if not os.path.exists(genome_base_dir):
+ msg = "Directory does not exist: %s" % (genome_base_dir)
+ raise IOError, msg
+
+ # Find all subdirectories
+ filepath_list = glob.glob(os.path.join(genome_base_dir, '*'))
+ potential_genome_dirs = \
+ [ filepath for filepath in filepath_list if os.path.isdir(filepath)]
+
+ # Get list of metadata files
+ genome_dir_list = \
+ [ dirpath \
+ for dirpath in potential_genome_dirs \
+ if _has_metainfo(dirpath) ]
+
+ # Genome double dictionary
+ d = {}
+
+ for genome_dir in genome_dir_list:
+ line = open(os.path.join(genome_dir, '_metainfo_'), 'r').readline().strip()
+
+ # Get species, build... log and skip on failure
+ try:
+ species, build = line.split('|')
+ except:
+ logging.warning('Skipping: Invalid metafile (%s) line: %s' \
+ % (metafile, line))
+ continue
+
+ build_dict = d.setdefault(species, {})
+ if build in build_dict:
+ msg = "Duplicate genome for %s|%s" % (species, build)
+ raise DuplicateGenome, msg
+
+ build_dict[build] = genome_dir
+
+ return d
+
+
+class constructMapperDict(object):
+ """
+ Emulate a dictionary to map genome|build names to paths.
+
+ It uses the dictionary generated by getAvailableGenomes.
+ """
+ def __init__(self, genome_dict):
+ self.genome_dict = genome_dict
+
+ def __getitem__(self, key):
+ """
+ Return the best match for key
+ """
+ elements = re.split("\|", key)
+
+ if len(elements) == 1:
+ # we just the species name
+ # get the set of builds
+ builds = self.genome_dict[elements[0]]
+
+ # sort build names the way humans would
+ keys = builds.keys()
+ keys.sort(cmp=alphanum)
+
+ # return the path from the 'last' build name
+ return builds[keys[-1]]
+
+ elif len(elements) == 2:
+ # we have species, and build name
+ return self.genome_dict[elements[0]][elements[1]]
+ else:
+ raise KeyError("Unrecognized key")
+
+ def keys(self):
+ keys = []
+ for species in self.genome_dict.keys():
+ for build in self.genome_dict[species]:
+ keys.append([species+'|'+build])
+ return keys
+
+ def values(self):
+ values = []
+ for species in self.genome_dict.keys():
+ for build in self.genome_dict[species]:
+ values.append(self.genome_dict[species][build])
+ return values
+
+ def items(self):
+ items = []
+ for species in self.genome_dict.keys():
+ for build in self.genome_dict[species]:
+ key = [species+'|'+build]
+ value = self.genome_dict[species][build]
+ items.append((key, value))
+ return items
+
+if __name__ == '__main__':
+
+ if len(sys.argv) != 2:
+ print 'useage: %s <base_genome_dir>' % (sys.argv[0])
+ sys.exit(1)
+
+ d = getAvailableGenomes(sys.argv[1])
+ d2 = constructMapperDict(d)
+
+ for k,v in d2.items():
+ print '%s: %s' % (k,v)
+
+
--- /dev/null
+"""
+Provide access to information stored in the GERALD directory.
+"""
+from datetime import datetime, date
+from glob import glob
+import logging
+import os
+import stat
+import time
+import types
+
+from htsworkflow.pipelines.runfolder import \
+ ElementTree, \
+ EUROPEAN_STRPTIME, \
+ LANES_PER_FLOWCELL, \
+ VERSION_RE
+from htsworkflow.util.ethelp import indent, flatten
+from htsworkflow.util.opener import autoopen
+
+class Gerald(object):
+ """
+ Capture meaning out of the GERALD directory
+
+ Attributes set by this class: pathname, tree (root element of the
+ parsed config.xml), lanes (per-lane view of the run parameters),
+ summary and eland_results.
+ """
+ XML_VERSION = 1
+ GERALD='Gerald'
+ RUN_PARAMETERS='RunParameters'
+ SUMMARY='Summary'
+
+ class LaneParameters(object):
+ """
+ Make it easy to access elements of LaneSpecificRunParameters from python
+ """
+ def __init__(self, gerald, key):
+ # gerald is the owning Gerald object, key the lane id to look up
+ self._gerald = gerald
+ self._key = key
+
+ def __get_attribute(self, xml_tag):
+ """
+ Return the text stored for this lane under xml_tag inside
+ LaneSpecificRunParameters, or None if xml_tag is absent.
+
+ raises RuntimeError if there are more entries than lanes
+ """
+ subtree = self._gerald.tree.find('LaneSpecificRunParameters')
+ container = subtree.find(xml_tag)
+ if container is None:
+ return None
+ if len(container.getchildren()) > LANES_PER_FLOWCELL:
+ raise RuntimeError('GERALD config.xml file changed')
+ # the lane id is the part of each child tag after the first '_'
+ lanes = [x.tag.split('_')[1] for x in container.getchildren()]
+ index = lanes.index(self._key)
+ element = container[index]
+ return element.text
+ def _get_analysis(self):
+ return self.__get_attribute('ANALYSIS')
+ analysis = property(_get_analysis)
+
+ def _get_eland_genome(self):
+ genome = self.__get_attribute('ELAND_GENOME')
+ # default to the chipwide parameters if there isn't an
+ # entry in the lane specific paramaters
+ if genome is None:
+ subtree = self._gerald.tree.find('ChipWideRunParameters')
+ container = subtree.find('ELAND_GENOME')
+ genome = container.text
+ return genome
+ eland_genome = property(_get_eland_genome)
+
+ def _get_read_length(self):
+ return self.__get_attribute('READ_LENGTH')
+ read_length = property(_get_read_length)
+
+ def _get_use_bases(self):
+ return self.__get_attribute('USE_BASES')
+ use_bases = property(_get_use_bases)
+
+ class LaneSpecificRunParameters(object):
+ """
+ Provide access to LaneSpecificRunParameters
+
+ Behaves like a read-only mapping from lane id to LaneParameters.
+ """
+ def __init__(self, gerald):
+ self._gerald = gerald
+ # lane ids are read from the tree on first use and then cached
+ self._keys = None
+ def __getitem__(self, key):
+ return Gerald.LaneParameters(self._gerald, key)
+ def keys(self):
+ if self._keys is None:
+ tree = self._gerald.tree
+ analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
+ # according to the pipeline specs I think their fields
+ # are sampleName_laneID, with sampleName defaulting to s
+ # since laneIDs are constant lets just try using
+ # those consistently.
+ self._keys = [ x.tag.split('_')[1] for x in analysis]
+ return self._keys
+ def values(self):
+ return [ self[x] for x in self.keys() ]
+ def items(self):
+ return zip(self.keys(), self.values())
+ def __len__(self):
+ return len(self.keys())
+
+ def __init__(self, xml=None):
+ """
+ xml -- optional element produced by get_elements() to load from
+ """
+ self.pathname = None
+ self.tree = None
+
+ # parse lane parameters out of the config.xml file
+ self.lanes = Gerald.LaneSpecificRunParameters(self)
+
+ self.summary = None
+ self.eland_results = None
+
+ if xml is not None:
+ self.set_elements(xml)
+
+ def _get_date(self):
+ # without a parsed config.xml there is no timestamp to report
+ if self.tree is None:
+ return datetime.today()
+ timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
+ # '%c' is the locale's date and time representation
+ epochstamp = time.mktime(time.strptime(timestamp, '%c'))
+ return datetime.fromtimestamp(epochstamp)
+ date = property(_get_date)
+
+ def _get_time(self):
+ return time.mktime(self.date.timetuple())
+ time = property(_get_time, doc='return run time as seconds since epoch')
+
+ def _get_version(self):
+ if self.tree is None:
+ return None
+ return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION')
+ version = property(_get_version)
+
+ def dump(self):
+ """
+ Debugging function, report current object
+ """
+ print 'Gerald version:', self.version
+ print 'Gerald run date:', self.date
+ print 'Gerald config.xml:', self.tree
+ self.summary.dump()
+
+ def get_elements(self):
+ """
+ Return a 'Gerald' element wrapping the config tree, the summary
+ and (when present) the eland results; None if the tree or the
+ summary has not been loaded.
+ """
+ if self.tree is None or self.summary is None:
+ return None
+
+ gerald = ElementTree.Element(Gerald.GERALD,
+ {'version': unicode(Gerald.XML_VERSION)})
+ gerald.append(self.tree)
+ gerald.append(self.summary.get_elements())
+ if self.eland_results:
+ gerald.append(self.eland_results.get_elements())
+ return gerald
+
+ def set_elements(self, tree):
+ """
+ Load this object from an element produced by get_elements().
+
+ raises ValueError if tree is not a Gerald element. Unrecognized
+ child tags and newer XML versions are only logged.
+ """
+ if tree.tag != Gerald.GERALD:
+ raise ValueError('exptected GERALD')
+ xml_version = int(tree.attrib.get('version', 0))
+ if xml_version > Gerald.XML_VERSION:
+ logging.warn('XML tree is a higher version than this class')
+ for element in list(tree):
+ # child tags are compared case-insensitively
+ tag = element.tag.lower()
+ if tag == Gerald.RUN_PARAMETERS.lower():
+ self.tree = element
+ elif tag == Gerald.SUMMARY.lower():
+ self.summary = Summary(xml=element)
+ # NOTE(review): ELAND is not defined in the part of the module
+ # visible here; confirm it is defined or imported elsewhere.
+ elif tag == ELAND.ELAND.lower():
+ self.eland_results = ELAND(xml=element)
+ else:
+ logging.warn("Unrecognized tag %s" % (element.tag,))
+
+
+def gerald(pathname):
+ g = Gerald()
+ g.pathname = pathname
+ path, name = os.path.split(pathname)
+ config_pathname = os.path.join(pathname, 'config.xml')
+ g.tree = ElementTree.parse(config_pathname).getroot()
+
+ # parse Summary.htm file
+ summary_pathname = os.path.join(pathname, 'Summary.htm')
+ g.summary = Summary(summary_pathname)
+ # parse eland files
+ g.eland_results = eland(g.pathname, g)
+ return g
+
+def tonumber(v):
+ """
+ Convert a value to int if its an int otherwise a float.
+ """
+ try:
+ v = int(v)
+ except ValueError, e:
+ v = float(v)
+ return v
+
+def parse_mean_range(value):
+ """
+ Parse values like 123 +/- 4.5
+ """
+ if value.strip() == 'unknown':
+ return 0, 0
+
+ average, pm, deviation = value.split()
+ if pm != '+/-':
+ raise RuntimeError("Summary.htm file format changed")
+ return tonumber(average), tonumber(deviation)
+
+def make_mean_range_element(parent, name, mean, deviation):
+ """
+ Make an ElementTree subelement <Name mean='mean', deviation='deviation'/>
+ """
+ element = ElementTree.SubElement(parent, name,
+ { 'mean': unicode(mean),
+ 'deviation': unicode(deviation)})
+ return element
+
+def parse_mean_range_element(element):
+ """
+ Grab mean/deviation out of element
+ """
+ return (tonumber(element.attrib['mean']),
+ tonumber(element.attrib['deviation']))
+
+def parse_summary_element(element):
+ """
+ Determine if we have a simple element or a mean/deviation element
+ """
+ if len(element.attrib) > 0:
+ return parse_mean_range_element(element)
+ else:
+ return element.text
+
+class Summary(object):
+ """
+ Extract some useful information from the Summary.htm file
+ """
+ XML_VERSION = 2
+ SUMMARY = 'Summary'
+
+ class LaneResultSummary(object):
+ """
+ Parse the LaneResultSummary table out of Summary.htm
+ Mostly for the cluster number
+ """
+ LANE_RESULT_SUMMARY = 'LaneResultSummary'
+ TAGS = {
+ 'LaneYield': 'lane_yield',
+ 'Cluster': 'cluster', # Raw
+ 'ClusterPF': 'cluster_pass_filter',
+ 'AverageFirstCycleIntensity': 'average_first_cycle_intensity',
+ 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles',
+ 'PercentPassFilterClusters': 'percent_pass_filter_clusters',
+ 'PercentPassFilterAlign': 'percent_pass_filter_align',
+ 'AverageAlignmentScore': 'average_alignment_score',
+ 'PercentErrorRate': 'percent_error_rate'
+ }
+
+ def __init__(self, html=None, xml=None):
+ self.lane = None
+ self.lane_yield = None
+ self.cluster = None
+ self.cluster_pass_filter = None
+ self.average_first_cycle_intensity = None
+ self.percent_intensity_after_20_cycles = None
+ self.percent_pass_filter_clusters = None
+ self.percent_pass_filter_align = None
+ self.average_alignment_score = None
+ self.percent_error_rate = None
+
+ if html is not None:
+ self.set_elements_from_html(html)
+ if xml is not None:
+ self.set_elements(xml)
+
+ def set_elements_from_html(self, data):
+ if not len(data) in (8,10):
+ raise RuntimeError("Summary.htm file format changed")
+
+ # same in pre-0.3.0 Summary file and 0.3 summary file
+ self.lane = data[0]
+
+ if len(data) == 8:
+ parsed_data = [ parse_mean_range(x) for x in data[1:] ]
+ # this is the < 0.3 Pipeline version
+ self.cluster = parsed_data[0]
+ self.average_first_cycle_intensity = parsed_data[1]
+ self.percent_intensity_after_20_cycles = parsed_data[2]
+ self.percent_pass_filter_clusters = parsed_data[3]
+ self.percent_pass_filter_align = parsed_data[4]
+ self.average_alignment_score = parsed_data[5]
+ self.percent_error_rate = parsed_data[6]
+ elif len(data) == 10:
+ parsed_data = [ parse_mean_range(x) for x in data[2:] ]
+ # this is the >= 0.3 summary file
+ self.lane_yield = data[1]
+ self.cluster = parsed_data[0]
+ self.cluster_pass_filter = parsed_data[1]
+ self.average_first_cycle_intensity = parsed_data[2]
+ self.percent_intensity_after_20_cycles = parsed_data[3]
+ self.percent_pass_filter_clusters = parsed_data[4]
+ self.percent_pass_filter_align = parsed_data[5]
+ self.average_alignment_score = parsed_data[6]
+ self.percent_error_rate = parsed_data[7]
+
+ def get_elements(self):
+ lane_result = ElementTree.Element(
+ Summary.LaneResultSummary.LANE_RESULT_SUMMARY,
+ {'lane': self.lane})
+ for tag, variable_name in Summary.LaneResultSummary.TAGS.items():
+ value = getattr(self, variable_name)
+ if value is None:
+ continue
+ # it looks like a sequence
+ elif type(value) in (types.TupleType, types.ListType):
+ element = make_mean_range_element(
+ lane_result,
+ tag,
+ *value
+ )
+ else:
+ element = ElementTree.SubElement(lane_result, tag)
+ element.text = value
+ return lane_result
+
+ def set_elements(self, tree):
+ if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY:
+ raise ValueError('Expected %s' % (
+ Summary.LaneResultSummary.LANE_RESULT_SUMMARY))
+ self.lane = tree.attrib['lane']
+ tags = Summary.LaneResultSummary.TAGS
+ for element in list(tree):
+ try:
+ variable_name = tags[element.tag]
+ setattr(self, variable_name,
+ parse_summary_element(element))
+ except KeyError, e:
+ logging.warn('Unrecognized tag %s' % (element.tag,))
+
+ def __init__(self, filename=None, xml=None):
+ self.lane_results = {}
+
+ if filename is not None:
+ self._extract_lane_results(filename)
+ if xml is not None:
+ self.set_elements(xml)
+
+ def __getitem__(self, key):
+ return self.lane_results[key]
+
+ def __len__(self):
+ return len(self.lane_results)
+
+ def keys(self):
+ return self.lane_results.keys()
+
+ def values(self):
+ return self.lane_results.values()
+
+ def items(self):
+ return self.lane_results.items()
+
+ def _flattened_row(self, row):
+ """
+ flatten the children of a <tr>...</tr>
+ """
+ return [flatten(x) for x in row.getchildren() ]
+
+ def _parse_table(self, table):
+ """
+ assumes the first line is the header of a table,
+ and that the remaining rows are data
+ """
+ rows = table.getchildren()
+ data = []
+ for r in rows:
+ data.append(self._flattened_row(r))
+ return data
+
+ def _extract_named_tables(self, pathname):
+ """
+ extract all the 'named' tables from a Summary.htm file
+ and return as a dictionary
+
+ Named tables are <h2>...</h2><table>...</table> pairs
+ The contents of the h2 tag is considered to the name
+ of the table.
+ """
+ tree = ElementTree.parse(pathname).getroot()
+ body = tree.find('body')
+ tables = {}
+ for i in range(len(body)):
+ if body[i].tag == 'h2' and body[i+1].tag == 'table':
+ # we have an interesting table
+ name = flatten(body[i])
+ table = body[i+1]
+ data = self._parse_table(table)
+ tables[name] = data
+ return tables
+
+ def _extract_lane_results(self, pathname):
+ """
+ extract the Lane Results Summary table
+ """
+
+ tables = self._extract_named_tables(pathname)
+
+ # parse lane result summary
+ lane_summary = tables['Lane Results Summary']
+ # this is version 1 of the summary file
+ if len(lane_summary[-1]) == 8:
+ # strip header
+ headers = lane_summary[0]
+ # grab the lane by lane data
+ lane_summary = lane_summary[1:]
+
+ # this is version 2 of the summary file
+ if len(lane_summary[-1]) == 10:
+ # lane_summary[0] is a different less specific header row
+ headers = lane_summary[1]
+ lane_summary = lane_summary[2:10]
+ # after the last lane, there's a set of chip wide averages
+
+ for r in lane_summary:
+ lrs = Summary.LaneResultSummary(html=r)
+ self.lane_results[lrs.lane] = lrs
+
+ def get_elements(self):
+ summary = ElementTree.Element(Summary.SUMMARY,
+ {'version': unicode(Summary.XML_VERSION)})
+ for lane in self.lane_results.values():
+ summary.append(lane.get_elements())
+ return summary
+
+ def set_elements(self, tree):
+ if tree.tag != Summary.SUMMARY:
+ return ValueError("Expected %s" % (Summary.SUMMARY,))
+ xml_version = int(tree.attrib.get('version', 0))
+ if xml_version > Summary.XML_VERSION:
+ logging.warn('Summary XML tree is a higher version than this class')
+ for element in list(tree):
+ lrs = Summary.LaneResultSummary()
+ lrs.set_elements(element)
+ self.lane_results[lrs.lane] = lrs
+
+ def dump(self):
+ """
+ Debugging function, report current object
+ """
+ pass
+
+
+def build_genome_fasta_map(genome_dir):
+ # build fasta to fasta file map
+ genome = genome_dir.split(os.path.sep)[-1]
+ fasta_map = {}
+ for vld_file in glob(os.path.join(genome_dir, '*.vld')):
+ is_link = False
+ if os.path.islink(vld_file):
+ is_link = True
+ vld_file = os.path.realpath(vld_file)
+ path, vld_name = os.path.split(vld_file)
+ name, ext = os.path.splitext(vld_name)
+ if is_link:
+ fasta_map[name] = name
+ else:
+ fasta_map[name] = os.path.join(genome, name)
+ return fasta_map
+
+class ElandLane(object):
+ """
+ Process an eland result file
+ """
+ XML_VERSION = 1
+ LANE = 'ElandLane'
+ SAMPLE_NAME = 'SampleName'
+ LANE_ID = 'LaneID'
+ GENOME_MAP = 'GenomeMap'
+ GENOME_ITEM = 'GenomeItem'
+ MAPPED_READS = 'MappedReads'
+ MAPPED_ITEM = 'MappedItem'
+ MATCH_CODES = 'MatchCodes'
+ MATCH_ITEM = 'Code'
+ READS = 'Reads'
+
+ def __init__(self, pathname=None, genome_map=None, xml=None):
+ self.pathname = pathname
+ self._sample_name = None
+ self._lane_id = None
+ self._reads = None
+ self._mapped_reads = None
+ self._match_codes = None
+ if genome_map is None:
+ genome_map = {}
+ self.genome_map = genome_map
+
+ if xml is not None:
+ self.set_elements(xml)
+
+ def _update(self):
+ """
+ Actually read the file and actually count the reads
+ """
+ # can't do anything if we don't have a file to process
+ if self.pathname is None:
+ return
+
+ if os.stat(self.pathname)[stat.ST_SIZE] == 0:
+ raise RuntimeError("Eland isn't done, try again later.")
+
+ reads = 0
+ mapped_reads = {}
+
+ match_codes = {'NM':0, 'QC':0, 'RM':0,
+ 'U0':0, 'U1':0, 'U2':0,
+ 'R0':0, 'R1':0, 'R2':0,
+ }
+ for line in autoopen(self.pathname,'r'):
+ reads += 1
+ fields = line.split()
+ # code = fields[2]
+ # match_codes[code] = match_codes.setdefault(code, 0) + 1
+ # the QC/NM etc codes are in the 3rd field and always present
+ match_codes[fields[2]] += 1
+ # ignore lines that don't have a fasta filename
+ if len(fields) < 7:
+ continue
+ fasta = self.genome_map.get(fields[6], fields[6])
+ mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
+ self._match_codes = match_codes
+ self._mapped_reads = mapped_reads
+ self._reads = reads
+
+ def _update_name(self):
+ # extract the sample name
+ if self.pathname is None:
+ return
+
+ path, name = os.path.split(self.pathname)
+ split_name = name.split('_')
+ self._sample_name = split_name[0]
+ self._lane_id = split_name[1]
+
+ def _get_sample_name(self):
+ if self._sample_name is None:
+ self._update_name()
+ return self._sample_name
+ sample_name = property(_get_sample_name)
+
+ def _get_lane_id(self):
+ if self._lane_id is None:
+ self._update_name()
+ return self._lane_id
+ lane_id = property(_get_lane_id)
+
+ def _get_reads(self):
+ if self._reads is None:
+ self._update()
+ return self._reads
+ reads = property(_get_reads)
+
+ def _get_mapped_reads(self):
+ if self._mapped_reads is None:
+ self._update()
+ return self._mapped_reads
+ mapped_reads = property(_get_mapped_reads)
+
+ def _get_match_codes(self):
+ if self._match_codes is None:
+ self._update()
+ return self._match_codes
+ match_codes = property(_get_match_codes)
+
+ def get_elements(self):
+ lane = ElementTree.Element(ElandLane.LANE,
+ {'version':
+ unicode(ElandLane.XML_VERSION)})
+ sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
+ sample_tag.text = self.sample_name
+ lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
+ lane_tag.text = self.lane_id
+ genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
+ for k, v in self.genome_map.items():
+ item = ElementTree.SubElement(
+ genome_map, ElandLane.GENOME_ITEM,
+ {'name':k, 'value':unicode(v)})
+ mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
+ for k, v in self.mapped_reads.items():
+ item = ElementTree.SubElement(
+ mapped_reads, ElandLane.MAPPED_ITEM,
+ {'name':k, 'value':unicode(v)})
+ match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
+ for k, v in self.match_codes.items():
+ item = ElementTree.SubElement(
+ match_codes, ElandLane.MATCH_ITEM,
+ {'name':k, 'value':unicode(v)})
+ reads = ElementTree.SubElement(lane, ElandLane.READS)
+ reads.text = unicode(self.reads)
+
+ return lane
+
+ def set_elements(self, tree):
+ if tree.tag != ElandLane.LANE:
+ raise ValueError('Exptecting %s' % (ElandLane.LANE,))
+
+ # reset dictionaries
+ self._mapped_reads = {}
+ self._match_codes = {}
+
+ for element in tree:
+ tag = element.tag.lower()
+ if tag == ElandLane.SAMPLE_NAME.lower():
+ self._sample_name = element.text
+ elif tag == ElandLane.LANE_ID.lower():
+ self._lane_id = element.text
+ elif tag == ElandLane.GENOME_MAP.lower():
+ for child in element:
+ name = child.attrib['name']
+ value = child.attrib['value']
+ self.genome_map[name] = value
+ elif tag == ElandLane.MAPPED_READS.lower():
+ for child in element:
+ name = child.attrib['name']
+ value = child.attrib['value']
+ self._mapped_reads[name] = int(value)
+ elif tag == ElandLane.MATCH_CODES.lower():
+ for child in element:
+ name = child.attrib['name']
+ value = int(child.attrib['value'])
+ self._match_codes[name] = value
+ elif tag == ElandLane.READS.lower():
+ self._reads = int(element.text)
+ else:
+ logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
+
+def extract_eland_sequence(instream, outstream, start, end):
+ """
+ Extract a chunk of sequence out of an eland file
+ """
+ for line in instream:
+ record = line.split()
+ if len(record) > 1:
+ result = [record[0], record[1][start:end]]
+ else:
+ result = [record[0][start:end]]
+ outstream.write("\t".join(result))
+ outstream.write(os.linesep)
+
+class ELAND(object):
+ """
+ Summarize information from eland files
+ """
+ XML_VERSION = 1
+
+ ELAND = 'ElandCollection'
+ LANE = 'Lane'
+ LANE_ID = 'id'
+
+ def __init__(self, xml=None):
+ # we need information from the gerald config.xml
+ self.results = {}
+
+ if xml is not None:
+ self.set_elements(xml)
+
+ def __len__(self):
+ return len(self.results)
+
+ def keys(self):
+ return self.results.keys()
+
+ def values(self):
+ return self.results.values()
+
+ def items(self):
+ return self.results.items()
+
+ def __getitem__(self, key):
+ return self.results[key]
+
+ def get_elements(self):
+ root = ElementTree.Element(ELAND.ELAND,
+ {'version': unicode(ELAND.XML_VERSION)})
+ for lane_id, lane in self.results.items():
+ eland_lane = lane.get_elements()
+ eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
+ root.append(eland_lane)
+ return root
+
+ def set_elements(self, tree):
+ if tree.tag.lower() != ELAND.ELAND.lower():
+ raise ValueError('Expecting %s', ELAND.ELAND)
+ for element in list(tree):
+ lane_id = element.attrib[ELAND.LANE_ID]
+ lane = ElandLane(xml=element)
+ self.results[lane_id] = lane
+
+def eland(basedir, gerald=None, genome_maps=None):
+ e = ELAND()
+
+ file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
+ if len(file_list) == 0:
+ # lets handle compressed eland files too
+ file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
+
+ for pathname in file_list:
+ # yes the lane_id is also being computed in ElandLane._update
+ # I didn't want to clutter up my constructor
+ # but I needed to persist the sample_name/lane_id for
+ # runfolder summary_report
+ path, name = os.path.split(pathname)
+ split_name = name.split('_')
+ lane_id = split_name[1]
+
+ if genome_maps is not None:
+ genome_map = genome_maps[lane_id]
+ elif gerald is not None:
+ genome_dir = gerald.lanes[lane_id].eland_genome
+ genome_map = build_genome_fasta_map(genome_dir)
+ else:
+ genome_map = {}
+
+ eland_result = ElandLane(pathname, genome_map)
+ e.results[lane_id] = eland_result
+ return e
--- /dev/null
+from xml import sax
+
+
+def get_cycles(recipe_xml_filepath):
+ """
+ returns the number of cycles found in Recipe*.xml
+ """
+ handler = CycleXmlHandler()
+ sax.parse(recipe_xml_filepath, handler)
+ return handler.cycle_count
+
+
+
+class CycleXmlHandler(sax.ContentHandler):
+
+ def __init__(self):
+ self.cycle_count = 0
+ self.in_protocol = False
+ sax.ContentHandler.__init__(self)
+
+
+ def startDocument(self):
+ self.cycle_count = 0
+ self.in_protocol = False
+
+
+ def startElement(self, name, attrs):
+
+ #Only count Incorporations as cycles if within
+ # the protocol section of the xml document.
+ if name == "Incorporation" and self.in_protocol:
+ #print 'Found a cycle!'
+ self.cycle_count += 1
+ return
+
+ elif name == 'Protocol':
+ #print 'In protocol'
+ self.in_protocol = True
+ return
+
+ #print 'Skipping: %s' % (name)
+
+
+ def endElement(self, name):
+
+ if name == 'Protocol':
+ #print 'End protocol'
+ self.in_protocol = False
--- /dev/null
+#!/usr/bin/env python
+
+from optparse import OptionParser, IndentedHelpFormatter
+from ConfigParser import SafeConfigParser
+
+import logging
+import os
+import sys
+import urllib2
+
+# Configuration files read by constructConfigParser(), system wide
+# file first and the per-user file second.
+CONFIG_SYSTEM = '/etc/hts_frontend/hts_frontend.conf'
+CONFIG_USER = os.path.expanduser('~/.hts_frontend.conf')
+
+#Disable or enable commandline arg parsing; disabled by default.
+# While True, constructOptionParser() returns None and
+# getCombinedOptions() starts from a DummyOptions instance.
+DISABLE_CMDLINE = True
+
+class FlowCellNotFound(Exception): pass
+class WebError404(Exception): pass
+
+class DummyOptions:
+ """
+ Used when command line parsing is disabled; default
+ """
+ def __init__(self):
+ self.url = None
+ self.output_filepath = None
+ self.flowcell = None
+ self.genome_dir = None
+
+class PreformattedDescriptionFormatter(IndentedHelpFormatter):
+
+ #def format_description(self, description):
+ #
+ # if description:
+ # return description + "\n"
+ # else:
+ # return ""
+
+ def format_epilog(self, epilog):
+ """
+ It was removing my preformated epilog, so this should override
+ that behavior! Muhahaha!
+ """
+ if epilog:
+ return "\n" + epilog + "\n"
+ else:
+ return ""
+
+
+def constructOptionParser():
+ """
+ returns a pre-setup optparser
+ """
+ global DISABLE_CMDLINE
+
+ if DISABLE_CMDLINE:
+ return None
+
+ parser = OptionParser(formatter=PreformattedDescriptionFormatter())
+
+ parser.set_description('Retrieves eland config file from hts_frontend web frontend.')
+
+ parser.epilog = """
+Config File:
+ * %s (System wide)
+ * %s (User specific; overrides system)
+ * command line overrides all config file options
+
+ Example Config File:
+
+ [config_file_server]
+ base_host_url=http://somewhere.domain:port
+""" % (CONFIG_SYSTEM, CONFIG_USER)
+
+ #Special formatter for allowing preformatted description.
+ ##parser.format_epilog(PreformattedDescriptionFormatter())
+
+ parser.add_option("-u", "--url",
+ action="store", type="string", dest="url")
+
+ parser.add_option("-o", "--output",
+ action="store", type="string", dest="output_filepath")
+
+ parser.add_option("-f", "--flowcell",
+ action="store", type="string", dest="flowcell")
+
+ parser.add_option("-g", "--genome_dir",
+ action="store", type="string", dest="genome_dir")
+
+ #parser.set_default("url", "default")
+
+ return parser
+
+def constructConfigParser():
+ """
+ returns a pre-setup config parser
+ """
+ parser = SafeConfigParser()
+ parser.read([CONFIG_SYSTEM, CONFIG_USER])
+ if not parser.has_section('config_file_server'):
+ parser.add_section('config_file_server')
+ if not parser.has_section('local_setup'):
+ parser.add_section('local_setup')
+
+ return parser
+
+
+def getCombinedOptions():
+ """
+ Returns optparse options after it has be updated with ConfigParser
+ config files and merged with parsed commandline options.
+ """
+ cl_parser = constructOptionParser()
+ conf_parser = constructConfigParser()
+
+ if cl_parser is None:
+ options = DummyOptions()
+ else:
+ options, args = cl_parser.parse_args()
+
+ if options.url is None:
+ if conf_parser.has_option('config_file_server', 'base_host_url'):
+ options.url = conf_parser.get('config_file_server', 'base_host_url')
+
+ if options.genome_dir is None:
+ if conf_parser.has_option('local_setup', 'genome_dir'):
+ options.genome_dir = conf_parser.get('local_setup', 'genome_dir')
+
+ print 'USING OPTIONS:'
+ print ' URL:', options.url
+ print ' OUT:', options.output_filepath
+ print ' FC:', options.flowcell
+ print 'GDIR:', options.genome_dir
+ print ''
+
+ return options
+
+
+def saveConfigFile(flowcell, base_host_url, output_filepath):
+ """
+ retrieves the flowcell eland config file, give the base_host_url
+ (i.e. http://sub.domain.edu:port)
+ """
+ url = base_host_url + '/eland_config/%s/' % (flowcell)
+
+ f = open(output_filepath, 'w')
+ #try:
+ try:
+ web = urllib2.urlopen(url)
+ except urllib2.URLError, e:
+ errmsg = 'URLError: %d' % (e.code,)
+ logging.error(errmsg)
+ logging.error('opened %s' % (url,))
+ logging.error('%s' % ( e.read(),))
+ raise IOError(errmsg)
+
+ #except IOError, msg:
+ # if str(msg).find("Connection refused") >= 0:
+ # print 'Error: Connection refused for: %s' % (url)
+ # f.close()
+ # sys.exit(1)
+ # elif str(msg).find("Name or service not known") >= 0:
+ # print 'Error: Invalid domain or ip address for: %s' % (url)
+ # f.close()
+ # sys.exit(2)
+ # else:
+ # raise IOError, msg
+
+ data = web.read()
+
+ if data.find('Hmm, config file for') >= 0:
+ msg = "Flowcell (%s) not found in DB; full url(%s)" % (flowcell, url)
+ raise FlowCellNotFound, msg
+
+ if data.find('404 - Not Found') >= 0:
+ msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
+ "Did you get right port #?" % (flowcell, base_host_url, url)
+ raise FlowCellNotFound, msg
+
+ f.write(data)
+ web.close()
+ f.close()
+ logging.info('Wrote config file to %s' % (output_filepath,))
+
+
--- /dev/null
+import glob
+import re
+import os
+import sys
+import time
+import threading
+
+# Patterns GARunStatus._process_config uses to classify config lines.
+# comment lines (skipped)
+s_comment = re.compile('^#')
+# 'READ_LENGTH <n>': applied to lanes 1 through 8
+s_general_read_len = re.compile('^READ_LENGTH ')
+# '<lane digits>:READ_LENGTH <n>': applied to the listed lanes only
+s_read_len = re.compile('^[1-8]+:READ_LENGTH ')
+
+# NOTE(review): not referenced in the visible part of this module
+s_firecrest = None
+
+def _four_digit_num_in_string(num):
+ if num < 0:
+ pass
+ elif num < 10:
+ return '000' + str(num)
+ elif num < 100:
+ return '00' + str(num)
+ elif num < 1000:
+ return '0' + str(num)
+ elif num < 10000:
+ return str(num)
+
+ msg = 'Invalid number: %s' % (num)
+ raise ValueError, msg
+
+def _two_digit_num_in_string(num):
+ if num < 0:
+ pass
+ elif num < 10:
+ return '0' + str(num)
+ elif num < 100:
+ return str(num)
+
+ msg = 'Invalid number: %s' % (num)
+ raise ValueError, msg
+
+
+# File name patterns, expanded by _p2f().  Each %s slot is filled, in
+# order, with the lane, the tile as four digits and the cycle as two
+# digits, depending on how many slots the pattern has.
+#
+# FIRECREST PATTERNS
+# _p2f(<pattern>, lane, tile, cycle)
+PATTERN_FIRECREST_QCM = 's_%s_%s_%s_qcm.xml'
+
+# _p2f(<pattern>, lane, tile)
+PATTERN_FIRECREST_INT = 's_%s_%s_02_int.txt'
+PATTERN_FIRECREST_NSE = 's_%s_%s_nse.txt.gz'
+PATTERN_FIRECREST_POS = 's_%s_%s_pos.txt'
+PATTERN_FIRECREST_IDX = 's_%s_%s_idx.txt'
+PATTERN_FIRECREST_CLU1 = 's_%s_%s_01_1_clu.txt'
+PATTERN_FIRECREST_CLU2 = 's_%s_%s_01_2_clu.txt'
+PATTERN_FIRECREST_CLU3 = 's_%s_%s_01_3_clu.txt'
+PATTERN_FIRECREST_CLU4 = 's_%s_%s_01_4_clu.txt'
+
+
+# BUSTARD PATTERNS
+# _p2f(<pattern>, lane, tile)
+PATTERN_BUSTARD_SIG2 = 's_%s_%s_sig2.txt'
+PATTERN_BUSTARD_PRB = 's_%s_%s_prb.txt'
+
+
+
+# GERALD PATTERNS
+# Only some of these are registered as expected files by
+# GARunStatus._generate_expected; the rest are defined but unused there.
+# _p2f(<pattern>, lane, tile)
+PATTERN_GERALD_ALLTMP = 's_%s_%s_all.txt.tmp'
+PATTERN_GERALD_QRAWTMP = 's_%s_%s_qraw.txt.tmp'
+PATTERN_GERALD_ALLPNGTMP = 's_%s_%s_all.tmp.png'
+PATTERN_GERALD_ALIGNTMP = 's_%s_%s_align.txt.tmp'
+PATTERN_GERALD_QVALTMP = 's_%s_%s_qval.txt.tmp'
+PATTERN_GERALD_SCORETMP = 's_%s_%s_score.txt.tmp'
+PATTERN_GERALD_PREALIGNTMP = 's_%s_%s_prealign.txt.tmp'
+PATTERN_GERALD_REALIGNTMP = 's_%s_%s_realign.txt.tmp'
+PATTERN_GERALD_RESCORETMP = 's_%s_%s_rescore.txt.tmp'
+PATTERN_GERALD_RESCOREPNG = 's_%s_%s_rescore.png'
+PATTERN_GERALD_ERRORSTMPPNG = 's_%s_%s_errors.tmp.png'
+PATTERN_GERALD_QCALTMP = 's_%s_%s_qcal.txt.tmp'
+PATTERN_GERALD_QVAL = 's_%s_%s_qval.txt'
+
+# _p2f(<pattern>, lane)
+PATTERN_GERALD_SEQPRETMP = 's_%s_seqpre.txt.tmp'
+PATTERN_GERALD_RESULTTMP = 's_%s_eland_result.txt.tmp'
+PATTERN_GERALD_SIGMEANSTMP = 's_%s_Signal_Means.txt.tmp'
+PATTERN_GERALD_CALLPNG = 's_%s_call.png'
+PATTERN_GERALD_ALLPNG = 's_%s_all.png'
+PATTERN_GERALD_PERCENTALLPNG = 's_%s_percent_all.png'
+PATTERN_GERALD_PERCENTCALLPNG = 's_%s_percent_call.png'
+PATTERN_GERALD_PERCENTBASEPNG = 's_%s_percent_base.png'
+PATTERN_GERALD_FILTTMP = 's_%s_filt.txt.tmp'
+PATTERN_GERALD_FRAGTMP = 's_%s_frag.txt.tmp'
+PATTERN_GERALD_QREPORTTMP = 's_%s_qreport.txt.tmp'
+PATTERN_GERALD_QTABLETMP = 's_%s_qtable.txt.tmp'
+PATTERN_GERALD_QCALREPORTTMP = 's_%s_qcalreport.txt.tmp'
+PATTERN_GERALD_SEQUENCETMP = 's_%s_sequence.txt.tmp'
+PATTERN_GERALD_LANEFINISHED = 's_%s_finished.txt'
+
+
+
+def _p2f(pattern, lane, tile=None, cycle=None):
+ """
+ Converts a pattern plus info into file names
+ """
+
+ # lane, and cycle provided (INVALID)
+ if tile is None and cycle is not None:
+ msg = "Handling of cycle without tile is not currently implemented."
+ raise ValueError, msg
+
+ # lane, tile, cycle provided
+ elif cycle:
+ return pattern % (lane,
+ _four_digit_num_in_string(tile),
+ _two_digit_num_in_string(cycle))
+
+ # lane, tile provided
+ elif tile:
+ return pattern % (lane, _four_digit_num_in_string(tile))
+
+ # lane provided
+ else:
+ return pattern % (lane)
+
+
+class GARunStatus(object):
+
+ def __init__(self, conf_filepath):
+ """
+ Given an eland config file in the top level directory
+ of a run, predicts the files that will be generated
+ during a run and provides methods for retrieving
+ (completed, total) for each step or entire run.
+ """
+ #print 'self._conf_filepath = %s' % (conf_filepath)
+ self._conf_filepath = conf_filepath
+ self._base_dir, junk = os.path.split(conf_filepath)
+ self._image_dir = os.path.join(self._base_dir, 'Images')
+
+ self.lanes = []
+ self.lane_read_length = {}
+ self.tiles = None
+ self.cycles = None
+
+ self.status = {}
+ self.status['firecrest'] = {}
+ self.status['bustard'] = {}
+ self.status['gerald'] = {}
+
+ self._process_config()
+ self._count_tiles()
+ self._count_cycles()
+ self._generate_expected()
+
+
+ def _process_config(self):
+ """
+ Grabs info from self._conf_filepath
+ """
+ f = open(self._conf_filepath, 'r')
+
+ for line in f:
+
+ #Skip comment lines for now.
+ if s_comment.search(line):
+ continue
+
+ mo = s_general_read_len.search(line)
+ if mo:
+ read_length = int(line[mo.end():])
+ #Handle general READ_LENGTH
+ for i in range(1,9):
+ self.lane_read_length[i] = read_length
+
+ mo = s_read_len.search(line)
+ if mo:
+ read_length = int(line[mo.end():])
+ lanes, junk = line.split(':')
+
+ #Convert lanes from string of lanes to list of lane #s.
+ lanes = [ int(i) for i in lanes ]
+
+
+ for lane in lanes:
+
+ #Keep track of which lanes are being run.
+ if lane not in self.lanes:
+ self.lanes.append(lane)
+
+ #Update with lane specific read lengths
+ self.lane_read_length[lane] = read_length
+
+ self.lanes.sort()
+
+
+ def _count_tiles(self):
+ """
+ Count the number of tiles being used
+ """
+ self.tiles = len(glob.glob(os.path.join(self._image_dir,
+ 'L001',
+ 'C1.1',
+ 's_1_*_a.tif')))
+
+ def _count_cycles(self):
+ """
+ Figures out the number of cycles that are available
+ """
+ #print 'self._image_dir = %s' % (self._image_dir)
+ cycle_dirs = glob.glob(os.path.join(self._image_dir, 'L001', 'C*.1'))
+ #print 'cycle_dirs = %s' % (cycle_dirs)
+ cycle_list = []
+ for cycle_dir in cycle_dirs:
+ junk, c = os.path.split(cycle_dir)
+ cycle_list.append(int(c[1:c.find('.')]))
+
+ self.cycles = max(cycle_list)
+
+
+
+
+ def _generate_expected(self):
+ """
+ generates a list of files we expect to find.
+ """
+
+ firecrest = self.status['firecrest']
+ bustard = self.status['bustard']
+ gerald = self.status['gerald']
+
+
+ for lane in self.lanes:
+ for tile in range(1,self.tiles+1):
+ for cycle in range(1, self.cycles+1):
+
+ ##########################
+ # LANE, TILE, CYCLE LAYER
+
+ # FIRECREST
+ firecrest[_p2f(PATTERN_FIRECREST_QCM, lane, tile, cycle)] = False
+
+
+ ###################
+ # LANE, TILE LAYER
+
+ # FIRECREST
+ firecrest[_p2f(PATTERN_FIRECREST_INT, lane, tile)] = False
+ firecrest[_p2f(PATTERN_FIRECREST_NSE, lane, tile)] = False
+ firecrest[_p2f(PATTERN_FIRECREST_POS, lane, tile)] = False
+ firecrest[_p2f(PATTERN_FIRECREST_IDX, lane, tile)] = False
+ firecrest[_p2f(PATTERN_FIRECREST_CLU1, lane, tile)] = False
+ firecrest[_p2f(PATTERN_FIRECREST_CLU2, lane, tile)] = False
+ firecrest[_p2f(PATTERN_FIRECREST_CLU3, lane, tile)] = False
+ firecrest[_p2f(PATTERN_FIRECREST_CLU4, lane, tile)] = False
+
+
+ # BUSTARD
+ bustard[_p2f(PATTERN_BUSTARD_SIG2, lane, tile)] = False
+ bustard[_p2f(PATTERN_BUSTARD_PRB, lane, tile)] = False
+
+
+ # GERALD
+ #gerald[_p2f(PATTERN_GERALD_ALLTMP, lane, tile)] = False
+ #gerald[_p2f(PATTERN_GERALD_QRAWTMP, lane, tile)] = False
+ #gerald[_p2f(PATTERN_GERALD_ALLPNGTMP, lane, tile)] = False
+ #gerald[_p2f(PATTERN_GERALD_ALIGNTMP, lane, tile)] = False
+ #gerald[_p2f(PATTERN_GERALD_QVALTMP, lane, tile)] = False
+ #gerald[_p2f(PATTERN_GERALD_SCORETMP, lane, tile)] = False
+ #gerald[_p2f(PATTERN_GERALD_PREALIGNTMP, lane, tile)] = False
+ #gerald[_p2f(PATTERN_GERALD_REALIGNTMP, lane, tile)] = False
+ #gerald[_p2f(PATTERN_GERALD_RESCORETMP, lane, tile)] = False
+ gerald[_p2f(PATTERN_GERALD_RESCOREPNG, lane, tile)] = False
+ #gerald[_p2f(PATTERN_GERALD_ERRORSTMPPNG, lane, tile)] = False
+ #gerald[_p2f(PATTERN_GERALD_QCALTMP, lane, tile)] = False
+ #gerald[_p2f(PATTERN_GERALD_QVAL, lane, tile)] = False
+
+ ###################
+ # LANE LAYER
+
+ # GERALD
+ #gerald[_p2f(PATTERN_GERALD_SEQPRETMP, lane)] = False
+ #gerald[_p2f(PATTERN_GERALD_RESULTTMP, lane)] = False
+ #gerald[_p2f(PATTERN_GERALD_SIGMEANSTMP, lane)] = False
+ gerald[_p2f(PATTERN_GERALD_CALLPNG, lane)] = False
+ gerald[_p2f(PATTERN_GERALD_ALLPNG, lane)] = False
+ gerald[_p2f(PATTERN_GERALD_PERCENTALLPNG, lane)] = False
+ gerald[_p2f(PATTERN_GERALD_PERCENTCALLPNG, lane)] = False
+ gerald[_p2f(PATTERN_GERALD_PERCENTBASEPNG, lane)] = False
+ #gerald[_p2f(PATTERN_GERALD_FILTTMP, lane)] = False
+ #gerald[_p2f(PATTERN_GERALD_FRAGTMP, lane)] = False
+ #gerald[_p2f(PATTERN_GERALD_QREPORTTMP, lane)] = False
+ #gerald[_p2f(PATTERN_GERALD_QTABLETMP, lane)] = False
+ #gerald[_p2f(PATTERN_GERALD_QCALREPORTTMP, lane)] = False
+ #gerald[_p2f(PATTERN_GERALD_SEQUENCETMP, lane)] = False
+ gerald[_p2f(PATTERN_GERALD_LANEFINISHED, lane)] = False
+
+
+
+ #################
+ # LOOPS FINISHED
+
+ # FIRECREST
+ firecrest['offsets_finished.txt'] = False
+ firecrest['finished.txt'] = False
+
+ # BUSTARD
+ bustard['finished.txt'] = False
+
+ # GERALD
+ gerald['tiles.txt'] = False
+ gerald['FullAll.htm'] = False
+ #gerald['All.htm.tmp'] = False
+ #gerald['Signal_Means.txt.tmp'] = False
+ #gerald['plotIntensity_for_IVC'] = False
+ #gerald['IVC.htm.tmp'] = False
+ gerald['FullError.htm'] = False
+ gerald['FullPerfect.htm'] = False
+ #gerald['Error.htm.tmp'] = False
+ #gerald['Perfect.htm.tmp'] = False
+ #gerald['Summary.htm.tmp'] = False
+ #gerald['Tile.htm.tmp'] = False
+ gerald['finished.txt'] = False
+
+ def statusFirecrest(self):
+ """
+ returns (<completed>, <total>)
+ """
+ firecrest = self.status['firecrest']
+ total = len(firecrest)
+ completed = firecrest.values().count(True)
+
+ return (completed, total)
+
+
+ def statusBustard(self):
+ """
+ returns (<completed>, <total>)
+ """
+ bustard = self.status['bustard']
+ total = len(bustard)
+ completed = bustard.values().count(True)
+
+ return (completed, total)
+
+
+ def statusGerald(self):
+ """
+ returns (<completed>, <total>)
+ """
+ gerald = self.status['gerald']
+ total = len(gerald)
+ completed = gerald.values().count(True)
+
+ return (completed, total)
+
+
+ def statusTotal(self):
+ """
+ returns (<completed>, <total>)
+ """
+ #f = firecrest c = completed
+ #b = bustard t = total
+ #g = gerald
+ fc, ft = self.statusFirecrest()
+ bc, bt = self.statusBustard()
+ gc, gt = self.statusGerald()
+
+ return (fc+bc+gc, ft+bt+gt)
+
+
+ def statusReport(self):
+ """
+ Generate the basic percent complete report
+ """
+ def _percentCompleted(completed, total):
+ """
+ Returns precent completed as float
+ """
+ return (completed / float(total)) * 100
+
+ fc, ft = self.statusFirecrest()
+ bc, bt = self.statusBustard()
+ gc, gt = self.statusGerald()
+ tc, tt = self.statusTotal()
+
+ fp = _percentCompleted(fc, ft)
+ bp = _percentCompleted(bc, bt)
+ gp = _percentCompleted(gc, gt)
+ tp = _percentCompleted(tc, tt)
+
+ report = ['Firecrest: %s%% (%s/%s)' % (fp, fc, ft),
+ ' Bustard: %s%% (%s/%s)' % (bp, bc, bt),
+ ' Gerald: %s%% (%s/%s)' % (gp, gc, gt),
+ '-----------------------',
+ ' Total: %s%% (%s/%s)' % (tp, tc, tt),
+ ]
+ return report
+
+ def updateFirecrest(self, filename):
+ """
+ Marks firecrest filename as being completed.
+ """
+ self.status['firecrest'][filename] = True
+
+
+ def updateBustard(self, filename):
+ """
+ Marks bustard filename as being completed.
+ """
+ self.status['bustard'][filename] = True
+
+
+ def updateGerald(self, filename):
+ """
+ Marks gerald filename as being completed.
+ """
+ self.status['gerald'][filename] = True
+
+
+
+##################################################
+# Functions to be called by Thread(target=<func>)
+def _cmdLineStatusMonitorFunc(conf_info):
+ """
+ Given a ConfigInfo object, provides status to stdout.
+
+ You should probably use startCmdLineStatusMonitor()
+ instead of ths function.
+
+ Use with:
+ t = threading.Thread(target=_cmdLineStatusMonitorFunc,
+ args=[conf_info])
+ t.setDaemon(True)
+ t.start()
+ """
+ SLEEP_AMOUNT = 30
+
+ while 1:
+ if conf_info.status is None:
+ print "No status object yet."
+ time.sleep(SLEEP_AMOUNT)
+ continue
+
+ report = conf_info.status.statusReport()
+ print os.linesep.join(report)
+ print
+
+ time.sleep(SLEEP_AMOUNT)
+
+
+#############################################
+# Start monitor thread convenience functions
+def startCmdLineStatusMonitor(conf_info):
+ """
+ Starts a command line status monitor given a conf_info object.
+ """
+ t = threading.Thread(target=_cmdLineStatusMonitorFunc, args=[conf_info])
+ t.setDaemon(True)
+ t.start()
+
+from optparse import OptionParser
+def make_parser():
+ usage = "%prog: config file"
+
+ parser = OptionParser()
+ return parser
+
+def main(cmdline=None):
+ parser = make_parser()
+ opt, args = parser.parse_args(cmdline)
+
+ if len(args) != 1:
+ parser.error("need name of configuration file")
+
+ status = GARunStatus(args[0])
+ print os.linesep.join(status.statusReport())
+ return 0
+
+# Run as a script: the value returned by main() becomes the exit status.
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
+
--- /dev/null
+"""
+Core information needed to inspect a runfolder.
+"""
+from glob import glob
+import logging
+import os
+import re
+import shutil
+import stat
+import subprocess
+import sys
+import time
+
+try:
+ from xml.etree import ElementTree
+except ImportError, e:
+ from elementtree import ElementTree
+
+# strptime format for day-month-year dates
+EUROPEAN_STRPTIME = "%d-%m-%Y"
+# regular expression fragments; each one captures a single group
+# a date written as d-m-yyyy or dd-mm-yyyy
+EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
+# a run of digits and dots
+VERSION_RE = "([0-9\.]+)"
+# a run of letters and digits
+USER_RE = "([a-zA-Z0-9]+)"
+LANES_PER_FLOWCELL = 8
+
+from htsworkflow.util.alphanum import alphanum
+from htsworkflow.util.ethelp import indent, flatten
+
+
+class PipelineRun(object):
+ """
+ Capture "interesting" information about a pipeline run
+ """
+ XML_VERSION = 1
+ PIPELINE_RUN = 'PipelineRun'
+ FLOWCELL_ID = 'FlowcellID'
+
+ def __init__(self, pathname=None, firecrest=None, bustard=None, gerald=None, xml=None):
+ if pathname is not None:
+ self.pathname = os.path.normpath(pathname)
+ else:
+ self.pathname = None
+ self._name = None
+ self._flowcell_id = None
+ self.firecrest = firecrest
+ self.bustard = bustard
+ self.gerald = gerald
+
+ if xml is not None:
+ self.set_elements(xml)
+
+ def _get_flowcell_id(self):
+ # extract flowcell ID
+ if self._flowcell_id is None:
+ config_dir = os.path.join(self.pathname, 'Config')
+ flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
+ if os.path.exists(flowcell_id_path):
+ flowcell_id_tree = ElementTree.parse(flowcell_id_path)
+ self._flowcell_id = flowcell_id_tree.findtext('Text')
+ else:
+ path_fields = self.pathname.split('_')
+ if len(path_fields) > 0:
+ # guessing last element of filename
+ flowcell_id = path_fields[-1]
+ else:
+ flowcell_id = 'unknown'
+
+ logging.warning(
+ "Flowcell id was not found, guessing %s" % (
+ flowcell_id))
+ self._flowcell_id = flowcell_id
+ return self._flowcell_id
+ flowcell_id = property(_get_flowcell_id)
+
+ def get_elements(self):
+ """
+ make one master xml file from all of our sub-components.
+ """
+ root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
+ flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
+ flowcell.text = self.flowcell_id
+ root.append(self.firecrest.get_elements())
+ root.append(self.bustard.get_elements())
+ root.append(self.gerald.get_elements())
+ return root
+
def set_elements(self, tree):
    """
    Populate this run from a PipelineRun element tree.

    Tag names are compared case-insensitively.  The flowcell id element
    sets the cached flowcell id; firecrest, bustard and gerald elements
    are turned into the matching component objects.  Unrecognized child
    tags are logged and skipped.

    Raises ValueError when the root tag is not the pipeline run tag.
    """
    # this file gets imported by all the others,
    # so we need to hide the imports to avoid cyclic imports
    from htsworkflow.pipelines import firecrest
    from htsworkflow.pipelines import bustard
    from htsworkflow.pipelines import gerald

    tag = tree.tag.lower()
    if tag != PipelineRun.PIPELINE_RUN.lower():
        raise ValueError('Pipeline Run Expecting %s got %s' % (
            PipelineRun.PIPELINE_RUN, tag))
    for element in tree:
        tag = element.tag.lower()
        if tag == PipelineRun.FLOWCELL_ID.lower():
            self._flowcell_id = element.text
        #ok the xword.Xword.XWORD pattern for module.class.constant is lame
        elif tag == firecrest.Firecrest.FIRECREST.lower():
            self.firecrest = firecrest.Firecrest(xml=element)
        elif tag == bustard.Bustard.BUSTARD.lower():
            self.bustard = bustard.Bustard(xml=element)
        elif tag == gerald.Gerald.GERALD.lower():
            self.gerald = gerald.Gerald(xml=element)
        else:
            # logging.warn is a deprecated alias; logging.warning is the
            # spelling already used elsewhere in this file.
            logging.warning('PipelineRun unrecognized tag %s' % (tag,))
+
def _get_run_name(self):
    """
    Return the report file name of this run, caching it in self._name.

    The name has the form run_<flowcell id>_<YYYY-MM-DD>.xml, where the
    date is the local date of the latest of the firecrest, bustard and
    gerald times.
    """
    if self._name is not None:
        return self._name
    latest = max(self.firecrest.time, self.bustard.time, self.gerald.time)
    day = time.strftime('%Y-%m-%d', time.localtime(latest))
    self._name = 'run_' + self.flowcell_id + "_" + day + '.xml'
    return self._name
name = property(_get_run_name)
+
def save(self, destdir=None):
    """
    Write the xml report of this run to a file named self.name.

    The file is placed in destdir; when destdir is None an empty
    directory component is used, i.e. the path is just self.name.
    """
    target_dir = '' if destdir is None else destdir
    logging.info("Saving run report "+ self.name)
    report = self.get_elements()
    indent(report)
    ElementTree.ElementTree(report).write(
        os.path.join(target_dir, self.name))
+
def load(self, filename):
    """
    Parse the xml run report in filename and hand its root element
    to set_elements.
    """
    logging.info("Loading run report from " + filename)
    document = ElementTree.parse(filename)
    self.set_elements(document.getroot())
+
def get_runs(runfolder):
    """
    Search through a run folder for all the various sub component runs
    and then return a PipelineRun for each different combination.

    For example if there are two different GERALD runs, this will
    generate two different PipelineRun objects, that differ
    in their gerald component.

    GERALD directories that raise IOError while being loaded are
    logged and skipped.
    """
    # imported here rather than at module level to avoid cyclic imports
    from htsworkflow.pipelines import firecrest
    from htsworkflow.pipelines import bustard
    from htsworkflow.pipelines import gerald

    datadir = os.path.join(runfolder, 'Data')

    logging.info('Searching for runs in ' + datadir)
    runs = []
    for firecrest_pathname in glob(os.path.join(datadir, "*Firecrest*")):
        f = firecrest.firecrest(firecrest_pathname)
        bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
        for bustard_pathname in glob(bustard_glob):
            b = bustard.bustard(bustard_pathname)
            gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
            for gerald_pathname in glob(gerald_glob):
                try:
                    g = gerald.gerald(gerald_pathname)
                    runs.append(PipelineRun(runfolder, f, b, g))
                except IOError as e:
                    # report through logging like the rest of this module
                    # instead of printing to stdout
                    logging.warning("Ignoring %s" % (str(e),))
    return runs
+
+
def extract_run_parameters(runs):
    """
    Save the run report of every run in runs by calling its save()
    method without arguments.
    """
    for pipeline_run in runs:
        pipeline_run.save()
+
def summarize_mapped_reads(mapped_reads):
    """
    Summarize per chromosome reads into a genome count
    But handle spike-in/contamination symlinks separately.

    mapped_reads maps a sequence file name to a read count.  Names with
    a directory part are summed under that directory (one total per
    genome directory); names without one keep their own entry.  When no
    name has a directory part an 'unknown' genome with 0 reads is
    reported, unless an entry called 'unknown' already exists.

    Returns a new dictionary of name -> summed read count.
    """
    summarized_reads = {}
    genome_seen = False
    for pathname, count in mapped_reads.items():
        genome, filename = os.path.split(pathname)
        if len(genome) > 0:
            # keep a separate total per genome directory rather than
            # crediting every genome's reads to the last one seen
            key = genome
            genome_seen = True
        else:
            key = filename
        summarized_reads[key] = summarized_reads.get(key, 0) + count
    if not genome_seen:
        summarized_reads.setdefault('unknown', 0)
    return summarized_reads
+
def summary_report(runs):
    """
    Summarize cluster numbers and mapped read counts for a runfolder

    For every run a header line is followed, per eland lane, by the
    sample name, lane id, cluster statistics, read totals, match code
    counts and the mapped read counts per genome / spike-in file.

    Returns the whole report as one string joined with os.linesep.
    """
    # local import: only needed to adapt the cmp-style alphanum helper
    from functools import cmp_to_key

    def percent_of(count, total):
        # a lane without any reads would otherwise divide by zero
        if not total:
            return 0.0
        return float(count) / total * 100

    report = []
    for run in runs:
        # print a run name?
        report.append('Summary for %s' % (run.name,))
        # sort the report
        eland_results = run.gerald.eland_results.results
        eland_keys = sorted(eland_results.keys(), key=cmp_to_key(alphanum))

        lane_results = run.gerald.summary.lane_results
        for lane_id in eland_keys:
            result = eland_results[lane_id]
            report.append("Sample name %s" % (result.sample_name,))
            report.append("Lane id %s" % (result.lane_id,))
            cluster = lane_results[result.lane_id].cluster
            report.append("Clusters %d +/- %d" % (cluster[0], cluster[1]))
            report.append("Total Reads: %d" % (result.reads,))
            mc = result._match_codes
            nm = mc['NM']
            nm_percent = percent_of(nm, result.reads)
            qc = mc['QC']
            qc_percent = percent_of(qc, result.reads)

            report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent))
            report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent))
            report.append('Unique (0,1,2 mismatches) %d %d %d' %
                          (mc['U0'], mc['U1'], mc['U2']))
            report.append('Repeat (0,1,2 mismatches) %d %d %d' %
                          (mc['R0'], mc['R1'], mc['R2']))
            report.append("Mapped Reads")
            mapped_reads = summarize_mapped_reads(result.mapped_reads)
            for name, counts in mapped_reads.items():
                report.append(" %s: %d" % (name, counts))
            report.append('---')
            report.append('')
    return os.linesep.join(report)
+
def _compress_score_files(gerald_dir, cycle_dir):
    """Tar the *_score.txt files of gerald_dir into cycle_dir/scores.tar.bz2."""
    score_files = [f for f in os.listdir(gerald_dir)
                   if re.match('.*_score.txt', f)]

    tar_cmd = ['/bin/tar', 'c'] + score_files
    bzip_cmd = ['bzip2', '-9', '-c']
    tar_dest_name = os.path.join(cycle_dir, 'scores.tar.bz2')
    logging.info("Compressing score files in %s" % (gerald_dir,))
    logging.info("Running tar: " + " ".join(tar_cmd[:10]))
    logging.info("Running bzip2: " + " ".join(bzip_cmd))
    logging.info("Writing to %s" % (tar_dest_name,))

    # binary mode: the archive is compressed data, not text
    with open(tar_dest_name, 'wb') as tar_dest:
        tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE,
                               shell=False, cwd=gerald_dir)
        bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest)
        # drop the parent's copy of the pipe so tar is signalled
        # if bzip2 exits early
        tar.stdout.close()
        # wait for both ends so the archive is complete before returning
        bzip.wait()
        tar.wait()


def _bzip_file(source_name, dest_name):
    """Compress source_name with bzip2 -9 into the new file dest_name."""
    args = ['bzip2', '-9', '-c', source_name]
    logging.info('Running: %s' % (" ".join(args),))
    with open(dest_name, 'wb') as bzip_dest:
        bzip = subprocess.Popen(args, stdout=bzip_dest)
        logging.info('Saving to %s' % (dest_name,))
        bzip.wait()


def extract_results(runs, output_base_dir=None):
    """
    Copy the interesting results of each run into a result directory.

    For every run a directory <output_base_dir>/<flowcell id>/C<start>-<stop>
    is created and filled with the run's xml report, Summary.htm (when
    present), a bzip2 compressed tar of the score files and bzip2
    compressed copies of the eland result files.  A cycle directory that
    already exists is reported as an error and that run is skipped.

    output_base_dir defaults to the current working directory.
    """
    if output_base_dir is None:
        output_base_dir = os.getcwd()

    for r in runs:
        result_dir = os.path.join(output_base_dir, r.flowcell_id)
        logging.info("Using %s as result directory" % (result_dir,))
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        # create cycle_dir
        cycle = "C%d-%d" % (r.firecrest.start, r.firecrest.stop)
        logging.info("Filling in %s" % (cycle,))
        cycle_dir = os.path.join(result_dir, cycle)
        if os.path.exists(cycle_dir):
            logging.error("%s already exists, not overwriting" % (cycle_dir,))
            continue
        else:
            os.mkdir(cycle_dir)

        # copy stuff out of the main run
        g = r.gerald

        # save run file
        r.save(cycle_dir)

        # Copy Summary.htm
        summary_path = os.path.join(g.pathname, 'Summary.htm')
        if os.path.exists(summary_path):
            logging.info('Copying %s to %s' % (summary_path, cycle_dir))
            shutil.copy(summary_path, cycle_dir)
        else:
            logging.info('Summary file %s was not found' % (summary_path,))

        # tar score files
        _compress_score_files(g.pathname, cycle_dir)

        # copy & bzip eland files
        for eland_lane in g.eland_results.values():
            source_name = eland_lane.pathname
            path, name = os.path.split(source_name)
            _bzip_file(source_name, os.path.join(cycle_dir, name + '.bz2'))
+
def clean_runs(runs):
    """
    Clean up run folders to optimize for compression.

    Not implemented yet: the function currently ignores runs.
    """
    # TODO: implement this.  Planned steps:
    #   rm RunLog*.xml
    #   rm pipeline_*.txt
    #   rm gclog.txt
    #   rm NetCopy.log
    #   rm nfn.log
    #   rm Images/L*
    #   cd Data/C1-*_Firecrest*
    #   make clean_intermediate
    return None
--- /dev/null
+import unittest
+
+from StringIO import StringIO
+from htsworkflow.pipelines import genome_mapper
+
class testGenomeMapper(unittest.TestCase):
    """Tests for genome_mapper.constructMapperDict."""

    def test_construct_mapper(self):
        """
        Build a mapper from a small genome table and check that
        'species|build' and bare 'species' keys format to the expected
        paths, and that the mapper reports six entries.
        """
        genomes = {
            'Arabidopsis thaliana': {'v01212004': '/arabidopsis'},
            'Homo sapiens': {'hg18': '/hg18'},
            'Mus musculus': {'mm8': '/mm8',
                             'mm9': '/mm9',
                             'mm10': '/mm10'},
            'Phage': {'174': '/phi'},
        }
        genome_map = genome_mapper.constructMapperDict(genomes)

        # assertEqual replaces the failUnlessEqual alias, which newer
        # unittest versions no longer provide
        self.assertEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8")
        self.assertEqual("%(Phage|174)s" % (genome_map), "/phi")
        self.assertEqual("%(Mus musculus)s" % (genome_map), "/mm10")
        # the original repeated the mm8 check here; cover mm9 instead
        self.assertEqual("%(Mus musculus|mm9)s" % (genome_map), "/mm9")
        self.assertEqual("%(Mus musculus|mm10)s" % (genome_map), "/mm10")

        self.assertEqual(len(genome_map.keys()), 6)
        self.assertEqual(len(genome_map.values()), 6)
        self.assertEqual(len(genome_map.items()), 6)
+
+
def suite():
    """Return a test suite holding every test* method of testGenomeMapper."""
    # unittest.makeSuite is gone from newer unittest versions;
    # the loader's default method prefix is already 'test'
    return unittest.TestLoader().loadTestsFromTestCase(testGenomeMapper)

if __name__ == "__main__":
    unittest.main(defaultTest="suite")
--- /dev/null
+#!/usr/bin/env python
+
+from datetime import datetime, date
+import os
+import tempfile
+import shutil
+import unittest
+
+from htsworkflow.pipelines import firecrest
+from htsworkflow.pipelines import bustard
+from htsworkflow.pipelines import gerald
+from htsworkflow.pipelines import runfolder
+from htsworkflow.pipelines.runfolder import ElementTree
+
+
def make_flowcell_id(runfolder_dir, flowcell_id=None):
    """
    Write a Config/FlowcellId.xml file below runfolder_dir.

    flowcell_id defaults to '207BTAAXY'.  The Config directory is
    created when it does not exist yet.
    """
    if flowcell_id is None:
        flowcell_id = '207BTAAXY'

    config = """<?xml version="1.0"?>
<FlowcellId>
 <Text>%s</Text>
</FlowcellId>""" % (flowcell_id,)
    config_dir = os.path.join(runfolder_dir, 'Config')

    if not os.path.exists(config_dir):
        os.mkdir(config_dir)
    pathname = os.path.join(config_dir, 'FlowcellId.xml')
    # the with block closes the file even when the write fails
    with open(pathname, 'w') as f:
        f.write(config)
+
def make_matrix(matrix_dir):
    """Write a sample s_matrix.txt frequency response matrix into matrix_dir."""
    contents = """# Auto-generated frequency response matrix
> A
> C
> G
> T
0.77 0.15 -0.04 -0.04
0.76 1.02 -0.05 -0.06
-0.10 -0.10 1.17 -0.03
-0.13 -0.12 0.80 1.27
"""
    s_matrix = os.path.join(matrix_dir, 's_matrix.txt')
    # the with block closes the file even when the write fails
    with open(s_matrix, 'w') as f:
        f.write(contents)
+
def make_phasing_params(bustard_dir):
    """Write params1.xml .. params8.xml phasing files into bustard_dir."""
    for lane in range(1, 9):
        pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane,))
        # the with block closes the file even when the write fails
        with open(pathname, 'w') as f:
            f.write("""<Parameters>
 <Phasing>0.009900</Phasing>
 <Prephasing>0.003500</Prephasing>
</Parameters>
""")
+
def make_gerald_config(gerald_dir):
    """
    Write a sample GERALD config.xml into gerald_dir.

    The file holds chip wide run parameters plus lane specific
    analysis, eland genome, read length and use-bases settings for
    lanes s_1 .. s_8.
    """
    config_xml = """<RunParameters>
<ChipWideRunParameters>
 <ANALYSIS>default</ANALYSIS>
 <BAD_LANES></BAD_LANES>
 <BAD_TILES></BAD_TILES>
 <CONTAM_DIR></CONTAM_DIR>
 <CONTAM_FILE></CONTAM_FILE>
 <ELAND_GENOME>Need_to_specify_ELAND_genome_directory</ELAND_GENOME>
 <ELAND_MULTIPLE_INSTANCES>8</ELAND_MULTIPLE_INSTANCES>
 <ELAND_REPEAT></ELAND_REPEAT>
 <EMAIL_DOMAIN>domain.com</EMAIL_DOMAIN>
 <EMAIL_LIST>diane</EMAIL_LIST>
 <EMAIL_SERVER>localhost:25</EMAIL_SERVER>
 <EXPT_DIR>/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane</EXPT_DIR>
 <EXPT_DIR_ROOT>/home/diane/gec</EXPT_DIR_ROOT>
 <FORCE>1</FORCE>
 <GENOME_DIR>/home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes</GENOME_DIR>
 <GENOME_FILE>Need_to_specify_genome_file_name</GENOME_FILE>
 <HAMSTER_FLAG>genome</HAMSTER_FLAG>
 <OUT_DIR>/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane</OUT_DIR>
 <POST_RUN_COMMAND></POST_RUN_COMMAND>
 <PRB_FILE_SUFFIX>_prb.txt</PRB_FILE_SUFFIX>
 <PURE_BASES>12</PURE_BASES>
 <QF_PARAMS>'((CHASTITY>=0.6))'</QF_PARAMS>
 <QHG_FILE_SUFFIX>_qhg.txt</QHG_FILE_SUFFIX>
 <QUALITY_FORMAT>--symbolic</QUALITY_FORMAT>
 <READ_LENGTH>32</READ_LENGTH>
 <SEQUENCE_FORMAT>--scarf</SEQUENCE_FORMAT>
 <SEQ_FILE_SUFFIX>_seq.txt</SEQ_FILE_SUFFIX>
 <SIG_FILE_SUFFIX_DEPHASED>_sig2.txt</SIG_FILE_SUFFIX_DEPHASED>
 <SIG_FILE_SUFFIX_NOT_DEPHASED>_sig.txt</SIG_FILE_SUFFIX_NOT_DEPHASED>
 <SOFTWARE_VERSION>@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp</SOFTWARE_VERSION>
 <TILE_REGEX>s_[1-8]_[0-9][0-9][0-9][0-9]</TILE_REGEX>
 <TILE_ROOT>s</TILE_ROOT>
 <TIME_STAMP>Sat Apr 19 19:08:30 2008</TIME_STAMP>
 <TOOLS_DIR>/home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald</TOOLS_DIR>
 <USE_BASES>all</USE_BASES>
 <WEB_DIR_ROOT>http://host.domain.com/yourshare/</WEB_DIR_ROOT>
</ChipWideRunParameters>
<LaneSpecificRunParameters>
 <ANALYSIS>
 <s_1>eland</s_1>
 <s_2>eland</s_2>
 <s_3>eland</s_3>
 <s_4>eland</s_4>
 <s_5>eland</s_5>
 <s_6>eland</s_6>
 <s_7>eland</s_7>
 <s_8>eland</s_8>
 </ANALYSIS>
 <ELAND_GENOME>
 <s_1>/g/dm3</s_1>
 <s_2>/g/equcab1</s_2>
 <s_3>/g/equcab1</s_3>
 <s_4>/g/canfam2</s_4>
 <s_5>/g/hg18</s_5>
 <s_6>/g/hg18</s_6>
 <s_7>/g/hg18</s_7>
 <s_8>/g/hg18</s_8>
 </ELAND_GENOME>
 <READ_LENGTH>
 <s_1>32</s_1>
 <s_2>32</s_2>
 <s_3>32</s_3>
 <s_4>32</s_4>
 <s_5>32</s_5>
 <s_6>32</s_6>
 <s_7>32</s_7>
 <s_8>32</s_8>
 </READ_LENGTH>
 <USE_BASES>
 <s_1>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_1>
 <s_2>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_2>
 <s_3>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_3>
 <s_4>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_4>
 <s_5>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_5>
 <s_6>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_6>
 <s_7>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_7>
 <s_8>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_8>
 </USE_BASES>
</LaneSpecificRunParameters>
</RunParameters>
"""
    pathname = os.path.join(gerald_dir, 'config.xml')
    # the with block closes the file even when the write fails
    with open(pathname, 'w') as f:
        f.write(config_xml)
+
+
def make_summary_htm(gerald_dir):
    """
    Write a sample GERALD Summary.htm into gerald_dir.

    The page contains the chip summary, the lane parameter summary and
    the lane results summary tables for lanes 1 .. 8.
    """
    summary_htm = """<!--RUN_TIME Mon Apr 21 11:52:25 2008 -->
<!--SOFTWARE_VERSION @(#) $Id: jerboa.pl,v 1.31 2007/03/05 17:52:15 km Exp $-->
<html>
<body>

<a name="Top"><h2><title>080416_HWI-EAS229_0024_207BTAAXX Summary</title></h2></a>
<h1>Summary Information For Experiment 080416_HWI-EAS229_0024_207BTAAXX on Machine HWI-EAS229</h1>
<h2><br></br>Chip Summary<br></br></h2>
<table border="1" cellpadding="5">
<tr><td>Machine</td><td>HWI-EAS229</td></tr>
<tr><td>Run Folder</td><td>080416_HWI-EAS229_0024_207BTAAXX</td></tr>
<tr><td>Chip ID</td><td>unknown</td></tr>
</table>
<h2><br></br>Lane Parameter Summary<br></br></h2>
<table border="1" cellpadding="5">
<tr>
<td>Lane</td>
<td>Sample ID</td>
<td>Sample Target</td>
<td>Sample Type</td>
<td>Length</td>
<td>Filter</td>
<td>Tiles</td>
</tr>
<tr>
<td>1</td>
<td>unknown</td>
<td>dm3</td>
<td>ELAND</td>
<td>32</td>
<td>'((CHASTITY>=0.6))'</td>
<td><a href="#Lane1">Lane 1</a></td>
</tr>
<tr>
<td>2</td>
<td>unknown</td>
<td>equcab1</td>
<td>ELAND</td>
<td>32</td>
<td>'((CHASTITY>=0.6))'</td>
<td><a href="#Lane2">Lane 2</a></td>
</tr>
<tr>
<td>3</td>
<td>unknown</td>
<td>equcab1</td>
<td>ELAND</td>
<td>32</td>
<td>'((CHASTITY>=0.6))'</td>
<td><a href="#Lane3">Lane 3</a></td>
</tr>
<tr>
<td>4</td>
<td>unknown</td>
<td>canfam2</td>
<td>ELAND</td>
<td>32</td>
<td>'((CHASTITY>=0.6))'</td>
<td><a href="#Lane4">Lane 4</a></td>
</tr>
<tr>
<td>5</td>
<td>unknown</td>
<td>hg18</td>
<td>ELAND</td>
<td>32</td>
<td>'((CHASTITY>=0.6))'</td>
<td><a href="#Lane5">Lane 5</a></td>
</tr>
<tr>
<td>6</td>
<td>unknown</td>
<td>hg18</td>
<td>ELAND</td>
<td>32</td>
<td>'((CHASTITY>=0.6))'</td>
<td><a href="#Lane6">Lane 6</a></td>
</tr>
<tr>
<td>7</td>
<td>unknown</td>
<td>hg18</td>
<td>ELAND</td>
<td>32</td>
<td>'((CHASTITY>=0.6))'</td>
<td><a href="#Lane7">Lane 7</a></td>
</tr>
<tr>
<td>8</td>
<td>unknown</td>
<td>hg18</td>
<td>ELAND</td>
<td>32</td>
<td>'((CHASTITY>=0.6))'</td>
<td><a href="#Lane8">Lane 8</a></td>
</tr>
</table>
<h2><br></br>Lane Results Summary<br></br></h2>
<table border="1" cellpadding="5">
<tr>

<td>Lane </td>
<td>Clusters </td>
<td>Av 1st Cycle Int </td>
<td>% intensity after 20 cycles </td>
<td>% PF Clusters </td>
<td>% Align (PF) </td>
<td>Av Alignment Score (PF) </td>
<td> % Error Rate (PF) </td>
</tr>
<tr>
<td>1</td>
<td>17421 +/- 2139</td>
<td>7230 +/- 801</td>
<td>23.73 +/- 10.79</td>
<td>13.00 +/- 22.91</td>
<td>32.03 +/- 18.45</td>
<td>6703.57 +/- 3753.85</td>
<td>4.55 +/- 4.81</td>
</tr>
<tr>
<td>2</td>
<td>20311 +/- 2402</td>
<td>7660 +/- 678</td>
<td>17.03 +/- 4.40</td>
<td>40.74 +/- 30.33</td>
<td>29.54 +/- 9.03</td>
<td>5184.02 +/- 1631.54</td>
<td>3.27 +/- 3.94</td>
</tr>
<tr>
<td>3</td>
<td>20193 +/- 2399</td>
<td>7700 +/- 797</td>
<td>15.75 +/- 3.30</td>
<td>56.56 +/- 17.16</td>
<td>27.33 +/- 7.48</td>
<td>4803.49 +/- 1313.31</td>
<td>3.07 +/- 2.86</td>
</tr>
<tr>
<td>4</td>
<td>15537 +/- 2531</td>
<td>7620 +/- 1392</td>
<td>15.37 +/- 3.79</td>
<td>63.05 +/- 18.30</td>
<td>15.88 +/- 4.99</td>
<td>3162.13 +/- 962.59</td>
<td>3.11 +/- 2.22</td>
</tr>
<tr>
<td>5</td>
<td>32047 +/- 3356</td>
<td>8093 +/- 831</td>
<td>23.79 +/- 6.18</td>
<td>53.36 +/- 18.06</td>
<td>48.04 +/- 13.77</td>
<td>9866.23 +/- 2877.30</td>
<td>2.26 +/- 1.16</td>
</tr>
<tr>
<td>6</td>
<td>32946 +/- 4753</td>
<td>8227 +/- 736</td>
<td>24.07 +/- 4.69</td>
<td>54.65 +/- 12.57</td>
<td>50.98 +/- 10.54</td>
<td>10468.86 +/- 2228.53</td>
<td>2.21 +/- 2.33</td>
</tr>
<tr>
<td>7</td>
<td>39504 +/- 4171</td>
<td>8401 +/- 785</td>
<td>22.55 +/- 4.56</td>
<td>45.22 +/- 10.34</td>
<td>48.41 +/- 9.67</td>
<td>9829.40 +/- 1993.20</td>
<td>2.26 +/- 1.11</td>
</tr>
<tr>
<td>8</td>
<td>37998 +/- 3792</td>
<td>8443 +/- 1211</td>
<td>39.03 +/- 7.52</td>
<td>42.16 +/- 12.35</td>
<td>40.98 +/- 14.89</td>
<td>8128.87 +/- 3055.34</td>
<td>3.57 +/- 2.77</td>
</tr>
</table>
</body>
</html>
"""
    pathname = os.path.join(gerald_dir, 'Summary.htm')
    # the with block closes the file even when the write fails
    with open(pathname, 'w') as f:
        f.write(summary_htm)
+
def make_eland_results(gerald_dir):
    """
    Write s_1_eland_result.txt .. s_8_eland_result.txt into gerald_dir,
    each holding the same four sample eland result lines.
    """
    eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D.
>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T
>HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0
>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T
"""
    for i in range(1, 9):
        pathname = os.path.join(gerald_dir,
                                's_%d_eland_result.txt' % (i,))
        # the with block closes the file even when the write fails
        with open(pathname, 'w') as f:
            f.write(eland_result)
+
class RunfolderTests(unittest.TestCase):
    """
    Test components of the runfolder processing code
    which includes firecrest, bustard, and gerald

    Each test runs against a fake runfolder tree that setUp builds in a
    temporary directory and tearDown removes again.
    """
    def setUp(self):
        # make a fake runfolder directory
        self.temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_')

        self.runfolder_dir = os.path.join(self.temp_dir,
                                          '080102_HWI-EAS229_0010_207BTAAXX')
        os.mkdir(self.runfolder_dir)

        self.data_dir = os.path.join(self.runfolder_dir, 'Data')
        os.mkdir(self.data_dir)

        self.firecrest_dir = os.path.join(self.data_dir,
                               'C1-33_Firecrest1.8.28_12-04-2008_diane'
                             )
        os.mkdir(self.firecrest_dir)
        self.matrix_dir = os.path.join(self.firecrest_dir, 'Matrix')
        os.mkdir(self.matrix_dir)
        make_matrix(self.matrix_dir)

        self.bustard_dir = os.path.join(self.firecrest_dir,
                                        'Bustard1.8.28_12-04-2008_diane')
        os.mkdir(self.bustard_dir)
        make_phasing_params(self.bustard_dir)

        self.gerald_dir = os.path.join(self.bustard_dir,
                                       'GERALD_12-04-2008_diane')
        os.mkdir(self.gerald_dir)
        make_gerald_config(self.gerald_dir)
        make_summary_htm(self.gerald_dir)
        make_eland_results(self.gerald_dir)

    def tearDown(self):
        shutil.rmtree(self.temp_dir)

    def test_firecrest(self):
        """
        Construct a firecrest object
        """
        f = firecrest.firecrest(self.firecrest_dir)
        self.assertEqual(f.version, '1.8.28')
        self.assertEqual(f.start, 1)
        self.assertEqual(f.stop, 33)
        self.assertEqual(f.user, 'diane')
        self.assertEqual(f.date, date(2008,4,12))

        xml = f.get_elements()
        # just make sure that element tree can serialize the tree
        ElementTree.tostring(xml)

        f2 = firecrest.Firecrest(xml=xml)
        self.assertEqual(f.version, f2.version)
        self.assertEqual(f.start, f2.start)
        self.assertEqual(f.stop, f2.stop)
        self.assertEqual(f.user, f2.user)
        self.assertEqual(f.date, f2.date)

    def test_bustard(self):
        """
        construct a bustard object
        """
        b = bustard.bustard(self.bustard_dir)
        self.assertEqual(b.version, '1.8.28')
        self.assertEqual(b.date, date(2008,4,12))
        self.assertEqual(b.user, 'diane')
        self.assertEqual(len(b.phasing), 8)
        self.assertAlmostEqual(b.phasing[8].phasing, 0.0099)

        xml = b.get_elements()
        b2 = bustard.Bustard(xml=xml)
        self.assertEqual(b.version, b2.version)
        self.assertEqual(b.date, b2.date )
        self.assertEqual(b.user, b2.user)
        self.assertEqual(len(b.phasing), len(b2.phasing))
        for key in b.phasing.keys():
            self.assertEqual(b.phasing[key].lane,
                             b2.phasing[key].lane)
            self.assertEqual(b.phasing[key].phasing,
                             b2.phasing[key].phasing)
            self.assertEqual(b.phasing[key].prephasing,
                             b2.phasing[key].prephasing)

    def test_gerald(self):
        """
        Construct a gerald object and round-trip it through xml
        """
        # need to update gerald and make tests for it
        g = gerald.gerald(self.gerald_dir)

        self.assertEqual(g.version,
            '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp')
        self.assertEqual(g.date, datetime(2008,4,19,19,8,30))
        self.assertEqual(len(g.lanes), len(g.lanes.keys()))
        self.assertEqual(len(g.lanes), len(g.lanes.items()))


        # list of genomes, matches what was defined up in
        # make_gerald_config.
        # the first None is to offset the genomes list to be 1..9
        # instead of pythons default 0..8
        genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2',
                         '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ]

        # test lane specific parameters from gerald config file
        for i in range(1,9):
            cur_lane = g.lanes[str(i)]
            self.assertEqual(cur_lane.analysis, 'eland')
            self.assertEqual(cur_lane.eland_genome, genomes[i])
            self.assertEqual(cur_lane.read_length, '32')
            self.assertEqual(cur_lane.use_bases, 'Y'*32)

        # test data extracted from summary file
        clusters = [None,
                    (17421, 2139), (20311, 2402), (20193, 2399), (15537, 2531),
                    (32047, 3356), (32946, 4753), (39504, 4171), (37998, 3792)]

        for i in range(1,9):
            summary_lane = g.summary[str(i)]
            self.assertEqual(summary_lane.cluster, clusters[i])
            self.assertEqual(summary_lane.lane, str(i))

        xml = g.get_elements()
        # just make sure that element tree can serialize the tree
        ElementTree.tostring(xml)
        g2 = gerald.Gerald(xml=xml)

        # do it all again after extracting from the xml file
        self.assertEqual(g.version, g2.version)
        self.assertEqual(g.date, g2.date)
        self.assertEqual(len(g.lanes.keys()), len(g2.lanes.keys()))
        self.assertEqual(len(g.lanes.items()), len(g2.lanes.items()))

        # test lane specific parameters from gerald config file
        for i in range(1,9):
            g_lane = g.lanes[str(i)]
            g2_lane = g2.lanes[str(i)]
            self.assertEqual(g_lane.analysis, g2_lane.analysis)
            self.assertEqual(g_lane.eland_genome, g2_lane.eland_genome)
            self.assertEqual(g_lane.read_length, g2_lane.read_length)
            self.assertEqual(g_lane.use_bases, g2_lane.use_bases)

        # test (some) summary elements
        for i in range(1,9):
            g_summary = g.summary[str(i)]
            g2_summary = g2.summary[str(i)]
            self.assertEqual(g_summary.cluster, g2_summary.cluster)
            self.assertEqual(g_summary.lane, g2_summary.lane)

            g_eland = g.eland_results
            g2_eland = g2.eland_results
            for lane in g_eland.keys():
                self.assertEqual(g_eland[lane].reads,
                                 g2_eland[lane].reads)
                self.assertEqual(len(g_eland[lane].mapped_reads),
                                 len(g2_eland[lane].mapped_reads))
                for k in g_eland[lane].mapped_reads.keys():
                    self.assertEqual(g_eland[lane].mapped_reads[k],
                                     g2_eland[lane].mapped_reads[k])

                self.assertEqual(len(g_eland[lane].match_codes),
                                 len(g2_eland[lane].match_codes))
                for k in g_eland[lane].match_codes.keys():
                    self.assertEqual(g_eland[lane].match_codes[k],
                                     g2_eland[lane].match_codes[k])


    def test_eland(self):
        """
        Load the eland results with a genome map and round-trip them
        through xml
        """
        dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa',
                    'chr2L.fa': 'dm3/chr2L.fa',
                    'Lambda.fa': 'Lambda.fa'}
        genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map,
                        '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map }
        eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps)

        for i in range(1,9):
            lane = eland[str(i)]
            self.assertEqual(lane.reads, 4)
            self.assertEqual(lane.sample_name, "s")
            # str(i) compares equal to the unicode form for plain digits
            # and, unlike unicode(), exists on every Python version
            self.assertEqual(lane.lane_id, str(i))
            self.assertEqual(len(lane.mapped_reads), 3)
            self.assertEqual(lane.mapped_reads['Lambda.fa'], 1)
            self.assertEqual(lane.mapped_reads['dm3/chr2L.fa'], 1)
            self.assertEqual(lane.match_codes['U1'], 2)
            self.assertEqual(lane.match_codes['NM'], 1)

        xml = eland.get_elements()
        # just make sure that element tree can serialize the tree
        ElementTree.tostring(xml)
        e2 = gerald.ELAND(xml=xml)

        for i in range(1,9):
            l1 = eland[str(i)]
            l2 = e2[str(i)]
            self.assertEqual(l1.reads, l2.reads)
            self.assertEqual(l1.sample_name, l2.sample_name)
            self.assertEqual(l1.lane_id, l2.lane_id)
            self.assertEqual(len(l1.mapped_reads), len(l2.mapped_reads))
            self.assertEqual(len(l1.mapped_reads), 3)
            for k in l1.mapped_reads.keys():
                self.assertEqual(l1.mapped_reads[k],
                                 l2.mapped_reads[k])

            self.assertEqual(len(l1.match_codes), 9)
            self.assertEqual(len(l1.match_codes), len(l2.match_codes))
            for k in l1.match_codes.keys():
                self.assertEqual(l1.match_codes[k],
                                 l2.match_codes[k])

    def test_runfolder(self):
        """
        Find the run in the fake runfolder and round-trip it through xml
        """
        runs = runfolder.get_runs(self.runfolder_dir)

        # do we get the flowcell id from the filename?
        self.assertEqual(len(runs), 1)
        self.assertEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml')

        # do we get the flowcell id from the FlowcellId.xml file
        make_flowcell_id(self.runfolder_dir, '207BTAAXY')
        runs = runfolder.get_runs(self.runfolder_dir)
        self.assertEqual(len(runs), 1)
        self.assertEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml')

        r1 = runs[0]
        xml = r1.get_elements()
        # just make sure that element tree can serialize the tree
        ElementTree.tostring(xml)

        r2 = runfolder.PipelineRun(xml=xml)
        self.assertEqual(r1.name, r2.name)
        self.assertNotEqual(r2.firecrest, None)
        self.assertNotEqual(r2.bustard, None)
        self.assertNotEqual(r2.gerald, None)
+
+
def suite():
    """Return a test suite holding every test* method of RunfolderTests."""
    # unittest.makeSuite is gone from newer unittest versions;
    # the loader's default method prefix is already 'test'
    return unittest.TestLoader().loadTestsFromTestCase(RunfolderTests)

if __name__ == "__main__":
    unittest.main(defaultTest="suite")
+
--- /dev/null
+#!/usr/bin/env python
+
+from datetime import datetime, date
+import os
+import tempfile
+import shutil
+import unittest
+
+from htsworkflow.pipelines import firecrest
+from htsworkflow.pipelines import bustard
+from htsworkflow.pipelines import gerald
+from htsworkflow.pipelines import runfolder
+from htsworkflow.pipelines.runfolder import ElementTree
+
+
def make_flowcell_id(runfolder_dir, flowcell_id=None):
    """
    Write a Config/FlowcellId.xml file below runfolder_dir.

    flowcell_id defaults to '207BTAAXY'.  The Config directory is
    created when it does not exist yet.
    """
    if flowcell_id is None:
        flowcell_id = '207BTAAXY'

    config = """<?xml version="1.0"?>
<FlowcellId>
 <Text>%s</Text>
</FlowcellId>""" % (flowcell_id,)
    config_dir = os.path.join(runfolder_dir, 'Config')

    if not os.path.exists(config_dir):
        os.mkdir(config_dir)
    pathname = os.path.join(config_dir, 'FlowcellId.xml')
    # the with block closes the file even when the write fails
    with open(pathname, 'w') as f:
        f.write(config)
+
def make_matrix(matrix_dir):
    """Write a sample s_matrix.txt frequency response matrix into matrix_dir."""
    contents = """# Auto-generated frequency response matrix
> A
> C
> G
> T
0.77 0.15 -0.04 -0.04
0.76 1.02 -0.05 -0.06
-0.10 -0.10 1.17 -0.03
-0.13 -0.12 0.80 1.27
"""
    s_matrix = os.path.join(matrix_dir, 's_matrix.txt')
    # the with block closes the file even when the write fails
    with open(s_matrix, 'w') as f:
        f.write(contents)
+
def make_phasing_params(bustard_dir):
    """Write params1.xml .. params8.xml phasing files into bustard_dir."""
    for lane in range(1, 9):
        pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane,))
        # the with block closes the file even when the write fails
        with open(pathname, 'w') as f:
            f.write("""<Parameters>
 <Phasing>0.009900</Phasing>
 <Prephasing>0.003500</Prephasing>
</Parameters>
""")
+
def make_gerald_config(gerald_dir):
    """
    Write a sample GERALD config.xml into gerald_dir.

    The file holds chip wide run parameters plus lane specific
    analysis, eland genome, read length and use-bases settings for
    lanes s_1 .. s_8.
    """
    config_xml = """<RunParameters>
<ChipWideRunParameters>
 <ANALYSIS>default</ANALYSIS>
 <BAD_LANES></BAD_LANES>
 <BAD_TILES></BAD_TILES>
 <CONTAM_DIR></CONTAM_DIR>
 <CONTAM_FILE></CONTAM_FILE>
 <ELAND_GENOME>Need_to_specify_ELAND_genome_directory</ELAND_GENOME>
 <ELAND_MULTIPLE_INSTANCES>8</ELAND_MULTIPLE_INSTANCES>
 <ELAND_REPEAT></ELAND_REPEAT>
 <EMAIL_DOMAIN>domain.com</EMAIL_DOMAIN>
 <EMAIL_LIST>diane</EMAIL_LIST>
 <EMAIL_SERVER>localhost:25</EMAIL_SERVER>
 <EXPT_DIR>/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane</EXPT_DIR>
 <EXPT_DIR_ROOT>/home/diane/gec</EXPT_DIR_ROOT>
 <FORCE>1</FORCE>
 <GENOME_DIR>/home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes</GENOME_DIR>
 <GENOME_FILE>Need_to_specify_genome_file_name</GENOME_FILE>
 <HAMSTER_FLAG>genome</HAMSTER_FLAG>
 <OUT_DIR>/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane</OUT_DIR>
 <POST_RUN_COMMAND></POST_RUN_COMMAND>
 <PRB_FILE_SUFFIX>_prb.txt</PRB_FILE_SUFFIX>
 <PURE_BASES>12</PURE_BASES>
 <QF_PARAMS>'((CHASTITY>=0.6))'</QF_PARAMS>
 <QHG_FILE_SUFFIX>_qhg.txt</QHG_FILE_SUFFIX>
 <QUALITY_FORMAT>--symbolic</QUALITY_FORMAT>
 <READ_LENGTH>32</READ_LENGTH>
 <SEQUENCE_FORMAT>--scarf</SEQUENCE_FORMAT>
 <SEQ_FILE_SUFFIX>_seq.txt</SEQ_FILE_SUFFIX>
 <SIG_FILE_SUFFIX_DEPHASED>_sig2.txt</SIG_FILE_SUFFIX_DEPHASED>
 <SIG_FILE_SUFFIX_NOT_DEPHASED>_sig.txt</SIG_FILE_SUFFIX_NOT_DEPHASED>
 <SOFTWARE_VERSION>@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp</SOFTWARE_VERSION>
 <TILE_REGEX>s_[1-8]_[0-9][0-9][0-9][0-9]</TILE_REGEX>
 <TILE_ROOT>s</TILE_ROOT>
 <TIME_STAMP>Sat Apr 19 19:08:30 2008</TIME_STAMP>
 <TOOLS_DIR>/home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald</TOOLS_DIR>
 <USE_BASES>all</USE_BASES>
 <WEB_DIR_ROOT>http://host.domain.com/yourshare/</WEB_DIR_ROOT>
</ChipWideRunParameters>
<LaneSpecificRunParameters>
 <ANALYSIS>
 <s_1>eland</s_1>
 <s_2>eland</s_2>
 <s_3>eland</s_3>
 <s_4>eland</s_4>
 <s_5>eland</s_5>
 <s_6>eland</s_6>
 <s_7>eland</s_7>
 <s_8>eland</s_8>
 </ANALYSIS>
 <ELAND_GENOME>
 <s_1>/g/dm3</s_1>
 <s_2>/g/equcab1</s_2>
 <s_3>/g/equcab1</s_3>
 <s_4>/g/canfam2</s_4>
 <s_5>/g/hg18</s_5>
 <s_6>/g/hg18</s_6>
 <s_7>/g/hg18</s_7>
 <s_8>/g/hg18</s_8>
 </ELAND_GENOME>
 <READ_LENGTH>
 <s_1>32</s_1>
 <s_2>32</s_2>
 <s_3>32</s_3>
 <s_4>32</s_4>
 <s_5>32</s_5>
 <s_6>32</s_6>
 <s_7>32</s_7>
 <s_8>32</s_8>
 </READ_LENGTH>
 <USE_BASES>
 <s_1>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_1>
 <s_2>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_2>
 <s_3>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_3>
 <s_4>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_4>
 <s_5>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_5>
 <s_6>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_6>
 <s_7>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_7>
 <s_8>YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY</s_8>
 </USE_BASES>
</LaneSpecificRunParameters>
</RunParameters>
"""
    pathname = os.path.join(gerald_dir, 'config.xml')
    # the with block closes the file even when the write fails
    with open(pathname, 'w') as f:
        f.write(config_xml)
+
+def make_summary_htm(gerald_dir):
+ summary_htm="""<!--RUN_TIME Wed Jul 2 06:47:44 2008 -->
+<!--SOFTWARE_VERSION @(#) $Id: jerboa.pl,v 1.94 2007/12/04 09:59:07 rshaw Exp $-->
+<html>
+<body>
+
+<a name="Top"><h2><title>080627_HWI-EAS229_0036_3055HAXX Summary</title></h2></a>
+<h1>Summary Information For Experiment 080627_HWI-EAS229_0036_3055HAXX on Machine HWI-EAS229</h1>
+<h2><br></br>Chip Summary<br></br></h2>
+<table border="1" cellpadding="5">
+<tr><td>Machine</td><td>HWI-EAS229</td></tr>
+<tr><td>Run Folder</td><td>080627_HWI-EAS229_0036_3055HAXX</td></tr>
+<tr><td>Chip ID</td><td>unknown</td></tr>
+</table>
+<h2><br></br>Chip Results Summary<br></br></h2>
+<table border="1" cellpadding="5">
+<tr>
+<td>Clusters</td>
+<td>Clusters (PF)</td>
+<td>Yield (kbases)</td>
+</tr>
+<tr><td>80933224</td>
+<td>43577803</td>
+<td>1133022</td>
+</tr>
+</table>
+<h2><br></br>Lane Parameter Summary<br></br></h2>
+<table border="1" cellpadding="5">
+<tr>
+<td>Lane</td>
+<td>Sample ID</td>
+<td>Sample Target</td>
+<td>Sample Type</td>
+<td>Length</td>
+<td>Filter</td>
+<td>Num Tiles</td>
+<td>Tiles</td>
+</tr>
+<tr>
+<td>1</td>
+<td>unknown</td>
+<td>mm9</td>
+<td>ELAND</td>
+<td>26</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td>100</td>
+<td><a href="#Lane1">Lane 1</a></td>
+</tr>
+<tr>
+<td>2</td>
+<td>unknown</td>
+<td>mm9</td>
+<td>ELAND</td>
+<td>26</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td>100</td>
+<td><a href="#Lane2">Lane 2</a></td>
+</tr>
+<tr>
+<td>3</td>
+<td>unknown</td>
+<td>mm9</td>
+<td>ELAND</td>
+<td>26</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td>100</td>
+<td><a href="#Lane3">Lane 3</a></td>
+</tr>
+<tr>
+<td>4</td>
+<td>unknown</td>
+<td>elegans170</td>
+<td>ELAND</td>
+<td>26</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td>100</td>
+<td><a href="#Lane4">Lane 4</a></td>
+</tr>
+<tr>
+<td>5</td>
+<td>unknown</td>
+<td>elegans170</td>
+<td>ELAND</td>
+<td>26</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td>100</td>
+<td><a href="#Lane5">Lane 5</a></td>
+</tr>
+<tr>
+<td>6</td>
+<td>unknown</td>
+<td>elegans170</td>
+<td>ELAND</td>
+<td>26</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td>100</td>
+<td><a href="#Lane6">Lane 6</a></td>
+</tr>
+<tr>
+<td>7</td>
+<td>unknown</td>
+<td>elegans170</td>
+<td>ELAND</td>
+<td>26</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td>100</td>
+<td><a href="#Lane7">Lane 7</a></td>
+</tr>
+<tr>
+<td>8</td>
+<td>unknown</td>
+<td>elegans170</td>
+<td>ELAND</td>
+<td>26</td>
+<td>'((CHASTITY>=0.6))'</td>
+<td>100</td>
+<td><a href="#Lane8">Lane 8</a></td>
+</tr>
+</table>
+<h2><br></br>Lane Results Summary<br></br></h2>
+<table border="1" cellpadding="5">
+<tr>
+<td colspan="2">Lane Info</td>
+<td colspan="8">Tile Mean +/- SD for Lane</td>
+</tr>
+<tr>
+<td>Lane </td>
+<td>Lane Yield (kbases) </td>
+<td>Clusters (raw)</td>
+<td>Clusters (PF) </td>
+<td>1st Cycle Int (PF) </td>
+<td>% intensity after 20 cycles (PF) </td>
+<td>% PF Clusters </td>
+<td>% Align (PF) </td>
+<td>Alignment Score (PF) </td>
+<td> % Error Rate (PF) </td>
+</tr>
+<tr>
+<td>1</td>
+<td>158046</td>
+<td>96483 +/- 9074</td>
+<td>60787 +/- 4240</td>
+<td>329 +/- 35</td>
+<td>101.88 +/- 6.03</td>
+<td>63.21 +/- 3.29</td>
+<td>70.33 +/- 0.24</td>
+<td>9054.08 +/- 59.16</td>
+<td>0.46 +/- 0.18</td>
+</tr>
+<tr>
+<td>2</td>
+<td>156564</td>
+<td>133738 +/- 7938</td>
+<td>60217 +/- 1926</td>
+<td>444 +/- 39</td>
+<td>92.62 +/- 7.58</td>
+<td>45.20 +/- 3.31</td>
+<td>51.98 +/- 0.74</td>
+<td>6692.04 +/- 92.49</td>
+<td>0.46 +/- 0.09</td>
+</tr>
+<tr>
+<td>3</td>
+<td>185818</td>
+<td>152142 +/- 10002</td>
+<td>71468 +/- 2827</td>
+<td>366 +/- 36</td>
+<td>91.53 +/- 8.66</td>
+<td>47.19 +/- 3.80</td>
+<td>82.24 +/- 0.44</td>
+<td>10598.68 +/- 64.13</td>
+<td>0.41 +/- 0.04</td>
+</tr>
+<tr>
+<td>4</td>
+<td>34953</td>
+<td>15784 +/- 2162</td>
+<td>13443 +/- 1728</td>
+<td>328 +/- 40</td>
+<td>97.53 +/- 9.87</td>
+<td>85.29 +/- 1.91</td>
+<td>80.02 +/- 0.53</td>
+<td>10368.82 +/- 71.08</td>
+<td>0.15 +/- 0.05</td>
+</tr>
+<tr>
+<td>5</td>
+<td>167936</td>
+<td>119735 +/- 8465</td>
+<td>64590 +/- 2529</td>
+<td>417 +/- 37</td>
+<td>88.69 +/- 14.79</td>
+<td>54.10 +/- 2.59</td>
+<td>76.95 +/- 0.32</td>
+<td>9936.47 +/- 65.75</td>
+<td>0.28 +/- 0.02</td>
+</tr>
+<tr>
+<td>6</td>
+<td>173463</td>
+<td>152177 +/- 8146</td>
+<td>66716 +/- 2493</td>
+<td>372 +/- 39</td>
+<td>87.06 +/- 9.86</td>
+<td>43.98 +/- 3.12</td>
+<td>78.80 +/- 0.43</td>
+<td>10162.28 +/- 49.65</td>
+<td>0.38 +/- 0.03</td>
+</tr>
+<tr>
+<td>7</td>
+<td>149287</td>
+<td>84649 +/- 7325</td>
+<td>57418 +/- 3617</td>
+<td>295 +/- 28</td>
+<td>89.40 +/- 8.23</td>
+<td>67.97 +/- 1.82</td>
+<td>33.38 +/- 0.25</td>
+<td>4247.92 +/- 32.37</td>
+<td>1.00 +/- 0.03</td>
+</tr>
+<tr>
+<td>8</td>
+<td>106953</td>
+<td>54622 +/- 4812</td>
+<td>41136 +/- 3309</td>
+<td>284 +/- 37</td>
+<td>90.21 +/- 9.10</td>
+<td>75.39 +/- 2.27</td>
+<td>48.33 +/- 0.29</td>
+<td>6169.21 +/- 169.50</td>
+<td>0.86 +/- 1.22</td>
+</tr>
+<tr><td colspan="13">Tile mean across chip</td></tr>
+<tr>
+<td>Av.</td>
+<td></td>
+<td>101166</td>
+<td>54472</td>
+<td>354</td>
+<td>92.36</td>
+<td>60.29</td>
+<td>65.25</td>
+<td>8403.69</td>
+<td>0.50</td>
+</tr>
+</table>
+<h2><br></br>Expanded Lane Summary<br></br></h2>
+<table border="1" cellpadding="5">
+<tr>
+
+<tr><td colspan="2">Lane Info</td>
+<td colspan="2">Phasing Info</td>
+<td colspan="2">Raw Data (tile mean)</td>
+<td colspan="7">Filtered Data (tile mean)</td></tr>
+<td>Lane </td>
+<td>Clusters (tile mean) (raw)</td>
+<td>% Phasing </td>
+<td>% Prephasing </td>
+<td>% Error Rate (raw) </td>
+<td> Equiv Perfect Clusters (raw) </td>
+<td>% retained </td>
+<td>Cycle 2-4 Av Int (PF) </td>
+<td>Cycle 2-10 Av % Loss (PF) </td>
+<td>Cycle 10-20 Av % Loss (PF) </td>
+<td>% Align (PF) </td>
+<td>% Error Rate (PF) </td>
+<td> Equiv Perfect Clusters (PF) </td>
+</tr>
+<tr>
+<td>1</td>
+<td>96483</td>
+<td>0.7700</td>
+<td>0.3100</td>
+<td>1.00</td>
+<td>49676</td>
+<td>63.21</td>
+<td>317 +/- 32</td>
+<td>0.13 +/- 0.44</td>
+<td>-1.14 +/- 0.34</td>
+<td>70.33</td>
+<td>0.46</td>
+<td>41758</td>
+</tr>
+<tr>
+<td>2</td>
+<td>133738</td>
+<td>0.7700</td>
+<td>0.3100</td>
+<td>1.22</td>
+<td>40467</td>
+<td>45.20</td>
+<td>415 +/- 33</td>
+<td>0.29 +/- 0.40</td>
+<td>-0.79 +/- 0.35</td>
+<td>51.98</td>
+<td>0.46</td>
+<td>30615</td>
+</tr>
+<tr>
+<td>3</td>
+<td>152142</td>
+<td>0.7700</td>
+<td>0.3100</td>
+<td>1.30</td>
+<td>78588</td>
+<td>47.19</td>
+<td>344 +/- 26</td>
+<td>0.68 +/- 0.51</td>
+<td>-0.77 +/- 0.42</td>
+<td>82.24</td>
+<td>0.41</td>
+<td>57552</td>
+</tr>
+<tr>
+<td>4</td>
+<td>15784</td>
+<td>0.7700</td>
+<td>0.3100</td>
+<td>0.29</td>
+<td>11095</td>
+<td>85.29</td>
+<td>306 +/- 34</td>
+<td>0.20 +/- 0.69</td>
+<td>-1.28 +/- 0.66</td>
+<td>80.02</td>
+<td>0.15</td>
+<td>10671</td>
+</tr>
+<tr>
+<td>5</td>
+<td>119735</td>
+<td>0.7700</td>
+<td>0.3100</td>
+<td>0.85</td>
+<td>60335</td>
+<td>54.10</td>
+<td>380 +/- 32</td>
+<td>0.34 +/- 0.49</td>
+<td>-1.55 +/- 4.69</td>
+<td>76.95</td>
+<td>0.28</td>
+<td>49015</td>
+</tr>
+<tr>
+<td>6</td>
+<td>152177</td>
+<td>0.7700</td>
+<td>0.3100</td>
+<td>1.21</td>
+<td>70905</td>
+<td>43.98</td>
+<td>333 +/- 27</td>
+<td>0.57 +/- 0.50</td>
+<td>-0.91 +/- 0.39</td>
+<td>78.80</td>
+<td>0.38</td>
+<td>51663</td>
+</tr>
+<tr>
+<td>7</td>
+<td>84649</td>
+<td>0.7700</td>
+<td>0.3100</td>
+<td>1.38</td>
+<td>21069</td>
+<td>67.97</td>
+<td>272 +/- 20</td>
+<td>1.15 +/- 0.52</td>
+<td>-0.84 +/- 0.58</td>
+<td>33.38</td>
+<td>1.00</td>
+<td>18265</td>
+</tr>
+<tr>
+<td>8</td>
+<td>54622</td>
+<td>0.7700</td>
+<td>0.3100</td>
+<td>1.17</td>
+<td>21335</td>
+<td>75.39</td>
+<td>262 +/- 31</td>
+<td>1.10 +/- 0.59</td>
+<td>-1.01 +/- 0.47</td>
+<td>48.33</td>
+<td>0.86</td>
+<td>19104</td>
+</tr>
+</table>
+<b><br></br>IVC Plots</b>
+<p> <a href='IVC.htm' target="_blank"> IVC.htm
+ </a></p>
+<b><br></br>All Intensity Plots</b>
+<p> <a href='All.htm' target="_blank"> All.htm
+ </a></p>
+<b><br></br>Error graphs: </b>
+<p> <a href='Error.htm' target="_blank"> Error.htm
+ </a></p>
+<td><a href="#Top">Back to top</a></td>
+<a name="Lane1"><h2><br></br>Lane 1<br></br></h2></a>
+<table border="1" cellpadding="5">
+<tr>
+<td>Lane </td>
+<td>Tile </td>
+<td>Clusters (raw)</td>
+<td>Av 1st Cycle Int (PF) </td>
+<td>Av % intensity after 20 cycles (PF) </td>
+<td>% PF Clusters </td>
+<td>% Align (PF) </td>
+<td>Av Alignment Score (PF) </td>
+<td>% Error Rate (PF) </td>
+</tr>
+<tr>
+<td>1</td>
+<td>0001</td>
+<td>114972</td>
+<td>326.48</td>
+<td>94.39</td>
+<td>57.44</td>
+<td>70.2</td>
+<td>9038.6</td>
+<td>0.44</td>
+</tr>
+</table>
+<td><a href="#Top">Back to top</a></td>
+<a name="Lane2"><h2><br></br>Lane 2<br></br></h2></a>
+<table border="1" cellpadding="5">
+<tr>
+<td>Lane </td>
+<td>Tile </td>
+<td>Clusters (raw)</td>
+<td>Av 1st Cycle Int (PF) </td>
+<td>Av % intensity after 20 cycles (PF) </td>
+<td>% PF Clusters </td>
+<td>% Align (PF) </td>
+<td>Av Alignment Score (PF) </td>
+<td>% Error Rate (PF) </td>
+</tr>
+<tr>
+<td>2</td>
+<td>0001</td>
+<td>147793</td>
+<td>448.12</td>
+<td>83.68</td>
+<td>38.57</td>
+<td>53.7</td>
+<td>6905.4</td>
+<td>0.54</td>
+</tr>
+</table>
+<td><a href="#Top">Back to top</a></td>
+<a name="Lane3"><h2><br></br>Lane 3<br></br></h2></a>
+<table border="1" cellpadding="5">
+<tr>
+<td>Lane </td>
+<td>Tile </td>
+<td>Clusters (raw)</td>
+<td>Av 1st Cycle Int (PF) </td>
+<td>Av % intensity after 20 cycles (PF) </td>
+<td>% PF Clusters </td>
+<td>% Align (PF) </td>
+<td>Av Alignment Score (PF) </td>
+<td>% Error Rate (PF) </td>
+</tr>
+<tr>
+<td>3</td>
+<td>0001</td>
+<td>167904</td>
+<td>374.05</td>
+<td>86.91</td>
+<td>40.36</td>
+<td>81.3</td>
+<td>10465.0</td>
+<td>0.47</td>
+</tr>
+</table>
+<td><a href="#Top">Back to top</a></td>
+<a name="Lane4"><h2><br></br>Lane 4<br></br></h2></a>
+<table border="1" cellpadding="5">
+<tr>
+<td>Lane </td>
+<td>Tile </td>
+<td>Clusters (raw)</td>
+<td>Av 1st Cycle Int (PF) </td>
+<td>Av % intensity after 20 cycles (PF) </td>
+<td>% PF Clusters </td>
+<td>% Align (PF) </td>
+<td>Av Alignment Score (PF) </td>
+<td>% Error Rate (PF) </td>
+</tr>
+<tr>
+<td>4</td>
+<td>0001</td>
+<td>20308</td>
+<td>276.85</td>
+<td>92.87</td>
+<td>84.26</td>
+<td>80.4</td>
+<td>10413.8</td>
+<td>0.16</td>
+</tr>
+</table>
+<td><a href="#Top">Back to top</a></td>
+<a name="Lane5"><h2><br></br>Lane 5<br></br></h2></a>
+<table border="1" cellpadding="5">
+<tr>
+<td>Lane </td>
+<td>Tile </td>
+<td>Clusters (raw)</td>
+<td>Av 1st Cycle Int (PF) </td>
+<td>Av % intensity after 20 cycles (PF) </td>
+<td>% PF Clusters </td>
+<td>% Align (PF) </td>
+<td>Av Alignment Score (PF) </td>
+<td>% Error Rate (PF) </td>
+</tr>
+</table>
+<td><a href="#Top">Back to top</a></td>
+<a name="Lane6"><h2><br></br>Lane 6<br></br></h2></a>
+<table border="1" cellpadding="5">
+<tr>
+<td>Lane </td>
+<td>Tile </td>
+<td>Clusters (raw)</td>
+<td>Av 1st Cycle Int (PF) </td>
+<td>Av % intensity after 20 cycles (PF) </td>
+<td>% PF Clusters </td>
+<td>% Align (PF) </td>
+<td>Av Alignment Score (PF) </td>
+<td>% Error Rate (PF) </td>
+</tr>
+<tr>
+<td>6</td>
+<td>0001</td>
+<td>166844</td>
+<td>348.12</td>
+<td>77.59</td>
+<td>38.13</td>
+<td>79.7</td>
+<td>10264.4</td>
+<td>0.44</td>
+</tr>
+</table>
+<td><a href="#Top">Back to top</a></td>
+<a name="Lane7"><h2><br></br>Lane 7<br></br></h2></a>
+<table border="1" cellpadding="5">
+<tr>
+<td>Lane </td>
+<td>Tile </td>
+<td>Clusters (raw)</td>
+<td>Av 1st Cycle Int (PF) </td>
+<td>Av % intensity after 20 cycles (PF) </td>
+<td>% PF Clusters </td>
+<td>% Align (PF) </td>
+<td>Av Alignment Score (PF) </td>
+<td>% Error Rate (PF) </td>
+</tr>
+<tr>
+<td>7</td>
+<td>0001</td>
+<td>98913</td>
+<td>269.90</td>
+<td>86.66</td>
+<td>64.55</td>
+<td>33.2</td>
+<td>4217.5</td>
+<td>1.02</td>
+</tr>
+</table>
+<td><a href="#Top">Back to top</a></td>
+<a name="Lane8"><h2><br></br>Lane 8<br></br></h2></a>
+<table border="1" cellpadding="5">
+<tr>
+<td>Lane </td>
+<td>Tile </td>
+<td>Clusters (raw)</td>
+<td>Av 1st Cycle Int (PF) </td>
+<td>Av % intensity after 20 cycles (PF) </td>
+<td>% PF Clusters </td>
+<td>% Align (PF) </td>
+<td>Av Alignment Score (PF) </td>
+<td>% Error Rate (PF) </td>
+</tr>
+<tr>
+<td>8</td>
+<td>0001</td>
+<td>64972</td>
+<td>243.60</td>
+<td>89.40</td>
+<td>73.17</td>
+<td>48.3</td>
+<td>6182.8</td>
+<td>0.71</td>
+</tr>
+</table>
+<td><a href="#Top">Back to top</a></td>
+</body>
+</html>
+"""
+ pathname = os.path.join(gerald_dir, 'Summary.htm')
+ f = open(pathname, 'w')
+ f.write(summary_htm)
+ f.close()
+
+def make_eland_results(gerald_dir):
+ eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D.
+>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T
+>HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0
+>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T
+"""
+ for i in range(1,9):
+ pathname = os.path.join(gerald_dir,
+ 's_%d_eland_result.txt' % (i,))
+ f = open(pathname, 'w')
+ f.write(eland_result)
+ f.close()
+
+def make_runfolder(obj=None):
+ """
+ Make a fake runfolder; if obj is not None, attach every created directory path to it
+ """
+ # make a fake runfolder directory
+ temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_')
+
+ runfolder_dir = os.path.join(temp_dir,
+ '080102_HWI-EAS229_0010_207BTAAXX')
+ os.mkdir(runfolder_dir)
+
+ data_dir = os.path.join(runfolder_dir, 'Data')
+ os.mkdir(data_dir)
+
+ firecrest_dir = os.path.join(data_dir,
+ 'C1-33_Firecrest1.8.28_12-04-2008_diane'
+ )
+ os.mkdir(firecrest_dir)
+ matrix_dir = os.path.join(firecrest_dir, 'Matrix')
+ os.mkdir(matrix_dir)
+ make_matrix(matrix_dir)
+
+ bustard_dir = os.path.join(firecrest_dir,
+ 'Bustard1.8.28_12-04-2008_diane')
+ os.mkdir(bustard_dir)
+ make_phasing_params(bustard_dir)
+
+ gerald_dir = os.path.join(bustard_dir,
+ 'GERALD_12-04-2008_diane')
+ os.mkdir(gerald_dir)
+ make_gerald_config(gerald_dir)
+ make_summary_htm(gerald_dir)
+ make_eland_results(gerald_dir)
+
+ if obj is not None:
+ obj.temp_dir = temp_dir
+ obj.runfolder_dir = runfolder_dir
+ obj.data_dir = data_dir
+ obj.firecrest_dir = firecrest_dir
+ obj.matrix_dir = matrix_dir
+ obj.bustard_dir = bustard_dir
+ obj.gerald_dir = gerald_dir
+
+
+class RunfolderTests(unittest.TestCase):
+ """
+ Test components of the runfolder processing code
+ which includes firecrest, bustard, and gerald
+ """
+ def setUp(self):
+ # attaches all the directories to the object passed in
+ make_runfolder(self)
+
+ def tearDown(self):
+ shutil.rmtree(self.temp_dir)
+
+ def test_firecrest(self):
+ """
+ Construct a firecrest object
+ """
+ f = firecrest.firecrest(self.firecrest_dir)
+ self.failUnlessEqual(f.version, '1.8.28')
+ self.failUnlessEqual(f.start, 1)
+ self.failUnlessEqual(f.stop, 33)
+ self.failUnlessEqual(f.user, 'diane')
+ self.failUnlessEqual(f.date, date(2008,4,12))
+
+ xml = f.get_elements()
+ # just make sure that element tree can serialize the tree
+ xml_str = ElementTree.tostring(xml)
+
+ f2 = firecrest.Firecrest(xml=xml)
+ self.failUnlessEqual(f.version, f2.version)
+ self.failUnlessEqual(f.start, f2.start)
+ self.failUnlessEqual(f.stop, f2.stop)
+ self.failUnlessEqual(f.user, f2.user)
+ self.failUnlessEqual(f.date, f2.date)
+
+ def test_bustard(self):
+ """
+ construct a bustard object
+ """
+ b = bustard.bustard(self.bustard_dir)
+ self.failUnlessEqual(b.version, '1.8.28')
+ self.failUnlessEqual(b.date, date(2008,4,12))
+ self.failUnlessEqual(b.user, 'diane')
+ self.failUnlessEqual(len(b.phasing), 8)
+ self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099)
+
+ xml = b.get_elements()
+ b2 = bustard.Bustard(xml=xml)
+ self.failUnlessEqual(b.version, b2.version)
+ self.failUnlessEqual(b.date, b2.date )
+ self.failUnlessEqual(b.user, b2.user)
+ self.failUnlessEqual(len(b.phasing), len(b2.phasing))
+ for key in b.phasing.keys():
+ self.failUnlessEqual(b.phasing[key].lane,
+ b2.phasing[key].lane)
+ self.failUnlessEqual(b.phasing[key].phasing,
+ b2.phasing[key].phasing)
+ self.failUnlessEqual(b.phasing[key].prephasing,
+ b2.phasing[key].prephasing)
+
+ def test_gerald(self):
+ # need to update gerald and make tests for it
+ g = gerald.gerald(self.gerald_dir)
+
+ self.failUnlessEqual(g.version,
+ '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp')
+ self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30))
+ self.failUnlessEqual(len(g.lanes), len(g.lanes.keys()))
+ self.failUnlessEqual(len(g.lanes), len(g.lanes.items()))
+
+
+ # list of genomes, matches what was defined up in
+ # make_gerald_config.
+ # the leading None offsets the genomes list so it is indexed by lane
+ # number (1..8) instead of Python's default 0..7
+ genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2',
+ '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ]
+
+ # test lane specific parameters from gerald config file
+ for i in range(1,9):
+ cur_lane = g.lanes[str(i)]
+ self.failUnlessEqual(cur_lane.analysis, 'eland')
+ self.failUnlessEqual(cur_lane.eland_genome, genomes[i])
+ self.failUnlessEqual(cur_lane.read_length, '32')
+ self.failUnlessEqual(cur_lane.use_bases, 'Y'*32)
+
+ # test data extracted from summary file
+ clusters = [None,
+ (96483, 9074), (133738, 7938),
+ (152142, 10002), (15784, 2162),
+ (119735, 8465), (152177, 8146),
+ (84649, 7325), (54622, 4812),]
+
+ for i in range(1,9):
+ summary_lane = g.summary[str(i)]
+ self.failUnlessEqual(summary_lane.cluster, clusters[i])
+ self.failUnlessEqual(summary_lane.lane, str(i))
+
+ xml = g.get_elements()
+ # just make sure that element tree can serialize the tree
+ xml_str = ElementTree.tostring(xml)
+ g2 = gerald.Gerald(xml=xml)
+
+ # do it all again after extracting from the xml file
+ self.failUnlessEqual(g.version, g2.version)
+ self.failUnlessEqual(g.date, g2.date)
+ self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys()))
+ self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items()))
+
+ # test lane specific parameters from gerald config file
+ for i in range(1,9):
+ g_lane = g.lanes[str(i)]
+ g2_lane = g2.lanes[str(i)]
+ self.failUnlessEqual(g_lane.analysis, g2_lane.analysis)
+ self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome)
+ self.failUnlessEqual(g_lane.read_length, g2_lane.read_length)
+ self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases)
+
+ # test (some) summary elements
+ for i in range(1,9):
+ g_summary = g.summary[str(i)]
+ g2_summary = g2.summary[str(i)]
+ self.failUnlessEqual(g_summary.cluster, g2_summary.cluster)
+ self.failUnlessEqual(g_summary.lane, g2_summary.lane)
+
+ g_eland = g.eland_results
+ g2_eland = g2.eland_results
+ for lane in g_eland.keys():
+ self.failUnlessEqual(g_eland[lane].reads,
+ g2_eland[lane].reads)
+ self.failUnlessEqual(len(g_eland[lane].mapped_reads),
+ len(g2_eland[lane].mapped_reads))
+ for k in g_eland[lane].mapped_reads.keys():
+ self.failUnlessEqual(g_eland[lane].mapped_reads[k],
+ g2_eland[lane].mapped_reads[k])
+
+ self.failUnlessEqual(len(g_eland[lane].match_codes),
+ len(g2_eland[lane].match_codes))
+ for k in g_eland[lane].match_codes.keys():
+ self.failUnlessEqual(g_eland[lane].match_codes[k],
+ g2_eland[lane].match_codes[k])
+
+
+ def test_eland(self):
+ dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa',
+ 'chr2L.fa': 'dm3/chr2L.fa',
+ 'Lambda.fa': 'Lambda.fa'}
+ genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map,
+ '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map }
+ eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps)
+
+ for i in range(1,9):
+ lane = eland[str(i)]
+ self.failUnlessEqual(lane.reads, 4)
+ self.failUnlessEqual(lane.sample_name, "s")
+ self.failUnlessEqual(lane.lane_id, unicode(i))
+ self.failUnlessEqual(len(lane.mapped_reads), 3)
+ self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1)
+ self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1)
+ self.failUnlessEqual(lane.match_codes['U1'], 2)
+ self.failUnlessEqual(lane.match_codes['NM'], 1)
+
+ xml = eland.get_elements()
+ # just make sure that element tree can serialize the tree
+ xml_str = ElementTree.tostring(xml)
+ e2 = gerald.ELAND(xml=xml)
+
+ for i in range(1,9):
+ l1 = eland[str(i)]
+ l2 = e2[str(i)]
+ self.failUnlessEqual(l1.reads, l2.reads)
+ self.failUnlessEqual(l1.sample_name, l2.sample_name)
+ self.failUnlessEqual(l1.lane_id, l2.lane_id)
+ self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads))
+ self.failUnlessEqual(len(l1.mapped_reads), 3)
+ for k in l1.mapped_reads.keys():
+ self.failUnlessEqual(l1.mapped_reads[k],
+ l2.mapped_reads[k])
+
+ self.failUnlessEqual(len(l1.match_codes), 9)
+ self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes))
+ for k in l1.match_codes.keys():
+ self.failUnlessEqual(l1.match_codes[k],
+ l2.match_codes[k])
+
+ def test_runfolder(self):
+ runs = runfolder.get_runs(self.runfolder_dir)
+
+ # do we get the flowcell id from the filename?
+ self.failUnlessEqual(len(runs), 1)
+ self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml')
+
+ # do we get the flowcell id from the FlowcellId.xml file?
+ make_flowcell_id(self.runfolder_dir, '207BTAAXY')
+ runs = runfolder.get_runs(self.runfolder_dir)
+ self.failUnlessEqual(len(runs), 1)
+ self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml')
+
+ r1 = runs[0]
+ xml = r1.get_elements()
+ xml_str = ElementTree.tostring(xml)
+
+ r2 = runfolder.PipelineRun(xml=xml)
+ self.failUnlessEqual(r1.name, r2.name)
+ self.failIfEqual(r2.firecrest, None)
+ self.failIfEqual(r2.bustard, None)
+ self.failIfEqual(r2.gerald, None)
+
+
+def suite():
+ return unittest.makeSuite(RunfolderTests,'test')
+
+if __name__ == "__main__":
+ unittest.main(defaultTest="suite")
+