From: Diane Trout Date: Wed, 15 Oct 2008 18:59:34 +0000 (+0000) Subject: Rename trunk from gaworkflow to htsworkflow (and update all of the imports) X-Git-Tag: stanford.caltech-merged-database-2009-jan-15~33 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=3a5137956b1a37068d4b235a0bbb12fd3c21c1d2 Rename trunk from gaworkflow to htsworkflow (and update all of the imports) Fix the queuecommands test script to deal with the 1 sec delay hack --- diff --git a/gaworkflow/__init__.py b/gaworkflow/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/gaworkflow/automation/__init__.py b/gaworkflow/automation/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/gaworkflow/automation/copier.py b/gaworkflow/automation/copier.py deleted file mode 100644 index 2e0d3ae..0000000 --- a/gaworkflow/automation/copier.py +++ /dev/null @@ -1,245 +0,0 @@ -import ConfigParser -import copy -import logging -import logging.handlers -import os -import re -import subprocess -import sys -import time -import traceback - -from benderjab import rpc - -def runfolder_validate(fname): - """ - Return True if fname looks like a runfolder name - """ - if re.match("^[0-9]{6}_[-A-Za-z0-9_]*$", fname): - return True - else: - return False - -class rsync(object): - def __init__(self, source, dest, pwfile): - self.pwfile = os.path.expanduser(pwfile) - self.cmd = ['/usr/bin/rsync', ] - self.cmd.append('--password-file=%s' % (self.pwfile)) - self.source_base = source - self.dest_base = dest - self.processes = {} - self.exit_code = None - - def list(self): - """Get a directory listing""" - args = copy.copy(self.cmd) - args.append(self.source_base) - - logging.debug("Rsync cmd:" + " ".join(args)) - short_process = subprocess.Popen(args, stdout=subprocess.PIPE) - return self.list_filter(short_process.stdout) - - def list_filter(self, lines): - """ - parse rsync directory listing - """ - dirs_to_copy = [] - direntries = [ x[0:42].split() + [x[43:-1]] for x in lines ] - for permissions, size, filedate, filetime, filename in direntries: - if permissions[0] == 'd': - # hey its a directory, the first step to being something we want to - # copy - if re.match("[0-9]{6}", filename): - # it starts with something that looks like a 6 digit date - # aka good enough for me - dirs_to_copy.append(filename) - return dirs_to_copy - - def create_copy_process(self, dirname): - args = copy.copy(self.cmd) - # we want to copy everything - args.append('-rlt') - # from here - args.append(os.path.join(self.source_base, dirname)) - # to here - args.append(self.dest_base) - logging.debug("Rsync cmd:" + " ".join(args)) - return subprocess.Popen(args) - - def copy(self): - """ - copy any interesting looking directories over - return list of items that we started copying. - """ - # clean up any lingering non-running processes - self.poll() - - # what's available to copy? - dirs_to_copy = self.list() - - # lets start copying - started = [] - for d in dirs_to_copy: - process = self.processes.get(d, None) - - if process is None: - # we don't have a process, so make one - logging.info("rsyncing %s" % (d)) - self.processes[d] = self.create_copy_process(d) - started.append(d) - return started - - def poll(self): - """ - check currently running processes to see if they're done - - return path roots that have finished. 
- """ - for dir_key, proc_value in self.processes.items(): - retcode = proc_value.poll() - if retcode is None: - # process hasn't finished yet - pass - elif retcode == 0: - logging.info("finished rsyncing %s, exitcode %d" %( dir_key, retcode)) - del self.processes[dir_key] - else: - logging.error("rsync failed for %s, exit code %d" % (dir_key, retcode)) - - def __len__(self): - """ - Return how many active rsync processes we currently have - - Call poll first to close finished processes. - """ - return len(self.processes) - - def keys(self): - """ - Return list of current run folder names - """ - return self.processes.keys() - -class CopierBot(rpc.XmlRpcBot): - def __init__(self, section=None, configfile=None): - #if configfile is None: - # configfile = '~/.gaworkflow' - - super(CopierBot, self).__init__(section, configfile) - - # options for rsync command - self.cfg['rsync_password_file'] = None - self.cfg['rsync_source'] = None - self.cfg['rsync_destination'] = None - - # options for reporting we're done - self.cfg['notify_users'] = None - self.cfg['notify_runner'] = None - - self.pending = [] - self.rsync = None - self.notify_users = None - self.notify_runner = None - - self.register_function(self.startCopy) - self.register_function(self.sequencingFinished) - self.eventTasks.append(self.update) - - def read_config(self, section=None, configfile=None): - """ - read the config file - """ - super(CopierBot, self).read_config(section, configfile) - - password = self._check_required_option('rsync_password_file') - source = self._check_required_option('rsync_source') - destination = self._check_required_option('rsync_destination') - self.rsync = rsync(source, destination, password) - - self.notify_users = self._parse_user_list(self.cfg['notify_users']) - try: - self.notify_runner = \ - self._parse_user_list(self.cfg['notify_runner'], - require_resource=True) - except bot.JIDMissingResource: - msg = 'need a full jabber ID + resource for xml-rpc destinations' - logging.FATAL(msg) - raise bot.JIDMissingResource(msg) - - def startCopy(self, *args): - """ - start our copy - """ - logging.info("starting copy scan") - started = self.rsync.copy() - logging.info("copying:" + " ".join(started)+".") - return started - - def sequencingFinished(self, runDir, *args): - """ - The run was finished, if we're done copying, pass the message on - """ - # close any open processes - self.rsync.poll() - - # see if we're still copying - if runfolder_validate(runDir): - logging.info("recevied sequencing finshed for %s" % (runDir)) - self.pending.append(runDir) - self.startCopy() - return "PENDING" - else: - errmsg = "received bad runfolder name (%s)" % (runDir) - logging.warning(errmsg) - # maybe I should use a different error message - raise RuntimeError(errmsg) - - def reportSequencingFinished(self, runDir): - """ - Send the sequencingFinished message to the interested parties - """ - if self.notify_users is not None: - for u in self.notify_users: - self.send(u, 'Sequencing run %s finished' % (runDir)) - if self.notify_runner is not None: - for r in self.notify_runner: - self.rpc_send(r, (runDir,), 'sequencingFinished') - logging.info("forwarding sequencingFinshed message for %s" % (runDir)) - - def update(self, *args): - """ - Update our current status. - Report if we've finished copying files. 
- """ - self.rsync.poll() - for p in self.pending: - if p not in self.rsync.keys(): - self.reportSequencingFinished(p) - self.pending.remove(p) - - def _parser(self, msg, who): - """ - Parse xmpp chat messages - """ - help = u"I can [copy], or report current [status]" - if re.match(u"help", msg): - reply = help - elif re.match("copy", msg): - started = self.startCopy() - reply = u"started copying " + ", ".join(started) - elif re.match(u"status", msg): - msg = [u"Currently %d rsync processes are running." % (len(self.rsync))] - for d in self.rsync.keys(): - msg.append(u" " + d) - reply = os.linesep.join(msg) - else: - reply = u"I didn't understand '%s'" % (unicode(msg)) - return reply - -def main(args=None): - bot = CopierBot() - bot.main(args) - -if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) - diff --git a/gaworkflow/automation/runner.py b/gaworkflow/automation/runner.py deleted file mode 100644 index a5f6843..0000000 --- a/gaworkflow/automation/runner.py +++ /dev/null @@ -1,222 +0,0 @@ -#!/usr/bin/env python -from glob import glob -import logging -import os -import re -import sys -import time -import threading - -from benderjab import rpc - -from gaworkflow.pipeline.configure_run import * - -#s_fc = re.compile('FC[0-9]+') -s_fc = re.compile('_[0-9a-zA-Z]*$') - - -def _get_flowcell_from_rundir(run_dir): - """ - Returns flowcell string based on run_dir. - Returns None and logs error if flowcell can't be found. - """ - junk, dirname = os.path.split(run_dir) - mo = s_fc.search(dirname) - if not mo: - logging.error('RunDir 2 FlowCell error: %s' % (run_dir)) - return None - - return dirname[mo.start()+1:] - - - -class Runner(rpc.XmlRpcBot): - """ - Manage running pipeline jobs. - """ - def __init__(self, section=None, configfile=None): - #if configfile is None: - # self.configfile = "~/.gaworkflow" - super(Runner, self).__init__(section, configfile) - - self.cfg['notify_users'] = None - self.cfg['genome_dir'] = None - self.cfg['base_analysis_dir'] = None - - self.cfg['notify_users'] = None - self.cfg['notify_postanalysis'] = None - - self.conf_info_dict = {} - - self.register_function(self.sequencingFinished) - #self.eventTasks.append(self.update) - - - def read_config(self, section=None, configfile=None): - super(Runner, self).read_config(section, configfile) - - self.genome_dir = self._check_required_option('genome_dir') - self.base_analysis_dir = self._check_required_option('base_analysis_dir') - - self.notify_users = self._parse_user_list(self.cfg['notify_users']) - #FIXME: process notify_postpipeline cfg - - - def _parser(self, msg, who): - """ - Parse xmpp chat messages - """ - help = u"I can send [start] a run, or report [status]" - if re.match(u"help", msg): - reply = help - elif re.match("status", msg): - words = msg.split() - if len(words) == 2: - reply = self.getStatusReport(words[1]) - else: - reply = u"Status available for: %s" \ - % (', '.join([k for k in self.conf_info_dict.keys()])) - elif re.match(u"start", msg): - words = msg.split() - if len(words) == 2: - self.sequencingFinished(words[1]) - reply = u"starting run for %s" % (words[1]) - else: - reply = u"need runfolder name" - else: - reply = u"I didn't understand '%s'" %(msg) - - logging.debug("reply: " + str(reply)) - return reply - - - def getStatusReport(self, fc_num): - """ - Returns text status report for flow cell number - """ - if fc_num not in self.conf_info_dict: - return "No record of a %s run." 
% (fc_num) - - status = self.conf_info_dict[fc_num].status - - if status is None: - return "No status information for %s yet." \ - " Probably still in configure step. Try again later." % (fc_num) - - output = status.statusReport() - - return '\n'.join(output) - - - def sequencingFinished(self, run_dir): - """ - Sequenceing (and copying) is finished, time to start pipeline - """ - logging.debug("received sequencing finished message") - - # Setup config info object - ci = ConfigInfo() - ci.base_analysis_dir = self.base_analysis_dir - ci.analysis_dir = os.path.join(self.base_analysis_dir, run_dir) - - # get flowcell from run_dir name - flowcell = _get_flowcell_from_rundir(run_dir) - - # Store ci object in dictionary - self.conf_info_dict[flowcell] = ci - - - # Launch the job in it's own thread and turn. - self.launchJob(run_dir, flowcell, ci) - return "started" - - - def pipelineFinished(self, run_dir): - # need to strip off self.watch_dir from rundir I suspect. - logging.info("pipeline finished in" + str(run_dir)) - #pattern = self.watch_dir - #if pattern[-1] != os.path.sep: - # pattern += os.path.sep - #stripped_run_dir = re.sub(pattern, "", run_dir) - #logging.debug("stripped to " + stripped_run_dir) - - # Notify each user that the run has finished. - if self.notify_users is not None: - for u in self.notify_users: - self.send(u, 'Pipeline run %s finished' % (run_dir)) - - #if self.notify_runner is not None: - # for r in self.notify_runner: - # self.rpc_send(r, (stripped_run_dir,), 'sequencingFinished') - - def reportMsg(self, msg): - - if self.notify_users is not None: - for u in self.notify_users: - self.send(u, msg) - - - def _runner(self, run_dir, flowcell, conf_info): - - # retrieve config step - cfg_filepath = os.path.join(conf_info.analysis_dir, - 'config32auto.txt') - status_retrieve_cfg = retrieve_config(conf_info, - flowcell, - cfg_filepath, - self.genome_dir) - if status_retrieve_cfg: - logging.info("Runner: Retrieve config: success") - self.reportMsg("Retrieve config (%s): success" % (run_dir)) - else: - logging.error("Runner: Retrieve config: failed") - self.reportMsg("Retrieve config (%s): FAILED" % (run_dir)) - - - # configure step - if status_retrieve_cfg: - status = configure(conf_info) - if status: - logging.info("Runner: Configure: success") - self.reportMsg("Configure (%s): success" % (run_dir)) - self.reportMsg( - os.linesep.join(glob(os.path.join(run_dir,'Data','C*'))) - ) - else: - logging.error("Runner: Configure: failed") - self.reportMsg("Configure (%s): FAILED" % (run_dir)) - - #if successful, continue - if status: - # Setup status cmdline status monitor - #startCmdLineStatusMonitor(ci) - - # running step - print 'Running pipeline now!' 
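# [Editor's sketch, not part of this commit: run_pipeline() below appears to
#  block until the GA pipeline exits, which is why launchJob() hands _runner()
#  to a daemon thread rather than running it inside the XML-RPC handler.]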
- run_status = run_pipeline(conf_info) - if run_status is True: - logging.info('Runner: Pipeline: success') - self.reportMsg("Pipeline run (%s): Finished" % (run_dir,)) - else: - logging.info('Runner: Pipeline: failed') - self.reportMsg("Pipeline run (%s): FAILED" % (run_dir)) - - - def launchJob(self, run_dir, flowcell, conf_info): - """ - Starts up a thread for running the pipeline - """ - t = threading.Thread(target=self._runner, - args=[run_dir, flowcell, conf_info]) - t.setDaemon(True) - t.start() - - - -def main(args=None): - bot = Runner() - return bot.main(args) - -if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) - diff --git a/gaworkflow/automation/spoolwatcher.py b/gaworkflow/automation/spoolwatcher.py deleted file mode 100644 index 56ad42f..0000000 --- a/gaworkflow/automation/spoolwatcher.py +++ /dev/null @@ -1,229 +0,0 @@ -#!/usr/bin/env python -import logging -import os -import re -import sys -import time -#import glob - -from gaworkflow.util import mount - -# this uses pyinotify -import pyinotify -from pyinotify import EventsCodes - -from benderjab import rpc - - -class WatcherEvents(object): - # two events need to be tracked - # one to send startCopy - # one to send OMG its broken - # OMG its broken needs to stop when we've seen enough - # cycles - # this should be per runfolder. - # read the xml files - def __init__(self): - pass - - -class Handler(pyinotify.ProcessEvent): - def __init__(self, watchmanager, bot): - self.last_event_time = None - self.watchmanager = watchmanager - self.bot = bot - - def process_IN_CREATE(self, event): - self.last_event_time = time.time() - msg = "Create: %s" % os.path.join(event.path, event.name) - if event.name.lower() == "run.completed": - try: - self.bot.sequencingFinished(event.path) - except IOError, e: - logging.error("Couldn't send sequencingFinished") - logging.debug(msg) - - def process_IN_DELETE(self, event): - logging.debug("Remove: %s" % os.path.join(event.path, event.name)) - - def process_IN_UNMOUNT(self, event): - pathname = os.path.join(event.path, event.name) - logging.debug("IN_UNMOUNT: %s" % (pathname,)) - self.bot.unmount_watch() - -class SpoolWatcher(rpc.XmlRpcBot): - """ - Watch a directory and send a message when another process is done writing. - - This monitors a directory tree using inotify (linux specific) and - after some files having been written will send a message after - seconds of no file writing. 
- - (Basically when the solexa machine finishes dumping a round of data - this'll hopefully send out a message saying hey look theres data available - - """ - # these params need to be in the config file - # I wonder where I should put the documentation - #:Parameters: - # `watchdir` - which directory tree to monitor for modifications - # `profile` - specify which .gaworkflow profile to use - # `write_timeout` - how many seconds to wait for writes to finish to - # the spool - # `notify_timeout` - how often to timeout from notify - - def __init__(self, section=None, configfile=None): - #if configfile is None: - # self.configfile = "~/.gaworkflow" - super(SpoolWatcher, self).__init__(section, configfile) - - self.cfg['watchdir'] = None - self.cfg['write_timeout'] = 10 - self.cfg['notify_users'] = None - self.cfg['notify_runner'] = None - - self.notify_timeout = 0.001 - self.wm = pyinotify.WatchManager() - self.handler = Handler(self.wm, self) - self.notifier = pyinotify.Notifier(self.wm, self.handler) - self.wdd = None - self.mount_point = None - self.mounted = True - - self.notify_users = None - self.notify_runner = None - - self.eventTasks.append(self.process_notify) - - def read_config(self, section=None, configfile=None): - super(SpoolWatcher, self).read_config(section, configfile) - - self.watch_dir = self._check_required_option('watchdir') - self.write_timeout = int(self.cfg['write_timeout']) - - self.notify_users = self._parse_user_list(self.cfg['notify_users']) - try: - self.notify_runner = \ - self._parse_user_list(self.cfg['notify_runner'], - require_resource=True) - except bot.JIDMissingResource: - msg = 'need a full jabber ID + resource for xml-rpc destinations' - logging.FATAL(msg) - raise bot.JIDMissingResource(msg) - - def add_watch(self, watchdir=None): - """ - start watching watchdir or self.watch_dir - we're currently limited to watching one directory tree. - """ - # the one tree limit is mostly because self.wdd is a single item - # but managing it as a list might be a bit more annoying - if watchdir is None: - watchdir = self.watch_dir - logging.info("Watching:"+str(watchdir)) - - self.mount_point = mount.find_mount_point_for(watchdir) - - mask = EventsCodes.IN_CREATE | EventsCodes.IN_UNMOUNT - # rec traverses the tree and adds all the directories that are there - # at the start. - # auto_add will add in new directories as they are created - self.wdd = self.wm.add_watch(watchdir, mask, rec=True, auto_add=True) - - def unmount_watch(self): - if self.wdd is not None: - self.wm.rm_watch(self.wdd.values()) - self.wdd = None - self.mounted = False - - def process_notify(self, *args): - # process the queue of events as explained above - self.notifier.process_events() - #check events waits timeout - if self.notifier.check_events(self.notify_timeout): - # read notified events and enqeue them - self.notifier.read_events() - # should we do something? - # has something happened? - last_event_time = self.handler.last_event_time - if last_event_time is not None: - time_delta = time.time() - last_event_time - if time_delta > self.write_timeout: - self.startCopy() - self.handler.last_event_time = None - # handle unmounted filesystems - if not self.mounted: - if mount.is_mounted(self.mount_point): - # we've been remounted. Huzzah! 
- # restart the watch - self.add_watch() - self.mounted = True - logging.info( - "%s was remounted, restarting watch" % \ - (self.mount_point) - ) - - def _parser(self, msg, who): - """ - Parse xmpp chat messages - """ - help = u"I can send [copy] message, or squencer [finished]" - if re.match(u"help", msg): - reply = help - elif re.match("copy", msg): - self.startCopy() - reply = u"sent copy message" - elif re.match(u"finished", msg): - words = msg.split() - if len(words) == 2: - self.sequencingFinished(words[1]) - reply = u"sending sequencing finished for %s" % (words[1]) - else: - reply = u"need runfolder name" - else: - reply = u"I didn't understand '%s'" %(msg) - return reply - - def start(self, daemonize): - """ - Start application - """ - self.add_watch() - super(SpoolWatcher, self).start(daemonize) - - def stop(self): - """ - shutdown application - """ - # destroy the inotify's instance on this interrupt (stop monitoring) - self.notifier.stop() - super(SpoolWatcher, self).stop() - - def startCopy(self): - logging.debug("writes seem to have stopped") - if self.notify_runner is not None: - for r in self.notify_runner: - self.rpc_send(r, tuple(), 'startCopy') - - def sequencingFinished(self, run_dir): - # need to strip off self.watch_dir from rundir I suspect. - logging.info("run.completed in " + str(run_dir)) - pattern = self.watch_dir - if pattern[-1] != os.path.sep: - pattern += os.path.sep - stripped_run_dir = re.sub(pattern, "", run_dir) - logging.debug("stripped to " + stripped_run_dir) - if self.notify_users is not None: - for u in self.notify_users: - self.send(u, 'Sequencing run %s finished' % (stripped_run_dir)) - if self.notify_runner is not None: - for r in self.notify_runner: - self.rpc_send(r, (stripped_run_dir,), 'sequencingFinished') - -def main(args=None): - bot = SpoolWatcher() - return bot.main(args) - -if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) - diff --git a/gaworkflow/frontend/__init__.py b/gaworkflow/frontend/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/gaworkflow/frontend/eland_config/__init__.py b/gaworkflow/frontend/eland_config/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/gaworkflow/frontend/eland_config/forms.py b/gaworkflow/frontend/eland_config/forms.py deleted file mode 100644 index 2488359..0000000 --- a/gaworkflow/frontend/eland_config/forms.py +++ /dev/null @@ -1,163 +0,0 @@ -from django import newforms as forms -from django.newforms.util import ErrorList - - -SPECIES_LIST = [#('--choose--', '--Choose--'), - ('hg18', 'Homo sapiens (Hg18)'), - ('Mm8', 'Mus musculus (Mm8)'), - ('arabv6', 'Arabadopsis Thaliana v6'), - ('other', 'Other species (Include in description)')] - - -class DivErrorList(ErrorList): - def __unicode__(self): - return self.as_divs() - - def as_divs(self): - if not self: return u'' - return u'
<div class="errorlist">%s</div>' % (''.join([u'<div class="error">%s</div>
' % e for e in self])) - - - -class ConfigForm(forms.Form): - - flow_cell_number = forms.CharField(min_length=2) - run_date = forms.DateTimeField() - advanced_run = forms.BooleanField(required=False) - read_length = forms.IntegerField(min_value=1, initial=32) - #eland_repeat = forms.BooleanField() - - #needs a for loop or something to allow for n configurations - #analysis_type = forms.ChoiceField(choices=[('eland','eland')]) - lane1_species = forms.ChoiceField(choices=SPECIES_LIST) - lane1_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) - - lane2_species = forms.ChoiceField(choices=SPECIES_LIST) - lane2_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) - - lane3_species = forms.ChoiceField(choices=SPECIES_LIST) - lane3_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) - - lane4_species = forms.ChoiceField(choices=SPECIES_LIST) - lane4_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) - - lane5_species = forms.ChoiceField(choices=SPECIES_LIST) - lane5_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) - - lane6_species = forms.ChoiceField(choices=SPECIES_LIST) - lane6_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) - - lane7_species = forms.ChoiceField(choices=SPECIES_LIST) - lane7_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) - - lane8_species = forms.ChoiceField(choices=SPECIES_LIST) - lane8_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) - - notes = forms.CharField(widget=forms.Textarea(attrs={'cols':'70'}), required=False) - - #lane_specific_read_length = forms.IntegerField(min_value=1) - - #eland_genome_lanes = forms.MultipleChoiceField(choices=[('lane1','1'), - # ('lane2','2'), - # ('lane3','3'), - # ('lane4','4'), - # ('lane5','5'), - # ('lane6','6'), - # ('lane7','7'), - # ('lane8','8') ]) - - #eland_genome = forms.ChoiceField(choices=) - - #use_bases_lanes = forms.MultipleChoiceField(choices=[('lane1','1'), - # ('lane2','2'), - # ('lane3','3'), - # ('lane4','4'), - # ('lane5','5'), - # ('lane6','6'), - # ('lane7','7'), - # ('lane8','8') ]) - - #use_bases_mask = forms.CharField() - - #sequence_format = forms.ChoiceField(choices=[('scarf', 'scarf')]) - - - - #subject = forms.CharField(max_length=100) - #message = forms.CharField() - #sender = forms.EmailField() - #cc_myself = forms.BooleanField() - - def as_custom(self): - """ - Displays customized html output - """ - html = [] - - fcn = self['flow_cell_number'] - - html.append(fcn.label_tag() + ': ' + str(fcn) + str(fcn.errors) + '
<br/>')
-        run_date = self['run_date']
-        html.append(run_date.label_tag() + ': ' + str(run_date) + str(run_date.errors) + '<br/>')
-        arun = self['advanced_run']
-        html.append(arun.label_tag() + ': ' + str(arun) + str(arun.errors) + '<br/>')
-        rl = self['read_length']
-        html.append(rl.label_tag() + ': ' + str(rl) + str(rl.errors) + '<br/><br/>')
-
-        html.append('<table>')
-        html.append('<tr><td>%s</td><td>%s</td><td>%s</td></tr>' \
-                    % ('Lane', 'Species', 'Description'))
-
-        l1s = self['lane1_species']
-        l1d = self['lane1_description']
-        html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
-                    % ('1', str(l1s), str(l1s.errors), str(l1d), str(l1d.errors)))
-
-        l2s = self['lane2_species']
-        l2d = self['lane2_description']
-        html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
-                    % ('2', str(l2s), str(l2s.errors), str(l2d), str(l2d.errors)))
-
-        l3s = self['lane3_species']
-        l3d = self['lane3_description']
-        html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
-                    % ('3', str(l3s), str(l3s.errors), str(l3d), str(l3d.errors)))
-
-        l4s = self['lane4_species']
-        l4d = self['lane4_description']
-        html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
-                    % ('4', str(l4s), str(l4s.errors), str(l4d), str(l4d.errors)))
-
-        l5s = self['lane5_species']
-        l5d = self['lane5_description']
-        html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
-                    % ('5', str(l5s), str(l5s.errors), str(l5d), str(l5d.errors)))
-
-        l6s = self['lane6_species']
-        l6d = self['lane6_description']
-        html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
-                    % ('6', str(l6s), str(l6s.errors), str(l6d), str(l6d.errors)))
-
-        l7s = self['lane7_species']
-        l7d = self['lane7_description']
-        html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
-                    % ('7', str(l7s), str(l7s.errors), str(l7d), str(l7d.errors)))
-
-        l8s = self['lane8_species']
-        l8d = self['lane8_description']
-        html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
-                    % ('8', str(l8s), str(l8s.errors), str(l8d), str(l8d.errors)))
-
-        html.append('</table><br/>')
-        notes = self['notes']
-        html.append('<br/><br/>Notes:<br/>')
-        html.append(' %s<br/>
' % (str(notes))) - - return '\n'.join(html) - - - \ No newline at end of file diff --git a/gaworkflow/frontend/eland_config/models.py b/gaworkflow/frontend/eland_config/models.py deleted file mode 100644 index 71a8362..0000000 --- a/gaworkflow/frontend/eland_config/models.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.db import models - -# Create your models here. diff --git a/gaworkflow/frontend/eland_config/urls.py b/gaworkflow/frontend/eland_config/urls.py deleted file mode 100644 index 2eeee70..0000000 --- a/gaworkflow/frontend/eland_config/urls.py +++ /dev/null @@ -1,10 +0,0 @@ -from django.conf.urls.defaults import * - -urlpatterns = patterns('', - # Example: - - (r'^(?P\w+)/$', 'gaworkflow.frontend.eland_config.views.config'), - (r'^$', 'gaworkflow.frontend.eland_config.views.config'), - #(r'^$', 'gaworkflow.frontend.eland_config.views.index') - -) diff --git a/gaworkflow/frontend/eland_config/views.py b/gaworkflow/frontend/eland_config/views.py deleted file mode 100644 index f23f7ae..0000000 --- a/gaworkflow/frontend/eland_config/views.py +++ /dev/null @@ -1,413 +0,0 @@ -from django.http import HttpResponse -from django.shortcuts import render_to_response -from django.core.exceptions import ObjectDoesNotExist - -from gaworkflow.frontend.eland_config import forms -from gaworkflow.frontend import settings -from gaworkflow.frontend.fctracker import models - -import os -import glob -# Create your views here. - - -def _validate_input(data): - #if data.find('..') == -1 or data.find('/') == -1 or data.find('\\') == -1: - return data.replace('..', '').replace('/', '_').replace('\\', '_') - -#def contact(request): -# if request.method == 'POST': -# form = ContactForm(request.POST) -# if form.is_valid(): -# # Do form processing here... -# return HttpResponseRedirect('/url/on_success/') -# else: -# form = ContactForm() -# return - - - -#def _saveConfigFile(form): -# """ -# Given a valid form, save eland config to file based on flowcell number. -# """ -# assert form.is_valid() -# -# clean_data = form.cleaned_data -# flowcell = clean_data['flow_cell_number'].replace('/','_').replace('..', '__') -# -# file_path = os.path.join(settings.UPLOADTO_CONFIG_FILE, flowcell) -# -# f = open(file_path, 'w') -# cfg = generateElandConfig(form) -# f.write(cfg) -# f.close() -# -# -#def _saveToDb(form): -# """ -# Save info to the database. 
-# """ -# clean_data = form.cleaned_data -# -# fc_id = clean_data['flow_cell_number'] -# -# try: -# fc = models.FlowCell.objects.get(flowcell_id=fc_id) -# except models.FlowCell.DoesNotExist: -# fc = models.FlowCell() -# -# fc.flowcell_id = fc_id -# fc.run_date = clean_data['run_date'] -# -# #LANE 1 -# fc.lane1_sample = clean_data['lane1_description'] -# species_name = clean_data['lane1_species'] -# try: -# specie = models.Specie.objects.get(scientific_name=species_name) -# except models.Specie.DoesNotExist: -# specie = models.Specie(scientific_name=species_name) -# specie.save() -# fc.lane1_species = specie -# -# #LANE 2 -# fc.lane2_sample = clean_data['lane2_description'] -# species_name = clean_data['lane2_species'] -# try: -# specie = models.Specie.objects.get(scientific_name=species_name) -# except models.Specie.DoesNotExist: -# specie = models.Specie(scientific_name=species_name) -# specie.save() -# fc.lane2_species = specie -# -# #LANE 3 -# fc.lane3_sample = clean_data['lane3_description'] -# species_name = clean_data['lane3_species'] -# try: -# specie = models.Specie.objects.get(scientific_name=species_name) -# except models.Specie.DoesNotExist: -# specie = models.Specie(scientific_name=species_name) -# specie.save() -# fc.lane3_species = specie -# -# #LANE 4 -# fc.lane4_sample = clean_data['lane4_description'] -# species_name = clean_data['lane4_species'] -# try: -# specie = models.Specie.objects.get(scientific_name=species_name) -# except models.Specie.DoesNotExist: -# specie = models.Specie(scientific_name=species_name) -# specie.save() -# fc.lane4_species = specie -# -# #LANE 5 -# fc.lane5_sample = clean_data['lane5_description'] -# species_name = clean_data['lane5_species'] -# try: -# specie = models.Specie.objects.get(scientific_name=species_name) -# except models.Specie.DoesNotExist: -# specie = models.Specie(scientific_name=species_name) -# specie.save() -# fc.lane5_species = specie -# -# #LANE 6 -# fc.lane6_sample = clean_data['lane6_description'] -# species_name = clean_data['lane6_species'] -# try: -# specie = models.Specie.objects.get(scientific_name=species_name) -# except models.Specie.DoesNotExist: -# specie = models.Specie(scientific_name=species_name) -# specie.save() -# fc.lane6_species = specie -# -# #LANE 7 -# fc.lane7_sample = clean_data['lane7_description'] -# species_name = clean_data['lane7_species'] -# try: -# specie = models.Specie.objects.get(scientific_name=species_name) -# except models.Specie.DoesNotExist: -# specie = models.Specie(scientific_name=species_name) -# specie.save() -# fc.lane7_species = specie -# -# #LANE 8 -# fc.lane8_sample = clean_data['lane8_description'] -# species_name = clean_data['lane8_species'] -# try: -# specie = models.Specie.objects.get(scientific_name=species_name) -# except models.Specie.DoesNotExist: -# specie = models.Specie(scientific_name=species_name) -# specie.save() -# fc.lane8_species = specie -# -# fc.notes = clean_data['notes'] -# -# fc.save() -# -# return fc -# -# -#def generateElandConfig(form): -# data = [] -# -# form = form.cleaned_data -# -# BASE_DIR = '/data-store01/compbio/genomes' -# -# data.append("# FLOWCELL: %s" % (form['flow_cell_number'])) -# data.append("#") -# -# notes = form['notes'].replace('\r\n', '\n').replace('\r', '\n') -# notes = notes.replace('\n', '\n# ') -# data.append("# NOTES:") -# data.append("# %s\n#" % (notes)) -# -# #Convert all newline conventions to unix style -# l1d = form['lane1_description'].replace('\r\n', '\n').replace('\r', '\n') -# l2d = 
form['lane2_description'].replace('\r\n', '\n').replace('\r', '\n') -# l3d = form['lane3_description'].replace('\r\n', '\n').replace('\r', '\n') -# l4d = form['lane4_description'].replace('\r\n', '\n').replace('\r', '\n') -# l5d = form['lane5_description'].replace('\r\n', '\n').replace('\r', '\n') -# l6d = form['lane6_description'].replace('\r\n', '\n').replace('\r', '\n') -# l7d = form['lane7_description'].replace('\r\n', '\n').replace('\r', '\n') -# l8d = form['lane8_description'].replace('\r\n', '\n').replace('\r', '\n') -# -# # Turn new lines into indented commented newlines -# l1d = l1d.replace('\n', '\n# ') -# l2d = l2d.replace('\n', '\n# ') -# l3d = l3d.replace('\n', '\n# ') -# l4d = l4d.replace('\n', '\n# ') -# l5d = l5d.replace('\n', '\n# ') -# l6d = l6d.replace('\n', '\n# ') -# l7d = l7d.replace('\n', '\n# ') -# l8d = l8d.replace('\n', '\n# ') -# -# data.append("# Lane1: %s" % (l1d)) -# data.append("# Lane2: %s" % (l2d)) -# data.append("# Lane3: %s" % (l3d)) -# data.append("# Lane4: %s" % (l4d)) -# data.append("# Lane5: %s" % (l5d)) -# data.append("# Lane6: %s" % (l6d)) -# data.append("# Lane7: %s" % (l7d)) -# data.append("# Lane8: %s" % (l8d)) -# -# #data.append("GENOME_DIR %s" % (BASE_DIR)) -# #data.append("CONTAM_DIR %s" % (BASE_DIR)) -# read_length = form['read_length'] -# data.append("READ_LENGTH %d" % (read_length)) -# #data.append("ELAND_REPEAT") -# data.append("ELAND_MULTIPLE_INSTANCES 8") -# -# #Construct genome dictionary to figure out what lanes to put -# # in the config file. -# genome_dict = {} -# l1s = form['lane1_species'] -# genome_dict.setdefault(l1s, []).append('1') -# l2s = form['lane2_species'] -# genome_dict.setdefault(l2s, []).append('2') -# l3s = form['lane3_species'] -# genome_dict.setdefault(l3s, []).append('3') -# l4s = form['lane4_species'] -# genome_dict.setdefault(l4s, []).append('4') -# l5s = form['lane5_species'] -# genome_dict.setdefault(l5s, []).append('5') -# l6s = form['lane6_species'] -# genome_dict.setdefault(l6s, []).append('6') -# l7s = form['lane7_species'] -# genome_dict.setdefault(l7s, []).append('7') -# l8s = form['lane8_species'] -# genome_dict.setdefault(l8s, []).append('8') -# -# genome_list = genome_dict.keys() -# genome_list.sort() -# -# #Loop through and create entries for each species. -# for genome in genome_list: -# lanes = ''.join(genome_dict[genome]) -# data.append('%s:ANALYSIS eland' % (lanes)) -# data.append('%s:READ_LENGTH %s' % (lanes, read_length)) -# data.append('%s:ELAND_GENOME %s' % (lanes, os.path.join(BASE_DIR, genome))) -# data.append('%s:USE_BASES %s' % (lanes, 'Y'*int(read_length))) -# -# data.append('SEQUENCE_FORMAT --scarf') -# -# return '\n'.join(data) - - -def getElandConfig(flowcell, regenerate=False): - - file_path = os.path.join(settings.UPLOADTO_CONFIG_FILE, flowcell) - - #If we are regenerating the config file, skip - # reading of existing file. If the file doesn't - # exist, try to generate it form the DB. 
- if not regenerate and os.path.isfile(file_path): - f = open(file_path, 'r') - data = f.read() - f.close() - return data - - try: - fcObj = models.FlowCell.objects.get(flowcell_id__iexact=flowcell) - except ObjectDoesNotExist: - return None - - data = [] - - #form = form.cleaned_data - - BASE_DIR = '/data-store01/compbio/genomes' - - data.append("# FLOWCELL: %s" % (fcObj.flowcell_id)) - data.append("#") - - notes = fcObj.notes.replace('\r\n', '\n').replace('\r', '\n') - notes = notes.replace('\n', '\n# ') - data.append("# NOTES:") - data.append("# %s\n#" % (notes)) - - #Convert all newline conventions to unix style - l1d = str(fcObj.lane_1_library.library_id) + '|' \ - + fcObj.lane_1_library.library_name.replace('\r\n', '\n').replace('\r', '\n') - l2d = str(fcObj.lane_2_library.library_id) + '|' \ - + fcObj.lane_2_library.library_name.replace('\r\n', '\n').replace('\r', '\n') - l3d = str(fcObj.lane_3_library.library_id) + '|' \ - + fcObj.lane_3_library.library_name.replace('\r\n', '\n').replace('\r', '\n') - l4d = str(fcObj.lane_4_library.library_id) + '|' \ - + fcObj.lane_4_library.library_name.replace('\r\n', '\n').replace('\r', '\n') - - l5d = str(fcObj.lane_5_library.library_id) + '|' \ - + fcObj.lane_5_library.library_name.replace('\r\n', '\n').replace('\r', '\n') - l6d = str(fcObj.lane_6_library.library_id) + '|' \ - + fcObj.lane_6_library.library_name.replace('\r\n', '\n').replace('\r', '\n') - l7d = str(fcObj.lane_7_library.library_id) + '|' \ - + fcObj.lane_7_library.library_name.replace('\r\n', '\n').replace('\r', '\n') - l8d = str(fcObj.lane_8_library.library_id) + '|' \ - + fcObj.lane_8_library.library_name.replace('\r\n', '\n').replace('\r', '\n') - - # Turn new lines into indented commented newlines - l1d = l1d.replace('\n', '\n# ') - l2d = l2d.replace('\n', '\n# ') - l3d = l3d.replace('\n', '\n# ') - l4d = l4d.replace('\n', '\n# ') - l5d = l5d.replace('\n', '\n# ') - l6d = l6d.replace('\n', '\n# ') - l7d = l7d.replace('\n', '\n# ') - l8d = l8d.replace('\n', '\n# ') - - data.append("# Lane1: %s" % (l1d)) - data.append("# Lane2: %s" % (l2d)) - data.append("# Lane3: %s" % (l3d)) - data.append("# Lane4: %s" % (l4d)) - data.append("# Lane5: %s" % (l5d)) - data.append("# Lane6: %s" % (l6d)) - data.append("# Lane7: %s" % (l7d)) - data.append("# Lane8: %s" % (l8d)) - - #data.append("GENOME_DIR %s" % (BASE_DIR)) - #data.append("CONTAM_DIR %s" % (BASE_DIR)) - read_length = fcObj.read_length - data.append("READ_LENGTH %d" % (read_length)) - #data.append("ELAND_REPEAT") - data.append("ELAND_MULTIPLE_INSTANCES 8") - - #Construct genome dictionary to figure out what lanes to put - # in the config file. 
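# [Illustrative sketch, not part of this commit: the grouping below collects
#  lane numbers per species with setdefault(), so lanes sharing a genome are
#  emitted as a single ELAND stanza. For example (keys hypothetical):
#      d = {}
#      d.setdefault('hg18', []).append('1')
#      d.setdefault('hg18', []).append('5')
#      ''.join(d['hg18'])  # -> '15', yielding a "15:ANALYSIS eland" line]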
- genome_dict = {} - - #l1s = form['lane1_species'] - l1s = fcObj.lane_1_library.library_species.scientific_name #+ '|' + \ - #fcObj.lane_1_library.library_species.use_genome_build - genome_dict.setdefault(l1s, []).append('1') - l2s = fcObj.lane_2_library.library_species.scientific_name #+ '|' + \ - #fcObj.lane_2_library.library_species.use_genome_build - genome_dict.setdefault(l2s, []).append('2') - l3s = fcObj.lane_3_library.library_species.scientific_name #+ '|' + \ - #fcObj.lane_3_library.library_species.use_genome_build - genome_dict.setdefault(l3s, []).append('3') - l4s = fcObj.lane_4_library.library_species.scientific_name #+ '|' + \ - #fcObj.lane_4_library.library_species.use_genome_build - genome_dict.setdefault(l4s, []).append('4') - l5s = fcObj.lane_5_library.library_species.scientific_name #+ '|' + \ - #fcObj.lane_5_library.library_species.use_genome_build - genome_dict.setdefault(l5s, []).append('5') - l6s = fcObj.lane_6_library.library_species.scientific_name #+ '|' + \ - #fcObj.lane_6_library.library_species.use_genome_build - genome_dict.setdefault(l6s, []).append('6') - l7s = fcObj.lane_7_library.library_species.scientific_name #+ '|' + \ - #fcObj.lane_7_library.library_species.use_genome_build - genome_dict.setdefault(l7s, []).append('7') - l8s = fcObj.lane_8_library.library_species.scientific_name #+ '|' + \ - #fcObj.lane_8_library.library_species.use_genome_build - genome_dict.setdefault(l8s, []).append('8') - - genome_list = genome_dict.keys() - genome_list.sort() - - #Loop through and create entries for each species. - for genome in genome_list: - lanes = ''.join(genome_dict[genome]) - data.append('%s:ANALYSIS eland' % (lanes)) - data.append('%s:READ_LENGTH %s' % (lanes, read_length)) - data.append('%s:ELAND_GENOME %s' % (lanes, '%%(%s)s' % (genome))) - data.append('%s:USE_BASES %s' % (lanes, 'Y'*int(read_length))) - - data.append('SEQUENCE_FORMAT --scarf') - - data = '\n'.join(data) - - f = open(file_path, 'w') - f.write(data) - f.close() - - return data - - - -def config(request, flowcell=None): - """ - Returns eland config file for a given flowcell number, - or returns a list of available flowcell numbers. - """ - - # Provide INDEX of available Flowcell config files. - if flowcell is None: - #Find all FC* config files and report an index html file - #fc_list = [ os.path.split(file_path)[1] for file_path in glob.glob(os.path.join(settings.UPLOADTO_CONFIG_FILE, 'FC*')) ] - fc_list = [ fc.flowcell_id for fc in models.FlowCell.objects.all() ] - - #Convert FC* list to html links - fc_html = [ '%s' % (fc_name, fc_name) for fc_name in fc_list ] - - return HttpResponse('
<br/>
'.join(fc_html)) - - #FIXME: Should validate flowcell input before using. - flowcell = _validate_input(flowcell) - cfg = getElandConfig(flowcell, regenerate=True) - - if not cfg: - return HttpResponse("Hmm, config file for %s does not seem to exist." % (flowcell)) - - - return HttpResponse(cfg, mimetype="text/plain") - - - - -#def index(request): -# """ -# Return a form for filling out information about the flowcell -# """ -# if request.method == 'POST': -# form = forms.ConfigForm(request.POST, error_class=forms.DivErrorList) -# if form.is_valid(): -# #cfg = generateElandConfig(form) -# _saveConfigFile(form) -# _saveToDb(form) -# return HttpResponse("Eland Config Saved!", mimetype="text/plain") -# else: -# return render_to_response('config_form.html', {'form': form }) -# -# else: -# fm = forms.ConfigForm(error_class=forms.DivErrorList) -# return render_to_response('config_form.html', {'form': fm }) diff --git a/gaworkflow/frontend/fctracker/__init__.py b/gaworkflow/frontend/fctracker/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/gaworkflow/frontend/fctracker/models.py b/gaworkflow/frontend/fctracker/models.py deleted file mode 100644 index 231afec..0000000 --- a/gaworkflow/frontend/fctracker/models.py +++ /dev/null @@ -1,283 +0,0 @@ -from django.db import models -from django.contrib.auth.models import User -from gaworkflow.frontend import settings - -# Create your models here. - -class Antibody(models.Model): - antigene = models.CharField(max_length=500, db_index=True) - catalog = models.CharField(max_length=50, unique=True, db_index=True) - antibodies = models.CharField(max_length=500, db_index=True) - source = models.CharField(max_length=500, blank=True, db_index=True) - biology = models.TextField(blank=True) - notes = models.TextField(blank=True) - def __str__(self): - return '%s - %s (%s)' % (self.antigene, self.antibodies, self.catalog) - class Meta: - verbose_name_plural = "antibodies" - ordering = ["antigene"] - class Admin: - list_display = ('antigene','antibodies','catalog','source','biology','notes') - list_filter = ('antibodies','source') - fields = ( - (None, { - 'fields': (('antigene','antibodies'),('catalog','source'),('biology'),('notes')) - }), - ) - -class Cellline(models.Model): - cellline_name = models.CharField(max_length=100, unique=True, db_index=True) - notes = models.TextField(blank=True) - def __str__(self): - return '%s' % (self.cellline_name) - - class Meta: - ordering = ["cellline_name"] - - class Admin: - fields = ( - (None, { - 'fields': (('cellline_name'),('notes'),) - }), - ) - -class Condition(models.Model): - condition_name = models.CharField(max_length=2000, unique=True, db_index=True) - notes = models.TextField(blank=True) - def __str__(self): - return '%s' % (self.condition_name) - - class Meta: - ordering = ["condition_name"] - - class Admin: - fields = ( - (None, { - 'fields': (('condition_name'),('notes'),) - }), - ) - -class Species(models.Model): - - scientific_name = models.CharField(max_length=256, unique=False, db_index=True, core=True) - common_name = models.CharField(max_length=256, blank=True) - use_genome_build = models.CharField(max_length=100, blank=False, null=False) - - def __str__(self): - return '%s (%s)|%s' % (self.scientific_name, self.common_name, self.use_genome_build) - - class Meta: - verbose_name_plural = "species" - ordering = ["scientific_name"] - - class Admin: - fields = ( - (None, { - 'fields': (('scientific_name', 'common_name'), ('use_genome_build')) - }), - ) - -class Lab(models.Model): - - 
name = models.CharField(max_length=100, blank=False, unique=True) - - def __str__(self): - return self.name - - class Admin: - pass - -class UserProfile(models.Model): - - # This allows you to use user.get_profile() to get this object - user = models.ForeignKey(User, unique=True) - - lab = models.ForeignKey(Lab) - #email = models.CharField(max_length=50, blank=True, null=True) - - def __str__(self): - return '%s (%s lab)' % (self.user, self.lab) - - class Meta: - #verbose_name_plural = "people" - #ordering = ["lab"] - pass - - class Admin: - #fields = ( - # (None, { - # 'fields': (('email', 'lab'), ('email')) - # }), - #) - pass - - -class Library(models.Model): - - library_id = models.CharField(max_length=30, primary_key=True, db_index=True, core=True) - library_name = models.CharField(max_length=100, unique=True, core=True) - library_species = models.ForeignKey(Species, core=True) - cell_line = models.ForeignKey(Cellline,core=True) - condition = models.ForeignKey(Condition,core=True) - antibody = models.ForeignKey(Antibody,blank=True,null=True,core=True) - - EXPERIMENT_TYPES = ( - ('INPUT_RXLCh','INPUT_RXLCh'), - ('ChIP-seq', 'ChIP-seq'), - ('Sheared', 'Sheared'), - ('RNA-seq', 'RNA-seq'), - ('Methyl-seq', 'Methyl-seq'), - ('DIP-seq', 'DIP-seq'), - ) - experiment_type = models.CharField(max_length=50, choices=EXPERIMENT_TYPES, - default='RNA-seq') - - creation_date = models.DateField(blank=True, null=True) - made_for = models.ForeignKey(User) - made_by = models.CharField(max_length=50, blank=True, default="Lorian") - - PROTOCOL_END_POINTS = ( - ('?', 'Unknown'), - ('Sample', 'Raw sample'), - ('Progress', 'In progress'), - ('1A', 'Ligation, then gel'), - ('PCR', 'Ligation, then PCR'), - ('1Ab', 'Ligation, PCR, then gel'), - ('1Aa', 'Ligation, gel, then PCR'), - ('2A', 'Ligation, PCR, gel, PCR'), - ('Done', 'Completed'), - ) - stopping_point = models.CharField(max_length=25, choices=PROTOCOL_END_POINTS, default='Done') - amplified_from_sample = models.ForeignKey('self', blank=True, null=True) - - undiluted_concentration = models.DecimalField("Undiluted concentration (ng/ul)", max_digits=5, decimal_places=2, default=0, blank=True, null=True) - successful_pM = models.DecimalField(max_digits=5, decimal_places=2, blank=True, null=True) - ten_nM_dilution = models.BooleanField() - avg_lib_size = models.IntegerField(default=225, blank=True, null=True) - notes = models.TextField(blank=True) - - def __str__(self): - return '#%s: %s' % (self.library_id, self.library_name) - - class Meta: - verbose_name_plural = "libraries" - ordering = ["-library_id"] - - class Admin: - date_hierarchy = "creation_date" - save_as = True - save_on_top = True - search_fields = ['library_name', 'library_id'] - list_display = ('library_id', 'library_name', 'made_for', 'creation_date', 'stopping_point') - list_display_links = ('library_id', 'library_name') - list_filter = ('stopping_point', 'library_species', 'made_for', 'made_by', 'experiment_type') - fields = ( - (None, { - 'fields': (('library_id', 'library_name'), ('library_species', 'experiment_type'),) - }), - ('Creation Information:', { - 'fields' : (('made_for', 'made_by', 'creation_date'), ('stopping_point', 'amplified_from_sample'), ('undiluted_concentration', 'library_size'), 'notes',) - }), - ('Run Information:', { - 'fields' : (('ten_nM_dilution','successful_pM'),) - }), - ) - -class FlowCell(models.Model): - - flowcell_id = models.CharField(max_length=20, unique=True, db_index=True, core=True) - run_date = models.DateTimeField(core=True) - advanced_run = 
models.BooleanField(default=False) - read_length = models.IntegerField(default=32) - - - FLOWCELL_STATUSES = ( - ('No', 'Not run'), - ('F', 'Failed'), - ('Del', 'Data deleted'), - ('A', 'Data available'), - ('In', 'In progress'), - ) - flowcell_status = models.CharField(max_length=10, choices=FLOWCELL_STATUSES) - - lane_1_library = models.ForeignKey(Library, related_name="lane_1_library") - lane_2_library = models.ForeignKey(Library, related_name="lane_2_library") - lane_3_library = models.ForeignKey(Library, related_name="lane_3_library") - lane_4_library = models.ForeignKey(Library, related_name="lane_4_library") - lane_5_library = models.ForeignKey(Library, related_name="lane_5_library") - lane_6_library = models.ForeignKey(Library, related_name="lane_6_library") - lane_7_library = models.ForeignKey(Library, related_name="lane_7_library") - lane_8_library = models.ForeignKey(Library, related_name="lane_8_library") - - lane_1_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) - lane_2_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) - lane_3_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) - lane_4_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) - lane_5_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) - lane_6_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) - lane_7_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) - lane_8_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) - - lane_1_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) - lane_2_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) - lane_3_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) - lane_4_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) - lane_5_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) - lane_6_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) - lane_7_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) - lane_8_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) - - kit_1000148 = models.IntegerField(blank=True, null=True) - kit_1000147 = models.IntegerField(blank=True, null=True) - kit_1000183 = models.IntegerField(blank=True, null=True) - kit_1001625 = models.IntegerField(blank=True, null=True) - - cluster_station_id = models.CharField(max_length=50, blank=True, null=True) - sequencer_id = models.CharField(max_length=50, blank=True, null=True) - - notes = models.TextField(blank=True) - - def __str__(self): - return '%s (%s)' % (self.flowcell_id, self.run_date) - - class Meta: - ordering = ["-run_date"] - - class Admin: - date_hierarchy = "run_date" - save_as = True - save_on_top = True - search_fields = ['flowcell_id', 'lane_1_library__library_id', 'lane_1_library__library_name', 'lane_2_library__library_id', 'lane_2_library__library_name', 'lane_3_library__library_id', 'lane_3_library__library_name', 'lane_4_library__library_id', 'lane_4_library__library_name', 'lane_5_library__library_id', 'lane_5_library__library_name', 'lane_6_library__library_id', 'lane_6_library__library_name', 'lane_7_library__library_id', 'lane_7_library__library_name', 'lane_8_library__library_id', 'lane_8_library__library_name'] - list_display = ('run_date', 'flowcell_status', 'flowcell_id', 'lane_1_library', 'lane_2_library', 'lane_3_library', 'lane_4_library', 'lane_5_library', 'lane_6_library', 
'lane_7_library', 'lane_8_library') - list_display_links = ('run_date', 'flowcell_id', 'lane_1_library', 'lane_2_library', 'lane_3_library', 'lane_4_library', 'lane_5_library', 'lane_6_library', 'lane_7_library', 'lane_8_library') - fields = ( - (None, { - 'fields': ('run_date', ('flowcell_id', 'flowcell_status'), ('read_length', 'advanced_run'),) - }), - ('Lanes:', { - 'fields' : (('lane_1_library', 'lane_1_pM'), ('lane_2_library', 'lane_2_pM'), ('lane_3_library', 'lane_3_pM'), ('lane_4_library', 'lane_4_pM'), ('lane_5_library', 'lane_5_pM'), ('lane_6_library', 'lane_6_pM'), ('lane_7_library', 'lane_7_pM'), ('lane_8_library', 'lane_8_pM'),) - }), - (None, { - 'fields' : ('notes',) - }), - ('Kits & Machines:', { - 'classes': 'collapse', - 'fields' : (('kit_1000148', 'kit_1000147', 'kit_1000183', 'kit_1001625'), ('cluster_station_id', 'sequencer_id'),) - }), - ('Cluster Estimates:', { - 'classes': 'collapse', - 'fields' : (('lane_1_cluster_estimate', 'lane_2_cluster_estimate'), ('lane_3_cluster_estimate', 'lane_4_cluster_estimate'), ('lane_5_cluster_estimate', 'lane_6_cluster_estimate'), ('lane_7_cluster_estimate', 'lane_8_cluster_estimate',),) - }), - ) - -# Did not finish implementing, removing to avoid further confusion. -#class ElandResult(models.Model): -# -# class Admin: pass -# -# flow_cell = models.ForeignKey(FlowCell) -# config_file = models.FileField(upload_to=settings.UPLOADTO_CONFIG_FILE) -# eland_result_pack = models.FileField(upload_to=settings.UPLOADTO_ELAND_RESULT_PACKS) -# bed_file_pack = models.FileField(upload_to=settings.UPLOADTO_BED_PACKS) -# -# notes = models.TextField(blank=True) diff --git a/gaworkflow/frontend/fctracker/views.py b/gaworkflow/frontend/fctracker/views.py deleted file mode 100644 index 2299e4f..0000000 --- a/gaworkflow/frontend/fctracker/views.py +++ /dev/null @@ -1 +0,0 @@ -# Create your views here. \ No newline at end of file diff --git a/gaworkflow/frontend/manage.py b/gaworkflow/frontend/manage.py deleted file mode 100644 index 5e78ea9..0000000 --- a/gaworkflow/frontend/manage.py +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env python -from django.core.management import execute_manager -try: - import settings # Assumed to be in the same directory. -except ImportError: - import sys - sys.stderr.write("Error: Can't find the file 'settings.py' in the directory containing %r. It appears you've customized things.\nYou'll have to run django-admin.py, passing it your settings module.\n(If the file settings.py does indeed exist, it's causing an ImportError somehow.)\n" % __file__) - sys.exit(1) - -if __name__ == "__main__": - execute_manager(settings) diff --git a/gaworkflow/frontend/settings.py b/gaworkflow/frontend/settings.py deleted file mode 100644 index 853cc1e..0000000 --- a/gaworkflow/frontend/settings.py +++ /dev/null @@ -1,94 +0,0 @@ -import os - -# Django settings for elandifier project. - -DEBUG = True -TEMPLATE_DEBUG = DEBUG - -ADMINS = ( - # ('Your Name', 'your_email@domain.com'), -) - -MANAGERS = ADMINS - -DATABASE_ENGINE = 'sqlite3' # 'postgresql_psycopg2', 'postgresql', 'mysql', 'sqlite3' or 'ado_mssql'. -DATABASE_NAME = os.path.abspath('../../fctracker.db') # Or path to database file if using sqlite3. -DATABASE_USER = '' # Not used with sqlite3. -DATABASE_PASSWORD = '' # Not used with sqlite3. -DATABASE_HOST = '' # Set to empty string for localhost. Not used with sqlite3. -DATABASE_PORT = '' # Set to empty string for default. Not used with sqlite3. - -# Local time zone for this installation. 
Choices can be found here: -# http://www.postgresql.org/docs/8.1/static/datetime-keywords.html#DATETIME-TIMEZONE-SET-TABLE -# although not all variations may be possible on all operating systems. -# If running in a Windows environment this must be set to the same as your -# system time zone. -TIME_ZONE = 'America/Los_Angeles' - -# Language code for this installation. All choices can be found here: -# http://www.w3.org/TR/REC-html40/struct/dirlang.html#langcodes -# http://blogs.law.harvard.edu/tech/stories/storyReader$15 -LANGUAGE_CODE = 'en-us' - -SITE_ID = 1 - -# If you set this to False, Django will make some optimizations so as not -# to load the internationalization machinery. -USE_I18N = True - -# Absolute path to the directory that holds media. -# Example: "/home/media/media.lawrence.com/" -MEDIA_ROOT = '' - -# URL that handles the media served from MEDIA_ROOT. -# Example: "http://media.lawrence.com" -MEDIA_URL = '' - -# URL prefix for admin media -- CSS, JavaScript and images. Make sure to use a -# trailing slash. -# Examples: "http://foo.com/media/", "/media/". -ADMIN_MEDIA_PREFIX = '/media/' - -# Make this unique, and don't share it with anybody. -SECRET_KEY = '(ekv^=gf(j9f(x25@a7r+8)hqlz%&_1!tw^75l%^041#vi=@4n' - -# List of callables that know how to import templates from various sources. -TEMPLATE_LOADERS = ( - 'django.template.loaders.filesystem.load_template_source', - 'django.template.loaders.app_directories.load_template_source', -# 'django.template.loaders.eggs.load_template_source', -) - -MIDDLEWARE_CLASSES = ( - 'django.middleware.common.CommonMiddleware', - 'django.contrib.sessions.middleware.SessionMiddleware', - 'django.contrib.auth.middleware.AuthenticationMiddleware', - 'django.middleware.doc.XViewMiddleware', -) - -ROOT_URLCONF = 'gaworkflow.frontend.urls' - -TEMPLATE_DIRS = ( - # Put strings here, like "/home/html/django_templates" or "C:/www/django/templates". - # Always use forward slashes, even on Windows. - # Don't forget to use absolute paths, not relative paths. 
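# [Editor's note, not part of this commit: os.path.abspath() resolves against
#  the process's current working directory, so this entry (like DATABASE_NAME
#  above) only points where intended when the server is started from the
#  directory that holds manage.py.]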
- os.path.abspath("../../templates"), -) - -INSTALLED_APPS = ( - 'django.contrib.admin', - 'django.contrib.auth', - 'django.contrib.contenttypes', - 'django.contrib.sessions', - 'django.contrib.sites', - 'gaworkflow.frontend.eland_config', - 'gaworkflow.frontend.fctracker', - 'django.contrib.databrowse', -) - -# Project specific settings -UPLOADTO_HOME = os.path.abspath('../../uploads') -UPLOADTO_CONFIG_FILE = os.path.join(UPLOADTO_HOME, 'eland_config') -UPLOADTO_ELAND_RESULT_PACKS = os.path.join(UPLOADTO_HOME, 'eland_results') -UPLOADTO_BED_PACKS = os.path.join(UPLOADTO_HOME, 'bed_packs') - diff --git a/gaworkflow/frontend/urls.py b/gaworkflow/frontend/urls.py deleted file mode 100644 index 0c67015..0000000 --- a/gaworkflow/frontend/urls.py +++ /dev/null @@ -1,16 +0,0 @@ -from django.conf.urls.defaults import * - -# Databrowser: -from django.contrib import databrowse -from fctracker.models import Library, FlowCell -databrowse.site.register(Library) -databrowse.site.register(FlowCell) - -urlpatterns = patterns('', - # Base: - (r'^eland_config/', include('gaworkflow.frontend.eland_config.urls')), - # Admin: - (r'^admin/', include('django.contrib.admin.urls')), - # Databrowser: - (r'^databrowse/(.*)', databrowse.site.root), -) diff --git a/gaworkflow/pipeline/__init__.py b/gaworkflow/pipeline/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/gaworkflow/pipeline/bustard.py b/gaworkflow/pipeline/bustard.py deleted file mode 100644 index 5a7a1f3..0000000 --- a/gaworkflow/pipeline/bustard.py +++ /dev/null @@ -1,146 +0,0 @@ - -from datetime import date -from glob import glob -import logging -import os -import time -import re - -from gaworkflow.pipeline.runfolder import \ - ElementTree, \ - VERSION_RE, \ - EUROPEAN_STRPTIME - -class Phasing(object): - PHASING = 'Phasing' - PREPHASING = 'Prephasing' - - def __init__(self, fromfile=None, xml=None): - self.lane = None - self.phasing = None - self.prephasing = None - - if fromfile is not None: - self._initialize_from_file(fromfile) - elif xml is not None: - self.set_elements(xml) - - def _initialize_from_file(self, pathname): - path, name = os.path.split(pathname) - basename, ext = os.path.splitext(name) - # the last character of the param base filename should be the - # lane number - tree = ElementTree.parse(pathname).getroot() - self.set_elements(tree) - self.lane = int(basename[-1]) - - def get_elements(self): - root = ElementTree.Element(Phasing.PHASING, {'lane': str(self.lane)}) - phasing = ElementTree.SubElement(root, Phasing.PHASING) - phasing.text = str(self.phasing) - prephasing = ElementTree.SubElement(root, Phasing.PREPHASING) - prephasing.text = str(self.prephasing) - return root - - def set_elements(self, tree): - if tree.tag not in ('Phasing', 'Parameters'): - raise ValueError('exptected Phasing or Parameters') - lane = tree.attrib.get('lane', None) - if lane is not None: - self.lane = int(lane) - for element in list(tree): - if element.tag == Phasing.PHASING: - self.phasing = float(element.text) - elif element.tag == Phasing.PREPHASING: - self.prephasing = float(element.text) - -class Bustard(object): - XML_VERSION = 1 - - # Xml Tags - BUSTARD = 'Bustard' - SOFTWARE_VERSION = 'version' - DATE = 'run_time' - USER = 'user' - PARAMETERS = 'Parameters' - - def __init__(self, xml=None): - self.version = None - self.date = date.today() - self.user = None - self.phasing = {} - - if xml is not None: - self.set_elements(xml) - - def _get_time(self): - return time.mktime(self.date.timetuple()) - time = property(_get_time, 
doc='return run time as seconds since epoch') - - def dump(self): - print "Bustard version:", self.version - print "Run date", self.date - print "user:", self.user - for lane, tree in self.phasing.items(): - print lane - print tree - - def get_elements(self): - root = ElementTree.Element('Bustard', - {'version': str(Bustard.XML_VERSION)}) - version = ElementTree.SubElement(root, Bustard.SOFTWARE_VERSION) - version.text = self.version - run_date = ElementTree.SubElement(root, Bustard.DATE) - run_date.text = str(self.time) - user = ElementTree.SubElement(root, Bustard.USER) - user.text = self.user - params = ElementTree.SubElement(root, Bustard.PARAMETERS) - for p in self.phasing.values(): - params.append(p.get_elements()) - return root - - def set_elements(self, tree): - if tree.tag != Bustard.BUSTARD: - raise ValueError('Expected "Bustard" SubElements') - xml_version = int(tree.attrib.get('version', 0)) - if xml_version > Bustard.XML_VERSION: - logging.warn('Bustard XML tree is a higher version than this class') - for element in list(tree): - if element.tag == Bustard.SOFTWARE_VERSION: - self.version = element.text - elif element.tag == Bustard.DATE: - self.date = date.fromtimestamp(float(element.text)) - elif element.tag == Bustard.USER: - self.user = element.text - elif element.tag == Bustard.PARAMETERS: - for param in element: - p = Phasing(xml=param) - self.phasing[p.lane] = p - else: - raise ValueError("Unrecognized tag: %s" % (element.tag,)) - - - -def bustard(pathname): - """ - Construct a Bustard object from pathname - """ - b = Bustard() - path, name = os.path.split(pathname) - groups = name.split("_") - version = re.search(VERSION_RE, groups[0]) - b.version = version.group(1) - t = time.strptime(groups[1], EUROPEAN_STRPTIME) - b.date = date(*t[0:3]) - b.user = groups[2] - paramfiles = glob(os.path.join(pathname, "params?.xml")) - for paramfile in paramfiles: - phasing = Phasing(paramfile) - assert (phasing.lane >= 1 and phasing.lane <= 8) - b.phasing[phasing.lane] = phasing - return b - -def fromxml(tree): - b = Bustard() - b.set_elements(tree) - return b diff --git a/gaworkflow/pipeline/configure_run.py b/gaworkflow/pipeline/configure_run.py deleted file mode 100644 index e75d73a..0000000 --- a/gaworkflow/pipeline/configure_run.py +++ /dev/null @@ -1,606 +0,0 @@ -#!/usr/bin/python -import subprocess -import logging -import time -import re -import os - -from gaworkflow.pipeline.retrieve_config import getCombinedOptions, saveConfigFile -from gaworkflow.pipeline.retrieve_config import FlowCellNotFound, WebError404 -from gaworkflow.pipeline.genome_mapper import DuplicateGenome, getAvailableGenomes, constructMapperDict -from gaworkflow.pipeline.run_status import GARunStatus - -from pyinotify import WatchManager, ThreadedNotifier -from pyinotify import EventsCodes, ProcessEvent - -class ConfigInfo: - - def __init__(self): - #run_path = firecrest analysis directory to run analysis from - self.run_path = None - self.bustard_path = None - self.config_filepath = None - self.status = None - - #top level directory where all analyses are placed - self.base_analysis_dir = None - #analysis_dir, top level analysis dir... 
- # base_analysis_dir + '/070924_USI-EAS44_0022_FC12150' - self.analysis_dir = None - - - def createStatusObject(self): - """ - Creates a status object which can be queried for - status of running the pipeline - - returns True if object created - returns False if object cannot be created - """ - if self.config_filepath is None: - return False - - self.status = GARunStatus(self.config_filepath) - return True - - - -#################################### -# inotify event processor - -s_firecrest_finished = re.compile('Firecrest[0-9\._\-A-Za-z]+/finished.txt') -s_bustard_finished = re.compile('Bustard[0-9\._\-A-Za-z]+/finished.txt') -s_gerald_finished = re.compile('GERALD[0-9\._\-A-Za-z]+/finished.txt') - -s_gerald_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/Bustard[0-9\._\-A-Za-z]+/GERALD[0-9\._\-A-Za-z]+/') -s_bustard_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/Bustard[0-9\._\-A-Za-z]+/') -s_firecrest_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/') - -class RunEvent(ProcessEvent): - - def __init__(self, conf_info): - - self.run_status_dict = {'firecrest': False, - 'bustard': False, - 'gerald': False} - - self._ci = conf_info - - ProcessEvent.__init__(self) - - - def process_IN_CREATE(self, event): - fullpath = os.path.join(event.path, event.name) - if s_finished.search(fullpath): - logging.info("File Found: %s" % (fullpath)) - - if s_firecrest_finished.search(fullpath): - self.run_status_dict['firecrest'] = True - self._ci.status.updateFirecrest(event.name) - elif s_bustard_finished.search(fullpath): - self.run_status_dict['bustard'] = True - self._ci.status.updateBustard(event.name) - elif s_gerald_finished.search(fullpath): - self.run_status_dict['gerald'] = True - self._ci.status.updateGerald(event.name) - - #WARNING: The following order is important!! - # Firecrest regex will catch all gerald, bustard, and firecrest - # Bustard regex will catch all gerald and bustard - # Gerald regex will catch all gerald - # So, order needs to be Gerald, Bustard, Firecrest, or this - # won't work properly. - elif s_gerald_all.search(fullpath): - self._ci.status.updateGerald(event.name) - elif s_bustard_all.search(fullpath): - self._ci.status.updateBustard(event.name) - elif s_firecrest_all.search(fullpath): - self._ci.status.updateFirecrest(event.name) - - #print "Create: %s" % (os.path.join(event.path, event.name)) - - def process_IN_DELETE(self, event): - #print "Remove %s" % (os.path.join(event.path, event.name)) - pass - - - - -#FLAGS -# Config Step Error -RUN_ABORT = 'abort' -# Run Step Error -RUN_FAILED = 'failed' - - -##################################### -# Configure Step (goat_pipeline.py) -#Info -s_start = re.compile('Starting Genome Analyzer Pipeline') -s_gerald = re.compile("[\S\s]+--GERALD[\S\s]+--make[\S\s]+") -s_generating = re.compile('^Generating journals, Makefiles') -s_seq_folder = re.compile('^Sequence folder: ') -s_seq_folder_sub = re.compile('want to make ') -s_stderr_taskcomplete = re.compile('^Task complete, exiting') - -#Errors -s_invalid_cmdline = re.compile('Usage:[\S\s]*goat_pipeline.py') -s_species_dir_err = re.compile('Error: Lane [1-8]:') -s_goat_traceb = re.compile("^Traceback \(most recent call last\):") -s_missing_cycles = re.compile('^Error: Tile s_[1-8]_[0-9]+: Different number of cycles: [0-9]+ instead of [0-9]+') - -SUPPRESS_MISSING_CYCLES = False - - -##Ignore - Example of out above each ignore regex. -#NOTE: Commenting out an ignore will cause it to be -# logged as DEBUG with the logging module. 
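The ordering warning above can be checked directly. This small sketch (the path is invented for illustration) shows that a GERALD output path also matches the Bustard and Firecrest catch-all patterns, so the most specific regex has to be tested first:

    import re

    s_firecrest_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/')
    s_bustard_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/Bustard[0-9\._\-A-Za-z]+/')
    s_gerald_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/Bustard[0-9\._\-A-Za-z]+/GERALD[0-9\._\-A-Za-z]+/')

    # an invented GERALD output path
    path = 'Firecrest1.9.1_king/Bustard1.9.1_king/GERALD_15-11-2007_king/s_1_0001_rescore.png'
    assert s_gerald_all.search(path)     # the branch that should win
    assert s_bustard_all.search(path)    # ...but these also match,
    assert s_firecrest_all.search(path)  # hence Gerald must be tested first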
-#CF_STDERR_IGNORE_LIST = [] -s_skip = re.compile('s_[0-8]_[0-9]+') - - -########################################## -# Pipeline Run Step (make -j8 recursive) - -##Info -s_finished = re.compile('finished') - -##Errors -s_make_error = re.compile('^make[\S\s]+Error') -s_no_gnuplot = re.compile('gnuplot: command not found') -s_no_convert = re.compile('^Can\'t exec "convert"') -s_no_ghostscript = re.compile('gs: command not found') - -##Ignore - Example of out above each ignore regex. -#NOTE: Commenting out an ignore will cause it to be -# logged as DEBUG with the logging module. -# -PL_STDERR_IGNORE_LIST = [] -# Info: PF 11802 -PL_STDERR_IGNORE_LIST.append( re.compile('^Info: PF') ) -# About to analyse intensity file s_4_0101_sig2.txt -PL_STDERR_IGNORE_LIST.append( re.compile('^About to analyse intensity file') ) -# Will send output to standard output -PL_STDERR_IGNORE_LIST.append( re.compile('^Will send output to standard output') ) -# Found 31877 clusters -PL_STDERR_IGNORE_LIST.append( re.compile('^Found [0-9]+ clusters') ) -# Will use quality criterion ((CHASTITY>=0.6) -PL_STDERR_IGNORE_LIST.append( re.compile('^Will use quality criterion') ) -# Quality criterion translated to (($F[5]>=0.6)) -PL_STDERR_IGNORE_LIST.append( re.compile('^Quality criterion translated to') ) -# opened /woldlab/trog/data1/king/070924_USI-EAS44_0022_FC12150/Data/C1-36_Firecrest1.9.1_14-11-2007_king.4/Bustard1.9.1_14-11-2007_king/s_4_0101_qhg.txt -# AND -# opened s_4_0103_qhg.txt -PL_STDERR_IGNORE_LIST.append( re.compile('^opened[\S\s]+qhg.txt') ) -# 81129 sequences out of 157651 passed filter criteria -PL_STDERR_IGNORE_LIST.append( re.compile('^[0-9]+ sequences out of [0-9]+ passed filter criteria') ) - - -def pl_stderr_ignore(line): - """ - Searches lines for lines to ignore (i.e. not to log) - - returns True if line should be ignored - returns False if line should NOT be ignored - """ - for s in PL_STDERR_IGNORE_LIST: - if s.search(line): - return True - return False - - -def config_stdout_handler(line, conf_info): - """ - Processes each line of output from GOAT - and stores useful information using the logging module - - Loads useful information into conf_info as well, for future - use outside the function. - - returns True if found condition that signifies success. 
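A usage sketch for pl_stderr_ignore() above (the sample lines are invented): only entries matching the ignore list are suppressed, everything else falls through to the logger.

    for line in ['Info: PF 11802', 'make: *** [s_1_0001_prb.txt] Error 1']:
        if pl_stderr_ignore(line):
            continue
        print 'would log:', line   # only the make error survives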
- """ - - # Skip irrelevant line (without logging) - if s_skip.search(line): - pass - - # Detect invalid command-line arguments - elif s_invalid_cmdline.search(line): - logging.error("Invalid commandline options!") - - # Detect starting of configuration - elif s_start.search(line): - logging.info('START: Configuring pipeline') - - # Detect it made it past invalid arguments - elif s_gerald.search(line): - logging.info('Running make now') - - # Detect that make files have been generated (based on output) - elif s_generating.search(line): - logging.info('Make files generted') - return True - - # Capture run directory - elif s_seq_folder.search(line): - mo = s_seq_folder_sub.search(line) - #Output changed when using --tiles= - # at least in pipeline v0.3.0b2 - if mo: - firecrest_bustard_gerald_makefile = line[mo.end():] - firecrest_bustard_gerald, junk = \ - os.path.split(firecrest_bustard_gerald_makefile) - firecrest_bustard, junk = os.path.split(firecrest_bustard_gerald) - firecrest, junk = os.path.split(firecrest_bustard) - - conf_info.bustard_path = firecrest_bustard - conf_info.run_path = firecrest - - #Standard output handling - else: - print 'Sequence line:', line - mo = s_seq_folder.search(line) - conf_info.bustard_path = line[mo.end():] - conf_info.run_path, temp = os.path.split(conf_info.bustard_path) - - # Log all other output for debugging purposes - else: - logging.warning('CONF:?: %s' % (line)) - - return False - - - -def config_stderr_handler(line, conf_info): - """ - Processes each line of output from GOAT - and stores useful information using the logging module - - Loads useful information into conf_info as well, for future - use outside the function. - - returns RUN_ABORT upon detecting failure; - True on success message; - False if neutral message - (i.e. doesn't signify failure or success) - """ - global SUPPRESS_MISSING_CYCLES - - # Detect invalid species directory error - if s_species_dir_err.search(line): - logging.error(line) - return RUN_ABORT - # Detect goat_pipeline.py traceback - elif s_goat_traceb.search(line): - logging.error("Goat config script died, traceback in debug output") - return RUN_ABORT - # Detect indication of successful configuration (from stderr; odd, but ok) - elif s_stderr_taskcomplete.search(line): - logging.info('Configure step successful (from: stderr)') - return True - # Detect missing cycles - elif s_missing_cycles.search(line): - - # Only display error once - if not SUPPRESS_MISSING_CYCLES: - logging.error("Missing cycles detected; Not all cycles copied?") - logging.debug("CONF:STDERR:MISSING_CYCLES: %s" % (line)) - SUPPRESS_MISSING_CYCLES = True - return RUN_ABORT - - # Log all other output as debug output - else: - logging.debug('CONF:STDERR:?: %s' % (line)) - - # Neutral (not failure; nor success) - return False - - -#def pipeline_stdout_handler(line, conf_info): -# """ -# Processes each line of output from running the pipeline -# and stores useful information using the logging module -# -# Loads useful information into conf_info as well, for future -# use outside the function. -# -# returns True if found condition that signifies success. -# """ -# -# #f.write(line + '\n') -# -# return True - - - -def pipeline_stderr_handler(line, conf_info): - """ - Processes each line of stderr from pipelien run - and stores useful information using the logging module - - ##FIXME: Future feature (doesn't actually do this yet) - #Loads useful information into conf_info as well, for future - #use outside the function. 
- - returns RUN_FAILED upon detecting failure; - #True on success message; (no clear success state) - False if neutral message - (i.e. doesn't signify failure or success) - """ - - if pl_stderr_ignore(line): - pass - elif s_make_error.search(line): - logging.error("make error detected; run failed") - return RUN_FAILED - elif s_no_gnuplot.search(line): - logging.error("gnuplot not found") - return RUN_FAILED - elif s_no_convert.search(line): - logging.error("imagemagick's convert command not found") - return RUN_FAILED - elif s_no_ghostscript.search(line): - logging.error("ghostscript not found") - return RUN_FAILED - else: - logging.debug('PIPE:STDERR:?: %s' % (line)) - - return False - - -def retrieve_config(conf_info, flowcell, cfg_filepath, genome_dir): - """ - Gets the config file from server... - requires config file in: - /etc/ga_frontend/ga_frontend.conf - or - ~/.ga_frontend.conf - - with: - [config_file_server] - base_host_url: http://host:port - - return True if successful, False is failure - """ - options = getCombinedOptions() - - if options.url is None: - logging.error("~/.ga_frontend.conf or /etc/ga_frontend/ga_frontend.conf" \ - " missing base_host_url option") - return False - - try: - saveConfigFile(flowcell, options.url, cfg_filepath) - conf_info.config_filepath = cfg_filepath - except FlowCellNotFound, e: - logging.error(e) - return False - except WebError404, e: - logging.error(e) - return False - except IOError, e: - logging.error(e) - return False - except Exception, e: - logging.error(e) - return False - - f = open(cfg_filepath, 'r') - data = f.read() - f.close() - - genome_dict = getAvailableGenomes(genome_dir) - mapper_dict = constructMapperDict(genome_dict) - - logging.debug(data) - - f = open(cfg_filepath, 'w') - f.write(data % (mapper_dict)) - f.close() - - return True - - - -def configure(conf_info): - """ - Attempts to configure the GA pipeline using goat. - - Uses logging module to store information about status. - - returns True if configuration successful, otherwise False. - """ - #ERROR Test: - #pipe = subprocess.Popen(['goat_pipeline.py', - # '--GERALD=config32bk.txt', - # '--make .',], - # #'.'], - # stdout=subprocess.PIPE, - # stderr=subprocess.PIPE) - - #ERROR Test (2), causes goat_pipeline.py traceback - #pipe = subprocess.Popen(['goat_pipeline.py', - # '--GERALD=%s' % (conf_info.config_filepath), - # '--tiles=s_4_100,s_4_101,s_4_102,s_4_103,s_4_104', - # '--make', - # '.'], - # stdout=subprocess.PIPE, - # stderr=subprocess.PIPE) - - ########################## - # Run configuration step - # Not a test; actual configure attempt. 
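Before the configure step below runs goat, retrieve_config() above is the piece that fetches the GERALD config file. A hypothetical driver (the flowcell ID, file paths, and genome directory are invented for illustration):

    ci = ConfigInfo()
    ci.analysis_dir = '/analysis/070924_USI-EAS44_0022_FC12150'
    if retrieve_config(ci, 'FC12150', '/tmp/config_FC12150.txt', '/data/genomes'):
        print 'config file saved to', ci.config_filepath
    else:
        print 'retrieval failed; check ga_frontend.conf and the flowcell name'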
-    #pipe = subprocess.Popen(['goat_pipeline.py',
-    #                         '--GERALD=%s' % (conf_info.config_filepath),
-    #                         '--make',
-    #                         '.'],
-    #                        stdout=subprocess.PIPE,
-    #                        stderr=subprocess.PIPE)
-
-
-    stdout_filepath = os.path.join(conf_info.analysis_dir,
-                                   "pipeline_configure_stdout.txt")
-    stderr_filepath = os.path.join(conf_info.analysis_dir,
-                                   "pipeline_configure_stderr.txt")
-
-    fout = open(stdout_filepath, 'w')
-    ferr = open(stderr_filepath, 'w')
-
-    pipe = subprocess.Popen(['goat_pipeline.py',
-                             '--GERALD=%s' % (conf_info.config_filepath),
-                             #'--tiles=s_4_0100,s_4_0101,s_4_0102,s_4_0103,s_4_0104',
-                             '--make',
-                             conf_info.analysis_dir],
-                            stdout=fout,
-                            stderr=ferr)
-
-    print "Configuring pipeline: %s" % (time.ctime())
-    error_code = pipe.wait()
-
-    # Clean up
-    fout.close()
-    ferr.close()
-
-
-    ##################
-    # Process stdout
-    fout = open(stdout_filepath, 'r')
-
-    stdout_line = fout.readline()
-
-    complete = False
-    while stdout_line != '':
-        # Handle stdout
-        if config_stdout_handler(stdout_line, conf_info):
-            complete = True
-        stdout_line = fout.readline()
-
-    fout.close()
-
-
-    #error_code = pipe.wait()
-    if error_code:
-        logging.error('Received error_code: %s' % (error_code))
-    else:
-        logging.info('We are go for launch!')
-
-    #Process stderr
-    ferr = open(stderr_filepath, 'r')
-    stderr_line = ferr.readline()
-
-    abort = 'NO!'
-    stderr_success = False
-    while stderr_line != '':
-        stderr_status = config_stderr_handler(stderr_line, conf_info)
-        if stderr_status == RUN_ABORT:
-            abort = RUN_ABORT
-        elif stderr_status is True:
-            stderr_success = True
-        stderr_line = ferr.readline()
-
-    ferr.close()
-
-
-    #Success requirements:
-    #  1) The stdout success marker was seen (complete)
-    #  2) The program exited with status 0
-    #  3) No abort condition was detected in stderr
-    #  4) The stderr success message was seen
-    print '#Expect: True, False, True, True'
-    print complete, bool(error_code), abort != RUN_ABORT, stderr_success is True
-    status = complete is True and \
-             bool(error_code) is False and \
-             abort != RUN_ABORT and \
-             stderr_success is True
-
-    # If everything was successful, but for some reason
-    #  we didn't retrieve the path info, log it.
-    if status is True:
-        if conf_info.bustard_path is None or conf_info.run_path is None:
-            logging.error("Failed to retrieve run_path")
-            return False
-
-    return status
-
-
-def run_pipeline(conf_info):
-    """
-    Run the pipeline and monitor status.
-    """
-    # Fail if the run_path doesn't actually exist
-    if not os.path.exists(conf_info.run_path):
-        logging.error('Run path does not exist: %s' \
-                      % (conf_info.run_path))
-        return False
-
-    # make will run inside run_path (via --directory below);
-    #  its output is logged under the analysis directory
-    stdout_filepath = os.path.join(conf_info.analysis_dir, 'pipeline_run_stdout.txt')
-    stderr_filepath = os.path.join(conf_info.analysis_dir, 'pipeline_run_stderr.txt')
-
-    # Create status object
-    conf_info.createStatusObject()
-
-    # Monitor file creation
-    wm = WatchManager()
-    mask = EventsCodes.IN_DELETE | EventsCodes.IN_CREATE
-    event = RunEvent(conf_info)
-    notifier = ThreadedNotifier(wm, event)
-    notifier.start()
-    wdd = wm.add_watch(conf_info.run_path, mask, rec=True)
-
-    # Log pipeline starting
-    logging.info('STARTING PIPELINE @ %s' % (time.ctime()))
-
-    # Start the pipeline (and hide!)
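The watch setup in run_pipeline() above follows the stock pyinotify pattern. A minimal standalone sketch of the same wiring (watching /tmp, purely illustrative), using the old-style EventsCodes constants this module imports:

    import os
    from pyinotify import WatchManager, ThreadedNotifier, ProcessEvent, EventsCodes

    class Reporter(ProcessEvent):
        def process_IN_CREATE(self, event):
            # the same hook RunEvent implements above
            print 'created:', os.path.join(event.path, event.name)

    wm = WatchManager()
    notifier = ThreadedNotifier(wm, Reporter())
    notifier.start()
    wdd = wm.add_watch('/tmp', EventsCodes.IN_CREATE, rec=True)
    # ... do work, then always shut the watcher thread down:
    notifier.stop()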
- #pipe = subprocess.Popen(['make', - # '-j8', - # 'recursive'], - # stdout=subprocess.PIPE, - # stderr=subprocess.PIPE) - - fout = open(stdout_filepath, 'w') - ferr = open(stderr_filepath, 'w') - - pipe = subprocess.Popen(['make', - '--directory=%s' % (conf_info.run_path), - '-j8', - 'recursive'], - stdout=fout, - stderr=ferr) - #shell=True) - # Wait for run to finish - retcode = pipe.wait() - - - # Clean up - notifier.stop() - fout.close() - ferr.close() - - # Process stderr - ferr = open(stderr_filepath, 'r') - - run_failed_stderr = False - for line in ferr: - err_status = pipeline_stderr_handler(line, conf_info) - if err_status == RUN_FAILED: - run_failed_stderr = True - - ferr.close() - - # Finished file check! - print 'RUN SUCCESS CHECK:' - for key, value in event.run_status_dict.items(): - print ' %s: %s' % (key, value) - - dstatus = event.run_status_dict - - # Success or failure check - status = (retcode == 0) and \ - run_failed_stderr is False and \ - dstatus['firecrest'] is True and \ - dstatus['bustard'] is True and \ - dstatus['gerald'] is True - - return status - - diff --git a/gaworkflow/pipeline/firecrest.py b/gaworkflow/pipeline/firecrest.py deleted file mode 100644 index dd06aec..0000000 --- a/gaworkflow/pipeline/firecrest.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -Extract information about the Firecrest run - -Firecrest - class holding the properties we found -firecrest - Firecrest factory function initalized from a directory name -fromxml - Firecrest factory function initalized from an xml dump from - the Firecrest object. -""" - -from datetime import date -import os -import re -import time - -from gaworkflow.pipeline.runfolder import \ - ElementTree, \ - VERSION_RE, \ - EUROPEAN_STRPTIME - -class Firecrest(object): - XML_VERSION=1 - - # xml tag names - FIRECREST = 'Firecrest' - SOFTWARE_VERSION = 'version' - START = 'FirstCycle' - STOP = 'LastCycle' - DATE = 'run_time' - USER = 'user' - MATRIX = 'matrix' - - def __init__(self, xml=None): - self.start = None - self.stop = None - self.version = None - self.date = date.today() - self.user = None - self.matrix = None - - if xml is not None: - self.set_elements(xml) - - def _get_time(self): - return time.mktime(self.date.timetuple()) - time = property(_get_time, doc='return run time as seconds since epoch') - - def dump(self): - print "Starting cycle:", self.start - print "Ending cycle:", self.stop - print "Firecrest version:", self.version - print "Run date:", self.date - print "user:", self.user - - def get_elements(self): - attribs = {'version': str(Firecrest.XML_VERSION) } - root = ElementTree.Element(Firecrest.FIRECREST, attrib=attribs) - version = ElementTree.SubElement(root, Firecrest.SOFTWARE_VERSION) - version.text = self.version - start_cycle = ElementTree.SubElement(root, Firecrest.START) - start_cycle.text = str(self.start) - stop_cycle = ElementTree.SubElement(root, Firecrest.STOP) - stop_cycle.text = str(self.stop) - run_date = ElementTree.SubElement(root, Firecrest.DATE) - run_date.text = str(self.time) - user = ElementTree.SubElement(root, Firecrest.USER) - user.text = self.user - matrix = ElementTree.SubElement(root, Firecrest.MATRIX) - matrix.text = self.matrix - return root - - def set_elements(self, tree): - if tree.tag != Firecrest.FIRECREST: - raise ValueError('Expected "Firecrest" SubElements') - xml_version = int(tree.attrib.get('version', 0)) - if xml_version > Firecrest.XML_VERSION: - logging.warn('Firecrest XML tree is a higher version than this class') - for element in list(tree): - if element.tag 
== Firecrest.SOFTWARE_VERSION:
-                self.version = element.text
-            elif element.tag == Firecrest.START:
-                self.start = int(element.text)
-            elif element.tag == Firecrest.STOP:
-                self.stop = int(element.text)
-            elif element.tag == Firecrest.DATE:
-                self.date = date.fromtimestamp(float(element.text))
-            elif element.tag == Firecrest.USER:
-                self.user = element.text
-            elif element.tag == Firecrest.MATRIX:
-                self.matrix = element.text
-            else:
-                raise ValueError("Unrecognized tag: %s" % (element.tag,))
-
-def firecrest(pathname):
-    """
-    Examine the directory at pathname and initialize a Firecrest object
-    """
-    f = Firecrest()
-
-    # parse firecrest directory name
-    path, name = os.path.split(pathname)
-    groups = name.split('_')
-    # grab the start/stop cycle information
-    cycle = re.match("C([0-9]+)-([0-9]+)", groups[0])
-    f.start = int(cycle.group(1))
-    f.stop = int(cycle.group(2))
-    # firecrest version
-    version = re.search(VERSION_RE, groups[1])
-    f.version = (version.group(1))
-    # datetime
-    t = time.strptime(groups[2], EUROPEAN_STRPTIME)
-    f.date = date(*t[0:3])
-    # username
-    f.user = groups[3]
-
-    # should I parse this deeper than just stashing the
-    # contents of the matrix file?
-    matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
-    f.matrix = open(matrix_pathname, 'r').read()
-    return f
-
-def fromxml(tree):
-    """
-    Initialize a Firecrest object from an element tree node
-    """
-    f = Firecrest()
-    f.set_elements(tree)
-    return f
diff --git a/gaworkflow/pipeline/genome_mapper.py b/gaworkflow/pipeline/genome_mapper.py
deleted file mode 100644
index 90c619b..0000000
--- a/gaworkflow/pipeline/genome_mapper.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/usr/bin/python
-import glob
-import sys
-import os
-import re
-
-import logging
-
-from gaworkflow.util.alphanum import alphanum
-
-class DuplicateGenome(Exception): pass
-
-
-def _has_metainfo(genome_dir):
-    metapath = os.path.join(genome_dir, '_metainfo_')
-    if os.path.isfile(metapath):
-        return True
-    else:
-        return False
-
-def getAvailableGenomes(genome_base_dir):
-    """
-    raises IOError (on genome_base_dir not found)
-    raises DuplicateGenome on duplicate genomes found.
-
-    returns a double dictionary (i.e. d[species][build] = path)
-    """
-
-    # Need valid directory
-    if not os.path.exists(genome_base_dir):
-        msg = "Directory does not exist: %s" % (genome_base_dir)
-        raise IOError, msg
-
-    # Find all subdirectories
-    filepath_list = glob.glob(os.path.join(genome_base_dir, '*'))
-    potential_genome_dirs = \
-        [ filepath for filepath in filepath_list if os.path.isdir(filepath)]
-
-    # Get list of metadata files
-    genome_dir_list = \
-        [ dirpath \
-          for dirpath in potential_genome_dirs \
-          if _has_metainfo(dirpath) ]
-
-    # Genome double dictionary
-    d = {}
-
-    for genome_dir in genome_dir_list:
-        line = open(os.path.join(genome_dir, '_metainfo_'), 'r').readline().strip()
-
-        # Get species, build... log and skip on failure
-        try:
-            species, build = line.split('|')
-        except:
-            logging.warning('Skipping: Invalid metafile (%s) line: %s' \
-                            % (genome_dir, line))
-            continue
-
-        build_dict = d.setdefault(species, {})
-        if build in build_dict:
-            msg = "Duplicate genome for %s|%s" % (species, build)
-            raise DuplicateGenome, msg
-
-        build_dict[build] = genome_dir
-
-    return d
-
-
-class constructMapperDict(object):
-    """
-    Emulate a dictionary to map genome|build names to paths.
-
-    It uses the dictionary generated by getAvailableGenomes.
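Taken together, a usage sketch for the two helpers (the directory layout is invented: /data/genomes/mm9/_metainfo_ containing the single line "Mus musculus|mm9"), relying on the mapping behavior implemented just below:

    d = getAvailableGenomes('/data/genomes')
    mapper = constructMapperDict(d)
    print mapper['Mus musculus|mm9']   # explicit species|build lookup
    print mapper['Mus musculus']       # species alone returns the newest build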
- """ - def __init__(self, genome_dict): - self.genome_dict = genome_dict - - def __getitem__(self, key): - """ - Return the best match for key - """ - elements = re.split("\|", key) - - if len(elements) == 1: - # we just the species name - # get the set of builds - builds = self.genome_dict[elements[0]] - - # sort build names the way humans would - keys = builds.keys() - keys.sort(cmp=alphanum) - - # return the path from the 'last' build name - return builds[keys[-1]] - - elif len(elements) == 2: - # we have species, and build name - return self.genome_dict[elements[0]][elements[1]] - else: - raise KeyError("Unrecognized key") - - def keys(self): - keys = [] - for species in self.genome_dict.keys(): - for build in self.genome_dict[species]: - keys.append([species+'|'+build]) - return keys - - def values(self): - values = [] - for species in self.genome_dict.keys(): - for build in self.genome_dict[species]: - values.append(self.genome_dict[species][build]) - return values - - def items(self): - items = [] - for species in self.genome_dict.keys(): - for build in self.genome_dict[species]: - key = [species+'|'+build] - value = self.genome_dict[species][build] - items.append((key, value)) - return items - -if __name__ == '__main__': - - if len(sys.argv) != 2: - print 'useage: %s ' % (sys.argv[0]) - sys.exit(1) - - d = getAvailableGenomes(sys.argv[1]) - d2 = constructMapperDict(d) - - for k,v in d2.items(): - print '%s: %s' % (k,v) - - diff --git a/gaworkflow/pipeline/gerald.py b/gaworkflow/pipeline/gerald.py deleted file mode 100644 index f268767..0000000 --- a/gaworkflow/pipeline/gerald.py +++ /dev/null @@ -1,719 +0,0 @@ -""" -Provide access to information stored in the GERALD directory. -""" -from datetime import datetime, date -from glob import glob -import logging -import os -import stat -import time -import types - -from gaworkflow.pipeline.runfolder import \ - ElementTree, \ - EUROPEAN_STRPTIME, \ - LANES_PER_FLOWCELL, \ - VERSION_RE -from gaworkflow.util.ethelp import indent, flatten -from gaworkflow.util.opener import autoopen - -class Gerald(object): - """ - Capture meaning out of the GERALD directory - """ - XML_VERSION = 1 - GERALD='Gerald' - RUN_PARAMETERS='RunParameters' - SUMMARY='Summary' - - class LaneParameters(object): - """ - Make it easy to access elements of LaneSpecificRunParameters from python - """ - def __init__(self, gerald, key): - self._gerald = gerald - self._key = key - - def __get_attribute(self, xml_tag): - subtree = self._gerald.tree.find('LaneSpecificRunParameters') - container = subtree.find(xml_tag) - if container is None: - return None - if len(container.getchildren()) > LANES_PER_FLOWCELL: - raise RuntimeError('GERALD config.xml file changed') - lanes = [x.tag.split('_')[1] for x in container.getchildren()] - index = lanes.index(self._key) - element = container[index] - return element.text - def _get_analysis(self): - return self.__get_attribute('ANALYSIS') - analysis = property(_get_analysis) - - def _get_eland_genome(self): - genome = self.__get_attribute('ELAND_GENOME') - # default to the chipwide parameters if there isn't an - # entry in the lane specific paramaters - if genome is None: - subtree = self._gerald.tree.find('ChipWideRunParameters') - container = subtree.find('ELAND_GENOME') - genome = container.text - return genome - eland_genome = property(_get_eland_genome) - - def _get_read_length(self): - return self.__get_attribute('READ_LENGTH') - read_length = property(_get_read_length) - - def _get_use_bases(self): - return 
self.__get_attribute('USE_BASES') - use_bases = property(_get_use_bases) - - class LaneSpecificRunParameters(object): - """ - Provide access to LaneSpecificRunParameters - """ - def __init__(self, gerald): - self._gerald = gerald - self._keys = None - def __getitem__(self, key): - return Gerald.LaneParameters(self._gerald, key) - def keys(self): - if self._keys is None: - tree = self._gerald.tree - analysis = tree.find('LaneSpecificRunParameters/ANALYSIS') - # according to the pipeline specs I think their fields - # are sampleName_laneID, with sampleName defaulting to s - # since laneIDs are constant lets just try using - # those consistently. - self._keys = [ x.tag.split('_')[1] for x in analysis] - return self._keys - def values(self): - return [ self[x] for x in self.keys() ] - def items(self): - return zip(self.keys(), self.values()) - def __len__(self): - return len(self.keys()) - - def __init__(self, xml=None): - self.pathname = None - self.tree = None - - # parse lane parameters out of the config.xml file - self.lanes = Gerald.LaneSpecificRunParameters(self) - - self.summary = None - self.eland_results = None - - if xml is not None: - self.set_elements(xml) - - def _get_date(self): - if self.tree is None: - return datetime.today() - timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP') - epochstamp = time.mktime(time.strptime(timestamp, '%c')) - return datetime.fromtimestamp(epochstamp) - date = property(_get_date) - - def _get_time(self): - return time.mktime(self.date.timetuple()) - time = property(_get_time, doc='return run time as seconds since epoch') - - def _get_version(self): - if self.tree is None: - return None - return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION') - version = property(_get_version) - - def dump(self): - """ - Debugging function, report current object - """ - print 'Gerald version:', self.version - print 'Gerald run date:', self.date - print 'Gerald config.xml:', self.tree - self.summary.dump() - - def get_elements(self): - if self.tree is None or self.summary is None: - return None - - gerald = ElementTree.Element(Gerald.GERALD, - {'version': unicode(Gerald.XML_VERSION)}) - gerald.append(self.tree) - gerald.append(self.summary.get_elements()) - if self.eland_results: - gerald.append(self.eland_results.get_elements()) - return gerald - - def set_elements(self, tree): - if tree.tag != Gerald.GERALD: - raise ValueError('exptected GERALD') - xml_version = int(tree.attrib.get('version', 0)) - if xml_version > Gerald.XML_VERSION: - logging.warn('XML tree is a higher version than this class') - for element in list(tree): - tag = element.tag.lower() - if tag == Gerald.RUN_PARAMETERS.lower(): - self.tree = element - elif tag == Gerald.SUMMARY.lower(): - self.summary = Summary(xml=element) - elif tag == ELAND.ELAND.lower(): - self.eland_results = ELAND(xml=element) - else: - logging.warn("Unrecognized tag %s" % (element.tag,)) - - -def gerald(pathname): - g = Gerald() - g.pathname = pathname - path, name = os.path.split(pathname) - config_pathname = os.path.join(pathname, 'config.xml') - g.tree = ElementTree.parse(config_pathname).getroot() - - # parse Summary.htm file - summary_pathname = os.path.join(pathname, 'Summary.htm') - g.summary = Summary(summary_pathname) - # parse eland files - g.eland_results = eland(g.pathname, g) - return g - -def tonumber(v): - """ - Convert a value to int if its an int otherwise a float. 
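A behavior sketch for the two parsing helpers defined next; the literal values are invented examples of Summary.htm cell contents:

    assert tonumber('123') == 123                        # stays an int
    assert tonumber('4.5') == 4.5                        # falls back to float
    assert parse_mean_range('123 +/- 4.5') == (123, 4.5)
    assert parse_mean_range('unknown') == (0, 0)         # sentinel for missing cells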
- """ - try: - v = int(v) - except ValueError, e: - v = float(v) - return v - -def parse_mean_range(value): - """ - Parse values like 123 +/- 4.5 - """ - if value.strip() == 'unknown': - return 0, 0 - - average, pm, deviation = value.split() - if pm != '+/-': - raise RuntimeError("Summary.htm file format changed") - return tonumber(average), tonumber(deviation) - -def make_mean_range_element(parent, name, mean, deviation): - """ - Make an ElementTree subelement - """ - element = ElementTree.SubElement(parent, name, - { 'mean': unicode(mean), - 'deviation': unicode(deviation)}) - return element - -def parse_mean_range_element(element): - """ - Grab mean/deviation out of element - """ - return (tonumber(element.attrib['mean']), - tonumber(element.attrib['deviation'])) - -def parse_summary_element(element): - """ - Determine if we have a simple element or a mean/deviation element - """ - if len(element.attrib) > 0: - return parse_mean_range_element(element) - else: - return element.text - -class Summary(object): - """ - Extract some useful information from the Summary.htm file - """ - XML_VERSION = 2 - SUMMARY = 'Summary' - - class LaneResultSummary(object): - """ - Parse the LaneResultSummary table out of Summary.htm - Mostly for the cluster number - """ - LANE_RESULT_SUMMARY = 'LaneResultSummary' - TAGS = { - 'LaneYield': 'lane_yield', - 'Cluster': 'cluster', # Raw - 'ClusterPF': 'cluster_pass_filter', - 'AverageFirstCycleIntensity': 'average_first_cycle_intensity', - 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles', - 'PercentPassFilterClusters': 'percent_pass_filter_clusters', - 'PercentPassFilterAlign': 'percent_pass_filter_align', - 'AverageAlignmentScore': 'average_alignment_score', - 'PercentErrorRate': 'percent_error_rate' - } - - def __init__(self, html=None, xml=None): - self.lane = None - self.lane_yield = None - self.cluster = None - self.cluster_pass_filter = None - self.average_first_cycle_intensity = None - self.percent_intensity_after_20_cycles = None - self.percent_pass_filter_clusters = None - self.percent_pass_filter_align = None - self.average_alignment_score = None - self.percent_error_rate = None - - if html is not None: - self.set_elements_from_html(html) - if xml is not None: - self.set_elements(xml) - - def set_elements_from_html(self, data): - if not len(data) in (8,10): - raise RuntimeError("Summary.htm file format changed") - - # same in pre-0.3.0 Summary file and 0.3 summary file - self.lane = data[0] - - if len(data) == 8: - parsed_data = [ parse_mean_range(x) for x in data[1:] ] - # this is the < 0.3 Pipeline version - self.cluster = parsed_data[0] - self.average_first_cycle_intensity = parsed_data[1] - self.percent_intensity_after_20_cycles = parsed_data[2] - self.percent_pass_filter_clusters = parsed_data[3] - self.percent_pass_filter_align = parsed_data[4] - self.average_alignment_score = parsed_data[5] - self.percent_error_rate = parsed_data[6] - elif len(data) == 10: - parsed_data = [ parse_mean_range(x) for x in data[2:] ] - # this is the >= 0.3 summary file - self.lane_yield = data[1] - self.cluster = parsed_data[0] - self.cluster_pass_filter = parsed_data[1] - self.average_first_cycle_intensity = parsed_data[2] - self.percent_intensity_after_20_cycles = parsed_data[3] - self.percent_pass_filter_clusters = parsed_data[4] - self.percent_pass_filter_align = parsed_data[5] - self.average_alignment_score = parsed_data[6] - self.percent_error_rate = parsed_data[7] - - def get_elements(self): - lane_result = ElementTree.Element( - 
Summary.LaneResultSummary.LANE_RESULT_SUMMARY, - {'lane': self.lane}) - for tag, variable_name in Summary.LaneResultSummary.TAGS.items(): - value = getattr(self, variable_name) - if value is None: - continue - # it looks like a sequence - elif type(value) in (types.TupleType, types.ListType): - element = make_mean_range_element( - lane_result, - tag, - *value - ) - else: - element = ElementTree.SubElement(lane_result, tag) - element.text = value - return lane_result - - def set_elements(self, tree): - if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY: - raise ValueError('Expected %s' % ( - Summary.LaneResultSummary.LANE_RESULT_SUMMARY)) - self.lane = tree.attrib['lane'] - tags = Summary.LaneResultSummary.TAGS - for element in list(tree): - try: - variable_name = tags[element.tag] - setattr(self, variable_name, - parse_summary_element(element)) - except KeyError, e: - logging.warn('Unrecognized tag %s' % (element.tag,)) - - def __init__(self, filename=None, xml=None): - self.lane_results = {} - - if filename is not None: - self._extract_lane_results(filename) - if xml is not None: - self.set_elements(xml) - - def __getitem__(self, key): - return self.lane_results[key] - - def __len__(self): - return len(self.lane_results) - - def keys(self): - return self.lane_results.keys() - - def values(self): - return self.lane_results.values() - - def items(self): - return self.lane_results.items() - - def _flattened_row(self, row): - """ - flatten the children of a ... - """ - return [flatten(x) for x in row.getchildren() ] - - def _parse_table(self, table): - """ - assumes the first line is the header of a table, - and that the remaining rows are data - """ - rows = table.getchildren() - data = [] - for r in rows: - data.append(self._flattened_row(r)) - return data - - def _extract_named_tables(self, pathname): - """ - extract all the 'named' tables from a Summary.htm file - and return as a dictionary - - Named tables are

<h2>...</h2><table>...</table>
pairs - The contents of the h2 tag is considered to the name - of the table. - """ - tree = ElementTree.parse(pathname).getroot() - body = tree.find('body') - tables = {} - for i in range(len(body)): - if body[i].tag == 'h2' and body[i+1].tag == 'table': - # we have an interesting table - name = flatten(body[i]) - table = body[i+1] - data = self._parse_table(table) - tables[name] = data - return tables - - def _extract_lane_results(self, pathname): - """ - extract the Lane Results Summary table - """ - - tables = self._extract_named_tables(pathname) - - # parse lane result summary - lane_summary = tables['Lane Results Summary'] - # this is version 1 of the summary file - if len(lane_summary[-1]) == 8: - # strip header - headers = lane_summary[0] - # grab the lane by lane data - lane_summary = lane_summary[1:] - - # this is version 2 of the summary file - if len(lane_summary[-1]) == 10: - # lane_summary[0] is a different less specific header row - headers = lane_summary[1] - lane_summary = lane_summary[2:10] - # after the last lane, there's a set of chip wide averages - - for r in lane_summary: - lrs = Summary.LaneResultSummary(html=r) - self.lane_results[lrs.lane] = lrs - - def get_elements(self): - summary = ElementTree.Element(Summary.SUMMARY, - {'version': unicode(Summary.XML_VERSION)}) - for lane in self.lane_results.values(): - summary.append(lane.get_elements()) - return summary - - def set_elements(self, tree): - if tree.tag != Summary.SUMMARY: - return ValueError("Expected %s" % (Summary.SUMMARY,)) - xml_version = int(tree.attrib.get('version', 0)) - if xml_version > Summary.XML_VERSION: - logging.warn('Summary XML tree is a higher version than this class') - for element in list(tree): - lrs = Summary.LaneResultSummary() - lrs.set_elements(element) - self.lane_results[lrs.lane] = lrs - - def dump(self): - """ - Debugging function, report current object - """ - pass - - -def build_genome_fasta_map(genome_dir): - # build fasta to fasta file map - genome = genome_dir.split(os.path.sep)[-1] - fasta_map = {} - for vld_file in glob(os.path.join(genome_dir, '*.vld')): - is_link = False - if os.path.islink(vld_file): - is_link = True - vld_file = os.path.realpath(vld_file) - path, vld_name = os.path.split(vld_file) - name, ext = os.path.splitext(vld_name) - if is_link: - fasta_map[name] = name - else: - fasta_map[name] = os.path.join(genome, name) - return fasta_map - -class ElandLane(object): - """ - Process an eland result file - """ - XML_VERSION = 1 - LANE = 'ElandLane' - SAMPLE_NAME = 'SampleName' - LANE_ID = 'LaneID' - GENOME_MAP = 'GenomeMap' - GENOME_ITEM = 'GenomeItem' - MAPPED_READS = 'MappedReads' - MAPPED_ITEM = 'MappedItem' - MATCH_CODES = 'MatchCodes' - MATCH_ITEM = 'Code' - READS = 'Reads' - - def __init__(self, pathname=None, genome_map=None, xml=None): - self.pathname = pathname - self._sample_name = None - self._lane_id = None - self._reads = None - self._mapped_reads = None - self._match_codes = None - if genome_map is None: - genome_map = {} - self.genome_map = genome_map - - if xml is not None: - self.set_elements(xml) - - def _update(self): - """ - Actually read the file and actually count the reads - """ - # can't do anything if we don't have a file to process - if self.pathname is None: - return - - if os.stat(self.pathname)[stat.ST_SIZE] == 0: - raise RuntimeError("Eland isn't done, try again later.") - - reads = 0 - mapped_reads = {} - - match_codes = {'NM':0, 'QC':0, 'RM':0, - 'U0':0, 'U1':0, 'U2':0, - 'R0':0, 'R1':0, 'R2':0, - } - for line in 
autoopen(self.pathname,'r'): - reads += 1 - fields = line.split() - # code = fields[2] - # match_codes[code] = match_codes.setdefault(code, 0) + 1 - # the QC/NM etc codes are in the 3rd field and always present - match_codes[fields[2]] += 1 - # ignore lines that don't have a fasta filename - if len(fields) < 7: - continue - fasta = self.genome_map.get(fields[6], fields[6]) - mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1 - self._match_codes = match_codes - self._mapped_reads = mapped_reads - self._reads = reads - - def _update_name(self): - # extract the sample name - if self.pathname is None: - return - - path, name = os.path.split(self.pathname) - split_name = name.split('_') - self._sample_name = split_name[0] - self._lane_id = split_name[1] - - def _get_sample_name(self): - if self._sample_name is None: - self._update_name() - return self._sample_name - sample_name = property(_get_sample_name) - - def _get_lane_id(self): - if self._lane_id is None: - self._update_name() - return self._lane_id - lane_id = property(_get_lane_id) - - def _get_reads(self): - if self._reads is None: - self._update() - return self._reads - reads = property(_get_reads) - - def _get_mapped_reads(self): - if self._mapped_reads is None: - self._update() - return self._mapped_reads - mapped_reads = property(_get_mapped_reads) - - def _get_match_codes(self): - if self._match_codes is None: - self._update() - return self._match_codes - match_codes = property(_get_match_codes) - - def get_elements(self): - lane = ElementTree.Element(ElandLane.LANE, - {'version': - unicode(ElandLane.XML_VERSION)}) - sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME) - sample_tag.text = self.sample_name - lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID) - lane_tag.text = self.lane_id - genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP) - for k, v in self.genome_map.items(): - item = ElementTree.SubElement( - genome_map, ElandLane.GENOME_ITEM, - {'name':k, 'value':unicode(v)}) - mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS) - for k, v in self.mapped_reads.items(): - item = ElementTree.SubElement( - mapped_reads, ElandLane.MAPPED_ITEM, - {'name':k, 'value':unicode(v)}) - match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES) - for k, v in self.match_codes.items(): - item = ElementTree.SubElement( - match_codes, ElandLane.MATCH_ITEM, - {'name':k, 'value':unicode(v)}) - reads = ElementTree.SubElement(lane, ElandLane.READS) - reads.text = unicode(self.reads) - - return lane - - def set_elements(self, tree): - if tree.tag != ElandLane.LANE: - raise ValueError('Exptecting %s' % (ElandLane.LANE,)) - - # reset dictionaries - self._mapped_reads = {} - self._match_codes = {} - - for element in tree: - tag = element.tag.lower() - if tag == ElandLane.SAMPLE_NAME.lower(): - self._sample_name = element.text - elif tag == ElandLane.LANE_ID.lower(): - self._lane_id = element.text - elif tag == ElandLane.GENOME_MAP.lower(): - for child in element: - name = child.attrib['name'] - value = child.attrib['value'] - self.genome_map[name] = value - elif tag == ElandLane.MAPPED_READS.lower(): - for child in element: - name = child.attrib['name'] - value = child.attrib['value'] - self._mapped_reads[name] = int(value) - elif tag == ElandLane.MATCH_CODES.lower(): - for child in element: - name = child.attrib['name'] - value = int(child.attrib['value']) - self._match_codes[name] = value - elif tag == ElandLane.READS.lower(): - self._reads = int(element.text) - else: - 
logging.warn("ElandLane unrecognized tag %s" % (element.tag,)) - -def extract_eland_sequence(instream, outstream, start, end): - """ - Extract a chunk of sequence out of an eland file - """ - for line in instream: - record = line.split() - if len(record) > 1: - result = [record[0], record[1][start:end]] - else: - result = [record[0][start:end]] - outstream.write("\t".join(result)) - outstream.write(os.linesep) - -class ELAND(object): - """ - Summarize information from eland files - """ - XML_VERSION = 1 - - ELAND = 'ElandCollection' - LANE = 'Lane' - LANE_ID = 'id' - - def __init__(self, xml=None): - # we need information from the gerald config.xml - self.results = {} - - if xml is not None: - self.set_elements(xml) - - def __len__(self): - return len(self.results) - - def keys(self): - return self.results.keys() - - def values(self): - return self.results.values() - - def items(self): - return self.results.items() - - def __getitem__(self, key): - return self.results[key] - - def get_elements(self): - root = ElementTree.Element(ELAND.ELAND, - {'version': unicode(ELAND.XML_VERSION)}) - for lane_id, lane in self.results.items(): - eland_lane = lane.get_elements() - eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id) - root.append(eland_lane) - return root - - def set_elements(self, tree): - if tree.tag.lower() != ELAND.ELAND.lower(): - raise ValueError('Expecting %s', ELAND.ELAND) - for element in list(tree): - lane_id = element.attrib[ELAND.LANE_ID] - lane = ElandLane(xml=element) - self.results[lane_id] = lane - -def eland(basedir, gerald=None, genome_maps=None): - e = ELAND() - - file_list = glob(os.path.join(basedir, "*_eland_result.txt")) - if len(file_list) == 0: - # lets handle compressed eland files too - file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2")) - - for pathname in file_list: - # yes the lane_id is also being computed in ElandLane._update - # I didn't want to clutter up my constructor - # but I needed to persist the sample_name/lane_id for - # runfolder summary_report - path, name = os.path.split(pathname) - split_name = name.split('_') - lane_id = split_name[1] - - if genome_maps is not None: - genome_map = genome_maps[lane_id] - elif gerald is not None: - genome_dir = gerald.lanes[lane_id].eland_genome - genome_map = build_genome_fasta_map(genome_dir) - else: - genome_map = {} - - eland_result = ElandLane(pathname, genome_map) - e.results[lane_id] = eland_result - return e diff --git a/gaworkflow/pipeline/recipe_parser.py b/gaworkflow/pipeline/recipe_parser.py deleted file mode 100644 index 7f5ced6..0000000 --- a/gaworkflow/pipeline/recipe_parser.py +++ /dev/null @@ -1,48 +0,0 @@ -from xml import sax - - -def get_cycles(recipe_xml_filepath): - """ - returns the number of cycles found in Recipe*.xml - """ - handler = CycleXmlHandler() - sax.parse(recipe_xml_filepath, handler) - return handler.cycle_count - - - -class CycleXmlHandler(sax.ContentHandler): - - def __init__(self): - self.cycle_count = 0 - self.in_protocol = False - sax.ContentHandler.__init__(self) - - - def startDocument(self): - self.cycle_count = 0 - self.in_protocol = False - - - def startElement(self, name, attrs): - - #Only count Incorporations as cycles if within - # the protocol section of the xml document. - if name == "Incorporation" and self.in_protocol: - #print 'Found a cycle!' 
- self.cycle_count += 1 - return - - elif name == 'Protocol': - #print 'In protocol' - self.in_protocol = True - return - - #print 'Skipping: %s' % (name) - - - def endElement(self, name): - - if name == 'Protocol': - #print 'End protocol' - self.in_protocol = False diff --git a/gaworkflow/pipeline/retrieve_config.py b/gaworkflow/pipeline/retrieve_config.py deleted file mode 100644 index 72cff17..0000000 --- a/gaworkflow/pipeline/retrieve_config.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python - -from optparse import OptionParser, IndentedHelpFormatter -from ConfigParser import SafeConfigParser - -import logging -import os -import sys -import urllib2 - -CONFIG_SYSTEM = '/etc/ga_frontend/ga_frontend.conf' -CONFIG_USER = os.path.expanduser('~/.ga_frontend.conf') - -#Disable or enable commandline arg parsing; disabled by default. -DISABLE_CMDLINE = True - -class FlowCellNotFound(Exception): pass -class WebError404(Exception): pass - -class DummyOptions: - """ - Used when command line parsing is disabled; default - """ - def __init__(self): - self.url = None - self.output_filepath = None - self.flowcell = None - self.genome_dir = None - -class PreformattedDescriptionFormatter(IndentedHelpFormatter): - - #def format_description(self, description): - # - # if description: - # return description + "\n" - # else: - # return "" - - def format_epilog(self, epilog): - """ - It was removing my preformated epilog, so this should override - that behavior! Muhahaha! - """ - if epilog: - return "\n" + epilog + "\n" - else: - return "" - - -def constructOptionParser(): - """ - returns a pre-setup optparser - """ - global DISABLE_CMDLINE - - if DISABLE_CMDLINE: - return None - - parser = OptionParser(formatter=PreformattedDescriptionFormatter()) - - parser.set_description('Retrieves eland config file from ga_frontend web frontend.') - - parser.epilog = """ -Config File: - * %s (System wide) - * %s (User specific; overrides system) - * command line overrides all config file options - - Example Config File: - - [config_file_server] - base_host_url=http://somewhere.domain:port -""" % (CONFIG_SYSTEM, CONFIG_USER) - - #Special formatter for allowing preformatted description. - ##parser.format_epilog(PreformattedDescriptionFormatter()) - - parser.add_option("-u", "--url", - action="store", type="string", dest="url") - - parser.add_option("-o", "--output", - action="store", type="string", dest="output_filepath") - - parser.add_option("-f", "--flowcell", - action="store", type="string", dest="flowcell") - - parser.add_option("-g", "--genome_dir", - action="store", type="string", dest="genome_dir") - - #parser.set_default("url", "default") - - return parser - -def constructConfigParser(): - """ - returns a pre-setup config parser - """ - parser = SafeConfigParser() - parser.read([CONFIG_SYSTEM, CONFIG_USER]) - if not parser.has_section('config_file_server'): - parser.add_section('config_file_server') - if not parser.has_section('local_setup'): - parser.add_section('local_setup') - - return parser - - -def getCombinedOptions(): - """ - Returns optparse options after it has be updated with ConfigParser - config files and merged with parsed commandline options. 
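constructConfigParser() above reads the system file first and the user file second, so user settings override system ones. A standalone sketch of the same resolution (the host URL is invented):

    import os
    from ConfigParser import SafeConfigParser

    # a hypothetical ~/.ga_frontend.conf:
    #   [config_file_server]
    #   base_host_url=http://ga-frontend.example.edu:8000
    parser = SafeConfigParser()
    parser.read(['/etc/ga_frontend/ga_frontend.conf',
                 os.path.expanduser('~/.ga_frontend.conf')])
    if parser.has_option('config_file_server', 'base_host_url'):
        print parser.get('config_file_server', 'base_host_url')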
- """ - cl_parser = constructOptionParser() - conf_parser = constructConfigParser() - - if cl_parser is None: - options = DummyOptions() - else: - options, args = cl_parser.parse_args() - - if options.url is None: - if conf_parser.has_option('config_file_server', 'base_host_url'): - options.url = conf_parser.get('config_file_server', 'base_host_url') - - if options.genome_dir is None: - if conf_parser.has_option('local_setup', 'genome_dir'): - options.genome_dir = conf_parser.get('local_setup', 'genome_dir') - - print 'USING OPTIONS:' - print ' URL:', options.url - print ' OUT:', options.output_filepath - print ' FC:', options.flowcell - print 'GDIR:', options.genome_dir - print '' - - return options - - -def saveConfigFile(flowcell, base_host_url, output_filepath): - """ - retrieves the flowcell eland config file, give the base_host_url - (i.e. http://sub.domain.edu:port) - """ - url = base_host_url + '/eland_config/%s/' % (flowcell) - - f = open(output_filepath, 'w') - #try: - try: - web = urllib2.urlopen(url) - except urllib2.URLError, e: - errmsg = 'URLError: %d' % (e.code,) - logging.error(errmsg) - logging.error('opened %s' % (url,)) - logging.error('%s' % ( e.read(),)) - raise IOError(errmsg) - - #except IOError, msg: - # if str(msg).find("Connection refused") >= 0: - # print 'Error: Connection refused for: %s' % (url) - # f.close() - # sys.exit(1) - # elif str(msg).find("Name or service not known") >= 0: - # print 'Error: Invalid domain or ip address for: %s' % (url) - # f.close() - # sys.exit(2) - # else: - # raise IOError, msg - - data = web.read() - - if data.find('Hmm, config file for') >= 0: - msg = "Flowcell (%s) not found in DB; full url(%s)" % (flowcell, url) - raise FlowCellNotFound, msg - - if data.find('404 - Not Found') >= 0: - msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \ - "Did you get right port #?" 
% (flowcell, base_host_url, url) - raise FlowCellNotFound, msg - - f.write(data) - web.close() - f.close() - logging.info('Wrote config file to %s' % (output_filepath,)) - - diff --git a/gaworkflow/pipeline/run_status.py b/gaworkflow/pipeline/run_status.py deleted file mode 100644 index 39dc54c..0000000 --- a/gaworkflow/pipeline/run_status.py +++ /dev/null @@ -1,478 +0,0 @@ -import glob -import re -import os -import sys -import time -import threading - -s_comment = re.compile('^#') -s_general_read_len = re.compile('^READ_LENGTH ') -s_read_len = re.compile('^[1-8]+:READ_LENGTH ') - -s_firecrest = None - -def _four_digit_num_in_string(num): - if num < 0: - pass - elif num < 10: - return '000' + str(num) - elif num < 100: - return '00' + str(num) - elif num < 1000: - return '0' + str(num) - elif num < 10000: - return str(num) - - msg = 'Invalid number: %s' % (num) - raise ValueError, msg - -def _two_digit_num_in_string(num): - if num < 0: - pass - elif num < 10: - return '0' + str(num) - elif num < 100: - return str(num) - - msg = 'Invalid number: %s' % (num) - raise ValueError, msg - - -# FIRECREST PATTERNS -# _p2f(, lane, tile, cycle) -PATTERN_FIRECREST_QCM = 's_%s_%s_%s_qcm.xml' - -# _p2f(, lane, tile) -PATTERN_FIRECREST_INT = 's_%s_%s_02_int.txt' -PATTERN_FIRECREST_NSE = 's_%s_%s_nse.txt.gz' -PATTERN_FIRECREST_POS = 's_%s_%s_pos.txt' -PATTERN_FIRECREST_IDX = 's_%s_%s_idx.txt' -PATTERN_FIRECREST_CLU1 = 's_%s_%s_01_1_clu.txt' -PATTERN_FIRECREST_CLU2 = 's_%s_%s_01_2_clu.txt' -PATTERN_FIRECREST_CLU3 = 's_%s_%s_01_3_clu.txt' -PATTERN_FIRECREST_CLU4 = 's_%s_%s_01_4_clu.txt' - - -# BUSTARD PATTERNS -# _p2f(, lane, tile) -PATTERN_BUSTARD_SIG2 = 's_%s_%s_sig2.txt' -PATTERN_BUSTARD_PRB = 's_%s_%s_prb.txt' - - - -# GERALD PATTERNS -# _p2f(, lane, tile) -PATTERN_GERALD_ALLTMP = 's_%s_%s_all.txt.tmp' -PATTERN_GERALD_QRAWTMP = 's_%s_%s_qraw.txt.tmp' -PATTERN_GERALD_ALLPNGTMP = 's_%s_%s_all.tmp.png' -PATTERN_GERALD_ALIGNTMP = 's_%s_%s_align.txt.tmp' -PATTERN_GERALD_QVALTMP = 's_%s_%s_qval.txt.tmp' -PATTERN_GERALD_SCORETMP = 's_%s_%s_score.txt.tmp' -PATTERN_GERALD_PREALIGNTMP = 's_%s_%s_prealign.txt.tmp' -PATTERN_GERALD_REALIGNTMP = 's_%s_%s_realign.txt.tmp' -PATTERN_GERALD_RESCORETMP = 's_%s_%s_rescore.txt.tmp' -PATTERN_GERALD_RESCOREPNG = 's_%s_%s_rescore.png' -PATTERN_GERALD_ERRORSTMPPNG = 's_%s_%s_errors.tmp.png' -PATTERN_GERALD_QCALTMP = 's_%s_%s_qcal.txt.tmp' -PATTERN_GERALD_QVAL = 's_%s_%s_qval.txt' - -# _p2f(, lane) -PATTERN_GERALD_SEQPRETMP = 's_%s_seqpre.txt.tmp' -PATTERN_GERALD_RESULTTMP = 's_%s_eland_result.txt.tmp' -PATTERN_GERALD_SIGMEANSTMP = 's_%s_Signal_Means.txt.tmp' -PATTERN_GERALD_CALLPNG = 's_%s_call.png' -PATTERN_GERALD_ALLPNG = 's_%s_all.png' -PATTERN_GERALD_PERCENTALLPNG = 's_%s_percent_all.png' -PATTERN_GERALD_PERCENTCALLPNG = 's_%s_percent_call.png' -PATTERN_GERALD_PERCENTBASEPNG = 's_%s_percent_base.png' -PATTERN_GERALD_FILTTMP = 's_%s_filt.txt.tmp' -PATTERN_GERALD_FRAGTMP = 's_%s_frag.txt.tmp' -PATTERN_GERALD_QREPORTTMP = 's_%s_qreport.txt.tmp' -PATTERN_GERALD_QTABLETMP = 's_%s_qtable.txt.tmp' -PATTERN_GERALD_QCALREPORTTMP = 's_%s_qcalreport.txt.tmp' -PATTERN_GERALD_SEQUENCETMP = 's_%s_sequence.txt.tmp' -PATTERN_GERALD_LANEFINISHED = 's_%s_finished.txt' - - - -def _p2f(pattern, lane, tile=None, cycle=None): - """ - Converts a pattern plus info into file names - """ - - # lane, and cycle provided (INVALID) - if tile is None and cycle is not None: - msg = "Handling of cycle without tile is not currently implemented." 
- raise ValueError, msg - - # lane, tile, cycle provided - elif cycle: - return pattern % (lane, - _four_digit_num_in_string(tile), - _two_digit_num_in_string(cycle)) - - # lane, tile provided - elif tile: - return pattern % (lane, _four_digit_num_in_string(tile)) - - # lane provided - else: - return pattern % (lane) - - -class GARunStatus(object): - - def __init__(self, conf_filepath): - """ - Given an eland config file in the top level directory - of a run, predicts the files that will be generated - during a run and provides methods for retrieving - (completed, total) for each step or entire run. - """ - #print 'self._conf_filepath = %s' % (conf_filepath) - self._conf_filepath = conf_filepath - self._base_dir, junk = os.path.split(conf_filepath) - self._image_dir = os.path.join(self._base_dir, 'Images') - - self.lanes = [] - self.lane_read_length = {} - self.tiles = None - self.cycles = None - - self.status = {} - self.status['firecrest'] = {} - self.status['bustard'] = {} - self.status['gerald'] = {} - - self._process_config() - self._count_tiles() - self._count_cycles() - self._generate_expected() - - - def _process_config(self): - """ - Grabs info from self._conf_filepath - """ - f = open(self._conf_filepath, 'r') - - for line in f: - - #Skip comment lines for now. - if s_comment.search(line): - continue - - mo = s_general_read_len.search(line) - if mo: - read_length = int(line[mo.end():]) - #Handle general READ_LENGTH - for i in range(1,9): - self.lane_read_length[i] = read_length - - mo = s_read_len.search(line) - if mo: - read_length = int(line[mo.end():]) - lanes, junk = line.split(':') - - #Convert lanes from string of lanes to list of lane #s. - lanes = [ int(i) for i in lanes ] - - - for lane in lanes: - - #Keep track of which lanes are being run. - if lane not in self.lanes: - self.lanes.append(lane) - - #Update with lane specific read lengths - self.lane_read_length[lane] = read_length - - self.lanes.sort() - - - def _count_tiles(self): - """ - Count the number of tiles being used - """ - self.tiles = len(glob.glob(os.path.join(self._image_dir, - 'L001', - 'C1.1', - 's_1_*_a.tif'))) - - def _count_cycles(self): - """ - Figures out the number of cycles that are available - """ - #print 'self._image_dir = %s' % (self._image_dir) - cycle_dirs = glob.glob(os.path.join(self._image_dir, 'L001', 'C*.1')) - #print 'cycle_dirs = %s' % (cycle_dirs) - cycle_list = [] - for cycle_dir in cycle_dirs: - junk, c = os.path.split(cycle_dir) - cycle_list.append(int(c[1:c.find('.')])) - - self.cycles = max(cycle_list) - - - - - def _generate_expected(self): - """ - generates a list of files we expect to find. 
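A few expansion examples for the _p2f() helper above (the lane, tile, and cycle values are invented):

    assert _p2f(PATTERN_FIRECREST_QCM, 1, 2, 3) == 's_1_0002_03_qcm.xml'
    assert _p2f(PATTERN_BUSTARD_PRB, 4, 57) == 's_4_0057_prb.txt'
    assert _p2f(PATTERN_GERALD_LANEFINISHED, 8) == 's_8_finished.txt'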
- """ - - firecrest = self.status['firecrest'] - bustard = self.status['bustard'] - gerald = self.status['gerald'] - - - for lane in self.lanes: - for tile in range(1,self.tiles+1): - for cycle in range(1, self.cycles+1): - - ########################## - # LANE, TILE, CYCLE LAYER - - # FIRECREST - firecrest[_p2f(PATTERN_FIRECREST_QCM, lane, tile, cycle)] = False - - - ################### - # LANE, TILE LAYER - - # FIRECREST - firecrest[_p2f(PATTERN_FIRECREST_INT, lane, tile)] = False - firecrest[_p2f(PATTERN_FIRECREST_NSE, lane, tile)] = False - firecrest[_p2f(PATTERN_FIRECREST_POS, lane, tile)] = False - firecrest[_p2f(PATTERN_FIRECREST_IDX, lane, tile)] = False - firecrest[_p2f(PATTERN_FIRECREST_CLU1, lane, tile)] = False - firecrest[_p2f(PATTERN_FIRECREST_CLU2, lane, tile)] = False - firecrest[_p2f(PATTERN_FIRECREST_CLU3, lane, tile)] = False - firecrest[_p2f(PATTERN_FIRECREST_CLU4, lane, tile)] = False - - - # BUSTARD - bustard[_p2f(PATTERN_BUSTARD_SIG2, lane, tile)] = False - bustard[_p2f(PATTERN_BUSTARD_PRB, lane, tile)] = False - - - # GERALD - #gerald[_p2f(PATTERN_GERALD_ALLTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_QRAWTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_ALLPNGTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_ALIGNTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_QVALTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_SCORETMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_PREALIGNTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_REALIGNTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_RESCORETMP, lane, tile)] = False - gerald[_p2f(PATTERN_GERALD_RESCOREPNG, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_ERRORSTMPPNG, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_QCALTMP, lane, tile)] = False - #gerald[_p2f(PATTERN_GERALD_QVAL, lane, tile)] = False - - ################### - # LANE LAYER - - # GERALD - #gerald[_p2f(PATTERN_GERALD_SEQPRETMP, lane)] = False - #gerald[_p2f(PATTERN_GERALD_RESULTTMP, lane)] = False - #gerald[_p2f(PATTERN_GERALD_SIGMEANSTMP, lane)] = False - gerald[_p2f(PATTERN_GERALD_CALLPNG, lane)] = False - gerald[_p2f(PATTERN_GERALD_ALLPNG, lane)] = False - gerald[_p2f(PATTERN_GERALD_PERCENTALLPNG, lane)] = False - gerald[_p2f(PATTERN_GERALD_PERCENTCALLPNG, lane)] = False - gerald[_p2f(PATTERN_GERALD_PERCENTBASEPNG, lane)] = False - #gerald[_p2f(PATTERN_GERALD_FILTTMP, lane)] = False - #gerald[_p2f(PATTERN_GERALD_FRAGTMP, lane)] = False - #gerald[_p2f(PATTERN_GERALD_QREPORTTMP, lane)] = False - #gerald[_p2f(PATTERN_GERALD_QTABLETMP, lane)] = False - #gerald[_p2f(PATTERN_GERALD_QCALREPORTTMP, lane)] = False - #gerald[_p2f(PATTERN_GERALD_SEQUENCETMP, lane)] = False - gerald[_p2f(PATTERN_GERALD_LANEFINISHED, lane)] = False - - - - ################# - # LOOPS FINISHED - - # FIRECREST - firecrest['offsets_finished.txt'] = False - firecrest['finished.txt'] = False - - # BUSTARD - bustard['finished.txt'] = False - - # GERALD - gerald['tiles.txt'] = False - gerald['FullAll.htm'] = False - #gerald['All.htm.tmp'] = False - #gerald['Signal_Means.txt.tmp'] = False - #gerald['plotIntensity_for_IVC'] = False - #gerald['IVC.htm.tmp'] = False - gerald['FullError.htm'] = False - gerald['FullPerfect.htm'] = False - #gerald['Error.htm.tmp'] = False - #gerald['Perfect.htm.tmp'] = False - #gerald['Summary.htm.tmp'] = False - #gerald['Tile.htm.tmp'] = False - gerald['finished.txt'] = False - - def statusFirecrest(self): - """ - returns (, ) - """ - firecrest = self.status['firecrest'] - total = len(firecrest) - 
completed = firecrest.values().count(True) - - return (completed, total) - - - def statusBustard(self): - """ - returns (, ) - """ - bustard = self.status['bustard'] - total = len(bustard) - completed = bustard.values().count(True) - - return (completed, total) - - - def statusGerald(self): - """ - returns (, ) - """ - gerald = self.status['gerald'] - total = len(gerald) - completed = gerald.values().count(True) - - return (completed, total) - - - def statusTotal(self): - """ - returns (, ) - """ - #f = firecrest c = completed - #b = bustard t = total - #g = gerald - fc, ft = self.statusFirecrest() - bc, bt = self.statusBustard() - gc, gt = self.statusGerald() - - return (fc+bc+gc, ft+bt+gt) - - - def statusReport(self): - """ - Generate the basic percent complete report - """ - def _percentCompleted(completed, total): - """ - Returns precent completed as float - """ - return (completed / float(total)) * 100 - - fc, ft = self.statusFirecrest() - bc, bt = self.statusBustard() - gc, gt = self.statusGerald() - tc, tt = self.statusTotal() - - fp = _percentCompleted(fc, ft) - bp = _percentCompleted(bc, bt) - gp = _percentCompleted(gc, gt) - tp = _percentCompleted(tc, tt) - - report = ['Firecrest: %s%% (%s/%s)' % (fp, fc, ft), - ' Bustard: %s%% (%s/%s)' % (bp, bc, bt), - ' Gerald: %s%% (%s/%s)' % (gp, gc, gt), - '-----------------------', - ' Total: %s%% (%s/%s)' % (tp, tc, tt), - ] - return report - - def updateFirecrest(self, filename): - """ - Marks firecrest filename as being completed. - """ - self.status['firecrest'][filename] = True - - - def updateBustard(self, filename): - """ - Marks bustard filename as being completed. - """ - self.status['bustard'][filename] = True - - - def updateGerald(self, filename): - """ - Marks gerald filename as being completed. - """ - self.status['gerald'][filename] = True - - - -################################################## -# Functions to be called by Thread(target=) -def _cmdLineStatusMonitorFunc(conf_info): - """ - Given a ConfigInfo object, provides status to stdout. - - You should probably use startCmdLineStatusMonitor() - instead of ths function. - - Use with: - t = threading.Thread(target=_cmdLineStatusMonitorFunc, - args=[conf_info]) - t.setDaemon(True) - t.start() - """ - SLEEP_AMOUNT = 30 - - while 1: - if conf_info.status is None: - print "No status object yet." - time.sleep(SLEEP_AMOUNT) - continue - - report = conf_info.status.statusReport() - print os.linesep.join(report) - print - - time.sleep(SLEEP_AMOUNT) - - -############################################# -# Start monitor thread convenience functions -def startCmdLineStatusMonitor(conf_info): - """ - Starts a command line status monitor given a conf_info object. - """ - t = threading.Thread(target=_cmdLineStatusMonitorFunc, args=[conf_info]) - t.setDaemon(True) - t.start() - -from optparse import OptionParser -def make_parser(): - usage = "%prog: config file" - - parser = OptionParser() - return parser - -def main(cmdline=None): - parser = make_parser() - opt, args = parser.parse_args(cmdline) - - if len(args) != 1: - parser.error("need name of configuration file") - - status = GARunStatus(args[0]) - print os.linesep.join(status.statusReport()) - return 0 - -if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) - diff --git a/gaworkflow/pipeline/runfolder.py b/gaworkflow/pipeline/runfolder.py deleted file mode 100644 index 65f6191..0000000 --- a/gaworkflow/pipeline/runfolder.py +++ /dev/null @@ -1,313 +0,0 @@ -""" -Core information needed to inspect a runfolder. 
-""" -from glob import glob -import logging -import os -import re -import shutil -import stat -import subprocess -import sys -import time - -try: - from xml.etree import ElementTree -except ImportError, e: - from elementtree import ElementTree - -EUROPEAN_STRPTIME = "%d-%m-%Y" -EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})" -VERSION_RE = "([0-9\.]+)" -USER_RE = "([a-zA-Z0-9]+)" -LANES_PER_FLOWCELL = 8 - -from gaworkflow.util.alphanum import alphanum -from gaworkflow.util.ethelp import indent, flatten - - -class PipelineRun(object): - """ - Capture "interesting" information about a pipeline run - """ - XML_VERSION = 1 - PIPELINE_RUN = 'PipelineRun' - FLOWCELL_ID = 'FlowcellID' - - def __init__(self, pathname=None, firecrest=None, bustard=None, gerald=None, xml=None): - if pathname is not None: - self.pathname = os.path.normpath(pathname) - else: - self.pathname = None - self._name = None - self._flowcell_id = None - self.firecrest = firecrest - self.bustard = bustard - self.gerald = gerald - - if xml is not None: - self.set_elements(xml) - - def _get_flowcell_id(self): - # extract flowcell ID - if self._flowcell_id is None: - config_dir = os.path.join(self.pathname, 'Config') - flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml') - if os.path.exists(flowcell_id_path): - flowcell_id_tree = ElementTree.parse(flowcell_id_path) - self._flowcell_id = flowcell_id_tree.findtext('Text') - else: - path_fields = self.pathname.split('_') - if len(path_fields) > 0: - # guessing last element of filename - flowcell_id = path_fields[-1] - else: - flowcell_id = 'unknown' - - logging.warning( - "Flowcell id was not found, guessing %s" % ( - flowcell_id)) - self._flowcell_id = flowcell_id - return self._flowcell_id - flowcell_id = property(_get_flowcell_id) - - def get_elements(self): - """ - make one master xml file from all of our sub-components. 
- """ - root = ElementTree.Element(PipelineRun.PIPELINE_RUN) - flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID) - flowcell.text = self.flowcell_id - root.append(self.firecrest.get_elements()) - root.append(self.bustard.get_elements()) - root.append(self.gerald.get_elements()) - return root - - def set_elements(self, tree): - # this file gets imported by all the others, - # so we need to hide the imports to avoid a cyclic imports - from gaworkflow.pipeline import firecrest - from gaworkflow.pipeline import bustard - from gaworkflow.pipeline import gerald - - tag = tree.tag.lower() - if tag != PipelineRun.PIPELINE_RUN.lower(): - raise ValueError('Pipeline Run Expecting %s got %s' % ( - PipelineRun.PIPELINE_RUN, tag)) - for element in tree: - tag = element.tag.lower() - if tag == PipelineRun.FLOWCELL_ID.lower(): - self._flowcell_id = element.text - #ok the xword.Xword.XWORD pattern for module.class.constant is lame - elif tag == firecrest.Firecrest.FIRECREST.lower(): - self.firecrest = firecrest.Firecrest(xml=element) - elif tag == bustard.Bustard.BUSTARD.lower(): - self.bustard = bustard.Bustard(xml=element) - elif tag == gerald.Gerald.GERALD.lower(): - self.gerald = gerald.Gerald(xml=element) - else: - logging.warn('PipelineRun unrecognized tag %s' % (tag,)) - - def _get_run_name(self): - """ - Given a run tuple, find the latest date and use that as our name - """ - if self._name is None: - tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time) - timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax)) - self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml' - return self._name - name = property(_get_run_name) - - def save(self, destdir=None): - if destdir is None: - destdir = '' - logging.info("Saving run report "+ self.name) - xml = self.get_elements() - indent(xml) - dest_pathname = os.path.join(destdir, self.name) - ElementTree.ElementTree(xml).write(dest_pathname) - - def load(self, filename): - logging.info("Loading run report from " + filename) - tree = ElementTree.parse(filename).getroot() - self.set_elements(tree) - -def get_runs(runfolder): - """ - Search through a run folder for all the various sub component runs - and then return a PipelineRun for each different combination. - - For example if there are two different GERALD runs, this will - generate two different PipelineRun objects, that differ - in there gerald component. - """ - from gaworkflow.pipeline import firecrest - from gaworkflow.pipeline import bustard - from gaworkflow.pipeline import gerald - - datadir = os.path.join(runfolder, 'Data') - - logging.info('Searching for runs in ' + datadir) - runs = [] - for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")): - f = firecrest.firecrest(firecrest_pathname) - bustard_glob = os.path.join(firecrest_pathname, "Bustard*") - for bustard_pathname in glob(bustard_glob): - b = bustard.bustard(bustard_pathname) - gerald_glob = os.path.join(bustard_pathname, 'GERALD*') - for gerald_pathname in glob(gerald_glob): - try: - g = gerald.gerald(gerald_pathname) - runs.append(PipelineRun(runfolder, f, b, g)) - except IOError, e: - print "Ignoring", str(e) - return runs - - -def extract_run_parameters(runs): - """ - Search through runfolder_path for various runs and grab their parameters - """ - for run in runs: - run.save() - -def summarize_mapped_reads(mapped_reads): - """ - Summarize per chromosome reads into a genome count - But handle spike-in/contamination symlinks seperately. 
- """ - summarized_reads = {} - genome_reads = 0 - genome = 'unknown' - for k, v in mapped_reads.items(): - path, k = os.path.split(k) - if len(path) > 0: - genome = path - genome_reads += v - else: - summarized_reads[k] = summarized_reads.setdefault(k, 0) + v - summarized_reads[genome] = genome_reads - return summarized_reads - -def summary_report(runs): - """ - Summarize cluster numbers and mapped read counts for a runfolder - """ - report = [] - for run in runs: - # print a run name? - report.append('Summary for %s' % (run.name,)) - # sort the report - eland_keys = run.gerald.eland_results.results.keys() - eland_keys.sort(alphanum) - - lane_results = run.gerald.summary.lane_results - for lane_id in eland_keys: - result = run.gerald.eland_results.results[lane_id] - report.append("Sample name %s" % (result.sample_name)) - report.append("Lane id %s" % (result.lane_id,)) - cluster = lane_results[result.lane_id].cluster - report.append("Clusters %d +/- %d" % (cluster[0], cluster[1])) - report.append("Total Reads: %d" % (result.reads)) - mc = result._match_codes - nm = mc['NM'] - nm_percent = float(nm)/result.reads * 100 - qc = mc['QC'] - qc_percent = float(qc)/result.reads * 100 - - report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent)) - report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent)) - report.append('Unique (0,1,2 mismatches) %d %d %d' % \ - (mc['U0'], mc['U1'], mc['U2'])) - report.append('Repeat (0,1,2 mismatches) %d %d %d' % \ - (mc['R0'], mc['R1'], mc['R2'])) - report.append("Mapped Reads") - mapped_reads = summarize_mapped_reads(result.mapped_reads) - for name, counts in mapped_reads.items(): - report.append(" %s: %d" % (name, counts)) - report.append('---') - report.append('') - return os.linesep.join(report) - -def extract_results(runs, output_base_dir=None): - if output_base_dir is None: - output_base_dir = os.getcwd() - - for r in runs: - result_dir = os.path.join(output_base_dir, r.flowcell_id) - logging.info("Using %s as result directory" % (result_dir,)) - if not os.path.exists(result_dir): - os.mkdir(result_dir) - - # create cycle_dir - cycle = "C%d-%d" % (r.firecrest.start, r.firecrest.stop) - logging.info("Filling in %s" % (cycle,)) - cycle_dir = os.path.join(result_dir, cycle) - if os.path.exists(cycle_dir): - logging.error("%s already exists, not overwriting" % (cycle_dir,)) - continue - else: - os.mkdir(cycle_dir) - - # copy stuff out of the main run - g = r.gerald - - # save run file - r.save(cycle_dir) - - # Copy Summary.htm - summary_path = os.path.join(r.gerald.pathname, 'Summary.htm') - if os.path.exists(summary_path): - logging.info('Copying %s to %s' % (summary_path, cycle_dir)) - shutil.copy(summary_path, cycle_dir) - else: - logging.info('Summary file %s was not found' % (summary_path,)) - - # tar score files - score_files = [] - for f in os.listdir(g.pathname): - if re.match('.*_score.txt', f): - score_files.append(f) - - tar_cmd = ['/bin/tar', 'c'] + score_files - bzip_cmd = [ 'bzip2', '-9', '-c' ] - tar_dest_name =os.path.join(cycle_dir, 'scores.tar.bz2') - tar_dest = open(tar_dest_name, 'w') - logging.info("Compressing score files in %s" % (g.pathname,)) - logging.info("Running tar: " + " ".join(tar_cmd[:10])) - logging.info("Running bzip2: " + " ".join(bzip_cmd)) - logging.info("Writing to %s" %(tar_dest_name)) - - tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, cwd=g.pathname) - bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest) - tar.wait() - - # copy & bzip eland files - for eland_lane in 
g.eland_results.values(): - source_name = eland_lane.pathname - path, name = os.path.split(eland_lane.pathname) - dest_name = os.path.join(cycle_dir, name+'.bz2') - - args = ['bzip2', '-9', '-c', source_name] - logging.info('Running: %s' % ( " ".join(args) )) - bzip_dest = open(dest_name, 'w') - bzip = subprocess.Popen(args, stdout=bzip_dest) - logging.info('Saving to %s' % (dest_name, )) - bzip.wait() - -def clean_runs(runs): - """ - Clean up run folders to optimize for compression. - """ - # TODO: implement this. - # rm RunLog*.xml - # rm pipeline_*.txt - # rm gclog.txt - # rm NetCopy.log - # rm nfn.log - # rm Images/L* - # cd Data/C1-*_Firecrest* - # make clean_intermediate - - pass diff --git a/gaworkflow/pipeline/test/test_genome_mapper.py b/gaworkflow/pipeline/test/test_genome_mapper.py deleted file mode 100644 index c8366d1..0000000 --- a/gaworkflow/pipeline/test/test_genome_mapper.py +++ /dev/null @@ -1,33 +0,0 @@ -import unittest - -from StringIO import StringIO -from gaworkflow.pipeline import genome_mapper - -class testGenomeMapper(unittest.TestCase): - def test_construct_mapper(self): - genomes = { - 'Arabidopsis thaliana': {'v01212004': '/arabidopsis'}, - 'Homo sapiens': {'hg18': '/hg18'}, - 'Mus musculus': {'mm8': '/mm8', - 'mm9': '/mm9', - 'mm10': '/mm10'}, - 'Phage': {'174': '/phi'}, - } - genome_map = genome_mapper.constructMapperDict(genomes) - - self.failUnlessEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8") - self.failUnlessEqual("%(Phage|174)s" % (genome_map), "/phi") - self.failUnlessEqual("%(Mus musculus)s" % (genome_map), "/mm10") - self.failUnlessEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8") - self.failUnlessEqual("%(Mus musculus|mm10)s" % (genome_map), "/mm10") - - self.failUnlessEqual(len(genome_map.keys()), 6) - self.failUnlessEqual(len(genome_map.values()), 6) - self.failUnlessEqual(len(genome_map.items()), 6) - - -def suite(): - return unittest.makeSuite(testGenomeMapper,'test') - -if __name__ == "__main__": - unittest.main(defaultTest="suite") diff --git a/gaworkflow/pipeline/test/test_runfolder026.py b/gaworkflow/pipeline/test/test_runfolder026.py deleted file mode 100644 index f539cfa..0000000 --- a/gaworkflow/pipeline/test/test_runfolder026.py +++ /dev/null @@ -1,601 +0,0 @@ -#!/usr/bin/env python - -from datetime import datetime, date -import os -import tempfile -import shutil -import unittest - -from gaworkflow.pipeline import firecrest -from gaworkflow.pipeline import bustard -from gaworkflow.pipeline import gerald -from gaworkflow.pipeline import runfolder -from gaworkflow.pipeline.runfolder import ElementTree - - -def make_flowcell_id(runfolder_dir, flowcell_id=None): - if flowcell_id is None: - flowcell_id = '207BTAAXY' - - config = """ - - %s -""" % (flowcell_id,) - config_dir = os.path.join(runfolder_dir, 'Config') - - if not os.path.exists(config_dir): - os.mkdir(config_dir) - pathname = os.path.join(config_dir, 'FlowcellId.xml') - f = open(pathname,'w') - f.write(config) - f.close() - -def make_matrix(matrix_dir): - contents = """# Auto-generated frequency response matrix -> A -> C -> G -> T -0.77 0.15 -0.04 -0.04 -0.76 1.02 -0.05 -0.06 --0.10 -0.10 1.17 -0.03 --0.13 -0.12 0.80 1.27 -""" - s_matrix = os.path.join(matrix_dir, 's_matrix.txt') - f = open(s_matrix, 'w') - f.write(contents) - f.close() - -def make_phasing_params(bustard_dir): - for lane in range(1,9): - pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane)) - f = open(pathname, 'w') - f.write(""" - 0.009900 - 0.003500 - -""") - f.close() - -def 
make_gerald_config(gerald_dir): - config_xml = """ - - default - - - - - Need_to_specify_ELAND_genome_directory - 8 - - domain.com - diane - localhost:25 - /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane - /home/diane/gec - 1 - /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes - Need_to_specify_genome_file_name - genome - /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane - - _prb.txt - 12 - '((CHASTITY>=0.6))' - _qhg.txt - --symbolic - 32 - --scarf - _seq.txt - _sig2.txt - _sig.txt - @(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp - s_[1-8]_[0-9][0-9][0-9][0-9] - s - Sat Apr 19 19:08:30 2008 - /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald - all - http://host.domain.com/yourshare/ - - - - eland - eland - eland - eland - eland - eland - eland - eland - - - /g/dm3 - /g/equcab1 - /g/equcab1 - /g/canfam2 - /g/hg18 - /g/hg18 - /g/hg18 - /g/hg18 - - - 32 - 32 - 32 - 32 - 32 - 32 - 32 - 32 - - - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - - - -""" - pathname = os.path.join(gerald_dir, 'config.xml') - f = open(pathname,'w') - f.write(config_xml) - f.close() - - -def make_summary_htm(gerald_dir): - summary_htm = """ - - - - -

080416_HWI-EAS229_0024_207BTAAXX Summary

Summary Information For Experiment 080416_HWI-EAS229_0024_207BTAAXX on Machine HWI-EAS229

Chip Summary

Machine     HWI-EAS229
Run Folder  080416_HWI-EAS229_0024_207BTAAXX
Chip ID     unknown
Lane Parameter Summary

Lane  Sample ID  Sample Target  Sample Type  Length  Filter               Tiles
1     unknown    dm3            ELAND        32      '((CHASTITY>=0.6))'  Lane 1
2     unknown    equcab1        ELAND        32      '((CHASTITY>=0.6))'  Lane 2
3     unknown    equcab1        ELAND        32      '((CHASTITY>=0.6))'  Lane 3
4     unknown    canfam2        ELAND        32      '((CHASTITY>=0.6))'  Lane 4
5     unknown    hg18           ELAND        32      '((CHASTITY>=0.6))'  Lane 5
6     unknown    hg18           ELAND        32      '((CHASTITY>=0.6))'  Lane 6
7     unknown    hg18           ELAND        32      '((CHASTITY>=0.6))'  Lane 7
8     unknown    hg18           ELAND        32      '((CHASTITY>=0.6))'  Lane 8
Lane Results Summary

Lane  Clusters        Av 1st Cycle Int  % intensity after 20 cycles  % PF Clusters    % Align (PF)     Av Alignment Score (PF)  % Error Rate (PF)
1     17421 +/- 2139  7230 +/- 801      23.73 +/- 10.79              13.00 +/- 22.91  32.03 +/- 18.45  6703.57 +/- 3753.85      4.55 +/- 4.81
2     20311 +/- 2402  7660 +/- 678      17.03 +/- 4.40               40.74 +/- 30.33  29.54 +/- 9.03   5184.02 +/- 1631.54      3.27 +/- 3.94
3     20193 +/- 2399  7700 +/- 797      15.75 +/- 3.30               56.56 +/- 17.16  27.33 +/- 7.48   4803.49 +/- 1313.31      3.07 +/- 2.86
4     15537 +/- 2531  7620 +/- 1392     15.37 +/- 3.79               63.05 +/- 18.30  15.88 +/- 4.99   3162.13 +/- 962.59       3.11 +/- 2.22
5     32047 +/- 3356  8093 +/- 831      23.79 +/- 6.18               53.36 +/- 18.06  48.04 +/- 13.77  9866.23 +/- 2877.30      2.26 +/- 1.16
6     32946 +/- 4753  8227 +/- 736      24.07 +/- 4.69               54.65 +/- 12.57  50.98 +/- 10.54  10468.86 +/- 2228.53     2.21 +/- 2.33
7     39504 +/- 4171  8401 +/- 785      22.55 +/- 4.56               45.22 +/- 10.34  48.41 +/- 9.67   9829.40 +/- 1993.20      2.26 +/- 1.11
8     37998 +/- 3792  8443 +/- 1211     39.03 +/- 7.52               42.16 +/- 12.35  40.98 +/- 14.89  8128.87 +/- 3055.34      3.57 +/- 2.77
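[Editor's note: the gerald summary parser exercised by the tests further down reduces each "mean +/- SD" cell in this table to a pair of numbers; lane 1's cluster cell becomes the tuple (17421, 2139) asserted in test_gerald. A minimal sketch of that extraction, using a hypothetical helper name that is not part of the deleted module:

    import re

    def parse_mean_sd(cell):
        # "17421 +/- 2139" -> (17421, 2139); fractional cells become floats
        m = re.match(r'([-0-9.]+)\s*\+/-\s*([-0-9.]+)$', cell.strip())
        if m is None:
            raise ValueError('not a "mean +/- SD" cell: %r' % (cell,))
        num = lambda s: float(s) if '.' in s else int(s)
        return (num(m.group(1)), num(m.group(2)))

    # parse_mean_sd('17421 +/- 2139') == (17421, 2139)
]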
- - -""" - pathname = os.path.join(gerald_dir, 'Summary.htm') - f = open(pathname, 'w') - f.write(summary_htm) - f.close() - -def make_eland_results(gerald_dir): - eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D. ->HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T ->HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0 ->HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T -""" - for i in range(1,9): - pathname = os.path.join(gerald_dir, - 's_%d_eland_result.txt' % (i,)) - f = open(pathname, 'w') - f.write(eland_result) - f.close() - -class RunfolderTests(unittest.TestCase): - """ - Test components of the runfolder processing code - which includes firecrest, bustard, and gerald - """ - def setUp(self): - # make a fake runfolder directory - self.temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_') - - self.runfolder_dir = os.path.join(self.temp_dir, - '080102_HWI-EAS229_0010_207BTAAXX') - os.mkdir(self.runfolder_dir) - - self.data_dir = os.path.join(self.runfolder_dir, 'Data') - os.mkdir(self.data_dir) - - self.firecrest_dir = os.path.join(self.data_dir, - 'C1-33_Firecrest1.8.28_12-04-2008_diane' - ) - os.mkdir(self.firecrest_dir) - self.matrix_dir = os.path.join(self.firecrest_dir, 'Matrix') - os.mkdir(self.matrix_dir) - make_matrix(self.matrix_dir) - - self.bustard_dir = os.path.join(self.firecrest_dir, - 'Bustard1.8.28_12-04-2008_diane') - os.mkdir(self.bustard_dir) - make_phasing_params(self.bustard_dir) - - self.gerald_dir = os.path.join(self.bustard_dir, - 'GERALD_12-04-2008_diane') - os.mkdir(self.gerald_dir) - make_gerald_config(self.gerald_dir) - make_summary_htm(self.gerald_dir) - make_eland_results(self.gerald_dir) - - def tearDown(self): - shutil.rmtree(self.temp_dir) - - def test_firecrest(self): - """ - Construct a firecrest object - """ - f = firecrest.firecrest(self.firecrest_dir) - self.failUnlessEqual(f.version, '1.8.28') - self.failUnlessEqual(f.start, 1) - self.failUnlessEqual(f.stop, 33) - self.failUnlessEqual(f.user, 'diane') - self.failUnlessEqual(f.date, date(2008,4,12)) - - xml = f.get_elements() - # just make sure that element tree can serialize the tree - xml_str = ElementTree.tostring(xml) - - f2 = firecrest.Firecrest(xml=xml) - self.failUnlessEqual(f.version, f2.version) - self.failUnlessEqual(f.start, f2.start) - self.failUnlessEqual(f.stop, f2.stop) - self.failUnlessEqual(f.user, f2.user) - self.failUnlessEqual(f.date, f2.date) - - def test_bustard(self): - """ - construct a bustard object - """ - b = bustard.bustard(self.bustard_dir) - self.failUnlessEqual(b.version, '1.8.28') - self.failUnlessEqual(b.date, date(2008,4,12)) - self.failUnlessEqual(b.user, 'diane') - self.failUnlessEqual(len(b.phasing), 8) - self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099) - - xml = b.get_elements() - b2 = bustard.Bustard(xml=xml) - self.failUnlessEqual(b.version, b2.version) - self.failUnlessEqual(b.date, b2.date ) - self.failUnlessEqual(b.user, b2.user) - self.failUnlessEqual(len(b.phasing), len(b2.phasing)) - for key in b.phasing.keys(): - self.failUnlessEqual(b.phasing[key].lane, - b2.phasing[key].lane) - self.failUnlessEqual(b.phasing[key].phasing, - b2.phasing[key].phasing) - self.failUnlessEqual(b.phasing[key].prephasing, - b2.phasing[key].prephasing) - - def test_gerald(self): - # need to update gerald and make tests for it - g = gerald.gerald(self.gerald_dir) 
- - self.failUnlessEqual(g.version, - '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp') - self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30)) - self.failUnlessEqual(len(g.lanes), len(g.lanes.keys())) - self.failUnlessEqual(len(g.lanes), len(g.lanes.items())) - - - # list of genomes, matches what was defined up in - # make_gerald_config. - # the first None is to offset the genomes list to be 1..9 - # instead of pythons default 0..8 - genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2', - '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ] - - # test lane specific parameters from gerald config file - for i in range(1,9): - cur_lane = g.lanes[str(i)] - self.failUnlessEqual(cur_lane.analysis, 'eland') - self.failUnlessEqual(cur_lane.eland_genome, genomes[i]) - self.failUnlessEqual(cur_lane.read_length, '32') - self.failUnlessEqual(cur_lane.use_bases, 'Y'*32) - - # test data extracted from summary file - clusters = [None, - (17421, 2139), (20311, 2402), (20193, 2399), (15537, 2531), - (32047, 3356), (32946, 4753), (39504, 4171), (37998, 3792)] - - for i in range(1,9): - summary_lane = g.summary[str(i)] - self.failUnlessEqual(summary_lane.cluster, clusters[i]) - self.failUnlessEqual(summary_lane.lane, str(i)) - - xml = g.get_elements() - # just make sure that element tree can serialize the tree - xml_str = ElementTree.tostring(xml) - g2 = gerald.Gerald(xml=xml) - - # do it all again after extracting from the xml file - self.failUnlessEqual(g.version, g2.version) - self.failUnlessEqual(g.date, g2.date) - self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys())) - self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items())) - - # test lane specific parameters from gerald config file - for i in range(1,9): - g_lane = g.lanes[str(i)] - g2_lane = g2.lanes[str(i)] - self.failUnlessEqual(g_lane.analysis, g2_lane.analysis) - self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome) - self.failUnlessEqual(g_lane.read_length, g2_lane.read_length) - self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases) - - # test (some) summary elements - for i in range(1,9): - g_summary = g.summary[str(i)] - g2_summary = g2.summary[str(i)] - self.failUnlessEqual(g_summary.cluster, g2_summary.cluster) - self.failUnlessEqual(g_summary.lane, g2_summary.lane) - - g_eland = g.eland_results - g2_eland = g2.eland_results - for lane in g_eland.keys(): - self.failUnlessEqual(g_eland[lane].reads, - g2_eland[lane].reads) - self.failUnlessEqual(len(g_eland[lane].mapped_reads), - len(g2_eland[lane].mapped_reads)) - for k in g_eland[lane].mapped_reads.keys(): - self.failUnlessEqual(g_eland[lane].mapped_reads[k], - g2_eland[lane].mapped_reads[k]) - - self.failUnlessEqual(len(g_eland[lane].match_codes), - len(g2_eland[lane].match_codes)) - for k in g_eland[lane].match_codes.keys(): - self.failUnlessEqual(g_eland[lane].match_codes[k], - g2_eland[lane].match_codes[k]) - - - def test_eland(self): - dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa', - 'chr2L.fa': 'dm3/chr2L.fa', - 'Lambda.fa': 'Lambda.fa'} - genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map, - '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map } - eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps) - - for i in range(1,9): - lane = eland[str(i)] - self.failUnlessEqual(lane.reads, 4) - self.failUnlessEqual(lane.sample_name, "s") - self.failUnlessEqual(lane.lane_id, unicode(i)) - self.failUnlessEqual(len(lane.mapped_reads), 3) - self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1) - 
self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1) - self.failUnlessEqual(lane.match_codes['U1'], 2) - self.failUnlessEqual(lane.match_codes['NM'], 1) - - xml = eland.get_elements() - # just make sure that element tree can serialize the tree - xml_str = ElementTree.tostring(xml) - e2 = gerald.ELAND(xml=xml) - - for i in range(1,9): - l1 = eland[str(i)] - l2 = e2[str(i)] - self.failUnlessEqual(l1.reads, l2.reads) - self.failUnlessEqual(l1.sample_name, l2.sample_name) - self.failUnlessEqual(l1.lane_id, l2.lane_id) - self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) - self.failUnlessEqual(len(l1.mapped_reads), 3) - for k in l1.mapped_reads.keys(): - self.failUnlessEqual(l1.mapped_reads[k], - l2.mapped_reads[k]) - - self.failUnlessEqual(len(l1.match_codes), 9) - self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes)) - for k in l1.match_codes.keys(): - self.failUnlessEqual(l1.match_codes[k], - l2.match_codes[k]) - - def test_runfolder(self): - runs = runfolder.get_runs(self.runfolder_dir) - - # do we get the flowcell id from the filename? - self.failUnlessEqual(len(runs), 1) - self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml') - - # do we get the flowcell id from the FlowcellId.xml file - make_flowcell_id(self.runfolder_dir, '207BTAAXY') - runs = runfolder.get_runs(self.runfolder_dir) - self.failUnlessEqual(len(runs), 1) - self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml') - - r1 = runs[0] - xml = r1.get_elements() - xml_str = ElementTree.tostring(xml) - - r2 = runfolder.PipelineRun(xml=xml) - self.failUnlessEqual(r1.name, r2.name) - self.failIfEqual(r2.firecrest, None) - self.failIfEqual(r2.bustard, None) - self.failIfEqual(r2.gerald, None) - - -def suite(): - return unittest.makeSuite(RunfolderTests,'test') - -if __name__ == "__main__": - unittest.main(defaultTest="suite") - diff --git a/gaworkflow/pipeline/test/test_runfolder030.py b/gaworkflow/pipeline/test/test_runfolder030.py deleted file mode 100644 index 5078412..0000000 --- a/gaworkflow/pipeline/test/test_runfolder030.py +++ /dev/null @@ -1,1024 +0,0 @@ -#!/usr/bin/env python - -from datetime import datetime, date -import os -import tempfile -import shutil -import unittest - -from gaworkflow.pipeline import firecrest -from gaworkflow.pipeline import bustard -from gaworkflow.pipeline import gerald -from gaworkflow.pipeline import runfolder -from gaworkflow.pipeline.runfolder import ElementTree - - -def make_flowcell_id(runfolder_dir, flowcell_id=None): - if flowcell_id is None: - flowcell_id = '207BTAAXY' - - config = """ - - %s -""" % (flowcell_id,) - config_dir = os.path.join(runfolder_dir, 'Config') - - if not os.path.exists(config_dir): - os.mkdir(config_dir) - pathname = os.path.join(config_dir, 'FlowcellId.xml') - f = open(pathname,'w') - f.write(config) - f.close() - -def make_matrix(matrix_dir): - contents = """# Auto-generated frequency response matrix -> A -> C -> G -> T -0.77 0.15 -0.04 -0.04 -0.76 1.02 -0.05 -0.06 --0.10 -0.10 1.17 -0.03 --0.13 -0.12 0.80 1.27 -""" - s_matrix = os.path.join(matrix_dir, 's_matrix.txt') - f = open(s_matrix, 'w') - f.write(contents) - f.close() - -def make_phasing_params(bustard_dir): - for lane in range(1,9): - pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane)) - f = open(pathname, 'w') - f.write(""" - 0.009900 - 0.003500 - -""") - f.close() - -def make_gerald_config(gerald_dir): - config_xml = """ - - default - - - - - Need_to_specify_ELAND_genome_directory - 8 - - domain.com - diane - localhost:25 - 
/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane - /home/diane/gec - 1 - /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes - Need_to_specify_genome_file_name - genome - /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane - - _prb.txt - 12 - '((CHASTITY>=0.6))' - _qhg.txt - --symbolic - 32 - --scarf - _seq.txt - _sig2.txt - _sig.txt - @(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp - s_[1-8]_[0-9][0-9][0-9][0-9] - s - Sat Apr 19 19:08:30 2008 - /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald - all - http://host.domain.com/yourshare/ - - - - eland - eland - eland - eland - eland - eland - eland - eland - - - /g/dm3 - /g/equcab1 - /g/equcab1 - /g/canfam2 - /g/hg18 - /g/hg18 - /g/hg18 - /g/hg18 - - - 32 - 32 - 32 - 32 - 32 - 32 - 32 - 32 - - - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY - - - -""" - pathname = os.path.join(gerald_dir, 'config.xml') - f = open(pathname,'w') - f.write(config_xml) - f.close() - -def make_summary_htm(gerald_dir): - summary_htm=""" - - - - -

080627_HWI-EAS229_0036_3055HAXX Summary

Summary Information For Experiment 080627_HWI-EAS229_0036_3055HAXX on Machine HWI-EAS229

Chip Summary

Machine     HWI-EAS229
Run Folder  080627_HWI-EAS229_0036_3055HAXX
Chip ID     unknown

Chip Results Summary

Clusters  Clusters (PF)  Yield (kbases)
80933224  43577803       1133022
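[Editor's note: a sanity check on this fixture: the chip totals above are the per-lane tile means from the Lane Results Summary below, scaled by the 100 tiles per lane, agreeing to within rounding of the tile means. An illustrative snippet, not part of the deleted test:

    raw_tile_means = [96483, 133738, 152142, 15784, 119735, 152177, 84649, 54622]
    pf_tile_means = [60787, 60217, 71468, 13443, 64590, 66716, 57418, 41136]
    # 100 tiles per lane; totals match the chip summary to within rounding
    assert abs(sum(raw_tile_means) * 100 - 80933224) < 1000
    assert abs(sum(pf_tile_means) * 100 - 43577803) < 1000
]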
Lane Parameter Summary

Lane  Sample ID  Sample Target  Sample Type  Length  Filter               Num Tiles  Tiles
1     unknown    mm9            ELAND        26      '((CHASTITY>=0.6))'  100        Lane 1
2     unknown    mm9            ELAND        26      '((CHASTITY>=0.6))'  100        Lane 2
3     unknown    mm9            ELAND        26      '((CHASTITY>=0.6))'  100        Lane 3
4     unknown    elegans170     ELAND        26      '((CHASTITY>=0.6))'  100        Lane 4
5     unknown    elegans170     ELAND        26      '((CHASTITY>=0.6))'  100        Lane 5
6     unknown    elegans170     ELAND        26      '((CHASTITY>=0.6))'  100        Lane 6
7     unknown    elegans170     ELAND        26      '((CHASTITY>=0.6))'  100        Lane 7
8     unknown    elegans170     ELAND        26      '((CHASTITY>=0.6))'  100        Lane 8
Lane Results Summary
(tile mean +/- SD for each lane)

Lane  Lane Yield (kbases)  Clusters (raw)    Clusters (PF)   1st Cycle Int (PF)  % intensity after 20 cycles (PF)  % PF Clusters   % Align (PF)    Alignment Score (PF)  % Error Rate (PF)
1     158046               96483 +/- 9074    60787 +/- 4240  329 +/- 35          101.88 +/- 6.03                   63.21 +/- 3.29  70.33 +/- 0.24  9054.08 +/- 59.16     0.46 +/- 0.18
2     156564               133738 +/- 7938   60217 +/- 1926  444 +/- 39          92.62 +/- 7.58                    45.20 +/- 3.31  51.98 +/- 0.74  6692.04 +/- 92.49     0.46 +/- 0.09
3     185818               152142 +/- 10002  71468 +/- 2827  366 +/- 36          91.53 +/- 8.66                    47.19 +/- 3.80  82.24 +/- 0.44  10598.68 +/- 64.13    0.41 +/- 0.04
4     34953                15784 +/- 2162    13443 +/- 1728  328 +/- 40          97.53 +/- 9.87                    85.29 +/- 1.91  80.02 +/- 0.53  10368.82 +/- 71.08    0.15 +/- 0.05
5     167936               119735 +/- 8465   64590 +/- 2529  417 +/- 37          88.69 +/- 14.79                   54.10 +/- 2.59  76.95 +/- 0.32  9936.47 +/- 65.75     0.28 +/- 0.02
6     173463               152177 +/- 8146   66716 +/- 2493  372 +/- 39          87.06 +/- 9.86                    43.98 +/- 3.12  78.80 +/- 0.43  10162.28 +/- 49.65    0.38 +/- 0.03
7     149287               84649 +/- 7325    57418 +/- 3617  295 +/- 28          89.40 +/- 8.23                    67.97 +/- 1.82  33.38 +/- 0.25  4247.92 +/- 32.37     1.00 +/- 0.03
8     106953               54622 +/- 4812    41136 +/- 3309  284 +/- 37          90.21 +/- 9.10                    75.39 +/- 2.27  48.33 +/- 0.29  6169.21 +/- 169.50    0.86 +/- 1.22

Tile mean across chip
Av.                        101166            54472           354                 92.36                             60.29           65.25           8403.69               0.50
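[Editor's note: the fixture is internally consistent: each lane's yield in kbases is its PF cluster tile mean times 100 tiles times the 26-base read length, divided by 1000. A quick check, again ours rather than the original test's:

    def lane_yield_kbases(clusters_pf_tile_mean, num_tiles, read_length):
        # kilobases of passing-filter sequence produced by one lane
        return clusters_pf_tile_mean * num_tiles * read_length // 1000

    assert lane_yield_kbases(60787, 100, 26) == 158046  # lane 1
    assert lane_yield_kbases(60217, 100, 26) == 156564  # lane 2
]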
Expanded Lane Summary

Phasing Info and Raw Data (tile mean):
Lane  Clusters (raw)  % Phasing  % Prephasing  % Error Rate (raw)  Equiv Perfect Clusters (raw)
1     96483           0.7700     0.3100        1.00                49676
2     133738          0.7700     0.3100        1.22                40467
3     152142          0.7700     0.3100        1.30                78588
4     15784           0.7700     0.3100        0.29                11095
5     119735          0.7700     0.3100        0.85                60335
6     152177          0.7700     0.3100        1.21                70905
7     84649           0.7700     0.3100        1.38                21069
8     54622           0.7700     0.3100        1.17                21335

Filtered Data (tile mean):
Lane  % retained  Cycle 2-4 Av Int (PF)  Cycle 2-10 Av % Loss (PF)  Cycle 10-20 Av % Loss (PF)  % Align (PF)  % Error Rate (PF)  Equiv Perfect Clusters (PF)
1     63.21       317 +/- 32             0.13 +/- 0.44              -1.14 +/- 0.34              70.33         0.46               41758
2     45.20       415 +/- 33             0.29 +/- 0.40              -0.79 +/- 0.35              51.98         0.46               30615
3     47.19       344 +/- 26             0.68 +/- 0.51              -0.77 +/- 0.42              82.24         0.41               57552
4     85.29       306 +/- 34             0.20 +/- 0.69              -1.28 +/- 0.66              80.02         0.15               10671
5     54.10       380 +/- 32             0.34 +/- 0.49              -1.55 +/- 4.69              76.95         0.28               49015
6     43.98       333 +/- 27             0.57 +/- 0.50              -0.91 +/- 0.39              78.80         0.38               51663
7     67.97       272 +/- 20             1.15 +/- 0.52              -0.84 +/- 0.58              33.38         1.00               18265
8     75.39       262 +/- 31             1.10 +/- 0.59              -1.01 +/- 0.47              48.33         0.86               19104
IVC Plots
    IVC.htm

All Intensity Plots
    All.htm

Error graphs:
    Error.htm

Lane 1 - Lane 8 (per-tile results; lane 5 has no tile rows in this fixture)

Lane  Tile  Clusters (raw)  Av 1st Cycle Int (PF)  Av % intensity after 20 cycles (PF)  % PF Clusters  % Align (PF)  Av Alignment Score (PF)  % Error Rate (PF)
1     0001  114972          326.48                 94.39                                57.44          70.2          9038.6                   0.44
2     0001  147793          448.12                 83.68                                38.57          53.7          6905.4                   0.54
3     0001  167904          374.05                 86.91                                40.36          81.3          10465.0                  0.47
4     0001  20308           276.85                 92.87                                84.26          80.4          10413.8                  0.16
6     0001  166844          348.12                 77.59                                38.13          79.7          10264.4                  0.44
7     0001  98913           269.90                 86.66                                64.55          33.2          4217.5                   1.02
8     0001  64972           243.60                 89.40                                73.17          48.3          6182.8                   0.71
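[Editor's note: the runfolder tests just below assert report names such as 'run_207BTAAXX_2008-04-19.xml'. Per PipelineRun._get_run_name in the deleted runfolder.py, the name combines the flowcell id with the newest of the firecrest/bustard/gerald timestamps; a rough standalone sketch, with a function name of our choosing:

    import time

    def run_report_name(flowcell_id, *component_times):
        # the newest of the firecrest/bustard/gerald times dates the report
        timestamp = time.strftime('%Y-%m-%d', time.localtime(max(component_times)))
        return 'run_' + flowcell_id + '_' + timestamp + '.xml'

    # with a gerald timestamp from Sat Apr 19 2008 this yields
    # 'run_207BTAAXX_2008-04-19.xml', as asserted in test_runfolder below
]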
-Back to top - - -""" - pathname = os.path.join(gerald_dir, 'Summary.htm') - f = open(pathname, 'w') - f.write(summary_htm) - f.close() - -def make_eland_results(gerald_dir): - eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D. ->HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T ->HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0 ->HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T -""" - for i in range(1,9): - pathname = os.path.join(gerald_dir, - 's_%d_eland_result.txt' % (i,)) - f = open(pathname, 'w') - f.write(eland_result) - f.close() - -def make_runfolder(obj=None): - """ - Make a fake runfolder, attach all the directories to obj if defined - """ - # make a fake runfolder directory - temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_') - - runfolder_dir = os.path.join(temp_dir, - '080102_HWI-EAS229_0010_207BTAAXX') - os.mkdir(runfolder_dir) - - data_dir = os.path.join(runfolder_dir, 'Data') - os.mkdir(data_dir) - - firecrest_dir = os.path.join(data_dir, - 'C1-33_Firecrest1.8.28_12-04-2008_diane' - ) - os.mkdir(firecrest_dir) - matrix_dir = os.path.join(firecrest_dir, 'Matrix') - os.mkdir(matrix_dir) - make_matrix(matrix_dir) - - bustard_dir = os.path.join(firecrest_dir, - 'Bustard1.8.28_12-04-2008_diane') - os.mkdir(bustard_dir) - make_phasing_params(bustard_dir) - - gerald_dir = os.path.join(bustard_dir, - 'GERALD_12-04-2008_diane') - os.mkdir(gerald_dir) - make_gerald_config(gerald_dir) - make_summary_htm(gerald_dir) - make_eland_results(gerald_dir) - - if obj is not None: - obj.temp_dir = temp_dir - obj.runfolder_dir = runfolder_dir - obj.data_dir = data_dir - obj.firecrest_dir = firecrest_dir - obj.matrix_dir = matrix_dir - obj.bustard_dir = bustard_dir - obj.gerald_dir = gerald_dir - - -class RunfolderTests(unittest.TestCase): - """ - Test components of the runfolder processing code - which includes firecrest, bustard, and gerald - """ - def setUp(self): - # attaches all the directories to the object passed in - make_runfolder(self) - - def tearDown(self): - shutil.rmtree(self.temp_dir) - - def test_firecrest(self): - """ - Construct a firecrest object - """ - f = firecrest.firecrest(self.firecrest_dir) - self.failUnlessEqual(f.version, '1.8.28') - self.failUnlessEqual(f.start, 1) - self.failUnlessEqual(f.stop, 33) - self.failUnlessEqual(f.user, 'diane') - self.failUnlessEqual(f.date, date(2008,4,12)) - - xml = f.get_elements() - # just make sure that element tree can serialize the tree - xml_str = ElementTree.tostring(xml) - - f2 = firecrest.Firecrest(xml=xml) - self.failUnlessEqual(f.version, f2.version) - self.failUnlessEqual(f.start, f2.start) - self.failUnlessEqual(f.stop, f2.stop) - self.failUnlessEqual(f.user, f2.user) - self.failUnlessEqual(f.date, f2.date) - - def test_bustard(self): - """ - construct a bustard object - """ - b = bustard.bustard(self.bustard_dir) - self.failUnlessEqual(b.version, '1.8.28') - self.failUnlessEqual(b.date, date(2008,4,12)) - self.failUnlessEqual(b.user, 'diane') - self.failUnlessEqual(len(b.phasing), 8) - self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099) - - xml = b.get_elements() - b2 = bustard.Bustard(xml=xml) - self.failUnlessEqual(b.version, b2.version) - self.failUnlessEqual(b.date, b2.date ) - self.failUnlessEqual(b.user, b2.user) - self.failUnlessEqual(len(b.phasing), len(b2.phasing)) - for key in b.phasing.keys(): - 
self.failUnlessEqual(b.phasing[key].lane, - b2.phasing[key].lane) - self.failUnlessEqual(b.phasing[key].phasing, - b2.phasing[key].phasing) - self.failUnlessEqual(b.phasing[key].prephasing, - b2.phasing[key].prephasing) - - def test_gerald(self): - # need to update gerald and make tests for it - g = gerald.gerald(self.gerald_dir) - - self.failUnlessEqual(g.version, - '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp') - self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30)) - self.failUnlessEqual(len(g.lanes), len(g.lanes.keys())) - self.failUnlessEqual(len(g.lanes), len(g.lanes.items())) - - - # list of genomes, matches what was defined up in - # make_gerald_config. - # the first None is to offset the genomes list to be 1..9 - # instead of pythons default 0..8 - genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2', - '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ] - - # test lane specific parameters from gerald config file - for i in range(1,9): - cur_lane = g.lanes[str(i)] - self.failUnlessEqual(cur_lane.analysis, 'eland') - self.failUnlessEqual(cur_lane.eland_genome, genomes[i]) - self.failUnlessEqual(cur_lane.read_length, '32') - self.failUnlessEqual(cur_lane.use_bases, 'Y'*32) - - # test data extracted from summary file - clusters = [None, - (96483, 9074), (133738, 7938), - (152142, 10002), (15784, 2162), - (119735, 8465), (152177, 8146), - (84649, 7325), (54622, 4812),] - - for i in range(1,9): - summary_lane = g.summary[str(i)] - self.failUnlessEqual(summary_lane.cluster, clusters[i]) - self.failUnlessEqual(summary_lane.lane, str(i)) - - xml = g.get_elements() - # just make sure that element tree can serialize the tree - xml_str = ElementTree.tostring(xml) - g2 = gerald.Gerald(xml=xml) - - # do it all again after extracting from the xml file - self.failUnlessEqual(g.version, g2.version) - self.failUnlessEqual(g.date, g2.date) - self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys())) - self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items())) - - # test lane specific parameters from gerald config file - for i in range(1,9): - g_lane = g.lanes[str(i)] - g2_lane = g2.lanes[str(i)] - self.failUnlessEqual(g_lane.analysis, g2_lane.analysis) - self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome) - self.failUnlessEqual(g_lane.read_length, g2_lane.read_length) - self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases) - - # test (some) summary elements - for i in range(1,9): - g_summary = g.summary[str(i)] - g2_summary = g2.summary[str(i)] - self.failUnlessEqual(g_summary.cluster, g2_summary.cluster) - self.failUnlessEqual(g_summary.lane, g2_summary.lane) - - g_eland = g.eland_results - g2_eland = g2.eland_results - for lane in g_eland.keys(): - self.failUnlessEqual(g_eland[lane].reads, - g2_eland[lane].reads) - self.failUnlessEqual(len(g_eland[lane].mapped_reads), - len(g2_eland[lane].mapped_reads)) - for k in g_eland[lane].mapped_reads.keys(): - self.failUnlessEqual(g_eland[lane].mapped_reads[k], - g2_eland[lane].mapped_reads[k]) - - self.failUnlessEqual(len(g_eland[lane].match_codes), - len(g2_eland[lane].match_codes)) - for k in g_eland[lane].match_codes.keys(): - self.failUnlessEqual(g_eland[lane].match_codes[k], - g2_eland[lane].match_codes[k]) - - - def test_eland(self): - dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa', - 'chr2L.fa': 'dm3/chr2L.fa', - 'Lambda.fa': 'Lambda.fa'} - genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map, - '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map } - eland = 
gerald.eland(self.gerald_dir, genome_maps=genome_maps) - - for i in range(1,9): - lane = eland[str(i)] - self.failUnlessEqual(lane.reads, 4) - self.failUnlessEqual(lane.sample_name, "s") - self.failUnlessEqual(lane.lane_id, unicode(i)) - self.failUnlessEqual(len(lane.mapped_reads), 3) - self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1) - self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1) - self.failUnlessEqual(lane.match_codes['U1'], 2) - self.failUnlessEqual(lane.match_codes['NM'], 1) - - xml = eland.get_elements() - # just make sure that element tree can serialize the tree - xml_str = ElementTree.tostring(xml) - e2 = gerald.ELAND(xml=xml) - - for i in range(1,9): - l1 = eland[str(i)] - l2 = e2[str(i)] - self.failUnlessEqual(l1.reads, l2.reads) - self.failUnlessEqual(l1.sample_name, l2.sample_name) - self.failUnlessEqual(l1.lane_id, l2.lane_id) - self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) - self.failUnlessEqual(len(l1.mapped_reads), 3) - for k in l1.mapped_reads.keys(): - self.failUnlessEqual(l1.mapped_reads[k], - l2.mapped_reads[k]) - - self.failUnlessEqual(len(l1.match_codes), 9) - self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes)) - for k in l1.match_codes.keys(): - self.failUnlessEqual(l1.match_codes[k], - l2.match_codes[k]) - - def test_runfolder(self): - runs = runfolder.get_runs(self.runfolder_dir) - - # do we get the flowcell id from the filename? - self.failUnlessEqual(len(runs), 1) - self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml') - - # do we get the flowcell id from the FlowcellId.xml file - make_flowcell_id(self.runfolder_dir, '207BTAAXY') - runs = runfolder.get_runs(self.runfolder_dir) - self.failUnlessEqual(len(runs), 1) - self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml') - - r1 = runs[0] - xml = r1.get_elements() - xml_str = ElementTree.tostring(xml) - - r2 = runfolder.PipelineRun(xml=xml) - self.failUnlessEqual(r1.name, r2.name) - self.failIfEqual(r2.firecrest, None) - self.failIfEqual(r2.bustard, None) - self.failIfEqual(r2.gerald, None) - - -def suite(): - return unittest.makeSuite(RunfolderTests,'test') - -if __name__ == "__main__": - unittest.main(defaultTest="suite") - diff --git a/gaworkflow/util/__init__.py b/gaworkflow/util/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/gaworkflow/util/alphanum.py b/gaworkflow/util/alphanum.py deleted file mode 100644 index 8893bdb..0000000 --- a/gaworkflow/util/alphanum.py +++ /dev/null @@ -1,61 +0,0 @@ -# -# The Alphanum Algorithm is an improved sorting algorithm for strings -# containing numbers. Instead of sorting numbers in ASCII order like -# a standard sort, this algorithm sorts numbers in numeric order. -# -# The Alphanum Algorithm is discussed at http://www.DaveKoelle.com -# -#* Python implementation provided by Chris Hulan (chris.hulan@gmail.com) -#* Distributed under same license as original -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. 
-# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -# - -import re - -# -# TODO: Make decimal points be considered in the same class as digits -# - -def chunkify(str): - """return a list of numbers and non-numeric substrings of +str+ - - the numeric substrings are converted to integer, non-numeric are left as is - """ - chunks = re.findall("(\d+|\D+)",str) - chunks = [re.match('\d',x) and int(x) or x for x in chunks] #convert numeric strings to numbers - return chunks - -def alphanum(a,b): - """breaks +a+ and +b+ into pieces and returns left-to-right comparison of the pieces - - +a+ and +b+ are expected to be strings (for example file names) with numbers and non-numeric characters - Split the values into list of numbers and non numeric sub-strings and so comparison of numbers gives - Numeric sorting, comparison of non-numeric gives Lexicographic order - """ - # split strings into chunks - aChunks = chunkify(a) - bChunks = chunkify(b) - - return cmp(aChunks,bChunks) #built in comparison works once data is prepared - - - -if __name__ == "__main__": - unsorted = ["1000X Radonius Maximus","10X Radonius","200X Radonius","20X Radonius","20X Radonius Prime","30X Radonius","40X Radonius","Allegia 50 Clasteron","Allegia 500 Clasteron","Allegia 51 Clasteron","Allegia 51B Clasteron","Allegia 52 Clasteron","Allegia 60 Clasteron","Alpha 100","Alpha 2","Alpha 200","Alpha 2A","Alpha 2A-8000","Alpha 2A-900","Callisto Morphamax","Callisto Morphamax 500","Callisto Morphamax 5000","Callisto Morphamax 600","Callisto Morphamax 700","Callisto Morphamax 7000","Callisto Morphamax 7000 SE","Callisto Morphamax 7000 SE2","QRS-60 Intrinsia Machine","QRS-60F Intrinsia Machine","QRS-62 Intrinsia Machine","QRS-62F Intrinsia Machine","Xiph Xlater 10000","Xiph Xlater 2000","Xiph Xlater 300","Xiph Xlater 40","Xiph Xlater 5","Xiph Xlater 50","Xiph Xlater 500","Xiph Xlater 5000","Xiph Xlater 58"] - sorted = unsorted[:] - sorted.sort(alphanum) - print '+++++Sorted...++++' - print '\n'.join(sorted) diff --git a/gaworkflow/util/ethelp.py b/gaworkflow/util/ethelp.py deleted file mode 100644 index 19f6c9f..0000000 --- a/gaworkflow/util/ethelp.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -ElementTree helper functions -""" -def indent(elem, level=0): - """ - reformat an element tree to be 'pretty' (indented) - """ - i = "\n" + level*" " - if len(elem): - if not elem.text or not elem.text.strip(): - elem.text = i + " " - for child in elem: - indent(child, level+1) - # we don't want the closing tag indented too far - child.tail = i - if not elem.tail or not elem.tail.strip(): - elem.tail = i - else: - if level and (not elem.tail or not elem.tail.strip()): - elem.tail = i - -def flatten(elem, include_tail=0): - """ - Extract the text from an element tree - (AKA extract the text that not part of XML tags) - """ - text = elem.text or "" - for e in elem: - text += flatten(e, 1) - if include_tail and elem.tail: text += elem.tail - return text - diff --git a/gaworkflow/util/fctracker.py b/gaworkflow/util/fctracker.py deleted file mode 100644 index 57b5dcf..0000000 --- a/gaworkflow/util/fctracker.py +++ /dev/null @@ -1,201 +0,0 @@ -""" -Provide some quick and dirty access and reporting for the fctracker database. - -The advantage to this code is that it doesn't depend on django being -installed, so it can run on machines other than the webserver. 
-""" -import datetime -import os -import re -import sys -import time - -if sys.version_info[0] + sys.version_info[1] * 0.1 >= 2.5: - # we're python 2.5 - import sqlite3 -else: - import pysqlite2.dbapi2 as sqlite3 - - -class fctracker: - """ - provide a simple way to interact with the flowcell data in fctracker.db - """ - def __init__(self, database): - # default to the current directory - if database is None: - self.database = self._guess_fctracker_path() - else: - self.database = database - self.conn = sqlite3.connect(self.database) - self._get_library() - self._get_species() - - def _guess_fctracker_path(self): - """ - Guess a few obvious places for the database - """ - fctracker = 'fctracker.db' - name = fctracker - # is it in the current dir? - if os.path.exists(name): - return name - name = os.path.expanduser(os.path.join('~', fctracker)) - if os.path.exists(name): - return name - raise RuntimeError("Can't find fctracker") - - def _make_dict_from_table(self, table_name, pkey_name): - """ - Convert a django table into a dictionary indexed by the primary key. - Yes, it really does just load everything into memory, hopefully - we stay under a few tens of thousands of runs for a while. - """ - table = {} - c = self.conn.cursor() - c.execute('select * from %s;' % (table_name)) - # extract just the field name - description = [ f[0] for f in c.description] - for row in c: - row_dict = dict(zip(description, row)) - table[row_dict[pkey_name]] = row_dict - c.close() - return table - - def _add_lanes_to_libraries(self): - """ - add flowcell/lane ids to new attribute 'lanes' in the library dictionary - """ - library_id_re = re.compile('lane_\d_library_id') - - for fc_id, fc in self.flowcells.items(): - lane_library = [ (x[0][5], x[1]) for x in fc.items() - if library_id_re.match(x[0]) ] - for lane, library_id in lane_library: - if not self.library[library_id].has_key('lanes'): - self.library[library_id]['lanes'] = [] - self.library[library_id]['lanes'].append((fc_id, lane)) - - def _get_library(self): - """ - attach the library dictionary to the instance - """ - self.library = self._make_dict_from_table( - 'fctracker_library', - 'library_id') - - - def _get_species(self): - """ - attach the species dictionary to the instance - """ - self.species = self._make_dict_from_table( - 'fctracker_species', - 'id' - ) - - def _get_flowcells(self, where=None): - """ - attach the flowcell dictionary to the instance - - where is a sql where clause. 
(eg "where run_date > '2008-1-1'") - that can be used to limit what flowcells we select - FIXME: please add sanitization code - """ - if where is None: - where = "" - self.flowcells = {} - c = self.conn.cursor() - c.execute('select * from fctracker_flowcell %s;' % (where)) - # extract just the field name - description = [ f[0] for f in c.description ] - for row in c: - row_dict = dict(zip(description, row)) - fcid, status = self._parse_flowcell_id(row_dict) - row_dict['flowcell_id'] = fcid - row_dict['flowcell_status'] = status - - for lane in [ 'lane_%d_library' % (i) for i in range(1,9) ]: - lane_library = self.library[row_dict[lane+"_id"]] - species_id = lane_library['library_species_id'] - lane_library['library_species'] = self.species[species_id] - row_dict[lane] = lane_library - # some useful parsing - run_date = time.strptime(row_dict['run_date'], '%Y-%m-%d %H:%M:%S') - run_date = datetime.datetime(*run_date[:6]) - row_dict['run_date'] = run_date - self.flowcells[row_dict['flowcell_id']] = row_dict - - self._add_lanes_to_libraries() - return self.flowcells - - def _parse_flowcell_id(self, flowcell_row): - """ - Return flowcell id and status - - We stored the status information in the flowcell id name. - this was dumb, but database schemas are hard to update. - """ - fields = flowcell_row['flowcell_id'].split() - fcid = None - status = None - if len(fields) > 0: - fcid = fields[0] - if len(fields) > 1: - status = fields[1] - return fcid, status - - -def flowcell_gone(cell): - """ - Use a variety of heuristics to determine if the flowcell drive - has been deleted. - """ - status = cell['flowcell_status'] - if status is None: - return False - failures = ['failed', 'deleted', 'not run'] - for f in failures: - if re.search(f, status): - return True - else: - return False - -def recoverable_drive_report(flowcells): - """ - Attempt to report what flowcells are still on a hard drive - """ - def format_status(status): - if status is None: - return "" - else: - return status+" " - - # sort flowcells by run date - flowcell_list = [] - for key, cell in flowcells.items(): - flowcell_list.append( (cell['run_date'], key) ) - flowcell_list.sort() - - report = [] - line = "%(date)s %(id)s %(status)s%(lane)s %(library_name)s (%(library_id)s) " - line += "%(species)s" - for run_date, flowcell_id in flowcell_list: - cell = flowcells[flowcell_id] - if flowcell_gone(cell): - continue - for l in range(1,9): - lane = 'lane_%d' % (l) - cell_library = cell['%s_library'%(lane)] - fields = { - 'date': cell['run_date'].strftime('%y-%b-%d'), - 'id': cell['flowcell_id'], - 'lane': l, - 'library_name': cell_library['library_name'], - 'library_id': cell['%s_library_id'%(lane)], - 'species': cell_library['library_species']['scientific_name'], - 'status': format_status(cell['flowcell_status']), - } - report.append(line % (fields)) - return os.linesep.join(report) - diff --git a/gaworkflow/util/makebed.py b/gaworkflow/util/makebed.py deleted file mode 100755 index 6f1511c..0000000 --- a/gaworkflow/util/makebed.py +++ /dev/null @@ -1,142 +0,0 @@ -""" -Utility functions to make bedfiles. 
-""" -import os -import re - -# map eland_result.txt sense -sense_map = { 'F': '+', 'R': '-'} -sense_color = { 'F': '0,0,255', 'R': '255,255,0' } - -def write_bed_header(outstream, name, description): - """ - Produce the headerline for a bedfile - """ - # provide default track names - if name is None: name = "track" - if description is None: description = "eland result file" - bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"' - bed_header += os.linesep - outstream.write(bed_header % (name, description)) - -def make_bed_from_eland_stream(instream, outstream, name, description, chromosome_prefix='chr'): - """ - read an eland result file from instream and write a bedfile to outstream - """ - # indexes into fields in eland_result.txt file - SEQ = 1 - CHR = 6 - START = 7 - SENSE = 8 - - write_bed_header(outstream, name, description) - - for line in instream: - fields = line.split() - # we need more than the CHR field, and it needs to match a chromosome - if len(fields) <= CHR or \ - (chromosome_prefix is not None and \ - fields[CHR][:3] != chromosome_prefix): - continue - start = fields[START] - stop = int(start) + len(fields[SEQ]) - chromosome, extension = fields[CHR].split('.') - assert extension == "fa" - outstream.write('%s %s %d read 0 %s - - %s%s' % ( - chromosome, - start, - stop, - sense_map[fields[SENSE]], - sense_color[fields[SENSE]], - os.linesep - )) - - -def make_bed_from_multi_eland_stream( - instream, - outstream, - name, - description, - chr_prefix='chr', - max_reads=255 - ): - """ - read a multi eland stream and write a bedfile - """ - write_bed_header(outstream, name, description) - parse_multi_eland(instream, outstream, chr_prefix, max_reads) - -def parse_multi_eland(instream, outstream, chr_prefix, max_reads=255): - - loc_pattern = '(?P(?P[0-9]+)(?P[FR])(?P[0-9]+))' - other_pattern = '(?P[^:,]+)' - split_re = re.compile('(%s|%s)' % (loc_pattern, other_pattern)) - - for line in instream: - rec = line.split() - if len(rec) > 3: - # colony_id = rec[0] - seq = rec[1] - # number of matches for 0, 1, and 2 mismatches - # m0, m1, m2 = [int(x) for x in rec[2].split(':')] - compressed_reads = rec[3] - cur_chr = "" - reads = {0: [], 1: [], 2:[]} - - for token in split_re.finditer(compressed_reads): - if token.group('chr') is not None: - cur_chr = token.group('chr')[:-3] # strip off .fa - elif token.group('fullloc') is not None: - matches = int(token.group('count')) - # only emit a bed line if - # our current chromosome starts with chromosome pattern - if chr_prefix is None or cur_chr.startswith(chr_prefix): - start = int(token.group('start')) - stop = start + len(seq) - orientation = token.group('dir') - strand = sense_map[orientation] - color = sense_color[orientation] - # build up list of reads for this record - reads[matches].append((cur_chr, start, stop, strand, color)) - - # report up to our max_read threshold reporting the fewer-mismatch - # matches first - reported_reads = 0 - keys = [0,1,2] - for mismatch, read_list in ((k, reads[k]) for k in keys): - reported_reads += len(read_list) - if reported_reads <= max_reads: - for cur_chr, start, stop, strand, color in read_list: - reported_reads += 1 - outstream.write('%s %d %d read 0 %s - - %s%s' % ( - cur_chr, - start, - stop, - sense_map[orientation], - sense_color[orientation], - os.linesep - )) - -def make_description(database, flowcell_id, lane): - """ - compute a bedfile name and description from the fctracker database - """ - from gaworkflow.util.fctracker import fctracker - - fc = fctracker(database) 
- cells = fc._get_flowcells("where flowcell_id='%s'" % (flowcell_id)) - if len(cells) != 1: - raise RuntimeError("couldn't find flowcell id %s" % (flowcell_id)) - lane = int(lane) - if lane < 1 or lane > 8: - raise RuntimeError("flowcells only have lanes 1-8") - - name = "%s-%s" % (flowcell_id, lane) - - cell_id, cell = cells.items()[0] - assert cell_id == flowcell_id - - cell_library_id = cell['lane_%d_library_id' %(lane,)] - cell_library = cell['lane_%d_library' %(lane,)] - description = "%s-%s" % (cell_library['library_name'], cell_library_id) - return name, description diff --git a/gaworkflow/util/mount.py b/gaworkflow/util/mount.py deleted file mode 100644 index 75dbe0a..0000000 --- a/gaworkflow/util/mount.py +++ /dev/null @@ -1,65 +0,0 @@ -""" -Utilities for working with unix-style mounts. -""" -import os -import subprocess - -def list_mount_points(): - """ - Return list of current mount points - - Note: unix-like OS specific - """ - mount_points = [] - likely_locations = ['/sbin/mount', '/bin/mount'] - for mount in likely_locations: - if os.path.exists(mount): - p = subprocess.Popen(mount, stdout=subprocess.PIPE) - p.wait() - for l in p.stdout.readlines(): - rec = l.split() - device = rec[0] - mount_point = rec[2] - assert rec[1] == 'on' - # looking at the output of mount on linux, osx, and - # sunos, the first 3 elements are always the same - # devicename on path - # everything after that displays the attributes - # of the mount points in wildly differing formats - mount_points.append(mount_point) - return mount_points - else: - raise RuntimeError("Couldn't find a mount executable") - -def is_mounted(point_to_check): - """ - Return true if argument exactly matches a current mount point. - """ - for mount_point in list_mount_points(): - if point_to_check == mount_point: - return True - else: - return False - -def find_mount_point_for(pathname): - """ - Find the deepest mount point pathname is located on - """ - realpath = os.path.realpath(pathname) - mount_points = list_mount_points() - - prefixes = set() - for current_mount in mount_points: - cp = os.path.commonprefix([current_mount, realpath]) - prefixes.add((len(cp), cp)) - - prefixes = list(prefixes) - prefixes.sort() - if len(prefixes) == 0: - return None - else: - print prefixes - # return longest common prefix - return prefixes[-1][1] - - diff --git a/gaworkflow/util/opener.py b/gaworkflow/util/opener.py deleted file mode 100644 index 035bb24..0000000 --- a/gaworkflow/util/opener.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Helpful utilities for turning random names/objects into streams. -""" -import os -import gzip -import bz2 -import types -import urllib2 - -def isfilelike(file_ref, mode): - """Does file_ref have the core file operations? - """ - # if mode is w/a check to make sure we writeable ops - # but always check to see if we can read - read_operations = ['read', 'readline', 'readlines'] - write_operations = [ 'write', 'writelines' ] - #random_operations = [ 'seek', 'tell' ] - if mode[0] in ('w', 'a'): - for o in write_operations: - if not hasattr(file_ref, o): - return False - for o in read_operations: - if not hasattr(file_ref, o): - return False - - return True - -def isurllike(file_ref, mode): - """ - does file_ref look like a url? - (AKA does it start with protocol:// ?) - """ - #what if mode is 'w'? 
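-    # (for example, urlparse('http://host/f.txt') has scheme 'http', while
-    # urlparse('/tmp/f.txt') has an empty scheme, so the len(schema) > 0
-    # test below is what separates URLs from local paths:)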
- parsed = urllib2.urlparse.urlparse(file_ref) - schema, netloc, path, params, query, fragment = parsed - - return len(schema) > 0 - -def autoopen(file_ref, mode='r'): - """ - Attempt to intelligently turn file_ref into a readable stream - """ - # catch being passed a file - if type(file_ref) is types.FileType: - return file_ref - # does it look like a file? - elif isfilelike(file_ref, mode): - return file_ref - elif isurllike(file_ref, mode): - return urllib2.urlopen(file_ref) - elif os.path.splitext(file_ref)[1] == ".gz": - return gzip.open(file_ref, mode) - elif os.path.splitext(file_ref)[1] == '.bz2': - return bz2.BZ2File(file_ref, mode) - else: - return open(file_ref,mode) - diff --git a/gaworkflow/util/queuecommands.py b/gaworkflow/util/queuecommands.py deleted file mode 100644 index 78728ae..0000000 --- a/gaworkflow/util/queuecommands.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -Run up to N simultanous jobs from provided of commands -""" - -import logging -from subprocess import PIPE -import subprocess -import select -import sys -import time - -class QueueCommands(object): - """ - Queue up N commands from cmd_list, launching more jobs as the first - finish. - """ - - def __init__(self, cmd_list, N=0, cwd=None): - """ - cmd_list is a list of elements suitable for subprocess - N is the number of simultanious processes to run. - 0 is all of them. - - WARNING: this will not work on windows - (It depends on being able to pass local file descriptors to the - select call with isn't supported by the Win32 API) - """ - self.to_run = cmd_list[:] - self.running = {} - self.N = N - self.cwd = cwd - - def under_process_limit(self): - """ - are we still under the total number of allowable jobs? - """ - if self.N == 0: - return True - - if len(self.running) < self.N: - return True - - return False - - def start_jobs(self): - """ - Launch jobs until we have the maximum allowable running - (or have run out of jobs) - """ - queue_log = logging.getLogger('queue') - queue_log.info('using %s as cwd' % (self.cwd,)) - - while (len(self.to_run) > 0) and self.under_process_limit(): - queue_log.info('%d left to run', len(self.to_run)) - cmd = self.to_run.pop(0) - p = subprocess.Popen(cmd, stdout=PIPE, cwd=self.cwd, shell=True) - self.running[p.stdout] = p - queue_log.info("Created process %d from %s" % (p.pid, str(cmd))) - - def run(self): - """ - run up to N jobs until we run out of jobs - """ - queue_log = logging.getLogger('queue') - - # to_run slowly gets consumed by start_jobs - while len(self.to_run) > 0 or len(self.running) > 0: - # fill any empty spots in our job queue - self.start_jobs() - - # build a list of file descriptors - # fds=file desciptors - fds = [ x.stdout for x in self.running.values()] - - # wait for something to finish - # wl= write list, xl=exception list (not used so get bad names) - read_list, wl, xl = select.select(fds, [], fds) - - # for everything that might have finished... 
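-    # (a child's stdout also turns "readable" at EOF when the process
-    # exits, so select wakes this loop for completed jobs as well as for
-    # ordinary output:)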
- for pending_fd in read_list: - pending = self.running[pending_fd] - # if it really did finish, remove it from running jobs - if pending.poll() is not None: - queue_log.info("Process %d finished [%d]", - pending.pid, pending.returncode) - del self.running[pending_fd] - time.sleep(1) diff --git a/gaworkflow/util/test/test_ethelp.py b/gaworkflow/util/test/test_ethelp.py deleted file mode 100644 index 98df518..0000000 --- a/gaworkflow/util/test/test_ethelp.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -import unittest - -try: - from xml.etree import ElementTree -except ImportError, e: - from elementtree import ElementTree - -from gaworkflow.util.ethelp import indent, flatten - -class testETHelper(unittest.TestCase): - def setUp(self): - self.foo = 'asdf
' - self.foo_tree = ElementTree.fromstring(self.foo) - - def test_indent(self): - flat_foo = ElementTree.tostring(self.foo_tree) - self.failUnlessEqual(len(flat_foo.split('\n')), 1) - - indent(self.foo_tree) - pretty_foo = ElementTree.tostring(self.foo_tree) - self.failUnlessEqual(len(pretty_foo.split('\n')), 5) - - def test_flatten(self): - self.failUnless(flatten(self.foo_tree), 'asdf') - -def suite(): - return unittest.makeSuite(testETHelper, 'test') - -if __name__ == "__main__": - unittest.main(defaultTest='suite') - - - - diff --git a/gaworkflow/util/test/test_makebed.py b/gaworkflow/util/test/test_makebed.py deleted file mode 100644 index e96f29b..0000000 --- a/gaworkflow/util/test/test_makebed.py +++ /dev/null @@ -1,51 +0,0 @@ -import os -from StringIO import StringIO -import unittest - -from gaworkflow.util import makebed - -class testMakeBed(unittest.TestCase): - def test_multi_1_0_0_limit_1(self): - instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383 TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0 mm9_chr13_random.fa:1240R0') - out = StringIO() - - makebed.parse_multi_eland(instream, out, 'mm9_chr', 1) - self.failUnlessEqual(out.getvalue(), 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n') - - def test_multi_1_0_0_limit_255(self): - instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383 TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0 mm9_chr13_random.fa:1240R0') - out = StringIO() - - makebed.parse_multi_eland(instream, out, 'mm9_chr', 255) - self.failUnlessEqual(out.getvalue(), 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n') - - - def test_multi_2_0_0_limit_1(self): - instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:104:586 GTTCTCGCATAAACTNACTCTNAATAGATTCA 2:0:0 mm9_chr4.fa:42995432F0,mm9_chrX.fa:101541458F0') - out = StringIO() - - makebed.parse_multi_eland(instream, out, 'mm9_chr', 1) - self.failUnlessEqual(out.len, 0) - - def test_multi_2_0_0_limit_255(self): - instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:104:586 GTTCTCGCATAAACTNACTCTNAATAGATTCA 2:0:0 mm9_chr4.fa:42995432F0,mm9_chrX.fa:101541458F0') - out = StringIO() - - makebed.parse_multi_eland(instream, out, 'mm9_chr', 255) - self.failUnlessEqual(out.len, 98) - - def test_multi_0_2_0_limit_1(self): - instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:115:495 TCTCCCTGAAAAATANAAGTGNTGTTGGTGAG 0:2:1 mm9_chr14.fa:104434729F2,mm9_chr16.fa:63263818R1,mm9_chr2.fa:52265438R1') - out = StringIO() - - makebed.parse_multi_eland(instream, out, 'mm9_chr', 1) - print out.getvalue() - self.failUnlessEqual(out.len, 0) - -def suite(): - return unittest.makeSuite(testMakeBed, 'test') - -if __name__ == "__main__": - unittest.main(defaultTest='suite') - - diff --git a/gaworkflow/util/test/test_queuecommands.py b/gaworkflow/util/test/test_queuecommands.py deleted file mode 100644 index 6c54c04..0000000 --- a/gaworkflow/util/test/test_queuecommands.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -import logging -import time -import unittest - - -from gaworkflow.util.queuecommands import QueueCommands - -class testQueueCommands(unittest.TestCase): - def setUp(self): - logging.basicConfig(level=logging.DEBUG, - format='%(asctime)s %(name)-8s %(message)s') - - - - def test_unlimited_run(self): - """ - Run everything at once - """ - cmds = ['/bin/sleep 0', - '/bin/sleep 1', - '/bin/sleep 2',] - - q = QueueCommands(cmds) - start = time.time() - q.run() - end = time.time()-start - # we should only take the length of the longest sleep - self.failUnless( end > 1.9 and end < 2.1, - "took %s seconds, exected ~5" % (end,)) - - def 
test_limited_run(self): - """ - Run a limited number of jobs - """ - cmds = ['/bin/sleep 1', - '/bin/sleep 2', - '/bin/sleep 3',] - - q = QueueCommands(cmds, 2) - - start = time.time() - q.run() - end = time.time()-start - self.failUnless( end > 3.9 and end < 4.1, - "took %s seconds, expected ~6" % (end,)) - -def suite(): - return unittest.makeSuite(testQueueCommands, 'test') - -if __name__ == "__main__": - unittest.main(defaultTest='suite') - - - - diff --git a/htsworkflow/__init__.py b/htsworkflow/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/htsworkflow/automation/__init__.py b/htsworkflow/automation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/htsworkflow/automation/copier.py b/htsworkflow/automation/copier.py new file mode 100644 index 0000000..572cfb3 --- /dev/null +++ b/htsworkflow/automation/copier.py @@ -0,0 +1,245 @@ +import ConfigParser +import copy +import logging +import logging.handlers +import os +import re +import subprocess +import sys +import time +import traceback + +from benderjab import rpc + +def runfolder_validate(fname): + """ + Return True if fname looks like a runfolder name + """ + if re.match("^[0-9]{6}_[-A-Za-z0-9_]*$", fname): + return True + else: + return False + +class rsync(object): + def __init__(self, source, dest, pwfile): + self.pwfile = os.path.expanduser(pwfile) + self.cmd = ['/usr/bin/rsync', ] + self.cmd.append('--password-file=%s' % (self.pwfile)) + self.source_base = source + self.dest_base = dest + self.processes = {} + self.exit_code = None + + def list(self): + """Get a directory listing""" + args = copy.copy(self.cmd) + args.append(self.source_base) + + logging.debug("Rsync cmd:" + " ".join(args)) + short_process = subprocess.Popen(args, stdout=subprocess.PIPE) + return self.list_filter(short_process.stdout) + + def list_filter(self, lines): + """ + parse rsync directory listing + """ + dirs_to_copy = [] + direntries = [ x[0:42].split() + [x[43:-1]] for x in lines ] + for permissions, size, filedate, filetime, filename in direntries: + if permissions[0] == 'd': + # hey its a directory, the first step to being something we want to + # copy + if re.match("[0-9]{6}", filename): + # it starts with something that looks like a 6 digit date + # aka good enough for me + dirs_to_copy.append(filename) + return dirs_to_copy + + def create_copy_process(self, dirname): + args = copy.copy(self.cmd) + # we want to copy everything + args.append('-rlt') + # from here + args.append(os.path.join(self.source_base, dirname)) + # to here + args.append(self.dest_base) + logging.debug("Rsync cmd:" + " ".join(args)) + return subprocess.Popen(args) + + def copy(self): + """ + copy any interesting looking directories over + return list of items that we started copying. + """ + # clean up any lingering non-running processes + self.poll() + + # what's available to copy? + dirs_to_copy = self.list() + + # lets start copying + started = [] + for d in dirs_to_copy: + process = self.processes.get(d, None) + + if process is None: + # we don't have a process, so make one + logging.info("rsyncing %s" % (d)) + self.processes[d] = self.create_copy_process(d) + started.append(d) + return started + + def poll(self): + """ + check currently running processes to see if they're done + + return path roots that have finished. 
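+
+ A sketch of how CopierBot below drives this class:
+
+ started = self.rsync.copy()   # one rsync child per new runfolder
+ self.rsync.poll()             # reap children that have exited
+ remaining = len(self.rsync)   # count of still-running copies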
+ """ + for dir_key, proc_value in self.processes.items(): + retcode = proc_value.poll() + if retcode is None: + # process hasn't finished yet + pass + elif retcode == 0: + logging.info("finished rsyncing %s, exitcode %d" %( dir_key, retcode)) + del self.processes[dir_key] + else: + logging.error("rsync failed for %s, exit code %d" % (dir_key, retcode)) + + def __len__(self): + """ + Return how many active rsync processes we currently have + + Call poll first to close finished processes. + """ + return len(self.processes) + + def keys(self): + """ + Return list of current run folder names + """ + return self.processes.keys() + +class CopierBot(rpc.XmlRpcBot): + def __init__(self, section=None, configfile=None): + #if configfile is None: + # configfile = '~/.htsworkflow' + + super(CopierBot, self).__init__(section, configfile) + + # options for rsync command + self.cfg['rsync_password_file'] = None + self.cfg['rsync_source'] = None + self.cfg['rsync_destination'] = None + + # options for reporting we're done + self.cfg['notify_users'] = None + self.cfg['notify_runner'] = None + + self.pending = [] + self.rsync = None + self.notify_users = None + self.notify_runner = None + + self.register_function(self.startCopy) + self.register_function(self.sequencingFinished) + self.eventTasks.append(self.update) + + def read_config(self, section=None, configfile=None): + """ + read the config file + """ + super(CopierBot, self).read_config(section, configfile) + + password = self._check_required_option('rsync_password_file') + source = self._check_required_option('rsync_source') + destination = self._check_required_option('rsync_destination') + self.rsync = rsync(source, destination, password) + + self.notify_users = self._parse_user_list(self.cfg['notify_users']) + try: + self.notify_runner = \ + self._parse_user_list(self.cfg['notify_runner'], + require_resource=True) + except bot.JIDMissingResource: + msg = 'need a full jabber ID + resource for xml-rpc destinations' + logging.FATAL(msg) + raise bot.JIDMissingResource(msg) + + def startCopy(self, *args): + """ + start our copy + """ + logging.info("starting copy scan") + started = self.rsync.copy() + logging.info("copying:" + " ".join(started)+".") + return started + + def sequencingFinished(self, runDir, *args): + """ + The run was finished, if we're done copying, pass the message on + """ + # close any open processes + self.rsync.poll() + + # see if we're still copying + if runfolder_validate(runDir): + logging.info("recevied sequencing finshed for %s" % (runDir)) + self.pending.append(runDir) + self.startCopy() + return "PENDING" + else: + errmsg = "received bad runfolder name (%s)" % (runDir) + logging.warning(errmsg) + # maybe I should use a different error message + raise RuntimeError(errmsg) + + def reportSequencingFinished(self, runDir): + """ + Send the sequencingFinished message to the interested parties + """ + if self.notify_users is not None: + for u in self.notify_users: + self.send(u, 'Sequencing run %s finished' % (runDir)) + if self.notify_runner is not None: + for r in self.notify_runner: + self.rpc_send(r, (runDir,), 'sequencingFinished') + logging.info("forwarding sequencingFinshed message for %s" % (runDir)) + + def update(self, *args): + """ + Update our current status. + Report if we've finished copying files. 
+ """ + self.rsync.poll() + for p in self.pending: + if p not in self.rsync.keys(): + self.reportSequencingFinished(p) + self.pending.remove(p) + + def _parser(self, msg, who): + """ + Parse xmpp chat messages + """ + help = u"I can [copy], or report current [status]" + if re.match(u"help", msg): + reply = help + elif re.match("copy", msg): + started = self.startCopy() + reply = u"started copying " + ", ".join(started) + elif re.match(u"status", msg): + msg = [u"Currently %d rsync processes are running." % (len(self.rsync))] + for d in self.rsync.keys(): + msg.append(u" " + d) + reply = os.linesep.join(msg) + else: + reply = u"I didn't understand '%s'" % (unicode(msg)) + return reply + +def main(args=None): + bot = CopierBot() + bot.main(args) + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) + diff --git a/htsworkflow/automation/runner.py b/htsworkflow/automation/runner.py new file mode 100644 index 0000000..9c26940 --- /dev/null +++ b/htsworkflow/automation/runner.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python +from glob import glob +import logging +import os +import re +import sys +import time +import threading + +from benderjab import rpc + +from htsworkflow.pipeline.configure_run import * + +#s_fc = re.compile('FC[0-9]+') +s_fc = re.compile('_[0-9a-zA-Z]*$') + + +def _get_flowcell_from_rundir(run_dir): + """ + Returns flowcell string based on run_dir. + Returns None and logs error if flowcell can't be found. + """ + junk, dirname = os.path.split(run_dir) + mo = s_fc.search(dirname) + if not mo: + logging.error('RunDir 2 FlowCell error: %s' % (run_dir)) + return None + + return dirname[mo.start()+1:] + + + +class Runner(rpc.XmlRpcBot): + """ + Manage running pipeline jobs. + """ + def __init__(self, section=None, configfile=None): + #if configfile is None: + # self.configfile = "~/.htsworkflow" + super(Runner, self).__init__(section, configfile) + + self.cfg['notify_users'] = None + self.cfg['genome_dir'] = None + self.cfg['base_analysis_dir'] = None + + self.cfg['notify_users'] = None + self.cfg['notify_postanalysis'] = None + + self.conf_info_dict = {} + + self.register_function(self.sequencingFinished) + #self.eventTasks.append(self.update) + + + def read_config(self, section=None, configfile=None): + super(Runner, self).read_config(section, configfile) + + self.genome_dir = self._check_required_option('genome_dir') + self.base_analysis_dir = self._check_required_option('base_analysis_dir') + + self.notify_users = self._parse_user_list(self.cfg['notify_users']) + #FIXME: process notify_postpipeline cfg + + + def _parser(self, msg, who): + """ + Parse xmpp chat messages + """ + help = u"I can send [start] a run, or report [status]" + if re.match(u"help", msg): + reply = help + elif re.match("status", msg): + words = msg.split() + if len(words) == 2: + reply = self.getStatusReport(words[1]) + else: + reply = u"Status available for: %s" \ + % (', '.join([k for k in self.conf_info_dict.keys()])) + elif re.match(u"start", msg): + words = msg.split() + if len(words) == 2: + self.sequencingFinished(words[1]) + reply = u"starting run for %s" % (words[1]) + else: + reply = u"need runfolder name" + else: + reply = u"I didn't understand '%s'" %(msg) + + logging.debug("reply: " + str(reply)) + return reply + + + def getStatusReport(self, fc_num): + """ + Returns text status report for flow cell number + """ + if fc_num not in self.conf_info_dict: + return "No record of a %s run." 
% (fc_num) + + status = self.conf_info_dict[fc_num].status + + if status is None: + return "No status information for %s yet." \ + " Probably still in configure step. Try again later." % (fc_num) + + output = status.statusReport() + + return '\n'.join(output) + + + def sequencingFinished(self, run_dir): + """ + Sequenceing (and copying) is finished, time to start pipeline + """ + logging.debug("received sequencing finished message") + + # Setup config info object + ci = ConfigInfo() + ci.base_analysis_dir = self.base_analysis_dir + ci.analysis_dir = os.path.join(self.base_analysis_dir, run_dir) + + # get flowcell from run_dir name + flowcell = _get_flowcell_from_rundir(run_dir) + + # Store ci object in dictionary + self.conf_info_dict[flowcell] = ci + + + # Launch the job in it's own thread and turn. + self.launchJob(run_dir, flowcell, ci) + return "started" + + + def pipelineFinished(self, run_dir): + # need to strip off self.watch_dir from rundir I suspect. + logging.info("pipeline finished in" + str(run_dir)) + #pattern = self.watch_dir + #if pattern[-1] != os.path.sep: + # pattern += os.path.sep + #stripped_run_dir = re.sub(pattern, "", run_dir) + #logging.debug("stripped to " + stripped_run_dir) + + # Notify each user that the run has finished. + if self.notify_users is not None: + for u in self.notify_users: + self.send(u, 'Pipeline run %s finished' % (run_dir)) + + #if self.notify_runner is not None: + # for r in self.notify_runner: + # self.rpc_send(r, (stripped_run_dir,), 'sequencingFinished') + + def reportMsg(self, msg): + + if self.notify_users is not None: + for u in self.notify_users: + self.send(u, msg) + + + def _runner(self, run_dir, flowcell, conf_info): + + # retrieve config step + cfg_filepath = os.path.join(conf_info.analysis_dir, + 'config32auto.txt') + status_retrieve_cfg = retrieve_config(conf_info, + flowcell, + cfg_filepath, + self.genome_dir) + if status_retrieve_cfg: + logging.info("Runner: Retrieve config: success") + self.reportMsg("Retrieve config (%s): success" % (run_dir)) + else: + logging.error("Runner: Retrieve config: failed") + self.reportMsg("Retrieve config (%s): FAILED" % (run_dir)) + + + # configure step + if status_retrieve_cfg: + status = configure(conf_info) + if status: + logging.info("Runner: Configure: success") + self.reportMsg("Configure (%s): success" % (run_dir)) + self.reportMsg( + os.linesep.join(glob(os.path.join(run_dir,'Data','C*'))) + ) + else: + logging.error("Runner: Configure: failed") + self.reportMsg("Configure (%s): FAILED" % (run_dir)) + + #if successful, continue + if status: + # Setup status cmdline status monitor + #startCmdLineStatusMonitor(ci) + + # running step + print 'Running pipeline now!' 
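+ # (run_pipeline, like retrieve_config and configure above, comes from the
+ # star-import of htsworkflow.pipeline.configure_run; note that only an
+ # exact True return counts as success below:)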
+ run_status = run_pipeline(conf_info) + if run_status is True: + logging.info('Runner: Pipeline: success') + self.reportMsg("Pipeline run (%s): Finished" % (run_dir,)) + else: + logging.info('Runner: Pipeline: failed') + self.reportMsg("Pipeline run (%s): FAILED" % (run_dir)) + + + def launchJob(self, run_dir, flowcell, conf_info): + """ + Starts up a thread for running the pipeline + """ + t = threading.Thread(target=self._runner, + args=[run_dir, flowcell, conf_info]) + t.setDaemon(True) + t.start() + + + +def main(args=None): + bot = Runner() + return bot.main(args) + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) + diff --git a/htsworkflow/automation/spoolwatcher.py b/htsworkflow/automation/spoolwatcher.py new file mode 100644 index 0000000..2a57535 --- /dev/null +++ b/htsworkflow/automation/spoolwatcher.py @@ -0,0 +1,229 @@ +#!/usr/bin/env python +import logging +import os +import re +import sys +import time +#import glob + +from htsworkflow.util import mount + +# this uses pyinotify +import pyinotify +from pyinotify import EventsCodes + +from benderjab import rpc + + +class WatcherEvents(object): + # two events need to be tracked + # one to send startCopy + # one to send OMG its broken + # OMG its broken needs to stop when we've seen enough + # cycles + # this should be per runfolder. + # read the xml files + def __init__(self): + pass + + +class Handler(pyinotify.ProcessEvent): + def __init__(self, watchmanager, bot): + self.last_event_time = None + self.watchmanager = watchmanager + self.bot = bot + + def process_IN_CREATE(self, event): + self.last_event_time = time.time() + msg = "Create: %s" % os.path.join(event.path, event.name) + if event.name.lower() == "run.completed": + try: + self.bot.sequencingFinished(event.path) + except IOError, e: + logging.error("Couldn't send sequencingFinished") + logging.debug(msg) + + def process_IN_DELETE(self, event): + logging.debug("Remove: %s" % os.path.join(event.path, event.name)) + + def process_IN_UNMOUNT(self, event): + pathname = os.path.join(event.path, event.name) + logging.debug("IN_UNMOUNT: %s" % (pathname,)) + self.bot.unmount_watch() + +class SpoolWatcher(rpc.XmlRpcBot): + """ + Watch a directory and send a message when another process is done writing. + + This monitors a directory tree using inotify (linux specific) and + after some files having been written will send a message after + seconds of no file writing. 
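+
+ (Concretely: Handler.process_IN_CREATE above stamps last_event_time on
+ every create event, and process_notify below calls startCopy once
+ time.time() - last_event_time exceeds write_timeout.)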
+ + (Basically when the solexa machine finishes dumping a round of data + this'll hopefully send out a message saying hey look theres data available + + """ + # these params need to be in the config file + # I wonder where I should put the documentation + #:Parameters: + # `watchdir` - which directory tree to monitor for modifications + # `profile` - specify which .htsworkflow profile to use + # `write_timeout` - how many seconds to wait for writes to finish to + # the spool + # `notify_timeout` - how often to timeout from notify + + def __init__(self, section=None, configfile=None): + #if configfile is None: + # self.configfile = "~/.htsworkflow" + super(SpoolWatcher, self).__init__(section, configfile) + + self.cfg['watchdir'] = None + self.cfg['write_timeout'] = 10 + self.cfg['notify_users'] = None + self.cfg['notify_runner'] = None + + self.notify_timeout = 0.001 + self.wm = pyinotify.WatchManager() + self.handler = Handler(self.wm, self) + self.notifier = pyinotify.Notifier(self.wm, self.handler) + self.wdd = None + self.mount_point = None + self.mounted = True + + self.notify_users = None + self.notify_runner = None + + self.eventTasks.append(self.process_notify) + + def read_config(self, section=None, configfile=None): + super(SpoolWatcher, self).read_config(section, configfile) + + self.watch_dir = self._check_required_option('watchdir') + self.write_timeout = int(self.cfg['write_timeout']) + + self.notify_users = self._parse_user_list(self.cfg['notify_users']) + try: + self.notify_runner = \ + self._parse_user_list(self.cfg['notify_runner'], + require_resource=True) + except bot.JIDMissingResource: + msg = 'need a full jabber ID + resource for xml-rpc destinations' + logging.FATAL(msg) + raise bot.JIDMissingResource(msg) + + def add_watch(self, watchdir=None): + """ + start watching watchdir or self.watch_dir + we're currently limited to watching one directory tree. + """ + # the one tree limit is mostly because self.wdd is a single item + # but managing it as a list might be a bit more annoying + if watchdir is None: + watchdir = self.watch_dir + logging.info("Watching:"+str(watchdir)) + + self.mount_point = mount.find_mount_point_for(watchdir) + + mask = EventsCodes.IN_CREATE | EventsCodes.IN_UNMOUNT + # rec traverses the tree and adds all the directories that are there + # at the start. + # auto_add will add in new directories as they are created + self.wdd = self.wm.add_watch(watchdir, mask, rec=True, auto_add=True) + + def unmount_watch(self): + if self.wdd is not None: + self.wm.rm_watch(self.wdd.values()) + self.wdd = None + self.mounted = False + + def process_notify(self, *args): + # process the queue of events as explained above + self.notifier.process_events() + #check events waits timeout + if self.notifier.check_events(self.notify_timeout): + # read notified events and enqeue them + self.notifier.read_events() + # should we do something? + # has something happened? + last_event_time = self.handler.last_event_time + if last_event_time is not None: + time_delta = time.time() - last_event_time + if time_delta > self.write_timeout: + self.startCopy() + self.handler.last_event_time = None + # handle unmounted filesystems + if not self.mounted: + if mount.is_mounted(self.mount_point): + # we've been remounted. Huzzah! 
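+ # (process_IN_UNMOUNT above dropped the watch and set self.mounted to
+ # False; once mount.is_mounted sees the mount point again, the watch
+ # tree gets rebuilt:)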
+ # restart the watch + self.add_watch() + self.mounted = True + logging.info( + "%s was remounted, restarting watch" % \ + (self.mount_point) + ) + + def _parser(self, msg, who): + """ + Parse xmpp chat messages + """ + help = u"I can send [copy] message, or squencer [finished]" + if re.match(u"help", msg): + reply = help + elif re.match("copy", msg): + self.startCopy() + reply = u"sent copy message" + elif re.match(u"finished", msg): + words = msg.split() + if len(words) == 2: + self.sequencingFinished(words[1]) + reply = u"sending sequencing finished for %s" % (words[1]) + else: + reply = u"need runfolder name" + else: + reply = u"I didn't understand '%s'" %(msg) + return reply + + def start(self, daemonize): + """ + Start application + """ + self.add_watch() + super(SpoolWatcher, self).start(daemonize) + + def stop(self): + """ + shutdown application + """ + # destroy the inotify's instance on this interrupt (stop monitoring) + self.notifier.stop() + super(SpoolWatcher, self).stop() + + def startCopy(self): + logging.debug("writes seem to have stopped") + if self.notify_runner is not None: + for r in self.notify_runner: + self.rpc_send(r, tuple(), 'startCopy') + + def sequencingFinished(self, run_dir): + # need to strip off self.watch_dir from rundir I suspect. + logging.info("run.completed in " + str(run_dir)) + pattern = self.watch_dir + if pattern[-1] != os.path.sep: + pattern += os.path.sep + stripped_run_dir = re.sub(pattern, "", run_dir) + logging.debug("stripped to " + stripped_run_dir) + if self.notify_users is not None: + for u in self.notify_users: + self.send(u, 'Sequencing run %s finished' % (stripped_run_dir)) + if self.notify_runner is not None: + for r in self.notify_runner: + self.rpc_send(r, (stripped_run_dir,), 'sequencingFinished') + +def main(args=None): + bot = SpoolWatcher() + return bot.main(args) + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) + diff --git a/htsworkflow/frontend/__init__.py b/htsworkflow/frontend/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/htsworkflow/frontend/eland_config/__init__.py b/htsworkflow/frontend/eland_config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/htsworkflow/frontend/eland_config/forms.py b/htsworkflow/frontend/eland_config/forms.py new file mode 100644 index 0000000..2488359 --- /dev/null +++ b/htsworkflow/frontend/eland_config/forms.py @@ -0,0 +1,163 @@ +from django import newforms as forms +from django.newforms.util import ErrorList + + +SPECIES_LIST = [#('--choose--', '--Choose--'), + ('hg18', 'Homo sapiens (Hg18)'), + ('Mm8', 'Mus musculus (Mm8)'), + ('arabv6', 'Arabadopsis Thaliana v6'), + ('other', 'Other species (Include in description)')] + + +class DivErrorList(ErrorList): + def __unicode__(self): + return self.as_divs() + + def as_divs(self): + if not self: return u'' + return u'
<div class="errorlist">%s</div>' % (''.join([u'<div class="error">%s</div>
' % e for e in self])) + + + +class ConfigForm(forms.Form): + + flow_cell_number = forms.CharField(min_length=2) + run_date = forms.DateTimeField() + advanced_run = forms.BooleanField(required=False) + read_length = forms.IntegerField(min_value=1, initial=32) + #eland_repeat = forms.BooleanField() + + #needs a for loop or something to allow for n configurations + #analysis_type = forms.ChoiceField(choices=[('eland','eland')]) + lane1_species = forms.ChoiceField(choices=SPECIES_LIST) + lane1_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) + + lane2_species = forms.ChoiceField(choices=SPECIES_LIST) + lane2_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) + + lane3_species = forms.ChoiceField(choices=SPECIES_LIST) + lane3_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) + + lane4_species = forms.ChoiceField(choices=SPECIES_LIST) + lane4_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) + + lane5_species = forms.ChoiceField(choices=SPECIES_LIST) + lane5_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) + + lane6_species = forms.ChoiceField(choices=SPECIES_LIST) + lane6_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) + + lane7_species = forms.ChoiceField(choices=SPECIES_LIST) + lane7_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) + + lane8_species = forms.ChoiceField(choices=SPECIES_LIST) + lane8_description = forms.CharField(widget=forms.TextInput(attrs={'size':'60'})) + + notes = forms.CharField(widget=forms.Textarea(attrs={'cols':'70'}), required=False) + + #lane_specific_read_length = forms.IntegerField(min_value=1) + + #eland_genome_lanes = forms.MultipleChoiceField(choices=[('lane1','1'), + # ('lane2','2'), + # ('lane3','3'), + # ('lane4','4'), + # ('lane5','5'), + # ('lane6','6'), + # ('lane7','7'), + # ('lane8','8') ]) + + #eland_genome = forms.ChoiceField(choices=) + + #use_bases_lanes = forms.MultipleChoiceField(choices=[('lane1','1'), + # ('lane2','2'), + # ('lane3','3'), + # ('lane4','4'), + # ('lane5','5'), + # ('lane6','6'), + # ('lane7','7'), + # ('lane8','8') ]) + + #use_bases_mask = forms.CharField() + + #sequence_format = forms.ChoiceField(choices=[('scarf', 'scarf')]) + + + + #subject = forms.CharField(max_length=100) + #message = forms.CharField() + #sender = forms.EmailField() + #cc_myself = forms.BooleanField() + + def as_custom(self): + """ + Displays customized html output + """ + html = [] + + fcn = self['flow_cell_number'] + + html.append(fcn.label_tag() + ': ' + str(fcn) + str(fcn.errors) + '
<br/>')
+
+ run_date = self['run_date']
+ html.append(run_date.label_tag() + ': ' + str(run_date) + str(run_date.errors) + '<br/>')
+
+ arun = self['advanced_run']
+ html.append(arun.label_tag() + ': ' + str(arun) + str(arun.errors) + '<br/>')
+
+ rl = self['read_length']
+ html.append(rl.label_tag() + ': ' + str(rl) + str(rl.errors) + '<br/><br/>')
+
+ html.append('<table>')
+ html.append('<tr><th>%s</th><th>%s</th><th>%s</th></tr>' \
+ % ('Lane', 'Species', 'Description'))
+
+ l1s = self['lane1_species']
+ l1d = self['lane1_description']
+ html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
+ % ('1', str(l1s), str(l1s.errors), str(l1d), str(l1d.errors)))
+
+ l2s = self['lane2_species']
+ l2d = self['lane2_description']
+ html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
+ % ('2', str(l2s), str(l2s.errors), str(l2d), str(l2d.errors)))
+
+ l3s = self['lane3_species']
+ l3d = self['lane3_description']
+ html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
+ % ('3', str(l3s), str(l3s.errors), str(l3d), str(l3d.errors)))
+
+ l4s = self['lane4_species']
+ l4d = self['lane4_description']
+ html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
+ % ('4', str(l4s), str(l4s.errors), str(l4d), str(l4d.errors)))
+
+ l5s = self['lane5_species']
+ l5d = self['lane5_description']
+ html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
+ % ('5', str(l5s), str(l5s.errors), str(l5d), str(l5d.errors)))
+
+ l6s = self['lane6_species']
+ l6d = self['lane6_description']
+ html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
+ % ('6', str(l6s), str(l6s.errors), str(l6d), str(l6d.errors)))
+
+ l7s = self['lane7_species']
+ l7d = self['lane7_description']
+ html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
+ % ('7', str(l7s), str(l7s.errors), str(l7d), str(l7d.errors)))
+
+ l8s = self['lane8_species']
+ l8d = self['lane8_description']
+ html.append('<tr><td>%s</td><td>%s %s</td><td>%s %s</td></tr>' \
+ % ('8', str(l8s), str(l8s.errors), str(l8d), str(l8d.errors)))
+
+ html.append('</table><br/>')
+
+ notes = self['notes']
+ html.append('<br/><br/>Notes:<br/><br/>')
+ html.append(' %s<br/>
' % (str(notes))) + + return '\n'.join(html) + + + \ No newline at end of file diff --git a/htsworkflow/frontend/eland_config/models.py b/htsworkflow/frontend/eland_config/models.py new file mode 100644 index 0000000..71a8362 --- /dev/null +++ b/htsworkflow/frontend/eland_config/models.py @@ -0,0 +1,3 @@ +from django.db import models + +# Create your models here. diff --git a/htsworkflow/frontend/eland_config/urls.py b/htsworkflow/frontend/eland_config/urls.py new file mode 100644 index 0000000..129f57c --- /dev/null +++ b/htsworkflow/frontend/eland_config/urls.py @@ -0,0 +1,10 @@ +from django.conf.urls.defaults import * + +urlpatterns = patterns('', + # Example: + + (r'^(?P\w+)/$', 'htsworkflow.frontend.eland_config.views.config'), + (r'^$', 'htsworkflow.frontend.eland_config.views.config'), + #(r'^$', 'htsworkflow.frontend.eland_config.views.index') + +) diff --git a/htsworkflow/frontend/eland_config/views.py b/htsworkflow/frontend/eland_config/views.py new file mode 100644 index 0000000..25edea9 --- /dev/null +++ b/htsworkflow/frontend/eland_config/views.py @@ -0,0 +1,413 @@ +from django.http import HttpResponse +from django.shortcuts import render_to_response +from django.core.exceptions import ObjectDoesNotExist + +from htsworkflow.frontend.eland_config import forms +from htsworkflow.frontend import settings +from htsworkflow.frontend.fctracker import models + +import os +import glob +# Create your views here. + + +def _validate_input(data): + #if data.find('..') == -1 or data.find('/') == -1 or data.find('\\') == -1: + return data.replace('..', '').replace('/', '_').replace('\\', '_') + +#def contact(request): +# if request.method == 'POST': +# form = ContactForm(request.POST) +# if form.is_valid(): +# # Do form processing here... +# return HttpResponseRedirect('/url/on_success/') +# else: +# form = ContactForm() +# return + + + +#def _saveConfigFile(form): +# """ +# Given a valid form, save eland config to file based on flowcell number. +# """ +# assert form.is_valid() +# +# clean_data = form.cleaned_data +# flowcell = clean_data['flow_cell_number'].replace('/','_').replace('..', '__') +# +# file_path = os.path.join(settings.UPLOADTO_CONFIG_FILE, flowcell) +# +# f = open(file_path, 'w') +# cfg = generateElandConfig(form) +# f.write(cfg) +# f.close() +# +# +#def _saveToDb(form): +# """ +# Save info to the database. 
+# """ +# clean_data = form.cleaned_data +# +# fc_id = clean_data['flow_cell_number'] +# +# try: +# fc = models.FlowCell.objects.get(flowcell_id=fc_id) +# except models.FlowCell.DoesNotExist: +# fc = models.FlowCell() +# +# fc.flowcell_id = fc_id +# fc.run_date = clean_data['run_date'] +# +# #LANE 1 +# fc.lane1_sample = clean_data['lane1_description'] +# species_name = clean_data['lane1_species'] +# try: +# specie = models.Specie.objects.get(scientific_name=species_name) +# except models.Specie.DoesNotExist: +# specie = models.Specie(scientific_name=species_name) +# specie.save() +# fc.lane1_species = specie +# +# #LANE 2 +# fc.lane2_sample = clean_data['lane2_description'] +# species_name = clean_data['lane2_species'] +# try: +# specie = models.Specie.objects.get(scientific_name=species_name) +# except models.Specie.DoesNotExist: +# specie = models.Specie(scientific_name=species_name) +# specie.save() +# fc.lane2_species = specie +# +# #LANE 3 +# fc.lane3_sample = clean_data['lane3_description'] +# species_name = clean_data['lane3_species'] +# try: +# specie = models.Specie.objects.get(scientific_name=species_name) +# except models.Specie.DoesNotExist: +# specie = models.Specie(scientific_name=species_name) +# specie.save() +# fc.lane3_species = specie +# +# #LANE 4 +# fc.lane4_sample = clean_data['lane4_description'] +# species_name = clean_data['lane4_species'] +# try: +# specie = models.Specie.objects.get(scientific_name=species_name) +# except models.Specie.DoesNotExist: +# specie = models.Specie(scientific_name=species_name) +# specie.save() +# fc.lane4_species = specie +# +# #LANE 5 +# fc.lane5_sample = clean_data['lane5_description'] +# species_name = clean_data['lane5_species'] +# try: +# specie = models.Specie.objects.get(scientific_name=species_name) +# except models.Specie.DoesNotExist: +# specie = models.Specie(scientific_name=species_name) +# specie.save() +# fc.lane5_species = specie +# +# #LANE 6 +# fc.lane6_sample = clean_data['lane6_description'] +# species_name = clean_data['lane6_species'] +# try: +# specie = models.Specie.objects.get(scientific_name=species_name) +# except models.Specie.DoesNotExist: +# specie = models.Specie(scientific_name=species_name) +# specie.save() +# fc.lane6_species = specie +# +# #LANE 7 +# fc.lane7_sample = clean_data['lane7_description'] +# species_name = clean_data['lane7_species'] +# try: +# specie = models.Specie.objects.get(scientific_name=species_name) +# except models.Specie.DoesNotExist: +# specie = models.Specie(scientific_name=species_name) +# specie.save() +# fc.lane7_species = specie +# +# #LANE 8 +# fc.lane8_sample = clean_data['lane8_description'] +# species_name = clean_data['lane8_species'] +# try: +# specie = models.Specie.objects.get(scientific_name=species_name) +# except models.Specie.DoesNotExist: +# specie = models.Specie(scientific_name=species_name) +# specie.save() +# fc.lane8_species = specie +# +# fc.notes = clean_data['notes'] +# +# fc.save() +# +# return fc +# +# +#def generateElandConfig(form): +# data = [] +# +# form = form.cleaned_data +# +# BASE_DIR = '/data-store01/compbio/genomes' +# +# data.append("# FLOWCELL: %s" % (form['flow_cell_number'])) +# data.append("#") +# +# notes = form['notes'].replace('\r\n', '\n').replace('\r', '\n') +# notes = notes.replace('\n', '\n# ') +# data.append("# NOTES:") +# data.append("# %s\n#" % (notes)) +# +# #Convert all newline conventions to unix style +# l1d = form['lane1_description'].replace('\r\n', '\n').replace('\r', '\n') +# l2d = 
form['lane2_description'].replace('\r\n', '\n').replace('\r', '\n') +# l3d = form['lane3_description'].replace('\r\n', '\n').replace('\r', '\n') +# l4d = form['lane4_description'].replace('\r\n', '\n').replace('\r', '\n') +# l5d = form['lane5_description'].replace('\r\n', '\n').replace('\r', '\n') +# l6d = form['lane6_description'].replace('\r\n', '\n').replace('\r', '\n') +# l7d = form['lane7_description'].replace('\r\n', '\n').replace('\r', '\n') +# l8d = form['lane8_description'].replace('\r\n', '\n').replace('\r', '\n') +# +# # Turn new lines into indented commented newlines +# l1d = l1d.replace('\n', '\n# ') +# l2d = l2d.replace('\n', '\n# ') +# l3d = l3d.replace('\n', '\n# ') +# l4d = l4d.replace('\n', '\n# ') +# l5d = l5d.replace('\n', '\n# ') +# l6d = l6d.replace('\n', '\n# ') +# l7d = l7d.replace('\n', '\n# ') +# l8d = l8d.replace('\n', '\n# ') +# +# data.append("# Lane1: %s" % (l1d)) +# data.append("# Lane2: %s" % (l2d)) +# data.append("# Lane3: %s" % (l3d)) +# data.append("# Lane4: %s" % (l4d)) +# data.append("# Lane5: %s" % (l5d)) +# data.append("# Lane6: %s" % (l6d)) +# data.append("# Lane7: %s" % (l7d)) +# data.append("# Lane8: %s" % (l8d)) +# +# #data.append("GENOME_DIR %s" % (BASE_DIR)) +# #data.append("CONTAM_DIR %s" % (BASE_DIR)) +# read_length = form['read_length'] +# data.append("READ_LENGTH %d" % (read_length)) +# #data.append("ELAND_REPEAT") +# data.append("ELAND_MULTIPLE_INSTANCES 8") +# +# #Construct genome dictionary to figure out what lanes to put +# # in the config file. +# genome_dict = {} +# l1s = form['lane1_species'] +# genome_dict.setdefault(l1s, []).append('1') +# l2s = form['lane2_species'] +# genome_dict.setdefault(l2s, []).append('2') +# l3s = form['lane3_species'] +# genome_dict.setdefault(l3s, []).append('3') +# l4s = form['lane4_species'] +# genome_dict.setdefault(l4s, []).append('4') +# l5s = form['lane5_species'] +# genome_dict.setdefault(l5s, []).append('5') +# l6s = form['lane6_species'] +# genome_dict.setdefault(l6s, []).append('6') +# l7s = form['lane7_species'] +# genome_dict.setdefault(l7s, []).append('7') +# l8s = form['lane8_species'] +# genome_dict.setdefault(l8s, []).append('8') +# +# genome_list = genome_dict.keys() +# genome_list.sort() +# +# #Loop through and create entries for each species. +# for genome in genome_list: +# lanes = ''.join(genome_dict[genome]) +# data.append('%s:ANALYSIS eland' % (lanes)) +# data.append('%s:READ_LENGTH %s' % (lanes, read_length)) +# data.append('%s:ELAND_GENOME %s' % (lanes, os.path.join(BASE_DIR, genome))) +# data.append('%s:USE_BASES %s' % (lanes, 'Y'*int(read_length))) +# +# data.append('SEQUENCE_FORMAT --scarf') +# +# return '\n'.join(data) + + +def getElandConfig(flowcell, regenerate=False): + + file_path = os.path.join(settings.UPLOADTO_CONFIG_FILE, flowcell) + + #If we are regenerating the config file, skip + # reading of existing file. If the file doesn't + # exist, try to generate it form the DB. 
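+ # (that is: the copy cached under settings.UPLOADTO_CONFIG_FILE is
+ # returned as-is unless regenerate=True, in which case a fresh config is
+ # rebuilt from the FlowCell record and written back at the end of this
+ # function.)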
+ if not regenerate and os.path.isfile(file_path): + f = open(file_path, 'r') + data = f.read() + f.close() + return data + + try: + fcObj = models.FlowCell.objects.get(flowcell_id__iexact=flowcell) + except ObjectDoesNotExist: + return None + + data = [] + + #form = form.cleaned_data + + BASE_DIR = '/data-store01/compbio/genomes' + + data.append("# FLOWCELL: %s" % (fcObj.flowcell_id)) + data.append("#") + + notes = fcObj.notes.replace('\r\n', '\n').replace('\r', '\n') + notes = notes.replace('\n', '\n# ') + data.append("# NOTES:") + data.append("# %s\n#" % (notes)) + + #Convert all newline conventions to unix style + l1d = str(fcObj.lane_1_library.library_id) + '|' \ + + fcObj.lane_1_library.library_name.replace('\r\n', '\n').replace('\r', '\n') + l2d = str(fcObj.lane_2_library.library_id) + '|' \ + + fcObj.lane_2_library.library_name.replace('\r\n', '\n').replace('\r', '\n') + l3d = str(fcObj.lane_3_library.library_id) + '|' \ + + fcObj.lane_3_library.library_name.replace('\r\n', '\n').replace('\r', '\n') + l4d = str(fcObj.lane_4_library.library_id) + '|' \ + + fcObj.lane_4_library.library_name.replace('\r\n', '\n').replace('\r', '\n') + + l5d = str(fcObj.lane_5_library.library_id) + '|' \ + + fcObj.lane_5_library.library_name.replace('\r\n', '\n').replace('\r', '\n') + l6d = str(fcObj.lane_6_library.library_id) + '|' \ + + fcObj.lane_6_library.library_name.replace('\r\n', '\n').replace('\r', '\n') + l7d = str(fcObj.lane_7_library.library_id) + '|' \ + + fcObj.lane_7_library.library_name.replace('\r\n', '\n').replace('\r', '\n') + l8d = str(fcObj.lane_8_library.library_id) + '|' \ + + fcObj.lane_8_library.library_name.replace('\r\n', '\n').replace('\r', '\n') + + # Turn new lines into indented commented newlines + l1d = l1d.replace('\n', '\n# ') + l2d = l2d.replace('\n', '\n# ') + l3d = l3d.replace('\n', '\n# ') + l4d = l4d.replace('\n', '\n# ') + l5d = l5d.replace('\n', '\n# ') + l6d = l6d.replace('\n', '\n# ') + l7d = l7d.replace('\n', '\n# ') + l8d = l8d.replace('\n', '\n# ') + + data.append("# Lane1: %s" % (l1d)) + data.append("# Lane2: %s" % (l2d)) + data.append("# Lane3: %s" % (l3d)) + data.append("# Lane4: %s" % (l4d)) + data.append("# Lane5: %s" % (l5d)) + data.append("# Lane6: %s" % (l6d)) + data.append("# Lane7: %s" % (l7d)) + data.append("# Lane8: %s" % (l8d)) + + #data.append("GENOME_DIR %s" % (BASE_DIR)) + #data.append("CONTAM_DIR %s" % (BASE_DIR)) + read_length = fcObj.read_length + data.append("READ_LENGTH %d" % (read_length)) + #data.append("ELAND_REPEAT") + data.append("ELAND_MULTIPLE_INSTANCES 8") + + #Construct genome dictionary to figure out what lanes to put + # in the config file. 
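+ # (for example, mouse in lanes 1-4 and human in lanes 5-8 builds
+ # {'Mus musculus': ['1', '2', '3', '4'],
+ #  'Homo sapiens': ['5', '6', '7', '8']},
+ # giving one lane-prefixed block of ELAND settings per species below:)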
+ genome_dict = {} + + #l1s = form['lane1_species'] + l1s = fcObj.lane_1_library.library_species.scientific_name #+ '|' + \ + #fcObj.lane_1_library.library_species.use_genome_build + genome_dict.setdefault(l1s, []).append('1') + l2s = fcObj.lane_2_library.library_species.scientific_name #+ '|' + \ + #fcObj.lane_2_library.library_species.use_genome_build + genome_dict.setdefault(l2s, []).append('2') + l3s = fcObj.lane_3_library.library_species.scientific_name #+ '|' + \ + #fcObj.lane_3_library.library_species.use_genome_build + genome_dict.setdefault(l3s, []).append('3') + l4s = fcObj.lane_4_library.library_species.scientific_name #+ '|' + \ + #fcObj.lane_4_library.library_species.use_genome_build + genome_dict.setdefault(l4s, []).append('4') + l5s = fcObj.lane_5_library.library_species.scientific_name #+ '|' + \ + #fcObj.lane_5_library.library_species.use_genome_build + genome_dict.setdefault(l5s, []).append('5') + l6s = fcObj.lane_6_library.library_species.scientific_name #+ '|' + \ + #fcObj.lane_6_library.library_species.use_genome_build + genome_dict.setdefault(l6s, []).append('6') + l7s = fcObj.lane_7_library.library_species.scientific_name #+ '|' + \ + #fcObj.lane_7_library.library_species.use_genome_build + genome_dict.setdefault(l7s, []).append('7') + l8s = fcObj.lane_8_library.library_species.scientific_name #+ '|' + \ + #fcObj.lane_8_library.library_species.use_genome_build + genome_dict.setdefault(l8s, []).append('8') + + genome_list = genome_dict.keys() + genome_list.sort() + + #Loop through and create entries for each species. + for genome in genome_list: + lanes = ''.join(genome_dict[genome]) + data.append('%s:ANALYSIS eland' % (lanes)) + data.append('%s:READ_LENGTH %s' % (lanes, read_length)) + data.append('%s:ELAND_GENOME %s' % (lanes, '%%(%s)s' % (genome))) + data.append('%s:USE_BASES %s' % (lanes, 'Y'*int(read_length))) + + data.append('SEQUENCE_FORMAT --scarf') + + data = '\n'.join(data) + + f = open(file_path, 'w') + f.write(data) + f.close() + + return data + + + +def config(request, flowcell=None): + """ + Returns eland config file for a given flowcell number, + or returns a list of available flowcell numbers. + """ + + # Provide INDEX of available Flowcell config files. + if flowcell is None: + #Find all FC* config files and report an index html file + #fc_list = [ os.path.split(file_path)[1] for file_path in glob.glob(os.path.join(settings.UPLOADTO_CONFIG_FILE, 'FC*')) ] + fc_list = [ fc.flowcell_id for fc in models.FlowCell.objects.all() ] + + #Convert FC* list to html links + fc_html = [ '%s' % (fc_name, fc_name) for fc_name in fc_list ] + + return HttpResponse('
'.join(fc_html)) + + #FIXME: Should validate flowcell input before using. + flowcell = _validate_input(flowcell) + cfg = getElandConfig(flowcell, regenerate=True) + + if not cfg: + return HttpResponse("Hmm, config file for %s does not seem to exist." % (flowcell)) + + + return HttpResponse(cfg, mimetype="text/plain") + + + + +#def index(request): +# """ +# Return a form for filling out information about the flowcell +# """ +# if request.method == 'POST': +# form = forms.ConfigForm(request.POST, error_class=forms.DivErrorList) +# if form.is_valid(): +# #cfg = generateElandConfig(form) +# _saveConfigFile(form) +# _saveToDb(form) +# return HttpResponse("Eland Config Saved!", mimetype="text/plain") +# else: +# return render_to_response('config_form.html', {'form': form }) +# +# else: +# fm = forms.ConfigForm(error_class=forms.DivErrorList) +# return render_to_response('config_form.html', {'form': fm }) diff --git a/htsworkflow/frontend/fctracker/__init__.py b/htsworkflow/frontend/fctracker/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/htsworkflow/frontend/fctracker/models.py b/htsworkflow/frontend/fctracker/models.py new file mode 100644 index 0000000..e12f0fe --- /dev/null +++ b/htsworkflow/frontend/fctracker/models.py @@ -0,0 +1,283 @@ +from django.db import models +from django.contrib.auth.models import User +from htsworkflow.frontend import settings + +# Create your models here. + +class Antibody(models.Model): + antigene = models.CharField(max_length=500, db_index=True) + catalog = models.CharField(max_length=50, unique=True, db_index=True) + antibodies = models.CharField(max_length=500, db_index=True) + source = models.CharField(max_length=500, blank=True, db_index=True) + biology = models.TextField(blank=True) + notes = models.TextField(blank=True) + def __str__(self): + return '%s - %s (%s)' % (self.antigene, self.antibodies, self.catalog) + class Meta: + verbose_name_plural = "antibodies" + ordering = ["antigene"] + class Admin: + list_display = ('antigene','antibodies','catalog','source','biology','notes') + list_filter = ('antibodies','source') + fields = ( + (None, { + 'fields': (('antigene','antibodies'),('catalog','source'),('biology'),('notes')) + }), + ) + +class Cellline(models.Model): + cellline_name = models.CharField(max_length=100, unique=True, db_index=True) + notes = models.TextField(blank=True) + def __str__(self): + return '%s' % (self.cellline_name) + + class Meta: + ordering = ["cellline_name"] + + class Admin: + fields = ( + (None, { + 'fields': (('cellline_name'),('notes'),) + }), + ) + +class Condition(models.Model): + condition_name = models.CharField(max_length=2000, unique=True, db_index=True) + notes = models.TextField(blank=True) + def __str__(self): + return '%s' % (self.condition_name) + + class Meta: + ordering = ["condition_name"] + + class Admin: + fields = ( + (None, { + 'fields': (('condition_name'),('notes'),) + }), + ) + +class Species(models.Model): + + scientific_name = models.CharField(max_length=256, unique=False, db_index=True, core=True) + common_name = models.CharField(max_length=256, blank=True) + use_genome_build = models.CharField(max_length=100, blank=False, null=False) + + def __str__(self): + return '%s (%s)|%s' % (self.scientific_name, self.common_name, self.use_genome_build) + + class Meta: + verbose_name_plural = "species" + ordering = ["scientific_name"] + + class Admin: + fields = ( + (None, { + 'fields': (('scientific_name', 'common_name'), ('use_genome_build')) + }), + ) + +class Lab(models.Model): + + name 
= models.CharField(max_length=100, blank=False, unique=True) + + def __str__(self): + return self.name + + class Admin: + pass + +class UserProfile(models.Model): + + # This allows you to use user.get_profile() to get this object + user = models.ForeignKey(User, unique=True) + + lab = models.ForeignKey(Lab) + #email = models.CharField(max_length=50, blank=True, null=True) + + def __str__(self): + return '%s (%s lab)' % (self.user, self.lab) + + class Meta: + #verbose_name_plural = "people" + #ordering = ["lab"] + pass + + class Admin: + #fields = ( + # (None, { + # 'fields': (('email', 'lab'), ('email')) + # }), + #) + pass + + +class Library(models.Model): + + library_id = models.CharField(max_length=30, primary_key=True, db_index=True, core=True) + library_name = models.CharField(max_length=100, unique=True, core=True) + library_species = models.ForeignKey(Species, core=True) + cell_line = models.ForeignKey(Cellline,core=True) + condition = models.ForeignKey(Condition,core=True) + antibody = models.ForeignKey(Antibody,blank=True,null=True,core=True) + + EXPERIMENT_TYPES = ( + ('INPUT_RXLCh','INPUT_RXLCh'), + ('ChIP-seq', 'ChIP-seq'), + ('Sheared', 'Sheared'), + ('RNA-seq', 'RNA-seq'), + ('Methyl-seq', 'Methyl-seq'), + ('DIP-seq', 'DIP-seq'), + ) + experiment_type = models.CharField(max_length=50, choices=EXPERIMENT_TYPES, + default='RNA-seq') + + creation_date = models.DateField(blank=True, null=True) + made_for = models.ForeignKey(User) + made_by = models.CharField(max_length=50, blank=True, default="Lorian") + + PROTOCOL_END_POINTS = ( + ('?', 'Unknown'), + ('Sample', 'Raw sample'), + ('Progress', 'In progress'), + ('1A', 'Ligation, then gel'), + ('PCR', 'Ligation, then PCR'), + ('1Ab', 'Ligation, PCR, then gel'), + ('1Aa', 'Ligation, gel, then PCR'), + ('2A', 'Ligation, PCR, gel, PCR'), + ('Done', 'Completed'), + ) + stopping_point = models.CharField(max_length=25, choices=PROTOCOL_END_POINTS, default='Done') + amplified_from_sample = models.ForeignKey('self', blank=True, null=True) + + undiluted_concentration = models.DecimalField("Undiluted concentration (ng/ul)", max_digits=5, decimal_places=2, default=0, blank=True, null=True) + successful_pM = models.DecimalField(max_digits=5, decimal_places=2, blank=True, null=True) + ten_nM_dilution = models.BooleanField() + avg_lib_size = models.IntegerField(default=225, blank=True, null=True) + notes = models.TextField(blank=True) + + def __str__(self): + return '#%s: %s' % (self.library_id, self.library_name) + + class Meta: + verbose_name_plural = "libraries" + ordering = ["-library_id"] + + class Admin: + date_hierarchy = "creation_date" + save_as = True + save_on_top = True + search_fields = ['library_name', 'library_id'] + list_display = ('library_id', 'library_name', 'made_for', 'creation_date', 'stopping_point') + list_display_links = ('library_id', 'library_name') + list_filter = ('stopping_point', 'library_species', 'made_for', 'made_by', 'experiment_type') + fields = ( + (None, { + 'fields': (('library_id', 'library_name'), ('library_species', 'experiment_type'),) + }), + ('Creation Information:', { + 'fields' : (('made_for', 'made_by', 'creation_date'), ('stopping_point', 'amplified_from_sample'), ('undiluted_concentration', 'library_size'), 'notes',) + }), + ('Run Information:', { + 'fields' : (('ten_nM_dilution','successful_pM'),) + }), + ) + +class FlowCell(models.Model): + + flowcell_id = models.CharField(max_length=20, unique=True, db_index=True, core=True) + run_date = models.DateTimeField(core=True) + advanced_run = 
models.BooleanField(default=False) + read_length = models.IntegerField(default=32) + + + FLOWCELL_STATUSES = ( + ('No', 'Not run'), + ('F', 'Failed'), + ('Del', 'Data deleted'), + ('A', 'Data available'), + ('In', 'In progress'), + ) + flowcell_status = models.CharField(max_length=10, choices=FLOWCELL_STATUSES) + + lane_1_library = models.ForeignKey(Library, related_name="lane_1_library") + lane_2_library = models.ForeignKey(Library, related_name="lane_2_library") + lane_3_library = models.ForeignKey(Library, related_name="lane_3_library") + lane_4_library = models.ForeignKey(Library, related_name="lane_4_library") + lane_5_library = models.ForeignKey(Library, related_name="lane_5_library") + lane_6_library = models.ForeignKey(Library, related_name="lane_6_library") + lane_7_library = models.ForeignKey(Library, related_name="lane_7_library") + lane_8_library = models.ForeignKey(Library, related_name="lane_8_library") + + lane_1_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) + lane_2_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) + lane_3_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) + lane_4_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) + lane_5_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) + lane_6_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) + lane_7_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) + lane_8_pM = models.DecimalField(max_digits=5, decimal_places=2, default=4) + + lane_1_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) + lane_2_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) + lane_3_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) + lane_4_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) + lane_5_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) + lane_6_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) + lane_7_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) + lane_8_cluster_estimate = models.CharField(max_length=25, blank=True, null=True) + + kit_1000148 = models.IntegerField(blank=True, null=True) + kit_1000147 = models.IntegerField(blank=True, null=True) + kit_1000183 = models.IntegerField(blank=True, null=True) + kit_1001625 = models.IntegerField(blank=True, null=True) + + cluster_station_id = models.CharField(max_length=50, blank=True, null=True) + sequencer_id = models.CharField(max_length=50, blank=True, null=True) + + notes = models.TextField(blank=True) + + def __str__(self): + return '%s (%s)' % (self.flowcell_id, self.run_date) + + class Meta: + ordering = ["-run_date"] + + class Admin: + date_hierarchy = "run_date" + save_as = True + save_on_top = True + search_fields = ['flowcell_id', 'lane_1_library__library_id', 'lane_1_library__library_name', 'lane_2_library__library_id', 'lane_2_library__library_name', 'lane_3_library__library_id', 'lane_3_library__library_name', 'lane_4_library__library_id', 'lane_4_library__library_name', 'lane_5_library__library_id', 'lane_5_library__library_name', 'lane_6_library__library_id', 'lane_6_library__library_name', 'lane_7_library__library_id', 'lane_7_library__library_name', 'lane_8_library__library_id', 'lane_8_library__library_name'] + list_display = ('run_date', 'flowcell_status', 'flowcell_id', 'lane_1_library', 'lane_2_library', 'lane_3_library', 'lane_4_library', 'lane_5_library', 'lane_6_library', 
'lane_7_library', 'lane_8_library') + list_display_links = ('run_date', 'flowcell_id', 'lane_1_library', 'lane_2_library', 'lane_3_library', 'lane_4_library', 'lane_5_library', 'lane_6_library', 'lane_7_library', 'lane_8_library') + fields = ( + (None, { + 'fields': ('run_date', ('flowcell_id', 'flowcell_status'), ('read_length', 'advanced_run'),) + }), + ('Lanes:', { + 'fields' : (('lane_1_library', 'lane_1_pM'), ('lane_2_library', 'lane_2_pM'), ('lane_3_library', 'lane_3_pM'), ('lane_4_library', 'lane_4_pM'), ('lane_5_library', 'lane_5_pM'), ('lane_6_library', 'lane_6_pM'), ('lane_7_library', 'lane_7_pM'), ('lane_8_library', 'lane_8_pM'),) + }), + (None, { + 'fields' : ('notes',) + }), + ('Kits & Machines:', { + 'classes': 'collapse', + 'fields' : (('kit_1000148', 'kit_1000147', 'kit_1000183', 'kit_1001625'), ('cluster_station_id', 'sequencer_id'),) + }), + ('Cluster Estimates:', { + 'classes': 'collapse', + 'fields' : (('lane_1_cluster_estimate', 'lane_2_cluster_estimate'), ('lane_3_cluster_estimate', 'lane_4_cluster_estimate'), ('lane_5_cluster_estimate', 'lane_6_cluster_estimate'), ('lane_7_cluster_estimate', 'lane_8_cluster_estimate',),) + }), + ) + +# Did not finish implementing, removing to avoid further confusion. +#class ElandResult(models.Model): +# +# class Admin: pass +# +# flow_cell = models.ForeignKey(FlowCell) +# config_file = models.FileField(upload_to=settings.UPLOADTO_CONFIG_FILE) +# eland_result_pack = models.FileField(upload_to=settings.UPLOADTO_ELAND_RESULT_PACKS) +# bed_file_pack = models.FileField(upload_to=settings.UPLOADTO_BED_PACKS) +# +# notes = models.TextField(blank=True) diff --git a/htsworkflow/frontend/fctracker/views.py b/htsworkflow/frontend/fctracker/views.py new file mode 100644 index 0000000..2299e4f --- /dev/null +++ b/htsworkflow/frontend/fctracker/views.py @@ -0,0 +1 @@ +# Create your views here. \ No newline at end of file diff --git a/htsworkflow/frontend/manage.py b/htsworkflow/frontend/manage.py new file mode 100644 index 0000000..5e78ea9 --- /dev/null +++ b/htsworkflow/frontend/manage.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +from django.core.management import execute_manager +try: + import settings # Assumed to be in the same directory. +except ImportError: + import sys + sys.stderr.write("Error: Can't find the file 'settings.py' in the directory containing %r. It appears you've customized things.\nYou'll have to run django-admin.py, passing it your settings module.\n(If the file settings.py does indeed exist, it's causing an ImportError somehow.)\n" % __file__) + sys.exit(1) + +if __name__ == "__main__": + execute_manager(settings) diff --git a/htsworkflow/frontend/settings.py b/htsworkflow/frontend/settings.py new file mode 100644 index 0000000..c9bee20 --- /dev/null +++ b/htsworkflow/frontend/settings.py @@ -0,0 +1,94 @@ +import os + +# Django settings for elandifier project. + +DEBUG = True +TEMPLATE_DEBUG = DEBUG + +ADMINS = ( + # ('Your Name', 'your_email@domain.com'), +) + +MANAGERS = ADMINS + +DATABASE_ENGINE = 'sqlite3' # 'postgresql_psycopg2', 'postgresql', 'mysql', 'sqlite3' or 'ado_mssql'. +DATABASE_NAME = os.path.abspath('../../fctracker.db') # Or path to database file if using sqlite3. +DATABASE_USER = '' # Not used with sqlite3. +DATABASE_PASSWORD = '' # Not used with sqlite3. +DATABASE_HOST = '' # Set to empty string for localhost. Not used with sqlite3. +DATABASE_PORT = '' # Set to empty string for default. Not used with sqlite3. + +# Local time zone for this installation. 
Choices can be found here: +# http://www.postgresql.org/docs/8.1/static/datetime-keywords.html#DATETIME-TIMEZONE-SET-TABLE +# although not all variations may be possible on all operating systems. +# If running in a Windows environment this must be set to the same as your +# system time zone. +TIME_ZONE = 'America/Los_Angeles' + +# Language code for this installation. All choices can be found here: +# http://www.w3.org/TR/REC-html40/struct/dirlang.html#langcodes +# http://blogs.law.harvard.edu/tech/stories/storyReader$15 +LANGUAGE_CODE = 'en-us' + +SITE_ID = 1 + +# If you set this to False, Django will make some optimizations so as not +# to load the internationalization machinery. +USE_I18N = True + +# Absolute path to the directory that holds media. +# Example: "/home/media/media.lawrence.com/" +MEDIA_ROOT = '' + +# URL that handles the media served from MEDIA_ROOT. +# Example: "http://media.lawrence.com" +MEDIA_URL = '' + +# URL prefix for admin media -- CSS, JavaScript and images. Make sure to use a +# trailing slash. +# Examples: "http://foo.com/media/", "/media/". +ADMIN_MEDIA_PREFIX = '/media/' + +# Make this unique, and don't share it with anybody. +SECRET_KEY = '(ekv^=gf(j9f(x25@a7r+8)hqlz%&_1!tw^75l%^041#vi=@4n' + +# List of callables that know how to import templates from various sources. +TEMPLATE_LOADERS = ( + 'django.template.loaders.filesystem.load_template_source', + 'django.template.loaders.app_directories.load_template_source', +# 'django.template.loaders.eggs.load_template_source', +) + +MIDDLEWARE_CLASSES = ( + 'django.middleware.common.CommonMiddleware', + 'django.contrib.sessions.middleware.SessionMiddleware', + 'django.contrib.auth.middleware.AuthenticationMiddleware', + 'django.middleware.doc.XViewMiddleware', +) + +ROOT_URLCONF = 'htsworkflow.frontend.urls' + +TEMPLATE_DIRS = ( + # Put strings here, like "/home/html/django_templates" or "C:/www/django/templates". + # Always use forward slashes, even on Windows. + # Don't forget to use absolute paths, not relative paths. 
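Worth flagging for the relative paths in this settings file (DATABASE_NAME above and the template directory just below): os.path.abspath() resolves against the current working directory, so these values change depending on where manage.py is launched from. A sketch of the common alternative, anchoring paths to the settings module itself; the two-levels-up layout is an assumption for illustration, not part of this commit:

    import os
    # Resolved relative to this file rather than the process CWD.
    SETTINGS_DIR = os.path.dirname(os.path.abspath(__file__))
    DATABASE_NAME = os.path.join(SETTINGS_DIR, '..', '..', 'fctracker.db')
    TEMPLATE_DIRS = (
        os.path.join(SETTINGS_DIR, '..', '..', 'templates'),
    )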
+ os.path.abspath("../../templates"), +) + +INSTALLED_APPS = ( + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.sites', + 'htsworkflow.frontend.eland_config', + 'htsworkflow.frontend.fctracker', + 'django.contrib.databrowse', +) + +# Project specific settings +UPLOADTO_HOME = os.path.abspath('../../uploads') +UPLOADTO_CONFIG_FILE = os.path.join(UPLOADTO_HOME, 'eland_config') +UPLOADTO_ELAND_RESULT_PACKS = os.path.join(UPLOADTO_HOME, 'eland_results') +UPLOADTO_BED_PACKS = os.path.join(UPLOADTO_HOME, 'bed_packs') + diff --git a/htsworkflow/frontend/urls.py b/htsworkflow/frontend/urls.py new file mode 100644 index 0000000..a791b85 --- /dev/null +++ b/htsworkflow/frontend/urls.py @@ -0,0 +1,16 @@ +from django.conf.urls.defaults import * + +# Databrowser: +from django.contrib import databrowse +from fctracker.models import Library, FlowCell +databrowse.site.register(Library) +databrowse.site.register(FlowCell) + +urlpatterns = patterns('', + # Base: + (r'^eland_config/', include('htsworkflow.frontend.eland_config.urls')), + # Admin: + (r'^admin/', include('django.contrib.admin.urls')), + # Databrowser: + (r'^databrowse/(.*)', databrowse.site.root), +) diff --git a/htsworkflow/pipeline/__init__.py b/htsworkflow/pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/htsworkflow/pipeline/bustard.py b/htsworkflow/pipeline/bustard.py new file mode 100644 index 0000000..4e268f2 --- /dev/null +++ b/htsworkflow/pipeline/bustard.py @@ -0,0 +1,146 @@ + +from datetime import date +from glob import glob +import logging +import os +import time +import re + +from htsworkflow.pipeline.runfolder import \ + ElementTree, \ + VERSION_RE, \ + EUROPEAN_STRPTIME + +class Phasing(object): + PHASING = 'Phasing' + PREPHASING = 'Prephasing' + + def __init__(self, fromfile=None, xml=None): + self.lane = None + self.phasing = None + self.prephasing = None + + if fromfile is not None: + self._initialize_from_file(fromfile) + elif xml is not None: + self.set_elements(xml) + + def _initialize_from_file(self, pathname): + path, name = os.path.split(pathname) + basename, ext = os.path.splitext(name) + # the last character of the param base filename should be the + # lane number + tree = ElementTree.parse(pathname).getroot() + self.set_elements(tree) + self.lane = int(basename[-1]) + + def get_elements(self): + root = ElementTree.Element(Phasing.PHASING, {'lane': str(self.lane)}) + phasing = ElementTree.SubElement(root, Phasing.PHASING) + phasing.text = str(self.phasing) + prephasing = ElementTree.SubElement(root, Phasing.PREPHASING) + prephasing.text = str(self.prephasing) + return root + + def set_elements(self, tree): + if tree.tag not in ('Phasing', 'Parameters'): + raise ValueError('exptected Phasing or Parameters') + lane = tree.attrib.get('lane', None) + if lane is not None: + self.lane = int(lane) + for element in list(tree): + if element.tag == Phasing.PHASING: + self.phasing = float(element.text) + elif element.tag == Phasing.PREPHASING: + self.prephasing = float(element.text) + +class Bustard(object): + XML_VERSION = 1 + + # Xml Tags + BUSTARD = 'Bustard' + SOFTWARE_VERSION = 'version' + DATE = 'run_time' + USER = 'user' + PARAMETERS = 'Parameters' + + def __init__(self, xml=None): + self.version = None + self.date = date.today() + self.user = None + self.phasing = {} + + if xml is not None: + self.set_elements(xml) + + def _get_time(self): + return time.mktime(self.date.timetuple()) + time = property(_get_time, 
doc='return run time as seconds since epoch') + + def dump(self): + print "Bustard version:", self.version + print "Run date", self.date + print "user:", self.user + for lane, tree in self.phasing.items(): + print lane + print tree + + def get_elements(self): + root = ElementTree.Element('Bustard', + {'version': str(Bustard.XML_VERSION)}) + version = ElementTree.SubElement(root, Bustard.SOFTWARE_VERSION) + version.text = self.version + run_date = ElementTree.SubElement(root, Bustard.DATE) + run_date.text = str(self.time) + user = ElementTree.SubElement(root, Bustard.USER) + user.text = self.user + params = ElementTree.SubElement(root, Bustard.PARAMETERS) + for p in self.phasing.values(): + params.append(p.get_elements()) + return root + + def set_elements(self, tree): + if tree.tag != Bustard.BUSTARD: + raise ValueError('Expected "Bustard" SubElements') + xml_version = int(tree.attrib.get('version', 0)) + if xml_version > Bustard.XML_VERSION: + logging.warn('Bustard XML tree is a higher version than this class') + for element in list(tree): + if element.tag == Bustard.SOFTWARE_VERSION: + self.version = element.text + elif element.tag == Bustard.DATE: + self.date = date.fromtimestamp(float(element.text)) + elif element.tag == Bustard.USER: + self.user = element.text + elif element.tag == Bustard.PARAMETERS: + for param in element: + p = Phasing(xml=param) + self.phasing[p.lane] = p + else: + raise ValueError("Unrecognized tag: %s" % (element.tag,)) + + + +def bustard(pathname): + """ + Construct a Bustard object from pathname + """ + b = Bustard() + path, name = os.path.split(pathname) + groups = name.split("_") + version = re.search(VERSION_RE, groups[0]) + b.version = version.group(1) + t = time.strptime(groups[1], EUROPEAN_STRPTIME) + b.date = date(*t[0:3]) + b.user = groups[2] + paramfiles = glob(os.path.join(pathname, "params?.xml")) + for paramfile in paramfiles: + phasing = Phasing(paramfile) + assert (phasing.lane >= 1 and phasing.lane <= 8) + b.phasing[phasing.lane] = phasing + return b + +def fromxml(tree): + b = Bustard() + b.set_elements(tree) + return b diff --git a/htsworkflow/pipeline/configure_run.py b/htsworkflow/pipeline/configure_run.py new file mode 100644 index 0000000..d541de3 --- /dev/null +++ b/htsworkflow/pipeline/configure_run.py @@ -0,0 +1,606 @@ +#!/usr/bin/python +import subprocess +import logging +import time +import re +import os + +from htsworkflow.pipeline.retrieve_config import getCombinedOptions, saveConfigFile +from htsworkflow.pipeline.retrieve_config import FlowCellNotFound, WebError404 +from htsworkflow.pipeline.genome_mapper import DuplicateGenome, getAvailableGenomes, constructMapperDict +from htsworkflow.pipeline.run_status import GARunStatus + +from pyinotify import WatchManager, ThreadedNotifier +from pyinotify import EventsCodes, ProcessEvent + +class ConfigInfo: + + def __init__(self): + #run_path = firecrest analysis directory to run analysis from + self.run_path = None + self.bustard_path = None + self.config_filepath = None + self.status = None + + #top level directory where all analyses are placed + self.base_analysis_dir = None + #analysis_dir, top level analysis dir... 
+ # base_analysis_dir + '/070924_USI-EAS44_0022_FC12150' + self.analysis_dir = None + + + def createStatusObject(self): + """ + Creates a status object which can be queried for + status of running the pipeline + + returns True if object created + returns False if object cannot be created + """ + if self.config_filepath is None: + return False + + self.status = GARunStatus(self.config_filepath) + return True + + + +#################################### +# inotify event processor + +s_firecrest_finished = re.compile('Firecrest[0-9\._\-A-Za-z]+/finished.txt') +s_bustard_finished = re.compile('Bustard[0-9\._\-A-Za-z]+/finished.txt') +s_gerald_finished = re.compile('GERALD[0-9\._\-A-Za-z]+/finished.txt') + +s_gerald_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/Bustard[0-9\._\-A-Za-z]+/GERALD[0-9\._\-A-Za-z]+/') +s_bustard_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/Bustard[0-9\._\-A-Za-z]+/') +s_firecrest_all = re.compile('Firecrest[0-9\._\-A-Za-z]+/') + +class RunEvent(ProcessEvent): + + def __init__(self, conf_info): + + self.run_status_dict = {'firecrest': False, + 'bustard': False, + 'gerald': False} + + self._ci = conf_info + + ProcessEvent.__init__(self) + + + def process_IN_CREATE(self, event): + fullpath = os.path.join(event.path, event.name) + if s_finished.search(fullpath): + logging.info("File Found: %s" % (fullpath)) + + if s_firecrest_finished.search(fullpath): + self.run_status_dict['firecrest'] = True + self._ci.status.updateFirecrest(event.name) + elif s_bustard_finished.search(fullpath): + self.run_status_dict['bustard'] = True + self._ci.status.updateBustard(event.name) + elif s_gerald_finished.search(fullpath): + self.run_status_dict['gerald'] = True + self._ci.status.updateGerald(event.name) + + #WARNING: The following order is important!! + # Firecrest regex will catch all gerald, bustard, and firecrest + # Bustard regex will catch all gerald and bustard + # Gerald regex will catch all gerald + # So, order needs to be Gerald, Bustard, Firecrest, or this + # won't work properly. + elif s_gerald_all.search(fullpath): + self._ci.status.updateGerald(event.name) + elif s_bustard_all.search(fullpath): + self._ci.status.updateBustard(event.name) + elif s_firecrest_all.search(fullpath): + self._ci.status.updateFirecrest(event.name) + + #print "Create: %s" % (os.path.join(event.path, event.name)) + + def process_IN_DELETE(self, event): + #print "Remove %s" % (os.path.join(event.path, event.name)) + pass + + + + +#FLAGS +# Config Step Error +RUN_ABORT = 'abort' +# Run Step Error +RUN_FAILED = 'failed' + + +##################################### +# Configure Step (goat_pipeline.py) +#Info +s_start = re.compile('Starting Genome Analyzer Pipeline') +s_gerald = re.compile("[\S\s]+--GERALD[\S\s]+--make[\S\s]+") +s_generating = re.compile('^Generating journals, Makefiles') +s_seq_folder = re.compile('^Sequence folder: ') +s_seq_folder_sub = re.compile('want to make ') +s_stderr_taskcomplete = re.compile('^Task complete, exiting') + +#Errors +s_invalid_cmdline = re.compile('Usage:[\S\s]*goat_pipeline.py') +s_species_dir_err = re.compile('Error: Lane [1-8]:') +s_goat_traceb = re.compile("^Traceback \(most recent call last\):") +s_missing_cycles = re.compile('^Error: Tile s_[1-8]_[0-9]+: Different number of cycles: [0-9]+ instead of [0-9]+') + +SUPPRESS_MISSING_CYCLES = False + + +##Ignore - Example of out above each ignore regex. +#NOTE: Commenting out an ignore will cause it to be +# logged as DEBUG with the logging module. 
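The ignore lists defined next are plain sequences of compiled regexes consulted before logging: a line matching any pattern is dropped, everything else gets logged at a real level. A self-contained sketch of the pattern (patterns and sample lines are hypothetical):

    import logging
    import re

    IGNORE_LIST = [re.compile('^Info: PF'),
                   re.compile('^Found [0-9]+ clusters')]

    def ignorable(line):
        # True when any ignore pattern matches the line.
        for pattern in IGNORE_LIST:
            if pattern.search(line):
                return True
        return False

    for line in ['Info: PF 11802', 'make: *** [all] Error 2']:
        if not ignorable(line):
            logging.warning(line)  # only the make error is logged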
+#CF_STDERR_IGNORE_LIST = [] +s_skip = re.compile('s_[0-8]_[0-9]+') + + +########################################## +# Pipeline Run Step (make -j8 recursive) + +##Info +s_finished = re.compile('finished') + +##Errors +s_make_error = re.compile('^make[\S\s]+Error') +s_no_gnuplot = re.compile('gnuplot: command not found') +s_no_convert = re.compile('^Can\'t exec "convert"') +s_no_ghostscript = re.compile('gs: command not found') + +##Ignore - Example of out above each ignore regex. +#NOTE: Commenting out an ignore will cause it to be +# logged as DEBUG with the logging module. +# +PL_STDERR_IGNORE_LIST = [] +# Info: PF 11802 +PL_STDERR_IGNORE_LIST.append( re.compile('^Info: PF') ) +# About to analyse intensity file s_4_0101_sig2.txt +PL_STDERR_IGNORE_LIST.append( re.compile('^About to analyse intensity file') ) +# Will send output to standard output +PL_STDERR_IGNORE_LIST.append( re.compile('^Will send output to standard output') ) +# Found 31877 clusters +PL_STDERR_IGNORE_LIST.append( re.compile('^Found [0-9]+ clusters') ) +# Will use quality criterion ((CHASTITY>=0.6) +PL_STDERR_IGNORE_LIST.append( re.compile('^Will use quality criterion') ) +# Quality criterion translated to (($F[5]>=0.6)) +PL_STDERR_IGNORE_LIST.append( re.compile('^Quality criterion translated to') ) +# opened /woldlab/trog/data1/king/070924_USI-EAS44_0022_FC12150/Data/C1-36_Firecrest1.9.1_14-11-2007_king.4/Bustard1.9.1_14-11-2007_king/s_4_0101_qhg.txt +# AND +# opened s_4_0103_qhg.txt +PL_STDERR_IGNORE_LIST.append( re.compile('^opened[\S\s]+qhg.txt') ) +# 81129 sequences out of 157651 passed filter criteria +PL_STDERR_IGNORE_LIST.append( re.compile('^[0-9]+ sequences out of [0-9]+ passed filter criteria') ) + + +def pl_stderr_ignore(line): + """ + Searches lines for lines to ignore (i.e. not to log) + + returns True if line should be ignored + returns False if line should NOT be ignored + """ + for s in PL_STDERR_IGNORE_LIST: + if s.search(line): + return True + return False + + +def config_stdout_handler(line, conf_info): + """ + Processes each line of output from GOAT + and stores useful information using the logging module + + Loads useful information into conf_info as well, for future + use outside the function. + + returns True if found condition that signifies success. 
+ """ + + # Skip irrelevant line (without logging) + if s_skip.search(line): + pass + + # Detect invalid command-line arguments + elif s_invalid_cmdline.search(line): + logging.error("Invalid commandline options!") + + # Detect starting of configuration + elif s_start.search(line): + logging.info('START: Configuring pipeline') + + # Detect it made it past invalid arguments + elif s_gerald.search(line): + logging.info('Running make now') + + # Detect that make files have been generated (based on output) + elif s_generating.search(line): + logging.info('Make files generted') + return True + + # Capture run directory + elif s_seq_folder.search(line): + mo = s_seq_folder_sub.search(line) + #Output changed when using --tiles= + # at least in pipeline v0.3.0b2 + if mo: + firecrest_bustard_gerald_makefile = line[mo.end():] + firecrest_bustard_gerald, junk = \ + os.path.split(firecrest_bustard_gerald_makefile) + firecrest_bustard, junk = os.path.split(firecrest_bustard_gerald) + firecrest, junk = os.path.split(firecrest_bustard) + + conf_info.bustard_path = firecrest_bustard + conf_info.run_path = firecrest + + #Standard output handling + else: + print 'Sequence line:', line + mo = s_seq_folder.search(line) + conf_info.bustard_path = line[mo.end():] + conf_info.run_path, temp = os.path.split(conf_info.bustard_path) + + # Log all other output for debugging purposes + else: + logging.warning('CONF:?: %s' % (line)) + + return False + + + +def config_stderr_handler(line, conf_info): + """ + Processes each line of output from GOAT + and stores useful information using the logging module + + Loads useful information into conf_info as well, for future + use outside the function. + + returns RUN_ABORT upon detecting failure; + True on success message; + False if neutral message + (i.e. doesn't signify failure or success) + """ + global SUPPRESS_MISSING_CYCLES + + # Detect invalid species directory error + if s_species_dir_err.search(line): + logging.error(line) + return RUN_ABORT + # Detect goat_pipeline.py traceback + elif s_goat_traceb.search(line): + logging.error("Goat config script died, traceback in debug output") + return RUN_ABORT + # Detect indication of successful configuration (from stderr; odd, but ok) + elif s_stderr_taskcomplete.search(line): + logging.info('Configure step successful (from: stderr)') + return True + # Detect missing cycles + elif s_missing_cycles.search(line): + + # Only display error once + if not SUPPRESS_MISSING_CYCLES: + logging.error("Missing cycles detected; Not all cycles copied?") + logging.debug("CONF:STDERR:MISSING_CYCLES: %s" % (line)) + SUPPRESS_MISSING_CYCLES = True + return RUN_ABORT + + # Log all other output as debug output + else: + logging.debug('CONF:STDERR:?: %s' % (line)) + + # Neutral (not failure; nor success) + return False + + +#def pipeline_stdout_handler(line, conf_info): +# """ +# Processes each line of output from running the pipeline +# and stores useful information using the logging module +# +# Loads useful information into conf_info as well, for future +# use outside the function. +# +# returns True if found condition that signifies success. +# """ +# +# #f.write(line + '\n') +# +# return True + + + +def pipeline_stderr_handler(line, conf_info): + """ + Processes each line of stderr from pipelien run + and stores useful information using the logging module + + ##FIXME: Future feature (doesn't actually do this yet) + #Loads useful information into conf_info as well, for future + #use outside the function. 
+ + returns RUN_FAILED upon detecting failure; + #True on success message; (no clear success state) + False if neutral message + (i.e. doesn't signify failure or success) + """ + + if pl_stderr_ignore(line): + pass + elif s_make_error.search(line): + logging.error("make error detected; run failed") + return RUN_FAILED + elif s_no_gnuplot.search(line): + logging.error("gnuplot not found") + return RUN_FAILED + elif s_no_convert.search(line): + logging.error("imagemagick's convert command not found") + return RUN_FAILED + elif s_no_ghostscript.search(line): + logging.error("ghostscript not found") + return RUN_FAILED + else: + logging.debug('PIPE:STDERR:?: %s' % (line)) + + return False + + +def retrieve_config(conf_info, flowcell, cfg_filepath, genome_dir): + """ + Gets the config file from server... + requires config file in: + /etc/ga_frontend/ga_frontend.conf + or + ~/.ga_frontend.conf + + with: + [config_file_server] + base_host_url: http://host:port + + return True if successful, False is failure + """ + options = getCombinedOptions() + + if options.url is None: + logging.error("~/.ga_frontend.conf or /etc/ga_frontend/ga_frontend.conf" \ + " missing base_host_url option") + return False + + try: + saveConfigFile(flowcell, options.url, cfg_filepath) + conf_info.config_filepath = cfg_filepath + except FlowCellNotFound, e: + logging.error(e) + return False + except WebError404, e: + logging.error(e) + return False + except IOError, e: + logging.error(e) + return False + except Exception, e: + logging.error(e) + return False + + f = open(cfg_filepath, 'r') + data = f.read() + f.close() + + genome_dict = getAvailableGenomes(genome_dir) + mapper_dict = constructMapperDict(genome_dict) + + logging.debug(data) + + f = open(cfg_filepath, 'w') + f.write(data % (mapper_dict)) + f.close() + + return True + + + +def configure(conf_info): + """ + Attempts to configure the GA pipeline using goat. + + Uses logging module to store information about status. + + returns True if configuration successful, otherwise False. + """ + #ERROR Test: + #pipe = subprocess.Popen(['goat_pipeline.py', + # '--GERALD=config32bk.txt', + # '--make .',], + # #'.'], + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE) + + #ERROR Test (2), causes goat_pipeline.py traceback + #pipe = subprocess.Popen(['goat_pipeline.py', + # '--GERALD=%s' % (conf_info.config_filepath), + # '--tiles=s_4_100,s_4_101,s_4_102,s_4_103,s_4_104', + # '--make', + # '.'], + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE) + + ########################## + # Run configuration step + # Not a test; actual configure attempt. 
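For context on the retrieve_config() step above: the config fetched from the server leaves each genome path as a %(...)s placeholder, and `data % (mapper_dict)` fills them in. Python's %-formatting accepts any mapping there, which is why constructMapperDict only needs to emulate __getitem__. A sketch with a plain dict standing in for the mapper; the build name and full path are hypothetical:

    # A fetched config might contain a line such as:
    template = '12345678:ELAND_GENOME %(Homo sapiens|hg18)s'
    # constructMapperDict maps 'species|build' keys to genome directories:
    mapper = {'Homo sapiens|hg18': '/data-store01/compbio/genomes/hg18'}
    print template % mapper
    # -> 12345678:ELAND_GENOME /data-store01/compbio/genomes/hg18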
+ #pipe = subprocess.Popen(['goat_pipeline.py', + # '--GERALD=%s' % (conf_info.config_filepath), + # '--make', + # '.'], + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE) + + + stdout_filepath = os.path.join(conf_info.analysis_dir, + "pipeline_configure_stdout.txt") + stderr_filepath = os.path.join(conf_info.analysis_dir, + "pipeline_configure_stderr.txt") + + fout = open(stdout_filepath, 'w') + ferr = open(stderr_filepath, 'w') + + pipe = subprocess.Popen(['goat_pipeline.py', + '--GERALD=%s' % (conf_info.config_filepath), + #'--tiles=s_4_0100,s_4_0101,s_4_0102,s_4_0103,s_4_0104', + '--make', + conf_info.analysis_dir], + stdout=fout, + stderr=ferr) + + print "Configuring pipeline: %s" % (time.ctime()) + error_code = pipe.wait() + + # Clean up + fout.close() + ferr.close() + + + ################## + # Process stdout + fout = open(stdout_filepath, 'r') + + stdout_line = fout.readline() + + complete = False + while stdout_line != '': + # Handle stdout + if config_stdout_handler(stdout_line, conf_info): + complete = True + stdout_line = fout.readline() + + fout.close() + + + #error_code = pipe.wait() + if error_code: + logging.error('Recieved error_code: %s' % (error_code)) + else: + logging.info('We are go for launch!') + + #Process stderr + ferr = open(stderr_filepath, 'r') + stderr_line = ferr.readline() + + abort = 'NO!' + stderr_success = False + while stderr_line != '': + stderr_status = config_stderr_handler(stderr_line, conf_info) + if stderr_status == RUN_ABORT: + abort = RUN_ABORT + elif stderr_status is True: + stderr_success = True + stderr_line = ferr.readline() + + ferr.close() + + + #Success requirements: + # 1) The stdout completed without error + # 2) The program exited with status 0 + # 3) No errors found in stdout + print '#Expect: True, False, True, True' + print complete, bool(error_code), abort != RUN_ABORT, stderr_success is True + status = complete is True and \ + bool(error_code) is False and \ + abort != RUN_ABORT and \ + stderr_success is True + + # If everything was successful, but for some reason + # we didn't retrieve the path info, log it. + if status is True: + if conf_info.bustard_path is None or conf_info.run_path is None: + logging.error("Failed to retrieve run_path") + return False + + return status + + +def run_pipeline(conf_info): + """ + Run the pipeline and monitor status. + """ + # Fail if the run_path doesn't actually exist + if not os.path.exists(conf_info.run_path): + logging.error('Run path does not exist: %s' \ + % (conf_info.run_path)) + return False + + # Change cwd to run_path + stdout_filepath = os.path.join(conf_info.analysis_dir, 'pipeline_run_stdout.txt') + stderr_filepath = os.path.join(conf_info.analysis_dir, 'pipeline_run_stderr.txt') + + # Create status object + conf_info.createStatusObject() + + # Monitor file creation + wm = WatchManager() + mask = EventsCodes.IN_DELETE | EventsCodes.IN_CREATE + event = RunEvent(conf_info) + notifier = ThreadedNotifier(wm, event) + notifier.start() + wdd = wm.add_watch(conf_info.run_path, mask, rec=True) + + # Log pipeline starting + logging.info('STARTING PIPELINE @ %s' % (time.ctime())) + + # Start the pipeline (and hide!) 
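run_pipeline() above relies on pyinotify to spot the finished.txt markers while make runs. A minimal sketch of that watcher pattern, using the same 0.7-era API this module imports (the watched path is hypothetical):

    import os
    from pyinotify import WatchManager, ThreadedNotifier, EventsCodes, ProcessEvent

    class CreateSpotter(ProcessEvent):
        def process_IN_CREATE(self, event):
            # Called from the notifier thread for each new file or directory.
            print 'created:', os.path.join(event.path, event.name)

    wm = WatchManager()
    notifier = ThreadedNotifier(wm, CreateSpotter())
    notifier.start()
    wdd = wm.add_watch('/tmp/watch-me', EventsCodes.IN_CREATE, rec=True)
    # ... run the long job ...
    notifier.stop()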
+ #pipe = subprocess.Popen(['make', + # '-j8', + # 'recursive'], + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE) + + fout = open(stdout_filepath, 'w') + ferr = open(stderr_filepath, 'w') + + pipe = subprocess.Popen(['make', + '--directory=%s' % (conf_info.run_path), + '-j8', + 'recursive'], + stdout=fout, + stderr=ferr) + #shell=True) + # Wait for run to finish + retcode = pipe.wait() + + + # Clean up + notifier.stop() + fout.close() + ferr.close() + + # Process stderr + ferr = open(stderr_filepath, 'r') + + run_failed_stderr = False + for line in ferr: + err_status = pipeline_stderr_handler(line, conf_info) + if err_status == RUN_FAILED: + run_failed_stderr = True + + ferr.close() + + # Finished file check! + print 'RUN SUCCESS CHECK:' + for key, value in event.run_status_dict.items(): + print ' %s: %s' % (key, value) + + dstatus = event.run_status_dict + + # Success or failure check + status = (retcode == 0) and \ + run_failed_stderr is False and \ + dstatus['firecrest'] is True and \ + dstatus['bustard'] is True and \ + dstatus['gerald'] is True + + return status + + diff --git a/htsworkflow/pipeline/firecrest.py b/htsworkflow/pipeline/firecrest.py new file mode 100644 index 0000000..89ea598 --- /dev/null +++ b/htsworkflow/pipeline/firecrest.py @@ -0,0 +1,127 @@ +""" +Extract information about the Firecrest run + +Firecrest - class holding the properties we found +firecrest - Firecrest factory function initalized from a directory name +fromxml - Firecrest factory function initalized from an xml dump from + the Firecrest object. +""" + +from datetime import date +import os +import re +import time + +from htsworkflow.pipeline.runfolder import \ + ElementTree, \ + VERSION_RE, \ + EUROPEAN_STRPTIME + +class Firecrest(object): + XML_VERSION=1 + + # xml tag names + FIRECREST = 'Firecrest' + SOFTWARE_VERSION = 'version' + START = 'FirstCycle' + STOP = 'LastCycle' + DATE = 'run_time' + USER = 'user' + MATRIX = 'matrix' + + def __init__(self, xml=None): + self.start = None + self.stop = None + self.version = None + self.date = date.today() + self.user = None + self.matrix = None + + if xml is not None: + self.set_elements(xml) + + def _get_time(self): + return time.mktime(self.date.timetuple()) + time = property(_get_time, doc='return run time as seconds since epoch') + + def dump(self): + print "Starting cycle:", self.start + print "Ending cycle:", self.stop + print "Firecrest version:", self.version + print "Run date:", self.date + print "user:", self.user + + def get_elements(self): + attribs = {'version': str(Firecrest.XML_VERSION) } + root = ElementTree.Element(Firecrest.FIRECREST, attrib=attribs) + version = ElementTree.SubElement(root, Firecrest.SOFTWARE_VERSION) + version.text = self.version + start_cycle = ElementTree.SubElement(root, Firecrest.START) + start_cycle.text = str(self.start) + stop_cycle = ElementTree.SubElement(root, Firecrest.STOP) + stop_cycle.text = str(self.stop) + run_date = ElementTree.SubElement(root, Firecrest.DATE) + run_date.text = str(self.time) + user = ElementTree.SubElement(root, Firecrest.USER) + user.text = self.user + matrix = ElementTree.SubElement(root, Firecrest.MATRIX) + matrix.text = self.matrix + return root + + def set_elements(self, tree): + if tree.tag != Firecrest.FIRECREST: + raise ValueError('Expected "Firecrest" SubElements') + xml_version = int(tree.attrib.get('version', 0)) + if xml_version > Firecrest.XML_VERSION: + logging.warn('Firecrest XML tree is a higher version than this class') + for element in list(tree): + if element.tag 
== Firecrest.SOFTWARE_VERSION: + self.version = element.text + elif element.tag == Firecrest.START: + self.start = int(element.text) + elif element.tag == Firecrest.STOP: + self.stop = int(element.text) + elif element.tag == Firecrest.DATE: + self.date = date.fromtimestamp(float(element.text)) + elif element.tag == Firecrest.USER: + self.user = element.text + elif element.tag == Firecrest.MATRIX: + self.matrix = element.text + else: + raise ValueError("Unrecognized tag: %s" % (element.tag,)) + +def firecrest(pathname): + """ + Examine the directory at pathname and initalize a Firecrest object + """ + f = Firecrest() + + # parse firecrest directory name + path, name = os.path.split(pathname) + groups = name.split('_') + # grab the start/stop cycle information + cycle = re.match("C([0-9]+)-([0-9]+)", groups[0]) + f.start = int(cycle.group(1)) + f.stop = int(cycle.group(2)) + # firecrest version + version = re.search(VERSION_RE, groups[1]) + f.version = (version.group(1)) + # datetime + t = time.strptime(groups[2], EUROPEAN_STRPTIME) + f.date = date(*t[0:3]) + # username + f.user = groups[3] + + # should I parse this deeper than just stashing the + # contents of the matrix file? + matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt') + f.matrix = open(matrix_pathname, 'r').read() + return f + +def fromxml(tree): + """ + Initialize a Firecrest object from an element tree node + """ + f = Firecrest() + f.set_elements(tree) + return f diff --git a/htsworkflow/pipeline/genome_mapper.py b/htsworkflow/pipeline/genome_mapper.py new file mode 100644 index 0000000..a8ea871 --- /dev/null +++ b/htsworkflow/pipeline/genome_mapper.py @@ -0,0 +1,137 @@ +#!/usr/bin/python +import glob +import sys +import os +import re + +import logging + +from htsworkflow.util.alphanum import alphanum + +class DuplicateGenome(Exception): pass + + +def _has_metainfo(genome_dir): + metapath = os.path.join(genome_dir, '_metainfo_') + if os.path.isfile(metapath): + return True + else: + return False + +def getAvailableGenomes(genome_base_dir): + """ + raises IOError (on genome_base_dir not found) + raises DuplicateGenome on duplicate genomes found. + + returns a double dictionary (i.e. d[species][build] = path) + """ + + # Need valid directory + if not os.path.exists(genome_base_dir): + msg = "Directory does not exist: %s" % (genome_base_dir) + raise IOError, msg + + # Find all subdirectories + filepath_list = glob.glob(os.path.join(genome_base_dir, '*')) + potential_genome_dirs = \ + [ filepath for filepath in filepath_list if os.path.isdir(filepath)] + + # Get list of metadata files + genome_dir_list = \ + [ dirpath \ + for dirpath in potential_genome_dirs \ + if _has_metainfo(dirpath) ] + + # Genome double dictionary + d = {} + + for genome_dir in genome_dir_list: + line = open(os.path.join(genome_dir, '_metainfo_'), 'r').readline().strip() + + # Get species, build... log and skip on failure + try: + species, build = line.split('|') + except: + logging.warning('Skipping: Invalid metafile (%s) line: %s' \ + % (metafile, line)) + continue + + build_dict = d.setdefault(species, {}) + if build in build_dict: + msg = "Duplicate genome for %s|%s" % (species, build) + raise DuplicateGenome, msg + + build_dict[build] = genome_dir + + return d + + +class constructMapperDict(object): + """ + Emulate a dictionary to map genome|build names to paths. + + It uses the dictionary generated by getAvailableGenomes. 
+ """ + def __init__(self, genome_dict): + self.genome_dict = genome_dict + + def __getitem__(self, key): + """ + Return the best match for key + """ + elements = re.split("\|", key) + + if len(elements) == 1: + # we just the species name + # get the set of builds + builds = self.genome_dict[elements[0]] + + # sort build names the way humans would + keys = builds.keys() + keys.sort(cmp=alphanum) + + # return the path from the 'last' build name + return builds[keys[-1]] + + elif len(elements) == 2: + # we have species, and build name + return self.genome_dict[elements[0]][elements[1]] + else: + raise KeyError("Unrecognized key") + + def keys(self): + keys = [] + for species in self.genome_dict.keys(): + for build in self.genome_dict[species]: + keys.append([species+'|'+build]) + return keys + + def values(self): + values = [] + for species in self.genome_dict.keys(): + for build in self.genome_dict[species]: + values.append(self.genome_dict[species][build]) + return values + + def items(self): + items = [] + for species in self.genome_dict.keys(): + for build in self.genome_dict[species]: + key = [species+'|'+build] + value = self.genome_dict[species][build] + items.append((key, value)) + return items + +if __name__ == '__main__': + + if len(sys.argv) != 2: + print 'useage: %s ' % (sys.argv[0]) + sys.exit(1) + + d = getAvailableGenomes(sys.argv[1]) + d2 = constructMapperDict(d) + + for k,v in d2.items(): + print '%s: %s' % (k,v) + + diff --git a/htsworkflow/pipeline/gerald.py b/htsworkflow/pipeline/gerald.py new file mode 100644 index 0000000..07ca009 --- /dev/null +++ b/htsworkflow/pipeline/gerald.py @@ -0,0 +1,719 @@ +""" +Provide access to information stored in the GERALD directory. +""" +from datetime import datetime, date +from glob import glob +import logging +import os +import stat +import time +import types + +from htsworkflow.pipeline.runfolder import \ + ElementTree, \ + EUROPEAN_STRPTIME, \ + LANES_PER_FLOWCELL, \ + VERSION_RE +from htsworkflow.util.ethelp import indent, flatten +from htsworkflow.util.opener import autoopen + +class Gerald(object): + """ + Capture meaning out of the GERALD directory + """ + XML_VERSION = 1 + GERALD='Gerald' + RUN_PARAMETERS='RunParameters' + SUMMARY='Summary' + + class LaneParameters(object): + """ + Make it easy to access elements of LaneSpecificRunParameters from python + """ + def __init__(self, gerald, key): + self._gerald = gerald + self._key = key + + def __get_attribute(self, xml_tag): + subtree = self._gerald.tree.find('LaneSpecificRunParameters') + container = subtree.find(xml_tag) + if container is None: + return None + if len(container.getchildren()) > LANES_PER_FLOWCELL: + raise RuntimeError('GERALD config.xml file changed') + lanes = [x.tag.split('_')[1] for x in container.getchildren()] + index = lanes.index(self._key) + element = container[index] + return element.text + def _get_analysis(self): + return self.__get_attribute('ANALYSIS') + analysis = property(_get_analysis) + + def _get_eland_genome(self): + genome = self.__get_attribute('ELAND_GENOME') + # default to the chipwide parameters if there isn't an + # entry in the lane specific paramaters + if genome is None: + subtree = self._gerald.tree.find('ChipWideRunParameters') + container = subtree.find('ELAND_GENOME') + genome = container.text + return genome + eland_genome = property(_get_eland_genome) + + def _get_read_length(self): + return self.__get_attribute('READ_LENGTH') + read_length = property(_get_read_length) + + def _get_use_bases(self): + return 
self.__get_attribute('USE_BASES') + use_bases = property(_get_use_bases) + + class LaneSpecificRunParameters(object): + """ + Provide access to LaneSpecificRunParameters + """ + def __init__(self, gerald): + self._gerald = gerald + self._keys = None + def __getitem__(self, key): + return Gerald.LaneParameters(self._gerald, key) + def keys(self): + if self._keys is None: + tree = self._gerald.tree + analysis = tree.find('LaneSpecificRunParameters/ANALYSIS') + # according to the pipeline specs I think their fields + # are sampleName_laneID, with sampleName defaulting to s + # since laneIDs are constant lets just try using + # those consistently. + self._keys = [ x.tag.split('_')[1] for x in analysis] + return self._keys + def values(self): + return [ self[x] for x in self.keys() ] + def items(self): + return zip(self.keys(), self.values()) + def __len__(self): + return len(self.keys()) + + def __init__(self, xml=None): + self.pathname = None + self.tree = None + + # parse lane parameters out of the config.xml file + self.lanes = Gerald.LaneSpecificRunParameters(self) + + self.summary = None + self.eland_results = None + + if xml is not None: + self.set_elements(xml) + + def _get_date(self): + if self.tree is None: + return datetime.today() + timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP') + epochstamp = time.mktime(time.strptime(timestamp, '%c')) + return datetime.fromtimestamp(epochstamp) + date = property(_get_date) + + def _get_time(self): + return time.mktime(self.date.timetuple()) + time = property(_get_time, doc='return run time as seconds since epoch') + + def _get_version(self): + if self.tree is None: + return None + return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION') + version = property(_get_version) + + def dump(self): + """ + Debugging function, report current object + """ + print 'Gerald version:', self.version + print 'Gerald run date:', self.date + print 'Gerald config.xml:', self.tree + self.summary.dump() + + def get_elements(self): + if self.tree is None or self.summary is None: + return None + + gerald = ElementTree.Element(Gerald.GERALD, + {'version': unicode(Gerald.XML_VERSION)}) + gerald.append(self.tree) + gerald.append(self.summary.get_elements()) + if self.eland_results: + gerald.append(self.eland_results.get_elements()) + return gerald + + def set_elements(self, tree): + if tree.tag != Gerald.GERALD: + raise ValueError('exptected GERALD') + xml_version = int(tree.attrib.get('version', 0)) + if xml_version > Gerald.XML_VERSION: + logging.warn('XML tree is a higher version than this class') + for element in list(tree): + tag = element.tag.lower() + if tag == Gerald.RUN_PARAMETERS.lower(): + self.tree = element + elif tag == Gerald.SUMMARY.lower(): + self.summary = Summary(xml=element) + elif tag == ELAND.ELAND.lower(): + self.eland_results = ELAND(xml=element) + else: + logging.warn("Unrecognized tag %s" % (element.tag,)) + + +def gerald(pathname): + g = Gerald() + g.pathname = pathname + path, name = os.path.split(pathname) + config_pathname = os.path.join(pathname, 'config.xml') + g.tree = ElementTree.parse(config_pathname).getroot() + + # parse Summary.htm file + summary_pathname = os.path.join(pathname, 'Summary.htm') + g.summary = Summary(summary_pathname) + # parse eland files + g.eland_results = eland(g.pathname, g) + return g + +def tonumber(v): + """ + Convert a value to int if its an int otherwise a float. 
+ """ + try: + v = int(v) + except ValueError, e: + v = float(v) + return v + +def parse_mean_range(value): + """ + Parse values like 123 +/- 4.5 + """ + if value.strip() == 'unknown': + return 0, 0 + + average, pm, deviation = value.split() + if pm != '+/-': + raise RuntimeError("Summary.htm file format changed") + return tonumber(average), tonumber(deviation) + +def make_mean_range_element(parent, name, mean, deviation): + """ + Make an ElementTree subelement + """ + element = ElementTree.SubElement(parent, name, + { 'mean': unicode(mean), + 'deviation': unicode(deviation)}) + return element + +def parse_mean_range_element(element): + """ + Grab mean/deviation out of element + """ + return (tonumber(element.attrib['mean']), + tonumber(element.attrib['deviation'])) + +def parse_summary_element(element): + """ + Determine if we have a simple element or a mean/deviation element + """ + if len(element.attrib) > 0: + return parse_mean_range_element(element) + else: + return element.text + +class Summary(object): + """ + Extract some useful information from the Summary.htm file + """ + XML_VERSION = 2 + SUMMARY = 'Summary' + + class LaneResultSummary(object): + """ + Parse the LaneResultSummary table out of Summary.htm + Mostly for the cluster number + """ + LANE_RESULT_SUMMARY = 'LaneResultSummary' + TAGS = { + 'LaneYield': 'lane_yield', + 'Cluster': 'cluster', # Raw + 'ClusterPF': 'cluster_pass_filter', + 'AverageFirstCycleIntensity': 'average_first_cycle_intensity', + 'PercentIntensityAfter20Cycles': 'percent_intensity_after_20_cycles', + 'PercentPassFilterClusters': 'percent_pass_filter_clusters', + 'PercentPassFilterAlign': 'percent_pass_filter_align', + 'AverageAlignmentScore': 'average_alignment_score', + 'PercentErrorRate': 'percent_error_rate' + } + + def __init__(self, html=None, xml=None): + self.lane = None + self.lane_yield = None + self.cluster = None + self.cluster_pass_filter = None + self.average_first_cycle_intensity = None + self.percent_intensity_after_20_cycles = None + self.percent_pass_filter_clusters = None + self.percent_pass_filter_align = None + self.average_alignment_score = None + self.percent_error_rate = None + + if html is not None: + self.set_elements_from_html(html) + if xml is not None: + self.set_elements(xml) + + def set_elements_from_html(self, data): + if not len(data) in (8,10): + raise RuntimeError("Summary.htm file format changed") + + # same in pre-0.3.0 Summary file and 0.3 summary file + self.lane = data[0] + + if len(data) == 8: + parsed_data = [ parse_mean_range(x) for x in data[1:] ] + # this is the < 0.3 Pipeline version + self.cluster = parsed_data[0] + self.average_first_cycle_intensity = parsed_data[1] + self.percent_intensity_after_20_cycles = parsed_data[2] + self.percent_pass_filter_clusters = parsed_data[3] + self.percent_pass_filter_align = parsed_data[4] + self.average_alignment_score = parsed_data[5] + self.percent_error_rate = parsed_data[6] + elif len(data) == 10: + parsed_data = [ parse_mean_range(x) for x in data[2:] ] + # this is the >= 0.3 summary file + self.lane_yield = data[1] + self.cluster = parsed_data[0] + self.cluster_pass_filter = parsed_data[1] + self.average_first_cycle_intensity = parsed_data[2] + self.percent_intensity_after_20_cycles = parsed_data[3] + self.percent_pass_filter_clusters = parsed_data[4] + self.percent_pass_filter_align = parsed_data[5] + self.average_alignment_score = parsed_data[6] + self.percent_error_rate = parsed_data[7] + + def get_elements(self): + lane_result = ElementTree.Element( + 
Summary.LaneResultSummary.LANE_RESULT_SUMMARY, + {'lane': self.lane}) + for tag, variable_name in Summary.LaneResultSummary.TAGS.items(): + value = getattr(self, variable_name) + if value is None: + continue + # it looks like a sequence + elif type(value) in (types.TupleType, types.ListType): + element = make_mean_range_element( + lane_result, + tag, + *value + ) + else: + element = ElementTree.SubElement(lane_result, tag) + element.text = value + return lane_result + + def set_elements(self, tree): + if tree.tag != Summary.LaneResultSummary.LANE_RESULT_SUMMARY: + raise ValueError('Expected %s' % ( + Summary.LaneResultSummary.LANE_RESULT_SUMMARY)) + self.lane = tree.attrib['lane'] + tags = Summary.LaneResultSummary.TAGS + for element in list(tree): + try: + variable_name = tags[element.tag] + setattr(self, variable_name, + parse_summary_element(element)) + except KeyError, e: + logging.warn('Unrecognized tag %s' % (element.tag,)) + + def __init__(self, filename=None, xml=None): + self.lane_results = {} + + if filename is not None: + self._extract_lane_results(filename) + if xml is not None: + self.set_elements(xml) + + def __getitem__(self, key): + return self.lane_results[key] + + def __len__(self): + return len(self.lane_results) + + def keys(self): + return self.lane_results.keys() + + def values(self): + return self.lane_results.values() + + def items(self): + return self.lane_results.items() + + def _flattened_row(self, row): + """ + flatten the children of a ... + """ + return [flatten(x) for x in row.getchildren() ] + + def _parse_table(self, table): + """ + assumes the first line is the header of a table, + and that the remaining rows are data + """ + rows = table.getchildren() + data = [] + for r in rows: + data.append(self._flattened_row(r)) + return data + + def _extract_named_tables(self, pathname): + """ + extract all the 'named' tables from a Summary.htm file + and return as a dictionary + + Named tables are
+ <h2>...</h2>
+ <table>...</table>
+        pairs.
+        The contents of the h2 tag is considered to be the name
+        of the table.
+        """
+        tree = ElementTree.parse(pathname).getroot()
+        body = tree.find('body')
+        tables = {}
+        for i in range(len(body)):
+            if body[i].tag == 'h2' and body[i+1].tag == 'table':
+                # we have an interesting table
+                name = flatten(body[i])
+                table = body[i+1]
+                data = self._parse_table(table)
+                tables[name] = data
+        return tables
+
+    def _extract_lane_results(self, pathname):
+        """
+        extract the Lane Results Summary table
+        """
+
+        tables = self._extract_named_tables(pathname)
+
+        # parse lane result summary
+        lane_summary = tables['Lane Results Summary']
+        # this is version 1 of the summary file
+        if len(lane_summary[-1]) == 8:
+            # strip header
+            headers = lane_summary[0]
+            # grab the lane by lane data
+            lane_summary = lane_summary[1:]
+
+        # this is version 2 of the summary file
+        if len(lane_summary[-1]) == 10:
+            # lane_summary[0] is a different less specific header row
+            headers = lane_summary[1]
+            lane_summary = lane_summary[2:10]
+            # after the last lane, there's a set of chip wide averages
+
+        for r in lane_summary:
+            lrs = Summary.LaneResultSummary(html=r)
+            self.lane_results[lrs.lane] = lrs
+
+    def get_elements(self):
+        summary = ElementTree.Element(Summary.SUMMARY,
+                                      {'version': unicode(Summary.XML_VERSION)})
+        for lane in self.lane_results.values():
+            summary.append(lane.get_elements())
+        return summary
+
+    def set_elements(self, tree):
+        if tree.tag != Summary.SUMMARY:
+            raise ValueError("Expected %s" % (Summary.SUMMARY,))
+        xml_version = int(tree.attrib.get('version', 0))
+        if xml_version > Summary.XML_VERSION:
+            logging.warn('Summary XML tree is a higher version than this class')
+        for element in list(tree):
+            lrs = Summary.LaneResultSummary()
+            lrs.set_elements(element)
+            self.lane_results[lrs.lane] = lrs
+
+    def dump(self):
+        """
+        Debugging function, report current object
+        """
+        pass
+
+
+def build_genome_fasta_map(genome_dir):
+    # build fasta to fasta file map
+    genome = genome_dir.split(os.path.sep)[-1]
+    fasta_map = {}
+    for vld_file in glob(os.path.join(genome_dir, '*.vld')):
+        is_link = False
+        if os.path.islink(vld_file):
+            is_link = True
+            vld_file = os.path.realpath(vld_file)
+        path, vld_name = os.path.split(vld_file)
+        name, ext = os.path.splitext(vld_name)
+        if is_link:
+            fasta_map[name] = name
+        else:
+            fasta_map[name] = os.path.join(genome, name)
+    return fasta_map
+
+class ElandLane(object):
+    """
+    Process an eland result file
+    """
+    XML_VERSION = 1
+    LANE = 'ElandLane'
+    SAMPLE_NAME = 'SampleName'
+    LANE_ID = 'LaneID'
+    GENOME_MAP = 'GenomeMap'
+    GENOME_ITEM = 'GenomeItem'
+    MAPPED_READS = 'MappedReads'
+    MAPPED_ITEM = 'MappedItem'
+    MATCH_CODES = 'MatchCodes'
+    MATCH_ITEM = 'Code'
+    READS = 'Reads'
+
+    def __init__(self, pathname=None, genome_map=None, xml=None):
+        self.pathname = pathname
+        self._sample_name = None
+        self._lane_id = None
+        self._reads = None
+        self._mapped_reads = None
+        self._match_codes = None
+        if genome_map is None:
+            genome_map = {}
+        self.genome_map = genome_map
+
+        if xml is not None:
+            self.set_elements(xml)
+
+    def _update(self):
+        """
+        Actually read the file and actually count the reads
+        """
+        # can't do anything if we don't have a file to process
+        if self.pathname is None:
+            return
+
+        if os.stat(self.pathname)[stat.ST_SIZE] == 0:
+            raise RuntimeError("Eland isn't done, try again later.")
+
+        reads = 0
+        mapped_reads = {}
+
+        match_codes = {'NM':0, 'QC':0, 'RM':0,
+                       'U0':0, 'U1':0, 'U2':0,
+                       'R0':0, 'R1':0, 'R2':0,
+                      }
+        for line in autoopen(self.pathname, 'r'):
+            reads += 1
+            fields = line.split()
+            # code = fields[2]
+            # match_codes[code] = match_codes.setdefault(code, 0) + 1
+            # the QC/NM etc codes are in the 3rd field and always present
+            match_codes[fields[2]] += 1
+            # ignore lines that don't have a fasta filename
+            if len(fields) < 7:
+                continue
+            fasta = self.genome_map.get(fields[6], fields[6])
+            mapped_reads[fasta] = mapped_reads.setdefault(fasta, 0) + 1
+        self._match_codes = match_codes
+        self._mapped_reads = mapped_reads
+        self._reads = reads
+
+    def _update_name(self):
+        # extract the sample name
+        if self.pathname is None:
+            return
+
+        path, name = os.path.split(self.pathname)
+        split_name = name.split('_')
+        self._sample_name = split_name[0]
+        self._lane_id = split_name[1]
+
+    def _get_sample_name(self):
+        if self._sample_name is None:
+            self._update_name()
+        return self._sample_name
+    sample_name = property(_get_sample_name)
+
+    def _get_lane_id(self):
+        if self._lane_id is None:
+            self._update_name()
+        return self._lane_id
+    lane_id = property(_get_lane_id)
+
+    def _get_reads(self):
+        if self._reads is None:
+            self._update()
+        return self._reads
+    reads = property(_get_reads)
+
+    def _get_mapped_reads(self):
+        if self._mapped_reads is None:
+            self._update()
+        return self._mapped_reads
+    mapped_reads = property(_get_mapped_reads)
+
+    def _get_match_codes(self):
+        if self._match_codes is None:
+            self._update()
+        return self._match_codes
+    match_codes = property(_get_match_codes)
+
+    def get_elements(self):
+        lane = ElementTree.Element(ElandLane.LANE,
+                                   {'version':
+                                    unicode(ElandLane.XML_VERSION)})
+        sample_tag = ElementTree.SubElement(lane, ElandLane.SAMPLE_NAME)
+        sample_tag.text = self.sample_name
+        lane_tag = ElementTree.SubElement(lane, ElandLane.LANE_ID)
+        lane_tag.text = self.lane_id
+        genome_map = ElementTree.SubElement(lane, ElandLane.GENOME_MAP)
+        for k, v in self.genome_map.items():
+            item = ElementTree.SubElement(
+                genome_map, ElandLane.GENOME_ITEM,
+                {'name':k, 'value':unicode(v)})
+        mapped_reads = ElementTree.SubElement(lane, ElandLane.MAPPED_READS)
+        for k, v in self.mapped_reads.items():
+            item = ElementTree.SubElement(
+                mapped_reads, ElandLane.MAPPED_ITEM,
+                {'name':k, 'value':unicode(v)})
+        match_codes = ElementTree.SubElement(lane, ElandLane.MATCH_CODES)
+        for k, v in self.match_codes.items():
+            item = ElementTree.SubElement(
+                match_codes, ElandLane.MATCH_ITEM,
+                {'name':k, 'value':unicode(v)})
+        reads = ElementTree.SubElement(lane, ElandLane.READS)
+        reads.text = unicode(self.reads)
+
+        return lane
+
+    def set_elements(self, tree):
+        if tree.tag != ElandLane.LANE:
+            raise ValueError('Expecting %s' % (ElandLane.LANE,))
+
+        # reset dictionaries
+        self._mapped_reads = {}
+        self._match_codes = {}
+
+        for element in tree:
+            tag = element.tag.lower()
+            if tag == ElandLane.SAMPLE_NAME.lower():
+                self._sample_name = element.text
+            elif tag == ElandLane.LANE_ID.lower():
+                self._lane_id = element.text
+            elif tag == ElandLane.GENOME_MAP.lower():
+                for child in element:
+                    name = child.attrib['name']
+                    value = child.attrib['value']
+                    self.genome_map[name] = value
+            elif tag == ElandLane.MAPPED_READS.lower():
+                for child in element:
+                    name = child.attrib['name']
+                    value = child.attrib['value']
+                    self._mapped_reads[name] = int(value)
+            elif tag == ElandLane.MATCH_CODES.lower():
+                for child in element:
+                    name = child.attrib['name']
+                    value = int(child.attrib['value'])
+                    self._match_codes[name] = value
+            elif tag == ElandLane.READS.lower():
+                self._reads = int(element.text)
+            else:
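+                # tags outside the schema constants above are logged
+                # and skipped rather than treated as fatal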
+                logging.warn("ElandLane unrecognized tag %s" % (element.tag,))
+
+def extract_eland_sequence(instream, outstream, start, end):
+    """
+    Extract a chunk of sequence out of an eland file
+    """
+    for line in instream:
+        record = line.split()
+        if len(record) > 1:
+            result = [record[0], record[1][start:end]]
+        else:
+            result = [record[0][start:end]]
+        outstream.write("\t".join(result))
+        outstream.write(os.linesep)
+
+class ELAND(object):
+    """
+    Summarize information from eland files
+    """
+    XML_VERSION = 1
+
+    ELAND = 'ElandCollection'
+    LANE = 'Lane'
+    LANE_ID = 'id'
+
+    def __init__(self, xml=None):
+        # we need information from the gerald config.xml
+        self.results = {}
+
+        if xml is not None:
+            self.set_elements(xml)
+
+    def __len__(self):
+        return len(self.results)
+
+    def keys(self):
+        return self.results.keys()
+
+    def values(self):
+        return self.results.values()
+
+    def items(self):
+        return self.results.items()
+
+    def __getitem__(self, key):
+        return self.results[key]
+
+    def get_elements(self):
+        root = ElementTree.Element(ELAND.ELAND,
+                                   {'version': unicode(ELAND.XML_VERSION)})
+        for lane_id, lane in self.results.items():
+            eland_lane = lane.get_elements()
+            eland_lane.attrib[ELAND.LANE_ID] = unicode(lane_id)
+            root.append(eland_lane)
+        return root
+
+    def set_elements(self, tree):
+        if tree.tag.lower() != ELAND.ELAND.lower():
+            raise ValueError('Expecting %s' % (ELAND.ELAND,))
+        for element in list(tree):
+            lane_id = element.attrib[ELAND.LANE_ID]
+            lane = ElandLane(xml=element)
+            self.results[lane_id] = lane
+
+def eland(basedir, gerald=None, genome_maps=None):
+    e = ELAND()
+
+    file_list = glob(os.path.join(basedir, "*_eland_result.txt"))
+    if len(file_list) == 0:
+        # let's handle compressed eland files too
+        file_list = glob(os.path.join(basedir, "*_eland_result.txt.bz2"))
+
+    for pathname in file_list:
+        # yes the lane_id is also being computed in ElandLane._update
+        # I didn't want to clutter up my constructor
+        # but I needed to persist the sample_name/lane_id for
+        # runfolder summary_report
+        path, name = os.path.split(pathname)
+        split_name = name.split('_')
+        lane_id = split_name[1]
+
+        if genome_maps is not None:
+            genome_map = genome_maps[lane_id]
+        elif gerald is not None:
+            genome_dir = gerald.lanes[lane_id].eland_genome
+            genome_map = build_genome_fasta_map(genome_dir)
+        else:
+            genome_map = {}
+
+        eland_result = ElandLane(pathname, genome_map)
+        e.results[lane_id] = eland_result
+    return e
diff --git a/htsworkflow/pipeline/recipe_parser.py b/htsworkflow/pipeline/recipe_parser.py
new file mode 100644
index 0000000..7f5ced6
--- /dev/null
+++ b/htsworkflow/pipeline/recipe_parser.py
@@ -0,0 +1,48 @@
+from xml import sax
+
+
+def get_cycles(recipe_xml_filepath):
+    """
+    returns the number of cycles found in Recipe*.xml
+    """
+    handler = CycleXmlHandler()
+    sax.parse(recipe_xml_filepath, handler)
+    return handler.cycle_count
+
+
+
+class CycleXmlHandler(sax.ContentHandler):
+
+    def __init__(self):
+        self.cycle_count = 0
+        self.in_protocol = False
+        sax.ContentHandler.__init__(self)
+
+
+    def startDocument(self):
+        self.cycle_count = 0
+        self.in_protocol = False
+
+
+    def startElement(self, name, attrs):
+
+        #Only count Incorporations as cycles if within
+        # the protocol section of the xml document.
+        if name == "Incorporation" and self.in_protocol:
+            #print 'Found a cycle!'
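+            # a Recipe file is assumed to look roughly like this
+            # (tag names inferred from this handler, not from a spec):
+            #   <Protocol>
+            #     <Incorporation .../>   one element per chemistry cycle
+            #   </Protocol>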
+            self.cycle_count += 1
+            return
+
+        elif name == 'Protocol':
+            #print 'In protocol'
+            self.in_protocol = True
+            return
+
+        #print 'Skipping: %s' % (name)
+
+
+    def endElement(self, name):
+
+        if name == 'Protocol':
+            #print 'End protocol'
+            self.in_protocol = False
diff --git a/htsworkflow/pipeline/retrieve_config.py b/htsworkflow/pipeline/retrieve_config.py
new file mode 100644
index 0000000..72cff17
--- /dev/null
+++ b/htsworkflow/pipeline/retrieve_config.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python
+
+from optparse import OptionParser, IndentedHelpFormatter
+from ConfigParser import SafeConfigParser
+
+import logging
+import os
+import sys
+import urllib2
+
+CONFIG_SYSTEM = '/etc/ga_frontend/ga_frontend.conf'
+CONFIG_USER = os.path.expanduser('~/.ga_frontend.conf')
+
+#Disable or enable commandline arg parsing; disabled by default.
+DISABLE_CMDLINE = True
+
+class FlowCellNotFound(Exception): pass
+class WebError404(Exception): pass
+
+class DummyOptions:
+    """
+    Used when command line parsing is disabled; default
+    """
+    def __init__(self):
+        self.url = None
+        self.output_filepath = None
+        self.flowcell = None
+        self.genome_dir = None
+
+class PreformattedDescriptionFormatter(IndentedHelpFormatter):
+
+    #def format_description(self, description):
+    #
+    #    if description:
+    #        return description + "\n"
+    #    else:
+    #        return ""
+
+    def format_epilog(self, epilog):
+        """
+        It was removing my preformatted epilog, so this should override
+        that behavior! Muhahaha!
+        """
+        if epilog:
+            return "\n" + epilog + "\n"
+        else:
+            return ""
+
+
+def constructOptionParser():
+    """
+    returns a pre-setup optparser
+    """
+    global DISABLE_CMDLINE
+
+    if DISABLE_CMDLINE:
+        return None
+
+    parser = OptionParser(formatter=PreformattedDescriptionFormatter())
+
+    parser.set_description('Retrieves eland config file from ga_frontend web frontend.')
+
+    parser.epilog = """
+Config File:
+  * %s (System wide)
+  * %s (User specific; overrides system)
+  * command line overrides all config file options
+
+  Example Config File:
+
+    [config_file_server]
+    base_host_url=http://somewhere.domain:port
+""" % (CONFIG_SYSTEM, CONFIG_USER)
+
+    #Special formatter for allowing preformatted description.
+    ##parser.format_epilog(PreformattedDescriptionFormatter())
+
+    parser.add_option("-u", "--url",
+                      action="store", type="string", dest="url")
+
+    parser.add_option("-o", "--output",
+                      action="store", type="string", dest="output_filepath")
+
+    parser.add_option("-f", "--flowcell",
+                      action="store", type="string", dest="flowcell")
+
+    parser.add_option("-g", "--genome_dir",
+                      action="store", type="string", dest="genome_dir")
+
+    #parser.set_default("url", "default")
+
+    return parser
+
+def constructConfigParser():
+    """
+    returns a pre-setup config parser
+    """
+    parser = SafeConfigParser()
+    parser.read([CONFIG_SYSTEM, CONFIG_USER])
+    if not parser.has_section('config_file_server'):
+        parser.add_section('config_file_server')
+    if not parser.has_section('local_setup'):
+        parser.add_section('local_setup')
+
+    return parser
+
+
+def getCombinedOptions():
+    """
+    Returns optparse options after it has been updated with ConfigParser
+    config files and merged with parsed commandline options.
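+
+    Config files are read in order (CONFIG_SYSTEM, then CONFIG_USER),
+    so user settings override system-wide ones.  A sketch of a matching
+    user config; the section and option names come from this module,
+    the values are made up:
+
+        [config_file_server]
+        base_host_url=http://localhost:8000
+
+        [local_setup]
+        genome_dir=/tmp/genomes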
+ """ + cl_parser = constructOptionParser() + conf_parser = constructConfigParser() + + if cl_parser is None: + options = DummyOptions() + else: + options, args = cl_parser.parse_args() + + if options.url is None: + if conf_parser.has_option('config_file_server', 'base_host_url'): + options.url = conf_parser.get('config_file_server', 'base_host_url') + + if options.genome_dir is None: + if conf_parser.has_option('local_setup', 'genome_dir'): + options.genome_dir = conf_parser.get('local_setup', 'genome_dir') + + print 'USING OPTIONS:' + print ' URL:', options.url + print ' OUT:', options.output_filepath + print ' FC:', options.flowcell + print 'GDIR:', options.genome_dir + print '' + + return options + + +def saveConfigFile(flowcell, base_host_url, output_filepath): + """ + retrieves the flowcell eland config file, give the base_host_url + (i.e. http://sub.domain.edu:port) + """ + url = base_host_url + '/eland_config/%s/' % (flowcell) + + f = open(output_filepath, 'w') + #try: + try: + web = urllib2.urlopen(url) + except urllib2.URLError, e: + errmsg = 'URLError: %d' % (e.code,) + logging.error(errmsg) + logging.error('opened %s' % (url,)) + logging.error('%s' % ( e.read(),)) + raise IOError(errmsg) + + #except IOError, msg: + # if str(msg).find("Connection refused") >= 0: + # print 'Error: Connection refused for: %s' % (url) + # f.close() + # sys.exit(1) + # elif str(msg).find("Name or service not known") >= 0: + # print 'Error: Invalid domain or ip address for: %s' % (url) + # f.close() + # sys.exit(2) + # else: + # raise IOError, msg + + data = web.read() + + if data.find('Hmm, config file for') >= 0: + msg = "Flowcell (%s) not found in DB; full url(%s)" % (flowcell, url) + raise FlowCellNotFound, msg + + if data.find('404 - Not Found') >= 0: + msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \ + "Did you get right port #?" 
% (flowcell, base_host_url, url) + raise FlowCellNotFound, msg + + f.write(data) + web.close() + f.close() + logging.info('Wrote config file to %s' % (output_filepath,)) + + diff --git a/htsworkflow/pipeline/run_status.py b/htsworkflow/pipeline/run_status.py new file mode 100644 index 0000000..39dc54c --- /dev/null +++ b/htsworkflow/pipeline/run_status.py @@ -0,0 +1,478 @@ +import glob +import re +import os +import sys +import time +import threading + +s_comment = re.compile('^#') +s_general_read_len = re.compile('^READ_LENGTH ') +s_read_len = re.compile('^[1-8]+:READ_LENGTH ') + +s_firecrest = None + +def _four_digit_num_in_string(num): + if num < 0: + pass + elif num < 10: + return '000' + str(num) + elif num < 100: + return '00' + str(num) + elif num < 1000: + return '0' + str(num) + elif num < 10000: + return str(num) + + msg = 'Invalid number: %s' % (num) + raise ValueError, msg + +def _two_digit_num_in_string(num): + if num < 0: + pass + elif num < 10: + return '0' + str(num) + elif num < 100: + return str(num) + + msg = 'Invalid number: %s' % (num) + raise ValueError, msg + + +# FIRECREST PATTERNS +# _p2f(, lane, tile, cycle) +PATTERN_FIRECREST_QCM = 's_%s_%s_%s_qcm.xml' + +# _p2f(, lane, tile) +PATTERN_FIRECREST_INT = 's_%s_%s_02_int.txt' +PATTERN_FIRECREST_NSE = 's_%s_%s_nse.txt.gz' +PATTERN_FIRECREST_POS = 's_%s_%s_pos.txt' +PATTERN_FIRECREST_IDX = 's_%s_%s_idx.txt' +PATTERN_FIRECREST_CLU1 = 's_%s_%s_01_1_clu.txt' +PATTERN_FIRECREST_CLU2 = 's_%s_%s_01_2_clu.txt' +PATTERN_FIRECREST_CLU3 = 's_%s_%s_01_3_clu.txt' +PATTERN_FIRECREST_CLU4 = 's_%s_%s_01_4_clu.txt' + + +# BUSTARD PATTERNS +# _p2f(, lane, tile) +PATTERN_BUSTARD_SIG2 = 's_%s_%s_sig2.txt' +PATTERN_BUSTARD_PRB = 's_%s_%s_prb.txt' + + + +# GERALD PATTERNS +# _p2f(, lane, tile) +PATTERN_GERALD_ALLTMP = 's_%s_%s_all.txt.tmp' +PATTERN_GERALD_QRAWTMP = 's_%s_%s_qraw.txt.tmp' +PATTERN_GERALD_ALLPNGTMP = 's_%s_%s_all.tmp.png' +PATTERN_GERALD_ALIGNTMP = 's_%s_%s_align.txt.tmp' +PATTERN_GERALD_QVALTMP = 's_%s_%s_qval.txt.tmp' +PATTERN_GERALD_SCORETMP = 's_%s_%s_score.txt.tmp' +PATTERN_GERALD_PREALIGNTMP = 's_%s_%s_prealign.txt.tmp' +PATTERN_GERALD_REALIGNTMP = 's_%s_%s_realign.txt.tmp' +PATTERN_GERALD_RESCORETMP = 's_%s_%s_rescore.txt.tmp' +PATTERN_GERALD_RESCOREPNG = 's_%s_%s_rescore.png' +PATTERN_GERALD_ERRORSTMPPNG = 's_%s_%s_errors.tmp.png' +PATTERN_GERALD_QCALTMP = 's_%s_%s_qcal.txt.tmp' +PATTERN_GERALD_QVAL = 's_%s_%s_qval.txt' + +# _p2f(, lane) +PATTERN_GERALD_SEQPRETMP = 's_%s_seqpre.txt.tmp' +PATTERN_GERALD_RESULTTMP = 's_%s_eland_result.txt.tmp' +PATTERN_GERALD_SIGMEANSTMP = 's_%s_Signal_Means.txt.tmp' +PATTERN_GERALD_CALLPNG = 's_%s_call.png' +PATTERN_GERALD_ALLPNG = 's_%s_all.png' +PATTERN_GERALD_PERCENTALLPNG = 's_%s_percent_all.png' +PATTERN_GERALD_PERCENTCALLPNG = 's_%s_percent_call.png' +PATTERN_GERALD_PERCENTBASEPNG = 's_%s_percent_base.png' +PATTERN_GERALD_FILTTMP = 's_%s_filt.txt.tmp' +PATTERN_GERALD_FRAGTMP = 's_%s_frag.txt.tmp' +PATTERN_GERALD_QREPORTTMP = 's_%s_qreport.txt.tmp' +PATTERN_GERALD_QTABLETMP = 's_%s_qtable.txt.tmp' +PATTERN_GERALD_QCALREPORTTMP = 's_%s_qcalreport.txt.tmp' +PATTERN_GERALD_SEQUENCETMP = 's_%s_sequence.txt.tmp' +PATTERN_GERALD_LANEFINISHED = 's_%s_finished.txt' + + + +def _p2f(pattern, lane, tile=None, cycle=None): + """ + Converts a pattern plus info into file names + """ + + # lane, and cycle provided (INVALID) + if tile is None and cycle is not None: + msg = "Handling of cycle without tile is not currently implemented." 
+ raise ValueError, msg + + # lane, tile, cycle provided + elif cycle: + return pattern % (lane, + _four_digit_num_in_string(tile), + _two_digit_num_in_string(cycle)) + + # lane, tile provided + elif tile: + return pattern % (lane, _four_digit_num_in_string(tile)) + + # lane provided + else: + return pattern % (lane) + + +class GARunStatus(object): + + def __init__(self, conf_filepath): + """ + Given an eland config file in the top level directory + of a run, predicts the files that will be generated + during a run and provides methods for retrieving + (completed, total) for each step or entire run. + """ + #print 'self._conf_filepath = %s' % (conf_filepath) + self._conf_filepath = conf_filepath + self._base_dir, junk = os.path.split(conf_filepath) + self._image_dir = os.path.join(self._base_dir, 'Images') + + self.lanes = [] + self.lane_read_length = {} + self.tiles = None + self.cycles = None + + self.status = {} + self.status['firecrest'] = {} + self.status['bustard'] = {} + self.status['gerald'] = {} + + self._process_config() + self._count_tiles() + self._count_cycles() + self._generate_expected() + + + def _process_config(self): + """ + Grabs info from self._conf_filepath + """ + f = open(self._conf_filepath, 'r') + + for line in f: + + #Skip comment lines for now. + if s_comment.search(line): + continue + + mo = s_general_read_len.search(line) + if mo: + read_length = int(line[mo.end():]) + #Handle general READ_LENGTH + for i in range(1,9): + self.lane_read_length[i] = read_length + + mo = s_read_len.search(line) + if mo: + read_length = int(line[mo.end():]) + lanes, junk = line.split(':') + + #Convert lanes from string of lanes to list of lane #s. + lanes = [ int(i) for i in lanes ] + + + for lane in lanes: + + #Keep track of which lanes are being run. + if lane not in self.lanes: + self.lanes.append(lane) + + #Update with lane specific read lengths + self.lane_read_length[lane] = read_length + + self.lanes.sort() + + + def _count_tiles(self): + """ + Count the number of tiles being used + """ + self.tiles = len(glob.glob(os.path.join(self._image_dir, + 'L001', + 'C1.1', + 's_1_*_a.tif'))) + + def _count_cycles(self): + """ + Figures out the number of cycles that are available + """ + #print 'self._image_dir = %s' % (self._image_dir) + cycle_dirs = glob.glob(os.path.join(self._image_dir, 'L001', 'C*.1')) + #print 'cycle_dirs = %s' % (cycle_dirs) + cycle_list = [] + for cycle_dir in cycle_dirs: + junk, c = os.path.split(cycle_dir) + cycle_list.append(int(c[1:c.find('.')])) + + self.cycles = max(cycle_list) + + + + + def _generate_expected(self): + """ + generates a list of files we expect to find. 
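+
+        For example (illustrative values), lane 1, tile 5, cycle 2
+        expands PATTERN_FIRECREST_QCM through _p2f to
+        's_1_0005_02_qcm.xml'.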
+ """ + + firecrest = self.status['firecrest'] + bustard = self.status['bustard'] + gerald = self.status['gerald'] + + + for lane in self.lanes: + for tile in range(1,self.tiles+1): + for cycle in range(1, self.cycles+1): + + ########################## + # LANE, TILE, CYCLE LAYER + + # FIRECREST + firecrest[_p2f(PATTERN_FIRECREST_QCM, lane, tile, cycle)] = False + + + ################### + # LANE, TILE LAYER + + # FIRECREST + firecrest[_p2f(PATTERN_FIRECREST_INT, lane, tile)] = False + firecrest[_p2f(PATTERN_FIRECREST_NSE, lane, tile)] = False + firecrest[_p2f(PATTERN_FIRECREST_POS, lane, tile)] = False + firecrest[_p2f(PATTERN_FIRECREST_IDX, lane, tile)] = False + firecrest[_p2f(PATTERN_FIRECREST_CLU1, lane, tile)] = False + firecrest[_p2f(PATTERN_FIRECREST_CLU2, lane, tile)] = False + firecrest[_p2f(PATTERN_FIRECREST_CLU3, lane, tile)] = False + firecrest[_p2f(PATTERN_FIRECREST_CLU4, lane, tile)] = False + + + # BUSTARD + bustard[_p2f(PATTERN_BUSTARD_SIG2, lane, tile)] = False + bustard[_p2f(PATTERN_BUSTARD_PRB, lane, tile)] = False + + + # GERALD + #gerald[_p2f(PATTERN_GERALD_ALLTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_QRAWTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_ALLPNGTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_ALIGNTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_QVALTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_SCORETMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_PREALIGNTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_REALIGNTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_RESCORETMP, lane, tile)] = False + gerald[_p2f(PATTERN_GERALD_RESCOREPNG, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_ERRORSTMPPNG, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_QCALTMP, lane, tile)] = False + #gerald[_p2f(PATTERN_GERALD_QVAL, lane, tile)] = False + + ################### + # LANE LAYER + + # GERALD + #gerald[_p2f(PATTERN_GERALD_SEQPRETMP, lane)] = False + #gerald[_p2f(PATTERN_GERALD_RESULTTMP, lane)] = False + #gerald[_p2f(PATTERN_GERALD_SIGMEANSTMP, lane)] = False + gerald[_p2f(PATTERN_GERALD_CALLPNG, lane)] = False + gerald[_p2f(PATTERN_GERALD_ALLPNG, lane)] = False + gerald[_p2f(PATTERN_GERALD_PERCENTALLPNG, lane)] = False + gerald[_p2f(PATTERN_GERALD_PERCENTCALLPNG, lane)] = False + gerald[_p2f(PATTERN_GERALD_PERCENTBASEPNG, lane)] = False + #gerald[_p2f(PATTERN_GERALD_FILTTMP, lane)] = False + #gerald[_p2f(PATTERN_GERALD_FRAGTMP, lane)] = False + #gerald[_p2f(PATTERN_GERALD_QREPORTTMP, lane)] = False + #gerald[_p2f(PATTERN_GERALD_QTABLETMP, lane)] = False + #gerald[_p2f(PATTERN_GERALD_QCALREPORTTMP, lane)] = False + #gerald[_p2f(PATTERN_GERALD_SEQUENCETMP, lane)] = False + gerald[_p2f(PATTERN_GERALD_LANEFINISHED, lane)] = False + + + + ################# + # LOOPS FINISHED + + # FIRECREST + firecrest['offsets_finished.txt'] = False + firecrest['finished.txt'] = False + + # BUSTARD + bustard['finished.txt'] = False + + # GERALD + gerald['tiles.txt'] = False + gerald['FullAll.htm'] = False + #gerald['All.htm.tmp'] = False + #gerald['Signal_Means.txt.tmp'] = False + #gerald['plotIntensity_for_IVC'] = False + #gerald['IVC.htm.tmp'] = False + gerald['FullError.htm'] = False + gerald['FullPerfect.htm'] = False + #gerald['Error.htm.tmp'] = False + #gerald['Perfect.htm.tmp'] = False + #gerald['Summary.htm.tmp'] = False + #gerald['Tile.htm.tmp'] = False + gerald['finished.txt'] = False + + def statusFirecrest(self): + """ + returns (, ) + """ + firecrest = self.status['firecrest'] + total = len(firecrest) + 
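+        # each expected file's flag starts False and is flipped to True
+        # by the matching update call, so counting True values counts
+        # the files seen so far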
completed = firecrest.values().count(True) + + return (completed, total) + + + def statusBustard(self): + """ + returns (, ) + """ + bustard = self.status['bustard'] + total = len(bustard) + completed = bustard.values().count(True) + + return (completed, total) + + + def statusGerald(self): + """ + returns (, ) + """ + gerald = self.status['gerald'] + total = len(gerald) + completed = gerald.values().count(True) + + return (completed, total) + + + def statusTotal(self): + """ + returns (, ) + """ + #f = firecrest c = completed + #b = bustard t = total + #g = gerald + fc, ft = self.statusFirecrest() + bc, bt = self.statusBustard() + gc, gt = self.statusGerald() + + return (fc+bc+gc, ft+bt+gt) + + + def statusReport(self): + """ + Generate the basic percent complete report + """ + def _percentCompleted(completed, total): + """ + Returns precent completed as float + """ + return (completed / float(total)) * 100 + + fc, ft = self.statusFirecrest() + bc, bt = self.statusBustard() + gc, gt = self.statusGerald() + tc, tt = self.statusTotal() + + fp = _percentCompleted(fc, ft) + bp = _percentCompleted(bc, bt) + gp = _percentCompleted(gc, gt) + tp = _percentCompleted(tc, tt) + + report = ['Firecrest: %s%% (%s/%s)' % (fp, fc, ft), + ' Bustard: %s%% (%s/%s)' % (bp, bc, bt), + ' Gerald: %s%% (%s/%s)' % (gp, gc, gt), + '-----------------------', + ' Total: %s%% (%s/%s)' % (tp, tc, tt), + ] + return report + + def updateFirecrest(self, filename): + """ + Marks firecrest filename as being completed. + """ + self.status['firecrest'][filename] = True + + + def updateBustard(self, filename): + """ + Marks bustard filename as being completed. + """ + self.status['bustard'][filename] = True + + + def updateGerald(self, filename): + """ + Marks gerald filename as being completed. + """ + self.status['gerald'][filename] = True + + + +################################################## +# Functions to be called by Thread(target=) +def _cmdLineStatusMonitorFunc(conf_info): + """ + Given a ConfigInfo object, provides status to stdout. + + You should probably use startCmdLineStatusMonitor() + instead of ths function. + + Use with: + t = threading.Thread(target=_cmdLineStatusMonitorFunc, + args=[conf_info]) + t.setDaemon(True) + t.start() + """ + SLEEP_AMOUNT = 30 + + while 1: + if conf_info.status is None: + print "No status object yet." + time.sleep(SLEEP_AMOUNT) + continue + + report = conf_info.status.statusReport() + print os.linesep.join(report) + print + + time.sleep(SLEEP_AMOUNT) + + +############################################# +# Start monitor thread convenience functions +def startCmdLineStatusMonitor(conf_info): + """ + Starts a command line status monitor given a conf_info object. + """ + t = threading.Thread(target=_cmdLineStatusMonitorFunc, args=[conf_info]) + t.setDaemon(True) + t.start() + +from optparse import OptionParser +def make_parser(): + usage = "%prog: config file" + + parser = OptionParser() + return parser + +def main(cmdline=None): + parser = make_parser() + opt, args = parser.parse_args(cmdline) + + if len(args) != 1: + parser.error("need name of configuration file") + + status = GARunStatus(args[0]) + print os.linesep.join(status.statusReport()) + return 0 + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) + diff --git a/htsworkflow/pipeline/runfolder.py b/htsworkflow/pipeline/runfolder.py new file mode 100644 index 0000000..492d103 --- /dev/null +++ b/htsworkflow/pipeline/runfolder.py @@ -0,0 +1,313 @@ +""" +Core information needed to inspect a runfolder. 
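+
+A typical session might look like (a sketch; the runfolder path is
+hypothetical):
+
+    from htsworkflow.pipeline import runfolder
+    runs = runfolder.get_runs('/tmp/080102_HWI-EAS229_0010_207BTAAXX')
+    for run in runs:
+        run.save()
+    print runfolder.summary_report(runs)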
+""" +from glob import glob +import logging +import os +import re +import shutil +import stat +import subprocess +import sys +import time + +try: + from xml.etree import ElementTree +except ImportError, e: + from elementtree import ElementTree + +EUROPEAN_STRPTIME = "%d-%m-%Y" +EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})" +VERSION_RE = "([0-9\.]+)" +USER_RE = "([a-zA-Z0-9]+)" +LANES_PER_FLOWCELL = 8 + +from htsworkflow.util.alphanum import alphanum +from htsworkflow.util.ethelp import indent, flatten + + +class PipelineRun(object): + """ + Capture "interesting" information about a pipeline run + """ + XML_VERSION = 1 + PIPELINE_RUN = 'PipelineRun' + FLOWCELL_ID = 'FlowcellID' + + def __init__(self, pathname=None, firecrest=None, bustard=None, gerald=None, xml=None): + if pathname is not None: + self.pathname = os.path.normpath(pathname) + else: + self.pathname = None + self._name = None + self._flowcell_id = None + self.firecrest = firecrest + self.bustard = bustard + self.gerald = gerald + + if xml is not None: + self.set_elements(xml) + + def _get_flowcell_id(self): + # extract flowcell ID + if self._flowcell_id is None: + config_dir = os.path.join(self.pathname, 'Config') + flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml') + if os.path.exists(flowcell_id_path): + flowcell_id_tree = ElementTree.parse(flowcell_id_path) + self._flowcell_id = flowcell_id_tree.findtext('Text') + else: + path_fields = self.pathname.split('_') + if len(path_fields) > 0: + # guessing last element of filename + flowcell_id = path_fields[-1] + else: + flowcell_id = 'unknown' + + logging.warning( + "Flowcell id was not found, guessing %s" % ( + flowcell_id)) + self._flowcell_id = flowcell_id + return self._flowcell_id + flowcell_id = property(_get_flowcell_id) + + def get_elements(self): + """ + make one master xml file from all of our sub-components. 
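+
+        The tree nests a FlowcellID element plus the firecrest,
+        bustard, and gerald subtrees under a single PipelineRun root.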
+        """
+        root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
+        flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
+        flowcell.text = self.flowcell_id
+        root.append(self.firecrest.get_elements())
+        root.append(self.bustard.get_elements())
+        root.append(self.gerald.get_elements())
+        return root
+
+    def set_elements(self, tree):
+        # this file gets imported by all the others,
+        # so we need to hide the imports to avoid cyclic imports
+        from htsworkflow.pipeline import firecrest
+        from htsworkflow.pipeline import bustard
+        from htsworkflow.pipeline import gerald
+
+        tag = tree.tag.lower()
+        if tag != PipelineRun.PIPELINE_RUN.lower():
+            raise ValueError('Pipeline Run Expecting %s got %s' % (
+                PipelineRun.PIPELINE_RUN, tag))
+        for element in tree:
+            tag = element.tag.lower()
+            if tag == PipelineRun.FLOWCELL_ID.lower():
+                self._flowcell_id = element.text
+            #ok the xword.Xword.XWORD pattern for module.class.constant is lame
+            elif tag == firecrest.Firecrest.FIRECREST.lower():
+                self.firecrest = firecrest.Firecrest(xml=element)
+            elif tag == bustard.Bustard.BUSTARD.lower():
+                self.bustard = bustard.Bustard(xml=element)
+            elif tag == gerald.Gerald.GERALD.lower():
+                self.gerald = gerald.Gerald(xml=element)
+            else:
+                logging.warn('PipelineRun unrecognized tag %s' % (tag,))
+
+    def _get_run_name(self):
+        """
+        Given a run tuple, find the latest date and use that as our name
+        """
+        if self._name is None:
+            tmax = max(self.firecrest.time, self.bustard.time, self.gerald.time)
+            timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
+            self._name = 'run_'+self.flowcell_id+"_"+timestamp+'.xml'
+        return self._name
+    name = property(_get_run_name)
+
+    def save(self, destdir=None):
+        if destdir is None:
+            destdir = ''
+        logging.info("Saving run report "+ self.name)
+        xml = self.get_elements()
+        indent(xml)
+        dest_pathname = os.path.join(destdir, self.name)
+        ElementTree.ElementTree(xml).write(dest_pathname)
+
+    def load(self, filename):
+        logging.info("Loading run report from " + filename)
+        tree = ElementTree.parse(filename).getroot()
+        self.set_elements(tree)
+
+def get_runs(runfolder):
+    """
+    Search through a run folder for all the various sub component runs
+    and then return a PipelineRun for each different combination.
+
+    For example, if there are two different GERALD runs, this will
+    generate two different PipelineRun objects, that differ
+    in their gerald component.
+    """
+    from htsworkflow.pipeline import firecrest
+    from htsworkflow.pipeline import bustard
+    from htsworkflow.pipeline import gerald
+
+    datadir = os.path.join(runfolder, 'Data')
+
+    logging.info('Searching for runs in ' + datadir)
+    runs = []
+    for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
+        f = firecrest.firecrest(firecrest_pathname)
+        bustard_glob = os.path.join(firecrest_pathname, "Bustard*")
+        for bustard_pathname in glob(bustard_glob):
+            b = bustard.bustard(bustard_pathname)
+            gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
+            for gerald_pathname in glob(gerald_glob):
+                try:
+                    g = gerald.gerald(gerald_pathname)
+                    runs.append(PipelineRun(runfolder, f, b, g))
+                except IOError, e:
+                    print "Ignoring", str(e)
+    return runs
+
+
+def extract_run_parameters(runs):
+    """
+    Search through runfolder_path for various runs and grab their parameters
+    """
+    for run in runs:
+        run.save()
+
+def summarize_mapped_reads(mapped_reads):
+    """
+    Summarize per chromosome reads into a genome count
+    But handle spike-in/contamination symlinks separately.
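+
+    For example (illustrative counts), {'dm3/chr2L.fa': 10,
+    'dm3/chrX.fa': 5, 'Lambda.fa': 2} collapses to
+    {'dm3': 15, 'Lambda.fa': 2}.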
+ """ + summarized_reads = {} + genome_reads = 0 + genome = 'unknown' + for k, v in mapped_reads.items(): + path, k = os.path.split(k) + if len(path) > 0: + genome = path + genome_reads += v + else: + summarized_reads[k] = summarized_reads.setdefault(k, 0) + v + summarized_reads[genome] = genome_reads + return summarized_reads + +def summary_report(runs): + """ + Summarize cluster numbers and mapped read counts for a runfolder + """ + report = [] + for run in runs: + # print a run name? + report.append('Summary for %s' % (run.name,)) + # sort the report + eland_keys = run.gerald.eland_results.results.keys() + eland_keys.sort(alphanum) + + lane_results = run.gerald.summary.lane_results + for lane_id in eland_keys: + result = run.gerald.eland_results.results[lane_id] + report.append("Sample name %s" % (result.sample_name)) + report.append("Lane id %s" % (result.lane_id,)) + cluster = lane_results[result.lane_id].cluster + report.append("Clusters %d +/- %d" % (cluster[0], cluster[1])) + report.append("Total Reads: %d" % (result.reads)) + mc = result._match_codes + nm = mc['NM'] + nm_percent = float(nm)/result.reads * 100 + qc = mc['QC'] + qc_percent = float(qc)/result.reads * 100 + + report.append("No Match: %d (%2.2g %%)" % (nm, nm_percent)) + report.append("QC Failed: %d (%2.2g %%)" % (qc, qc_percent)) + report.append('Unique (0,1,2 mismatches) %d %d %d' % \ + (mc['U0'], mc['U1'], mc['U2'])) + report.append('Repeat (0,1,2 mismatches) %d %d %d' % \ + (mc['R0'], mc['R1'], mc['R2'])) + report.append("Mapped Reads") + mapped_reads = summarize_mapped_reads(result.mapped_reads) + for name, counts in mapped_reads.items(): + report.append(" %s: %d" % (name, counts)) + report.append('---') + report.append('') + return os.linesep.join(report) + +def extract_results(runs, output_base_dir=None): + if output_base_dir is None: + output_base_dir = os.getcwd() + + for r in runs: + result_dir = os.path.join(output_base_dir, r.flowcell_id) + logging.info("Using %s as result directory" % (result_dir,)) + if not os.path.exists(result_dir): + os.mkdir(result_dir) + + # create cycle_dir + cycle = "C%d-%d" % (r.firecrest.start, r.firecrest.stop) + logging.info("Filling in %s" % (cycle,)) + cycle_dir = os.path.join(result_dir, cycle) + if os.path.exists(cycle_dir): + logging.error("%s already exists, not overwriting" % (cycle_dir,)) + continue + else: + os.mkdir(cycle_dir) + + # copy stuff out of the main run + g = r.gerald + + # save run file + r.save(cycle_dir) + + # Copy Summary.htm + summary_path = os.path.join(r.gerald.pathname, 'Summary.htm') + if os.path.exists(summary_path): + logging.info('Copying %s to %s' % (summary_path, cycle_dir)) + shutil.copy(summary_path, cycle_dir) + else: + logging.info('Summary file %s was not found' % (summary_path,)) + + # tar score files + score_files = [] + for f in os.listdir(g.pathname): + if re.match('.*_score.txt', f): + score_files.append(f) + + tar_cmd = ['/bin/tar', 'c'] + score_files + bzip_cmd = [ 'bzip2', '-9', '-c' ] + tar_dest_name =os.path.join(cycle_dir, 'scores.tar.bz2') + tar_dest = open(tar_dest_name, 'w') + logging.info("Compressing score files in %s" % (g.pathname,)) + logging.info("Running tar: " + " ".join(tar_cmd[:10])) + logging.info("Running bzip2: " + " ".join(bzip_cmd)) + logging.info("Writing to %s" %(tar_dest_name)) + + tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, cwd=g.pathname) + bzip = subprocess.Popen(bzip_cmd, stdin=tar.stdout, stdout=tar_dest) + tar.wait() + + # copy & bzip eland files + for eland_lane in 
g.eland_results.values(): + source_name = eland_lane.pathname + path, name = os.path.split(eland_lane.pathname) + dest_name = os.path.join(cycle_dir, name+'.bz2') + + args = ['bzip2', '-9', '-c', source_name] + logging.info('Running: %s' % ( " ".join(args) )) + bzip_dest = open(dest_name, 'w') + bzip = subprocess.Popen(args, stdout=bzip_dest) + logging.info('Saving to %s' % (dest_name, )) + bzip.wait() + +def clean_runs(runs): + """ + Clean up run folders to optimize for compression. + """ + # TODO: implement this. + # rm RunLog*.xml + # rm pipeline_*.txt + # rm gclog.txt + # rm NetCopy.log + # rm nfn.log + # rm Images/L* + # cd Data/C1-*_Firecrest* + # make clean_intermediate + + pass diff --git a/htsworkflow/pipeline/test/test_genome_mapper.py b/htsworkflow/pipeline/test/test_genome_mapper.py new file mode 100644 index 0000000..8916965 --- /dev/null +++ b/htsworkflow/pipeline/test/test_genome_mapper.py @@ -0,0 +1,33 @@ +import unittest + +from StringIO import StringIO +from htsworkflow.pipeline import genome_mapper + +class testGenomeMapper(unittest.TestCase): + def test_construct_mapper(self): + genomes = { + 'Arabidopsis thaliana': {'v01212004': '/arabidopsis'}, + 'Homo sapiens': {'hg18': '/hg18'}, + 'Mus musculus': {'mm8': '/mm8', + 'mm9': '/mm9', + 'mm10': '/mm10'}, + 'Phage': {'174': '/phi'}, + } + genome_map = genome_mapper.constructMapperDict(genomes) + + self.failUnlessEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8") + self.failUnlessEqual("%(Phage|174)s" % (genome_map), "/phi") + self.failUnlessEqual("%(Mus musculus)s" % (genome_map), "/mm10") + self.failUnlessEqual("%(Mus musculus|mm8)s" % (genome_map), "/mm8") + self.failUnlessEqual("%(Mus musculus|mm10)s" % (genome_map), "/mm10") + + self.failUnlessEqual(len(genome_map.keys()), 6) + self.failUnlessEqual(len(genome_map.values()), 6) + self.failUnlessEqual(len(genome_map.items()), 6) + + +def suite(): + return unittest.makeSuite(testGenomeMapper,'test') + +if __name__ == "__main__": + unittest.main(defaultTest="suite") diff --git a/htsworkflow/pipeline/test/test_runfolder026.py b/htsworkflow/pipeline/test/test_runfolder026.py new file mode 100644 index 0000000..74247a6 --- /dev/null +++ b/htsworkflow/pipeline/test/test_runfolder026.py @@ -0,0 +1,601 @@ +#!/usr/bin/env python + +from datetime import datetime, date +import os +import tempfile +import shutil +import unittest + +from htsworkflow.pipeline import firecrest +from htsworkflow.pipeline import bustard +from htsworkflow.pipeline import gerald +from htsworkflow.pipeline import runfolder +from htsworkflow.pipeline.runfolder import ElementTree + + +def make_flowcell_id(runfolder_dir, flowcell_id=None): + if flowcell_id is None: + flowcell_id = '207BTAAXY' + + config = """ + + %s +""" % (flowcell_id,) + config_dir = os.path.join(runfolder_dir, 'Config') + + if not os.path.exists(config_dir): + os.mkdir(config_dir) + pathname = os.path.join(config_dir, 'FlowcellId.xml') + f = open(pathname,'w') + f.write(config) + f.close() + +def make_matrix(matrix_dir): + contents = """# Auto-generated frequency response matrix +> A +> C +> G +> T +0.77 0.15 -0.04 -0.04 +0.76 1.02 -0.05 -0.06 +-0.10 -0.10 1.17 -0.03 +-0.13 -0.12 0.80 1.27 +""" + s_matrix = os.path.join(matrix_dir, 's_matrix.txt') + f = open(s_matrix, 'w') + f.write(contents) + f.close() + +def make_phasing_params(bustard_dir): + for lane in range(1,9): + pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane)) + f = open(pathname, 'w') + f.write(""" + 0.009900 + 0.003500 + +""") + f.close() + +def 
make_gerald_config(gerald_dir): + config_xml = """ + + default + + + + + Need_to_specify_ELAND_genome_directory + 8 + + domain.com + diane + localhost:25 + /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane + /home/diane/gec + 1 + /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes + Need_to_specify_genome_file_name + genome + /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane + + _prb.txt + 12 + '((CHASTITY>=0.6))' + _qhg.txt + --symbolic + 32 + --scarf + _seq.txt + _sig2.txt + _sig.txt + @(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp + s_[1-8]_[0-9][0-9][0-9][0-9] + s + Sat Apr 19 19:08:30 2008 + /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald + all + http://host.domain.com/yourshare/ + + + + eland + eland + eland + eland + eland + eland + eland + eland + + + /g/dm3 + /g/equcab1 + /g/equcab1 + /g/canfam2 + /g/hg18 + /g/hg18 + /g/hg18 + /g/hg18 + + + 32 + 32 + 32 + 32 + 32 + 32 + 32 + 32 + + + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + + + +""" + pathname = os.path.join(gerald_dir, 'config.xml') + f = open(pathname,'w') + f.write(config_xml) + f.close() + + +def make_summary_htm(gerald_dir): + summary_htm = """ + + + + +
+ 080416_HWI-EAS229_0024_207BTAAXX Summary
+ Summary Information For Experiment 080416_HWI-EAS229_0024_207BTAAXX on Machine HWI-EAS229
+
+ Chip Summary
+ Machine      HWI-EAS229
+ Run Folder   080416_HWI-EAS229_0024_207BTAAXX
+ Chip ID      unknown
+
+ Lane Parameter Summary
+ Lane | Sample ID | Sample Target | Sample Type | Length | Filter | Tiles
+ 1 | unknown | dm3     | ELAND | 32 | '((CHASTITY>=0.6))' | Lane 1
+ 2 | unknown | equcab1 | ELAND | 32 | '((CHASTITY>=0.6))' | Lane 2
+ 3 | unknown | equcab1 | ELAND | 32 | '((CHASTITY>=0.6))' | Lane 3
+ 4 | unknown | canfam2 | ELAND | 32 | '((CHASTITY>=0.6))' | Lane 4
+ 5 | unknown | hg18    | ELAND | 32 | '((CHASTITY>=0.6))' | Lane 5
+ 6 | unknown | hg18    | ELAND | 32 | '((CHASTITY>=0.6))' | Lane 6
+ 7 | unknown | hg18    | ELAND | 32 | '((CHASTITY>=0.6))' | Lane 7
+ 8 | unknown | hg18    | ELAND | 32 | '((CHASTITY>=0.6))' | Lane 8
+
+ Lane Results Summary
+ Lane | Clusters | Av 1st Cycle Int | % intensity after 20 cycles | % PF Clusters | % Align (PF) | Av Alignment Score (PF) | % Error Rate (PF)
+ 1 | 17421 +/- 2139 | 7230 +/- 801  | 23.73 +/- 10.79 | 13.00 +/- 22.91 | 32.03 +/- 18.45 | 6703.57 +/- 3753.85  | 4.55 +/- 4.81
+ 2 | 20311 +/- 2402 | 7660 +/- 678  | 17.03 +/- 4.40  | 40.74 +/- 30.33 | 29.54 +/- 9.03  | 5184.02 +/- 1631.54  | 3.27 +/- 3.94
+ 3 | 20193 +/- 2399 | 7700 +/- 797  | 15.75 +/- 3.30  | 56.56 +/- 17.16 | 27.33 +/- 7.48  | 4803.49 +/- 1313.31  | 3.07 +/- 2.86
+ 4 | 15537 +/- 2531 | 7620 +/- 1392 | 15.37 +/- 3.79  | 63.05 +/- 18.30 | 15.88 +/- 4.99  | 3162.13 +/- 962.59   | 3.11 +/- 2.22
+ 5 | 32047 +/- 3356 | 8093 +/- 831  | 23.79 +/- 6.18  | 53.36 +/- 18.06 | 48.04 +/- 13.77 | 9866.23 +/- 2877.30  | 2.26 +/- 1.16
+ 6 | 32946 +/- 4753 | 8227 +/- 736  | 24.07 +/- 4.69  | 54.65 +/- 12.57 | 50.98 +/- 10.54 | 10468.86 +/- 2228.53 | 2.21 +/- 2.33
+ 7 | 39504 +/- 4171 | 8401 +/- 785  | 22.55 +/- 4.56  | 45.22 +/- 10.34 | 48.41 +/- 9.67  | 9829.40 +/- 1993.20  | 2.26 +/- 1.11
+ 8 | 37998 +/- 3792 | 8443 +/- 1211 | 39.03 +/- 7.52  | 42.16 +/- 12.35 | 40.98 +/- 14.89 | 8128.87 +/- 3055.34  | 3.57 +/- 2.77
+ + +""" + pathname = os.path.join(gerald_dir, 'Summary.htm') + f = open(pathname, 'w') + f.write(summary_htm) + f.close() + +def make_eland_results(gerald_dir): + eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D. +>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T +>HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0 +>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T +""" + for i in range(1,9): + pathname = os.path.join(gerald_dir, + 's_%d_eland_result.txt' % (i,)) + f = open(pathname, 'w') + f.write(eland_result) + f.close() + +class RunfolderTests(unittest.TestCase): + """ + Test components of the runfolder processing code + which includes firecrest, bustard, and gerald + """ + def setUp(self): + # make a fake runfolder directory + self.temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_') + + self.runfolder_dir = os.path.join(self.temp_dir, + '080102_HWI-EAS229_0010_207BTAAXX') + os.mkdir(self.runfolder_dir) + + self.data_dir = os.path.join(self.runfolder_dir, 'Data') + os.mkdir(self.data_dir) + + self.firecrest_dir = os.path.join(self.data_dir, + 'C1-33_Firecrest1.8.28_12-04-2008_diane' + ) + os.mkdir(self.firecrest_dir) + self.matrix_dir = os.path.join(self.firecrest_dir, 'Matrix') + os.mkdir(self.matrix_dir) + make_matrix(self.matrix_dir) + + self.bustard_dir = os.path.join(self.firecrest_dir, + 'Bustard1.8.28_12-04-2008_diane') + os.mkdir(self.bustard_dir) + make_phasing_params(self.bustard_dir) + + self.gerald_dir = os.path.join(self.bustard_dir, + 'GERALD_12-04-2008_diane') + os.mkdir(self.gerald_dir) + make_gerald_config(self.gerald_dir) + make_summary_htm(self.gerald_dir) + make_eland_results(self.gerald_dir) + + def tearDown(self): + shutil.rmtree(self.temp_dir) + + def test_firecrest(self): + """ + Construct a firecrest object + """ + f = firecrest.firecrest(self.firecrest_dir) + self.failUnlessEqual(f.version, '1.8.28') + self.failUnlessEqual(f.start, 1) + self.failUnlessEqual(f.stop, 33) + self.failUnlessEqual(f.user, 'diane') + self.failUnlessEqual(f.date, date(2008,4,12)) + + xml = f.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + + f2 = firecrest.Firecrest(xml=xml) + self.failUnlessEqual(f.version, f2.version) + self.failUnlessEqual(f.start, f2.start) + self.failUnlessEqual(f.stop, f2.stop) + self.failUnlessEqual(f.user, f2.user) + self.failUnlessEqual(f.date, f2.date) + + def test_bustard(self): + """ + construct a bustard object + """ + b = bustard.bustard(self.bustard_dir) + self.failUnlessEqual(b.version, '1.8.28') + self.failUnlessEqual(b.date, date(2008,4,12)) + self.failUnlessEqual(b.user, 'diane') + self.failUnlessEqual(len(b.phasing), 8) + self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099) + + xml = b.get_elements() + b2 = bustard.Bustard(xml=xml) + self.failUnlessEqual(b.version, b2.version) + self.failUnlessEqual(b.date, b2.date ) + self.failUnlessEqual(b.user, b2.user) + self.failUnlessEqual(len(b.phasing), len(b2.phasing)) + for key in b.phasing.keys(): + self.failUnlessEqual(b.phasing[key].lane, + b2.phasing[key].lane) + self.failUnlessEqual(b.phasing[key].phasing, + b2.phasing[key].phasing) + self.failUnlessEqual(b.phasing[key].prephasing, + b2.phasing[key].prephasing) + + def test_gerald(self): + # need to update gerald and make tests for it + g = gerald.gerald(self.gerald_dir) 
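+        # the expected values below mirror the make_gerald_config
+        # fixture above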
+ + self.failUnlessEqual(g.version, + '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp') + self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30)) + self.failUnlessEqual(len(g.lanes), len(g.lanes.keys())) + self.failUnlessEqual(len(g.lanes), len(g.lanes.items())) + + + # list of genomes, matches what was defined up in + # make_gerald_config. + # the first None is to offset the genomes list to be 1..9 + # instead of pythons default 0..8 + genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2', + '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ] + + # test lane specific parameters from gerald config file + for i in range(1,9): + cur_lane = g.lanes[str(i)] + self.failUnlessEqual(cur_lane.analysis, 'eland') + self.failUnlessEqual(cur_lane.eland_genome, genomes[i]) + self.failUnlessEqual(cur_lane.read_length, '32') + self.failUnlessEqual(cur_lane.use_bases, 'Y'*32) + + # test data extracted from summary file + clusters = [None, + (17421, 2139), (20311, 2402), (20193, 2399), (15537, 2531), + (32047, 3356), (32946, 4753), (39504, 4171), (37998, 3792)] + + for i in range(1,9): + summary_lane = g.summary[str(i)] + self.failUnlessEqual(summary_lane.cluster, clusters[i]) + self.failUnlessEqual(summary_lane.lane, str(i)) + + xml = g.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + g2 = gerald.Gerald(xml=xml) + + # do it all again after extracting from the xml file + self.failUnlessEqual(g.version, g2.version) + self.failUnlessEqual(g.date, g2.date) + self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys())) + self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items())) + + # test lane specific parameters from gerald config file + for i in range(1,9): + g_lane = g.lanes[str(i)] + g2_lane = g2.lanes[str(i)] + self.failUnlessEqual(g_lane.analysis, g2_lane.analysis) + self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome) + self.failUnlessEqual(g_lane.read_length, g2_lane.read_length) + self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases) + + # test (some) summary elements + for i in range(1,9): + g_summary = g.summary[str(i)] + g2_summary = g2.summary[str(i)] + self.failUnlessEqual(g_summary.cluster, g2_summary.cluster) + self.failUnlessEqual(g_summary.lane, g2_summary.lane) + + g_eland = g.eland_results + g2_eland = g2.eland_results + for lane in g_eland.keys(): + self.failUnlessEqual(g_eland[lane].reads, + g2_eland[lane].reads) + self.failUnlessEqual(len(g_eland[lane].mapped_reads), + len(g2_eland[lane].mapped_reads)) + for k in g_eland[lane].mapped_reads.keys(): + self.failUnlessEqual(g_eland[lane].mapped_reads[k], + g2_eland[lane].mapped_reads[k]) + + self.failUnlessEqual(len(g_eland[lane].match_codes), + len(g2_eland[lane].match_codes)) + for k in g_eland[lane].match_codes.keys(): + self.failUnlessEqual(g_eland[lane].match_codes[k], + g2_eland[lane].match_codes[k]) + + + def test_eland(self): + dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa', + 'chr2L.fa': 'dm3/chr2L.fa', + 'Lambda.fa': 'Lambda.fa'} + genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map, + '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map } + eland = gerald.eland(self.gerald_dir, genome_maps=genome_maps) + + for i in range(1,9): + lane = eland[str(i)] + self.failUnlessEqual(lane.reads, 4) + self.failUnlessEqual(lane.sample_name, "s") + self.failUnlessEqual(lane.lane_id, unicode(i)) + self.failUnlessEqual(len(lane.mapped_reads), 3) + self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1) + 
self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1) + self.failUnlessEqual(lane.match_codes['U1'], 2) + self.failUnlessEqual(lane.match_codes['NM'], 1) + + xml = eland.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + e2 = gerald.ELAND(xml=xml) + + for i in range(1,9): + l1 = eland[str(i)] + l2 = e2[str(i)] + self.failUnlessEqual(l1.reads, l2.reads) + self.failUnlessEqual(l1.sample_name, l2.sample_name) + self.failUnlessEqual(l1.lane_id, l2.lane_id) + self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) + self.failUnlessEqual(len(l1.mapped_reads), 3) + for k in l1.mapped_reads.keys(): + self.failUnlessEqual(l1.mapped_reads[k], + l2.mapped_reads[k]) + + self.failUnlessEqual(len(l1.match_codes), 9) + self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes)) + for k in l1.match_codes.keys(): + self.failUnlessEqual(l1.match_codes[k], + l2.match_codes[k]) + + def test_runfolder(self): + runs = runfolder.get_runs(self.runfolder_dir) + + # do we get the flowcell id from the filename? + self.failUnlessEqual(len(runs), 1) + self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml') + + # do we get the flowcell id from the FlowcellId.xml file + make_flowcell_id(self.runfolder_dir, '207BTAAXY') + runs = runfolder.get_runs(self.runfolder_dir) + self.failUnlessEqual(len(runs), 1) + self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml') + + r1 = runs[0] + xml = r1.get_elements() + xml_str = ElementTree.tostring(xml) + + r2 = runfolder.PipelineRun(xml=xml) + self.failUnlessEqual(r1.name, r2.name) + self.failIfEqual(r2.firecrest, None) + self.failIfEqual(r2.bustard, None) + self.failIfEqual(r2.gerald, None) + + +def suite(): + return unittest.makeSuite(RunfolderTests,'test') + +if __name__ == "__main__": + unittest.main(defaultTest="suite") + diff --git a/htsworkflow/pipeline/test/test_runfolder030.py b/htsworkflow/pipeline/test/test_runfolder030.py new file mode 100644 index 0000000..e5d1ea1 --- /dev/null +++ b/htsworkflow/pipeline/test/test_runfolder030.py @@ -0,0 +1,1024 @@ +#!/usr/bin/env python + +from datetime import datetime, date +import os +import tempfile +import shutil +import unittest + +from htsworkflow.pipeline import firecrest +from htsworkflow.pipeline import bustard +from htsworkflow.pipeline import gerald +from htsworkflow.pipeline import runfolder +from htsworkflow.pipeline.runfolder import ElementTree + + +def make_flowcell_id(runfolder_dir, flowcell_id=None): + if flowcell_id is None: + flowcell_id = '207BTAAXY' + + config = """ + + %s +""" % (flowcell_id,) + config_dir = os.path.join(runfolder_dir, 'Config') + + if not os.path.exists(config_dir): + os.mkdir(config_dir) + pathname = os.path.join(config_dir, 'FlowcellId.xml') + f = open(pathname,'w') + f.write(config) + f.close() + +def make_matrix(matrix_dir): + contents = """# Auto-generated frequency response matrix +> A +> C +> G +> T +0.77 0.15 -0.04 -0.04 +0.76 1.02 -0.05 -0.06 +-0.10 -0.10 1.17 -0.03 +-0.13 -0.12 0.80 1.27 +""" + s_matrix = os.path.join(matrix_dir, 's_matrix.txt') + f = open(s_matrix, 'w') + f.write(contents) + f.close() + +def make_phasing_params(bustard_dir): + for lane in range(1,9): + pathname = os.path.join(bustard_dir, 'params%d.xml' % (lane)) + f = open(pathname, 'w') + f.write(""" + 0.009900 + 0.003500 + +""") + f.close() + +def make_gerald_config(gerald_dir): + config_xml = """ + + default + + + + + Need_to_specify_ELAND_genome_directory + 8 + + domain.com + diane + localhost:25 + 
/home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane + /home/diane/gec + 1 + /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald/../../Genomes + Need_to_specify_genome_file_name + genome + /home/diane/gec/080416_HWI-EAS229_0024_207BTAAXX/Data/C1-33_Firecrest1.8.28_19-04-2008_diane/Bustard1.8.28_19-04-2008_diane/GERALD_19-04-2008_diane + + _prb.txt + 12 + '((CHASTITY>=0.6))' + _qhg.txt + --symbolic + 32 + --scarf + _seq.txt + _sig2.txt + _sig.txt + @(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp + s_[1-8]_[0-9][0-9][0-9][0-9] + s + Sat Apr 19 19:08:30 2008 + /home/diane/proj/SolexaPipeline-0.2.2.6/Goat/../Gerald + all + http://host.domain.com/yourshare/ + + + + eland + eland + eland + eland + eland + eland + eland + eland + + + /g/dm3 + /g/equcab1 + /g/equcab1 + /g/canfam2 + /g/hg18 + /g/hg18 + /g/hg18 + /g/hg18 + + + 32 + 32 + 32 + 32 + 32 + 32 + 32 + 32 + + + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY + + + +""" + pathname = os.path.join(gerald_dir, 'config.xml') + f = open(pathname,'w') + f.write(config_xml) + f.close() + +def make_summary_htm(gerald_dir): + summary_htm=""" + + + + +

+080627_HWI-EAS229_0036_3055HAXX Summary
+
+Summary Information For Experiment 080627_HWI-EAS229_0036_3055HAXX on Machine HWI-EAS229
+
+Chip Summary
+    Machine:    HWI-EAS229
+    Run Folder: 080627_HWI-EAS229_0036_3055HAXX
+    Chip ID:    unknown
+
+Chip Results Summary
+    Clusters    Clusters (PF)   Yield (kbases)
+    80933224    43577803        1133022
+
+Lane Parameter Summary
+    Lane  Sample ID  Sample Target  Sample Type  Length  Filter               Num Tiles  Tiles
+    1     unknown    mm9            ELAND        26      '((CHASTITY>=0.6))'  100        Lane 1
+    2     unknown    mm9            ELAND        26      '((CHASTITY>=0.6))'  100        Lane 2
+    3     unknown    mm9            ELAND        26      '((CHASTITY>=0.6))'  100        Lane 3
+    4     unknown    elegans170     ELAND        26      '((CHASTITY>=0.6))'  100        Lane 4
+    5     unknown    elegans170     ELAND        26      '((CHASTITY>=0.6))'  100        Lane 5
+    6     unknown    elegans170     ELAND        26      '((CHASTITY>=0.6))'  100        Lane 6
+    7     unknown    elegans170     ELAND        26      '((CHASTITY>=0.6))'  100        Lane 7
+    8     unknown    elegans170     ELAND        26      '((CHASTITY>=0.6))'  100        Lane 8
+
+Lane Results Summary (tile mean +/- SD for lane)
+    Lane  Lane Yield (kbases)  Clusters (raw)    Clusters (PF)    1st Cycle Int (PF)  % intensity after 20 cycles (PF)  % PF Clusters   % Align (PF)    Alignment Score (PF)  % Error Rate (PF)
+    1     158046               96483 +/- 9074    60787 +/- 4240   329 +/- 35          101.88 +/- 6.03                   63.21 +/- 3.29  70.33 +/- 0.24  9054.08 +/- 59.16     0.46 +/- 0.18
+    2     156564               133738 +/- 7938   60217 +/- 1926   444 +/- 39          92.62 +/- 7.58                    45.20 +/- 3.31  51.98 +/- 0.74  6692.04 +/- 92.49     0.46 +/- 0.09
+    3     185818               152142 +/- 10002  71468 +/- 2827   366 +/- 36          91.53 +/- 8.66                    47.19 +/- 3.80  82.24 +/- 0.44  10598.68 +/- 64.13    0.41 +/- 0.04
+    4     34953                15784 +/- 2162    13443 +/- 1728   328 +/- 40          97.53 +/- 9.87                    85.29 +/- 1.91  80.02 +/- 0.53  10368.82 +/- 71.08    0.15 +/- 0.05
+    5     167936               119735 +/- 8465   64590 +/- 2529   417 +/- 37          88.69 +/- 14.79                   54.10 +/- 2.59  76.95 +/- 0.32  9936.47 +/- 65.75     0.28 +/- 0.02
+    6     173463               152177 +/- 8146   66716 +/- 2493   372 +/- 39          87.06 +/- 9.86                    43.98 +/- 3.12  78.80 +/- 0.43  10162.28 +/- 49.65    0.38 +/- 0.03
+    7     149287               84649 +/- 7325    57418 +/- 3617   295 +/- 28          89.40 +/- 8.23                    67.97 +/- 1.82  33.38 +/- 0.25  4247.92 +/- 32.37     1.00 +/- 0.03
+    8     106953               54622 +/- 4812    41136 +/- 3309   284 +/- 37          90.21 +/- 9.10                    75.39 +/- 2.27  48.33 +/- 0.29  6169.21 +/- 169.50    0.86 +/- 1.22
+
+    Tile mean across chip
+    Av.                        101166            54472            354                 92.36                             60.29           65.25           8403.69               0.50
+
+Expanded Lane Summary (phasing info, raw and filtered tile means)
+    Lane  Clusters (tile mean) (raw)  % Phasing  % Prephasing  % Error Rate (raw)  Equiv Perfect Clusters (raw)  % retained  Cycle 2-4 Av Int (PF)  Cycle 2-10 Av % Loss (PF)  Cycle 10-20 Av % Loss (PF)  % Align (PF)  % Error Rate (PF)  Equiv Perfect Clusters (PF)
+    1     96483                       0.7700     0.3100        1.00                49676                         63.21       317 +/- 32             0.13 +/- 0.44              -1.14 +/- 0.34              70.33         0.46               41758
+    2     133738                      0.7700     0.3100        1.22                40467                         45.20       415 +/- 33             0.29 +/- 0.40              -0.79 +/- 0.35              51.98         0.46               30615
+    3     152142                      0.7700     0.3100        1.30                78588                         47.19       344 +/- 26             0.68 +/- 0.51              -0.77 +/- 0.42              82.24         0.41               57552
+    4     15784                       0.7700     0.3100        0.29                11095                         85.29       306 +/- 34             0.20 +/- 0.69              -1.28 +/- 0.66              80.02         0.15               10671
+    5     119735                      0.7700     0.3100        0.85                60335                         54.10       380 +/- 32             0.34 +/- 0.49              -1.55 +/- 4.69              76.95         0.28               49015
+    6     152177                      0.7700     0.3100        1.21                70905                         43.98       333 +/- 27             0.57 +/- 0.50              -0.91 +/- 0.39              78.80         0.38               51663
+    7     84649                       0.7700     0.3100        1.38                21069                         67.97       272 +/- 20             1.15 +/- 0.52              -0.84 +/- 0.58              33.38         1.00               18265
+    8     54622                       0.7700     0.3100        1.17                21335                         75.39       262 +/- 31             1.10 +/- 0.59              -1.01 +/- 0.47              48.33         0.86               19104
+
+IVC Plots: IVC.htm
+All Intensity Plots: All.htm
+Error graphs: Error.htm
+
+Per-lane Tile Summary (one tile, 0001, reported per lane; lane 5 has no tile rows)
+    Lane  Tile  Clusters (raw)  Av 1st Cycle Int (PF)  Av % intensity after 20 cycles (PF)  % PF Clusters  % Align (PF)  Av Alignment Score (PF)  % Error Rate (PF)
+    1     0001  114972          326.48                 94.39                                57.44          70.2          9038.6                   0.44
+    2     0001  147793          448.12                 83.68                                38.57          53.7          6905.4                   0.54
+    3     0001  167904          374.05                 86.91                                40.36          81.3          10465.0                  0.47
+    4     0001  20308           276.85                 92.87                                84.26          80.4          10413.8                  0.16
+    6     0001  166844          348.12                 77.59                                38.13          79.7          10264.4                  0.44
+    7     0001  98913           269.90                 86.66                                64.55          33.2          4217.5                   1.02
+    8     0001  64972           243.60                 89.40                                73.17          48.3          6182.8                   0.71
+Back to top + + +""" + pathname = os.path.join(gerald_dir, 'Summary.htm') + f = open(pathname, 'w') + f.write(summary_htm) + f.close() + +def make_eland_results(gerald_dir): + eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D. +>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 chr2L.fa 8796855 R DD 24T +>HWI-EAS229_24_207BTAAXX:1:7:776:582 AGCTCANCCGATCGAAAACCTCNCCAAGCAAT NM 0 0 0 +>HWI-EAS229_24_207BTAAXX:1:7:205:842 AAACAANNCTCCCAAACACGTAAACTGGAAAA U1 0 1 0 Lambda.fa 8796855 R DD 24T +""" + for i in range(1,9): + pathname = os.path.join(gerald_dir, + 's_%d_eland_result.txt' % (i,)) + f = open(pathname, 'w') + f.write(eland_result) + f.close() + +def make_runfolder(obj=None): + """ + Make a fake runfolder, attach all the directories to obj if defined + """ + # make a fake runfolder directory + temp_dir = tempfile.mkdtemp(prefix='tmp_runfolder_') + + runfolder_dir = os.path.join(temp_dir, + '080102_HWI-EAS229_0010_207BTAAXX') + os.mkdir(runfolder_dir) + + data_dir = os.path.join(runfolder_dir, 'Data') + os.mkdir(data_dir) + + firecrest_dir = os.path.join(data_dir, + 'C1-33_Firecrest1.8.28_12-04-2008_diane' + ) + os.mkdir(firecrest_dir) + matrix_dir = os.path.join(firecrest_dir, 'Matrix') + os.mkdir(matrix_dir) + make_matrix(matrix_dir) + + bustard_dir = os.path.join(firecrest_dir, + 'Bustard1.8.28_12-04-2008_diane') + os.mkdir(bustard_dir) + make_phasing_params(bustard_dir) + + gerald_dir = os.path.join(bustard_dir, + 'GERALD_12-04-2008_diane') + os.mkdir(gerald_dir) + make_gerald_config(gerald_dir) + make_summary_htm(gerald_dir) + make_eland_results(gerald_dir) + + if obj is not None: + obj.temp_dir = temp_dir + obj.runfolder_dir = runfolder_dir + obj.data_dir = data_dir + obj.firecrest_dir = firecrest_dir + obj.matrix_dir = matrix_dir + obj.bustard_dir = bustard_dir + obj.gerald_dir = gerald_dir + + +class RunfolderTests(unittest.TestCase): + """ + Test components of the runfolder processing code + which includes firecrest, bustard, and gerald + """ + def setUp(self): + # attaches all the directories to the object passed in + make_runfolder(self) + + def tearDown(self): + shutil.rmtree(self.temp_dir) + + def test_firecrest(self): + """ + Construct a firecrest object + """ + f = firecrest.firecrest(self.firecrest_dir) + self.failUnlessEqual(f.version, '1.8.28') + self.failUnlessEqual(f.start, 1) + self.failUnlessEqual(f.stop, 33) + self.failUnlessEqual(f.user, 'diane') + self.failUnlessEqual(f.date, date(2008,4,12)) + + xml = f.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + + f2 = firecrest.Firecrest(xml=xml) + self.failUnlessEqual(f.version, f2.version) + self.failUnlessEqual(f.start, f2.start) + self.failUnlessEqual(f.stop, f2.stop) + self.failUnlessEqual(f.user, f2.user) + self.failUnlessEqual(f.date, f2.date) + + def test_bustard(self): + """ + construct a bustard object + """ + b = bustard.bustard(self.bustard_dir) + self.failUnlessEqual(b.version, '1.8.28') + self.failUnlessEqual(b.date, date(2008,4,12)) + self.failUnlessEqual(b.user, 'diane') + self.failUnlessEqual(len(b.phasing), 8) + self.failUnlessAlmostEqual(b.phasing[8].phasing, 0.0099) + + xml = b.get_elements() + b2 = bustard.Bustard(xml=xml) + self.failUnlessEqual(b.version, b2.version) + self.failUnlessEqual(b.date, b2.date ) + self.failUnlessEqual(b.user, b2.user) + self.failUnlessEqual(len(b.phasing), len(b2.phasing)) + for key in b.phasing.keys(): + 
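+            # round-trip check: each lane's phasing/prephasing parsed from
+            # params*.xml must survive serialization to XML and back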
self.failUnlessEqual(b.phasing[key].lane, + b2.phasing[key].lane) + self.failUnlessEqual(b.phasing[key].phasing, + b2.phasing[key].phasing) + self.failUnlessEqual(b.phasing[key].prephasing, + b2.phasing[key].prephasing) + + def test_gerald(self): + # need to update gerald and make tests for it + g = gerald.gerald(self.gerald_dir) + + self.failUnlessEqual(g.version, + '@(#) Id: GERALD.pl,v 1.68.2.2 2007/06/13 11:08:49 km Exp') + self.failUnlessEqual(g.date, datetime(2008,4,19,19,8,30)) + self.failUnlessEqual(len(g.lanes), len(g.lanes.keys())) + self.failUnlessEqual(len(g.lanes), len(g.lanes.items())) + + + # list of genomes, matches what was defined up in + # make_gerald_config. + # the first None is to offset the genomes list to be 1..9 + # instead of pythons default 0..8 + genomes = [None, '/g/dm3', '/g/equcab1', '/g/equcab1', '/g/canfam2', + '/g/hg18', '/g/hg18', '/g/hg18', '/g/hg18', ] + + # test lane specific parameters from gerald config file + for i in range(1,9): + cur_lane = g.lanes[str(i)] + self.failUnlessEqual(cur_lane.analysis, 'eland') + self.failUnlessEqual(cur_lane.eland_genome, genomes[i]) + self.failUnlessEqual(cur_lane.read_length, '32') + self.failUnlessEqual(cur_lane.use_bases, 'Y'*32) + + # test data extracted from summary file + clusters = [None, + (96483, 9074), (133738, 7938), + (152142, 10002), (15784, 2162), + (119735, 8465), (152177, 8146), + (84649, 7325), (54622, 4812),] + + for i in range(1,9): + summary_lane = g.summary[str(i)] + self.failUnlessEqual(summary_lane.cluster, clusters[i]) + self.failUnlessEqual(summary_lane.lane, str(i)) + + xml = g.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + g2 = gerald.Gerald(xml=xml) + + # do it all again after extracting from the xml file + self.failUnlessEqual(g.version, g2.version) + self.failUnlessEqual(g.date, g2.date) + self.failUnlessEqual(len(g.lanes.keys()), len(g2.lanes.keys())) + self.failUnlessEqual(len(g.lanes.items()), len(g2.lanes.items())) + + # test lane specific parameters from gerald config file + for i in range(1,9): + g_lane = g.lanes[str(i)] + g2_lane = g2.lanes[str(i)] + self.failUnlessEqual(g_lane.analysis, g2_lane.analysis) + self.failUnlessEqual(g_lane.eland_genome, g2_lane.eland_genome) + self.failUnlessEqual(g_lane.read_length, g2_lane.read_length) + self.failUnlessEqual(g_lane.use_bases, g2_lane.use_bases) + + # test (some) summary elements + for i in range(1,9): + g_summary = g.summary[str(i)] + g2_summary = g2.summary[str(i)] + self.failUnlessEqual(g_summary.cluster, g2_summary.cluster) + self.failUnlessEqual(g_summary.lane, g2_summary.lane) + + g_eland = g.eland_results + g2_eland = g2.eland_results + for lane in g_eland.keys(): + self.failUnlessEqual(g_eland[lane].reads, + g2_eland[lane].reads) + self.failUnlessEqual(len(g_eland[lane].mapped_reads), + len(g2_eland[lane].mapped_reads)) + for k in g_eland[lane].mapped_reads.keys(): + self.failUnlessEqual(g_eland[lane].mapped_reads[k], + g2_eland[lane].mapped_reads[k]) + + self.failUnlessEqual(len(g_eland[lane].match_codes), + len(g2_eland[lane].match_codes)) + for k in g_eland[lane].match_codes.keys(): + self.failUnlessEqual(g_eland[lane].match_codes[k], + g2_eland[lane].match_codes[k]) + + + def test_eland(self): + dm3_map = { 'chrUextra.fa' : 'dm3/chrUextra.fa', + 'chr2L.fa': 'dm3/chr2L.fa', + 'Lambda.fa': 'Lambda.fa'} + genome_maps = { '1':dm3_map, '2':dm3_map, '3':dm3_map, '4':dm3_map, + '5':dm3_map, '6':dm3_map, '7':dm3_map, '8':dm3_map } + eland = 
gerald.eland(self.gerald_dir, genome_maps=genome_maps) + + for i in range(1,9): + lane = eland[str(i)] + self.failUnlessEqual(lane.reads, 4) + self.failUnlessEqual(lane.sample_name, "s") + self.failUnlessEqual(lane.lane_id, unicode(i)) + self.failUnlessEqual(len(lane.mapped_reads), 3) + self.failUnlessEqual(lane.mapped_reads['Lambda.fa'], 1) + self.failUnlessEqual(lane.mapped_reads['dm3/chr2L.fa'], 1) + self.failUnlessEqual(lane.match_codes['U1'], 2) + self.failUnlessEqual(lane.match_codes['NM'], 1) + + xml = eland.get_elements() + # just make sure that element tree can serialize the tree + xml_str = ElementTree.tostring(xml) + e2 = gerald.ELAND(xml=xml) + + for i in range(1,9): + l1 = eland[str(i)] + l2 = e2[str(i)] + self.failUnlessEqual(l1.reads, l2.reads) + self.failUnlessEqual(l1.sample_name, l2.sample_name) + self.failUnlessEqual(l1.lane_id, l2.lane_id) + self.failUnlessEqual(len(l1.mapped_reads), len(l2.mapped_reads)) + self.failUnlessEqual(len(l1.mapped_reads), 3) + for k in l1.mapped_reads.keys(): + self.failUnlessEqual(l1.mapped_reads[k], + l2.mapped_reads[k]) + + self.failUnlessEqual(len(l1.match_codes), 9) + self.failUnlessEqual(len(l1.match_codes), len(l2.match_codes)) + for k in l1.match_codes.keys(): + self.failUnlessEqual(l1.match_codes[k], + l2.match_codes[k]) + + def test_runfolder(self): + runs = runfolder.get_runs(self.runfolder_dir) + + # do we get the flowcell id from the filename? + self.failUnlessEqual(len(runs), 1) + self.failUnlessEqual(runs[0].name, 'run_207BTAAXX_2008-04-19.xml') + + # do we get the flowcell id from the FlowcellId.xml file + make_flowcell_id(self.runfolder_dir, '207BTAAXY') + runs = runfolder.get_runs(self.runfolder_dir) + self.failUnlessEqual(len(runs), 1) + self.failUnlessEqual(runs[0].name, 'run_207BTAAXY_2008-04-19.xml') + + r1 = runs[0] + xml = r1.get_elements() + xml_str = ElementTree.tostring(xml) + + r2 = runfolder.PipelineRun(xml=xml) + self.failUnlessEqual(r1.name, r2.name) + self.failIfEqual(r2.firecrest, None) + self.failIfEqual(r2.bustard, None) + self.failIfEqual(r2.gerald, None) + + +def suite(): + return unittest.makeSuite(RunfolderTests,'test') + +if __name__ == "__main__": + unittest.main(defaultTest="suite") + diff --git a/htsworkflow/util/__init__.py b/htsworkflow/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/htsworkflow/util/alphanum.py b/htsworkflow/util/alphanum.py new file mode 100644 index 0000000..8893bdb --- /dev/null +++ b/htsworkflow/util/alphanum.py @@ -0,0 +1,61 @@ +# +# The Alphanum Algorithm is an improved sorting algorithm for strings +# containing numbers. Instead of sorting numbers in ASCII order like +# a standard sort, this algorithm sorts numbers in numeric order. +# +# The Alphanum Algorithm is discussed at http://www.DaveKoelle.com +# +#* Python implementation provided by Chris Hulan (chris.hulan@gmail.com) +#* Distributed under same license as original +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# + +import re + +# +# TODO: Make decimal points be considered in the same class as digits +# + +def chunkify(str): + """return a list of numbers and non-numeric substrings of +str+ + + the numeric substrings are converted to integer, non-numeric are left as is + """ + chunks = re.findall("(\d+|\D+)",str) + chunks = [re.match('\d',x) and int(x) or x for x in chunks] #convert numeric strings to numbers + return chunks + +def alphanum(a,b): + """breaks +a+ and +b+ into pieces and returns left-to-right comparison of the pieces + + +a+ and +b+ are expected to be strings (for example file names) with numbers and non-numeric characters + Split the values into list of numbers and non numeric sub-strings and so comparison of numbers gives + Numeric sorting, comparison of non-numeric gives Lexicographic order + """ + # split strings into chunks + aChunks = chunkify(a) + bChunks = chunkify(b) + + return cmp(aChunks,bChunks) #built in comparison works once data is prepared + + + +if __name__ == "__main__": + unsorted = ["1000X Radonius Maximus","10X Radonius","200X Radonius","20X Radonius","20X Radonius Prime","30X Radonius","40X Radonius","Allegia 50 Clasteron","Allegia 500 Clasteron","Allegia 51 Clasteron","Allegia 51B Clasteron","Allegia 52 Clasteron","Allegia 60 Clasteron","Alpha 100","Alpha 2","Alpha 200","Alpha 2A","Alpha 2A-8000","Alpha 2A-900","Callisto Morphamax","Callisto Morphamax 500","Callisto Morphamax 5000","Callisto Morphamax 600","Callisto Morphamax 700","Callisto Morphamax 7000","Callisto Morphamax 7000 SE","Callisto Morphamax 7000 SE2","QRS-60 Intrinsia Machine","QRS-60F Intrinsia Machine","QRS-62 Intrinsia Machine","QRS-62F Intrinsia Machine","Xiph Xlater 10000","Xiph Xlater 2000","Xiph Xlater 300","Xiph Xlater 40","Xiph Xlater 5","Xiph Xlater 50","Xiph Xlater 500","Xiph Xlater 5000","Xiph Xlater 58"] + sorted = unsorted[:] + sorted.sort(alphanum) + print '+++++Sorted...++++' + print '\n'.join(sorted) diff --git a/htsworkflow/util/ethelp.py b/htsworkflow/util/ethelp.py new file mode 100644 index 0000000..19f6c9f --- /dev/null +++ b/htsworkflow/util/ethelp.py @@ -0,0 +1,32 @@ +""" +ElementTree helper functions +""" +def indent(elem, level=0): + """ + reformat an element tree to be 'pretty' (indented) + """ + i = "\n" + level*" " + if len(elem): + if not elem.text or not elem.text.strip(): + elem.text = i + " " + for child in elem: + indent(child, level+1) + # we don't want the closing tag indented too far + child.tail = i + if not elem.tail or not elem.tail.strip(): + elem.tail = i + else: + if level and (not elem.tail or not elem.tail.strip()): + elem.tail = i + +def flatten(elem, include_tail=0): + """ + Extract the text from an element tree + (AKA extract the text that not part of XML tags) + """ + text = elem.text or "" + for e in elem: + text += flatten(e, 1) + if include_tail and elem.tail: text += elem.tail + return text + diff --git a/htsworkflow/util/fctracker.py b/htsworkflow/util/fctracker.py new file mode 100644 index 0000000..57b5dcf --- /dev/null +++ b/htsworkflow/util/fctracker.py @@ -0,0 +1,201 @@ +""" +Provide some quick and dirty access and reporting for the fctracker database. + +The advantage to this code is that it doesn't depend on django being +installed, so it can run on machines other than the webserver. 
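+
+A minimal usage sketch (the database path below is a made-up example;
+_get_flowcells is the same quick-and-dirty internal helper that
+htsworkflow.util.makebed.make_description uses):
+
+    from htsworkflow.util.fctracker import fctracker, recoverable_drive_report
+
+    fc = fctracker('fctracker.db')
+    flowcells = fc._get_flowcells("where run_date > '2008-1-1'")
+    print recoverable_drive_report(flowcells)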
+""" +import datetime +import os +import re +import sys +import time + +if sys.version_info[0] + sys.version_info[1] * 0.1 >= 2.5: + # we're python 2.5 + import sqlite3 +else: + import pysqlite2.dbapi2 as sqlite3 + + +class fctracker: + """ + provide a simple way to interact with the flowcell data in fctracker.db + """ + def __init__(self, database): + # default to the current directory + if database is None: + self.database = self._guess_fctracker_path() + else: + self.database = database + self.conn = sqlite3.connect(self.database) + self._get_library() + self._get_species() + + def _guess_fctracker_path(self): + """ + Guess a few obvious places for the database + """ + fctracker = 'fctracker.db' + name = fctracker + # is it in the current dir? + if os.path.exists(name): + return name + name = os.path.expanduser(os.path.join('~', fctracker)) + if os.path.exists(name): + return name + raise RuntimeError("Can't find fctracker") + + def _make_dict_from_table(self, table_name, pkey_name): + """ + Convert a django table into a dictionary indexed by the primary key. + Yes, it really does just load everything into memory, hopefully + we stay under a few tens of thousands of runs for a while. + """ + table = {} + c = self.conn.cursor() + c.execute('select * from %s;' % (table_name)) + # extract just the field name + description = [ f[0] for f in c.description] + for row in c: + row_dict = dict(zip(description, row)) + table[row_dict[pkey_name]] = row_dict + c.close() + return table + + def _add_lanes_to_libraries(self): + """ + add flowcell/lane ids to new attribute 'lanes' in the library dictionary + """ + library_id_re = re.compile('lane_\d_library_id') + + for fc_id, fc in self.flowcells.items(): + lane_library = [ (x[0][5], x[1]) for x in fc.items() + if library_id_re.match(x[0]) ] + for lane, library_id in lane_library: + if not self.library[library_id].has_key('lanes'): + self.library[library_id]['lanes'] = [] + self.library[library_id]['lanes'].append((fc_id, lane)) + + def _get_library(self): + """ + attach the library dictionary to the instance + """ + self.library = self._make_dict_from_table( + 'fctracker_library', + 'library_id') + + + def _get_species(self): + """ + attach the species dictionary to the instance + """ + self.species = self._make_dict_from_table( + 'fctracker_species', + 'id' + ) + + def _get_flowcells(self, where=None): + """ + attach the flowcell dictionary to the instance + + where is a sql where clause. 
(eg "where run_date > '2008-1-1'") + that can be used to limit what flowcells we select + FIXME: please add sanitization code + """ + if where is None: + where = "" + self.flowcells = {} + c = self.conn.cursor() + c.execute('select * from fctracker_flowcell %s;' % (where)) + # extract just the field name + description = [ f[0] for f in c.description ] + for row in c: + row_dict = dict(zip(description, row)) + fcid, status = self._parse_flowcell_id(row_dict) + row_dict['flowcell_id'] = fcid + row_dict['flowcell_status'] = status + + for lane in [ 'lane_%d_library' % (i) for i in range(1,9) ]: + lane_library = self.library[row_dict[lane+"_id"]] + species_id = lane_library['library_species_id'] + lane_library['library_species'] = self.species[species_id] + row_dict[lane] = lane_library + # some useful parsing + run_date = time.strptime(row_dict['run_date'], '%Y-%m-%d %H:%M:%S') + run_date = datetime.datetime(*run_date[:6]) + row_dict['run_date'] = run_date + self.flowcells[row_dict['flowcell_id']] = row_dict + + self._add_lanes_to_libraries() + return self.flowcells + + def _parse_flowcell_id(self, flowcell_row): + """ + Return flowcell id and status + + We stored the status information in the flowcell id name. + this was dumb, but database schemas are hard to update. + """ + fields = flowcell_row['flowcell_id'].split() + fcid = None + status = None + if len(fields) > 0: + fcid = fields[0] + if len(fields) > 1: + status = fields[1] + return fcid, status + + +def flowcell_gone(cell): + """ + Use a variety of heuristics to determine if the flowcell drive + has been deleted. + """ + status = cell['flowcell_status'] + if status is None: + return False + failures = ['failed', 'deleted', 'not run'] + for f in failures: + if re.search(f, status): + return True + else: + return False + +def recoverable_drive_report(flowcells): + """ + Attempt to report what flowcells are still on a hard drive + """ + def format_status(status): + if status is None: + return "" + else: + return status+" " + + # sort flowcells by run date + flowcell_list = [] + for key, cell in flowcells.items(): + flowcell_list.append( (cell['run_date'], key) ) + flowcell_list.sort() + + report = [] + line = "%(date)s %(id)s %(status)s%(lane)s %(library_name)s (%(library_id)s) " + line += "%(species)s" + for run_date, flowcell_id in flowcell_list: + cell = flowcells[flowcell_id] + if flowcell_gone(cell): + continue + for l in range(1,9): + lane = 'lane_%d' % (l) + cell_library = cell['%s_library'%(lane)] + fields = { + 'date': cell['run_date'].strftime('%y-%b-%d'), + 'id': cell['flowcell_id'], + 'lane': l, + 'library_name': cell_library['library_name'], + 'library_id': cell['%s_library_id'%(lane)], + 'species': cell_library['library_species']['scientific_name'], + 'status': format_status(cell['flowcell_status']), + } + report.append(line % (fields)) + return os.linesep.join(report) + diff --git a/htsworkflow/util/makebed.py b/htsworkflow/util/makebed.py new file mode 100755 index 0000000..4f2b17f --- /dev/null +++ b/htsworkflow/util/makebed.py @@ -0,0 +1,142 @@ +""" +Utility functions to make bedfiles. 
+""" +import os +import re + +# map eland_result.txt sense +sense_map = { 'F': '+', 'R': '-'} +sense_color = { 'F': '0,0,255', 'R': '255,255,0' } + +def write_bed_header(outstream, name, description): + """ + Produce the headerline for a bedfile + """ + # provide default track names + if name is None: name = "track" + if description is None: description = "eland result file" + bed_header = 'track name="%s" description="%s" visibility=4 itemRgb="ON"' + bed_header += os.linesep + outstream.write(bed_header % (name, description)) + +def make_bed_from_eland_stream(instream, outstream, name, description, chromosome_prefix='chr'): + """ + read an eland result file from instream and write a bedfile to outstream + """ + # indexes into fields in eland_result.txt file + SEQ = 1 + CHR = 6 + START = 7 + SENSE = 8 + + write_bed_header(outstream, name, description) + + for line in instream: + fields = line.split() + # we need more than the CHR field, and it needs to match a chromosome + if len(fields) <= CHR or \ + (chromosome_prefix is not None and \ + fields[CHR][:3] != chromosome_prefix): + continue + start = fields[START] + stop = int(start) + len(fields[SEQ]) + chromosome, extension = fields[CHR].split('.') + assert extension == "fa" + outstream.write('%s %s %d read 0 %s - - %s%s' % ( + chromosome, + start, + stop, + sense_map[fields[SENSE]], + sense_color[fields[SENSE]], + os.linesep + )) + + +def make_bed_from_multi_eland_stream( + instream, + outstream, + name, + description, + chr_prefix='chr', + max_reads=255 + ): + """ + read a multi eland stream and write a bedfile + """ + write_bed_header(outstream, name, description) + parse_multi_eland(instream, outstream, chr_prefix, max_reads) + +def parse_multi_eland(instream, outstream, chr_prefix, max_reads=255): + + loc_pattern = '(?P(?P[0-9]+)(?P[FR])(?P[0-9]+))' + other_pattern = '(?P[^:,]+)' + split_re = re.compile('(%s|%s)' % (loc_pattern, other_pattern)) + + for line in instream: + rec = line.split() + if len(rec) > 3: + # colony_id = rec[0] + seq = rec[1] + # number of matches for 0, 1, and 2 mismatches + # m0, m1, m2 = [int(x) for x in rec[2].split(':')] + compressed_reads = rec[3] + cur_chr = "" + reads = {0: [], 1: [], 2:[]} + + for token in split_re.finditer(compressed_reads): + if token.group('chr') is not None: + cur_chr = token.group('chr')[:-3] # strip off .fa + elif token.group('fullloc') is not None: + matches = int(token.group('count')) + # only emit a bed line if + # our current chromosome starts with chromosome pattern + if chr_prefix is None or cur_chr.startswith(chr_prefix): + start = int(token.group('start')) + stop = start + len(seq) + orientation = token.group('dir') + strand = sense_map[orientation] + color = sense_color[orientation] + # build up list of reads for this record + reads[matches].append((cur_chr, start, stop, strand, color)) + + # report up to our max_read threshold reporting the fewer-mismatch + # matches first + reported_reads = 0 + keys = [0,1,2] + for mismatch, read_list in ((k, reads[k]) for k in keys): + reported_reads += len(read_list) + if reported_reads <= max_reads: + for cur_chr, start, stop, strand, color in read_list: + reported_reads += 1 + outstream.write('%s %d %d read 0 %s - - %s%s' % ( + cur_chr, + start, + stop, + sense_map[orientation], + sense_color[orientation], + os.linesep + )) + +def make_description(database, flowcell_id, lane): + """ + compute a bedfile name and description from the fctracker database + """ + from htsworkflow.util.fctracker import fctracker + + fc = fctracker(database) 
+ cells = fc._get_flowcells("where flowcell_id='%s'" % (flowcell_id)) + if len(cells) != 1: + raise RuntimeError("couldn't find flowcell id %s" % (flowcell_id)) + lane = int(lane) + if lane < 1 or lane > 8: + raise RuntimeError("flowcells only have lanes 1-8") + + name = "%s-%s" % (flowcell_id, lane) + + cell_id, cell = cells.items()[0] + assert cell_id == flowcell_id + + cell_library_id = cell['lane_%d_library_id' %(lane,)] + cell_library = cell['lane_%d_library' %(lane,)] + description = "%s-%s" % (cell_library['library_name'], cell_library_id) + return name, description diff --git a/htsworkflow/util/mount.py b/htsworkflow/util/mount.py new file mode 100644 index 0000000..75dbe0a --- /dev/null +++ b/htsworkflow/util/mount.py @@ -0,0 +1,65 @@ +""" +Utilities for working with unix-style mounts. +""" +import os +import subprocess + +def list_mount_points(): + """ + Return list of current mount points + + Note: unix-like OS specific + """ + mount_points = [] + likely_locations = ['/sbin/mount', '/bin/mount'] + for mount in likely_locations: + if os.path.exists(mount): + p = subprocess.Popen(mount, stdout=subprocess.PIPE) + p.wait() + for l in p.stdout.readlines(): + rec = l.split() + device = rec[0] + mount_point = rec[2] + assert rec[1] == 'on' + # looking at the output of mount on linux, osx, and + # sunos, the first 3 elements are always the same + # devicename on path + # everything after that displays the attributes + # of the mount points in wildly differing formats + mount_points.append(mount_point) + return mount_points + else: + raise RuntimeError("Couldn't find a mount executable") + +def is_mounted(point_to_check): + """ + Return true if argument exactly matches a current mount point. + """ + for mount_point in list_mount_points(): + if point_to_check == mount_point: + return True + else: + return False + +def find_mount_point_for(pathname): + """ + Find the deepest mount point pathname is located on + """ + realpath = os.path.realpath(pathname) + mount_points = list_mount_points() + + prefixes = set() + for current_mount in mount_points: + cp = os.path.commonprefix([current_mount, realpath]) + prefixes.add((len(cp), cp)) + + prefixes = list(prefixes) + prefixes.sort() + if len(prefixes) == 0: + return None + else: + print prefixes + # return longest common prefix + return prefixes[-1][1] + + diff --git a/htsworkflow/util/opener.py b/htsworkflow/util/opener.py new file mode 100644 index 0000000..035bb24 --- /dev/null +++ b/htsworkflow/util/opener.py @@ -0,0 +1,57 @@ +""" +Helpful utilities for turning random names/objects into streams. +""" +import os +import gzip +import bz2 +import types +import urllib2 + +def isfilelike(file_ref, mode): + """Does file_ref have the core file operations? + """ + # if mode is w/a check to make sure we writeable ops + # but always check to see if we can read + read_operations = ['read', 'readline', 'readlines'] + write_operations = [ 'write', 'writelines' ] + #random_operations = [ 'seek', 'tell' ] + if mode[0] in ('w', 'a'): + for o in write_operations: + if not hasattr(file_ref, o): + return False + for o in read_operations: + if not hasattr(file_ref, o): + return False + + return True + +def isurllike(file_ref, mode): + """ + does file_ref look like a url? + (AKA does it start with protocol:// ?) + """ + #what if mode is 'w'? 
+    parsed = urllib2.urlparse.urlparse(file_ref)
+    schema, netloc, path, params, query, fragment = parsed
+
+    return len(schema) > 0
+
+def autoopen(file_ref, mode='r'):
+    """
+    Attempt to intelligently turn file_ref into a readable stream
+    """
+    # catch being passed a file
+    if type(file_ref) is types.FileType:
+        return file_ref
+    # does it look like a file?
+    elif isfilelike(file_ref, mode):
+        return file_ref
+    elif isurllike(file_ref, mode):
+        return urllib2.urlopen(file_ref)
+    elif os.path.splitext(file_ref)[1] == ".gz":
+        return gzip.open(file_ref, mode)
+    elif os.path.splitext(file_ref)[1] == '.bz2':
+        return bz2.BZ2File(file_ref, mode)
+    else:
+        return open(file_ref,mode)
+
diff --git a/htsworkflow/util/queuecommands.py b/htsworkflow/util/queuecommands.py
new file mode 100644
index 0000000..78728ae
--- /dev/null
+++ b/htsworkflow/util/queuecommands.py
@@ -0,0 +1,87 @@
+"""
+Run up to N simultaneous jobs from a provided list of commands
+"""
+
+import logging
+from subprocess import PIPE
+import subprocess
+import select
+import sys
+import time
+
+class QueueCommands(object):
+    """
+    Queue up N commands from cmd_list, launching more jobs as earlier
+    ones finish.
+    """
+
+    def __init__(self, cmd_list, N=0, cwd=None):
+        """
+        cmd_list is a list of elements suitable for subprocess
+        N is the number of simultaneous processes to run.
+        0 is all of them.
+
+        WARNING: this will not work on windows
+        (It depends on being able to pass local file descriptors to the
+        select call, which isn't supported by the Win32 API)
+        """
+        self.to_run = cmd_list[:]
+        self.running = {}
+        self.N = N
+        self.cwd = cwd
+
+    def under_process_limit(self):
+        """
+        are we still under the total number of allowable jobs?
+        """
+        if self.N == 0:
+            return True
+
+        if len(self.running) < self.N:
+            return True
+
+        return False
+
+    def start_jobs(self):
+        """
+        Launch jobs until we have the maximum allowable running
+        (or have run out of jobs)
+        """
+        queue_log = logging.getLogger('queue')
+        queue_log.info('using %s as cwd' % (self.cwd,))
+
+        while (len(self.to_run) > 0) and self.under_process_limit():
+            queue_log.info('%d left to run', len(self.to_run))
+            cmd = self.to_run.pop(0)
+            p = subprocess.Popen(cmd, stdout=PIPE, cwd=self.cwd, shell=True)
+            self.running[p.stdout] = p
+            queue_log.info("Created process %d from %s" % (p.pid, str(cmd)))
+
+    def run(self):
+        """
+        run up to N jobs until we run out of jobs
+        """
+        queue_log = logging.getLogger('queue')
+
+        # to_run slowly gets consumed by start_jobs
+        while len(self.to_run) > 0 or len(self.running) > 0:
+            # fill any empty spots in our job queue
+            self.start_jobs()
+
+            # build a list of file descriptors
+            # fds = file descriptors
+            fds = [ x.stdout for x in self.running.values()]
+
+            # wait for something to finish
+            # wl = write list, xl = exception list (neither is used here)
+            read_list, wl, xl = select.select(fds, [], fds)
+
+            # for everything that might have finished...
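+            # select() returns the stdout descriptors that are readable
+            # (fresh output or EOF); map each one back to its Popen object
+            # and reap it once poll() reports an exit code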
+            for pending_fd in read_list:
+                pending = self.running[pending_fd]
+                # if it really did finish, remove it from running jobs
+                if pending.poll() is not None:
+                    queue_log.info("Process %d finished [%d]",
+                                   pending.pid, pending.returncode)
+                    del self.running[pending_fd]
+            time.sleep(1)
diff --git a/htsworkflow/util/test/test_ethelp.py b/htsworkflow/util/test/test_ethelp.py
new file mode 100644
index 0000000..63f0ac3
--- /dev/null
+++ b/htsworkflow/util/test/test_ethelp.py
@@ -0,0 +1,35 @@
+import os
+import unittest
+
+try:
+    from xml.etree import ElementTree
+except ImportError, e:
+    from elementtree import ElementTree
+
+from htsworkflow.util.ethelp import indent, flatten
+
+class testETHelper(unittest.TestCase):
+    def setUp(self):
+        self.foo = '<foo><bar>asdf</bar><br/></foo>'
+        self.foo_tree = ElementTree.fromstring(self.foo)
+
+    def test_indent(self):
+        flat_foo = ElementTree.tostring(self.foo_tree)
+        self.failUnlessEqual(len(flat_foo.split('\n')), 1)
+
+        indent(self.foo_tree)
+        pretty_foo = ElementTree.tostring(self.foo_tree)
+        self.failUnlessEqual(len(pretty_foo.split('\n')), 5)
+
+    def test_flatten(self):
+        self.failUnless(flatten(self.foo_tree), 'asdf')
+
+def suite():
+    return unittest.makeSuite(testETHelper, 'test')
+
+if __name__ == "__main__":
+    unittest.main(defaultTest='suite')
+
diff --git a/htsworkflow/util/test/test_makebed.py b/htsworkflow/util/test/test_makebed.py
new file mode 100644
index 0000000..03c7919
--- /dev/null
+++ b/htsworkflow/util/test/test_makebed.py
@@ -0,0 +1,51 @@
+import os
+from StringIO import StringIO
+import unittest
+
+from htsworkflow.util import makebed
+
+class testMakeBed(unittest.TestCase):
+    def test_multi_1_0_0_limit_1(self):
+        instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383 TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0 mm9_chr13_random.fa:1240R0')
+        out = StringIO()
+
+        makebed.parse_multi_eland(instream, out, 'mm9_chr', 1)
+        self.failUnlessEqual(out.getvalue(), 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n')
+
+    def test_multi_1_0_0_limit_255(self):
+        instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:112:383 TCAAATCTTATGCTANGAATCNCAAATTTTCT 1:0:0 mm9_chr13_random.fa:1240R0')
+        out = StringIO()
+
+        makebed.parse_multi_eland(instream, out, 'mm9_chr', 255)
+        self.failUnlessEqual(out.getvalue(), 'mm9_chr13_random 1240 1272 read 0 - - - 255,255,0\n')
+
+    def test_multi_2_0_0_limit_1(self):
+        instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:104:586 GTTCTCGCATAAACTNACTCTNAATAGATTCA 2:0:0 mm9_chr4.fa:42995432F0,mm9_chrX.fa:101541458F0')
+        out = StringIO()
+
+        makebed.parse_multi_eland(instream, out, 'mm9_chr', 1)
+        self.failUnlessEqual(out.len, 0)
+
+    def test_multi_2_0_0_limit_255(self):
+        instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:104:586 GTTCTCGCATAAACTNACTCTNAATAGATTCA 2:0:0 mm9_chr4.fa:42995432F0,mm9_chrX.fa:101541458F0')
+        out = StringIO()
+
+        makebed.parse_multi_eland(instream, out, 'mm9_chr', 255)
+        self.failUnlessEqual(out.len, 98)
+
+    def test_multi_0_2_0_limit_1(self):
+        instream = StringIO('>HWI-EAS229_26_209LVAAXX:7:3:115:495 TCTCCCTGAAAAATANAAGTGNTGTTGGTGAG 0:2:1 mm9_chr14.fa:104434729F2,mm9_chr16.fa:63263818R1,mm9_chr2.fa:52265438R1')
+        out = StringIO()
+
+        makebed.parse_multi_eland(instream, out, 'mm9_chr', 1)
+        print out.getvalue()
+        self.failUnlessEqual(out.len, 0)
+
+def suite():
+    return unittest.makeSuite(testMakeBed, 'test')
+
+if __name__ == "__main__":
+    unittest.main(defaultTest='suite')
+
diff --git a/htsworkflow/util/test/test_queuecommands.py b/htsworkflow/util/test/test_queuecommands.py
new file mode 100644
index 0000000..424da88
--- /dev/null
+++ b/htsworkflow/util/test/test_queuecommands.py
@@ -0,0 +1,58 @@
+import os
+import logging
+import time
+import unittest
+
+from htsworkflow.util.queuecommands import QueueCommands
+
+class testQueueCommands(unittest.TestCase):
+    def setUp(self):
+        logging.basicConfig(level=logging.DEBUG,
+                            format='%(asctime)s %(name)-8s %(message)s')
+
+    def test_unlimited_run(self):
+        """
+        Run everything at once
+        """
+        cmds = ['/bin/sleep 0',
+                '/bin/sleep 1',
+                '/bin/sleep 2',]
+
+        q = QueueCommands(cmds)
+        start = time.time()
+        q.run()
+        end = time.time()-start
+        # we should only take the length of the longest sleep
+        # pity I had to add a 1 second sleep
+        self.failUnless( end > 2.9 and end < 3.1,
+                         "took %s seconds, expected
~3" % (end,)) + + def test_limited_run(self): + """ + Run a limited number of jobs + """ + cmds = ['/bin/sleep 1', + '/bin/sleep 2', + '/bin/sleep 3',] + + q = QueueCommands(cmds, 2) + + start = time.time() + q.run() + end = time.time()-start + # pity I had to add a 1 second sleep + self.failUnless( end > 5.9 and end < 6.1, + "took %s seconds, expected ~6" % (end,)) + +def suite(): + return unittest.makeSuite(testQueueCommands, 'test') + +if __name__ == "__main__": + unittest.main(defaultTest='suite') + + + + diff --git a/scripts/configure_pipeline b/scripts/configure_pipeline index 6c94b91..4b2e3a0 100644 --- a/scripts/configure_pipeline +++ b/scripts/configure_pipeline @@ -2,9 +2,9 @@ import os import sys import re -from gaworkflow.pipeline.configure_run import * -from gaworkflow.pipeline import retrieve_config as _rc -from gaworkflow.pipeline.run_status import startCmdLineStatusMonitor +from htsworkflow.pipeline.configure_run import * +from htsworkflow.pipeline import retrieve_config as _rc +from htsworkflow.pipeline.run_status import startCmdLineStatusMonitor logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)-8s %(message)s', diff --git a/scripts/copier b/scripts/copier index 7025d57..9338b07 100644 --- a/scripts/copier +++ b/scripts/copier @@ -1,6 +1,6 @@ #!/usr/bin/env python import sys -from gaworkflow.automation.copier import main +from htsworkflow.automation.copier import main if __name__ == "__main__": sys.exit(main(sys.argv[1:])) diff --git a/scripts/elandseq b/scripts/elandseq index ece4bf3..787fd98 100755 --- a/scripts/elandseq +++ b/scripts/elandseq @@ -3,7 +3,7 @@ import optparse import os import sys -from gaworkflow.pipeline.gerald import extract_eland_sequence +from htsworkflow.pipeline.gerald import extract_eland_sequence def make_parser(): usage = "usage: %prog [options] infile [outfile]" diff --git a/scripts/gerald2bed.py b/scripts/gerald2bed.py index c49cc9c..7a726e7 100644 --- a/scripts/gerald2bed.py +++ b/scripts/gerald2bed.py @@ -8,7 +8,7 @@ import optparse import sys import os -from gaworkflow.util.makebed import make_bed_from_eland_stream, make_description +from htsworkflow.util.makebed import make_bed_from_eland_stream, make_description def make_bed_for_gerald(eland_dir, output_dir, prefix, database, flowcell): """ diff --git a/scripts/library.py b/scripts/library.py index eeeed90..35532f4 100644 --- a/scripts/library.py +++ b/scripts/library.py @@ -7,7 +7,7 @@ installed, so it can run on machines other than the webserver. 
from optparse import OptionParser import sys -from gaworkflow.util import fctracker +from htsworkflow.util import fctracker def make_parser(): """ diff --git a/scripts/makebed b/scripts/makebed index a4a414b..4878697 100755 --- a/scripts/makebed +++ b/scripts/makebed @@ -3,7 +3,7 @@ import optparse import sys import os -from gaworkflow.util.makebed import make_bed_from_eland_stream, make_bed_from_multi_eland_stream, make_description +from htsworkflow.util.makebed import make_bed_from_eland_stream, make_bed_from_multi_eland_stream, make_description def make_parser(): parser = optparse.OptionParser() diff --git a/scripts/rerun_eland.py b/scripts/rerun_eland.py index c271970..7eb30bb 100644 --- a/scripts/rerun_eland.py +++ b/scripts/rerun_eland.py @@ -6,8 +6,8 @@ import os import subprocess import sys -from gaworkflow.pipeline import gerald -from gaworkflow.pipeline import runfolder +from htsworkflow.pipeline import gerald +from htsworkflow.pipeline import runfolder def make_query_filename(eland_obj, output_dir): query_name = '%s_%s_eland_query.txt' diff --git a/scripts/retrieve_config b/scripts/retrieve_config index b0ddd46..cdd2c5b 100644 --- a/scripts/retrieve_config +++ b/scripts/retrieve_config @@ -1,9 +1,9 @@ #!/usr/bin/env python import sys -from gaworkflow.pipeline.retrieve_config import * -from gaworkflow.pipeline import retrieve_config -from gaworkflow.pipeline.genome_mapper import getAvailableGenomes -from gaworkflow.pipeline.genome_mapper import constructMapperDict +from htsworkflow.pipeline.retrieve_config import * +from htsworkflow.pipeline import retrieve_config +from htsworkflow.pipeline.genome_mapper import getAvailableGenomes +from htsworkflow.pipeline.genome_mapper import constructMapperDict #Turn on built-in command-line parsing. retrieve_config.DISABLE_CMDLINE = False diff --git a/scripts/runfolder b/scripts/runfolder index d54bf60..8b4444a 100644 --- a/scripts/runfolder +++ b/scripts/runfolder @@ -31,8 +31,8 @@ import logging import optparse import sys -from gaworkflow.pipeline import runfolder -from gaworkflow.pipeline.runfolder import ElementTree +from htsworkflow.pipeline import runfolder +from htsworkflow.pipeline.runfolder import ElementTree def make_parser(): usage = 'usage: %prog [options] runfolder_root_dir' diff --git a/scripts/runner b/scripts/runner index 6b6da11..560299f 100644 --- a/scripts/runner +++ b/scripts/runner @@ -1,6 +1,6 @@ #!/usr/bin/env python import sys -from gaworkflow.automation.runner import main +from htsworkflow.automation.runner import main if __name__ == "__main__": sys.exit(main(sys.argv[1:])) diff --git a/scripts/spoolwatcher b/scripts/spoolwatcher index 269292e..b2f833e 100644 --- a/scripts/spoolwatcher +++ b/scripts/spoolwatcher @@ -1,6 +1,6 @@ #!/usr/bin/env python import sys -from gaworkflow.automation.spoolwatcher import main +from htsworkflow.automation.spoolwatcher import main if __name__ == "__main__": sys.exit(main(sys.argv[1:])) diff --git a/scripts/srf b/scripts/srf index f8617ea..0f11514 100644 --- a/scripts/srf +++ b/scripts/srf @@ -7,7 +7,7 @@ import os import subprocess import sys -from gaworkflow.util import queuecommands +from htsworkflow.util import queuecommands def make_commands(run_name, lanes, site_name, destdir): """ diff --git a/setup.py b/setup.py index 2229ed6..0cc5fb5 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,15 @@ from setuptools import setup setup( - name="gaworkflow", + name="htsworkflow", description="some bots and other utilities to help deal with data from an illumina sequencer", author="Diane 
Trout & Brandon King", author_email="diane@caltech.edu", - packages=["gaworkflow", - "gaworkflow.pipeline", - "gaworkflow.frontend", - "gaworkflow.frontend.fctracker", - "gaworkflow.frontend.eland_config" + packages=["htsworkflow", + "htsworkflow.pipeline", + "htsworkflow.frontend", + "htsworkflow.frontend.fctracker", + "htsworkflow.frontend.eland_config" ], scripts=[ 'scripts/configure_pipeline', diff --git a/test/test_copier.py b/test/test_copier.py index 910f3ea..9a1b738 100644 --- a/test/test_copier.py +++ b/test/test_copier.py @@ -1,7 +1,7 @@ import unittest from StringIO import StringIO -from gaworkflow.automation import copier +from htsworkflow.automation import copier class testCopier(unittest.TestCase): def test_runfolder_validate(self):