From 63598431ae8e62ca73fd0774ee79816c98d01fad Mon Sep 17 00:00:00 2001 From: Diane Trout Date: Sat, 26 Sep 2009 00:02:17 +0000 Subject: [PATCH] Implement a client side config file generator. This downloads the flowcell information json block and then creates a gerald config file with it. This version will also look for a "post_run" entry in the htsworkflow.ini config file for a script that should be inserted into the config file to be run when make ends. --- .../frontend/experiments/experiments.py | 7 +- htsworkflow/pipelines/genome_mapper.py | 44 +-- htsworkflow/pipelines/retrieve_config.py | 327 ++++++++++++------ .../pipelines/test/test_retrive_config.py | 43 +++ htsworkflow/util/url.py | 22 ++ scripts/retrieve_config | 29 +- 6 files changed, 320 insertions(+), 152 deletions(-) create mode 100644 htsworkflow/pipelines/test/test_retrive_config.py create mode 100644 htsworkflow/util/url.py diff --git a/htsworkflow/frontend/experiments/experiments.py b/htsworkflow/frontend/experiments/experiments.py index 331bdde..e9a7ab7 100755 --- a/htsworkflow/frontend/experiments/experiments.py +++ b/htsworkflow/frontend/experiments/experiments.py @@ -32,12 +32,15 @@ def flowcell_information(flowcell_id): lane_set[lane.lane_number] = { 'cluster_estimate': lane.cluster_estimate, 'comment': lane.comment, + 'experiment_type': lane.library.experiment_type.name, + 'experiment_type_id': lane.library.experiment_type_id, 'flowcell': lane.flowcell.flowcell_id, 'lane_number': int(lane.lane_number), 'library_name': lane.library.library_name, 'library_id': lane.library.library_id, 'library_species': lane.library.library_species.scientific_name, 'pM': float(lane.pM), + 'read_length': fc.read_length } info = { 'advanced_run': fc.advanced_run, @@ -152,7 +155,6 @@ def generateConfile(request,fcid): try: fc = FlowCell.objects.get(flowcell_id=fcid) for lane in fc.lane_set.all(): - print dir(lane.library.library_species) config += [ str(lane.lane_number) +":" + \ genome_dir + lane.library.library_species.scientific_name ] config += [ str(lane.lane_number) +":" + \ @@ -174,9 +176,6 @@ def getConfile(req): cnfgfile = 'Nothing found' runfolder = 'unknown' request = req.REQUEST - print request, dir(request) - print request['fcid'], request.has_key('fcid') - print request['runf'] if request.has_key('fcid'): fcid = request['fcid'] if request.has_key('runf'): diff --git a/htsworkflow/pipelines/genome_mapper.py b/htsworkflow/pipelines/genome_mapper.py index d29e446..5ae788e 100644 --- a/htsworkflow/pipelines/genome_mapper.py +++ b/htsworkflow/pipelines/genome_mapper.py @@ -74,35 +74,37 @@ class constructMapperDict(object): """ def __init__(self, genome_dict): self.genome_dict = genome_dict - + def __getitem__(self, key): """ Return the best match for key """ elements = re.split("\|", key) - try: - if len(elements) == 1: - # we just the species name - # get the set of builds - builds = self.genome_dict[elements[0]] + if len(elements) == 1: + # we just the species name + # get the set of builds + builds = self.genome_dict[elements[0]] - # sort build names the way humans would - keys = builds.keys() - keys.sort(cmp=alphanum) - - # return the path from the 'last' build name - return builds[keys[-1]] + # sort build names the way humans would + keys = builds.keys() + keys.sort(cmp=alphanum) + + # return the path from the 'last' build name + return builds[keys[-1]] - elif len(elements) == 2: - # we have species, and build name - return self.genome_dict[elements[0]][elements[1]] - else: - raise KeyError("Unrecognized key") - except 
KeyError, e: - logging.error('Unrecognized genome identifier: %s' % str((elements),)) - return "NoGenomeAvailable" - + elif len(elements) == 2: + # we have species, and build name + return self.genome_dict[elements[0]][elements[1]] + else: + raise KeyError("Unrecognized key") + + def get(self, key, default=None): + try: + return self[key] + except KeyError, e: + return default + def keys(self): keys = [] for species in self.genome_dict.keys(): diff --git a/htsworkflow/pipelines/retrieve_config.py b/htsworkflow/pipelines/retrieve_config.py index e9745a3..9d77c35 100644 --- a/htsworkflow/pipelines/retrieve_config.py +++ b/htsworkflow/pipelines/retrieve_config.py @@ -1,24 +1,151 @@ #!/usr/bin/env python -from optparse import OptionParser, IndentedHelpFormatter -from ConfigParser import SafeConfigParser - +from ConfigParser import RawConfigParser import logging +from optparse import OptionParser, IndentedHelpFormatter import os import sys +import urllib import urllib2 +try: + import json +except ImportError, e: + import simplejson as json + +from htsworkflow.frontend.auth import apidata +from htsworkflow.util.url import normalize_url +from htsworkflow.pipelines.genome_mapper import getAvailableGenomes +from htsworkflow.pipelines.genome_mapper import constructMapperDict + __docformat__ = "restructredtext en" -CONFIG_SYSTEM = '/etc/hts_frontend/hts_frontend.conf' -CONFIG_USER = os.path.expanduser('~/.hts_frontend.conf') +CONFIG_SYSTEM = '/etc/htsworkflow.ini' +CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini') +GERALD_CONFIG_SECTION = 'gerald_config' #Disable or enable commandline arg parsing; disabled by default. DISABLE_CMDLINE = True +LANE_LIST = ['1','2','3','4','5','6','7','8'] + class FlowCellNotFound(Exception): pass class WebError404(Exception): pass +def retrieve_flowcell_info(base_host_url, flowcell): + """ + Return a dictionary describing a + """ + url = base_host_url + '/experiments/config/%s/json' % (flowcell) + + try: + apipayload = urllib.urlencode(apidata) + web = urllib2.urlopen(url, apipayload) + except urllib2.URLError, e: + errmsg = 'URLError: %d %s' % (e.code, e.msg) + logging.error(errmsg) + logging.error('opened %s' % (url,)) + raise IOError(errmsg) + + contents = web.read() + headers = web.info() + + if web.getcode() == 403: + msg = "403 - Forbbidden, probably need api key" + raise FlowCellNotFound(msg) + + if web.getcode() == 404: + msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \ + "Did you get right port #?" % (flowcell, base_host_url, url) + raise FlowCellNotFound(msg) + + if len(contents) == 0: + msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url) + raise FlowCellNotFound(msg) + + data = json.loads(contents) + return data + +def is_sequencing(lane_info): + """ + Determine if we are just sequencing and not doing any follow-up analysis + """ + if lane_info['experiment_type'] in ('De Novo','Whole Genome'): + return True + else: + return False + +def group_lane_parameters(flowcell_info): + """ + goup lanes that can share GERALD configuration blocks. 
+ + (The same species, read length, and eland vs sequencing) + """ + lane_groups = {} + for lane_number, lane_info in flowcell_info['lane_set'].items(): + index = (lane_info['read_length'], + lane_info['library_species'], + is_sequencing(lane_info)) + lane_groups.setdefault(index, []).append(lane_number) + return lane_groups + +def format_gerald_header(flowcell_info): + """ + Generate comment describing the contents of the flowcell + """ + # I'm using '\n# ' to join the lines together, that doesn't include the + # first element so i needed to put the # in manually + config = ['# FLOWCELL: %s' % (flowcell_info['flowcell_id'])] + config += [''] + config += ['CONTROL-LANE: %s' % (flowcell_info['control_lane'],)] + config += [''] + config += ['Flowcell Notes:'] + config.extend(flowcell_info['notes'].split('\r\n')) + config += [''] + for lane_number in LANE_LIST: + lane_info = flowcell_info['lane_set'][lane_number] + config += ['Lane%s: %s | %s' % (lane_number, lane_info['library_id'], + lane_info['library_name'])] + config += [''] + return "\n# ".join(config) + +def format_gerald_config(options, flowcell_info, genome_map): + """ + Generate a GERALD config file + """ + # so we can add nothing or _pair if we're a paired end run + run_type_suffix = { False: "", True: "_pair" } + + # it's convienent to have helpful information describing the flowcell + # in the config file... things like which lane is which library. + config = [format_gerald_header(flowcell_info)] + + analysis_suffix = run_type_suffix[flowcell_info['paired_end']] + lane_groups = group_lane_parameters(flowcell_info) + for lane_index, lane_numbers in lane_groups.items(): + # lane_index is return value of group_lane_parameters + read_length, species, is_sequencing = lane_index + lane_numbers.sort() + lane_prefix = u"".join(lane_numbers) + + if not is_sequencing: + config += ['%s:ANALYSIS eland%s' % (lane_prefix, analysis_suffix)] + else: + config += ['%s:ANALYSIS sequence%s' % (lane_prefix, analysis_suffix)] + #config += ['%s:READ_LENGTH %s' % ( lane_prefix, read_length ) ] + config += ['%s:USE_BASES Y%s' % ( lane_prefix, read_length ) ] + species_path = genome_map.get(species, "Unknown") + config += ['%s:ELAND_GENOME %s' % (lane_prefix, species_path) ] + + # add in option for running script after + if options.post_run is not None: + post_run = options.post_run % {'runfolder': options.runfolder} + config += ['POST_RUN_COMMAND %s' % (post_run,) ] + + config += [''] # force trailing newline + + return "\n".join(config) + class DummyOptions: """ Used when command line parsing is disabled; default @@ -50,19 +177,14 @@ class PreformattedDescriptionFormatter(IndentedHelpFormatter): def constructOptionParser(): - """ - returns a pre-setup optparser - """ - global DISABLE_CMDLINE - - if DISABLE_CMDLINE: - return None - - parser = OptionParser(formatter=PreformattedDescriptionFormatter()) + """ + returns a pre-setup optparser + """ + parser = OptionParser(formatter=PreformattedDescriptionFormatter()) - parser.set_description('Retrieves eland config file from hts_frontend web frontend.') + parser.set_description('Retrieves eland config file from hts_frontend web frontend.') - parser.epilog = """ + parser.epilog = """ Config File: * %s (System wide) * %s (User specific; overrides system) @@ -70,116 +192,111 @@ Config File: Example Config File: - [config_file_server] - base_host_url=http://somewhere.domain:port -""" % (CONFIG_SYSTEM, CONFIG_USER) + [%s] + config_host=http://somewhere.domain:port + genome_dir=/path to search for genomes + +""" % 
(CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION) - #Special formatter for allowing preformatted description. - ##parser.format_epilog(PreformattedDescriptionFormatter()) + #Special formatter for allowing preformatted description. + ##parser.format_epilog(PreformattedDescriptionFormatter()) - parser.add_option("-u", "--url", - action="store", type="string", dest="url") + parser.add_option("-u", "--url", + action="store", type="string", dest="url") - parser.add_option("-o", "--output", - action="store", type="string", dest="output_filepath") + parser.add_option("-o", "--output-file", + action="store", type="string", dest="output_filepath", + help="config file destination. If runfolder is specified defaults " + "to /config-auto.txt" ) - parser.add_option("-f", "--flowcell", - action="store", type="string", dest="flowcell") + parser.add_option("-f", "--flowcell", + action="store", type="string", dest="flowcell") - parser.add_option("-g", "--genome_dir", - action="store", type="string", dest="genome_dir") - - #parser.set_default("url", "default") - - return parser + parser.add_option("-g", "--genome_dir", + action="store", type="string", dest="genome_dir") + parser.add_option("-r", "--runfolder", + action="store", type="string", + help="specify runfolder for post_run command ") + + return parser + def constructConfigParser(): - """ - returns a pre-setup config parser - """ - parser = SafeConfigParser() - parser.read([CONFIG_SYSTEM, CONFIG_USER]) - if not parser.has_section('config_file_server'): - parser.add_section('config_file_server') - if not parser.has_section('local_setup'): - parser.add_section('local_setup') + """ + returns a pre-setup config parser + """ + parser = RawConfigParser() + parser.read([CONFIG_SYSTEM, CONFIG_USER]) + if not parser.has_section(GERALD_CONFIG_SECTION): + parser.add_section(GERALD_CONFIG_SECTION) - return parser + return parser -def getCombinedOptions(): - """ - Returns optparse options after it has be updated with ConfigParser - config files and merged with parsed commandline options. - """ - cl_parser = constructOptionParser() - conf_parser = constructConfigParser() - - if cl_parser is None: - options = DummyOptions() - else: - options, args = cl_parser.parse_args() - - if options.url is None: - if conf_parser.has_option('config_file_server', 'base_host_url'): - options.url = conf_parser.get('config_file_server', 'base_host_url') +def getCombinedOptions(argv=None): + """ + Returns optparse options after it has be updated with ConfigParser + config files and merged with parsed commandline options. 
- if options.genome_dir is None: - if conf_parser.has_option('local_setup', 'genome_dir'): - options.genome_dir = conf_parser.get('local_setup', 'genome_dir') - - logging.info('USING OPTIONS:') - logging.info(' URL: %s' % (options.url,)) - logging.info(' OUT: %s' % (options.output_filepath,)) - logging.info(' FC: %s' % (options.flowcell,)) - logging.info('GDIR: %s' % (options.genome_dir,)) + expects command line arguments to be passed in + """ + cl_parser = constructOptionParser() + conf_parser = constructConfigParser() + + if argv is None: + options = DummyOptions() + else: + options, args = cl_parser.parse_args(argv) + + if options.url is None: + if conf_parser.has_option(GERALD_CONFIG_SECTION, 'config_host'): + options.url = conf_parser.get(GERALD_CONFIG_SECTION, 'config_host') + + options.url = normalize_url(options.url) - return options + if options.genome_dir is None: + if conf_parser.has_option(GERALD_CONFIG_SECTION, 'genome_dir'): + options.genome_dir = conf_parser.get(GERALD_CONFIG_SECTION, 'genome_dir') + + if conf_parser.has_option(GERALD_CONFIG_SECTION, 'post_run'): + options.post_run = conf_parser.get(GERALD_CONFIG_SECTION, 'post_run') + else: + options.post_run = None + + if options.output_filepath is None: + if options.runfolder is not None: + options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt') + + logging.info('USING OPTIONS:') + logging.info(u' URL: %s' % (options.url,)) + logging.info(u' OUT: %s' % (options.output_filepath,)) + logging.info(u' FC: %s' % (options.flowcell,)) + #logging.info(': %s' % (options.genome_dir,)) + logging.info(u'post_run: %s' % ( unicode(options.post_run),)) + + return options -def saveConfigFile(flowcell, base_host_url, output_filepath): +def saveConfigFile(options): """ retrieves the flowcell eland config file, give the base_host_url (i.e. http://sub.domain.edu:port) """ - url = base_host_url + '/eland_config/%s/' % (flowcell) - - f = open(output_filepath, 'w') - #try: - try: - web = urllib2.urlopen(url) - except urllib2.URLError, e: - errmsg = 'URLError: %d %s' % (e.code, e.msg) - logging.error(errmsg) - logging.error('opened %s' % (url,)) - raise IOError(errmsg) - - #except IOError, msg: - # if str(msg).find("Connection refused") >= 0: - # print 'Error: Connection refused for: %s' % (url) - # f.close() - # sys.exit(1) - # elif str(msg).find("Name or service not known") >= 0: - # print 'Error: Invalid domain or ip address for: %s' % (url) - # f.close() - # sys.exit(2) - # else: - # raise IOError, msg + flowcell_info = retrieve_flowcell_info(options.url, options.flowcell) - data = web.read() + available_genomes = getAvailableGenomes(options.genome_dir) + genome_map = constructMapperDict(available_genomes) - if data.find('Hmm, config file for') >= 0: - msg = "Flowcell (%s) not found in DB; full url(%s)" % (flowcell, url) - raise FlowCellNotFound, msg + config = format_gerald_config(options, flowcell_info, genome_map) - if data.find('404 - Not Found') >= 0: - msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \ - "Did you get right port #?" 
% (flowcell, base_host_url, url) - raise FlowCellNotFound, msg + if options.output_filepath is not None: + outstream = open(options.output_filepath, 'w') + logging.info('Writing config file to %s' % (options.output_filepath,)) + else: + outstream = sys.stdout + + outstream.write(config) - f.write(data) - web.close() - f.close() - logging.info('Wrote config file to %s' % (output_filepath,)) + diff --git a/htsworkflow/pipelines/test/test_retrive_config.py b/htsworkflow/pipelines/test/test_retrive_config.py new file mode 100644 index 0000000..dd8f30e --- /dev/null +++ b/htsworkflow/pipelines/test/test_retrive_config.py @@ -0,0 +1,43 @@ +import os +import re + +try: + import json +except ImportError, e: + import simplejson as json + +from django.test import TestCase + +from htsworkflow.frontend.auth import apidata +from htsworkflow.pipelines.retrieve_config import format_gerald_config, getCombinedOptions + +class RetrieveTestCases(TestCase): + fixtures = ['test_flowcells.json'] + + def setUp(self): + pass + + def test_format_gerald(self): + flowcell_request = self.client.get('/experiments/config/303TUAAXX/json', apidata) + self.failUnlessEqual(flowcell_request.status_code, 200) + + print dir(flowcell_request) + flowcell_info = json.loads(flowcell_request.content) + + options = getCombinedOptions(['-f','303TUAAXX','-g',os.getcwd()]) + genome_map = {u'Homo sapiens': '/tmp/hg18' } + + config = format_gerald_config(options, flowcell_info, genome_map) + config_lines = config.split('\n') + lane3 = [ line for line in config_lines if re.search('Lane3', line) ] + self.failUnlessEqual(len(lane3), 1) + self.failUnlessEqual(lane3[0], '# Lane3: SL039 | Paired ends 99 GM12892') + human = [ line for line in config_lines if re.search('hg18', line) ] + self.failUnlessEqual(len(human), 1) + self.failUnlessEqual(human[0], '345678:ELAND_GENOME /tmp/hg18') + unknown = [ line for line in config_lines if re.search('Unknown', line) ] + self.failUnlessEqual(len(unknown), 2) + + + + diff --git a/htsworkflow/util/url.py b/htsworkflow/util/url.py new file mode 100644 index 0000000..1b62e58 --- /dev/null +++ b/htsworkflow/util/url.py @@ -0,0 +1,22 @@ +""" +Utilities to help handle urls +""" + +def normalize_url(url, scheme='http'): + """ + Make sure there is a http at the head of what should be a url + + >>> normalize_url("google.com") + 'http://google.com' + >>> normalize_url("http://google.com") + 'http://google.com' + >>> normalize_url("foo.com/a/b/c/d/e/f.html") + 'http://foo.com/a/b/c/d/e/f.html' + >>> normalize_url("foo.com", "https") + 'https://foo.com' + """ + scheme_sep = '://' + if url.find(scheme_sep) != -1: + return url + else: + return scheme + scheme_sep + url diff --git a/scripts/retrieve_config b/scripts/retrieve_config index c765da9..9e8ccf5 100644 --- a/scripts/retrieve_config +++ b/scripts/retrieve_config @@ -3,22 +3,18 @@ import logging import sys from htsworkflow.pipelines.retrieve_config import * from htsworkflow.pipelines import retrieve_config -from htsworkflow.pipelines.genome_mapper import getAvailableGenomes -from htsworkflow.pipelines.genome_mapper import constructMapperDict #Turn on built-in command-line parsing. 
retrieve_config.DISABLE_CMDLINE = False -def main(args=None): - #Display help if no args are presented - if len(sys.argv) == 1: - sys.argv.append('-h') +def main(argv=None): + if argv is None: + argv = sys.argv - options = getCombinedOptions() + #Display help if no args are presented + options = getCombinedOptions(argv) + msg_list = ['ERROR MESSAGES:'] - if options.output_filepath is None: - msg_list.append(" Output filepath argument required. -o or --output=") - if options.flowcell is None: msg_list.append(" Flow cell argument required. -f or --flowcell=") @@ -34,18 +30,7 @@ def main(args=None): print '\n'.join(msg_list) return 1 - saveConfigFile(options.flowcell, options.url, options.output_filepath) - - f = open(options.output_filepath, 'r') - data = f.read() - f.close() - - genome_dict = getAvailableGenomes(options.genome_dir) - mapper_dict = constructMapperDict(genome_dict) - - f = open(options.output_filepath, 'w') - f.write(data % (mapper_dict)) - f.close() + saveConfigFile(options) return 0 -- 2.30.2
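
getCombinedOptions() now reads a [gerald_config] section from /etc/htsworkflow.ini and ~/.htsworkflow.ini instead of the old hts_frontend config files, and the switch from SafeConfigParser to RawConfigParser means the %(runfolder)s placeholder in post_run is not expanded by ConfigParser at read time; it is substituted later in format_gerald_config() from the --runfolder option. A minimal sketch of such a file — the host, genome directory, and post_run script below are made up, only the section and key names come from the patch:

    [gerald_config]
    # base URL of the htsworkflow frontend serving the flowcell JSON
    config_host=http://htsw.example.edu:8000
    # directory searched by getAvailableGenomes() for genome builds
    genome_dir=/data/genomes
    # appended to the GERALD config as POST_RUN_COMMAND;
    # %(runfolder)s is filled in from the --runfolder option
    post_run=/usr/local/bin/copy_results.sh %(runfolder)s

Values given on the command line (-u for the url, -g for the genome directory) take precedence over the ini settings; post_run has no command-line equivalent.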
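
With those settings in place, scripts/retrieve_config only needs the flowcell id, plus a runfolder if the post_run command or the default output location is wanted. A hypothetical invocation — the flowcell id is borrowed from the test fixture, the runfolder path is made up:

    retrieve_config -f 303TUAAXX -r /data/runfolders/090926_EXAMPLE

This writes /data/runfolders/090926_EXAMPLE/config-auto.txt; without -r or -o the generated config goes to stdout.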
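
retrieve_flowcell_info() posts the shared apidata credentials to /experiments/config/<flowcell>/json and decodes the JSON response. The generator only touches a handful of the returned keys; a trimmed sketch of the decoded structure, with made-up values (the real payload also carries per-lane fields such as cluster_estimate, pM, and comment):

    flowcell_info = {
        'flowcell_id': '42XYZAAXX',
        'paired_end': False,
        'control_lane': 5,
        'notes': 'library prep batch 12',
        'lane_set': {
            '1': {'library_id': 'SL101',
                  'library_name': 'Example ChIP library 1',
                  'library_species': 'Homo sapiens',
                  'experiment_type': 'ChIP-seq',
                  'read_length': 37},
            # lanes '2' through '8' have the same shape
        },
    }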
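
The genome lookup goes through constructMapperDict, which now raises KeyError for unknown species and gains a dict-style get() so format_gerald_config() can fall back to "Unknown". A small sketch of how it resolves names, assuming a hypothetical genome_dict of the shape getAvailableGenomes() builds (species name -> {build name -> path}):

    from htsworkflow.pipelines.genome_mapper import constructMapperDict

    # hypothetical contents; paths and builds are made up
    genome_dict = {
        'Homo sapiens': {'hg17': '/data/genomes/hg17',
                         'hg18': '/data/genomes/hg18'},
        'Mus musculus': {'mm9': '/data/genomes/mm9'},
    }
    genome_map = constructMapperDict(genome_dict)

    print genome_map['Homo sapiens']        # newest-sorting build: /data/genomes/hg18
    print genome_map['Homo sapiens|hg17']   # explicit build: /data/genomes/hg17
    print genome_map.get('Danio rerio', 'Unknown')  # missing species -> 'Unknown'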
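
Putting it together, format_gerald_config() groups lanes that share a read length, species, and analysis type, writes a commented header describing the flowcell, then one ANALYSIS/USE_BASES/ELAND_GENOME block per group, followed by POST_RUN_COMMAND when post_run is configured. For a hypothetical 8-lane, 37-cycle single-read flowcell — four Homo sapiens ChIP lanes plus four "De Novo" lanes of a species with no locally installed genome — the output would look roughly like this (flowcell id, library ids and names, and paths are all invented; group order is not guaranteed):

    # FLOWCELL: 42XYZAAXX
    # 
    # CONTROL-LANE: 5
    # 
    # Flowcell Notes:
    # library prep batch 12
    # 
    # Lane1: SL101 | Example ChIP library 1
    # Lane2: SL102 | Example ChIP library 2
    # Lane3: SL103 | Example ChIP library 3
    # Lane4: SL104 | Example ChIP library 4
    # Lane5: SL105 | Example de novo library 1
    # Lane6: SL106 | Example de novo library 2
    # Lane7: SL107 | Example de novo library 3
    # Lane8: SL108 | Example de novo library 4
    # 
    1234:ANALYSIS eland
    1234:USE_BASES Y37
    1234:ELAND_GENOME /data/genomes/hg18
    5678:ANALYSIS sequence
    5678:USE_BASES Y37
    5678:ELAND_GENOME Unknown
    POST_RUN_COMMAND /usr/local/bin/copy_results.sh /data/runfolders/090926_EXAMPLE

On a paired-end flowcell the analysis lines become eland_pair and sequence_pair instead.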