X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=blobdiff_plain;f=htsworkflow%2Fpipelines%2Fretrieve_config.py;h=3a1a56aeaf2188350db8b980d7b098bf0098a0ba;hp=e9745a377fe2bed0c9a14789e630409b799e95c3;hb=86a66805c52c4bafb5d878d2b22604b9b9309cc8;hpb=96833cdd527fe260ed6f9aee22c56ed973c5ef3c diff --git a/htsworkflow/pipelines/retrieve_config.py b/htsworkflow/pipelines/retrieve_config.py index e9745a3..3a1a56a 100644 --- a/htsworkflow/pipelines/retrieve_config.py +++ b/htsworkflow/pipelines/retrieve_config.py @@ -1,24 +1,166 @@ #!/usr/bin/env python -from optparse import OptionParser, IndentedHelpFormatter -from ConfigParser import SafeConfigParser - +from ConfigParser import RawConfigParser import logging +from optparse import OptionParser, IndentedHelpFormatter import os import sys +import urllib import urllib2 +try: + import json +except ImportError, e: + import simplejson as json + +from htsworkflow.frontend.auth import apidata +from htsworkflow.util import api +from htsworkflow.util.url import normalize_url +from htsworkflow.pipelines.genome_mapper import getAvailableGenomes +from htsworkflow.pipelines.genome_mapper import constructMapperDict + __docformat__ = "restructredtext en" -CONFIG_SYSTEM = '/etc/hts_frontend/hts_frontend.conf' -CONFIG_USER = os.path.expanduser('~/.hts_frontend.conf') +CONFIG_SYSTEM = '/etc/htsworkflow.ini' +CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini') +GERALD_CONFIG_SECTION = 'gerald_config' #Disable or enable commandline arg parsing; disabled by default. DISABLE_CMDLINE = True +LANE_LIST = ['1','2','3','4','5','6','7','8'] + class FlowCellNotFound(Exception): pass class WebError404(Exception): pass +def retrieve_flowcell_info(base_host_url, flowcell): + """ + Return a dictionary describing a + """ + url = api.flowcell_url(base_host_url, flowcell) + + try: + apipayload = urllib.urlencode(apidata) + web = urllib2.urlopen(url, apipayload) + except urllib2.URLError, e: + errmsg = 'URLError: %d %s' % (e.code, e.msg) + logging.error(errmsg) + logging.error('opened %s' % (url,)) + raise IOError(errmsg) + + contents = web.read() + headers = web.info() + + if web.code == 403: + msg = "403 - Forbbidden, probably need api key" + raise FlowCellNotFound(msg) + + if web.code == 404: + msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \ + "Did you get right port #?" % (flowcell, base_host_url, url) + raise FlowCellNotFound(msg) + + if len(contents) == 0: + msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url) + raise FlowCellNotFound(msg) + + data = json.loads(contents) + return data + +def is_sequencing(lane_info): + """ + Determine if we are just sequencing and not doing any follow-up analysis + """ + if lane_info['experiment_type'] in ('De Novo','Whole Genome'): + return True + else: + return False + +def group_lane_parameters(flowcell_info): + """ + goup lanes that can share GERALD configuration blocks. + + (The same species, read length, and eland vs sequencing) + """ + lane_groups = {} + for lane_number, lane_info in flowcell_info['lane_set'].items(): + index = (lane_info['read_length'], + lane_info['library_species'], + is_sequencing(lane_info)) + lane_groups.setdefault(index, []).append(lane_number) + return lane_groups + +def format_gerald_header(flowcell_info): + """ + Generate comment describing the contents of the flowcell + """ + # I'm using '\n# ' to join the lines together, that doesn't include the + # first element so i needed to put the # in manually + config = ['# FLOWCELL: %s' % (flowcell_info['flowcell_id'])] + config += [''] + config += ['CONTROL-LANE: %s' % (flowcell_info['control_lane'],)] + config += [''] + config += ['Flowcell Notes:'] + config.extend(flowcell_info['notes'].split('\r\n')) + config += [''] + for lane_number in LANE_LIST: + lane_info = flowcell_info['lane_set'][lane_number] + config += ['Lane%s: %s | %s' % (lane_number, lane_info['library_id'], + lane_info['library_name'])] + + config += [''] + return "\n# ".join(config) + +def format_gerald_config(options, flowcell_info, genome_map): + """ + Generate a GERALD config file + """ + # so we can add nothing or _pair if we're a paired end run + eland_analysis_suffix = { False: "_extended", True: "_pair" } + sequence_analysis_suffix = { False: "", True: "_pair" } + + # it's convienent to have helpful information describing the flowcell + # in the config file... things like which lane is which library. + config = [format_gerald_header(flowcell_info)] + + config += ['SEQUENCE_FORMAT --fastq'] + config += ['ELAND_SET_SIZE 20'] + config += ['WITH_SEQUENCE TRUE'] + config += ['12345678:WITH_SEQUENCE TRUE'] + analysis_suffix = eland_analysis_suffix[flowcell_info['paired_end']] + sequence_suffix = sequence_analysis_suffix[flowcell_info['paired_end']] + lane_groups = group_lane_parameters(flowcell_info) + for lane_index, lane_numbers in lane_groups.items(): + # lane_index is return value of group_lane_parameters + read_length, species, is_sequencing = lane_index + lane_numbers.sort() + lane_prefix = u"".join(lane_numbers) + + species_path = genome_map.get(species, None) + logging.debug("Looked for genome '%s' got location '%s'" % (species, species_path)) + if species_path is None: + no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s" + logging.warning(no_genome_msg % (lane_numbers, species)) + is_sequencing = True + + if is_sequencing: + config += ['%s:ANALYSIS sequence%s' % (lane_prefix, sequence_suffix)] + else: + config += ['%s:ANALYSIS eland%s' % (lane_prefix, analysis_suffix)] + config += ['%s:ELAND_GENOME %s' % (lane_prefix, species_path) ] + #config += ['%s:READ_LENGTH %s' % ( lane_prefix, read_length ) ] + config += ['%s:USE_BASES Y%s' % ( lane_prefix, read_length ) ] + + # add in option for running script after + if not (options.post_run is None or options.runfolder is None): + runfolder = os.path.abspath(options.runfolder) + post_run = options.post_run % {'runfolder': runfolder} + config += ['POST_RUN_COMMAND %s' % (post_run,) ] + + config += [''] # force trailing newline + + return "\n".join(config) + class DummyOptions: """ Used when command line parsing is disabled; default @@ -50,19 +192,14 @@ class PreformattedDescriptionFormatter(IndentedHelpFormatter): def constructOptionParser(): - """ - returns a pre-setup optparser - """ - global DISABLE_CMDLINE - - if DISABLE_CMDLINE: - return None - - parser = OptionParser(formatter=PreformattedDescriptionFormatter()) + """ + returns a pre-setup optparser + """ + parser = OptionParser(formatter=PreformattedDescriptionFormatter()) - parser.set_description('Retrieves eland config file from hts_frontend web frontend.') + parser.set_description('Retrieves eland config file from hts_frontend web frontend.') - parser.epilog = """ + parser.epilog = """ Config File: * %s (System wide) * %s (User specific; overrides system) @@ -70,116 +207,116 @@ Config File: Example Config File: - [config_file_server] - base_host_url=http://somewhere.domain:port -""" % (CONFIG_SYSTEM, CONFIG_USER) + [%s] + config_host: http://somewhere.domain:port + genome_dir: /path to search for genomes + post_run: runfolder -o %%(runfolder)s + +""" % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION) - #Special formatter for allowing preformatted description. - ##parser.format_epilog(PreformattedDescriptionFormatter()) + #Special formatter for allowing preformatted description. + ##parser.format_epilog(PreformattedDescriptionFormatter()) - parser.add_option("-u", "--url", - action="store", type="string", dest="url") + parser.add_option("-u", "--url", + action="store", type="string", dest="url") - parser.add_option("-o", "--output", - action="store", type="string", dest="output_filepath") + parser.add_option("-o", "--output-file", + action="store", type="string", dest="output_filepath", + help="config file destination. If runfolder is specified defaults " + "to /config-auto.txt" ) - parser.add_option("-f", "--flowcell", - action="store", type="string", dest="flowcell") + parser.add_option("-f", "--flowcell", + action="store", type="string", dest="flowcell") - parser.add_option("-g", "--genome_dir", - action="store", type="string", dest="genome_dir") - - #parser.set_default("url", "default") - - return parser + parser.add_option("-g", "--genome_dir", + action="store", type="string", dest="genome_dir") + + parser.add_option("-r", "--runfolder", + action="store", type="string", + help="specify runfolder for post_run command ") + parser.add_option('-v', '--verbose', action='store_true', default=False, + help='increase logging verbosity') + return parser + def constructConfigParser(): - """ - returns a pre-setup config parser - """ - parser = SafeConfigParser() - parser.read([CONFIG_SYSTEM, CONFIG_USER]) - if not parser.has_section('config_file_server'): - parser.add_section('config_file_server') - if not parser.has_section('local_setup'): - parser.add_section('local_setup') + """ + returns a pre-setup config parser + """ + parser = RawConfigParser() + parser.read([CONFIG_SYSTEM, CONFIG_USER]) + if not parser.has_section(GERALD_CONFIG_SECTION): + parser.add_section(GERALD_CONFIG_SECTION) - return parser + return parser -def getCombinedOptions(): - """ - Returns optparse options after it has be updated with ConfigParser - config files and merged with parsed commandline options. - """ - cl_parser = constructOptionParser() - conf_parser = constructConfigParser() - - if cl_parser is None: - options = DummyOptions() - else: - options, args = cl_parser.parse_args() - - if options.url is None: - if conf_parser.has_option('config_file_server', 'base_host_url'): - options.url = conf_parser.get('config_file_server', 'base_host_url') +def getCombinedOptions(argv=None): + """ + Returns optparse options after it has be updated with ConfigParser + config files and merged with parsed commandline options. - if options.genome_dir is None: - if conf_parser.has_option('local_setup', 'genome_dir'): - options.genome_dir = conf_parser.get('local_setup', 'genome_dir') - - logging.info('USING OPTIONS:') - logging.info(' URL: %s' % (options.url,)) - logging.info(' OUT: %s' % (options.output_filepath,)) - logging.info(' FC: %s' % (options.flowcell,)) - logging.info('GDIR: %s' % (options.genome_dir,)) + expects command line arguments to be passed in + """ + cl_parser = constructOptionParser() + conf_parser = constructConfigParser() + + if argv is None: + options = DummyOptions() + else: + options, args = cl_parser.parse_args(argv) + + if options.url is None: + if conf_parser.has_option(GERALD_CONFIG_SECTION, 'config_host'): + options.url = conf_parser.get(GERALD_CONFIG_SECTION, 'config_host') + + options.url = normalize_url(options.url) - return options + if options.genome_dir is None: + if conf_parser.has_option(GERALD_CONFIG_SECTION, 'genome_dir'): + options.genome_dir = conf_parser.get(GERALD_CONFIG_SECTION, 'genome_dir') + + if conf_parser.has_option(GERALD_CONFIG_SECTION, 'post_run'): + options.post_run = conf_parser.get(GERALD_CONFIG_SECTION, 'post_run') + else: + options.post_run = None + if options.output_filepath is None: + if options.runfolder is not None: + options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt') + + return options -def saveConfigFile(flowcell, base_host_url, output_filepath): + +def saveConfigFile(options): """ retrieves the flowcell eland config file, give the base_host_url (i.e. http://sub.domain.edu:port) """ - url = base_host_url + '/eland_config/%s/' % (flowcell) - - f = open(output_filepath, 'w') - #try: - try: - web = urllib2.urlopen(url) - except urllib2.URLError, e: - errmsg = 'URLError: %d %s' % (e.code, e.msg) - logging.error(errmsg) - logging.error('opened %s' % (url,)) - raise IOError(errmsg) - - #except IOError, msg: - # if str(msg).find("Connection refused") >= 0: - # print 'Error: Connection refused for: %s' % (url) - # f.close() - # sys.exit(1) - # elif str(msg).find("Name or service not known") >= 0: - # print 'Error: Invalid domain or ip address for: %s' % (url) - # f.close() - # sys.exit(2) - # else: - # raise IOError, msg + logging.info('USING OPTIONS:') + logging.info(u' URL: %s' % (options.url,)) + logging.info(u' OUT: %s' % (options.output_filepath,)) + logging.info(u' FC: %s' % (options.flowcell,)) + #logging.info(': %s' % (options.genome_dir,)) + logging.info(u'post_run: %s' % ( unicode(options.post_run),)) + + flowcell_info = retrieve_flowcell_info(options.url, options.flowcell) - data = web.read() + logging.debug('genome_dir: %s' % ( options.genome_dir, )) + available_genomes = getAvailableGenomes(options.genome_dir) + genome_map = constructMapperDict(available_genomes) + logging.debug('available genomes: %s' % ( unicode( genome_map.keys() ),)) - if data.find('Hmm, config file for') >= 0: - msg = "Flowcell (%s) not found in DB; full url(%s)" % (flowcell, url) - raise FlowCellNotFound, msg + config = format_gerald_config(options, flowcell_info, genome_map) - if data.find('404 - Not Found') >= 0: - msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \ - "Did you get right port #?" % (flowcell, base_host_url, url) - raise FlowCellNotFound, msg + if options.output_filepath is not None: + outstream = open(options.output_filepath, 'w') + logging.info('Writing config file to %s' % (options.output_filepath,)) + else: + outstream = sys.stdout + + outstream.write(config) - f.write(data) - web.close() - f.close() - logging.info('Wrote config file to %s' % (output_filepath,)) +