htsworkflow/pipelines/retrieve_config.py

   1 #!/usr/bin/env python
   2
   3 import csv
   4 from configparser import RawConfigParser
   5 import logging
   6 from optparse import OptionParser, IndentedHelpFormatter
   7 import os
   8 import sys
   9 import types
  10 import urllib.request, urllib.parse, urllib.error
  11 import urllib.request, urllib.error, urllib.parse
  12 import collections
  13
  14 try:
  15     import json
  16 except ImportError as e:
  17     import simplejson as json
  18
  19 from htsworkflow.frontend.auth import apidata
  20 from htsworkflow.util import api
  21 from htsworkflow.util import alphanum
  22 from htsworkflow.util.url import normalize_url
  23 from htsworkflow.pipelines.genome_mapper import \
  24      getAvailableGenomes, \
  25      constructMapperDict
  26 from htsworkflow.pipelines import LANE_LIST
  27 # JSON dictionaries use strings
  28 LANE_LIST_JSON = [ str(l) for l in LANE_LIST ]
  29
  30 LOGGER = logging.getLogger(__name__)
  31
  32 __docformat__ = "restructredtext en"
  33
  34 CONFIG_SYSTEM = '/etc/htsworkflow.ini'
  35 CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini')
  36 GERALD_CONFIG_SECTION = 'gerald_config'
  37
  38 #Disable or enable commandline arg parsing; disabled by default.
  39 DISABLE_CMDLINE = True
  40
  41 class FlowCellNotFound(Exception): pass
  42 class WebError404(Exception): pass
  43
  44 def retrieve_flowcell_info(base_host_url, flowcell):
  45     """
  46     Return a dictionary describing a
  47     """
  48     url = api.flowcell_url(base_host_url, flowcell)
  49
  50     try:
  51         apipayload = urllib.parse.urlencode(apidata)
  52         web = urllib.request.urlopen(url, apipayload)
  53     except urllib.error.URLError as e:
  54         errmsg = 'URLError: %d %s' % (e.code, e.msg)
  55         LOGGER.error(errmsg)
  56         LOGGER.error('opened %s' % (url,))
  57         raise IOError(errmsg)
  58
  59     contents = web.read()
  60     headers = web.info()
  61
  62     if web.code == 403:
  63         msg = "403 - Forbbidden, probably need api key"
  64         raise FlowCellNotFound(msg)
  65
  66     if web.code == 404:
  67         msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
  68               "Did you get right port #?" % (flowcell, base_host_url, url)
  69         raise FlowCellNotFound(msg)
  70
  71     if len(contents) == 0:
  72         msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
  73         raise FlowCellNotFound(msg)
  74
  75     data = json.loads(contents)
  76     return data
  77
  78 def is_sequencing(lane_info):
  79     """
  80     Determine if we are just sequencing and not doing any follow-up analysis
  81     """
  82     if lane_info['experiment_type'] in ('De Novo','Whole Genome'):
  83         return True
  84     else:
  85         return False
  86
  87 def group_lane_parameters(flowcell_info):
  88     """
  89     goup lanes that can share GERALD configuration blocks.
  90
  91     (The same species, read length, and eland vs sequencing)
  92     """
  93     lane_groups = {}
  94     for lane_number, lane_contents in list(flowcell_info['lane_set'].items()):
  95         for lane_info in lane_contents:
  96             index = (lane_info['read_length'],
  97                      lane_info['library_species'],
  98                      is_sequencing(lane_info))
  99             lane_groups.setdefault(index, []).append(lane_number)
 100     return lane_groups
 101
 102 def format_gerald_header(flowcell_info):
 103     """
 104     Generate comment describing the contents of the flowcell
 105     """
 106     # I'm using '\n# ' to join the lines together, that doesn't include the
 107     # first element so i needed to put the # in manually
 108     config = ['# FLOWCELL: %s' % (flowcell_info['flowcell_id'])]
 109     config += ['']
 110     config += ['CONTROL-LANE: %s' % (flowcell_info['control_lane'],)]
 111     config += ['']
 112     config += ['Flowcell Notes:']
 113     config.extend(flowcell_info['notes'].split('\r\n'))
 114     config += ['']
 115     for lane_number in LANE_LIST_JSON:
 116         lane_contents = flowcell_info['lane_set'][lane_number]
 117         for lane_info in lane_contents:
 118             config += ['Lane%s: %s | %s' % (lane_number,
 119                                             lane_info['library_id'],
 120                                             lane_info['library_name'])]
 121
 122     config += ['']
 123     return "\n# ".join(config)
 124
 125 def format_gerald_config(options, flowcell_info, genome_map):
 126     """
 127     Generate a GERALD config file
 128     """
 129     # so we can add nothing or _pair if we're a paired end run
 130     eland_analysis_suffix = { False: "_extended", True: "_pair" }
 131     sequence_analysis_suffix = { False: "", True: "_pair" }
 132
 133     # it's convienent to have helpful information describing the flowcell
 134     # in the config file... things like which lane is which library.
 135     config = [format_gerald_header(flowcell_info)]
 136
 137     config += ['SEQUENCE_FORMAT --fastq']
 138     config += ['ELAND_SET_SIZE 20']
 139     config += ['12345678:WITH_SEQUENCE true']
 140     analysis_suffix = eland_analysis_suffix[flowcell_info['paired_end']]
 141     sequence_suffix = sequence_analysis_suffix[flowcell_info['paired_end']]
 142     lane_groups = group_lane_parameters(flowcell_info)
 143     for lane_index, lane_numbers in list(lane_groups.items()):
 144         # lane_index is return value of group_lane_parameters
 145         read_length, species, is_sequencing = lane_index
 146         lane_numbers.sort()
 147         lane_prefix = "".join(lane_numbers)
 148
 149         species_path = genome_map.get(species, None)
 150         LOGGER.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
 151         if not is_sequencing and species_path is None:
 152             no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
 153             LOGGER.warning(no_genome_msg % (lane_numbers, species))
 154             is_sequencing = True
 155
 156         if is_sequencing:
 157             config += ['%s:ANALYSIS sequence%s' % (lane_prefix, sequence_suffix)]
 158         else:
 159             config += ['%s:ANALYSIS eland%s' % (lane_prefix, analysis_suffix)]
 160             config += ['%s:ELAND_GENOME %s' % (lane_prefix, species_path) ]
 161         #config += ['%s:READ_LENGTH %s' % ( lane_prefix, read_length ) ]
 162         config += ['%s:USE_BASES Y%s' % ( lane_prefix, read_length ) ]
 163
 164     # add in option for running script after
 165     if not (options.post_run is None or options.runfolder is None):
 166         runfolder = os.path.abspath(options.runfolder)
 167         post_run = options.post_run  % {'runfolder': runfolder}
 168         config += ['POST_RUN_COMMAND %s' % (post_run,) ]
 169
 170     config += [''] # force trailing newline
 171
 172     return "\n".join(config)
 173
 174 class DummyOptions:
 175   """
 176   Used when command line parsing is disabled; default
 177   """
 178   def __init__(self):
 179     self.url = None
 180     self.output_filepath = None
 181     self.flowcell = None
 182     self.genome_dir = None
 183
 184 class PreformattedDescriptionFormatter(IndentedHelpFormatter):
 185
 186   #def format_description(self, description):
 187   #
 188   #  if description:
 189   #      return description + "\n"
 190   #  else:
 191   #     return ""
 192
 193   def format_epilog(self, epilog):
 194     """
 195     It was removing my preformated epilog, so this should override
 196     that behavior! Muhahaha!
 197     """
 198     if epilog:
 199         return "\n" + epilog + "\n"
 200     else:
 201         return ""
 202
 203
 204 def constructOptionParser():
 205     """
 206     returns a pre-setup optparser
 207     """
 208     parser = OptionParser(formatter=PreformattedDescriptionFormatter())
 209
 210     parser.set_description('Retrieves eland config file from hts_frontend web frontend.')
 211
 212     parser.epilog = """
 213 Config File:
 214   * %s (System wide)
 215   * %s (User specific; overrides system)
 216   * command line overrides all config file options
 217
 218   Example Config File:
 219
 220     [%s]
 221     config_host: http://somewhere.domain:port
 222     genome_dir: /path to search for genomes
 223     post_run: runfolder -o <destdir> %%(runfolder)s
 224
 225 """ % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)
 226
 227     #Special formatter for allowing preformatted description.
 228     ##parser.format_epilog(PreformattedDescriptionFormatter())
 229
 230     parser.add_option("-u", "--url",
 231                       action="store", type="string", dest="url")
 232
 233     parser.add_option("-o", "--output-file",
 234                       action="store", type="string", dest="output_filepath",
 235                       help="config file destination. If runfolder is specified defaults "
 236                            "to <runfolder>/config-auto.txt" )
 237
 238     parser.add_option("-f", "--flowcell",
 239                       action="store", type="string", dest="flowcell")
 240
 241     parser.add_option("-g", "--genome_dir",
 242                       action="store", type="string", dest="genome_dir")
 243
 244     parser.add_option("-r", "--runfolder",
 245                       action="store", type="string",
 246                       help="specify runfolder for post_run command ")
 247
 248     parser.add_option("--sample-sheet", default=None,
 249                       help="path to save demultiplexing sample sheet")
 250
 251     parser.add_option("--operator", default='', help="Name of sequencer operator")
 252     parser.add_option("--recipe", default="Unknown",
 253                       help="specify recipe name")
 254
 255     parser.add_option('-v', '--verbose', action='store_true', default=False,
 256                        help='increase logging verbosity')
 257     return parser
 258
 259 def constructConfigParser():
 260     """
 261     returns a pre-setup config parser
 262     """
 263     parser = RawConfigParser()
 264     parser.read([CONFIG_SYSTEM, CONFIG_USER])
 265     if not parser.has_section(GERALD_CONFIG_SECTION):
 266         parser.add_section(GERALD_CONFIG_SECTION)
 267
 268     return parser
 269
 270
 271 def getCombinedOptions(argv=None):
 272     """
 273     Returns optparse options after it has be updated with ConfigParser
 274     config files and merged with parsed commandline options.
 275
 276     expects command line arguments to be passed in
 277     """
 278     cl_parser = constructOptionParser()
 279     conf_parser = constructConfigParser()
 280
 281     if argv is None:
 282         options = DummyOptions()
 283     else:
 284         options, args = cl_parser.parse_args(argv)
 285
 286     if options.url is None:
 287         if conf_parser.has_option(GERALD_CONFIG_SECTION, 'config_host'):
 288             options.url = conf_parser.get(GERALD_CONFIG_SECTION, 'config_host')
 289
 290     options.url = normalize_url(options.url)
 291
 292     if options.genome_dir is None:
 293         if conf_parser.has_option(GERALD_CONFIG_SECTION, 'genome_dir'):
 294             options.genome_dir = conf_parser.get(GERALD_CONFIG_SECTION, 'genome_dir')
 295
 296     if conf_parser.has_option(GERALD_CONFIG_SECTION, 'post_run'):
 297         options.post_run = conf_parser.get(GERALD_CONFIG_SECTION, 'post_run')
 298     else:
 299         options.post_run = None
 300
 301     if options.output_filepath is None:
 302         if options.runfolder is not None:
 303             options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')
 304
 305     return options
 306
 307
 308 def saveConfigFile(options):
 309   """
 310   retrieves the flowcell eland config file, give the base_host_url
 311   (i.e. http://sub.domain.edu:port)
 312   """
 313   LOGGER.info('USING OPTIONS:')
 314   LOGGER.info('     URL: %s' % (options.url,))
 315   LOGGER.info('     OUT: %s' % (options.output_filepath,))
 316   LOGGER.info('      FC: %s' % (options.flowcell,))
 317   #LOGGER.info(': %s' % (options.genome_dir,))
 318   LOGGER.info('post_run: %s' % ( str(options.post_run),))
 319
 320   flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)
 321
 322   LOGGER.debug('genome_dir: %s' % ( options.genome_dir, ))
 323   available_genomes = getAvailableGenomes(options.genome_dir)
 324   genome_map = constructMapperDict(available_genomes)
 325   LOGGER.debug('available genomes: %s' % ( str( list(genome_map.keys()) ),))
 326
 327   config = format_gerald_config(options, flowcell_info, genome_map)
 328
 329   if options.output_filepath is not None:
 330       outstream = open(options.output_filepath, 'w')
 331       logging.info('Writing config file to %s' % (options.output_filepath,))
 332   else:
 333       outstream = sys.stdout
 334
 335   outstream.write(config)
 336
 337   if options.sample_sheet is None:
 338       pass
 339   elif options.sample_sheet == '-':
 340       save_sample_sheet(sys.stdout, options, flowcell_info)
 341   else:
 342       stream = open(options.sample_sheet,'w')
 343       save_sample_sheet(stream, options, flowcell_info)
 344
 345
 346 def save_sample_sheet(outstream, options, flowcell_info):
 347     sample_sheet_fields = ['FCID', 'Lane', 'SampleID', 'SampleRef', 'Index',
 348                            'Description', 'Control', 'Recipe', 'Operator',
 349                            'SampleProject']
 350     illumina_to_htsw_map = {'FCID': 'flowcell',
 351                             'Lane': 'lane_number',
 352                             'SampleID': 'library_id',
 353                             'SampleRef': format_sampleref,
 354                             'Description': 'library_name',
 355                             'Control': format_control_lane,
 356                             'Recipe': format_recipe_name,
 357                             'Operator': format_operator_name}
 358     out = csv.DictWriter(outstream, sample_sheet_fields)
 359     out.writerow(dict(((x,x) for x in sample_sheet_fields)))
 360     for lane_number in LANE_LIST:
 361         lane_contents = flowcell_info['lane_set'][str(lane_number)]
 362
 363         pooled_lane_contents = []
 364         for library in lane_contents:
 365             # build common attributes
 366             renamed = {}
 367             for illumina_name in sample_sheet_fields:
 368                 htsw_field = illumina_to_htsw_map.get(illumina_name, None)
 369                 if htsw_field is None:
 370                     continue
 371                 if isinstance(htsw_field, collections.Callable):
 372                     renamed[illumina_name] = htsw_field(options,
 373                                                         flowcell_info,
 374                                                         library)
 375                 else:
 376                     renamed[illumina_name] = library[htsw_field]
 377
 378             pooled_lane_contents.extend(format_pooled_libraries(renamed, library))
 379
 380         for row in pooled_lane_contents:
 381             out.writerow(row)
 382
 383
 384 def format_sampleref(options, flowcell_info, sample):
 385     return sample['library_species'].replace(' ', '_')
 386
 387
 388 def format_control_lane(options, flowcell_info, sample):
 389     if sample['lane_number'] == flowcell_info['control_lane']:
 390         return 'Y'
 391     else:
 392         return 'N'
 393
 394
 395 def format_recipe_name(options, flowcell_info, sample):
 396     return options.recipe
 397
 398
 399 def format_operator_name(options, flowcell_info, sample):
 400     return options.operator
 401
 402
 403 def format_pooled_libraries(shared, library):
 404     sequences = library.get('index_sequence', None)
 405     if sequences is None:
 406         return []
 407     elif (type(sequences) in str and
 408           sequences.lower().startswith('err')):
 409         shared['Index'] = ''
 410         shared['SampleProject'] = library['library_id']
 411         return [shared]
 412     elif (type(sequences) == dict):
 413         pooled = []
 414         multiplex_ids = list(sequences.keys())
 415         multiplex_ids.sort(cmp=alphanum.alphanum)
 416         for multiplex_id in multiplex_ids:
 417             sample = {}
 418             sample.update(shared)
 419             sample['Index'] = sequences[multiplex_id]
 420             sample['SampleProject'] = format_project_name(library,
 421                                                           multiplex_id)
 422             pooled.append(sample)
 423         return pooled
 424     else:
 425         raise RuntimeError("Unrecognized index type")
 426
 427
 428
 429 def format_project_name(library, multiplex_id):
 430     library_id = library['library_id']
 431     return "%s_index%s" % (library_id, multiplex_id)
 432
 433