Add support for tracking the multiplex index sequence.
htsworkflow/pipelines/retrieve_config.py
#!/usr/bin/env python

import csv
from ConfigParser import RawConfigParser
import logging
from optparse import OptionParser, IndentedHelpFormatter
import os
import sys
import types
import urllib
import urllib2

try:
    import json
except ImportError, e:
    import simplejson as json

from htsworkflow.frontend.auth import apidata
from htsworkflow.util import api
from htsworkflow.util.url import normalize_url
from htsworkflow.pipelines.genome_mapper import \
     getAvailableGenomes, \
     constructMapperDict
from htsworkflow.pipelines.runfolder import LANE_LIST
# JSON dictionaries use strings
LANE_LIST_JSON = [ str(l) for l in LANE_LIST ]

__docformat__ = "restructuredtext en"

CONFIG_SYSTEM = '/etc/htsworkflow.ini'
CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini')
GERALD_CONFIG_SECTION = 'gerald_config'

# Disable or enable command line arg parsing; disabled by default.
DISABLE_CMDLINE = True

class FlowCellNotFound(Exception): pass
class WebError404(Exception): pass

def retrieve_flowcell_info(base_host_url, flowcell):
    """
    Return a dictionary describing the requested flowcell.
    """
    url = api.flowcell_url(base_host_url, flowcell)

    try:
        apipayload = urllib.urlencode(apidata)
        web = urllib2.urlopen(url, apipayload)
    except urllib2.HTTPError, e:
        # urllib2 raises HTTPError for 4xx/5xx responses, so translate the
        # interesting status codes into FlowCellNotFound here.
        if e.code == 403:
            raise FlowCellNotFound("403 - Forbidden, probably need an api key")
        if e.code == 404:
            msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
                  "Did you use the right port number?" % (flowcell, base_host_url, url)
            raise FlowCellNotFound(msg)
        errmsg = 'HTTPError: %d %s' % (e.code, e.msg)
        logging.error(errmsg)
        logging.error('opened %s' % (url,))
        raise IOError(errmsg)
    except urllib2.URLError, e:
        errmsg = 'URLError: %s' % (e.reason,)
        logging.error(errmsg)
        logging.error('opened %s' % (url,))
        raise IOError(errmsg)

    contents = web.read()
    headers = web.info()

    if len(contents) == 0:
        msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
        raise FlowCellNotFound(msg)

    data = json.loads(contents)
    return data

def is_sequencing(lane_info):
    """
    Determine if we are just sequencing and not doing any follow-up analysis
    """
    return lane_info['experiment_type'] in ('De Novo', 'Whole Genome')

def group_lane_parameters(flowcell_info):
    """
    Group lanes that can share GERALD configuration blocks.

    (The same species, read length, and eland vs. sequencing)
    """
    lane_groups = {}
    for lane_number, lane_contents in flowcell_info['lane_set'].items():
        for lane_info in lane_contents:
            index = (lane_info['read_length'],
                     lane_info['library_species'],
                     is_sequencing(lane_info))
            lane_groups.setdefault(index, []).append(lane_number)
    return lane_groups

def format_gerald_header(flowcell_info):
    """
    Generate a comment block describing the contents of the flowcell
    """
    # The lines are joined with '\n# ', which doesn't prefix the first
    # element, so the leading '# ' is added to it manually.
    config = ['# FLOWCELL: %s' % (flowcell_info['flowcell_id'])]
    config += ['']
    config += ['CONTROL-LANE: %s' % (flowcell_info['control_lane'],)]
    config += ['']
    config += ['Flowcell Notes:']
    config.extend(flowcell_info['notes'].split('\r\n'))
    config += ['']
    for lane_number in LANE_LIST_JSON:
        lane_contents = flowcell_info['lane_set'][lane_number]
        for lane_info in lane_contents:
            config += ['Lane%s: %s | %s' % (lane_number,
                                            lane_info['library_id'],
                                            lane_info['library_name'])]

    config += ['']
    return "\n# ".join(config)

def format_gerald_config(options, flowcell_info, genome_map):
    """
    Generate a GERALD config file
    """
    # analysis name suffixes, chosen by whether this is a paired end run
    eland_analysis_suffix = { False: "_extended", True: "_pair" }
    sequence_analysis_suffix = { False: "", True: "_pair" }

    # it's convenient to have helpful information describing the flowcell
    # in the config file... things like which lane is which library.
    config = [format_gerald_header(flowcell_info)]

    config += ['SEQUENCE_FORMAT --fastq']
    config += ['ELAND_SET_SIZE 20']
    config += ['12345678:WITH_SEQUENCE true']
    analysis_suffix = eland_analysis_suffix[flowcell_info['paired_end']]
    sequence_suffix = sequence_analysis_suffix[flowcell_info['paired_end']]
    lane_groups = group_lane_parameters(flowcell_info)
    for lane_index, lane_numbers in lane_groups.items():
        # lane_index is the grouping key built by group_lane_parameters
        read_length, species, is_sequencing = lane_index
        lane_numbers.sort()
        lane_prefix = u"".join(lane_numbers)

        species_path = genome_map.get(species, None)
        logging.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
        if not is_sequencing and species_path is None:
            no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
            logging.warning(no_genome_msg % (lane_numbers, species))
            is_sequencing = True

        if is_sequencing:
            config += ['%s:ANALYSIS sequence%s' % (lane_prefix, sequence_suffix)]
        else:
            config += ['%s:ANALYSIS eland%s' % (lane_prefix, analysis_suffix)]
            config += ['%s:ELAND_GENOME %s' % (lane_prefix, species_path) ]
        #config += ['%s:READ_LENGTH %s' % ( lane_prefix, read_length ) ]
        config += ['%s:USE_BASES Y%s' % ( lane_prefix, read_length ) ]

    # add in an option for running a script after the run
    if not (options.post_run is None or options.runfolder is None):
        runfolder = os.path.abspath(options.runfolder)
        post_run = options.post_run % {'runfolder': runfolder}
        config += ['POST_RUN_COMMAND %s' % (post_run,) ]

    config += [''] # force trailing newline

    return "\n".join(config)

class DummyOptions:
  """
  Used when command line parsing is disabled; provides the same defaults
  as the option parser.
  """
  def __init__(self):
    self.url = None
    self.output_filepath = None
    self.flowcell = None
    self.genome_dir = None
    self.runfolder = None
    self.sample_sheet = None
    self.operator = ''
    self.recipe = "Unknown"

class PreformattedDescriptionFormatter(IndentedHelpFormatter):

  #def format_description(self, description):
  #
  #  if description:
  #      return description + "\n"
  #  else:
  #     return ""

  def format_epilog(self, epilog):
    """
    The default formatter was removing my preformatted epilog, so this
    should override that behavior! Muhahaha!
    """
    if epilog:
        return "\n" + epilog + "\n"
    else:
        return ""


def constructOptionParser():
    """
    Return a pre-configured OptionParser.
    """
    parser = OptionParser(formatter=PreformattedDescriptionFormatter())

    parser.set_description('Retrieves eland config file from hts_frontend web frontend.')

    parser.epilog = """
Config File:
  * %s (System wide)
  * %s (User specific; overrides system)
  * command line overrides all config file options

  Example Config File:

    [%s]
    config_host: http://somewhere.domain:port
    genome_dir: /path to search for genomes
    post_run: runfolder -o <destdir> %%(runfolder)s

""" % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)

    #Special formatter for allowing preformatted description.
    ##parser.format_epilog(PreformattedDescriptionFormatter())

    parser.add_option("-u", "--url",
                      action="store", type="string", dest="url")

    parser.add_option("-o", "--output-file",
                      action="store", type="string", dest="output_filepath",
                      help="config file destination. If runfolder is specified defaults "
                           "to <runfolder>/config-auto.txt" )

    parser.add_option("-f", "--flowcell",
                      action="store", type="string", dest="flowcell")

    parser.add_option("-g", "--genome_dir",
                      action="store", type="string", dest="genome_dir")

    parser.add_option("-r", "--runfolder",
                      action="store", type="string",
                      help="specify runfolder for post_run command")

    parser.add_option("--sample-sheet", default=None,
                      help="path to save demultiplexing sample sheet")

    parser.add_option("--operator", default='', help="Name of sequencer operator")
    parser.add_option("--recipe", default="Unknown",
                      help="specify recipe name")

    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='increase logging verbosity')
    return parser

def constructConfigParser():
    """
    Return a RawConfigParser preloaded from the system and user config files.
    """
    parser = RawConfigParser()
    parser.read([CONFIG_SYSTEM, CONFIG_USER])
    if not parser.has_section(GERALD_CONFIG_SECTION):
        parser.add_section(GERALD_CONFIG_SECTION)

    return parser


def getCombinedOptions(argv=None):
    """
    Return optparse options after they have been updated from the
    ConfigParser config files and merged with the parsed command line options.

    Expects the command line arguments to be passed in as argv; if argv is
    None a DummyOptions instance is used instead of parsing the command line.
    """
    cl_parser = constructOptionParser()
    conf_parser = constructConfigParser()

    if argv is None:
        options = DummyOptions()
    else:
        options, args = cl_parser.parse_args(argv)

    if options.url is None:
        if conf_parser.has_option(GERALD_CONFIG_SECTION, 'config_host'):
            options.url = conf_parser.get(GERALD_CONFIG_SECTION, 'config_host')

    options.url = normalize_url(options.url)

    if options.genome_dir is None:
        if conf_parser.has_option(GERALD_CONFIG_SECTION, 'genome_dir'):
            options.genome_dir = conf_parser.get(GERALD_CONFIG_SECTION, 'genome_dir')

    if conf_parser.has_option(GERALD_CONFIG_SECTION, 'post_run'):
        options.post_run = conf_parser.get(GERALD_CONFIG_SECTION, 'post_run')
    else:
        options.post_run = None

    if options.output_filepath is None:
        if options.runfolder is not None:
            options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')

    return options


def saveConfigFile(options):
  """
  Retrieve the flowcell information, given the base_host_url
  (i.e. http://sub.domain.edu:port), and save the demultiplexing
  sample sheet if one was requested.
  """
  logging.info('USING OPTIONS:')
  logging.info(u'     URL: %s' % (options.url,))
  logging.info(u'     OUT: %s' % (options.output_filepath,))
  logging.info(u'      FC: %s' % (options.flowcell,))
  #logging.info(': %s' % (options.genome_dir,))
  logging.info(u'post_run: %s' % ( unicode(options.post_run),))

  flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)

  logging.debug('genome_dir: %s' % ( options.genome_dir, ))
  available_genomes = getAvailableGenomes(options.genome_dir)
  genome_map = constructMapperDict(available_genomes)
  logging.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))

  #config = format_gerald_config(options, flowcell_info, genome_map)
  #
  #if options.output_filepath is not None:
  #    outstream = open(options.output_filepath, 'w')
  #    logging.info('Writing config file to %s' % (options.output_filepath,))
  #else:
  #    outstream = sys.stdout
  #
  #outstream.write(config)

  if options.sample_sheet is None:
      pass
  elif options.sample_sheet == '-':
      save_sample_sheet(sys.stdout, options, flowcell_info)
  else:
      stream = open(options.sample_sheet, 'w')
      save_sample_sheet(stream, options, flowcell_info)


def save_sample_sheet(outstream, options, flowcell_info):
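    """
    Write an Illumina demultiplexing sample sheet (CSV) describing
    flowcell_info to outstream.

    Only lanes that yield more than one indexed entry are written out.
    """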
    sample_sheet_fields = ['FCID', 'Lane', 'SampleID', 'SampleRef', 'Index',
                           'Description', 'Control', 'Recipe', 'Operator',
                           'SampleProject']
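    # Map sample sheet column names onto htsworkflow library fields.
    # Values are either a key into the library dictionary or a callable
    # taking (options, flowcell_info, library) and returning the value.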
    illumina_to_htsw_map = {'FCID': 'flowcell',
                            'Lane': 'lane_number',
                            'SampleID': 'library_id',
                            'SampleRef': format_sampleref,
                            'Description': 'library_name',
                            'Control': format_control_lane,
                            'Recipe': format_recipe_name,
                            'Operator': format_operator_name}
    out = csv.DictWriter(outstream, sample_sheet_fields)
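    # note: DictWriter.writeheader() requires Python 2.7 or newer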
    out.writeheader()
    for lane_number in LANE_LIST:
        lane_contents = flowcell_info['lane_set'][str(lane_number)]

        pooled_lane_contents = []
        for library in lane_contents:
            # build common attributes
            renamed = {}
            for illumina_name in sample_sheet_fields:
                htsw_field = illumina_to_htsw_map.get(illumina_name, None)
                if htsw_field is None:
                    continue
                if callable(htsw_field):
                    renamed[illumina_name] = htsw_field(options,
                                                        flowcell_info,
                                                        library)
                else:
                    renamed[illumina_name] = library[htsw_field]

            pooled_lane_contents.extend(format_pooled_libraries(renamed, library))

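        # Only emit rows for lanes that produced more than one indexed entry.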
        if len(pooled_lane_contents) > 1:
            for row in pooled_lane_contents:
                out.writerow(row)


def format_sampleref(options, flowcell_info, sample):
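    """Return the library species name with spaces replaced by underscores."""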
    return sample['library_species'].replace(' ', '_')


def format_control_lane(options, flowcell_info, sample):
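    """Return 'Y' if this sample's lane is the flowcell control lane, 'N' otherwise."""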
    if sample['lane_number'] == flowcell_info['control_lane']:
        return 'Y'
    else:
        return 'N'


def format_recipe_name(options, flowcell_info, sample):
    return options.recipe


def format_operator_name(options, flowcell_info, sample):
    return options.operator


def format_pooled_libraries(shared, library):
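    """
    Expand the shared sample sheet fields into one row per index sequence.

    library['index_sequence'] may be missing (no rows), a bare sequence
    string (one row), or a dictionary mapping multiplex ids to index
    sequences (one row per multiplex id).
    """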
    sequences = library.get('index_sequence', None)
    if sequences is None:
        return []
    elif isinstance(sequences, types.StringTypes):
        shared['Index'] = sequences
        shared['SampleProject'] = library['library_id']
        return [shared]
    else:
        pooled = []
        multiplex_ids = sequences.keys()
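        # the JSON multiplex ids are strings; sort them numerically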
        multiplex_ids.sort(key=int)
        for multiplex_id in multiplex_ids:
            sample = {}
            sample.update(shared)
            sample['Index'] = sequences[multiplex_id]
            sample['SampleProject'] = format_project_name(library,
                                                          multiplex_id)
            pooled.append(sample)
        return pooled


def format_project_name(library, multiplex_id):
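    """Build the SampleProject name from the library id and multiplex index id."""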
    library_id = library['library_id']
    return "%s_index%s" % (library_id, multiplex_id)

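
# A minimal usage sketch (this module defines no __main__ entry point of
# its own; a wrapper script is assumed to drive it roughly like this):
#
#     import sys
#     from htsworkflow.pipelines import retrieve_config
#
#     options = retrieve_config.getCombinedOptions(sys.argv[1:])
#     retrieve_config.saveConfigFile(options)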