4 from ConfigParser import RawConfigParser
6 from optparse import OptionParser, IndentedHelpFormatter
15 except ImportError, e:
16 import simplejson as json
18 from htsworkflow.frontend.auth import apidata
19 from htsworkflow.util import api
20 from htsworkflow.util import alphanum
21 from htsworkflow.util.url import normalize_url
22 from htsworkflow.pipelines.genome_mapper import \
23 getAvailableGenomes, \
25 from htsworkflow.pipelines import LANE_LIST
# JSON dictionaries use strings (lane_set keys are string lane numbers)
LANE_LIST_JSON = [str(l) for l in LANE_LIST]

# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)

# BUG FIX: was "restructredtext" (typo); the docutils docstring format
# name is "restructuredtext".
__docformat__ = "restructuredtext en"

# Config file search locations; the user file overrides the system one,
# and command line options override both.
CONFIG_SYSTEM = '/etc/htsworkflow.ini'
CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini')
GERALD_CONFIG_SECTION = 'gerald_config'

# Disable or enable commandline arg parsing; disabled by default.
DISABLE_CMDLINE = True
class FlowCellNotFound(Exception):
    """Raised when the web API has no usable information for a flowcell."""
class WebError404(Exception):
    """Raised for an HTTP 404 (not found) response from the web frontend."""
def retrieve_flowcell_info(base_host_url, flowcell):
    """
    Return a dictionary describing a
    flowcell.

    :param base_host_url: scheme://host:port of the htsworkflow server
    :param flowcell: flowcell id to query
    :raises FlowCellNotFound: on 403/404 responses or an empty response body
    """
    url = api.flowcell_url(base_host_url, flowcell)

    # NOTE(review): several source lines are elided in this view; the two
    # statements below originally sit inside a try block paired with the
    # except clause that follows.
    # apidata (imported from frontend.auth) carries the shared API key.
    apipayload = urllib.urlencode(apidata)
    web = urllib2.urlopen(url, apipayload)
    except urllib2.URLError, e:
        # NOTE(review): URLError does not always carry .code/.msg (only
        # HTTPError subclasses do) -- confirm against the elided handling.
        errmsg = 'URLError: %d %s' % (e.code, e.msg)
        # (elided lines)
        LOGGER.error('opened %s' % (url,))
        # (elided lines; presumably checks the HTTP status code)
        msg = "403 - Forbbidden, probably need api key"
        raise FlowCellNotFound(msg)
        # (elided lines)
        msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
              "Did you get right port #?" % (flowcell, base_host_url, url)
        raise FlowCellNotFound(msg)

    # an empty body means the server had nothing for this flowcell
    if len(contents) == 0:
        msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
        raise FlowCellNotFound(msg)

    data = json.loads(contents)
    # (return statement elided in this view; presumably returns data)
def is_sequencing(lane_info):
    """
    Determine if we are just sequencing and not doing any follow-up analysis
    """
    # 'De Novo' and 'Whole Genome' libraries get no alignment step
    if lane_info['experiment_type'] in ('De Novo','Whole Genome'):
        # NOTE(review): the return statements for both branches are elided
        # in this view; presumably True here and False otherwise -- confirm.
def group_lane_parameters(flowcell_info):
    """
    Group lanes that can share GERALD configuration blocks.

    (The same species, read length, and eland vs sequencing)

    Returns a mapping of
    (read_length, library_species, is_sequencing) -> [lane numbers].
    """
    # NOTE(review): the dict initialization (lane_groups = {}) and the
    # return statement are elided in this view.
    for lane_number, lane_contents in flowcell_info['lane_set'].items():
        for lane_info in lane_contents:
            # lanes sharing this key can share one GERALD config block
            index = (lane_info['read_length'],
                     lane_info['library_species'],
                     is_sequencing(lane_info))
            lane_groups.setdefault(index, []).append(lane_number)
def format_gerald_header(flowcell_info):
    """
    Generate comment describing the contents of the flowcell
    """
    # I'm using '\n# ' to join the lines together, that doesn't include the
    # first element so i needed to put the # in manually
    config = ['# FLOWCELL: %s' % (flowcell_info['flowcell_id'])]
    # NOTE(review): a guard line before the next statement is elided in
    # this view (presumably testing whether a control lane is set).
    config += ['CONTROL-LANE: %s' % (flowcell_info['control_lane'],)]

    config += ['Flowcell Notes:']
    # notes arrive from the web form with DOS (\r\n) line endings
    config.extend(flowcell_info['notes'].split('\r\n'))

    # one "LaneN: id | name" line per library on each lane
    for lane_number in LANE_LIST_JSON:
        lane_contents = flowcell_info['lane_set'][lane_number]
        for lane_info in lane_contents:
            config += ['Lane%s: %s | %s' % (lane_number,
                                            lane_info['library_id'],
                                            lane_info['library_name'])]

    # joined with '\n# ' so every line after the first is a GERALD comment
    return "\n# ".join(config)
def format_gerald_config(options, flowcell_info, genome_map):
    """
    Generate a GERALD config file

    :param options: parsed options (post_run / runfolder are used here)
    :param flowcell_info: dictionary from retrieve_flowcell_info
    :param genome_map: species name -> genome directory path
    :return: config file contents as one newline-joined string
    """
    # so we can add nothing or _pair if we're a paired end run
    eland_analysis_suffix = { False: "_extended", True: "_pair" }
    sequence_analysis_suffix = { False: "", True: "_pair" }

    # it's convienent to have helpful information describing the flowcell
    # in the config file... things like which lane is which library.
    config = [format_gerald_header(flowcell_info)]

    config += ['SEQUENCE_FORMAT --fastq']
    config += ['ELAND_SET_SIZE 20']
    # applies to all lanes (GERALD lane-prefix syntax)
    config += ['12345678:WITH_SEQUENCE true']
    analysis_suffix = eland_analysis_suffix[flowcell_info['paired_end']]
    sequence_suffix = sequence_analysis_suffix[flowcell_info['paired_end']]
    lane_groups = group_lane_parameters(flowcell_info)
    for lane_index, lane_numbers in lane_groups.items():
        # lane_index is return value of group_lane_parameters
        read_length, species, is_sequencing = lane_index

        # GERALD addresses lanes by a concatenated digit string, e.g. "1234"
        lane_prefix = u"".join(lane_numbers)

        species_path = genome_map.get(species, None)
        LOGGER.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
        if not is_sequencing and species_path is None:
            no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
            LOGGER.warning(no_genome_msg % (lane_numbers, species))
            # NOTE(review): lines elided here; is_sequencing is presumably
            # forced True before the branch below -- confirm.
        # NOTE(review): the if/else wrapping the next three lines is elided
        # in this view (sequence analysis vs eland analysis).
            config += ['%s:ANALYSIS sequence%s' % (lane_prefix, sequence_suffix)]
            config += ['%s:ANALYSIS eland%s' % (lane_prefix, analysis_suffix)]
            config += ['%s:ELAND_GENOME %s' % (lane_prefix, species_path) ]
        #config += ['%s:READ_LENGTH %s' % ( lane_prefix, read_length ) ]
        config += ['%s:USE_BASES Y%s' % ( lane_prefix, read_length ) ]

    # add in option for running script after
    if not (options.post_run is None or options.runfolder is None):
        runfolder = os.path.abspath(options.runfolder)
        # post_run is a %-template taking the runfolder path
        post_run = options.post_run % {'runfolder': runfolder}
        config += ['POST_RUN_COMMAND %s' % (post_run,) ]

    config += [''] # force trailing newline

    return "\n".join(config)
    Used when command line parsing is disabled; default
    # NOTE(review): the enclosing DummyOptions class statement and its
    # __init__ definition are elided in this view; the assignments below
    # set default option attributes (other attributes such as url and
    # post_run are presumably set on elided lines -- confirm).
        self.output_filepath = None
        self.genome_dir = None
class PreformattedDescriptionFormatter(IndentedHelpFormatter):
    """optparse help formatter that passes the epilog through unwrapped."""

    #def format_description(self, description):
    # (elided commented-out lines)
    #    return description + "\n"

    def format_epilog(self, epilog):
        """
        It was removing my preformated epilog, so this should override
        that behavior! Muhahaha!
        """
        # surround with blank lines instead of letting optparse re-wrap it
        return "\n" + epilog + "\n"
def constructOptionParser():
    """
    returns a pre-setup optparser
    """
    parser = OptionParser(formatter=PreformattedDescriptionFormatter())

    parser.set_description('Retrieves eland config file from hts_frontend web frontend.')

    # NOTE(review): the opening of the epilog triple-quoted string (and a
    # few of its lines, including the system-config bullet and the section
    # header example) is elided in this view; only its tail is visible.
    * %s (User specific; overrides system)
    * command line overrides all config file options
    config_host: http://somewhere.domain:port
    genome_dir: /path to search for genomes
    post_run: runfolder -o <destdir> %%(runfolder)s

    """ % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)

    #Special formatter for allowing preformatted description.
    ##parser.format_epilog(PreformattedDescriptionFormatter())

    parser.add_option("-u", "--url",
                      action="store", type="string", dest="url")

    parser.add_option("-o", "--output-file",
                      action="store", type="string", dest="output_filepath",
                      help="config file destination. If runfolder is specified defaults "
                           "to <runfolder>/config-auto.txt" )

    parser.add_option("-f", "--flowcell",
                      action="store", type="string", dest="flowcell")

    parser.add_option("-g", "--genome_dir",
                      action="store", type="string", dest="genome_dir")

    # no dest= given, so optparse derives dest="runfolder"
    parser.add_option("-r", "--runfolder",
                      action="store", type="string",
                      help="specify runfolder for post_run command ")

    parser.add_option("--sample-sheet", default=None,
                      help="path to save demultiplexing sample sheet")

    parser.add_option("--operator", default='', help="Name of sequencer operator")
    parser.add_option("--recipe", default="Unknown",
                      help="specify recipe name")

    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='increase logging verbosity')
    # NOTE(review): the return statement (return parser) is elided here.
def constructConfigParser():
    """
    returns a pre-setup config parser
    """
    # system-wide settings are overridden by the user's ~/.htsworkflow.ini;
    # RawConfigParser.read silently skips files that don't exist
    parser = RawConfigParser()
    parser.read([CONFIG_SYSTEM, CONFIG_USER])
    # make sure the section exists so later has_option/get calls are safe
    if not parser.has_section(GERALD_CONFIG_SECTION):
        parser.add_section(GERALD_CONFIG_SECTION)
    # NOTE(review): the return statement (return parser) is elided here.
def getCombinedOptions(argv=None):
    """
    Returns optparse options after it has be updated with ConfigParser
    config files and merged with parsed commandline options.

    expects command line arguments to be passed in
    """
    cl_parser = constructOptionParser()
    conf_parser = constructConfigParser()

    # NOTE(review): the branch structure here is partly elided; DummyOptions
    # appears to be used when command line parsing is disabled, otherwise
    # the real parser runs on argv.
        options = DummyOptions()

        options, args = cl_parser.parse_args(argv)

    # command line url beats the config-file config_host
    if options.url is None:
        if conf_parser.has_option(GERALD_CONFIG_SECTION, 'config_host'):
            options.url = conf_parser.get(GERALD_CONFIG_SECTION, 'config_host')
    # NOTE(review): an elided guard likely precedes this; behavior of
    # normalize_url(None) should be confirmed when no url was configured.
    options.url = normalize_url(options.url)

    if options.genome_dir is None:
        if conf_parser.has_option(GERALD_CONFIG_SECTION, 'genome_dir'):
            options.genome_dir = conf_parser.get(GERALD_CONFIG_SECTION, 'genome_dir')

    if conf_parser.has_option(GERALD_CONFIG_SECTION, 'post_run'):
        options.post_run = conf_parser.get(GERALD_CONFIG_SECTION, 'post_run')
    # (an elided else: presumably precedes the next line)
        options.post_run = None

    # default the output path into the runfolder when one was given
    if options.output_filepath is None:
        if options.runfolder is not None:
            options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')
    # NOTE(review): the return statement (return options) is elided here.
def saveConfigFile(options):
    """
    retrieves the flowcell eland config file, give the base_host_url
    (i.e. http://sub.domain.edu:port)

    Writes the GERALD config to options.output_filepath (or stdout) and
    optionally writes a demultiplexing sample sheet.
    """
    LOGGER.info('USING OPTIONS:')
    LOGGER.info(u' URL: %s' % (options.url,))
    LOGGER.info(u' OUT: %s' % (options.output_filepath,))
    LOGGER.info(u' FC: %s' % (options.flowcell,))
    #LOGGER.info(': %s' % (options.genome_dir,))
    LOGGER.info(u'post_run: %s' % ( unicode(options.post_run),))

    flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)

    LOGGER.debug('genome_dir: %s' % ( options.genome_dir, ))
    available_genomes = getAvailableGenomes(options.genome_dir)
    genome_map = constructMapperDict(available_genomes)
    LOGGER.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))

    config = format_gerald_config(options, flowcell_info, genome_map)

    if options.output_filepath is not None:
        outstream = open(options.output_filepath, 'w')
        # NOTE(review): uses the root logger (logging.info) here instead of
        # the module LOGGER used everywhere else -- inconsistent.
        logging.info('Writing config file to %s' % (options.output_filepath,))
    # (elided else: presumably precedes the next line)
        outstream = sys.stdout

    outstream.write(config)
    # NOTE(review): the file handle is not closed in the visible lines.

    if options.sample_sheet is None:
        # (elided body; presumably pass)
    elif options.sample_sheet == '-':
        save_sample_sheet(sys.stdout, options, flowcell_info)
    # (elided else: presumably precedes the next line)
        stream = open(options.sample_sheet,'w')
        save_sample_sheet(stream, options, flowcell_info)
def save_sample_sheet(outstream, options, flowcell_info):
    # Write an Illumina demultiplexing sample sheet (CSV) to outstream.
    sample_sheet_fields = ['FCID', 'Lane', 'SampleID', 'SampleRef', 'Index',
                           'Description', 'Control', 'Recipe', 'Operator',
                           # NOTE(review): the list's continuation/closing line
                           # is elided in this view (likely 'SampleProject',
                           # which format_pooled_libraries sets).
    # maps Illumina column name -> htsw field name, or a callable taking
    # (options, flowcell_info, sample) and returning the column value
    illumina_to_htsw_map = {'FCID': 'flowcell',
                            'Lane': 'lane_number',
                            'SampleID': 'library_id',
                            'SampleRef': format_sampleref,
                            'Description': 'library_name',
                            'Control': format_control_lane,
                            'Recipe': format_recipe_name,
                            'Operator': format_operator_name}
    out = csv.DictWriter(outstream, sample_sheet_fields)
    # header row: each field name mapped to itself
    out.writerow(dict(((x,x) for x in sample_sheet_fields)))
    for lane_number in LANE_LIST:
        # lane_set keys are strings (JSON) while LANE_LIST holds ints
        lane_contents = flowcell_info['lane_set'][str(lane_number)]

        pooled_lane_contents = []
        for library in lane_contents:
            # build common attributes
            # (elided: the renamed = {} initialization, presumably)
            for illumina_name in sample_sheet_fields:
                htsw_field = illumina_to_htsw_map.get(illumina_name, None)
                if htsw_field is None:
                    # (elided body; presumably continue)
                if callable(htsw_field):
                    renamed[illumina_name] = htsw_field(options,
                    # (elided: remaining call arguments, presumably
                    # flowcell_info and the library dict)
                # (elided else: presumably precedes the next line)
                    renamed[illumina_name] = library[htsw_field]

            # one library may expand into several rows (one per index)
            pooled_lane_contents.extend(format_pooled_libraries(renamed, library))

        for row in pooled_lane_contents:
            # (elided body; presumably out.writerow(row))
def format_sampleref(options, flowcell_info, sample):
    """SampleRef column value: the species name with spaces turned into underscores."""
    species = sample['library_species']
    return species.replace(' ', '_')
def format_control_lane(options, flowcell_info, sample):
    # Control column value: marks whether this sample sits on the flowcell's
    # designated control lane.
    if sample['lane_number'] == flowcell_info['control_lane']:
        # NOTE(review): the return values for both branches are elided in
        # this view (Illumina sheets conventionally use Y/N -- confirm).
def format_recipe_name(options, flowcell_info, sample):
    """Recipe column value: the --recipe option, identical for every sample."""
    recipe = options.recipe
    return recipe
def format_operator_name(options, flowcell_info, sample):
    """Operator column value: the --operator option, identical for every sample."""
    operator = options.operator
    return operator
def format_pooled_libraries(shared, library):
    # Expand one library's shared row into per-index sample sheet rows.
    # 'shared' holds the columns common to every row for this library.
    sequences = library.get('index_sequence', None)
    if sequences is None:
        # (elided body; presumably returns no rows)
    elif (type(sequences) in types.StringTypes and
          sequences.lower().startswith('err')):
        # index lookup failed server-side; fall back to the bare library id
        # (elided line)
        shared['SampleProject'] = library['library_id']
        # (elided; presumably returns the single shared row)
    elif (type(sequences) == types.DictType):
        # one row per multiplex index, sorted alphanumerically
        multiplex_ids = sequences.keys()
        multiplex_ids.sort(cmp=alphanum.alphanum)  # Python 2 only: cmp= sort
        for multiplex_id in multiplex_ids:
            # (elided: the sample dict initialization, presumably)
            sample.update(shared)
            sample['Index'] = sequences[multiplex_id]
            sample['SampleProject'] = format_project_name(library,
            # (elided: remaining call argument, presumably multiplex_id)
            pooled.append(sample)
        # (elided; presumably returns pooled)
    # (elided else: presumably precedes the next line)
        raise RuntimeError("Unrecognized index type")
def format_project_name(library, multiplex_id):
    """SampleProject column value: "<library_id>_index<multiplex_id>"."""
    return "{0}_index{1}".format(library['library_id'], multiplex_id)