Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow
[htsworkflow.git] / htsworkflow / pipelines / retrieve_config.py
1 #!/usr/bin/env python
2
3 import csv
4 from ConfigParser import RawConfigParser
5 import logging
6 from optparse import OptionParser, IndentedHelpFormatter
7 import os
8 import sys
9 import types
10 import urllib
11 import urllib2
12
13 try:
14     import json
15 except ImportError, e:
16     import simplejson as json
17
18 from htsworkflow.frontend.auth import apidata
19 from htsworkflow.util import api
20 from htsworkflow.util.url import normalize_url
21 from htsworkflow.pipelines.genome_mapper import \
22      getAvailableGenomes, \
23      constructMapperDict
24 from htsworkflow.pipelines.runfolder import LANE_LIST
25 # JSON dictionaries use strings
26 LANE_LIST_JSON = [ str(l) for l in LANE_LIST ]
27
LOGGER = logging.getLogger(__name__)

# Fixed typo: was "restructredtext", which doc tools do not recognize.
__docformat__ = "restructuredtext en"

# Configuration file locations: the system file is read first, then the
# user file, which overrides it (see constructConfigParser).
CONFIG_SYSTEM = '/etc/htsworkflow.ini'
CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini')
GERALD_CONFIG_SECTION = 'gerald_config'

#Disable or enable commandline arg parsing; disabled by default.
DISABLE_CMDLINE = True

class FlowCellNotFound(Exception):
    """Raised when the frontend has no usable data for the requested flowcell."""
    pass

class WebError404(Exception):
    """Raised for a 404 response from the frontend web service."""
    pass
41
42 def retrieve_flowcell_info(base_host_url, flowcell):
43     """
44     Return a dictionary describing a
45     """
46     url = api.flowcell_url(base_host_url, flowcell)
47
48     try:
49         apipayload = urllib.urlencode(apidata)
50         web = urllib2.urlopen(url, apipayload)
51     except urllib2.URLError, e:
52         errmsg = 'URLError: %d %s' % (e.code, e.msg)
53         LOGGER.error(errmsg)
54         LOGGER.error('opened %s' % (url,))
55         raise IOError(errmsg)
56
57     contents = web.read()
58     headers = web.info()
59
60     if web.code == 403:
61         msg = "403 - Forbbidden, probably need api key"
62         raise FlowCellNotFound(msg)
63
64     if web.code == 404:
65         msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
66               "Did you get right port #?" % (flowcell, base_host_url, url)
67         raise FlowCellNotFound(msg)
68
69     if len(contents) == 0:
70         msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
71         raise FlowCellNotFound(msg)
72
73     data = json.loads(contents)
74     return data
75
def is_sequencing(lane_info):
    """
    Return True when the lane is sequencing-only, i.e. its experiment
    type requires no follow-up (alignment) analysis.
    """
    return lane_info['experiment_type'] in ('De Novo', 'Whole Genome')
84
def group_lane_parameters(flowcell_info):
    """
    Group lanes that can share a single GERALD configuration block.

    Lanes are grouped by (read length, library species, sequencing-only
    flag); returns a dictionary mapping that tuple to a list of lane
    numbers.
    """
    groups = {}
    for number, contents in flowcell_info['lane_set'].items():
        for info in contents:
            key = (info['read_length'],
                   info['library_species'],
                   is_sequencing(info))
            if key not in groups:
                groups[key] = []
            groups[key].append(number)
    return groups
99
def format_gerald_header(flowcell_info):
    """
    Generate a comment block describing the contents of the flowcell.
    """
    # The lines are joined with '\n# ', which supplies the comment marker
    # for every line but the first; the first line carries its own '#'.
    lines = ['# FLOWCELL: %s' % (flowcell_info['flowcell_id'])]
    lines.append('')
    lines.append('CONTROL-LANE: %s' % (flowcell_info['control_lane'],))
    lines.append('')
    lines.append('Flowcell Notes:')
    lines.extend(flowcell_info['notes'].split('\r\n'))
    lines.append('')
    for lane_number in LANE_LIST_JSON:
        for lane_info in flowcell_info['lane_set'][lane_number]:
            lines.append('Lane%s: %s | %s' % (lane_number,
                                              lane_info['library_id'],
                                              lane_info['library_name']))
    lines.append('')
    return "\n# ".join(lines)
122
def format_gerald_config(options, flowcell_info, genome_map):
    """
    Generate the text of a GERALD config file for a flowcell.

    :param options: combined options (post_run/runfolder are consulted)
    :param flowcell_info: dictionary from retrieve_flowcell_info
    :param genome_map: maps species names to genome directories
    """
    paired = flowcell_info['paired_end']
    # Analysis name suffixes: "_pair" on a paired end run, otherwise
    # "_extended" for eland and nothing for sequencing.
    analysis_suffix = {False: "_extended", True: "_pair"}[paired]
    sequence_suffix = {False: "", True: "_pair"}[paired]

    # Start with a helpful comment header describing the flowcell,
    # e.g. which lane holds which library.
    config = [format_gerald_header(flowcell_info)]
    config.append('SEQUENCE_FORMAT --fastq')
    config.append('ELAND_SET_SIZE 20')
    config.append('12345678:WITH_SEQUENCE true')

    for group_key, lanes in group_lane_parameters(flowcell_info).items():
        # group_key is the tuple built by group_lane_parameters.
        # (local renamed from is_sequencing to avoid shadowing the
        # module-level function of the same name)
        read_length, species, sequencing_only = group_key
        lanes.sort()
        prefix = u"".join(lanes)

        genome_path = genome_map.get(species, None)
        LOGGER.debug("Looked for genome '%s' got location '%s'" % (species, genome_path))
        if genome_path is None and not sequencing_only:
            no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
            LOGGER.warning(no_genome_msg % (lanes, species))
            sequencing_only = True

        if sequencing_only:
            config.append('%s:ANALYSIS sequence%s' % (prefix, sequence_suffix))
        else:
            config.append('%s:ANALYSIS eland%s' % (prefix, analysis_suffix))
            config.append('%s:ELAND_GENOME %s' % (prefix, genome_path))
        config.append('%s:USE_BASES Y%s' % (prefix, read_length))

    # Optionally run a script after the analysis finishes.
    if not (options.post_run is None or options.runfolder is None):
        runfolder = os.path.abspath(options.runfolder)
        post_run = options.post_run % {'runfolder': runfolder}
        config.append('POST_RUN_COMMAND %s' % (post_run,))

    config.append('')  # force trailing newline

    return "\n".join(config)
171
class DummyOptions:
    """
    Stand-in for parsed optparse options, used when command line parsing
    is disabled (the default; see DISABLE_CMDLINE).

    Mirrors the attributes constructOptionParser would produce, with
    "not specified" defaults so the config-file merge in
    getCombinedOptions can fill them in.
    """
    def __init__(self):
        self.url = None
        self.output_filepath = None
        self.flowcell = None
        self.genome_dir = None
        # The attributes below are read by getCombinedOptions and
        # saveConfigFile; previously they were missing, so a
        # DummyOptions instance raised AttributeError as soon as those
        # functions touched them.
        self.runfolder = None
        self.sample_sheet = None
        self.operator = ''
        self.recipe = 'Unknown'
        self.verbose = False
181
class PreformattedDescriptionFormatter(IndentedHelpFormatter):
    """Help formatter that passes a preformatted epilog through verbatim."""

    def format_epilog(self, epilog):
        """
        Return the epilog wrapped in blank lines instead of re-wrapping it.

        optparse's default formatter re-flows the epilog text, which would
        destroy our carefully indented example config; override it so the
        text is emitted exactly as written.
        """
        if not epilog:
            return ""
        return "\n" + epilog + "\n"
200
201
def constructOptionParser():
    """
    Build and return the OptionParser for this tool's command line.
    """
    parser = OptionParser(formatter=PreformattedDescriptionFormatter())
    parser.set_description(
        'Retrieves eland config file from hts_frontend web frontend.')

    # Preformatted epilog; PreformattedDescriptionFormatter keeps
    # optparse from re-wrapping it.
    parser.epilog = """
Config File:
  * %s (System wide)
  * %s (User specific; overrides system)
  * command line overrides all config file options

  Example Config File:

    [%s]
    config_host: http://somewhere.domain:port
    genome_dir: /path to search for genomes
    post_run: runfolder -o <destdir> %%(runfolder)s

""" % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)

    parser.add_option('-u', '--url',
                      action='store', type='string', dest='url')
    parser.add_option('-o', '--output-file',
                      action='store', type='string', dest='output_filepath',
                      help='config file destination. If runfolder is specified defaults '
                           'to <runfolder>/config-auto.txt')
    parser.add_option('-f', '--flowcell',
                      action='store', type='string', dest='flowcell')
    parser.add_option('-g', '--genome_dir',
                      action='store', type='string', dest='genome_dir')
    parser.add_option('-r', '--runfolder',
                      action='store', type='string',
                      help='specify runfolder for post_run command ')
    parser.add_option('--sample-sheet', default=None,
                      help='path to save demultiplexing sample sheet')
    parser.add_option('--operator', default='',
                      help='Name of sequencer operator')
    parser.add_option('--recipe', default='Unknown',
                      help='specify recipe name')
    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='increase logging verbosity')
    return parser
256
def constructConfigParser():
    """
    Return a RawConfigParser pre-loaded from the system-wide and user
    config files, guaranteed to contain the gerald_config section.
    """
    config = RawConfigParser()
    # User settings are read second and so override system settings.
    config.read([CONFIG_SYSTEM, CONFIG_USER])
    if not config.has_section(GERALD_CONFIG_SECTION):
        config.add_section(GERALD_CONFIG_SECTION)
    return config
267
268
def getCombinedOptions(argv=None):
    """
    Return option values merged from the command line and config files.

    Command line values win; the config files only fill in values the
    command line left unset.

    :param argv: command line argument list, or None to skip command
                 line parsing entirely (DummyOptions defaults are used).
    """
    cl_parser = constructOptionParser()
    conf_parser = constructConfigParser()

    if argv is None:
        options = DummyOptions()
    else:
        options, args = cl_parser.parse_args(argv)

    def lookup(name):
        # Fetch a value from the gerald_config section, or None if the
        # option is absent.
        if conf_parser.has_option(GERALD_CONFIG_SECTION, name):
            return conf_parser.get(GERALD_CONFIG_SECTION, name)
        return None

    if options.url is None:
        options.url = lookup('config_host')

    options.url = normalize_url(options.url)

    if options.genome_dir is None:
        options.genome_dir = lookup('genome_dir')

    options.post_run = lookup('post_run')

    # Default the output path into the runfolder when one was given.
    if options.output_filepath is None and options.runfolder is not None:
        options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')

    return options
304
305
def saveConfigFile(options):
    """
    Retrieve the flowcell's GERALD config file from the frontend, given
    the base_host_url (i.e. http://sub.domain.edu:port), and write it to
    options.output_filepath (or stdout). Optionally also writes a
    demultiplexing sample sheet (options.sample_sheet).
    """
    LOGGER.info('USING OPTIONS:')
    LOGGER.info(u'     URL: %s' % (options.url,))
    LOGGER.info(u'     OUT: %s' % (options.output_filepath,))
    LOGGER.info(u'      FC: %s' % (options.flowcell,))
    LOGGER.info(u'post_run: %s' % ( unicode(options.post_run),))

    flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)

    LOGGER.debug('genome_dir: %s' % ( options.genome_dir, ))
    available_genomes = getAvailableGenomes(options.genome_dir)
    genome_map = constructMapperDict(available_genomes)
    LOGGER.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))

    config = format_gerald_config(options, flowcell_info, genome_map)

    if options.output_filepath is not None:
        # Was logging.info: use the module-level logger for consistency.
        LOGGER.info('Writing config file to %s' % (options.output_filepath,))
        outstream = open(options.output_filepath, 'w')
        # Close the file we opened; previously it was leaked.
        try:
            outstream.write(config)
        finally:
            outstream.close()
    else:
        sys.stdout.write(config)

    if options.sample_sheet is None:
        pass
    elif options.sample_sheet == '-':
        save_sample_sheet(sys.stdout, options, flowcell_info)
    else:
        stream = open(options.sample_sheet, 'w')
        # Same leak fix for the sample sheet stream.
        try:
            save_sample_sheet(stream, options, flowcell_info)
        finally:
            stream.close()
343
def save_sample_sheet(outstream, options, flowcell_info):
    """
    Write an Illumina demultiplexing sample sheet for a flowcell.

    :param outstream: open file-like object to receive CSV rows
    :param options: combined options (recipe and operator are used)
    :param flowcell_info: dictionary from retrieve_flowcell_info
    """
    fields = ['FCID', 'Lane', 'SampleID', 'SampleRef', 'Index',
              'Description', 'Control', 'Recipe', 'Operator',
              'SampleProject']
    # Map each Illumina column to either an htsworkflow attribute name
    # or a formatting callable.  Columns absent from this map (Index,
    # SampleProject) are filled in later by format_pooled_libraries.
    column_sources = {'FCID': 'flowcell',
                      'Lane': 'lane_number',
                      'SampleID': 'library_id',
                      'SampleRef': format_sampleref,
                      'Description': 'library_name',
                      'Control': format_control_lane,
                      'Recipe': format_recipe_name,
                      'Operator': format_operator_name}
    writer = csv.DictWriter(outstream, fields)
    # Header row: each column name mapped to itself.
    writer.writerow(dict(((name, name) for name in fields)))
    for lane_number in LANE_LIST:
        lane_rows = []
        for library in flowcell_info['lane_set'][str(lane_number)]:
            # Build the attributes shared by every row for this library.
            shared = {}
            for column in fields:
                source = column_sources.get(column, None)
                if source is None:
                    continue
                if callable(source):
                    shared[column] = source(options, flowcell_info, library)
                else:
                    shared[column] = library[source]

            # Expand into one row per multiplex index.
            lane_rows.extend(format_pooled_libraries(shared, library))

        for row in lane_rows:
            writer.writerow(row)
380
381
def format_sampleref(options, flowcell_info, sample):
    """Return the sample's species name with spaces replaced by underscores."""
    species = sample['library_species']
    return species.replace(' ', '_')
384
385
def format_control_lane(options, flowcell_info, sample):
    """Return 'Y' when the sample sits in the flowcell's control lane, else 'N'."""
    is_control = sample['lane_number'] == flowcell_info['control_lane']
    return 'Y' if is_control else 'N'
391
392
def format_recipe_name(options, flowcell_info, sample):
    """Return the recipe name supplied on the command line (or its default)."""
    return options.recipe
395
396
def format_operator_name(options, flowcell_info, sample):
    """Return the sequencer operator's name supplied on the command line."""
    return options.operator
399
400
def format_pooled_libraries(shared, library):
    """
    Expand a library's shared sample-sheet fields into one row per index.

    :param shared: dict of sample-sheet columns common to this library
      (built by save_sample_sheet); mutated in place in the error case.
    :param library: library dictionary, whose 'index_sequence' may be
      absent/None, an error string, or a dict of multiplex id -> sequence.
    :return: list of row dictionaries (possibly empty).
    :raises RuntimeError: if 'index_sequence' is some other type.
    """
    sequences = library.get('index_sequence', None)
    if sequences is None:
        # No index information at all: contribute no rows.
        return []
    elif (type(sequences) in types.StringTypes and
          sequences.lower().startswith('err')):
        # The frontend reported an error string (e.g. "Err...") instead
        # of sequences; emit a single unindexed row for the library.
        shared['Index'] = ''
        shared['SampleProject'] = library['library_id']
        return [shared]
    elif (type(sequences) == types.DictType):
        # Pooled library: one row per multiplex index, sorted numerically.
        pooled = []
        multiplex_ids = sequences.keys()
        multiplex_ids.sort(key=int)
        for multiplex_id in multiplex_ids:
            sample = {}
            sample.update(shared)
            sample['Index'] = sequences[multiplex_id]
            sample['SampleProject'] = format_project_name(library,
                                                          multiplex_id)
            pooled.append(sample)
        return pooled
    else:
        raise RuntimeError("Unrecognized index type")
424
425
426
def format_project_name(library, multiplex_id):
    """Return the SampleProject name '<library_id>_index<multiplex_id>'."""
    return "%s_index%s" % (library['library_id'], multiplex_id)
430
431