Initial port to python3
[htsworkflow.git] / htsworkflow / pipelines / retrieve_config.py
1 #!/usr/bin/env python
2
3 import csv
4 from configparser import RawConfigParser
5 import logging
6 from optparse import OptionParser, IndentedHelpFormatter
7 import os
8 import sys
9 import types
10 import urllib.request, urllib.parse, urllib.error
11 import urllib.request, urllib.error, urllib.parse
12 import collections
13
14 try:
15     import json
16 except ImportError as e:
17     import simplejson as json
18
19 from htsworkflow.frontend.auth import apidata
20 from htsworkflow.util import api
21 from htsworkflow.util import alphanum
22 from htsworkflow.util.url import normalize_url
23 from htsworkflow.pipelines.genome_mapper import \
24      getAvailableGenomes, \
25      constructMapperDict
26 from htsworkflow.pipelines import LANE_LIST
# JSON dictionaries use strings, so keep a string version of every lane
# number for indexing flowcell_info['lane_set'].
LANE_LIST_JSON = [ str(l) for l in LANE_LIST ]

# Module level logger shared by the functions in this file.
LOGGER = logging.getLogger(__name__)

__docformat__ = "restructredtext en"

# Config files read by constructConfigParser(), system wide file first.
CONFIG_SYSTEM = '/etc/htsworkflow.ini'
CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini')
# Section of the config files holding config_host, genome_dir and post_run.
GERALD_CONFIG_SECTION = 'gerald_config'

#Disable or enable commandline arg parsing; disabled by default.
# NOTE(review): this flag is not referenced by any code shown in this file.
DISABLE_CMDLINE = True
40
class FlowCellNotFound(Exception):
    """
    Raised by retrieve_flowcell_info() when the server refuses the request,
    reports the flowcell as missing, or returns an empty response.
    """
class WebError404(Exception):
    """
    Exception named after the HTTP 404 status.

    NOTE(review): nothing in the code shown in this file raises it.
    """
43
def retrieve_flowcell_info(base_host_url, flowcell):
    """
    Return a dictionary describing a flowcell, as reported by the web frontend.

    The request url is built with api.flowcell_url() and the url-encoded
    apidata is sent as the POST body; the response body is decoded as JSON.

    :param base_host_url: root url of the server (i.e. http://sub.domain.edu:port)
    :param flowcell: identifier of the flowcell to look up
    :return: the decoded JSON response
    :raises IOError: when the request fails (connection problem or HTTP error)
    :raises FlowCellNotFound: on a 403/404 status code or an empty response
    """
    url = api.flowcell_url(base_host_url, flowcell)

    # python3's urlopen() refuses str as POST data, it has to be bytes
    apipayload = urllib.parse.urlencode(apidata).encode('utf-8')
    try:
        web = urllib.request.urlopen(url, apipayload)
    except urllib.error.URLError as e:
        # only HTTPError carries code/msg, a plain URLError just has a reason
        if isinstance(e, urllib.error.HTTPError):
            errmsg = 'URLError: %d %s' % (e.code, e.msg)
        else:
            errmsg = 'URLError: %s' % (e.reason,)
        LOGGER.error(errmsg)
        LOGGER.error('opened %s' % (url,))
        raise IOError(errmsg) from e

    # always release the connection, even if reading the body fails
    try:
        contents = web.read()
        status = web.code
    finally:
        web.close()

    if status == 403:
        msg = "403 - Forbbidden, probably need api key"
        raise FlowCellNotFound(msg)

    if status == 404:
        msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
              "Did you get right port #?" % (flowcell, base_host_url, url)
        raise FlowCellNotFound(msg)

    if len(contents) == 0:
        msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
        raise FlowCellNotFound(msg)

    return json.loads(contents)
77
def is_sequencing(lane_info):
    """
    Tell whether a lane is only being sequenced, with no follow-up analysis.

    That is the case when the lane's 'experiment_type' is either
    'De Novo' or 'Whole Genome'.
    """
    sequence_only_types = ('De Novo', 'Whole Genome')
    return lane_info['experiment_type'] in sequence_only_types
86
def group_lane_parameters(flowcell_info):
    """
    Group lanes that can share GERALD configuration blocks.

    Lanes end up in the same group when they agree on read length,
    species and on whether they are sequencing-only (see is_sequencing).

    Returns a dictionary mapping (read_length, library_species, is_sequencing)
    to the list of lane numbers in that group; a lane number is added once
    for every library found in the lane.
    """
    groups = {}
    for lane_number, libraries in flowcell_info['lane_set'].items():
        for library in libraries:
            key = (library['read_length'],
                   library['library_species'],
                   is_sequencing(library))
            if key not in groups:
                groups[key] = []
            groups[key].append(lane_number)
    return groups
101
def format_gerald_header(flowcell_info):
    """
    Build the comment block describing the contents of the flowcell.

    The block lists the flowcell id, the control lane, the flowcell notes
    and one 'LaneN: library_id | library_name' entry per library.
    """
    # The lines are glued together with '\n# ', which comments out every
    # line but the first; that one has to carry its own leading '# '.
    header = [
        '# FLOWCELL: %s' % (flowcell_info['flowcell_id']),
        '',
        'CONTROL-LANE: %s' % (flowcell_info['control_lane'],),
        '',
        'Flowcell Notes:',
    ]
    header.extend(flowcell_info['notes'].split('\r\n'))
    header.append('')
    for lane_number in LANE_LIST_JSON:
        header.extend(
            'Lane%s: %s | %s' % (lane_number,
                                 lane_info['library_id'],
                                 lane_info['library_name'])
            for lane_info in flowcell_info['lane_set'][lane_number])

    header.append('')
    return "\n# ".join(header)
124
def format_gerald_config(options, flowcell_info, genome_map):
    """
    Generate the text of a GERALD config file.

    options supplies post_run and runfolder for the optional
    POST_RUN_COMMAND line, flowcell_info is the flowcell description and
    genome_map maps a species name to the location of its genome.
    """
    # Start with the human readable description of the flowcell (which
    # lane holds which library) followed by the fixed settings.
    lines = [
        format_gerald_header(flowcell_info),
        'SEQUENCE_FORMAT --fastq',
        'ELAND_SET_SIZE 20',
        '12345678:WITH_SEQUENCE true',
    ]

    # paired end runs get _pair, single ended runs _extended or nothing
    eland_suffix = {False: "_extended", True: "_pair"}[flowcell_info['paired_end']]
    sequence_suffix = {False: "", True: "_pair"}[flowcell_info['paired_end']]

    for group_key, lanes in group_lane_parameters(flowcell_info).items():
        # group_key is the tuple built by group_lane_parameters
        read_length, species, sequence_only = group_key
        lanes.sort()
        lane_prefix = "".join(lanes)

        species_path = genome_map.get(species)
        LOGGER.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
        if species_path is None and not sequence_only:
            # eland can't run without a genome, fall back to plain sequencing
            no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
            LOGGER.warning(no_genome_msg % (lanes, species))
            sequence_only = True

        if sequence_only:
            lines.append('%s:ANALYSIS sequence%s' % (lane_prefix, sequence_suffix))
        else:
            lines.append('%s:ANALYSIS eland%s' % (lane_prefix, eland_suffix))
            lines.append('%s:ELAND_GENOME %s' % (lane_prefix, species_path))
        lines.append('%s:USE_BASES Y%s' % (lane_prefix, read_length))

    # optionally run a script once the analysis has finished
    if options.post_run is not None and options.runfolder is not None:
        runfolder = os.path.abspath(options.runfolder)
        post_run = options.post_run % {'runfolder': runfolder}
        lines.append('POST_RUN_COMMAND %s' % (post_run,))

    lines.append('')  # force trailing newline

    return "\n".join(lines)
173
class DummyOptions:
    """
    Stand-in for the parsed command line options, used when command line
    parsing is disabled (getCombinedOptions() called without argv).

    It carries every attribute the option parser would provide, set to the
    same defaults, so the code consuming the options does not fail with an
    AttributeError.
    """
    def __init__(self):
        self.url = None
        self.output_filepath = None
        self.flowcell = None
        self.genome_dir = None
        # Defaults of the remaining options from constructOptionParser();
        # getCombinedOptions(), saveConfigFile() and the sample sheet
        # formatters read these attributes.
        self.runfolder = None
        self.sample_sheet = None
        self.operator = ''
        self.recipe = 'Unknown'
        self.verbose = False
183
class PreformattedDescriptionFormatter(IndentedHelpFormatter):
    """
    Help formatter that leaves a preformatted epilog alone.
    """

    def format_epilog(self, epilog):
        """
        Return the epilog exactly as written, wrapped in a newline on each
        side, so its hand made layout survives; an empty or missing epilog
        gives the empty string.
        """
        return "\n" + epilog + "\n" if epilog else ""
202
203
def constructOptionParser():
    """
    Create the command line parser with all of its options registered.

    The epilog documents the config files and is printed as is thanks to
    PreformattedDescriptionFormatter.
    """
    epilog = """
Config File:
  * %s (System wide)
  * %s (User specific; overrides system)
  * command line overrides all config file options

  Example Config File:

    [%s]
    config_host: http://somewhere.domain:port
    genome_dir: /path to search for genomes
    post_run: runfolder -o <destdir> %%(runfolder)s

""" % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)

    parser = OptionParser(
        description='Retrieves eland config file from hts_frontend web frontend.',
        epilog=epilog,
        formatter=PreformattedDescriptionFormatter())

    # (flags, keyword arguments) in the order they show up in the help text
    option_table = (
        (("-u", "--url"),
         dict(action="store", type="string", dest="url")),
        (("-o", "--output-file"),
         dict(action="store", type="string", dest="output_filepath",
              help="config file destination. If runfolder is specified defaults "
                   "to <runfolder>/config-auto.txt")),
        (("-f", "--flowcell"),
         dict(action="store", type="string", dest="flowcell")),
        (("-g", "--genome_dir"),
         dict(action="store", type="string", dest="genome_dir")),
        (("-r", "--runfolder"),
         dict(action="store", type="string",
              help="specify runfolder for post_run command ")),
        (("--sample-sheet",),
         dict(default=None,
              help="path to save demultiplexing sample sheet")),
        (("--operator",),
         dict(default='', help="Name of sequencer operator")),
        (("--recipe",),
         dict(default="Unknown", help="specify recipe name")),
        (('-v', '--verbose'),
         dict(action='store_true', default=False,
              help='increase logging verbosity')),
    )
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    return parser
258
def constructConfigParser():
    """
    Create a config parser loaded from the system and user config files.

    The gerald config section is added when neither file defines it, so
    the section can always be queried.
    """
    config = RawConfigParser()
    config.read([CONFIG_SYSTEM, CONFIG_USER])

    if GERALD_CONFIG_SECTION not in config.sections():
        config.add_section(GERALD_CONFIG_SECTION)
    return config
269
270
def getCombinedOptions(argv=None):
    """
    Return the options after merging the config files with the command line.

    Values given on the command line win; url and genome_dir fall back to
    config_host and genome_dir of the config files. post_run always comes
    from the config files (None when it is not configured).

    :param argv: command line arguments to parse, or None to skip command
        line parsing and start from DummyOptions
    :return: the options object, with url normalized and output_filepath
        defaulting to <runfolder>/config-auto.txt when a runfolder is known
    """
    cl_parser = constructOptionParser()
    conf_parser = constructConfigParser()

    def from_config(name):
        # value of an entry in the gerald config section, None if missing
        if conf_parser.has_option(GERALD_CONFIG_SECTION, name):
            return conf_parser.get(GERALD_CONFIG_SECTION, name)
        return None

    if argv is None:
        options = DummyOptions()
    else:
        options, _ = cl_parser.parse_args(argv)

    if options.url is None:
        options.url = from_config('config_host')

    options.url = normalize_url(options.url)

    if options.genome_dir is None:
        options.genome_dir = from_config('genome_dir')

    options.post_run = from_config('post_run')

    if options.output_filepath is None:
        # an options object without a runfolder attribute counts as
        # "no runfolder given" instead of raising AttributeError
        runfolder = getattr(options, 'runfolder', None)
        if runfolder is not None:
            options.output_filepath = os.path.join(runfolder, 'config-auto.txt')

    return options
306
307
def saveConfigFile(options):
    """
    Retrieve the description of options.flowcell from options.url
    (i.e. http://sub.domain.edu:port) and save the eland config file.

    The config goes to options.output_filepath, or to stdout when no path
    is set. When options.sample_sheet is set the demultiplexing sample
    sheet is written as well, '-' meaning stdout.
    """
    LOGGER.info('USING OPTIONS:')
    LOGGER.info('     URL: %s' % (options.url,))
    LOGGER.info('     OUT: %s' % (options.output_filepath,))
    LOGGER.info('      FC: %s' % (options.flowcell,))
    LOGGER.info('post_run: %s' % ( str(options.post_run),))

    flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)

    LOGGER.debug('genome_dir: %s' % ( options.genome_dir, ))
    available_genomes = getAvailableGenomes(options.genome_dir)
    genome_map = constructMapperDict(available_genomes)
    LOGGER.debug('available genomes: %s' % ( str( list(genome_map.keys()) ),))

    config = format_gerald_config(options, flowcell_info, genome_map)

    if options.output_filepath is not None:
        LOGGER.info('Writing config file to %s' % (options.output_filepath,))
        # the with block makes sure the file is flushed and closed
        with open(options.output_filepath, 'w') as outstream:
            outstream.write(config)
    else:
        sys.stdout.write(config)

    # tolerate option objects that have no sample_sheet attribute
    sample_sheet = getattr(options, 'sample_sheet', None)
    if sample_sheet is None:
        pass
    elif sample_sheet == '-':
        save_sample_sheet(sys.stdout, options, flowcell_info)
    else:
        # the csv module wants newline='' so it controls the line endings
        with open(sample_sheet, 'w', newline='') as stream:
            save_sample_sheet(stream, options, flowcell_info)
344
345
def save_sample_sheet(outstream, options, flowcell_info):
    """
    Write the demultiplexing sample sheet for a flowcell as CSV.

    :param outstream: writable text stream receiving the CSV
    :param options: options object handed to the formatter functions
    :param flowcell_info: flowcell description; its 'lane_set' is read for
        every lane number in LANE_LIST
    """
    sample_sheet_fields = ['FCID', 'Lane', 'SampleID', 'SampleRef', 'Index',
                           'Description', 'Control', 'Recipe', 'Operator',
                           'SampleProject']
    # A column is filled either from the library attribute of that name or
    # by calling a formatter. 'Index' and 'SampleProject' are not listed,
    # they are filled in by format_pooled_libraries.
    illumina_to_htsw_map = {'FCID': 'flowcell',
                            'Lane': 'lane_number',
                            'SampleID': 'library_id',
                            'SampleRef': format_sampleref,
                            'Description': 'library_name',
                            'Control': format_control_lane,
                            'Recipe': format_recipe_name,
                            'Operator': format_operator_name}
    out = csv.DictWriter(outstream, sample_sheet_fields)
    out.writeheader()
    for lane_number in LANE_LIST:
        lane_contents = flowcell_info['lane_set'][str(lane_number)]

        pooled_lane_contents = []
        for library in lane_contents:
            # build the attributes shared by all rows of this library
            renamed = {}
            for illumina_name in sample_sheet_fields:
                htsw_field = illumina_to_htsw_map.get(illumina_name)
                if htsw_field is None:
                    continue
                # collections.Callable is gone since python 3.10, the
                # builtin callable() is the supported check
                if callable(htsw_field):
                    renamed[illumina_name] = htsw_field(options,
                                                        flowcell_info,
                                                        library)
                else:
                    renamed[illumina_name] = library[htsw_field]

            pooled_lane_contents.extend(format_pooled_libraries(renamed, library))

        out.writerows(pooled_lane_contents)
382
383
def format_sampleref(options, flowcell_info, sample):
    """
    Return the sample's species name with every space turned into an
    underscore. options and flowcell_info are unused; they are part of the
    common formatter signature.
    """
    words = sample['library_species'].split(' ')
    return '_'.join(words)
386
387
def format_control_lane(options, flowcell_info, sample):
    """
    Return 'Y' when the sample's lane_number equals the flowcell's
    control_lane, 'N' otherwise. options is unused.
    """
    is_control = sample['lane_number'] == flowcell_info['control_lane']
    return 'Y' if is_control else 'N'
393
394
def format_recipe_name(options, flowcell_info, sample):
    """
    Return the recipe name stored on the options; flowcell_info and
    sample are unused.
    """
    recipe = options.recipe
    return recipe
397
398
def format_operator_name(options, flowcell_info, sample):
    """
    Return the operator name stored on the options; flowcell_info and
    sample are unused.
    """
    operator = options.operator
    return operator
401
402
def format_pooled_libraries(shared, library):
    """
    Expand one library into its sample sheet rows.

    :param shared: column values common to all rows of the library; it is
        updated and returned itself in the single row case
    :param library: library description; its optional 'index_sequence'
        decides what is produced
    :return: a list of row dictionaries:
        * empty when the library has no 'index_sequence'
        * the shared row with an empty Index when 'index_sequence' is a
          string starting with 'err' (any case)
        * one row per multiplex id, in alphanum order, when
          'index_sequence' is a dictionary
    :raises RuntimeError: for any other kind of 'index_sequence'
    """
    # function scope import, nothing else in this file uses it
    from functools import cmp_to_key

    sequences = library.get('index_sequence')
    if sequences is None:
        return []

    if isinstance(sequences, str) and sequences.lower().startswith('err'):
        shared['Index'] = ''
        shared['SampleProject'] = library['library_id']
        return [shared]

    if isinstance(sequences, dict):
        # alphanum.alphanum is a cmp style function, python3 sorting only
        # takes key functions
        multiplex_ids = sorted(sequences, key=cmp_to_key(alphanum.alphanum))
        pooled = []
        for multiplex_id in multiplex_ids:
            sample = dict(shared)
            sample['Index'] = sequences[multiplex_id]
            sample['SampleProject'] = format_project_name(library,
                                                          multiplex_id)
            pooled.append(sample)
        return pooled

    raise RuntimeError("Unrecognized index type")
426
427
428
def format_project_name(library, multiplex_id):
    """
    Return the project name of one multiplexed sample: the library id
    followed by '_index' and the multiplex id.
    """
    return "%s_index%s" % (library['library_id'], multiplex_id)
432
433