4 from configparser import RawConfigParser
6 from optparse import OptionParser, IndentedHelpFormatter
10 import urllib.request, urllib.parse, urllib.error
11 import urllib.request, urllib.error, urllib.parse
16 except ImportError as e:
17 import simplejson as json
19 from htsworkflow.frontend.auth import apidata
20 from htsworkflow.util import api
21 from htsworkflow.util import alphanum
22 from htsworkflow.util.url import normalize_url
23 from htsworkflow.pipelines.genome_mapper import \
24 getAvailableGenomes, \
26 from htsworkflow.pipelines import LANE_LIST
# JSON dictionaries use strings, so keep a string version of every lane number
# for looking lanes up in decoded flowcell data.
LANE_LIST_JSON = [str(lane) for lane in LANE_LIST]

LOGGER = logging.getLogger(__name__)

# Docstring markup used by this module (was misspelled "restructredtext").
__docformat__ = "restructuredtext en"

# Configuration files: system-wide first, then the per-user file.
CONFIG_SYSTEM = '/etc/htsworkflow.ini'
CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini')
# Section of those files that holds the settings read by this module.
GERALD_CONFIG_SECTION = 'gerald_config'

# Disable or enable commandline arg parsing; disabled by default.
DISABLE_CMDLINE = True
class FlowCellNotFound(Exception):
    """Raised when no usable flowcell description could be retrieved."""
class WebError404(Exception):
    """Exception named for HTTP 404 responses.

    NOTE(review): nothing in the visible part of this module raises it.
    """
# retrieve_flowcell_info(base_host_url, flowcell)
#
# Builds the flowcell API URL with api.flowcell_url(), POSTs the url-encoded
# `apidata` payload to it and decodes `contents` with json.loads().
# FlowCellNotFound is raised with the 403, 404 and empty-response messages
# built below.
#
# NOTE(review): this listing skips source lines (embedded numbers 45, 47,
# 49-50, 55, 57-62, 65-66, 70, 74 and 76-77 are absent). The `try:`, the
# assignment of `contents`, the conditions guarding the 403 and 404 branches
# and the return statement are therefore not visible here -- confirm against
# the complete file before changing this function.
44 def retrieve_flowcell_info(base_host_url, flowcell):
46 Return a dictionary describing a
48 url = api.flowcell_url(base_host_url, flowcell)
# NOTE(review): urllib.parse.urlencode() returns str, and Python 3's urlopen()
# rejects str POST data with TypeError; the payload probably needs to be
# encoded to bytes -- confirm.
51 apipayload = urllib.parse.urlencode(apidata)
52 web = urllib.request.urlopen(url, apipayload)
53 except urllib.error.URLError as e:
# NOTE(review): only HTTPError carries .code; a plain URLError (for example a
# refused connection) has .reason instead, so building this message can itself
# raise AttributeError.
54 errmsg = 'URLError: %d %s' % (e.code, e.msg)
56 LOGGER.error('opened %s' % (url,))
63 msg = "403 - Forbbidden, probably need api key"
64 raise FlowCellNotFound(msg)
67 msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
68 "Did you get right port #?" % (flowcell, base_host_url, url)
69 raise FlowCellNotFound(msg)
# An empty response body is treated the same as a missing flowcell.
71 if len(contents) == 0:
72 msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
73 raise FlowCellNotFound(msg)
75 data = json.loads(contents)
def is_sequencing(lane_info):
    """
    Determine if we are just sequencing and not doing any follow-up analysis.

    :param lane_info: dictionary describing one library on a lane; only its
        'experiment_type' entry is read.
    :return: True when the experiment type is 'De Novo' or 'Whole Genome',
        False otherwise.
    :raises KeyError: if lane_info has no 'experiment_type' entry.
    """
    # NOTE(review): the return statements fell on lines this listing dropped.
    # Returning True for the sequencing-only types follows from the docstring
    # and from format_gerald_config(), which picks the alignment branch on
    # `not is_sequencing` -- confirm against the complete file.
    return lane_info['experiment_type'] in ('De Novo', 'Whole Genome')
def group_lane_parameters(flowcell_info):
    """
    Group lanes that can share GERALD configuration blocks.

    (The same species, read length, and eland vs sequencing)

    :param flowcell_info: dictionary whose 'lane_set' entry maps each lane
        number to a list of library dictionaries.
    :return: dictionary mapping a
        (read_length, library_species, is_sequencing) tuple to the list of
        lane numbers that share those settings.
    """
    # NOTE(review): the initialisation and the return were on lines this
    # listing dropped; both are implied by setdefault() below and by the
    # caller iterating over the result with .items().
    lane_groups = {}
    for lane_number, lane_contents in flowcell_info['lane_set'].items():
        for lane_info in lane_contents:
            index = (lane_info['read_length'],
                     lane_info['library_species'],
                     is_sequencing(lane_info))
            # The lane is appended once per library, so a lane carrying
            # several libraries with the same settings is listed repeatedly.
            lane_groups.setdefault(index, []).append(lane_number)
    return lane_groups
# format_gerald_header(flowcell_info)
#
# Builds the comment header placed at the top of a GERALD config file. The
# header is a list of text lines -- flowcell id, control lane, the flowcell
# notes split on '\r\n', and one "Lane<N>: <library_id> | <library_name>"
# entry per library for every lane in LANE_LIST_JSON -- joined with "\n# " and
# returned as one string.
# Reads flowcell_info['flowcell_id'], ['control_lane'], ['notes'] and
# ['lane_set']; a lane from LANE_LIST_JSON that is absent from 'lane_set'
# raises KeyError.
#
# NOTE(review): embedded numbers 103, 105, 109, 111, 114 and 121-122 are
# absent from this listing. Whether the ones between the `config` statements
# are blank or append further entries cannot be told from here.
102 def format_gerald_header(flowcell_info):
104 Generate comment describing the contents of the flowcell
106 # I'm using '\n# ' to join the lines together, that doesn't include the
107 # first element so i needed to put the # in manually
108 config = ['# FLOWCELL: %s' % (flowcell_info['flowcell_id'])]
110 config += ['CONTROL-LANE: %s' % (flowcell_info['control_lane'],)]
112 config += ['Flowcell Notes:']
# Notes are split on Windows-style line endings; each piece becomes its own
# header line.
113 config.extend(flowcell_info['notes'].split('\r\n'))
# LANE_LIST_JSON holds the lane numbers as strings, matching the keys of the
# decoded JSON 'lane_set'.
115 for lane_number in LANE_LIST_JSON:
116 lane_contents = flowcell_info['lane_set'][lane_number]
117 for lane_info in lane_contents:
118 config += ['Lane%s: %s | %s' % (lane_number,
119 lane_info['library_id'],
120 lane_info['library_name'])]
123 return "\n# ".join(config)
# format_gerald_config(options, flowcell_info, genome_map)
#
# Builds the text of a GERALD config file and returns it as one string joined
# with "\n".
#   options       -- object whose .post_run and .runfolder attributes are read
#   flowcell_info -- flowcell dictionary; 'paired_end' is read here and the
#                    whole dictionary is passed to format_gerald_header() and
#                    group_lane_parameters()
#   genome_map    -- mapping queried with .get(species) for a genome location
#
# NOTE(review): embedded numbers 126, 128, 132, 136, 146, 148, 154-156, 158,
# 163, 169 and 171 are absent from this listing. The statements that choose
# between the "sequence" and "eland" entries below fall in those gaps --
# confirm against the complete file.
125 def format_gerald_config(options, flowcell_info, genome_map):
127 Generate a GERALD config file
129 # so we can add nothing or _pair if we're a paired end run
130 eland_analysis_suffix = { False: "_extended", True: "_pair" }
131 sequence_analysis_suffix = { False: "", True: "_pair" }
133 # it's convienent to have helpful information describing the flowcell
134 # in the config file... things like which lane is which library.
135 config = [format_gerald_header(flowcell_info)]
137 config += ['SEQUENCE_FORMAT --fastq']
138 config += ['ELAND_SET_SIZE 20']
139 config += ['12345678:WITH_SEQUENCE true']
# Both suffix tables are keyed by the boolean flowcell_info['paired_end'].
140 analysis_suffix = eland_analysis_suffix[flowcell_info['paired_end']]
141 sequence_suffix = sequence_analysis_suffix[flowcell_info['paired_end']]
142 lane_groups = group_lane_parameters(flowcell_info)
143 for lane_index, lane_numbers in list(lane_groups.items()):
144 # lane_index is return value of group_lane_parameters
# NOTE(review): the local name `is_sequencing` shadows the module-level
# function of the same name for the rest of this function.
145 read_length, species, is_sequencing = lane_index
# "".join() requires the lane numbers to be strings.
147 lane_prefix = "".join(lane_numbers)
149 species_path = genome_map.get(species, None)
150 LOGGER.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
151 if not is_sequencing and species_path is None:
152 no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
153 LOGGER.warning(no_genome_msg % (lane_numbers, species))
157 config += ['%s:ANALYSIS sequence%s' % (lane_prefix, sequence_suffix)]
159 config += ['%s:ANALYSIS eland%s' % (lane_prefix, analysis_suffix)]
160 config += ['%s:ELAND_GENOME %s' % (lane_prefix, species_path) ]
161 #config += ['%s:READ_LENGTH %s' % ( lane_prefix, read_length ) ]
162 config += ['%s:USE_BASES Y%s' % ( lane_prefix, read_length ) ]
164 # add in option for running script after
# The post-run command is only emitted when both a template and a runfolder
# are available; the template is filled in with %(runfolder)s.
165 if not (options.post_run is None or options.runfolder is None):
166 runfolder = os.path.abspath(options.runfolder)
167 post_run = options.post_run % {'runfolder': runfolder}
168 config += ['POST_RUN_COMMAND %s' % (post_run,) ]
170 config += [''] # force trailing newline
172 return "\n".join(config)
# NOTE(review): fragment only -- the `class` and `def` lines that own these
# statements are absent from this listing (embedded numbers 173-175, 177-179,
# 181 and 183 are skipped), so other attributes may be set on hidden lines.
# Presumably this is the DummyOptions object that getCombinedOptions()
# instantiates -- confirm against the complete file.
# Visible behaviour: output_filepath and genome_dir are initialised to None.
176 Used when command line parsing is disabled; default
180 self.output_filepath = None
182 self.genome_dir = None
# PreformattedDescriptionFormatter
#
# optparse help formatter derived from IndentedHelpFormatter. It overrides
# format_epilog() to return the epilog surrounded by newlines.
#
# NOTE(review): embedded numbers 185, 187-188, 190-192, 194, 197-198 and
# 200-203 are absent from this listing; any condition around the return (for
# example handling of an empty epilog) is not visible here.
184 class PreformattedDescriptionFormatter(IndentedHelpFormatter):
186 #def format_description(self, description):
189 # return description + "\n"
193 def format_epilog(self, epilog):
195 It was removing my preformated epilog, so this should override
196 that behavior! Muhahaha!
199 return "\n" + epilog + "\n"
# constructOptionParser()
#
# Creates an optparse.OptionParser that formats help with
# PreformattedDescriptionFormatter, sets its description, and registers the
# command line options listed below.
#
# The text that ends on line 225 is %-formatted with CONFIG_SYSTEM, CONFIG_USER
# and GERALD_CONFIG_SECTION; that is why the post_run example inside it writes
# "%%(runfolder)s" -- the doubled percent survives formatting as a literal
# "%(runfolder)s".
#
# NOTE(review): embedded numbers 205, 207, 209, 211-214, 217-220, 224, 226 and
# 257-258 are absent from this listing. The opening of the triple-quoted text,
# the call that attaches it to the parser and the return statement are not
# visible here.
204 def constructOptionParser():
206 returns a pre-setup optparser
208 parser = OptionParser(formatter=PreformattedDescriptionFormatter())
210 parser.set_description('Retrieves eland config file from hts_frontend web frontend.')
215 * %s (User specific; overrides system)
216 * command line overrides all config file options
221 config_host: http://somewhere.domain:port
222 genome_dir: /path to search for genomes
223 post_run: runfolder -o <destdir> %%(runfolder)s
225 """ % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)
227 #Special formatter for allowing preformatted description.
228 ##parser.format_epilog(PreformattedDescriptionFormatter())
230 parser.add_option("-u", "--url",
231 action="store", type="string", dest="url")
233 parser.add_option("-o", "--output-file",
234 action="store", type="string", dest="output_filepath",
235 help="config file destination. If runfolder is specified defaults "
236 "to <runfolder>/config-auto.txt" )
238 parser.add_option("-f", "--flowcell",
239 action="store", type="string", dest="flowcell")
241 parser.add_option("-g", "--genome_dir",
242 action="store", type="string", dest="genome_dir")
# No dest is given here; optparse derives the attribute name `runfolder` from
# the long option string.
244 parser.add_option("-r", "--runfolder",
245 action="store", type="string",
246 help="specify runfolder for post_run command ")
# Stored as options.sample_sheet (optparse turns '-' in the long option into '_').
248 parser.add_option("--sample-sheet", default=None,
249 help="path to save demultiplexing sample sheet")
251 parser.add_option("--operator", default='', help="Name of sequencer operator")
252 parser.add_option("--recipe", default="Unknown",
253 help="specify recipe name")
255 parser.add_option('-v', '--verbose', action='store_true', default=False,
256 help='increase logging verbosity')
# constructConfigParser()
#
# Creates a RawConfigParser, reads CONFIG_SYSTEM and then CONFIG_USER, and
# adds the GERALD_CONFIG_SECTION section when neither file defined it.
#
# NOTE(review): embedded numbers 260, 262 and 267-270 are absent from this
# listing, so the return statement is not visible; getCombinedOptions() uses
# the result as a parser object.
259 def constructConfigParser():
261 returns a pre-setup config parser
263 parser = RawConfigParser()
# read() skips files that cannot be opened, so a missing config file is not an
# error here.
264 parser.read([CONFIG_SYSTEM, CONFIG_USER])
265 if not parser.has_section(GERALD_CONFIG_SECTION):
266 parser.add_section(GERALD_CONFIG_SECTION)
# getCombinedOptions(argv=None)
#
# Builds the option object used by the rest of the module: values given on the
# command line win, and unset ones are filled in from the GERALD_CONFIG_SECTION
# section of the config files.
#   url             -- falls back to the 'config_host' setting, then is passed
#                      through normalize_url()
#   genome_dir      -- falls back to the 'genome_dir' setting
#   post_run        -- read from the 'post_run' setting (line 299 sets None)
#   output_filepath -- defaults to <runfolder>/config-auto.txt when a
#                      runfolder is known
#
# NOTE(review): embedded numbers 272, 275, 277, 279-281, 283, 285, 289, 291,
# 295, 298, 300 and 304-307 are absent from this listing. The branch that
# chooses between DummyOptions() and parse_args() (presumably on
# DISABLE_CMDLINE), the `else:` lines and the return statement are not visible
# here -- confirm against the complete file.
271 def getCombinedOptions(argv=None):
273 Returns optparse options after it has be updated with ConfigParser
274 config files and merged with parsed commandline options.
276 expects command line arguments to be passed in
278 cl_parser = constructOptionParser()
279 conf_parser = constructConfigParser()
282 options = DummyOptions()
284 options, args = cl_parser.parse_args(argv)
286 if options.url is None:
287 if conf_parser.has_option(GERALD_CONFIG_SECTION, 'config_host'):
288 options.url = conf_parser.get(GERALD_CONFIG_SECTION, 'config_host')
290 options.url = normalize_url(options.url)
292 if options.genome_dir is None:
293 if conf_parser.has_option(GERALD_CONFIG_SECTION, 'genome_dir'):
294 options.genome_dir = conf_parser.get(GERALD_CONFIG_SECTION, 'genome_dir')
# post_run has no command line option in constructOptionParser(); it is only
# ever taken from the config files.
296 if conf_parser.has_option(GERALD_CONFIG_SECTION, 'post_run'):
297 options.post_run = conf_parser.get(GERALD_CONFIG_SECTION, 'post_run')
299 options.post_run = None
301 if options.output_filepath is None:
302 if options.runfolder is not None:
303 options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')
# saveConfigFile(options)
#
# Logs the options in use, retrieves the flowcell description from
# options.url, maps the genomes found under options.genome_dir, formats the
# GERALD config and writes it to options.output_filepath or to sys.stdout.
# When options.sample_sheet is set, a sample sheet is also written to
# sys.stdout ('-') or to the named file.
#
# NOTE(review): embedded numbers 309, 312, 319, 321, 326, 328, 332, 334, 336,
# 338, 341 and 343-345 are absent from this listing; the `else:` lines and the
# body of the `sample_sheet is None` branch are not visible here.
308 def saveConfigFile(options):
310 retrieves the flowcell eland config file, give the base_host_url
311 (i.e. http://sub.domain.edu:port)
313 LOGGER.info('USING OPTIONS:')
314 LOGGER.info(' URL: %s' % (options.url,))
315 LOGGER.info(' OUT: %s' % (options.output_filepath,))
316 LOGGER.info(' FC: %s' % (options.flowcell,))
317 #LOGGER.info(': %s' % (options.genome_dir,))
318 LOGGER.info('post_run: %s' % ( str(options.post_run),))
320 flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)
322 LOGGER.debug('genome_dir: %s' % ( options.genome_dir, ))
323 available_genomes = getAvailableGenomes(options.genome_dir)
324 genome_map = constructMapperDict(available_genomes)
325 LOGGER.debug('available genomes: %s' % ( str( list(genome_map.keys()) ),))
327 config = format_gerald_config(options, flowcell_info, genome_map)
329 if options.output_filepath is not None:
# NOTE(review): no close() for this file is visible in the listing; a `with`
# block would guarantee the config is flushed -- confirm against the full file.
330 outstream = open(options.output_filepath, 'w')
# NOTE(review): this message goes to the root logger via logging.info(), while
# every other message in this module uses LOGGER.
331 logging.info('Writing config file to %s' % (options.output_filepath,))
333 outstream = sys.stdout
335 outstream.write(config)
337 if options.sample_sheet is None:
# A sample sheet path of '-' means standard output.
339 elif options.sample_sheet == '-':
340 save_sample_sheet(sys.stdout, options, flowcell_info)
# NOTE(review): as above, no close() for `stream` is visible in the listing.
342 stream = open(options.sample_sheet,'w')
343 save_sample_sheet(stream, options, flowcell_info)
# save_sample_sheet(outstream, options, flowcell_info)
#
# Writes a CSV sample sheet to `outstream` with csv.DictWriter. The first row
# written maps every column name to itself, i.e. it is the header row. For
# each lane in LANE_LIST the libraries in flowcell_info['lane_set'] are
# converted with illumina_to_htsw_map: a string entry names the library key to
# copy, a callable entry is called to compute the value. The converted record
# is then expanded by format_pooled_libraries().
#
# NOTE(review): embedded numbers 349, 362, 366, 370, 373-375, 377, 379 and
# 381-383 are absent from this listing. The end of the field list, the
# creation of `renamed`, the handling of fields without a mapping, the
# remaining arguments of the callable and the body of the final loop are not
# visible here.
346 def save_sample_sheet(outstream, options, flowcell_info):
347 sample_sheet_fields = ['FCID', 'Lane', 'SampleID', 'SampleRef', 'Index',
348 'Description', 'Control', 'Recipe', 'Operator',
350 illumina_to_htsw_map = {'FCID': 'flowcell',
351 'Lane': 'lane_number',
352 'SampleID': 'library_id',
353 'SampleRef': format_sampleref,
354 'Description': 'library_name',
355 'Control': format_control_lane,
356 'Recipe': format_recipe_name,
357 'Operator': format_operator_name}
358 out = csv.DictWriter(outstream, sample_sheet_fields)
359 out.writerow(dict(((x,x) for x in sample_sheet_fields)))
# LANE_LIST entries are converted with str() because the decoded JSON
# 'lane_set' is keyed by strings.
360 for lane_number in LANE_LIST:
361 lane_contents = flowcell_info['lane_set'][str(lane_number)]
363 pooled_lane_contents = []
364 for library in lane_contents:
365 # build common attributes
367 for illumina_name in sample_sheet_fields:
# 'Index' has no entry in the map, so the lookup yields None for it;
# format_pooled_libraries() supplies that column instead.
368 htsw_field = illumina_to_htsw_map.get(illumina_name, None)
369 if htsw_field is None:
# NOTE(review): collections.Callable was removed in Python 3.10; the builtin
# callable(htsw_field) performs the same check.
371 if isinstance(htsw_field, collections.Callable):
372 renamed[illumina_name] = htsw_field(options,
376 renamed[illumina_name] = library[htsw_field]
378 pooled_lane_contents.extend(format_pooled_libraries(renamed, library))
380 for row in pooled_lane_contents:
def format_sampleref(options, flowcell_info, sample):
    """
    Return the sample's species name with spaces replaced by underscores.

    :param options: unused; kept so every sample sheet formatter can be called
        with the same three arguments.
    :param flowcell_info: unused, see above.
    :param sample: library dictionary; its 'library_species' entry is read.
    :return: the species string with each ' ' replaced by '_'.
    :raises KeyError: if sample has no 'library_species' entry.
    """
    species = sample['library_species']
    return species.replace(' ', '_')
# format_control_lane(options, flowcell_info, sample)
#
# Sample sheet formatter for the 'Control' column: compares the sample's
# 'lane_number' with the flowcell's 'control_lane'. `options` is not used on
# the visible lines.
#
# NOTE(review): embedded numbers 390-394 are absent from this listing, so the
# values returned for the matching and non-matching cases are not visible here.
# NOTE(review): the comparison is a plain ==; if one side is a string and the
# other an int it never matches -- verify the types in the flowcell data.
388 def format_control_lane(options, flowcell_info, sample):
389 if sample['lane_number'] == flowcell_info['control_lane']:
def format_recipe_name(options, flowcell_info, sample):
    """
    Return the recipe name chosen on the command line.

    :param options: option object; its ``recipe`` attribute is returned.
    :param flowcell_info: unused; kept so every sample sheet formatter can be
        called with the same three arguments.
    :param sample: unused, see above.
    :return: ``options.recipe`` unchanged.
    """
    return options.recipe
def format_operator_name(options, flowcell_info, sample):
    """
    Return the sequencer operator name chosen on the command line.

    :param options: option object; its ``operator`` attribute is returned.
    :param flowcell_info: unused; kept so every sample sheet formatter can be
        called with the same three arguments.
    :param sample: unused, see above.
    :return: ``options.operator`` unchanged.
    """
    return options.operator
# format_pooled_libraries(shared, library)
#
# Expands one library into sample sheet rows according to its
# 'index_sequence' entry:
#   * missing / None        -- handled on a hidden line
#   * text starting "err"   -- `shared` itself gets a 'SampleProject' entry
#   * dictionary            -- one copy of `shared` per multiplex id, with
#                              'Index' set to that id's sequence and
#                              'SampleProject' from format_project_name()
#   * anything else         -- RuntimeError("Unrecognized index type")
#
# NOTE(review): embedded numbers 406, 409, 411, 413, 417, 421, 423-424 and
# 426-428 are absent from this listing; the returned values, the creation of
# `pooled` and `sample`, and the `else:` are not visible here.
403 def format_pooled_libraries(shared, library):
404 sequences = library.get('index_sequence', None)
405 if sequences is None:
# NOTE(review): `type(sequences) in str` raises TypeError in Python 3 because
# the class `str` is not iterable; isinstance(sequences, str) is the working
# form of this test.
407 elif (type(sequences) in str and
408 sequences.lower().startswith('err')):
410 shared['SampleProject'] = library['library_id']
412 elif (type(sequences) == dict):
414 multiplex_ids = list(sequences.keys())
# NOTE(review): list.sort() has no `cmp` argument in Python 3; wrap the
# comparison with functools.cmp_to_key and pass it as key=.
415 multiplex_ids.sort(cmp=alphanum.alphanum)
416 for multiplex_id in multiplex_ids:
418 sample.update(shared)
419 sample['Index'] = sequences[multiplex_id]
420 sample['SampleProject'] = format_project_name(library,
422 pooled.append(sample)
425 raise RuntimeError("Unrecognized index type")
def format_project_name(library, multiplex_id):
    """
    Build the sample project name for one multiplexed index of a library.

    :param library: library dictionary; its 'library_id' entry is read.
    :param multiplex_id: identifier of the index within the library.
    :return: "<library_id>_index<multiplex_id>".
    :raises KeyError: if library has no 'library_id' entry.
    """
    return "%s_index%s" % (library['library_id'], multiplex_id)