4 from ConfigParser import RawConfigParser
6 from optparse import OptionParser, IndentedHelpFormatter
15 except ImportError, e:
16 import simplejson as json
18 from htsworkflow.frontend.auth import apidata
19 from htsworkflow.util import api
20 from htsworkflow.util import alphanum
21 from htsworkflow.util.url import normalize_url
22 from htsworkflow.pipelines.genome_mapper import \
23 getAvailableGenomes, \
25 from htsworkflow.pipelines import LANE_LIST
# JSON dictionaries use strings (lane_set keys are string lane numbers)
LANE_LIST_JSON = [str(l) for l in LANE_LIST]

# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)

# BUG FIX: was "restructredtext" (typo); the docutils docstring format
# name is "restructuredtext".
__docformat__ = "restructuredtext en"

# Config file search locations; the user file overrides the system one,
# and command line options override both.
CONFIG_SYSTEM = '/etc/htsworkflow.ini'
CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini')
GERALD_CONFIG_SECTION = 'gerald_config'

# Disable or enable commandline arg parsing; disabled by default.
DISABLE_CMDLINE = True
class FlowCellNotFound(Exception):
    """Raised when the web API has no usable information for a flowcell."""
class WebError404(Exception):
    """Raised for an HTTP 404 (not found) response from the web frontend."""
def retrieve_flowcell_info(base_host_url, flowcell):
    """
    Return a dictionary describing a
    flowcell.

    :param base_host_url: scheme://host:port of the htsworkflow server
    :param flowcell: flowcell id to query
    :raises FlowCellNotFound: on 403/404 responses or an empty response body
    """
    url = api.flowcell_url(base_host_url, flowcell)

    # NOTE(review): several source lines are elided in this view; the two
    # statements below originally sit inside a try block paired with the
    # except clause that follows.
    # apidata (imported from frontend.auth) carries the shared API key.
    apipayload = urllib.urlencode(apidata)
    web = urllib2.urlopen(url, apipayload)
    except urllib2.URLError, e:
        # NOTE(review): URLError does not always carry .code/.msg (only
        # HTTPError subclasses do) -- confirm against the elided handling.
        errmsg = 'URLError: %d %s' % (e.code, e.msg)
        # (elided lines)
        LOGGER.error('opened %s' % (url,))
        # (elided lines; presumably checks the HTTP status code)
        msg = "403 - Forbbidden, probably need api key"
        raise FlowCellNotFound(msg)
        # (elided lines)
        msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
              "Did you get right port #?" % (flowcell, base_host_url, url)
        raise FlowCellNotFound(msg)

    # an empty body means the server had nothing for this flowcell
    if len(contents) == 0:
        msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
        raise FlowCellNotFound(msg)

    data = json.loads(contents)
    # (return statement elided in this view; presumably returns data)
def is_sequencing(lane_info):
    """
    Determine if we are just sequencing and not doing any follow-up analysis
    """
    # 'De Novo' and 'Whole Genome' libraries get no alignment step
    if lane_info['experiment_type'] in ('De Novo','Whole Genome'):
        # NOTE(review): the return statements for both branches are elided
        # in this view; presumably True here and False otherwise -- confirm.
def group_lane_parameters(flowcell_info):
    """
    Group lanes that can share GERALD configuration blocks.

    (The same species, read length, and eland vs sequencing)

    Returns a mapping of
    (read_length, library_species, is_sequencing) -> [lane numbers].
    """
    # NOTE(review): the dict initialization (lane_groups = {}) and the
    # return statement are elided in this view.
    for lane_number, lane_contents in flowcell_info['lane_set'].items():
        for lane_info in lane_contents:
            # lanes sharing this key can share one GERALD config block
            index = (lane_info['read_length'],
                     lane_info['library_species'],
                     is_sequencing(lane_info))
            lane_groups.setdefault(index, []).append(lane_number)
def format_gerald_header(flowcell_info):
    """
    Generate comment describing the contents of the flowcell
    """
    # I'm using '\n# ' to join the lines together, that doesn't include the
    # first element so i needed to put the # in manually
    config = ['# FLOWCELL: %s' % (flowcell_info['flowcell_id'])]
    # NOTE(review): a guard line before the next statement is elided in
    # this view (presumably testing whether a control lane is set).
    config += ['CONTROL-LANE: %s' % (flowcell_info['control_lane'],)]

    config += ['Flowcell Notes:']
    # notes arrive from the web form with DOS (\r\n) line endings
    config.extend(flowcell_info['notes'].split('\r\n'))

    # one "LaneN: id | name" line per library on each lane
    for lane_number in LANE_LIST_JSON:
        lane_contents = flowcell_info['lane_set'][lane_number]
        for lane_info in lane_contents:
            config += ['Lane%s: %s | %s' % (lane_number,
                                            lane_info['library_id'],
                                            lane_info['library_name'])]

    # joined with '\n# ' so every line after the first is a GERALD comment
    return "\n# ".join(config)
def format_gerald_config(options, flowcell_info, genome_map):
    """
    Generate a GERALD config file

    :param options: parsed options (post_run / runfolder are used here)
    :param flowcell_info: dictionary from retrieve_flowcell_info
    :param genome_map: species name -> genome directory path
    :return: config file contents as one newline-joined string
    """
    # so we can add nothing or _pair if we're a paired end run
    eland_analysis_suffix = { False: "_extended", True: "_pair" }
    sequence_analysis_suffix = { False: "", True: "_pair" }

    # it's convienent to have helpful information describing the flowcell
    # in the config file... things like which lane is which library.
    config = [format_gerald_header(flowcell_info)]

    config += ['SEQUENCE_FORMAT --fastq']
    config += ['ELAND_SET_SIZE 20']
    # applies to all lanes (GERALD lane-prefix syntax)
    config += ['12345678:WITH_SEQUENCE true']
    analysis_suffix = eland_analysis_suffix[flowcell_info['paired_end']]
    sequence_suffix = sequence_analysis_suffix[flowcell_info['paired_end']]
    lane_groups = group_lane_parameters(flowcell_info)
    for lane_index, lane_numbers in lane_groups.items():
        # lane_index is return value of group_lane_parameters
        read_length, species, is_sequencing = lane_index

        # GERALD addresses lanes by a concatenated digit string, e.g. "1234"
        lane_prefix = u"".join(lane_numbers)

        species_path = genome_map.get(species, None)
        LOGGER.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
        if not is_sequencing and species_path is None:
            no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
            LOGGER.warning(no_genome_msg % (lane_numbers, species))
            # NOTE(review): lines elided here; is_sequencing is presumably
            # forced True before the branch below -- confirm.
        # NOTE(review): the if/else wrapping the next three lines is elided
        # in this view (sequence analysis vs eland analysis).
            config += ['%s:ANALYSIS sequence%s' % (lane_prefix, sequence_suffix)]
            config += ['%s:ANALYSIS eland%s' % (lane_prefix, analysis_suffix)]
            config += ['%s:ELAND_GENOME %s' % (lane_prefix, species_path) ]
        #config += ['%s:READ_LENGTH %s' % ( lane_prefix, read_length ) ]
        config += ['%s:USE_BASES Y%s' % ( lane_prefix, read_length ) ]

    # add in option for running script after
    if not (options.post_run is None or options.runfolder is None):
        runfolder = os.path.abspath(options.runfolder)
        # post_run is a %-template taking the runfolder path
        post_run = options.post_run % {'runfolder': runfolder}
        config += ['POST_RUN_COMMAND %s' % (post_run,) ]

    config += [''] # force trailing newline

    return "\n".join(config)
    Used when command line parsing is disabled; default
    # NOTE(review): the enclosing DummyOptions class statement and its
    # __init__ definition are elided in this view; the assignments below
    # set default option attributes (other attributes such as url and
    # post_run are presumably set on elided lines -- confirm).
        self.output_filepath = None
        self.genome_dir = None
class PreformattedDescriptionFormatter(IndentedHelpFormatter):
    """optparse help formatter that passes the epilog through unwrapped."""

    #def format_description(self, description):
    # (elided commented-out lines)
    #    return description + "\n"

    def format_epilog(self, epilog):
        """
        It was removing my preformated epilog, so this should override
        that behavior! Muhahaha!
        """
        # surround with blank lines instead of letting optparse re-wrap it
        return "\n" + epilog + "\n"
def constructOptionParser():
    """
    returns a pre-setup optparser
    """
    parser = OptionParser(formatter=PreformattedDescriptionFormatter())

    parser.set_description('Retrieves eland config file from hts_frontend web frontend.')

    # NOTE(review): the opening of the epilog triple-quoted string (and a
    # few of its lines, including the system-config bullet and the section
    # header example) is elided in this view; only its tail is visible.
    * %s (User specific; overrides system)
    * command line overrides all config file options
    config_host: http://somewhere.domain:port
    genome_dir: /path to search for genomes
    post_run: runfolder -o <destdir> %%(runfolder)s

    """ % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)

    #Special formatter for allowing preformatted description.
    ##parser.format_epilog(PreformattedDescriptionFormatter())

    parser.add_option("-u", "--url",
                      action="store", type="string", dest="url")

    parser.add_option("-o", "--output-file",
                      action="store", type="string", dest="output_filepath",
                      help="config file destination. If runfolder is specified defaults "
                           "to <runfolder>/config-auto.txt" )

    parser.add_option("-f", "--flowcell",
                      action="store", type="string", dest="flowcell")

    parser.add_option("-g", "--genome_dir",
                      action="store", type="string", dest="genome_dir")

    # no dest= given, so optparse derives dest="runfolder"
    parser.add_option("-r", "--runfolder",
                      action="store", type="string",
                      help="specify runfolder for post_run command ")

    parser.add_option("--sample-sheet", default=None,
                      help="path to save demultiplexing sample sheet")

    parser.add_option("--operator", default='', help="Name of sequencer operator")
    parser.add_option("--recipe", default="Unknown",
                      help="specify recipe name")

    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='increase logging verbosity')
    # NOTE(review): the return statement (return parser) is elided here.
def constructConfigParser():
    """
    returns a pre-setup config parser
    """
    # system-wide settings are overridden by the user's ~/.htsworkflow.ini;
    # RawConfigParser.read silently skips files that don't exist
    parser = RawConfigParser()
    parser.read([CONFIG_SYSTEM, CONFIG_USER])
    # make sure the section exists so later has_option/get calls are safe
    if not parser.has_section(GERALD_CONFIG_SECTION):
        parser.add_section(GERALD_CONFIG_SECTION)
    # NOTE(review): the return statement (return parser) is elided here.
def getCombinedOptions(argv=None):
    """
    Returns optparse options after it has be updated with ConfigParser
    config files and merged with parsed commandline options.

    expects command line arguments to be passed in
    """
    cl_parser = constructOptionParser()
    conf_parser = constructConfigParser()

    # NOTE(review): the branch structure here is partly elided; DummyOptions
    # appears to be used when command line parsing is disabled, otherwise
    # the real parser runs on argv.
        options = DummyOptions()

        options, args = cl_parser.parse_args(argv)

    # command line url beats the config-file config_host
    if options.url is None:
        if conf_parser.has_option(GERALD_CONFIG_SECTION, 'config_host'):
            options.url = conf_parser.get(GERALD_CONFIG_SECTION, 'config_host')
    # NOTE(review): an elided guard likely precedes this; behavior of
    # normalize_url(None) should be confirmed when no url was configured.
    options.url = normalize_url(options.url)

    if options.genome_dir is None:
        if conf_parser.has_option(GERALD_CONFIG_SECTION, 'genome_dir'):
            options.genome_dir = conf_parser.get(GERALD_CONFIG_SECTION, 'genome_dir')

    if conf_parser.has_option(GERALD_CONFIG_SECTION, 'post_run'):
        options.post_run = conf_parser.get(GERALD_CONFIG_SECTION, 'post_run')
    # (an elided else: presumably precedes the next line)
        options.post_run = None

    # default the output path into the runfolder when one was given
    if options.output_filepath is None:
        if options.runfolder is not None:
            options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')
    # NOTE(review): the return statement (return options) is elided here.
def saveConfigFile(options):
    """
    retrieves the flowcell eland config file, give the base_host_url
    (i.e. http://sub.domain.edu:port)

    Writes the GERALD config to options.output_filepath (or stdout) and
    optionally writes a demultiplexing sample sheet.
    """
    LOGGER.info('USING OPTIONS:')
    LOGGER.info(u' URL: %s' % (options.url,))
    LOGGER.info(u' OUT: %s' % (options.output_filepath,))
    LOGGER.info(u' FC: %s' % (options.flowcell,))
    #LOGGER.info(': %s' % (options.genome_dir,))
    LOGGER.info(u'post_run: %s' % ( unicode(options.post_run),))

    flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)

    LOGGER.debug('genome_dir: %s' % ( options.genome_dir, ))
    available_genomes = getAvailableGenomes(options.genome_dir)
    genome_map = constructMapperDict(available_genomes)
    LOGGER.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))

    config = format_gerald_config(options, flowcell_info, genome_map)

    if options.output_filepath is not None:
        outstream = open(options.output_filepath, 'w')
        # NOTE(review): uses the root logger (logging.info) here instead of
        # the module LOGGER used everywhere else -- inconsistent.
        logging.info('Writing config file to %s' % (options.output_filepath,))
    # (elided else: presumably precedes the next line)
        outstream = sys.stdout

    outstream.write(config)
    # NOTE(review): the file handle is not closed in the visible lines.

    if options.sample_sheet is None:
        # (elided body; presumably pass)
    elif options.sample_sheet == '-':
        save_sample_sheet(sys.stdout, options, flowcell_info)
    # (elided else: presumably precedes the next line)
        stream = open(options.sample_sheet,'w')
        save_sample_sheet(stream, options, flowcell_info)
def save_sample_sheet(outstream, options, flowcell_info):
    # Write an Illumina demultiplexing sample sheet (CSV) to outstream.
    sample_sheet_fields = ['FCID', 'Lane', 'SampleID', 'SampleRef', 'Index',
                           'Description', 'Control', 'Recipe', 'Operator',
                           # NOTE(review): the list's continuation/closing line
                           # is elided in this view (likely 'SampleProject',
                           # which format_pooled_libraries sets).
    # maps Illumina column name -> htsw field name, or a callable taking
    # (options, flowcell_info, sample) and returning the column value
    illumina_to_htsw_map = {'FCID': 'flowcell',
                            'Lane': 'lane_number',
                            'SampleID': 'library_id',
                            'SampleRef': format_sampleref,
                            'Description': 'library_name',
                            'Control': format_control_lane,
                            'Recipe': format_recipe_name,
                            'Operator': format_operator_name}
    out = csv.DictWriter(outstream, sample_sheet_fields)
    # header row: each field name mapped to itself
    out.writerow(dict(((x,x) for x in sample_sheet_fields)))
    for lane_number in LANE_LIST:
        # lane_set keys are strings (JSON) while LANE_LIST holds ints
        lane_contents = flowcell_info['lane_set'][str(lane_number)]

        pooled_lane_contents = []
        for library in lane_contents:
            # build common attributes
            # (elided: the renamed = {} initialization, presumably)
            for illumina_name in sample_sheet_fields:
                htsw_field = illumina_to_htsw_map.get(illumina_name, None)
                if htsw_field is None:
                    # (elided body; presumably continue)
                if callable(htsw_field):
                    renamed[illumina_name] = htsw_field(options,
                    # (elided: remaining call arguments, presumably
                    # flowcell_info and the library dict)
                # (elided else: presumably precedes the next line)
                    renamed[illumina_name] = library[htsw_field]

            # one library may expand into several rows (one per index)
            pooled_lane_contents.extend(format_pooled_libraries(renamed, library))

        for row in pooled_lane_contents:
            # (elided body; presumably out.writerow(row))
def format_sampleref(options, flowcell_info, sample):
    """SampleRef column value: the species name with spaces turned into underscores."""
    species = sample['library_species']
    return species.replace(' ', '_')
def format_control_lane(options, flowcell_info, sample):
    # Control column value: marks whether this sample sits on the flowcell's
    # designated control lane.
    if sample['lane_number'] == flowcell_info['control_lane']:
        # NOTE(review): the return values for both branches are elided in
        # this view (Illumina sheets conventionally use Y/N -- confirm).
def format_recipe_name(options, flowcell_info, sample):
    """Recipe column value: the --recipe option, identical for every sample."""
    recipe = options.recipe
    return recipe
def format_operator_name(options, flowcell_info, sample):
    """Operator column value: the --operator option, identical for every sample."""
    operator = options.operator
    return operator
def format_pooled_libraries(shared, library):
    # Expand one library's shared row into per-index sample sheet rows.
    # 'shared' holds the columns common to every row for this library.
    sequences = library.get('index_sequence', None)
    if sequences is None:
        # (elided body; presumably returns no rows)
    elif (type(sequences) in types.StringTypes and
          sequences.lower().startswith('err')):
        # index lookup failed server-side; fall back to the bare library id
        # (elided line)
        shared['SampleProject'] = library['library_id']
        # (elided; presumably returns the single shared row)
    elif (type(sequences) == types.DictType):
        # one row per multiplex index, sorted alphanumerically
        multiplex_ids = sequences.keys()
        multiplex_ids.sort(cmp=alphanum.alphanum)  # Python 2 only: cmp= sort
        for multiplex_id in multiplex_ids:
            # (elided: the sample dict initialization, presumably)
            sample.update(shared)
            sample['Index'] = sequences[multiplex_id]
            sample['SampleProject'] = format_project_name(library,
            # (elided: remaining call argument, presumably multiplex_id)
            pooled.append(sample)
        # (elided; presumably returns pooled)
    # (elided else: presumably precedes the next line)
        raise RuntimeError("Unrecognized index type")
def format_project_name(library, multiplex_id):
    """SampleProject column value: "<library_id>_index<multiplex_id>"."""
    return "{0}_index{1}".format(library['library_id'], multiplex_id)