4 from ConfigParser import RawConfigParser
6 from optparse import OptionParser, IndentedHelpFormatter
15 except ImportError, e:
16 import simplejson as json
18 from htsworkflow.frontend.auth import apidata
19 from htsworkflow.util import api
20 from htsworkflow.util.url import normalize_url
21 from htsworkflow.pipelines.genome_mapper import \
22 getAvailableGenomes, \
24 from htsworkflow.pipelines.runfolder import LANE_LIST
25 # JSON dictionaries use strings
26 LANE_LIST_JSON = [ str(l) for l in LANE_LIST ]
28 LOGGER = logging.getLogger(__name__)
# Declares the docstring markup dialect for doc tools (e.g. epydoc).
# Fixed misspelling: "restructredtext" -> "restructuredtext".
__docformat__ = "restructuredtext en"
32 CONFIG_SYSTEM = '/etc/htsworkflow.ini'
33 CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini')
34 GERALD_CONFIG_SECTION = 'gerald_config'
36 # Disable or enable command-line argument parsing; disabled by default.
37 DISABLE_CMDLINE = True
class FlowCellNotFound(Exception):
    """Raised when the web API cannot supply information for a flowcell."""
class WebError404(Exception):
    """Raised when an HTTP request comes back 404 Not Found."""
# Fetch the flowcell description from the htsworkflow web API and decode it
# with json.loads; raises FlowCellNotFound on a 403, a 404, or an empty body.
# NOTE(review): gaps in the embedded line numbering (43, 45, 47-48, 53,
# 55-60, 63-64, 68, 72, 74) show that lines are missing from this extraction
# -- the try:, the HTTP-status dispatch, the read of `contents`, and the
# final return are not visible; recover them from the original file.
# NOTE(review): "Forbbidden" (line 61) is a typo inside a runtime error
# message; fix it in the original source, not in a comment-only pass.
42 def retrieve_flowcell_info(base_host_url, flowcell):
44 Return a dictionary describing a
46 url = api.flowcell_url(base_host_url, flowcell)
49 apipayload = urllib.urlencode(apidata)
50 web = urllib2.urlopen(url, apipayload)
51 except urllib2.URLError, e:
52 errmsg = 'URLError: %d %s' % (e.code, e.msg)
54 LOGGER.error('opened %s' % (url,))
61 msg = "403 - Forbbidden, probably need api key"
62 raise FlowCellNotFound(msg)
65 msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
66 "Did you get right port #?" % (flowcell, base_host_url, url)
67 raise FlowCellNotFound(msg)
69 if len(contents) == 0:
70 msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
71 raise FlowCellNotFound(msg)
73 data = json.loads(contents)
# Predicate: does this lane only need sequencing (experiment types
# 'De Novo' / 'Whole Genome') rather than a follow-up eland analysis?
# NOTE(review): the return statements for both branches (embedded lines
# 81-84) are missing from this extraction; presumably True/False.
76 def is_sequencing(lane_info):
78 Determine if we are just sequencing and not doing any follow-up analysis
80 if lane_info['experiment_type'] in ('De Novo','Whole Genome'):
# Bucket lane numbers by (read_length, library_species, is_sequencing) so
# lanes with identical parameters can share one GERALD config block.
# NOTE(review): the initialisation of `lane_groups` (likely `{}`) and the
# trailing `return lane_groups` are missing from this extraction (embedded
# lines 86, 88, 90-91, 98-99); callers below treat the result as a dict.
85 def group_lane_parameters(flowcell_info):
87 group lanes that can share GERALD configuration blocks.
89 (The same species, read length, and eland vs sequencing)
92 for lane_number, lane_contents in flowcell_info['lane_set'].items():
93 for lane_info in lane_contents:
94 index = (lane_info['read_length'],
95 lane_info['library_species'],
96 is_sequencing(lane_info))
97 lane_groups.setdefault(index, []).append(lane_number)
# Build the human-readable comment banner for the GERALD config file:
# flowcell id, control lane, free-form notes, and a Lane<N>: id | name
# line per library. The list is joined with '\n# ' so every line after
# the first becomes a comment.
# NOTE(review): embedded lines 101, 103, 107, 109, 112, 119-120 are
# missing from this extraction (docstring quotes / blank lines, possibly
# conditionals guarding the control-lane and notes sections).
100 def format_gerald_header(flowcell_info):
102 Generate comment describing the contents of the flowcell
104 # I'm using '\n# ' to join the lines together, that doesn't include the
105 # first element so i needed to put the # in manually
106 config = ['# FLOWCELL: %s' % (flowcell_info['flowcell_id'])]
108 config += ['CONTROL-LANE: %s' % (flowcell_info['control_lane'],)]
110 config += ['Flowcell Notes:']
111 config.extend(flowcell_info['notes'].split('\r\n'))
113 for lane_number in LANE_LIST_JSON:
114 lane_contents = flowcell_info['lane_set'][lane_number]
115 for lane_info in lane_contents:
116 config += ['Lane%s: %s | %s' % (lane_number,
117 lane_info['library_id'],
118 lane_info['library_name'])]
121 return "\n# ".join(config)
# Assemble the full GERALD config file text: header banner, global
# directives, then per-lane-group ANALYSIS / ELAND_GENOME / USE_BASES
# lines, plus an optional POST_RUN_COMMAND built from options.post_run.
# Lanes whose species has no genome in genome_map are forced to plain
# sequencing. Returns one newline-joined string with a trailing newline.
# NOTE(review): embedded lines 152-154 and 156 are missing from this
# extraction -- they presumably hold the is_sequencing override and the
# if/else that selects between the sequence and eland ANALYSIS branches.
123 def format_gerald_config(options, flowcell_info, genome_map):
125 Generate a GERALD config file
127 # so we can add nothing or _pair if we're a paired end run
128 eland_analysis_suffix = { False: "_extended", True: "_pair" }
129 sequence_analysis_suffix = { False: "", True: "_pair" }
131 # it's convenient to have helpful information describing the flowcell
132 # in the config file... things like which lane is which library.
133 config = [format_gerald_header(flowcell_info)]
135 config += ['SEQUENCE_FORMAT --fastq']
136 config += ['ELAND_SET_SIZE 20']
137 config += ['12345678:WITH_SEQUENCE true']
138 analysis_suffix = eland_analysis_suffix[flowcell_info['paired_end']]
139 sequence_suffix = sequence_analysis_suffix[flowcell_info['paired_end']]
140 lane_groups = group_lane_parameters(flowcell_info)
141 for lane_index, lane_numbers in lane_groups.items():
142 # lane_index is return value of group_lane_parameters
143 read_length, species, is_sequencing = lane_index
145 lane_prefix = u"".join(lane_numbers)
147 species_path = genome_map.get(species, None)
148 LOGGER.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
149 if not is_sequencing and species_path is None:
150 no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
151 LOGGER.warning(no_genome_msg % (lane_numbers, species))
155 config += ['%s:ANALYSIS sequence%s' % (lane_prefix, sequence_suffix)]
157 config += ['%s:ANALYSIS eland%s' % (lane_prefix, analysis_suffix)]
158 config += ['%s:ELAND_GENOME %s' % (lane_prefix, species_path) ]
159 #config += ['%s:READ_LENGTH %s' % ( lane_prefix, read_length ) ]
160 config += ['%s:USE_BASES Y%s' % ( lane_prefix, read_length ) ]
162 # add in option for running script after
163 if not (options.post_run is None or options.runfolder is None):
164 runfolder = os.path.abspath(options.runfolder)
165 post_run = options.post_run % {'runfolder': runfolder}
166 config += ['POST_RUN_COMMAND %s' % (post_run,) ]
168 config += [''] # force trailing newline
170 return "\n".join(config)
174 Used when command line parsing is disabled; default
178 self.output_filepath = None
180 self.genome_dir = None
# optparse help formatter that preserves a preformatted epilog verbatim
# (IndentedHelpFormatter normally re-wraps it); used by constructOptionParser.
# NOTE(review): embedded lines 183, 185-186, 188-190, 192, 195-196 are
# missing from this extraction (commented-out format_description body,
# docstring quotes, blank lines).
182 class PreformattedDescriptionFormatter(IndentedHelpFormatter):
184 #def format_description(self, description):
187 # return description + "\n"
191 def format_epilog(self, epilog):
193 It was removing my preformatted epilog, so this should override
194 that behavior! Muhahaha!
197 return "\n" + epilog + "\n"
# Build the OptionParser for this script: url/output/flowcell/genome_dir/
# runfolder/sample-sheet/operator/recipe/verbose options, with an epilog
# documenting the ini-file fallbacks (CONFIG_SYSTEM, CONFIG_USER,
# GERALD_CONFIG_SECTION).
# NOTE(review): embedded lines 209-212, 215-218, 222, 224, 227, and the
# closing `return parser` (255 ff.) are missing from this extraction --
# the epilog assignment that the visible `""" % (...)` terminates starts
# in one of the missing lines.
202 def constructOptionParser():
204 returns a pre-setup optparser
206 parser = OptionParser(formatter=PreformattedDescriptionFormatter())
208 parser.set_description('Retrieves eland config file from hts_frontend web frontend.')
213 * %s (User specific; overrides system)
214 * command line overrides all config file options
219 config_host: http://somewhere.domain:port
220 genome_dir: /path to search for genomes
221 post_run: runfolder -o <destdir> %%(runfolder)s
223 """ % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)
225 #Special formatter for allowing preformatted description.
226 ##parser.format_epilog(PreformattedDescriptionFormatter())
228 parser.add_option("-u", "--url",
229 action="store", type="string", dest="url")
231 parser.add_option("-o", "--output-file",
232 action="store", type="string", dest="output_filepath",
233 help="config file destination. If runfolder is specified defaults "
234 "to <runfolder>/config-auto.txt" )
236 parser.add_option("-f", "--flowcell",
237 action="store", type="string", dest="flowcell")
239 parser.add_option("-g", "--genome_dir",
240 action="store", type="string", dest="genome_dir")
242 parser.add_option("-r", "--runfolder",
243 action="store", type="string",
244 help="specify runfolder for post_run command ")
246 parser.add_option("--sample-sheet", default=None,
247 help="path to save demultiplexing sample sheet")
249 parser.add_option("--operator", default='', help="Name of sequencer operator")
250 parser.add_option("--recipe", default="Unknown",
251 help="specify recipe name")
253 parser.add_option('-v', '--verbose', action='store_true', default=False,
254 help='increase logging verbosity')
# Build a RawConfigParser loaded from the system and user ini files,
# guaranteeing the gerald_config section exists.
# NOTE(review): docstring quotes (258, 260) and the trailing
# `return parser` (265-266) are missing from this extraction.
257 def constructConfigParser():
259 returns a pre-setup config parser
261 parser = RawConfigParser()
262 parser.read([CONFIG_SYSTEM, CONFIG_USER])
263 if not parser.has_section(GERALD_CONFIG_SECTION):
264 parser.add_section(GERALD_CONFIG_SECTION)
# Merge command-line options with ini-file fallbacks: url, genome_dir and
# post_run come from the gerald_config section when absent on the command
# line; output_filepath defaults to <runfolder>/config-auto.txt.
# NOTE(review): several embedded lines (270, 273, 275, 278-279, 281, 283,
# 287, 289, 293, 296, 298, 302-305) are missing from this extraction,
# including the DISABLE_CMDLINE branch around the DummyOptions fallback,
# the `else:` lines implied by 288 and 297, and the final return.
269 def getCombinedOptions(argv=None):
271 Returns optparse options after it has been updated with ConfigParser
272 config files and merged with parsed commandline options.
274 expects command line arguments to be passed in
276 cl_parser = constructOptionParser()
277 conf_parser = constructConfigParser()
280 options = DummyOptions()
282 options, args = cl_parser.parse_args(argv)
284 if options.url is None:
285 if conf_parser.has_option(GERALD_CONFIG_SECTION, 'config_host'):
286 options.url = conf_parser.get(GERALD_CONFIG_SECTION, 'config_host')
288 options.url = normalize_url(options.url)
290 if options.genome_dir is None:
291 if conf_parser.has_option(GERALD_CONFIG_SECTION, 'genome_dir'):
292 options.genome_dir = conf_parser.get(GERALD_CONFIG_SECTION, 'genome_dir')
294 if conf_parser.has_option(GERALD_CONFIG_SECTION, 'post_run'):
295 options.post_run = conf_parser.get(GERALD_CONFIG_SECTION, 'post_run')
297 options.post_run = None
299 if options.output_filepath is None:
300 if options.runfolder is not None:
301 options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')
# Top-level driver: log the effective options, fetch flowcell info from
# the web API, resolve available genomes, and write the demultiplexing
# sample sheet to stdout or to options.sample_sheet. The GERALD-config
# writing path (325-333) is commented out in the original.
# NOTE(review): embedded lines 307, 310, 317, 319, 324, 326, 330, 332,
# 334, 336, 339, 342 are missing from this extraction -- including the
# `pass`/`else:` around the sample-sheet dispatch and any close of
# `stream`; the open() at 340 has no visible matching close.
306 def saveConfigFile(options):
308 retrieves the flowcell eland config file, given the base_host_url
309 (i.e. http://sub.domain.edu:port)
311 LOGGER.info('USING OPTIONS:')
312 LOGGER.info(u' URL: %s' % (options.url,))
313 LOGGER.info(u' OUT: %s' % (options.output_filepath,))
314 LOGGER.info(u' FC: %s' % (options.flowcell,))
315 #LOGGER.info(': %s' % (options.genome_dir,))
316 LOGGER.info(u'post_run: %s' % ( unicode(options.post_run),))
318 flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)
320 LOGGER.debug('genome_dir: %s' % ( options.genome_dir, ))
321 available_genomes = getAvailableGenomes(options.genome_dir)
322 genome_map = constructMapperDict(available_genomes)
323 LOGGER.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))
325 #config = format_gerald_config(options, flowcell_info, genome_map)
327 #if options.output_filepath is not None:
328 # outstream = open(options.output_filepath, 'w')
329 # LOGGER.info('Writing config file to %s' % (options.output_filepath,))
331 # outstream = sys.stdout
333 #outstream.write(config)
335 if options.sample_sheet is None:
337 elif options.sample_sheet == '-':
338 save_sample_sheet(sys.stdout, options, flowcell_info)
340 stream = open(options.sample_sheet,'w')
341 save_sample_sheet(stream, options, flowcell_info)
# Write an Illumina demultiplexing sample sheet as CSV to outstream.
# Maps Illumina column names to htsworkflow fields: string values index
# into each library dict, callables are invoked (visible call takes
# `options` plus presumably flowcell_info and the library). Each library
# is expanded per index sequence via format_pooled_libraries, then the
# rows are written.
# NOTE(review): embedded lines 347, 360, 364, 368, 371-373, 375, 377,
# 379-380 are missing from this extraction -- the tail of the
# sample_sheet_fields list (347), the initialisation of `renamed`, the
# remaining arguments of the callable invocation, and the final
# out.writerow(row) body of the last loop.
344 def save_sample_sheet(outstream, options, flowcell_info):
345 sample_sheet_fields = ['FCID', 'Lane', 'SampleID', 'SampleRef', 'Index',
346 'Description', 'Control', 'Recipe', 'Operator',
348 illumina_to_htsw_map = {'FCID': 'flowcell',
349 'Lane': 'lane_number',
350 'SampleID': 'library_id',
351 'SampleRef': format_sampleref,
352 'Description': 'library_name',
353 'Control': format_control_lane,
354 'Recipe': format_recipe_name,
355 'Operator': format_operator_name}
356 out = csv.DictWriter(outstream, sample_sheet_fields)
357 out.writerow(dict(((x,x) for x in sample_sheet_fields)))
358 for lane_number in LANE_LIST:
359 lane_contents = flowcell_info['lane_set'][str(lane_number)]
361 pooled_lane_contents = []
362 for library in lane_contents:
363 # build common attributes
365 for illumina_name in sample_sheet_fields:
366 htsw_field = illumina_to_htsw_map.get(illumina_name, None)
367 if htsw_field is None:
369 if callable(htsw_field):
370 renamed[illumina_name] = htsw_field(options,
374 renamed[illumina_name] = library[htsw_field]
376 pooled_lane_contents.extend(format_pooled_libraries(renamed, library))
378 for row in pooled_lane_contents:
def format_sampleref(options, flowcell_info, sample):
    """Return the sample's species name with every space turned into '_'.

    The unused options/flowcell_info parameters keep the shared
    callable signature used by the sample-sheet field map.
    """
    species = sample['library_species']
    return '_'.join(species.split(' '))
# Sample-sheet callable: decide whether this sample sits in the
# flowcell's control lane.
# NOTE(review): the bodies of both branches (embedded lines 388-392) are
# missing from this extraction -- presumably the Y/N (or similar) return
# values for the Control column; recover from the original file.
386 def format_control_lane(options, flowcell_info, sample):
387 if sample['lane_number'] == flowcell_info['control_lane']:
def format_recipe_name(options, flowcell_info, sample):
    """Return the recipe name supplied on the command line (--recipe).

    flowcell_info and sample are ignored; the signature matches the
    other sample-sheet field callables.
    """
    return getattr(options, 'recipe')
def format_operator_name(options, flowcell_info, sample):
    """Return the sequencer operator name supplied on the command line.

    flowcell_info and sample are ignored; the signature matches the
    other sample-sheet field callables.
    """
    return getattr(options, 'operator')
# Expand one library's shared sample-sheet row into one row per index
# sequence. Three cases on library['index_sequence']: missing/None,
# a string starting with 'err' (error marker -- fall back to the plain
# library id as SampleProject), or a dict of multiplex_id -> sequence
# (one row per id, sorted numerically); anything else raises.
# NOTE(review): embedded lines 404, 407, 409, 411, 415, 419, 421-422 are
# missing from this extraction -- the `pooled` list initialisation, the
# per-branch appends/returns, the `sample` dict creation inside the loop,
# the second argument of the format_project_name call, and the final
# return; recover from the original file.
# NOTE(review): py2-only constructs here (types.StringTypes,
# types.DictType, list.sort on dict.keys()) will need attention in any
# py3 migration.
401 def format_pooled_libraries(shared, library):
402 sequences = library.get('index_sequence', None)
403 if sequences is None:
405 elif (type(sequences) in types.StringTypes and
406 sequences.lower().startswith('err')):
408 shared['SampleProject'] = library['library_id']
410 elif (type(sequences) == types.DictType):
412 multiplex_ids = sequences.keys()
413 multiplex_ids.sort(key=int)
414 for multiplex_id in multiplex_ids:
416 sample.update(shared)
417 sample['Index'] = sequences[multiplex_id]
418 sample['SampleProject'] = format_project_name(library,
420 pooled.append(sample)
423 raise RuntimeError("Unrecognized index type")
def format_project_name(library, multiplex_id):
    """Build the SampleProject value: '<library_id>_index<multiplex_id>'."""
    return '{0}_index{1}'.format(library['library_id'], multiplex_id)