4 from ConfigParser import RawConfigParser
6 from optparse import OptionParser, IndentedHelpFormatter
15 except ImportError, e:
16 import simplejson as json
18 from htsworkflow.frontend.auth import apidata
19 from htsworkflow.util import api
20 from htsworkflow.util.url import normalize_url
21 from htsworkflow.pipelines.genome_mapper import \
22 getAvailableGenomes, \
24 from htsworkflow.pipelines.runfolder import LANE_LIST
# JSON object keys are always strings, so keep a string version of every
# lane number for indexing into decoded flowcell data.
LANE_LIST_JSON = [str(lane) for lane in LANE_LIST]

# Docstring markup declaration; the value must be spelled "restructuredtext"
# for documentation tools to recognise it.
__docformat__ = "restructuredtext en"

# Configuration files read by constructConfigParser(): the system-wide file
# first, then the user specific one (which overrides the system settings).
CONFIG_SYSTEM = '/etc/htsworkflow.ini'
CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini')
# Name of the ini section holding this tool's settings.
GERALD_CONFIG_SECTION = 'gerald_config'

# Disable or enable commandline arg parsing; disabled by default.
DISABLE_CMDLINE = True
class FlowCellNotFound(Exception):
    """Signal that information about a flowcell could not be obtained.

    In this module it is raised with a human readable message when the
    server answers 403 or 404, or when the response body is empty.
    """
class WebError404(Exception):
    """Exception type for an HTTP 404 response (judging by its name).

    NOTE(review): nothing in the visible part of this module raises it;
    confirm whether other code still depends on it.
    """
40 def retrieve_flowcell_info(base_host_url, flowcell):
42 Return a dictionary describing a flowcell
44 url = api.flowcell_url(base_host_url, flowcell)
47 apipayload = urllib.urlencode(apidata)
48 web = urllib2.urlopen(url, apipayload)
49 except urllib2.URLError, e:
50 errmsg = 'URLError: %d %s' % (e.code, e.msg)
52 logging.error('opened %s' % (url,))
59 msg = "403 - Forbbidden, probably need api key"
60 raise FlowCellNotFound(msg)
63 msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
64 "Did you get right port #?" % (flowcell, base_host_url, url)
65 raise FlowCellNotFound(msg)
67 if len(contents) == 0:
68 msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
69 raise FlowCellNotFound(msg)
71 data = json.loads(contents)
74 def is_sequencing(lane_info):
76 Determine if we are just sequencing and not doing any follow-up analysis
78 if lane_info['experiment_type'] in ('De Novo','Whole Genome'):
83 def group_lane_parameters(flowcell_info):
85 group lanes that can share GERALD configuration blocks.
87 (The same species, read length, and eland vs sequencing)
90 for lane_number, lane_contents in flowcell_info['lane_set'].items():
91 for lane_info in lane_contents:
92 index = (lane_info['read_length'],
93 lane_info['library_species'],
94 is_sequencing(lane_info))
95 lane_groups.setdefault(index, []).append(lane_number)
98 def format_gerald_header(flowcell_info):
100 Generate comment describing the contents of the flowcell
102 # I'm using '\n# ' to join the lines together, that doesn't include the
103 # first element so i needed to put the # in manually
104 config = ['# FLOWCELL: %s' % (flowcell_info['flowcell_id'])]
106 config += ['CONTROL-LANE: %s' % (flowcell_info['control_lane'],)]
108 config += ['Flowcell Notes:']
109 config.extend(flowcell_info['notes'].split('\r\n'))
111 for lane_number in LANE_LIST_JSON:
112 lane_contents = flowcell_info['lane_set'][lane_number]
113 for lane_info in lane_contents:
114 config += ['Lane%s: %s | %s' % (lane_number,
115 lane_info['library_id'],
116 lane_info['library_name'])]
119 return "\n# ".join(config)
121 def format_gerald_config(options, flowcell_info, genome_map):
123 Generate a GERALD config file
125 # so we can add nothing or _pair if we're a paired end run
126 eland_analysis_suffix = { False: "_extended", True: "_pair" }
127 sequence_analysis_suffix = { False: "", True: "_pair" }
129 # it's convenient to have helpful information describing the flowcell
130 # in the config file... things like which lane is which library.
131 config = [format_gerald_header(flowcell_info)]
133 config += ['SEQUENCE_FORMAT --fastq']
134 config += ['ELAND_SET_SIZE 20']
135 config += ['12345678:WITH_SEQUENCE true']
136 analysis_suffix = eland_analysis_suffix[flowcell_info['paired_end']]
137 sequence_suffix = sequence_analysis_suffix[flowcell_info['paired_end']]
138 lane_groups = group_lane_parameters(flowcell_info)
139 for lane_index, lane_numbers in lane_groups.items():
140 # lane_index is return value of group_lane_parameters
141 read_length, species, is_sequencing = lane_index
143 lane_prefix = u"".join(lane_numbers)
145 species_path = genome_map.get(species, None)
146 logging.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
147 if not is_sequencing and species_path is None:
148 no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
149 logging.warning(no_genome_msg % (lane_numbers, species))
153 config += ['%s:ANALYSIS sequence%s' % (lane_prefix, sequence_suffix)]
155 config += ['%s:ANALYSIS eland%s' % (lane_prefix, analysis_suffix)]
156 config += ['%s:ELAND_GENOME %s' % (lane_prefix, species_path) ]
157 #config += ['%s:READ_LENGTH %s' % ( lane_prefix, read_length ) ]
158 config += ['%s:USE_BASES Y%s' % ( lane_prefix, read_length ) ]
160 # add in option for running script after
161 if not (options.post_run is None or options.runfolder is None):
162 runfolder = os.path.abspath(options.runfolder)
163 post_run = options.post_run % {'runfolder': runfolder}
164 config += ['POST_RUN_COMMAND %s' % (post_run,) ]
166 config += [''] # force trailing newline
168 return "\n".join(config)
172 Used when command line parsing is disabled; default
176 self.output_filepath = None
178 self.genome_dir = None
180 class PreformattedDescriptionFormatter(IndentedHelpFormatter):
182 #def format_description(self, description):
185 # return description + "\n"
189 def format_epilog(self, epilog):
191 It was removing my preformatted epilog, so this should override
192 that behavior! Muhahaha!
195 return "\n" + epilog + "\n"
200 def constructOptionParser():
202 returns a pre-setup optparser
204 parser = OptionParser(formatter=PreformattedDescriptionFormatter())
206 parser.set_description('Retrieves eland config file from hts_frontend web frontend.')
211 * %s (User specific; overrides system)
212 * command line overrides all config file options
217 config_host: http://somewhere.domain:port
218 genome_dir: /path to search for genomes
219 post_run: runfolder -o <destdir> %%(runfolder)s
221 """ % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)
223 #Special formatter for allowing preformatted description.
224 ##parser.format_epilog(PreformattedDescriptionFormatter())
226 parser.add_option("-u", "--url",
227 action="store", type="string", dest="url")
229 parser.add_option("-o", "--output-file",
230 action="store", type="string", dest="output_filepath",
231 help="config file destination. If runfolder is specified defaults "
232 "to <runfolder>/config-auto.txt" )
234 parser.add_option("-f", "--flowcell",
235 action="store", type="string", dest="flowcell")
237 parser.add_option("-g", "--genome_dir",
238 action="store", type="string", dest="genome_dir")
240 parser.add_option("-r", "--runfolder",
241 action="store", type="string",
242 help="specify runfolder for post_run command ")
244 parser.add_option("--sample-sheet", default=None,
245 help="path to save demultiplexing sample sheet")
247 parser.add_option("--operator", default='', help="Name of sequencer operator")
248 parser.add_option("--recipe", default="Unknown",
249 help="specify recipe name")
251 parser.add_option('-v', '--verbose', action='store_true', default=False,
252 help='increase logging verbosity')
255 def constructConfigParser():
257 returns a pre-setup config parser
259 parser = RawConfigParser()
260 parser.read([CONFIG_SYSTEM, CONFIG_USER])
261 if not parser.has_section(GERALD_CONFIG_SECTION):
262 parser.add_section(GERALD_CONFIG_SECTION)
267 def getCombinedOptions(argv=None):
269 Returns optparse options after they have been updated with ConfigParser
270 config files and merged with parsed commandline options.
272 expects command line arguments to be passed in
274 cl_parser = constructOptionParser()
275 conf_parser = constructConfigParser()
278 options = DummyOptions()
280 options, args = cl_parser.parse_args(argv)
282 if options.url is None:
283 if conf_parser.has_option(GERALD_CONFIG_SECTION, 'config_host'):
284 options.url = conf_parser.get(GERALD_CONFIG_SECTION, 'config_host')
286 options.url = normalize_url(options.url)
288 if options.genome_dir is None:
289 if conf_parser.has_option(GERALD_CONFIG_SECTION, 'genome_dir'):
290 options.genome_dir = conf_parser.get(GERALD_CONFIG_SECTION, 'genome_dir')
292 if conf_parser.has_option(GERALD_CONFIG_SECTION, 'post_run'):
293 options.post_run = conf_parser.get(GERALD_CONFIG_SECTION, 'post_run')
295 options.post_run = None
297 if options.output_filepath is None:
298 if options.runfolder is not None:
299 options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')
304 def saveConfigFile(options):
306 retrieves the flowcell eland config file, given the base_host_url
307 (i.e. http://sub.domain.edu:port)
309 logging.info('USING OPTIONS:')
310 logging.info(u' URL: %s' % (options.url,))
311 logging.info(u' OUT: %s' % (options.output_filepath,))
312 logging.info(u' FC: %s' % (options.flowcell,))
313 #logging.info(': %s' % (options.genome_dir,))
314 logging.info(u'post_run: %s' % ( unicode(options.post_run),))
316 flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)
318 logging.debug('genome_dir: %s' % ( options.genome_dir, ))
319 available_genomes = getAvailableGenomes(options.genome_dir)
320 genome_map = constructMapperDict(available_genomes)
321 logging.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))
323 #config = format_gerald_config(options, flowcell_info, genome_map)
325 #if options.output_filepath is not None:
326 # outstream = open(options.output_filepath, 'w')
327 # logging.info('Writing config file to %s' % (options.output_filepath,))
329 # outstream = sys.stdout
331 #outstream.write(config)
333 if options.sample_sheet is None:
335 elif options.sample_sheet == '-':
336 save_sample_sheet(sys.stdout, options, flowcell_info)
338 stream = open(options.sample_sheet,'w')
339 save_sample_sheet(stream, options, flowcell_info)
342 def save_sample_sheet(outstream, options, flowcell_info):
343 sample_sheet_fields = ['FCID', 'Lane', 'SampleID', 'SampleRef', 'Index',
344 'Description', 'Control', 'Recipe', 'Operator',
346 illumina_to_htsw_map = {'FCID': 'flowcell',
347 'Lane': 'lane_number',
348 'SampleID': 'library_id',
349 'SampleRef': format_sampleref,
350 'Description': 'library_name',
351 'Control': format_control_lane,
352 'Recipe': format_recipe_name,
353 'Operator': format_operator_name}
354 out = csv.DictWriter(outstream, sample_sheet_fields)
356 for lane_number in LANE_LIST:
357 lane_contents = flowcell_info['lane_set'][str(lane_number)]
359 pooled_lane_contents = []
360 for library in lane_contents:
361 # build common attributes
363 for illumina_name in sample_sheet_fields:
364 htsw_field = illumina_to_htsw_map.get(illumina_name, None)
365 if htsw_field is None:
367 if callable(htsw_field):
368 renamed[illumina_name] = htsw_field(options,
372 renamed[illumina_name] = library[htsw_field]
374 pooled_lane_contents.extend(format_pooled_libraries(renamed, library))
376 if len(pooled_lane_contents) > 1:
377 for row in pooled_lane_contents:
def format_sampleref(options, flowcell_info, sample):
    """Return the sample's species name with spaces replaced by underscores.

    Serves as the 'SampleRef' column formatter for the sample sheet.

    :param options: unused; present so all column formatters share one
                    signature
    :param flowcell_info: unused; see ``options``
    :param sample: mapping with a ``'library_species'`` string entry
    :return: the species name, every space turned into ``'_'``
    """
    species_words = sample['library_species'].split(' ')
    return '_'.join(species_words)
385 def format_control_lane(options, flowcell_info, sample):
386 if sample['lane_number'] == flowcell_info['control_lane']:
def format_recipe_name(options, flowcell_info, sample):
    """Return the recipe name stored on the options object.

    Serves as the 'Recipe' column formatter for the sample sheet.

    :param options: object exposing a ``recipe`` attribute
    :param flowcell_info: unused; present so all column formatters share
                          one signature
    :param sample: unused; see ``flowcell_info``
    :return: ``options.recipe`` unchanged
    """
    recipe_name = options.recipe
    return recipe_name
def format_operator_name(options, flowcell_info, sample):
    """Return the sequencer operator name stored on the options object.

    Serves as the 'Operator' column formatter for the sample sheet.

    :param options: object exposing an ``operator`` attribute
    :param flowcell_info: unused; present so all column formatters share
                          one signature
    :param sample: unused; see ``flowcell_info``
    :return: ``options.operator`` unchanged
    """
    operator_name = options.operator
    return operator_name
400 def format_pooled_libraries(shared, library):
401 sequences = library.get('index_sequence', None)
402 if sequences is None:
404 elif type(sequences) in types.StringTypes:
405 shared['Index'] = sequences
406 shared['SampleProject'] = library['library_id']
410 multiplex_ids = sequences.keys()
411 multiplex_ids.sort(key=int)
412 for multiplex_id in multiplex_ids:
414 sample.update(shared)
415 sample['Index'] = sequences[multiplex_id]
416 sample['SampleProject'] = format_project_name(library,
418 pooled.append(sample)
def format_project_name(library, multiplex_id):
    """Build the project name for one multiplexed sample of a library.

    :param library: mapping with a ``'library_id'`` entry
    :param multiplex_id: identifier of the index within the library
    :return: ``"<library_id>_index<multiplex_id>"``
    """
    parts = {'library': library['library_id'], 'multiplex': multiplex_id}
    return "%(library)s_index%(multiplex)s" % parts