Add support for tracking the multiplex index sequence.
htsworkflow/pipelines/retrieve_config.py
#!/usr/bin/env python

import csv
from ConfigParser import RawConfigParser
import logging
from optparse import OptionParser, IndentedHelpFormatter
import os
import sys
import types
import urllib
import urllib2

try:
    import json
except ImportError, e:
    import simplejson as json

from htsworkflow.frontend.auth import apidata
from htsworkflow.util import api
from htsworkflow.util.url import normalize_url
from htsworkflow.pipelines.genome_mapper import \
     getAvailableGenomes, \
     constructMapperDict
from htsworkflow.pipelines.runfolder import LANE_LIST
# JSON dictionaries use strings
LANE_LIST_JSON = [ str(l) for l in LANE_LIST ]

__docformat__ = "restructuredtext en"

CONFIG_SYSTEM = '/etc/htsworkflow.ini'
CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini')
GERALD_CONFIG_SECTION = 'gerald_config'

# Disable or enable command line arg parsing; disabled by default.
DISABLE_CMDLINE = True

class FlowCellNotFound(Exception): pass
class WebError404(Exception): pass

def retrieve_flowcell_info(base_host_url, flowcell):
    """
    Return a dictionary describing the requested flowcell.
    """
    url = api.flowcell_url(base_host_url, flowcell)

    try:
        apipayload = urllib.urlencode(apidata)
        web = urllib2.urlopen(url, apipayload)
    except urllib2.HTTPError, e:
        # urllib2 raises HTTPError for 4xx/5xx responses, so translate the
        # interesting status codes into FlowCellNotFound here.
        if e.code == 403:
            raise FlowCellNotFound("403 - Forbidden, probably need an api key")
        if e.code == 404:
            msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
                  "Did you use the right port number?" % (flowcell, base_host_url, url)
            raise FlowCellNotFound(msg)
        errmsg = 'HTTPError: %d %s' % (e.code, e.msg)
        logging.error(errmsg)
        logging.error('opened %s' % (url,))
        raise IOError(errmsg)
    except urllib2.URLError, e:
        errmsg = 'URLError: %s' % (e.reason,)
        logging.error(errmsg)
        logging.error('opened %s' % (url,))
        raise IOError(errmsg)

    contents = web.read()
    headers = web.info()

    if len(contents) == 0:
        msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
        raise FlowCellNotFound(msg)

    data = json.loads(contents)
    return data

def is_sequencing(lane_info):
    """
    Determine if we are just sequencing and not doing any follow-up analysis
    """
    return lane_info['experiment_type'] in ('De Novo', 'Whole Genome')

def group_lane_parameters(flowcell_info):
    """
    Group lanes that can share GERALD configuration blocks.

    (The same species, read length, and eland vs. sequencing)
    """
    lane_groups = {}
    for lane_number, lane_contents in flowcell_info['lane_set'].items():
        for lane_info in lane_contents:
            index = (lane_info['read_length'],
                     lane_info['library_species'],
                     is_sequencing(lane_info))
            lane_groups.setdefault(index, []).append(lane_number)
    return lane_groups

def format_gerald_header(flowcell_info):
    """
    Generate a comment block describing the contents of the flowcell
    """
    # The lines are joined with '\n# ', which doesn't prefix the first
    # element, so the leading '# ' is added to it manually.
    config = ['# FLOWCELL: %s' % (flowcell_info['flowcell_id'])]
    config += ['']
    config += ['CONTROL-LANE: %s' % (flowcell_info['control_lane'],)]
    config += ['']
    config += ['Flowcell Notes:']
    config.extend(flowcell_info['notes'].split('\r\n'))
    config += ['']
    for lane_number in LANE_LIST_JSON:
        lane_contents = flowcell_info['lane_set'][lane_number]
        for lane_info in lane_contents:
            config += ['Lane%s: %s | %s' % (lane_number,
                                            lane_info['library_id'],
                                            lane_info['library_name'])]

    config += ['']
    return "\n# ".join(config)

def format_gerald_config(options, flowcell_info, genome_map):
    """
    Generate a GERALD config file
    """
    # analysis name suffixes, chosen by whether this is a paired end run
    eland_analysis_suffix = { False: "_extended", True: "_pair" }
    sequence_analysis_suffix = { False: "", True: "_pair" }

    # it's convenient to have helpful information describing the flowcell
    # in the config file... things like which lane is which library.
    config = [format_gerald_header(flowcell_info)]

    config += ['SEQUENCE_FORMAT --fastq']
    config += ['ELAND_SET_SIZE 20']
    config += ['12345678:WITH_SEQUENCE true']
    analysis_suffix = eland_analysis_suffix[flowcell_info['paired_end']]
    sequence_suffix = sequence_analysis_suffix[flowcell_info['paired_end']]
    lane_groups = group_lane_parameters(flowcell_info)
    for lane_index, lane_numbers in lane_groups.items():
        # lane_index is the grouping key built by group_lane_parameters
        read_length, species, is_sequencing = lane_index
        lane_numbers.sort()
        lane_prefix = u"".join(lane_numbers)

        species_path = genome_map.get(species, None)
        logging.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
        if not is_sequencing and species_path is None:
            no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
            logging.warning(no_genome_msg % (lane_numbers, species))
            is_sequencing = True

        if is_sequencing:
            config += ['%s:ANALYSIS sequence%s' % (lane_prefix, sequence_suffix)]
        else:
            config += ['%s:ANALYSIS eland%s' % (lane_prefix, analysis_suffix)]
            config += ['%s:ELAND_GENOME %s' % (lane_prefix, species_path) ]
        #config += ['%s:READ_LENGTH %s' % ( lane_prefix, read_length ) ]
        config += ['%s:USE_BASES Y%s' % ( lane_prefix, read_length ) ]

    # add in an option for running a script after the run
    if not (options.post_run is None or options.runfolder is None):
        runfolder = os.path.abspath(options.runfolder)
        post_run = options.post_run % {'runfolder': runfolder}
        config += ['POST_RUN_COMMAND %s' % (post_run,) ]

    config += [''] # force trailing newline

    return "\n".join(config)

class DummyOptions:
  """
  Used when command line parsing is disabled; provides the same defaults
  as the option parser.
  """
  def __init__(self):
    self.url = None
    self.output_filepath = None
    self.flowcell = None
    self.genome_dir = None
    self.runfolder = None
    self.sample_sheet = None
    self.operator = ''
    self.recipe = "Unknown"

class PreformattedDescriptionFormatter(IndentedHelpFormatter):

  #def format_description(self, description):
  #
  #  if description:
  #      return description + "\n"
  #  else:
  #     return ""

  def format_epilog(self, epilog):
    """
    The default formatter was removing my preformatted epilog, so this
    should override that behavior! Muhahaha!
    """
    if epilog:
        return "\n" + epilog + "\n"
    else:
        return ""


def constructOptionParser():
    """
    Return a pre-configured OptionParser.
    """
    parser = OptionParser(formatter=PreformattedDescriptionFormatter())

    parser.set_description('Retrieves eland config file from hts_frontend web frontend.')

    parser.epilog = """
Config File:
  * %s (System wide)
  * %s (User specific; overrides system)
  * command line overrides all config file options

  Example Config File:

    [%s]
    config_host: http://somewhere.domain:port
    genome_dir: /path to search for genomes
    post_run: runfolder -o <destdir> %%(runfolder)s

""" % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)

    #Special formatter for allowing preformatted description.
    ##parser.format_epilog(PreformattedDescriptionFormatter())

    parser.add_option("-u", "--url",
                      action="store", type="string", dest="url")

    parser.add_option("-o", "--output-file",
                      action="store", type="string", dest="output_filepath",
                      help="config file destination. If runfolder is specified defaults "
                           "to <runfolder>/config-auto.txt" )

    parser.add_option("-f", "--flowcell",
                      action="store", type="string", dest="flowcell")

    parser.add_option("-g", "--genome_dir",
                      action="store", type="string", dest="genome_dir")

    parser.add_option("-r", "--runfolder",
                      action="store", type="string",
                      help="specify runfolder for post_run command")

    parser.add_option("--sample-sheet", default=None,
                      help="path to save demultiplexing sample sheet")

    parser.add_option("--operator", default='', help="Name of sequencer operator")
    parser.add_option("--recipe", default="Unknown",
                      help="specify recipe name")

    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='increase logging verbosity')
    return parser

def constructConfigParser():
    """
    Return a RawConfigParser preloaded from the system and user config files.
    """
    parser = RawConfigParser()
    parser.read([CONFIG_SYSTEM, CONFIG_USER])
    if not parser.has_section(GERALD_CONFIG_SECTION):
        parser.add_section(GERALD_CONFIG_SECTION)

    return parser


def getCombinedOptions(argv=None):
    """
    Return optparse options after they have been updated from the
    ConfigParser config files and merged with the parsed command line options.

    Expects the command line arguments to be passed in as argv; if argv is
    None a DummyOptions instance is used instead of parsing the command line.
    """
    cl_parser = constructOptionParser()
    conf_parser = constructConfigParser()

    if argv is None:
        options = DummyOptions()
    else:
        options, args = cl_parser.parse_args(argv)

    if options.url is None:
        if conf_parser.has_option(GERALD_CONFIG_SECTION, 'config_host'):
            options.url = conf_parser.get(GERALD_CONFIG_SECTION, 'config_host')

    options.url = normalize_url(options.url)

    if options.genome_dir is None:
        if conf_parser.has_option(GERALD_CONFIG_SECTION, 'genome_dir'):
            options.genome_dir = conf_parser.get(GERALD_CONFIG_SECTION, 'genome_dir')

    if conf_parser.has_option(GERALD_CONFIG_SECTION, 'post_run'):
        options.post_run = conf_parser.get(GERALD_CONFIG_SECTION, 'post_run')
    else:
        options.post_run = None

    if options.output_filepath is None:
        if options.runfolder is not None:
            options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')

    return options


def saveConfigFile(options):
  """
  Retrieve the flowcell information, given the base_host_url
  (i.e. http://sub.domain.edu:port), and save the demultiplexing
  sample sheet if one was requested.
  """
  logging.info('USING OPTIONS:')
  logging.info(u'     URL: %s' % (options.url,))
  logging.info(u'     OUT: %s' % (options.output_filepath,))
  logging.info(u'      FC: %s' % (options.flowcell,))
  #logging.info(': %s' % (options.genome_dir,))
  logging.info(u'post_run: %s' % ( unicode(options.post_run),))

  flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)

  logging.debug('genome_dir: %s' % ( options.genome_dir, ))
  available_genomes = getAvailableGenomes(options.genome_dir)
  genome_map = constructMapperDict(available_genomes)
  logging.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))

  #config = format_gerald_config(options, flowcell_info, genome_map)
  #
  #if options.output_filepath is not None:
  #    outstream = open(options.output_filepath, 'w')
  #    logging.info('Writing config file to %s' % (options.output_filepath,))
  #else:
  #    outstream = sys.stdout
  #
  #outstream.write(config)

  if options.sample_sheet is None:
      pass
  elif options.sample_sheet == '-':
      save_sample_sheet(sys.stdout, options, flowcell_info)
  else:
      stream = open(options.sample_sheet, 'w')
      save_sample_sheet(stream, options, flowcell_info)


def save_sample_sheet(outstream, options, flowcell_info):
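    """
    Write an Illumina demultiplexing sample sheet (CSV) describing
    flowcell_info to outstream.

    Only lanes that yield more than one indexed entry are written out.
    """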
    sample_sheet_fields = ['FCID', 'Lane', 'SampleID', 'SampleRef', 'Index',
                           'Description', 'Control', 'Recipe', 'Operator',
                           'SampleProject']
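    # Map sample sheet column names onto htsworkflow library fields.
    # Values are either a key into the library dictionary or a callable
    # taking (options, flowcell_info, library) and returning the value.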
    illumina_to_htsw_map = {'FCID': 'flowcell',
                            'Lane': 'lane_number',
                            'SampleID': 'library_id',
                            'SampleRef': format_sampleref,
                            'Description': 'library_name',
                            'Control': format_control_lane,
                            'Recipe': format_recipe_name,
                            'Operator': format_operator_name}
    out = csv.DictWriter(outstream, sample_sheet_fields)
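    # note: DictWriter.writeheader() requires Python 2.7 or newer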
    out.writeheader()
    for lane_number in LANE_LIST:
        lane_contents = flowcell_info['lane_set'][str(lane_number)]

        pooled_lane_contents = []
        for library in lane_contents:
            # build common attributes
            renamed = {}
            for illumina_name in sample_sheet_fields:
                htsw_field = illumina_to_htsw_map.get(illumina_name, None)
                if htsw_field is None:
                    continue
                if callable(htsw_field):
                    renamed[illumina_name] = htsw_field(options,
                                                        flowcell_info,
                                                        library)
                else:
                    renamed[illumina_name] = library[htsw_field]

            pooled_lane_contents.extend(format_pooled_libraries(renamed, library))

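        # Only emit rows for lanes that produced more than one indexed entry.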
        if len(pooled_lane_contents) > 1:
            for row in pooled_lane_contents:
                out.writerow(row)


def format_sampleref(options, flowcell_info, sample):
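    """Return the library species name with spaces replaced by underscores."""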
    return sample['library_species'].replace(' ', '_')


def format_control_lane(options, flowcell_info, sample):
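    """Return 'Y' if this sample's lane is the flowcell control lane, 'N' otherwise."""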
    if sample['lane_number'] == flowcell_info['control_lane']:
        return 'Y'
    else:
        return 'N'


def format_recipe_name(options, flowcell_info, sample):
    return options.recipe


def format_operator_name(options, flowcell_info, sample):
    return options.operator


def format_pooled_libraries(shared, library):
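    """
    Expand the shared sample sheet fields into one row per index sequence.

    library['index_sequence'] may be missing (no rows), a bare sequence
    string (one row), or a dictionary mapping multiplex ids to index
    sequences (one row per multiplex id).
    """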
    sequences = library.get('index_sequence', None)
    if sequences is None:
        return []
    elif isinstance(sequences, types.StringTypes):
        shared['Index'] = sequences
        shared['SampleProject'] = library['library_id']
        return [shared]
    else:
        pooled = []
        multiplex_ids = sequences.keys()
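        # the JSON multiplex ids are strings; sort them numerically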
        multiplex_ids.sort(key=int)
        for multiplex_id in multiplex_ids:
            sample = {}
            sample.update(shared)
            sample['Index'] = sequences[multiplex_id]
            sample['SampleProject'] = format_project_name(library,
                                                          multiplex_id)
            pooled.append(sample)
        return pooled


def format_project_name(library, multiplex_id):
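    """Build the SampleProject name from the library id and multiplex index id."""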
    library_id = library['library_id']
    return "%s_index%s" % (library_id, multiplex_id)

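
# A minimal usage sketch (this module defines no __main__ entry point of
# its own; a wrapper script is assumed to drive it roughly like this):
#
#     import sys
#     from htsworkflow.pipelines import retrieve_config
#
#     options = retrieve_config.getCombinedOptions(sys.argv[1:])
#     retrieve_config.saveConfigFile(options)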