Merge branch 'django1.4'
[htsworkflow.git] / htsworkflow / pipelines / retrieve_config.py
1 #!/usr/bin/env python
2
3 import csv
4 from ConfigParser import RawConfigParser
5 import logging
6 from optparse import OptionParser, IndentedHelpFormatter
7 import os
8 import sys
9 import types
10 import urllib
11 import urllib2
12
13 try:
14     import json
15 except ImportError, e:
16     import simplejson as json
17
18 from htsworkflow.frontend.auth import apidata
19 from htsworkflow.util import api
20 from htsworkflow.util import alphanum
21 from htsworkflow.util.url import normalize_url
22 from htsworkflow.pipelines.genome_mapper import \
23      getAvailableGenomes, \
24      constructMapperDict
25 from htsworkflow.pipelines import LANE_LIST
# JSON dictionaries use strings as keys, so keep the lane list as strings too
LANE_LIST_JSON = [str(lane) for lane in LANE_LIST]
28
LOGGER = logging.getLogger(__name__)

# Markup language used by the docstrings of this module.
__docformat__ = "restructuredtext en"

# Configuration files read by constructConfigParser(): the system wide file
# first, then the user specific one.
CONFIG_SYSTEM = '/etc/htsworkflow.ini'
CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini')
# Name of the configuration file section holding the options used here.
GERALD_CONFIG_SECTION = 'gerald_config'

# Disable or enable commandline arg parsing; disabled by default.
DISABLE_CMDLINE = True
39
class FlowCellNotFound(Exception):
    """
    Raised by retrieve_flowcell_info when the server answers with a 403,
    a 404, or an empty body for the requested flowcell.
    """
class WebError404(Exception):
    """
    Exception type for a "404 - Not Found" answer from the web server.

    NOTE(review): nothing in this module raises it; presumably kept for
    callers elsewhere -- confirm before removing.
    """
42
def retrieve_flowcell_info(base_host_url, flowcell):
    """
    Return a dictionary describing a flowcell.

    The url is built with api.flowcell_url() and opened with the url-encoded
    apidata as the request body; the body of the answer is decoded as JSON.

    :param base_host_url: base url of the server, handed to api.flowcell_url()
    :param flowcell: flowcell identifier, handed to api.flowcell_url()
    :returns: the decoded JSON answer
    :raises IOError: when the url could not be opened
    :raises FlowCellNotFound: when the answer has status 403 or 404, or an
        empty body
    """
    url = api.flowcell_url(base_host_url, flowcell)

    try:
        apipayload = urllib.urlencode(apidata)
        web = urllib2.urlopen(url, apipayload)
    except urllib2.URLError as e:
        # Only HTTPError carries code and msg; a plain URLError (connection
        # refused, unknown host, ...) just has a reason.
        if hasattr(e, 'code'):
            errmsg = 'URLError: %d %s' % (e.code, e.msg)
        else:
            errmsg = 'URLError: %s' % (e.reason,)
        LOGGER.error(errmsg)
        LOGGER.error('opened %s' % (url,))
        raise IOError(errmsg)

    # read everything needed from the response, then release the connection
    try:
        contents = web.read()
        status = web.code
    finally:
        web.close()

    if status == 403:
        msg = "403 - Forbbidden, probably need api key"
        raise FlowCellNotFound(msg)

    if status == 404:
        msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
              "Did you get right port #?" % (flowcell, base_host_url, url)
        raise FlowCellNotFound(msg)

    if len(contents) == 0:
        msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
        raise FlowCellNotFound(msg)

    return json.loads(contents)
76
def is_sequencing(lane_info):
    """
    Tell whether a lane is only sequenced, without any follow-up analysis.

    True when the lane's 'experiment_type' is 'De Novo' or 'Whole Genome'.
    """
    sequencing_only_types = ('De Novo', 'Whole Genome')
    return lane_info['experiment_type'] in sequencing_only_types
85
def group_lane_parameters(flowcell_info):
    """
    Group lanes that can share GERALD configuration blocks.

    (The same species, read length, and eland vs sequencing)

    :param flowcell_info: flowcell description; its 'lane_set' maps each
        lane number to the list of libraries in that lane
    :returns: dictionary mapping (read_length, library_species,
        is_sequencing) to the list of lane numbers with those parameters
    """
    lane_groups = {}
    for lane_number, lane_contents in flowcell_info['lane_set'].items():
        for lane_info in lane_contents:
            index = (lane_info['read_length'],
                     lane_info['library_species'],
                     is_sequencing(lane_info))
            members = lane_groups.setdefault(index, [])
            # a lane holding several libraries with the same parameters
            # must still be listed only once in its group
            if lane_number not in members:
                members.append(lane_number)
    return lane_groups
100
def format_gerald_header(flowcell_info):
    """
    Generate comment describing the contents of the flowcell

    The comment lists the flowcell id, the control lane, the flowcell notes
    and, for every lane, the id and name of each library in it.

    :param flowcell_info: flowcell description with the keys 'flowcell_id',
        'control_lane', 'notes' and 'lane_set'
    :returns: the comment block as one string, every line starting with '#'
    """
    # Notes may use any line ending convention (or be missing); every
    # note line has to end up on its own commented line.
    notes = flowcell_info['notes'] or ''
    note_lines = notes.replace('\r\n', '\n').replace('\r', '\n').split('\n')

    # The lines are joined with '\n# ', which does not prefix the first
    # element, so that one carries its '# ' explicitly.
    config = ['# FLOWCELL: %s' % (flowcell_info['flowcell_id'])]
    config.append('')
    config.append('CONTROL-LANE: %s' % (flowcell_info['control_lane'],))
    config.append('')
    config.append('Flowcell Notes:')
    config.extend(note_lines)
    config.append('')
    for lane_number in sorted(flowcell_info['lane_set']):
        for lane_info in flowcell_info['lane_set'][lane_number]:
            config.append('Lane%s: %s | %s' % (lane_number,
                                               lane_info['library_id'],
                                               lane_info['library_name']))

    config.append('')
    return "\n# ".join(config)
123
def format_gerald_config(options, flowcell_info, genome_map):
    """
    Build the text of a GERALD config file for a flowcell.

    :param options: object with the attributes post_run and runfolder
    :param flowcell_info: flowcell description, see format_gerald_header
        and group_lane_parameters for the keys that are used
    :param genome_map: dictionary mapping a species name to the location
        of its genome
    :returns: the contents of the config file as a single string
    """
    # It is convenient to have helpful information describing the flowcell
    # in the config file... things like which lane is which library.
    lines = [format_gerald_header(flowcell_info)]
    lines.extend(['SEQUENCE_FORMAT --fastq',
                  'ELAND_SET_SIZE 20',
                  '12345678:WITH_SEQUENCE true'])

    # paired end runs use the "_pair" variant of either analysis
    paired_end = flowcell_info['paired_end']
    eland_suffix = {False: "_extended", True: "_pair"}[paired_end]
    sequence_suffix = {False: "", True: "_pair"}[paired_end]

    for group_key, lanes in group_lane_parameters(flowcell_info).items():
        # group_key is the index tuple built by group_lane_parameters
        read_length, species, sequence_only = group_key
        lanes.sort()
        prefix = u"".join(lanes)

        genome_path = genome_map.get(species, None)
        LOGGER.debug("Looked for genome '%s' got location '%s'" % (species, genome_path))
        if genome_path is None and not sequence_only:
            # eland cannot run without a genome, fall back to sequencing
            LOGGER.warning(
                "Forcing lanes %s to sequencing as there is no genome for %s"
                % (lanes, species))
            sequence_only = True

        if sequence_only:
            lines.append('%s:ANALYSIS sequence%s' % (prefix, sequence_suffix))
        else:
            lines.append('%s:ANALYSIS eland%s' % (prefix, eland_suffix))
            lines.append('%s:ELAND_GENOME %s' % (prefix, genome_path))
        lines.append('%s:USE_BASES Y%s' % (prefix, read_length))

    # optionally run a script once the analysis has finished
    if options.post_run is not None and options.runfolder is not None:
        substitutions = {'runfolder': os.path.abspath(options.runfolder)}
        lines.append('POST_RUN_COMMAND %s' % (options.post_run % substitutions,))

    lines.append('')  # force trailing newline

    return "\n".join(lines)
172
class DummyOptions:
    """
    Used when command line parsing is disabled; default

    Stands in for the options object optparse would return, so it carries
    every option registered by constructOptionParser with the default the
    parser would give it.
    """
    def __init__(self):
        self.url = None
        self.output_filepath = None
        self.flowcell = None
        self.genome_dir = None
        self.runfolder = None
        self.sample_sheet = None
        self.operator = ''
        self.recipe = 'Unknown'
        self.verbose = False
        # not a command line option; getCombinedOptions sets it from the
        # config file
        self.post_run = None
182
class PreformattedDescriptionFormatter(IndentedHelpFormatter):
    """
    Help formatter that prints the epilog exactly as it was written.
    """

    def format_epilog(self, epilog):
        """
        Return the epilog unchanged apart from a newline added before and
        after it, so a preformatted epilog keeps its layout. An empty or
        missing epilog gives an empty string.
        """
        if not epilog:
            return ""
        return "\n" + epilog + "\n"
201
202
def constructOptionParser():
    """
    Build the command line parser with all of its options registered.

    :returns: the OptionParser, ready for parse_args
    """
    config_file_help = """
Config File:
  * %s (System wide)
  * %s (User specific; overrides system)
  * command line overrides all config file options

  Example Config File:

    [%s]
    config_host: http://somewhere.domain:port
    genome_dir: /path to search for genomes
    post_run: runfolder -o <destdir> %%(runfolder)s

""" % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)

    # the special formatter keeps the layout of the preformatted epilog
    option_parser = OptionParser(
        description='Retrieves eland config file from hts_frontend web frontend.',
        epilog=config_file_help,
        formatter=PreformattedDescriptionFormatter())

    # options that store a string; "store" and "string" are the optparse
    # defaults for action and type
    option_parser.add_option("-u", "--url", dest="url")
    option_parser.add_option(
        "-o", "--output-file", dest="output_filepath",
        help="config file destination. If runfolder is specified defaults "
             "to <runfolder>/config-auto.txt")
    option_parser.add_option("-f", "--flowcell", dest="flowcell")
    option_parser.add_option("-g", "--genome_dir", dest="genome_dir")
    option_parser.add_option(
        "-r", "--runfolder",
        help="specify runfolder for post_run command ")

    option_parser.add_option(
        "--sample-sheet", default=None,
        help="path to save demultiplexing sample sheet")
    option_parser.add_option(
        "--operator", default='',
        help="Name of sequencer operator")
    option_parser.add_option(
        "--recipe", default="Unknown",
        help="specify recipe name")

    option_parser.add_option(
        '-v', '--verbose', action='store_true', default=False,
        help='increase logging verbosity')
    return option_parser
257
def constructConfigParser():
    """
    Load the system wide and user specific config files.

    :returns: a RawConfigParser holding whatever the files provided; the
        GERALD config section always exists, even if no file defined it
    """
    config = RawConfigParser()
    config.read([CONFIG_SYSTEM, CONFIG_USER])
    if GERALD_CONFIG_SECTION not in config.sections():
        config.add_section(GERALD_CONFIG_SECTION)
    return config
268
269
def getCombinedOptions(argv=None):
    """
    Returns optparse options after they have been updated with ConfigParser
    config files and merged with parsed commandline options.

    Values given on the command line win over the config files for url and
    genome_dir; post_run always comes from the config files. When no output
    file is given but a runfolder is, the output file defaults to
    <runfolder>/config-auto.txt.

    :param argv: command line arguments to parse, or None to skip command
        line parsing and start from the parser defaults
    :returns: the combined options object
    """
    cl_parser = constructOptionParser()
    conf_parser = constructConfigParser()

    if argv is None:
        options = DummyOptions()
        # The stand-in has to offer every option the parser knows about,
        # otherwise reading e.g. options.runfolder fails below.
        for name, value in vars(cl_parser.get_default_values()).items():
            if not hasattr(options, name):
                setattr(options, name, value)
    else:
        options, _ = cl_parser.parse_args(argv)

    if options.url is None:
        if conf_parser.has_option(GERALD_CONFIG_SECTION, 'config_host'):
            options.url = conf_parser.get(GERALD_CONFIG_SECTION, 'config_host')

    options.url = normalize_url(options.url)

    if options.genome_dir is None:
        if conf_parser.has_option(GERALD_CONFIG_SECTION, 'genome_dir'):
            options.genome_dir = conf_parser.get(GERALD_CONFIG_SECTION, 'genome_dir')

    if conf_parser.has_option(GERALD_CONFIG_SECTION, 'post_run'):
        options.post_run = conf_parser.get(GERALD_CONFIG_SECTION, 'post_run')
    else:
        options.post_run = None

    if options.output_filepath is None:
        if options.runfolder is not None:
            options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')

    return options
305
306
def saveConfigFile(options):
    """
    Retrieves the flowcell description from the server at options.url
    (i.e. http://sub.domain.edu:port) and writes the eland config file.

    The config goes to options.output_filepath, or to stdout when no path
    is set. When options.sample_sheet is set the demultiplexing sample
    sheet is written as well; '-' sends it to stdout.

    :param options: combined options, as returned by getCombinedOptions
    """
    LOGGER.info('USING OPTIONS:')
    LOGGER.info(u'     URL: %s' % (options.url,))
    LOGGER.info(u'     OUT: %s' % (options.output_filepath,))
    LOGGER.info(u'      FC: %s' % (options.flowcell,))
    LOGGER.info(u'post_run: %s' % (unicode(options.post_run),))

    flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)

    LOGGER.debug('genome_dir: %s' % (options.genome_dir,))
    available_genomes = getAvailableGenomes(options.genome_dir)
    genome_map = constructMapperDict(available_genomes)
    LOGGER.debug('available genomes: %s' % (unicode(genome_map.keys()),))

    config = format_gerald_config(options, flowcell_info, genome_map)

    if options.output_filepath is not None:
        outstream = open(options.output_filepath, 'w')
        # close the file even if writing fails, so the data is flushed and
        # the handle released
        try:
            LOGGER.info('Writing config file to %s' % (options.output_filepath,))
            outstream.write(config)
        finally:
            outstream.close()
    else:
        sys.stdout.write(config)

    if options.sample_sheet is None:
        return

    if options.sample_sheet == '-':
        save_sample_sheet(sys.stdout, options, flowcell_info)
    else:
        stream = open(options.sample_sheet, 'w')
        try:
            save_sample_sheet(stream, options, flowcell_info)
        finally:
            stream.close()
343
344
def save_sample_sheet(outstream, options, flowcell_info):
    """
    Write the sample sheet of a flowcell as CSV.

    A header row is written first, then the rows of each lane in sorted
    lane order.

    :param outstream: file-like object receiving the CSV text
    :param options: options object handed to the column formatters
    :param flowcell_info: flowcell description; its 'lane_set' maps each
        lane number to the list of libraries in that lane
    """
    columns = ['FCID', 'Lane', 'SampleID', 'SampleRef', 'Index',
               'Description', 'Control', 'Recipe', 'Operator',
               'SampleProject']
    # Column name -> key to copy from the library, or a function called
    # with (options, flowcell_info, library) that computes the value.
    # 'Index' and 'SampleProject' are absent: format_pooled_libraries
    # fills them in.
    column_sources = {'FCID': 'flowcell',
                      'Lane': 'lane_number',
                      'SampleID': 'library_id',
                      'SampleRef': format_sampleref,
                      'Description': 'library_name',
                      'Control': format_control_lane,
                      'Recipe': format_recipe_name,
                      'Operator': format_operator_name}

    writer = csv.DictWriter(outstream, columns)
    writer.writerow(dict(zip(columns, columns)))  # header row

    lane_set = flowcell_info['lane_set']
    for lane_number in sorted(lane_set):
        lane_rows = []
        for library in lane_set[lane_number]:
            # values shared by every row generated for this library
            common = {}
            for column in columns:
                if column not in column_sources:
                    continue
                source = column_sources[column]
                if callable(source):
                    common[column] = source(options, flowcell_info, library)
                else:
                    common[column] = library[source]

            lane_rows.extend(format_pooled_libraries(common, library))

        for row in lane_rows:
            writer.writerow(row)
381
382
def format_sampleref(options, flowcell_info, sample):
    """
    Return the sample's 'library_species' with spaces turned into
    underscores; options and flowcell_info are not used.
    """
    species = sample['library_species']
    return species.replace(' ', '_')
385
386
def format_control_lane(options, flowcell_info, sample):
    """
    Return 'Y' when the sample's 'lane_number' equals the flowcell's
    'control_lane', 'N' otherwise; options is not used.
    """
    is_control = sample['lane_number'] == flowcell_info['control_lane']
    return 'Y' if is_control else 'N'
392
393
def format_recipe_name(options, flowcell_info, sample):
    """
    Return the recipe name stored in options.recipe; flowcell_info and
    sample are not used.
    """
    recipe = options.recipe
    return recipe
396
397
def format_operator_name(options, flowcell_info, sample):
    """
    Return the operator name stored in options.operator; flowcell_info and
    sample are not used.
    """
    operator_name = options.operator
    return operator_name
400
401
def format_pooled_libraries(shared, library):
    """
    Build the sample sheet rows for one library.

    :param shared: dictionary of column values common to all rows of the
        library; it is updated in place and returned as the only row when
        the index sequence is an error string
    :param library: library description; 'index_sequence' and 'library_id'
        are used
    :returns: an empty list when the library has no 'index_sequence'; one
        row with an empty Index when the index sequence is a string
        starting with 'err' (any case); one row per multiplex id, in
        alphanum order, when it is a dictionary
    :raises RuntimeError: for any other kind of index sequence
    """
    sequences = library.get('index_sequence', None)
    if sequences is None:
        return []

    # isinstance also accepts subclasses of the string and dict types
    if (isinstance(sequences, types.StringTypes) and
            sequences.lower().startswith('err')):
        shared['Index'] = ''
        shared['SampleProject'] = library['library_id']
        return [shared]

    if isinstance(sequences, dict):
        pooled = []
        for multiplex_id in sorted(sequences.keys(), cmp=alphanum.alphanum):
            sample = dict(shared)
            sample['Index'] = sequences[multiplex_id]
            sample['SampleProject'] = format_project_name(library,
                                                          multiplex_id)
            pooled.append(sample)
        return pooled

    raise RuntimeError("Unrecognized index type")
425
426
427
def format_project_name(library, multiplex_id):
    """
    Return the project name of one multiplexed sample: the library's
    'library_id' followed by '_index' and the multiplex id.
    """
    return "%s_index%s" % (library['library_id'], multiplex_id)
431
432