Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow
[htsworkflow.git] / htsworkflow / pipelines / retrieve_config.py
1 #!/usr/bin/env python
2
3 import csv
4 from ConfigParser import RawConfigParser
5 import logging
6 from optparse import OptionParser, IndentedHelpFormatter
7 import os
8 import sys
9 import types
10 import urllib
11 import urllib2
12
13 try:
14     import json
15 except ImportError, e:
16     import simplejson as json
17
18 from htsworkflow.frontend.auth import apidata
19 from htsworkflow.util import api
20 from htsworkflow.util.url import normalize_url
21 from htsworkflow.pipelines.genome_mapper import \
22      getAvailableGenomes, \
23      constructMapperDict
24 from htsworkflow.pipelines.runfolder import LANE_LIST
25 # JSON dictionaries use strings
26 LANE_LIST_JSON = [ str(l) for l in LANE_LIST ]
27
LOGGER = logging.getLogger(__name__)

# Fixed typo: was "restructredtext", which doc tools do not recognize.
__docformat__ = "restructuredtext en"

# Configuration file locations: the system file is read first, then the
# user file, which overrides it (see constructConfigParser).
CONFIG_SYSTEM = '/etc/htsworkflow.ini'
CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini')
GERALD_CONFIG_SECTION = 'gerald_config'

#Disable or enable commandline arg parsing; disabled by default.
DISABLE_CMDLINE = True

class FlowCellNotFound(Exception):
    """Raised when the frontend has no usable data for the requested flowcell."""
    pass

class WebError404(Exception):
    """Raised for a 404 response from the frontend web service."""
    pass
41
42 def retrieve_flowcell_info(base_host_url, flowcell):
43     """
44     Return a dictionary describing a
45     """
46     url = api.flowcell_url(base_host_url, flowcell)
47
48     try:
49         apipayload = urllib.urlencode(apidata)
50         web = urllib2.urlopen(url, apipayload)
51     except urllib2.URLError, e:
52         errmsg = 'URLError: %d %s' % (e.code, e.msg)
53         LOGGER.error(errmsg)
54         LOGGER.error('opened %s' % (url,))
55         raise IOError(errmsg)
56
57     contents = web.read()
58     headers = web.info()
59
60     if web.code == 403:
61         msg = "403 - Forbbidden, probably need api key"
62         raise FlowCellNotFound(msg)
63
64     if web.code == 404:
65         msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
66               "Did you get right port #?" % (flowcell, base_host_url, url)
67         raise FlowCellNotFound(msg)
68
69     if len(contents) == 0:
70         msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
71         raise FlowCellNotFound(msg)
72
73     data = json.loads(contents)
74     return data
75
def is_sequencing(lane_info):
    """
    Return True when the lane is sequencing-only, i.e. its experiment
    type requires no follow-up (alignment) analysis.
    """
    return lane_info['experiment_type'] in ('De Novo', 'Whole Genome')
84
def group_lane_parameters(flowcell_info):
    """
    Group lanes that can share a single GERALD configuration block.

    Lanes are grouped by (read length, library species, sequencing-only
    flag); returns a dictionary mapping that tuple to a list of lane
    numbers.
    """
    groups = {}
    for number, contents in flowcell_info['lane_set'].items():
        for info in contents:
            key = (info['read_length'],
                   info['library_species'],
                   is_sequencing(info))
            if key not in groups:
                groups[key] = []
            groups[key].append(number)
    return groups
99
def format_gerald_header(flowcell_info):
    """
    Generate a comment block describing the contents of the flowcell.
    """
    # The lines are joined with '\n# ', which supplies the comment marker
    # for every line but the first; the first line carries its own '#'.
    lines = ['# FLOWCELL: %s' % (flowcell_info['flowcell_id'])]
    lines.append('')
    lines.append('CONTROL-LANE: %s' % (flowcell_info['control_lane'],))
    lines.append('')
    lines.append('Flowcell Notes:')
    lines.extend(flowcell_info['notes'].split('\r\n'))
    lines.append('')
    for lane_number in LANE_LIST_JSON:
        for lane_info in flowcell_info['lane_set'][lane_number]:
            lines.append('Lane%s: %s | %s' % (lane_number,
                                              lane_info['library_id'],
                                              lane_info['library_name']))
    lines.append('')
    return "\n# ".join(lines)
122
def format_gerald_config(options, flowcell_info, genome_map):
    """
    Generate the text of a GERALD config file for a flowcell.

    :param options: combined options (post_run/runfolder are consulted)
    :param flowcell_info: dictionary from retrieve_flowcell_info
    :param genome_map: maps species names to genome directories
    """
    paired = flowcell_info['paired_end']
    # Analysis name suffixes: "_pair" on a paired end run, otherwise
    # "_extended" for eland and nothing for sequencing.
    analysis_suffix = {False: "_extended", True: "_pair"}[paired]
    sequence_suffix = {False: "", True: "_pair"}[paired]

    # Start with a helpful comment header describing the flowcell,
    # e.g. which lane holds which library.
    config = [format_gerald_header(flowcell_info)]
    config.append('SEQUENCE_FORMAT --fastq')
    config.append('ELAND_SET_SIZE 20')
    config.append('12345678:WITH_SEQUENCE true')

    for group_key, lanes in group_lane_parameters(flowcell_info).items():
        # group_key is the tuple built by group_lane_parameters.
        # (local renamed from is_sequencing to avoid shadowing the
        # module-level function of the same name)
        read_length, species, sequencing_only = group_key
        lanes.sort()
        prefix = u"".join(lanes)

        genome_path = genome_map.get(species, None)
        LOGGER.debug("Looked for genome '%s' got location '%s'" % (species, genome_path))
        if genome_path is None and not sequencing_only:
            no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
            LOGGER.warning(no_genome_msg % (lanes, species))
            sequencing_only = True

        if sequencing_only:
            config.append('%s:ANALYSIS sequence%s' % (prefix, sequence_suffix))
        else:
            config.append('%s:ANALYSIS eland%s' % (prefix, analysis_suffix))
            config.append('%s:ELAND_GENOME %s' % (prefix, genome_path))
        config.append('%s:USE_BASES Y%s' % (prefix, read_length))

    # Optionally run a script after the analysis finishes.
    if not (options.post_run is None or options.runfolder is None):
        runfolder = os.path.abspath(options.runfolder)
        post_run = options.post_run % {'runfolder': runfolder}
        config.append('POST_RUN_COMMAND %s' % (post_run,))

    config.append('')  # force trailing newline

    return "\n".join(config)
171
class DummyOptions:
    """
    Stand-in for parsed optparse options, used when command line parsing
    is disabled (the default; see DISABLE_CMDLINE).

    Mirrors the attributes constructOptionParser would produce, with
    "not specified" defaults so the config-file merge in
    getCombinedOptions can fill them in.
    """
    def __init__(self):
        self.url = None
        self.output_filepath = None
        self.flowcell = None
        self.genome_dir = None
        # The attributes below are read by getCombinedOptions and
        # saveConfigFile; previously they were missing, so a
        # DummyOptions instance raised AttributeError as soon as those
        # functions touched them.
        self.runfolder = None
        self.sample_sheet = None
        self.operator = ''
        self.recipe = 'Unknown'
        self.verbose = False
181
class PreformattedDescriptionFormatter(IndentedHelpFormatter):
    """Help formatter that passes a preformatted epilog through verbatim."""

    def format_epilog(self, epilog):
        """
        Return the epilog wrapped in blank lines instead of re-wrapping it.

        optparse's default formatter re-flows the epilog text, which would
        destroy our carefully indented example config; override it so the
        text is emitted exactly as written.
        """
        if not epilog:
            return ""
        return "\n" + epilog + "\n"
200
201
def constructOptionParser():
    """
    Build and return the OptionParser for this tool's command line.
    """
    parser = OptionParser(formatter=PreformattedDescriptionFormatter())
    parser.set_description(
        'Retrieves eland config file from hts_frontend web frontend.')

    # Preformatted epilog; PreformattedDescriptionFormatter keeps
    # optparse from re-wrapping it.
    parser.epilog = """
Config File:
  * %s (System wide)
  * %s (User specific; overrides system)
  * command line overrides all config file options

  Example Config File:

    [%s]
    config_host: http://somewhere.domain:port
    genome_dir: /path to search for genomes
    post_run: runfolder -o <destdir> %%(runfolder)s

""" % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)

    parser.add_option('-u', '--url',
                      action='store', type='string', dest='url')
    parser.add_option('-o', '--output-file',
                      action='store', type='string', dest='output_filepath',
                      help='config file destination. If runfolder is specified defaults '
                           'to <runfolder>/config-auto.txt')
    parser.add_option('-f', '--flowcell',
                      action='store', type='string', dest='flowcell')
    parser.add_option('-g', '--genome_dir',
                      action='store', type='string', dest='genome_dir')
    parser.add_option('-r', '--runfolder',
                      action='store', type='string',
                      help='specify runfolder for post_run command ')
    parser.add_option('--sample-sheet', default=None,
                      help='path to save demultiplexing sample sheet')
    parser.add_option('--operator', default='',
                      help='Name of sequencer operator')
    parser.add_option('--recipe', default='Unknown',
                      help='specify recipe name')
    parser.add_option('-v', '--verbose', action='store_true', default=False,
                      help='increase logging verbosity')
    return parser
256
def constructConfigParser():
    """
    Return a RawConfigParser pre-loaded from the system-wide and user
    config files, guaranteed to contain the gerald_config section.
    """
    config = RawConfigParser()
    # User settings are read second and so override system settings.
    config.read([CONFIG_SYSTEM, CONFIG_USER])
    if not config.has_section(GERALD_CONFIG_SECTION):
        config.add_section(GERALD_CONFIG_SECTION)
    return config
267
268
def getCombinedOptions(argv=None):
    """
    Return option values merged from the command line and config files.

    Command line values win; the config files only fill in values the
    command line left unset.

    :param argv: command line argument list, or None to skip command
                 line parsing entirely (DummyOptions defaults are used).
    """
    cl_parser = constructOptionParser()
    conf_parser = constructConfigParser()

    if argv is None:
        options = DummyOptions()
    else:
        options, args = cl_parser.parse_args(argv)

    def lookup(name):
        # Fetch a value from the gerald_config section, or None if the
        # option is absent.
        if conf_parser.has_option(GERALD_CONFIG_SECTION, name):
            return conf_parser.get(GERALD_CONFIG_SECTION, name)
        return None

    if options.url is None:
        options.url = lookup('config_host')

    options.url = normalize_url(options.url)

    if options.genome_dir is None:
        options.genome_dir = lookup('genome_dir')

    options.post_run = lookup('post_run')

    # Default the output path into the runfolder when one was given.
    if options.output_filepath is None and options.runfolder is not None:
        options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')

    return options
304
305
def saveConfigFile(options):
    """
    Retrieve the flowcell's GERALD config file from the frontend, given
    the base_host_url (i.e. http://sub.domain.edu:port), and write it to
    options.output_filepath (or stdout). Optionally also writes a
    demultiplexing sample sheet (options.sample_sheet).
    """
    LOGGER.info('USING OPTIONS:')
    LOGGER.info(u'     URL: %s' % (options.url,))
    LOGGER.info(u'     OUT: %s' % (options.output_filepath,))
    LOGGER.info(u'      FC: %s' % (options.flowcell,))
    LOGGER.info(u'post_run: %s' % ( unicode(options.post_run),))

    flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)

    LOGGER.debug('genome_dir: %s' % ( options.genome_dir, ))
    available_genomes = getAvailableGenomes(options.genome_dir)
    genome_map = constructMapperDict(available_genomes)
    LOGGER.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))

    config = format_gerald_config(options, flowcell_info, genome_map)

    if options.output_filepath is not None:
        # Was logging.info: use the module-level logger for consistency.
        LOGGER.info('Writing config file to %s' % (options.output_filepath,))
        outstream = open(options.output_filepath, 'w')
        # Close the file we opened; previously it was leaked.
        try:
            outstream.write(config)
        finally:
            outstream.close()
    else:
        sys.stdout.write(config)

    if options.sample_sheet is None:
        pass
    elif options.sample_sheet == '-':
        save_sample_sheet(sys.stdout, options, flowcell_info)
    else:
        stream = open(options.sample_sheet, 'w')
        # Same leak fix for the sample sheet stream.
        try:
            save_sample_sheet(stream, options, flowcell_info)
        finally:
            stream.close()
343
def save_sample_sheet(outstream, options, flowcell_info):
    """
    Write an Illumina demultiplexing sample sheet for a flowcell.

    :param outstream: open file-like object to receive CSV rows
    :param options: combined options (recipe and operator are used)
    :param flowcell_info: dictionary from retrieve_flowcell_info
    """
    fields = ['FCID', 'Lane', 'SampleID', 'SampleRef', 'Index',
              'Description', 'Control', 'Recipe', 'Operator',
              'SampleProject']
    # Map each Illumina column to either an htsworkflow attribute name
    # or a formatting callable.  Columns absent from this map (Index,
    # SampleProject) are filled in later by format_pooled_libraries.
    column_sources = {'FCID': 'flowcell',
                      'Lane': 'lane_number',
                      'SampleID': 'library_id',
                      'SampleRef': format_sampleref,
                      'Description': 'library_name',
                      'Control': format_control_lane,
                      'Recipe': format_recipe_name,
                      'Operator': format_operator_name}
    writer = csv.DictWriter(outstream, fields)
    # Header row: each column name mapped to itself.
    writer.writerow(dict(((name, name) for name in fields)))
    for lane_number in LANE_LIST:
        lane_rows = []
        for library in flowcell_info['lane_set'][str(lane_number)]:
            # Build the attributes shared by every row for this library.
            shared = {}
            for column in fields:
                source = column_sources.get(column, None)
                if source is None:
                    continue
                if callable(source):
                    shared[column] = source(options, flowcell_info, library)
                else:
                    shared[column] = library[source]

            # Expand into one row per multiplex index.
            lane_rows.extend(format_pooled_libraries(shared, library))

        for row in lane_rows:
            writer.writerow(row)
380
381
def format_sampleref(options, flowcell_info, sample):
    """Return the sample's species name with spaces replaced by underscores."""
    species = sample['library_species']
    return species.replace(' ', '_')
384
385
def format_control_lane(options, flowcell_info, sample):
    """Return 'Y' when the sample sits in the flowcell's control lane, else 'N'."""
    is_control = sample['lane_number'] == flowcell_info['control_lane']
    return 'Y' if is_control else 'N'
391
392
def format_recipe_name(options, flowcell_info, sample):
    """Return the recipe name supplied on the command line (or its default)."""
    return options.recipe
395
396
def format_operator_name(options, flowcell_info, sample):
    """Return the sequencer operator's name supplied on the command line."""
    return options.operator
399
400
def format_pooled_libraries(shared, library):
    """
    Expand a library's shared sample-sheet fields into one row per index.

    :param shared: dict of sample-sheet columns common to this library
      (built by save_sample_sheet); mutated in place in the error case.
    :param library: library dictionary, whose 'index_sequence' may be
      absent/None, an error string, or a dict of multiplex id -> sequence.
    :return: list of row dictionaries (possibly empty).
    :raises RuntimeError: if 'index_sequence' is some other type.
    """
    sequences = library.get('index_sequence', None)
    if sequences is None:
        # No index information at all: contribute no rows.
        return []
    elif (type(sequences) in types.StringTypes and
          sequences.lower().startswith('err')):
        # The frontend reported an error string (e.g. "Err...") instead
        # of sequences; emit a single unindexed row for the library.
        shared['Index'] = ''
        shared['SampleProject'] = library['library_id']
        return [shared]
    elif (type(sequences) == types.DictType):
        # Pooled library: one row per multiplex index, sorted numerically.
        pooled = []
        multiplex_ids = sequences.keys()
        multiplex_ids.sort(key=int)
        for multiplex_id in multiplex_ids:
            sample = {}
            sample.update(shared)
            sample['Index'] = sequences[multiplex_id]
            sample['SampleProject'] = format_project_name(library,
                                                          multiplex_id)
            pooled.append(sample)
        return pooled
    else:
        raise RuntimeError("Unrecognized index type")
424
425
426
def format_project_name(library, multiplex_id):
    """Return the SampleProject name '<library_id>_index<multiplex_id>'."""
    return "%s_index%s" % (library['library_id'], multiplex_id)
430
431