Merge branch 'django1.4'
[htsworkflow.git] / htsworkflow / pipelines / retrieve_config.py
index 0ef42a589fcdd06a90be00c4cdda4a48d10c7b02..fe2b9428cd01c38775c6a743dc2588b140425833 100644 (file)
@@ -1,10 +1,12 @@
 #!/usr/bin/env python
 
+import csv
 from ConfigParser import RawConfigParser
 import logging
 from optparse import OptionParser, IndentedHelpFormatter
 import os
 import sys
+import types
 import urllib
 import urllib2
 
@@ -14,9 +16,17 @@ except ImportError, e:
     import simplejson as json
 
 from htsworkflow.frontend.auth import apidata
+from htsworkflow.util import api
+from htsworkflow.util import alphanum
 from htsworkflow.util.url import normalize_url
-from htsworkflow.pipelines.genome_mapper import getAvailableGenomes
-from htsworkflow.pipelines.genome_mapper import constructMapperDict
+from htsworkflow.pipelines.genome_mapper import \
+     getAvailableGenomes, \
+     constructMapperDict
+from htsworkflow.pipelines import LANE_LIST
+# JSON dictionaries use strings
+LANE_LIST_JSON = [ str(l) for l in LANE_LIST ]
+
+LOGGER = logging.getLogger(__name__)
 
 __docformat__ = "restructredtext en"
 
@@ -27,38 +37,36 @@ GERALD_CONFIG_SECTION = 'gerald_config'
 #Disable or enable commandline arg parsing; disabled by default.
 DISABLE_CMDLINE = True
 
-LANE_LIST = ['1','2','3','4','5','6','7','8']
-
 class FlowCellNotFound(Exception): pass
 class WebError404(Exception): pass
 
 def retrieve_flowcell_info(base_host_url, flowcell):
     """
-    Return a dictionary describing a 
+    Return a dictionary describing a flowcell
     """
-    url = base_host_url + '/experiments/config/%s/json' % (flowcell)
-  
+    url = api.flowcell_url(base_host_url, flowcell)
+
     try:
         apipayload = urllib.urlencode(apidata)
         web = urllib2.urlopen(url, apipayload)
     except urllib2.URLError, e:
         errmsg = 'URLError: %d %s' % (e.code, e.msg)
-        logging.error(errmsg)
-        logging.error('opened %s' % (url,))
+        LOGGER.error(errmsg)
+        LOGGER.error('opened %s' % (url,))
         raise IOError(errmsg)
-    
+
     contents = web.read()
     headers = web.info()
 
     if web.code == 403:
         msg = "403 - Forbbidden, probably need api key"
         raise FlowCellNotFound(msg)
-    
+
     if web.code == 404:
         msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
               "Did you get right port #?" % (flowcell, base_host_url, url)
         raise FlowCellNotFound(msg)
-  
+
     if len(contents) == 0:
         msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
         raise FlowCellNotFound(msg)
@@ -74,7 +82,7 @@ def is_sequencing(lane_info):
         return True
     else:
         return False
-    
+
 def group_lane_parameters(flowcell_info):
     """
     group lanes that can share GERALD configuration blocks.
@@ -82,11 +90,12 @@ def group_lane_parameters(flowcell_info):
     (The same species, read length, and eland vs sequencing)
     """
     lane_groups = {}
-    for lane_number, lane_info in flowcell_info['lane_set'].items():
-        index = (lane_info['read_length'],
-                 lane_info['library_species'],
-                 is_sequencing(lane_info))
-        lane_groups.setdefault(index, []).append(lane_number)
+    for lane_number, lane_contents in flowcell_info['lane_set'].items():
+        for lane_info in lane_contents:
+            index = (lane_info['read_length'],
+                     lane_info['library_species'],
+                     is_sequencing(lane_info))
+            lane_groups.setdefault(index, []).append(lane_number)
     return lane_groups
 
 def format_gerald_header(flowcell_info):
@@ -102,12 +111,13 @@ def format_gerald_header(flowcell_info):
     config += ['Flowcell Notes:']
     config.extend(flowcell_info['notes'].split('\r\n'))
     config += ['']
-    for lane_number in LANE_LIST:
-        lane_info = flowcell_info['lane_set'][lane_number]
-        config += ['Lane%s: %s | %s' % (lane_number, lane_info['library_id'],
-                                        lane_info['library_name'])]
+    for lane_number in sorted(flowcell_info['lane_set']):
+        lane_contents = flowcell_info['lane_set'][lane_number]
+        for lane_info in lane_contents:
+            config += ['Lane%s: %s | %s' % (lane_number,
+                                            lane_info['library_id'],
+                                            lane_info['library_name'])]
 
-    config += ['SEQUENCE_FORMAT --fastq']
     config += ['']
     return "\n# ".join(config)
 
@@ -123,6 +133,9 @@ def format_gerald_config(options, flowcell_info, genome_map):
     # in the config file... things like which lane is which library.
     config = [format_gerald_header(flowcell_info)]
 
+    config += ['SEQUENCE_FORMAT --fastq']
+    config += ['ELAND_SET_SIZE 20']
+    config += ['12345678:WITH_SEQUENCE true']
     analysis_suffix = eland_analysis_suffix[flowcell_info['paired_end']]
     sequence_suffix = sequence_analysis_suffix[flowcell_info['paired_end']]
     lane_groups = group_lane_parameters(flowcell_info)
@@ -131,14 +144,14 @@ def format_gerald_config(options, flowcell_info, genome_map):
         read_length, species, is_sequencing = lane_index
         lane_numbers.sort()
         lane_prefix = u"".join(lane_numbers)
-        
+
         species_path = genome_map.get(species, None)
-        logging.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
-        if species_path is None:
+        LOGGER.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
+        if not is_sequencing and species_path is None:
             no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
-            logging.warning(no_genome_msg % (lane_numbers, species))
+            LOGGER.warning(no_genome_msg % (lane_numbers, species))
             is_sequencing = True
-            
+
         if is_sequencing:
             config += ['%s:ANALYSIS sequence%s' % (lane_prefix, sequence_suffix)]
         else:
@@ -147,16 +160,16 @@ def format_gerald_config(options, flowcell_info, genome_map):
         #config += ['%s:READ_LENGTH %s' % ( lane_prefix, read_length ) ]
         config += ['%s:USE_BASES Y%s' % ( lane_prefix, read_length ) ]
 
-    # add in option for running script after 
+    # add in option for running script after
     if not (options.post_run is None or options.runfolder is None):
         runfolder = os.path.abspath(options.runfolder)
         post_run = options.post_run  % {'runfolder': runfolder}
         config += ['POST_RUN_COMMAND %s' % (post_run,) ]
-        
+
     config += [''] # force trailing newline
-    
+
     return "\n".join(config)
-              
+
 class DummyOptions:
   """
   Used when command line parsing is disabled; default
@@ -168,14 +181,14 @@ class DummyOptions:
     self.genome_dir = None
 
 class PreformattedDescriptionFormatter(IndentedHelpFormatter):
-  
+
   #def format_description(self, description):
-  #  
+  #
   #  if description:
   #      return description + "\n"
   #  else:
   #     return ""
-      
+
   def format_epilog(self, epilog):
     """
     It was removing my preformated epilog, so this should override
@@ -194,33 +207,33 @@ def constructOptionParser():
     parser = OptionParser(formatter=PreformattedDescriptionFormatter())
 
     parser.set_description('Retrieves eland config file from hts_frontend web frontend.')
-  
+
     parser.epilog = """
 Config File:
   * %s (System wide)
   * %s (User specific; overrides system)
   * command line overrides all config file options
-  
+
   Example Config File:
-  
+
     [%s]
     config_host: http://somewhere.domain:port
     genome_dir: /path to search for genomes
     post_run: runfolder -o <destdir> %%(runfolder)s
-    
+
 """ % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)
-  
+
     #Special formatter for allowing preformatted description.
     ##parser.format_epilog(PreformattedDescriptionFormatter())
 
     parser.add_option("-u", "--url",
                       action="store", type="string", dest="url")
-  
+
     parser.add_option("-o", "--output-file",
                       action="store", type="string", dest="output_filepath",
                       help="config file destination. If runfolder is specified defaults "
                            "to <runfolder>/config-auto.txt" )
-  
+
     parser.add_option("-f", "--flowcell",
                       action="store", type="string", dest="flowcell")
 
@@ -231,10 +244,17 @@ Config File:
                       action="store", type="string",
                       help="specify runfolder for post_run command ")
 
+    parser.add_option("--sample-sheet", default=None,
+                      help="path to save demultiplexing sample sheet")
+
+    parser.add_option("--operator", default='', help="Name of sequencer operator")
+    parser.add_option("--recipe", default="Unknown",
+                      help="specify recipe name")
+
     parser.add_option('-v', '--verbose', action='store_true', default=False,
                        help='increase logging verbosity')
     return parser
-    
+
 def constructConfigParser():
     """
     returns a pre-setup config parser
@@ -243,7 +263,7 @@ def constructConfigParser():
     parser.read([CONFIG_SYSTEM, CONFIG_USER])
     if not parser.has_section(GERALD_CONFIG_SECTION):
         parser.add_section(GERALD_CONFIG_SECTION)
-  
+
     return parser
 
 
@@ -261,13 +281,13 @@ def getCombinedOptions(argv=None):
         options = DummyOptions()
     else:
         options, args = cl_parser.parse_args(argv)
-        
+
     if options.url is None:
         if conf_parser.has_option(GERALD_CONFIG_SECTION, 'config_host'):
             options.url = conf_parser.get(GERALD_CONFIG_SECTION, 'config_host')
-      
+
     options.url = normalize_url(options.url)
-  
+
     if options.genome_dir is None:
         if conf_parser.has_option(GERALD_CONFIG_SECTION, 'genome_dir'):
             options.genome_dir = conf_parser.get(GERALD_CONFIG_SECTION, 'genome_dir')
@@ -280,7 +300,7 @@ def getCombinedOptions(argv=None):
     if options.output_filepath is None:
         if options.runfolder is not None:
             options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')
-            
+
     return options
 
 
@@ -289,19 +309,19 @@ def saveConfigFile(options):
   retrieves the flowcell eland config file, give the base_host_url
   (i.e. http://sub.domain.edu:port)
   """
-  logging.info('USING OPTIONS:')
-  logging.info(u'     URL: %s' % (options.url,))
-  logging.info(u'     OUT: %s' % (options.output_filepath,))
-  logging.info(u'      FC: %s' % (options.flowcell,))
-  #logging.info(': %s' % (options.genome_dir,))
-  logging.info(u'post_run: %s' % ( unicode(options.post_run),))
-   
+  LOGGER.info('USING OPTIONS:')
+  LOGGER.info(u'     URL: %s' % (options.url,))
+  LOGGER.info(u'     OUT: %s' % (options.output_filepath,))
+  LOGGER.info(u'      FC: %s' % (options.flowcell,))
+  #LOGGER.info(': %s' % (options.genome_dir,))
+  LOGGER.info(u'post_run: %s' % ( unicode(options.post_run),))
+
   flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)
 
-  logging.debug('genome_dir: %s' % ( options.genome_dir, ))
+  LOGGER.debug('genome_dir: %s' % ( options.genome_dir, ))
   available_genomes = getAvailableGenomes(options.genome_dir)
   genome_map = constructMapperDict(available_genomes)
-  logging.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))
+  LOGGER.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))
 
   config = format_gerald_config(options, flowcell_info, genome_map)
 
@@ -310,9 +330,103 @@ def saveConfigFile(options):
       logging.info('Writing config file to %s' % (options.output_filepath,))
   else:
       outstream = sys.stdout
-      
+
   outstream.write(config)
-  
+
+  if options.sample_sheet is None:
+      pass
+  elif options.sample_sheet == '-':
+      save_sample_sheet(sys.stdout, options, flowcell_info)
+  else:
+      stream = open(options.sample_sheet,'w')
+      save_sample_sheet(stream, options, flowcell_info)
+
+
def save_sample_sheet(outstream, options, flowcell_info):
    """Write an Illumina demultiplexing sample sheet to outstream as CSV.

    One row is written per (lane, library, index sequence) combination
    found in flowcell_info['lane_set'], preceded by a header row.
    """
    # Columns the Illumina demultiplexer expects, in order.
    columns = ['FCID', 'Lane', 'SampleID', 'SampleRef', 'Index',
               'Description', 'Control', 'Recipe', 'Operator',
               'SampleProject']
    # Each column maps to either an htsworkflow attribute name or a
    # formatter callable taking (options, flowcell_info, library).
    column_source = {'FCID': 'flowcell',
                     'Lane': 'lane_number',
                     'SampleID': 'library_id',
                     'SampleRef': format_sampleref,
                     'Description': 'library_name',
                     'Control': format_control_lane,
                     'Recipe': format_recipe_name,
                     'Operator': format_operator_name}
    writer = csv.DictWriter(outstream, columns)
    # Header row: every column name mapped to itself.
    writer.writerow(dict([(name, name) for name in columns]))
    for lane_number in sorted(flowcell_info['lane_set']):
        lane_contents = flowcell_info['lane_set'][lane_number]

        lane_rows = []
        for library in lane_contents:
            # Build the attributes shared by all rows for this library.
            shared = {}
            for column in columns:
                source = column_source.get(column, None)
                if source is None:
                    continue
                if callable(source):
                    shared[column] = source(options, flowcell_info, library)
                else:
                    shared[column] = library[source]

            # Expand into one row per index sequence (pooled libraries).
            lane_rows.extend(format_pooled_libraries(shared, library))

        for row in lane_rows:
            writer.writerow(row)
+
+
def format_sampleref(options, flowcell_info, sample):
    """Return the sample's species name with spaces turned into underscores."""
    species = sample['library_species']
    return '_'.join(species.split(' '))
+
+
def format_control_lane(options, flowcell_info, sample):
    """Return 'Y' if this sample sits in the flowcell's control lane, else 'N'."""
    is_control = sample['lane_number'] == flowcell_info['control_lane']
    return 'Y' if is_control else 'N'
+
+
def format_recipe_name(options, flowcell_info, sample):
    """Return the recipe name taken from the command-line options."""
    recipe = options.recipe
    return recipe
+
+
def format_operator_name(options, flowcell_info, sample):
    """Return the sequencer operator's name taken from the command-line options."""
    operator = options.operator
    return operator
+
+
def format_pooled_libraries(shared, library):
    """Expand one library's shared row into per-index sample-sheet rows.

    Returns [] when the library carries no index information, a single
    blank-index row when the index lookup reported an error string, and
    one row per multiplex index when a dict of sequences is present.
    Raises RuntimeError for any other index_sequence type.
    """
    sequences = library.get('index_sequence', None)
    if sequences is None:
        return []
    if (type(sequences) in types.StringTypes and
            sequences.lower().startswith('err')):
        # Error marker from the index lookup: emit one non-indexed row.
        shared['Index'] = ''
        shared['SampleProject'] = library['library_id']
        return [shared]
    if type(sequences) == types.DictType:
        rows = []
        ordered_ids = sequences.keys()
        # Natural-order sort so index 2 precedes index 10.
        ordered_ids.sort(cmp=alphanum.alphanum)
        for multiplex_id in ordered_ids:
            row = dict(shared)
            row['Index'] = sequences[multiplex_id]
            row['SampleProject'] = format_project_name(library, multiplex_id)
            rows.append(row)
        return rows
    raise RuntimeError("Unrecognized index type")
+
+
+
def format_project_name(library, multiplex_id):
    """Return the SampleProject value, '<library_id>_index<multiplex_id>'."""
    return "%s_index%s" % (library['library_id'], multiplex_id)
 
 
-