Merge branch 'django1.4'
[htsworkflow.git] / htsworkflow / pipelines / retrieve_config.py
index 0ef42a589fcdd06a90be00c4cdda4a48d10c7b02..fe2b9428cd01c38775c6a743dc2588b140425833 100644 (file)
@@ -1,10 +1,12 @@
 #!/usr/bin/env python
 
+import csv
 from ConfigParser import RawConfigParser
 import logging
 from optparse import OptionParser, IndentedHelpFormatter
 import os
 import sys
+import types
 import urllib
 import urllib2
 
@@ -14,9 +16,17 @@ except ImportError, e:
     import simplejson as json
 
 from htsworkflow.frontend.auth import apidata
+from htsworkflow.util import api
+from htsworkflow.util import alphanum
 from htsworkflow.util.url import normalize_url
-from htsworkflow.pipelines.genome_mapper import getAvailableGenomes
-from htsworkflow.pipelines.genome_mapper import constructMapperDict
+from htsworkflow.pipelines.genome_mapper import \
+     getAvailableGenomes, \
+     constructMapperDict
+from htsworkflow.pipelines import LANE_LIST
+# JSON dictionaries use strings
+LANE_LIST_JSON = [ str(l) for l in LANE_LIST ]
+
+LOGGER = logging.getLogger(__name__)
 
 __docformat__ = "restructredtext en"
 
@@ -27,38 +37,36 @@ GERALD_CONFIG_SECTION = 'gerald_config'
 #Disable or enable commandline arg parsing; disabled by default.
 DISABLE_CMDLINE = True
 
-LANE_LIST = ['1','2','3','4','5','6','7','8']
-
 class FlowCellNotFound(Exception): pass
 class WebError404(Exception): pass
 
 def retrieve_flowcell_info(base_host_url, flowcell):
     """
-    Return a dictionary describing a 
+    Return a dictionary describing a flowcell
     """
-    url = base_host_url + '/experiments/config/%s/json' % (flowcell)
-  
+    url = api.flowcell_url(base_host_url, flowcell)
+
     try:
         apipayload = urllib.urlencode(apidata)
         web = urllib2.urlopen(url, apipayload)
     except urllib2.URLError, e:
         errmsg = 'URLError: %d %s' % (e.code, e.msg)
-        logging.error(errmsg)
-        logging.error('opened %s' % (url,))
+        LOGGER.error(errmsg)
+        LOGGER.error('opened %s' % (url,))
         raise IOError(errmsg)
-    
+
     contents = web.read()
     headers = web.info()
 
     if web.code == 403:
         msg = "403 - Forbbidden, probably need api key"
         raise FlowCellNotFound(msg)
-    
+
     if web.code == 404:
         msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
               "Did you get right port #?" % (flowcell, base_host_url, url)
         raise FlowCellNotFound(msg)
-  
+
     if len(contents) == 0:
         msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
         raise FlowCellNotFound(msg)
@@ -74,7 +82,7 @@ def is_sequencing(lane_info):
         return True
     else:
         return False
-    
+
 def group_lane_parameters(flowcell_info):
     """
     group lanes that can share GERALD configuration blocks.
@@ -82,11 +90,12 @@ def group_lane_parameters(flowcell_info):
     (The same species, read length, and eland vs sequencing)
     """
     lane_groups = {}
-    for lane_number, lane_info in flowcell_info['lane_set'].items():
-        index = (lane_info['read_length'],
-                 lane_info['library_species'],
-                 is_sequencing(lane_info))
-        lane_groups.setdefault(index, []).append(lane_number)
+    for lane_number, lane_contents in flowcell_info['lane_set'].items():
+        for lane_info in lane_contents:
+            index = (lane_info['read_length'],
+                     lane_info['library_species'],
+                     is_sequencing(lane_info))
+            lane_groups.setdefault(index, []).append(lane_number)
     return lane_groups
 
 def format_gerald_header(flowcell_info):
@@ -102,12 +111,13 @@ def format_gerald_header(flowcell_info):
     config += ['Flowcell Notes:']
     config.extend(flowcell_info['notes'].split('\r\n'))
     config += ['']
-    for lane_number in LANE_LIST:
-        lane_info = flowcell_info['lane_set'][lane_number]
-        config += ['Lane%s: %s | %s' % (lane_number, lane_info['library_id'],
-                                        lane_info['library_name'])]
+    for lane_number in sorted(flowcell_info['lane_set']):
+        lane_contents = flowcell_info['lane_set'][lane_number]
+        for lane_info in lane_contents:
+            config += ['Lane%s: %s | %s' % (lane_number,
+                                            lane_info['library_id'],
+                                            lane_info['library_name'])]
 
-    config += ['SEQUENCE_FORMAT --fastq']
     config += ['']
     return "\n# ".join(config)
 
@@ -123,6 +133,9 @@ def format_gerald_config(options, flowcell_info, genome_map):
     # in the config file... things like which lane is which library.
     config = [format_gerald_header(flowcell_info)]
 
+    config += ['SEQUENCE_FORMAT --fastq']
+    config += ['ELAND_SET_SIZE 20']
+    config += ['12345678:WITH_SEQUENCE true']
     analysis_suffix = eland_analysis_suffix[flowcell_info['paired_end']]
     sequence_suffix = sequence_analysis_suffix[flowcell_info['paired_end']]
     lane_groups = group_lane_parameters(flowcell_info)
@@ -131,14 +144,14 @@ def format_gerald_config(options, flowcell_info, genome_map):
         read_length, species, is_sequencing = lane_index
         lane_numbers.sort()
         lane_prefix = u"".join(lane_numbers)
-        
+
         species_path = genome_map.get(species, None)
-        logging.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
-        if species_path is None:
+        LOGGER.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
+        if not is_sequencing and species_path is None:
             no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
-            logging.warning(no_genome_msg % (lane_numbers, species))
+            LOGGER.warning(no_genome_msg % (lane_numbers, species))
             is_sequencing = True
-            
+
         if is_sequencing:
             config += ['%s:ANALYSIS sequence%s' % (lane_prefix, sequence_suffix)]
         else:
@@ -147,16 +160,16 @@ def format_gerald_config(options, flowcell_info, genome_map):
         #config += ['%s:READ_LENGTH %s' % ( lane_prefix, read_length ) ]
         config += ['%s:USE_BASES Y%s' % ( lane_prefix, read_length ) ]
 
-    # add in option for running script after 
+    # add in option for running script after
     if not (options.post_run is None or options.runfolder is None):
         runfolder = os.path.abspath(options.runfolder)
         post_run = options.post_run  % {'runfolder': runfolder}
         config += ['POST_RUN_COMMAND %s' % (post_run,) ]
-        
+
     config += [''] # force trailing newline
-    
+
     return "\n".join(config)
-              
+
 class DummyOptions:
   """
   Used when command line parsing is disabled; default
@@ -168,14 +181,14 @@ class DummyOptions:
     self.genome_dir = None
 
 class PreformattedDescriptionFormatter(IndentedHelpFormatter):
-  
+
   #def format_description(self, description):
-  #  
+  #
   #  if description:
   #      return description + "\n"
   #  else:
   #     return ""
-      
+
   def format_epilog(self, epilog):
     """
     It was removing my preformated epilog, so this should override
@@ -194,33 +207,33 @@ def constructOptionParser():
     parser = OptionParser(formatter=PreformattedDescriptionFormatter())
 
     parser.set_description('Retrieves eland config file from hts_frontend web frontend.')
-  
+
     parser.epilog = """
 Config File:
   * %s (System wide)
   * %s (User specific; overrides system)
   * command line overrides all config file options
-  
+
   Example Config File:
-  
+
     [%s]
     config_host: http://somewhere.domain:port
     genome_dir: /path to search for genomes
     post_run: runfolder -o <destdir> %%(runfolder)s
-    
+
 """ % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)
-  
+
     #Special formatter for allowing preformatted description.
     ##parser.format_epilog(PreformattedDescriptionFormatter())
 
     parser.add_option("-u", "--url",
                       action="store", type="string", dest="url")
-  
+
     parser.add_option("-o", "--output-file",
                       action="store", type="string", dest="output_filepath",
                       help="config file destination. If runfolder is specified defaults "
                            "to <runfolder>/config-auto.txt" )
-  
+
     parser.add_option("-f", "--flowcell",
                       action="store", type="string", dest="flowcell")
 
@@ -231,10 +244,17 @@ Config File:
                       action="store", type="string",
                       help="specify runfolder for post_run command ")
 
+    parser.add_option("--sample-sheet", default=None,
+                      help="path to save demultiplexing sample sheet")
+
+    parser.add_option("--operator", default='', help="Name of sequencer operator")
+    parser.add_option("--recipe", default="Unknown",
+                      help="specify recipe name")
+
     parser.add_option('-v', '--verbose', action='store_true', default=False,
                        help='increase logging verbosity')
     return parser
-    
+
 def constructConfigParser():
     """
     returns a pre-setup config parser
@@ -243,7 +263,7 @@ def constructConfigParser():
     parser.read([CONFIG_SYSTEM, CONFIG_USER])
     if not parser.has_section(GERALD_CONFIG_SECTION):
         parser.add_section(GERALD_CONFIG_SECTION)
-  
+
     return parser
 
 
@@ -261,13 +281,13 @@ def getCombinedOptions(argv=None):
         options = DummyOptions()
     else:
         options, args = cl_parser.parse_args(argv)
-        
+
     if options.url is None:
         if conf_parser.has_option(GERALD_CONFIG_SECTION, 'config_host'):
             options.url = conf_parser.get(GERALD_CONFIG_SECTION, 'config_host')
-      
+
     options.url = normalize_url(options.url)
-  
+
     if options.genome_dir is None:
         if conf_parser.has_option(GERALD_CONFIG_SECTION, 'genome_dir'):
             options.genome_dir = conf_parser.get(GERALD_CONFIG_SECTION, 'genome_dir')
@@ -280,7 +300,7 @@ def getCombinedOptions(argv=None):
     if options.output_filepath is None:
         if options.runfolder is not None:
             options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')
-            
+
     return options
 
 
@@ -289,19 +309,19 @@ def saveConfigFile(options):
   retrieves the flowcell eland config file, give the base_host_url
   (i.e. http://sub.domain.edu:port)
   """
-  logging.info('USING OPTIONS:')
-  logging.info(u'     URL: %s' % (options.url,))
-  logging.info(u'     OUT: %s' % (options.output_filepath,))
-  logging.info(u'      FC: %s' % (options.flowcell,))
-  #logging.info(': %s' % (options.genome_dir,))
-  logging.info(u'post_run: %s' % ( unicode(options.post_run),))
-   
+  LOGGER.info('USING OPTIONS:')
+  LOGGER.info(u'     URL: %s' % (options.url,))
+  LOGGER.info(u'     OUT: %s' % (options.output_filepath,))
+  LOGGER.info(u'      FC: %s' % (options.flowcell,))
+  #LOGGER.info(': %s' % (options.genome_dir,))
+  LOGGER.info(u'post_run: %s' % ( unicode(options.post_run),))
+
   flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)
 
-  logging.debug('genome_dir: %s' % ( options.genome_dir, ))
+  LOGGER.debug('genome_dir: %s' % ( options.genome_dir, ))
   available_genomes = getAvailableGenomes(options.genome_dir)
   genome_map = constructMapperDict(available_genomes)
-  logging.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))
+  LOGGER.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))
 
   config = format_gerald_config(options, flowcell_info, genome_map)
 
@@ -310,9 +330,103 @@ def saveConfigFile(options):
       logging.info('Writing config file to %s' % (options.output_filepath,))
   else:
       outstream = sys.stdout
-      
+
   outstream.write(config)
-  
+
+  if options.sample_sheet is None:
+      pass
+  elif options.sample_sheet == '-':
+      save_sample_sheet(sys.stdout, options, flowcell_info)
+  else:
+      stream = open(options.sample_sheet,'w')
+      save_sample_sheet(stream, options, flowcell_info)
+
+
def save_sample_sheet(outstream, options, flowcell_info):
    """Write an Illumina demultiplexing sample sheet to outstream as CSV.

    One row is written per (lane, library, index sequence) combination
    found in flowcell_info['lane_set'], preceded by a header row.
    """
    # Columns the Illumina demultiplexer expects, in order.
    columns = ['FCID', 'Lane', 'SampleID', 'SampleRef', 'Index',
               'Description', 'Control', 'Recipe', 'Operator',
               'SampleProject']
    # Each column maps to either an htsworkflow attribute name or a
    # formatter callable taking (options, flowcell_info, library).
    column_source = {'FCID': 'flowcell',
                     'Lane': 'lane_number',
                     'SampleID': 'library_id',
                     'SampleRef': format_sampleref,
                     'Description': 'library_name',
                     'Control': format_control_lane,
                     'Recipe': format_recipe_name,
                     'Operator': format_operator_name}
    writer = csv.DictWriter(outstream, columns)
    # Header row: every column name mapped to itself.
    writer.writerow(dict([(name, name) for name in columns]))
    for lane_number in sorted(flowcell_info['lane_set']):
        lane_contents = flowcell_info['lane_set'][lane_number]

        lane_rows = []
        for library in lane_contents:
            # Build the attributes shared by all rows for this library.
            shared = {}
            for column in columns:
                source = column_source.get(column, None)
                if source is None:
                    continue
                if callable(source):
                    shared[column] = source(options, flowcell_info, library)
                else:
                    shared[column] = library[source]

            # Expand into one row per index sequence (pooled libraries).
            lane_rows.extend(format_pooled_libraries(shared, library))

        for row in lane_rows:
            writer.writerow(row)
+
+
def format_sampleref(options, flowcell_info, sample):
    """Return the sample's species name with spaces turned into underscores."""
    species = sample['library_species']
    return '_'.join(species.split(' '))
+
+
def format_control_lane(options, flowcell_info, sample):
    """Return 'Y' if this sample sits in the flowcell's control lane, else 'N'."""
    is_control = sample['lane_number'] == flowcell_info['control_lane']
    return 'Y' if is_control else 'N'
+
+
def format_recipe_name(options, flowcell_info, sample):
    """Return the recipe name taken from the command-line options."""
    recipe = options.recipe
    return recipe
+
+
def format_operator_name(options, flowcell_info, sample):
    """Return the sequencer operator's name taken from the command-line options."""
    operator = options.operator
    return operator
+
+
def format_pooled_libraries(shared, library):
    """Expand one library's shared row into per-index sample-sheet rows.

    Returns [] when the library carries no index information, a single
    blank-index row when the index lookup reported an error string, and
    one row per multiplex index when a dict of sequences is present.
    Raises RuntimeError for any other index_sequence type.
    """
    sequences = library.get('index_sequence', None)
    if sequences is None:
        return []
    if (type(sequences) in types.StringTypes and
            sequences.lower().startswith('err')):
        # Error marker from the index lookup: emit one non-indexed row.
        shared['Index'] = ''
        shared['SampleProject'] = library['library_id']
        return [shared]
    if type(sequences) == types.DictType:
        rows = []
        ordered_ids = sequences.keys()
        # Natural-order sort so index 2 precedes index 10.
        ordered_ids.sort(cmp=alphanum.alphanum)
        for multiplex_id in ordered_ids:
            row = dict(shared)
            row['Index'] = sequences[multiplex_id]
            row['SampleProject'] = format_project_name(library, multiplex_id)
            rows.append(row)
        return rows
    raise RuntimeError("Unrecognized index type")
+
+
+
def format_project_name(library, multiplex_id):
    """Return the SampleProject value, '<library_id>_index<multiplex_id>'."""
    return "%s_index%s" % (library['library_id'], multiplex_id)
 
 
-