Implement a client side config file generator.
author Diane Trout <diane@caltech.edu>
Sat, 26 Sep 2009 00:02:17 +0000 (00:02 +0000)
committer Diane Trout <diane@caltech.edu>
Sat, 26 Sep 2009 00:02:17 +0000 (00:02 +0000)
This downloads the flowcell information JSON block and then
creates a GERALD config file from it.
This version also looks for a "post_run" entry in the
htsworkflow.ini config file naming a script that should be
inserted into the generated config file, to be run when make ends.

htsworkflow/frontend/experiments/experiments.py
htsworkflow/pipelines/genome_mapper.py
htsworkflow/pipelines/retrieve_config.py
htsworkflow/pipelines/test/test_retrive_config.py [new file with mode: 0644]
htsworkflow/util/url.py [new file with mode: 0644]
scripts/retrieve_config

index 331bdde59a2e57a42fd66d633eac580c40e1660b..e9a7ab760100b6846e48f1517484ef2c5e762b48 100755 (executable)
@@ -32,12 +32,15 @@ def flowcell_information(flowcell_id):
         lane_set[lane.lane_number] = {
             'cluster_estimate': lane.cluster_estimate,
             'comment': lane.comment,
+            'experiment_type': lane.library.experiment_type.name,
+            'experiment_type_id': lane.library.experiment_type_id,
             'flowcell': lane.flowcell.flowcell_id,
             'lane_number': int(lane.lane_number),
             'library_name': lane.library.library_name,
             'library_id': lane.library.library_id,
             'library_species': lane.library.library_species.scientific_name,
             'pM': float(lane.pM),
+            'read_length': fc.read_length
         }
     info = {
         'advanced_run': fc.advanced_run,
@@ -152,7 +155,6 @@ def generateConfile(request,fcid):
     try:                                                                                                                                              
       fc = FlowCell.objects.get(flowcell_id=fcid)
       for lane in fc.lane_set.all():
-          print dir(lane.library.library_species)
           config += [ str(lane.lane_number) +":" + \
                       genome_dir + lane.library.library_species.scientific_name ]
           config += [ str(lane.lane_number) +":" + \
@@ -174,9 +176,6 @@ def getConfile(req):
     cnfgfile = 'Nothing found'
     runfolder = 'unknown'
     request = req.REQUEST
-    print request, dir(request)
-    print request['fcid'], request.has_key('fcid')
-    print request['runf']
     if request.has_key('fcid'):
       fcid = request['fcid']
       if request.has_key('runf'):
index d29e44640b233c6cdc201e24c26ae5c781ae2891..5ae788e07089fde076d8afe8f56995f42978f7e5 100644 (file)
@@ -74,35 +74,37 @@ class constructMapperDict(object):
     """
     def __init__(self, genome_dict):
         self.genome_dict = genome_dict
-        
+
     def __getitem__(self, key):
         """
         Return the best match for key
         """
         elements = re.split("\|", key)
         
-        try:  
-            if len(elements) == 1:
-                # we just the species name
-                # get the set of builds
-                builds = self.genome_dict[elements[0]]
+        if len(elements) == 1:
+          # we have just the species name
+          # get the set of builds
+          builds = self.genome_dict[elements[0]]
             
-                # sort build names the way humans would
-                keys = builds.keys()
-                keys.sort(cmp=alphanum)
-            
-                # return the path from the 'last' build name
-                return builds[keys[-1]]
+          # sort build names the way humans would
+          keys = builds.keys()
+          keys.sort(cmp=alphanum)
+          
+          # return the path from the 'last' build name
+          return builds[keys[-1]]
                         
-            elif len(elements) == 2:
-                # we have species, and build name
-                return self.genome_dict[elements[0]][elements[1]]
-            else:
-                raise KeyError("Unrecognized key")
-        except KeyError, e:
-            logging.error('Unrecognized genome identifier: %s' % str((elements),))
-            return "NoGenomeAvailable"
-        
+        elif len(elements) == 2:
+          # we have species, and build name
+          return self.genome_dict[elements[0]][elements[1]]
+        else:
+          raise KeyError("Unrecognized key")
+
+    def get(self, key, default=None):
+      try:
+        return self[key]
+      except KeyError, e:
+        return default
+      
     def keys(self):
         keys = []
         for species in self.genome_dict.keys():
index e9745a377fe2bed0c9a14789e630409b799e95c3..9d77c3579a36351a5e58f4b30604860c3e075ce0 100644 (file)
 #!/usr/bin/env python
 
-from optparse import OptionParser, IndentedHelpFormatter
-from ConfigParser import SafeConfigParser
-
+from ConfigParser import RawConfigParser
 import logging
+from optparse import OptionParser, IndentedHelpFormatter
 import os
 import sys
+import urllib
 import urllib2
 
+try:
+    import json
+except ImportError, e:
+    import simplejson as json
+
+from htsworkflow.frontend.auth import apidata
+from htsworkflow.util.url import normalize_url
+from htsworkflow.pipelines.genome_mapper import getAvailableGenomes
+from htsworkflow.pipelines.genome_mapper import constructMapperDict
+
 __docformat__ = "restructredtext en"
 
-CONFIG_SYSTEM = '/etc/hts_frontend/hts_frontend.conf'
-CONFIG_USER = os.path.expanduser('~/.hts_frontend.conf')
+CONFIG_SYSTEM = '/etc/htsworkflow.ini'
+CONFIG_USER = os.path.expanduser('~/.htsworkflow.ini')
+GERALD_CONFIG_SECTION = 'gerald_config'
 
 #Disable or enable commandline arg parsing; disabled by default.
 DISABLE_CMDLINE = True
 
+LANE_LIST = ['1','2','3','4','5','6','7','8']
+
 class FlowCellNotFound(Exception): pass
 class WebError404(Exception): pass
 
+def retrieve_flowcell_info(base_host_url, flowcell):
+    """
+    Return a dictionary describing a flowcell
+    """
+    url = base_host_url + '/experiments/config/%s/json' % (flowcell)
+  
+    try:
+        apipayload = urllib.urlencode(apidata)
+        web = urllib2.urlopen(url, apipayload)
+    except urllib2.URLError, e:
+        errmsg = 'URLError: %d %s' % (e.code, e.msg)
+        logging.error(errmsg)
+        logging.error('opened %s' % (url,))
+        raise IOError(errmsg)
+    
+    contents = web.read()
+    headers = web.info()
+    
+    if web.getcode() == 403:
+        msg = "403 - Forbbidden, probably need api key"
+        raise FlowCellNotFound(msg)
+    
+    if web.getcode() == 404:
+        msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
+              "Did you get right port #?" % (flowcell, base_host_url, url)
+        raise FlowCellNotFound(msg)
+  
+    if len(contents) == 0:
+        msg = "No information for flowcell (%s) returned; full url(%s)" % (flowcell, url)
+        raise FlowCellNotFound(msg)
+
+    data = json.loads(contents)
+    return data
+
+def is_sequencing(lane_info):
+    """
+    Determine if we are just sequencing and not doing any follow-up analysis
+    """
+    if lane_info['experiment_type'] in ('De Novo','Whole Genome'):
+        return True
+    else:
+        return False
+    
+def group_lane_parameters(flowcell_info):
+    """
+    group lanes that can share GERALD configuration blocks.
+
+    (The same species, read length, and eland vs sequencing)
+    """
+    lane_groups = {}
+    for lane_number, lane_info in flowcell_info['lane_set'].items():
+        index = (lane_info['read_length'],
+                 lane_info['library_species'],
+                 is_sequencing(lane_info))
+        lane_groups.setdefault(index, []).append(lane_number)
+    return lane_groups
+
+def format_gerald_header(flowcell_info):
+    """
+    Generate comment describing the contents of the flowcell
+    """
+    # I'm using '\n# ' to join the lines together, that doesn't include the
+    # first element so i needed to put the # in manually
+    config = ['# FLOWCELL: %s' % (flowcell_info['flowcell_id'])]
+    config += ['']
+    config += ['CONTROL-LANE: %s' % (flowcell_info['control_lane'],)]
+    config += ['']
+    config += ['Flowcell Notes:']
+    config.extend(flowcell_info['notes'].split('\r\n'))
+    config += ['']
+    for lane_number in LANE_LIST:
+        lane_info = flowcell_info['lane_set'][lane_number]
+        config += ['Lane%s: %s | %s' % (lane_number, lane_info['library_id'],
+                                        lane_info['library_name'])]
+    config += ['']
+    return "\n# ".join(config)
+
+def format_gerald_config(options, flowcell_info, genome_map):
+    """
+    Generate a GERALD config file
+    """
+    # so we can add nothing or _pair if we're a paired end run
+    run_type_suffix = { False: "", True: "_pair" }
+
+    # it's convenient to have helpful information describing the flowcell
+    # in the config file... things like which lane is which library.
+    config = [format_gerald_header(flowcell_info)]
+
+    analysis_suffix = run_type_suffix[flowcell_info['paired_end']]
+    lane_groups = group_lane_parameters(flowcell_info)
+    for lane_index, lane_numbers in lane_groups.items():
+        # lane_index is return value of group_lane_parameters
+        read_length, species, is_sequencing = lane_index
+        lane_numbers.sort()
+        lane_prefix = u"".join(lane_numbers)
+        
+        if not is_sequencing:
+            config += ['%s:ANALYSIS eland%s' % (lane_prefix, analysis_suffix)]
+        else:
+            config += ['%s:ANALYSIS sequence%s' % (lane_prefix, analysis_suffix)]
+        #config += ['%s:READ_LENGTH %s' % ( lane_prefix, read_length ) ]
+        config += ['%s:USE_BASES Y%s' % ( lane_prefix, read_length ) ]
+        species_path = genome_map.get(species, "Unknown")
+        config += ['%s:ELAND_GENOME %s' % (lane_prefix, species_path) ]
+
+    # add in option for running a script after make ends
+    if options.post_run is not None:
+        post_run = options.post_run  % {'runfolder': options.runfolder}
+        config += ['POST_RUN_COMMAND %s' % (post_run,) ]
+        
+    config += [''] # force trailing newline
+    
+    return "\n".join(config)
+              
 class DummyOptions:
   """
   Used when command line parsing is disabled; default
@@ -50,19 +177,14 @@ class PreformattedDescriptionFormatter(IndentedHelpFormatter):
 
 
 def constructOptionParser():
-  """
-  returns a pre-setup optparser
-  """
-  global DISABLE_CMDLINE
-  
-  if DISABLE_CMDLINE:
-    return None
-  
-  parser = OptionParser(formatter=PreformattedDescriptionFormatter())
+    """
+    returns a pre-setup optparser
+    """
+    parser = OptionParser(formatter=PreformattedDescriptionFormatter())
 
-  parser.set_description('Retrieves eland config file from hts_frontend web frontend.')
+    parser.set_description('Retrieves eland config file from hts_frontend web frontend.')
   
-  parser.epilog = """
+    parser.epilog = """
 Config File:
   * %s (System wide)
   * %s (User specific; overrides system)
@@ -70,116 +192,111 @@ Config File:
   
   Example Config File:
   
-    [config_file_server]
-    base_host_url=http://somewhere.domain:port
-""" % (CONFIG_SYSTEM, CONFIG_USER)
+    [%s]
+    config_host=http://somewhere.domain:port
+    genome_dir=/path to search for genomes
+    
+""" % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)
   
-  #Special formatter for allowing preformatted description.
-  ##parser.format_epilog(PreformattedDescriptionFormatter())
+    #Special formatter for allowing preformatted description.
+    ##parser.format_epilog(PreformattedDescriptionFormatter())
 
-  parser.add_option("-u", "--url",
-                    action="store", type="string", dest="url")
+    parser.add_option("-u", "--url",
+                      action="store", type="string", dest="url")
   
-  parser.add_option("-o", "--output",
-                    action="store", type="string", dest="output_filepath")
+    parser.add_option("-o", "--output-file",
+                      action="store", type="string", dest="output_filepath",
+                      help="config file destination. If runfolder is specified defaults "
+                           "to <runfolder>/config-auto.txt" )
   
-  parser.add_option("-f", "--flowcell",
-                    action="store", type="string", dest="flowcell")
+    parser.add_option("-f", "--flowcell",
+                      action="store", type="string", dest="flowcell")
 
-  parser.add_option("-g", "--genome_dir",
-                    action="store", type="string", dest="genome_dir")
-  
-  #parser.set_default("url", "default")
-  
-  return parser
+    parser.add_option("-g", "--genome_dir",
+                      action="store", type="string", dest="genome_dir")
 
+    parser.add_option("-r", "--runfolder",
+                      action="store", type="string",
+                      help="specify runfolder for post_run command ")
+    
+    return parser
+    
 def constructConfigParser():
-  """
-  returns a pre-setup config parser
-  """
-  parser = SafeConfigParser()
-  parser.read([CONFIG_SYSTEM, CONFIG_USER])
-  if not parser.has_section('config_file_server'):
-    parser.add_section('config_file_server')
-  if not parser.has_section('local_setup'):
-    parser.add_section('local_setup')
+    """
+    returns a pre-setup config parser
+    """
+    parser = RawConfigParser()
+    parser.read([CONFIG_SYSTEM, CONFIG_USER])
+    if not parser.has_section(GERALD_CONFIG_SECTION):
+        parser.add_section(GERALD_CONFIG_SECTION)
   
-  return parser
+    return parser
 
 
-def getCombinedOptions():
-  """
-  Returns optparse options after it has be updated with ConfigParser
-  config files and merged with parsed commandline options.
-  """
-  cl_parser = constructOptionParser()
-  conf_parser = constructConfigParser()
-  
-  if cl_parser is None:
-    options = DummyOptions()
-  else:
-    options, args = cl_parser.parse_args()
-  
-  if options.url is None:
-    if conf_parser.has_option('config_file_server', 'base_host_url'):
-      options.url = conf_parser.get('config_file_server', 'base_host_url')
+def getCombinedOptions(argv=None):
+    """
+    Returns optparse options after it has been updated with ConfigParser
+    config files and merged with parsed commandline options.
 
-  if options.genome_dir is None:
-    if conf_parser.has_option('local_setup', 'genome_dir'):
-      options.genome_dir = conf_parser.get('local_setup', 'genome_dir')
-  
-  logging.info('USING OPTIONS:')
-  logging.info(' URL: %s' % (options.url,))
-  logging.info(' OUT: %s' % (options.output_filepath,))
-  logging.info('  FC: %s' % (options.flowcell,))
-  logging.info('GDIR: %s' % (options.genome_dir,))
+    expects command line arguments to be passed in
+    """
+    cl_parser = constructOptionParser()
+    conf_parser = constructConfigParser()
+
+    if argv is None:
+        options = DummyOptions()
+    else:
+        options, args = cl_parser.parse_args(argv)
+        
+    if options.url is None:
+        if conf_parser.has_option(GERALD_CONFIG_SECTION, 'config_host'):
+            options.url = conf_parser.get(GERALD_CONFIG_SECTION, 'config_host')
+      
+    options.url = normalize_url(options.url)
   
-  return options
+    if options.genome_dir is None:
+        if conf_parser.has_option(GERALD_CONFIG_SECTION, 'genome_dir'):
+            options.genome_dir = conf_parser.get(GERALD_CONFIG_SECTION, 'genome_dir')
+
+    if conf_parser.has_option(GERALD_CONFIG_SECTION, 'post_run'):
+        options.post_run = conf_parser.get(GERALD_CONFIG_SECTION, 'post_run')
+    else:
+        options.post_run = None
+
+    if options.output_filepath is None:
+        if options.runfolder is not None:
+            options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')
+            
+    logging.info('USING OPTIONS:')
+    logging.info(u'     URL: %s' % (options.url,))
+    logging.info(u'     OUT: %s' % (options.output_filepath,))
+    logging.info(u'      FC: %s' % (options.flowcell,))
+    #logging.info(': %s' % (options.genome_dir,))
+    logging.info(u'post_run: %s' % ( unicode(options.post_run),))
+    
+    return options
 
 
-def saveConfigFile(flowcell, base_host_url, output_filepath):
+def saveConfigFile(options):
   """
   retrieves the flowcell eland config file, give the base_host_url
   (i.e. http://sub.domain.edu:port)
   """
-  url = base_host_url + '/eland_config/%s/' % (flowcell)
-  
-  f = open(output_filepath, 'w')
-  #try:
-  try:
-    web = urllib2.urlopen(url)
-  except urllib2.URLError, e:
-    errmsg = 'URLError: %d %s' % (e.code, e.msg)
-    logging.error(errmsg)
-    logging.error('opened %s' % (url,))
-    raise IOError(errmsg)
-
-  #except IOError, msg:
-  #  if str(msg).find("Connection refused") >= 0:
-  #    print 'Error: Connection refused for: %s' % (url)
-  #    f.close()
-  #    sys.exit(1)
-  #  elif str(msg).find("Name or service not known") >= 0:
-  #    print 'Error: Invalid domain or ip address for: %s' % (url)
-  #    f.close()
-  #    sys.exit(2)
-  #  else:
-  #    raise IOError, msg
+  flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)
 
-  data = web.read()
+  available_genomes = getAvailableGenomes(options.genome_dir)
+  genome_map = constructMapperDict(available_genomes)
 
-  if data.find('Hmm, config file for') >= 0:
-    msg = "Flowcell (%s) not found in DB; full url(%s)" % (flowcell, url)
-    raise FlowCellNotFound, msg
+  config = format_gerald_config(options, flowcell_info, genome_map)
 
-  if data.find('404 - Not Found') >= 0:
-    msg = "404 - Not Found: Flowcell (%s); base_host_url (%s);\n full url(%s)\n " \
-          "Did you get right port #?" % (flowcell, base_host_url, url)
-    raise FlowCellNotFound, msg
+  if options.output_filepath is not None:
+      outstream = open(options.output_filepath, 'w')
+      logging.info('Writing config file to %s' % (options.output_filepath,))
+  else:
+      outstream = sys.stdout
+      
+  outstream.write(config)
   
-  f.write(data)
-  web.close()
-  f.close()
-  logging.info('Wrote config file to %s' % (output_filepath,))
+
 
   
diff --git a/htsworkflow/pipelines/test/test_retrive_config.py b/htsworkflow/pipelines/test/test_retrive_config.py
new file mode 100644 (file)
index 0000000..dd8f30e
--- /dev/null
@@ -0,0 +1,43 @@
+import os
+import re
+
+try:
+    import json
+except ImportError, e:
+    import simplejson as json
+    
+from django.test import TestCase
+
+from htsworkflow.frontend.auth import apidata
+from htsworkflow.pipelines.retrieve_config import format_gerald_config, getCombinedOptions
+
+class RetrieveTestCases(TestCase):
+    fixtures = ['test_flowcells.json']
+
+    def setUp(self):
+        pass
+
+    def test_format_gerald(self):
+        flowcell_request = self.client.get('/experiments/config/303TUAAXX/json', apidata)
+        self.failUnlessEqual(flowcell_request.status_code, 200)
+
+        print dir(flowcell_request)
+        flowcell_info = json.loads(flowcell_request.content)
+
+        options = getCombinedOptions(['-f','303TUAAXX','-g',os.getcwd()])        
+        genome_map = {u'Homo sapiens': '/tmp/hg18' }
+        
+        config = format_gerald_config(options, flowcell_info, genome_map)
+        config_lines = config.split('\n')
+        lane3 = [ line for line in config_lines if re.search('Lane3', line) ]
+        self.failUnlessEqual(len(lane3), 1)
+        self.failUnlessEqual(lane3[0], '# Lane3: SL039 | Paired ends 99 GM12892')
+        human = [ line for line in config_lines if re.search('hg18', line) ]
+        self.failUnlessEqual(len(human), 1)
+        self.failUnlessEqual(human[0], '345678:ELAND_GENOME /tmp/hg18')
+        unknown = [ line for line in config_lines if re.search('Unknown', line) ]
+        self.failUnlessEqual(len(unknown), 2)
+                  
+
+        
+    
diff --git a/htsworkflow/util/url.py b/htsworkflow/util/url.py
new file mode 100644 (file)
index 0000000..1b62e58
--- /dev/null
@@ -0,0 +1,22 @@
+"""
+Utilities to help handle urls
+"""
+
+def normalize_url(url, scheme='http'):
+    """
+    Make sure there is a http at the head of what should be a url
+
+    >>> normalize_url("google.com")
+    'http://google.com'
+    >>> normalize_url("http://google.com")
+    'http://google.com'
+    >>> normalize_url("foo.com/a/b/c/d/e/f.html")
+    'http://foo.com/a/b/c/d/e/f.html'
+    >>> normalize_url("foo.com", "https")
+    'https://foo.com'
+    """
+    scheme_sep = '://'
+    if url.find(scheme_sep) != -1:
+        return url
+    else:
+        return scheme + scheme_sep + url
index c765da941050076c4c8787c13d0499b80452a74a..9e8ccf53b6c21f911bccaa976e532587016b244f 100644 (file)
@@ -3,22 +3,18 @@ import logging
 import sys
 from htsworkflow.pipelines.retrieve_config import *
 from htsworkflow.pipelines import retrieve_config
-from htsworkflow.pipelines.genome_mapper import getAvailableGenomes
-from htsworkflow.pipelines.genome_mapper import constructMapperDict
 
 #Turn on built-in command-line parsing.
 retrieve_config.DISABLE_CMDLINE = False
 
-def main(args=None):
-  #Display help if no args are presented
-  if len(sys.argv) == 1:
-    sys.argv.append('-h')
+def main(argv=None):
+  if argv is None:
+    argv = sys.argv
     
-  options = getCombinedOptions()
+  #Display help if no args are presented
+  options = getCombinedOptions(argv)
+  
   msg_list = ['ERROR MESSAGES:']
-  if options.output_filepath is None:
-    msg_list.append("  Output filepath argument required. -o <filepath> or --output=<filepath>")
-    
   if options.flowcell is None:
     msg_list.append("  Flow cell argument required. -f <flowcell> or --flowcell=<flowcell>")
     
@@ -34,18 +30,7 @@ def main(args=None):
     print '\n'.join(msg_list)
     return 1
   
-  saveConfigFile(options.flowcell, options.url, options.output_filepath)
-
-  f = open(options.output_filepath, 'r')
-  data = f.read()
-  f.close()
-
-  genome_dict = getAvailableGenomes(options.genome_dir)
-  mapper_dict = constructMapperDict(genome_dict)
-
-  f = open(options.output_filepath, 'w')
-  f.write(data % (mapper_dict))
-  f.close()
+  saveConfigFile(options)
   
   return 0