Set WITH_SEQUENCE as both a per-lane AND global parameter
[htsworkflow.git] / htsworkflow / pipelines / retrieve_config.py
index 3b6ba3e21f331f5ceade12cd40349831131d6653..3a1a56aeaf2188350db8b980d7b098bf0098a0ba 100644 (file)
@@ -14,6 +14,7 @@ except ImportError, e:
     import simplejson as json
 
 from htsworkflow.frontend.auth import apidata
+from htsworkflow.util import api
 from htsworkflow.util.url import normalize_url
 from htsworkflow.pipelines.genome_mapper import getAvailableGenomes
 from htsworkflow.pipelines.genome_mapper import constructMapperDict
@@ -36,7 +37,7 @@ def retrieve_flowcell_info(base_host_url, flowcell):
     """
     Return a dictionary describing a flowcell
     """
-    url = base_host_url + '/experiments/config/%s/json' % (flowcell)
+    url = api.flowcell_url(base_host_url, flowcell)
   
     try:
         apipayload = urllib.urlencode(apidata)
@@ -49,7 +50,7 @@ def retrieve_flowcell_info(base_host_url, flowcell):
     
     contents = web.read()
     headers = web.info()
-   
+
     if web.code == 403:
         msg = "403 - Forbbidden, probably need api key"
         raise FlowCellNotFound(msg)
@@ -106,6 +107,7 @@ def format_gerald_header(flowcell_info):
         lane_info = flowcell_info['lane_set'][lane_number]
         config += ['Lane%s: %s | %s' % (lane_number, lane_info['library_id'],
                                         lane_info['library_name'])]
+
     config += ['']
     return "\n# ".join(config)
 
@@ -114,13 +116,19 @@ def format_gerald_config(options, flowcell_info, genome_map):
     Generate a GERALD config file
     """
     # analysis name suffixes, keyed by whether this is a paired end run
-    run_type_suffix = { False: "", True: "_pair" }
+    eland_analysis_suffix = { False: "_extended", True: "_pair" }
+    sequence_analysis_suffix = { False: "", True: "_pair" }
 
     # it's convenient to have helpful information describing the flowcell
     # in the config file... things like which lane is which library.
     config = [format_gerald_header(flowcell_info)]
 
-    analysis_suffix = run_type_suffix[flowcell_info['paired_end']]
+    config += ['SEQUENCE_FORMAT --fastq']
+    config += ['ELAND_SET_SIZE 20']
+    config += ['WITH_SEQUENCE TRUE']
+    config += ['12345678:WITH_SEQUENCE TRUE']
+    analysis_suffix = eland_analysis_suffix[flowcell_info['paired_end']]
+    sequence_suffix = sequence_analysis_suffix[flowcell_info['paired_end']]
     lane_groups = group_lane_parameters(flowcell_info)
     for lane_index, lane_numbers in lane_groups.items():
         # lane_index is return value of group_lane_parameters
@@ -128,18 +136,25 @@ def format_gerald_config(options, flowcell_info, genome_map):
         lane_numbers.sort()
         lane_prefix = u"".join(lane_numbers)
         
-        if not is_sequencing:
-            config += ['%s:ANALYSIS eland%s' % (lane_prefix, analysis_suffix)]
+        species_path = genome_map.get(species, None)
+        logging.debug("Looked for genome '%s' got location '%s'" % (species, species_path))
+        if species_path is None:
+            no_genome_msg = "Forcing lanes %s to sequencing as there is no genome for %s"
+            logging.warning(no_genome_msg % (lane_numbers, species))
+            is_sequencing = True
+            
+        if is_sequencing:
+            config += ['%s:ANALYSIS sequence%s' % (lane_prefix, sequence_suffix)]
         else:
-            config += ['%s:ANALYSIS sequence%s' % (lane_prefix, analysis_suffix)]
+            config += ['%s:ANALYSIS eland%s' % (lane_prefix, analysis_suffix)]
+            config += ['%s:ELAND_GENOME %s' % (lane_prefix, species_path) ]
         #config += ['%s:READ_LENGTH %s' % ( lane_prefix, read_length ) ]
         config += ['%s:USE_BASES Y%s' % ( lane_prefix, read_length ) ]
-        species_path = genome_map.get(species, "Unknown")
-        config += ['%s:ELAND_GENOME %s' % (lane_prefix, species_path) ]
 
     # add in option for running a post-run script
-    if options.post_run is not None:
-        post_run = options.post_run  % {'runfolder': options.runfolder}
+    if not (options.post_run is None or options.runfolder is None):
+        runfolder = os.path.abspath(options.runfolder)
+        post_run = options.post_run  % {'runfolder': runfolder}
         config += ['POST_RUN_COMMAND %s' % (post_run,) ]
         
     config += [''] # force trailing newline
@@ -193,8 +208,9 @@ Config File:
   Example Config File:
   
     [%s]
-    config_host=http://somewhere.domain:port
-    genome_dir=/path to search for genomes
+    config_host: http://somewhere.domain:port
+    genome_dir: /path to search for genomes
+    post_run: runfolder -o <destdir> %%(runfolder)s
     
 """ % (CONFIG_SYSTEM, CONFIG_USER, GERALD_CONFIG_SECTION)
   
@@ -218,7 +234,9 @@ Config File:
     parser.add_option("-r", "--runfolder",
                       action="store", type="string",
                       help="specify runfolder for post_run command ")
-    
+
+    parser.add_option('-v', '--verbose', action='store_true', default=False,
+                       help='increase logging verbosity')
     return parser
     
 def constructConfigParser():
@@ -267,13 +285,6 @@ def getCombinedOptions(argv=None):
         if options.runfolder is not None:
             options.output_filepath = os.path.join(options.runfolder, 'config-auto.txt')
             
-    logging.info('USING OPTIONS:')
-    logging.info(u'     URL: %s' % (options.url,))
-    logging.info(u'     OUT: %s' % (options.output_filepath,))
-    logging.info(u'      FC: %s' % (options.flowcell,))
-    #logging.info(': %s' % (options.genome_dir,))
-    logging.info(u'post_run: %s' % ( unicode(options.post_run),))
-    
     return options
 
 
@@ -282,10 +293,19 @@ def saveConfigFile(options):
   retrieves the flowcell eland config file, given the base_host_url
   (i.e. http://sub.domain.edu:port)
   """
+  logging.info('USING OPTIONS:')
+  logging.info(u'     URL: %s' % (options.url,))
+  logging.info(u'     OUT: %s' % (options.output_filepath,))
+  logging.info(u'      FC: %s' % (options.flowcell,))
+  #logging.info(': %s' % (options.genome_dir,))
+  logging.info(u'post_run: %s' % ( unicode(options.post_run),))
+   
   flowcell_info = retrieve_flowcell_info(options.url, options.flowcell)
 
+  logging.debug('genome_dir: %s' % ( options.genome_dir, ))
   available_genomes = getAvailableGenomes(options.genome_dir)
   genome_map = constructMapperDict(available_genomes)
+  logging.debug('available genomes: %s' % ( unicode( genome_map.keys() ),))
 
   config = format_gerald_config(options, flowcell_info, genome_map)