Recent IPAR xml config blocks include the runfolder name
diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py
index f23b255455c2296bab64113603e9fcf915cb4c3e..26c592ce5f7eb6d65175031f9c9010dc400d9410 100644
--- a/htsworkflow/pipelines/runfolder.py
+++ b/htsworkflow/pipelines/runfolder.py
@@ -1,5 +1,4 @@
-"""
-Core information needed to inspect a runfolder.
+"""Core information needed to inspect a runfolder.
 """
 from glob import glob
 import logging
@@ -22,6 +21,7 @@ from htsworkflow.pipelines import ElementTree, \
                                   EUROPEAN_STRPTIME, EUROPEAN_DATE_RE, \
                                   VERSION_RE, USER_RE, \
                                   LANES_PER_FLOWCELL, LANE_LIST
+from htsworkflow.pipelines.samplekey import LANE_SAMPLE_KEYS
 from htsworkflow.util.alphanum import alphanum
 from htsworkflow.util.ethelp import indent, flatten
 from htsworkflow.util.queuecommands import QueueCommands
@@ -29,14 +29,34 @@ from htsworkflow.util.queuecommands import QueueCommands
 from htsworkflow.pipelines import srf
 
 class PipelineRun(object):
-    """
-    Capture "interesting" information about a pipeline run
+    """Capture "interesting" information about a pipeline run
+    
+    :Variables:
+      - `pathname` location of the root of this runfolder
+      - `serialization_filename` read only property containing name of run xml file
+      - `flowcell_id` read-only property containing flowcell id (bar code)
+      - `datadir` location of the runfolder data dir.
+      - `image_analysis` generic name for Firecrest or IPAR image analysis
+      - `bustard` summary base caller
+      - `gerald` summary of sequence alignment and quality control metrics
     """
     XML_VERSION = 1
     PIPELINE_RUN = 'PipelineRun'
     FLOWCELL_ID = 'FlowcellID'
 
     def __init__(self, pathname=None, flowcell_id=None, xml=None):
+        """Initialize a PipelineRun object
+        
+        :Parameters:
+          - `pathname` the root directory of this run folder.
+          - `flowcell_id` the flowcell ID in case it can't be determined
+          - `xml` Allows initializing an object from a serialized xml file.
+          
+        :Types:
+          - `pathname` str
+          - `flowcell_id` str
+          - `xml` ElementTree
+        """
         if pathname is not None:
           self.pathname = os.path.normpath(pathname)
         else:
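A minimal usage sketch of the class documented above, assuming a conventionally named runfolder; the path and flowcell id are hypothetical, while get_runs() and the attribute names come from this module.

    # Hedged sketch: the runfolder path and flowcell id below are made up.
    from htsworkflow.pipelines.runfolder import get_runs

    for run in get_runs('/data/110815_SN787_0101_AD07K6ACXX', flowcell_id='D07K6ACXX'):
        print run.flowcell_id        # bar code, falling back to the value passed above
        print run.runfolder_name     # from GERALD, or now from the IPAR image analysis
        print run.image_analysis     # Firecrest or IPAR object
        print run.bustard            # base caller summary
        print run.gerald             # may be None for unaligned-only analyses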
@@ -44,6 +64,7 @@ class PipelineRun(object):
         self._name = None
         self._flowcell_id = flowcell_id
         self.datadir = None
+        self.suffix = None
         self.image_analysis = None
         self.bustard = None
         self.gerald = None
@@ -52,6 +73,10 @@ class PipelineRun(object):
           self.set_elements(xml)
 
     def _get_flowcell_id(self):
+        """Return the flowcell ID
+        
+        Attempts to find the flowcell ID through several mechanisms.
+        """
         # extract flowcell ID
         if self._flowcell_id is None:
             self._flowcell_id = self._get_flowcell_id_from_runinfo()
@@ -71,6 +96,8 @@ class PipelineRun(object):
 
     def _get_flowcell_id_from_flowcellid(self):
         """Extract flowcell id from a Config/FlowcellId.xml file
+        
+        :return: flowcell_id or None if not found
         """
         config_dir = os.path.join(self.pathname, 'Config')
         flowcell_id_path = os.path.join(config_dir, 'FlowcellId.xml')
@@ -80,6 +107,8 @@ class PipelineRun(object):
 
     def _get_flowcell_id_from_runinfo(self):
         """Read RunInfo file for flowcell id
+
+        :return: flowcell_id or None if not found
         """
         runinfo = os.path.join(self.pathname, 'RunInfo.xml')
         if os.path.exists(runinfo):
@@ -89,9 +118,10 @@ class PipelineRun(object):
             if len(fc_nodes) == 1:
                 return fc_nodes[0].text
 
-
     def _get_flowcell_id_from_path(self):
         """Guess a flowcell name from the path
+
+        :return: flowcell_id or None if not found
         """
         path_fields = self.pathname.split('_')
         if len(path_fields) > 0:
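A rough standalone sketch of the lookup order these docstrings describe (RunInfo.xml first, then Config/FlowcellId.xml, then the path); the Config/FlowcellId.xml step is omitted for brevity, and the Run/Flowcell element path is an assumption based on the surrounding context lines rather than a documented schema.

    # Hedged sketch: approximate fallback chain, not the class method itself.
    import os
    from xml.etree import ElementTree

    def guess_flowcell_id(runfolder):
        runinfo = os.path.join(runfolder, 'RunInfo.xml')
        if os.path.exists(runinfo):
            tree = ElementTree.parse(runinfo).getroot()
            fc_nodes = tree.findall('Run/Flowcell')   # assumed element name
            if len(fc_nodes) == 1:
                return fc_nodes[0].text
        # last resort: runfolder names usually end in _<flowcell id>
        path_fields = runfolder.rstrip(os.sep).split('_')
        if len(path_fields) > 0:
            return path_fields[-1]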
@@ -99,28 +129,54 @@ class PipelineRun(object):
             return path_fields[-1]
 
     def _get_runfolder_name(self):
-        if self.gerald is None:
-            return None
-        else:
+        if self.gerald:
             return self.gerald.runfolder_name
+        elif hasattr(self.image_analysis, 'runfolder_name'):
+            return self.image_analysis.runfolder_name
+        else:
+            return None
     runfolder_name = property(_get_runfolder_name)
 
-    def get_elements(self):
+    def _get_run_dirname(self):
+        """Return name of directory to hold result files from one analysis
+        
+        For pre-multiplexing runs this is just the cycle range, e.g. C1-123.
+        For post-multiplexing runs the "suffix" that we add to
+        differentiate runs is appended to the range.
+        E.g. Unaligned_6mm may produce C1-200_6mm
         """
-        make one master xml file from all of our sub-components.
+        if self.image_analysis is None:
+            raise ValueError("Not initialized yet")
+        start = self.image_analysis.start
+        stop = self.image_analysis.stop
+        cycle_fragment = "C%d-%d" % (start, stop)
+        if self.suffix:
+            cycle_fragment += self.suffix
+
+        return cycle_fragment
+    run_dirname = property(_get_run_dirname)
+
+    def get_elements(self):
+        """make one master xml file from all of our sub-components.
+        
+        :return: an ElementTree containing all available pipeline
+                 run xml components.
         """
         root = ElementTree.Element(PipelineRun.PIPELINE_RUN)
         flowcell = ElementTree.SubElement(root, PipelineRun.FLOWCELL_ID)
         flowcell.text = self.flowcell_id
         root.append(self.image_analysis.get_elements())
         root.append(self.bustard.get_elements())
-        root.append(self.gerald.get_elements())
+        if self.gerald:
+            root.append(self.gerald.get_elements())
         return root
 
     def set_elements(self, tree):
-        # this file gets imported by all the others,
-        # so we need to hide the imports to avoid a cyclic imports
+        """Initialize a PipelineRun object from a run.xml ElementTree.
 
+        :param tree: parsed ElementTree
+        :type tree: ElementTree
+        """
         tag = tree.tag.lower()
         if tag != PipelineRun.PIPELINE_RUN.lower():
           raise ValueError('Pipeline Run Expecting %s got %s' % (
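A small self-contained sketch of the naming rule in the _get_run_dirname docstring above; the start, stop, and suffix values are illustrative.

    # Hedged sketch: cycle range plus the optional demultiplexing suffix.
    def run_dirname(start, stop, suffix=None):
        cycle_fragment = "C%d-%d" % (start, stop)
        if suffix:
            cycle_fragment += suffix
        return cycle_fragment

    assert run_dirname(1, 123) == 'C1-123'               # pre-multiplexing layout
    assert run_dirname(1, 200, '_6mm') == 'C1-200_6mm'   # e.g. from Unaligned_6mm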
@@ -144,27 +200,47 @@ class PipelineRun(object):
           else:
             LOGGER.warn('PipelineRun unrecognized tag %s' % (tag,))
 
-    def _get_run_name(self):
-        """
-        Given a run tuple, find the latest date and use that as our name
+    def _get_serialization_filename(self):
+        """Compute the filename for the run xml file
+        
+        Attempts to find the latest date from all of the run 
+        components.
+        
+        :return: filename run_{flowcell id}_{timestamp}.xml
+        :rtype: str
         """
         if self._name is None:
-          tmax = max(self.image_analysis.time, self.bustard.time, self.gerald.time)
+          components = [self.image_analysis, self.bustard, self.gerald]
+          tmax = max([ c.time for c in components if c ])
           timestamp = time.strftime('%Y-%m-%d', time.localtime(tmax))
           self._name = 'run_' + self.flowcell_id + "_" + timestamp + '.xml'
         return self._name
-    name = property(_get_run_name)
+    serialization_filename = property(_get_serialization_filename)
 
     def save(self, destdir=None):
+        """Save a run xml file.
+        
+        :param destdir: Directory name to save to, uses current directory
+                        if not specified.
+        :type destdir: str
+        """
         if destdir is None:
             destdir = ''
-        LOGGER.info("Saving run report " + self.name)
+        LOGGER.info("Saving run report " + self.serialization_filename)
         xml = self.get_elements()
         indent(xml)
-        dest_pathname = os.path.join(destdir, self.name)
+        dest_pathname = os.path.join(destdir, self.serialization_filename)
         ElementTree.ElementTree(xml).write(dest_pathname)
 
     def load(self, filename):
+        """Load a run xml into this object.
+        
+        :Parameters:
+          - `filename` location of a run xml file
+          
+        :Types:
+          - `filename` str
+        """
         LOGGER.info("Loading run report from " + filename)
         tree = ElementTree.parse(filename).getroot()
         self.set_elements(tree)
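A sketch of how the serialization filename documented above is assembled; the flowcell id and component times are invented for illustration.

    # Hedged sketch: run_<flowcell id>_<date of newest component>.xml, as in
    # _get_serialization_filename(); save() then writes the XML tree to
    # os.path.join(destdir, serialization_filename).
    import time

    flowcell_id = 'D07K6ACXX'                       # hypothetical
    component_times = [1313452800.0, 1313539200.0]  # image_analysis/bustard/gerald times
    timestamp = time.strftime('%Y-%m-%d', time.localtime(max(component_times)))
    serialization_filename = 'run_' + flowcell_id + '_' + timestamp + '.xml'
    print serialization_filename   # e.g. run_D07K6ACXX_2011-08-16.xml (timezone dependent)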
@@ -174,7 +250,7 @@ def load_pipeline_run_xml(pathname):
     Load and instantiate a Pipeline run from a run xml file
 
     :Parameters:
-      - `pathname` location of an run xml file
+      - `pathname` location of a run xml file
 
     :Returns: initialized PipelineRun object
     """
@@ -183,19 +259,16 @@ def load_pipeline_run_xml(pathname):
     return run
 
 def get_runs(runfolder, flowcell_id=None):
-    """
-    Search through a run folder for all the various sub component runs
-    and then return a PipelineRun for each different combination.
+    """Find all runs associated with a runfolder.
+    
+    We end up with multiple analysis runs as we sometimes
+    need to try with different parameters. This attempts
+    to return a list of all the various runs.
 
     For example if there are two different GERALD runs, this will
     generate two different PipelineRun objects, that differ
     in there gerald component.
     """
-    from htsworkflow.pipelines import firecrest
-    from htsworkflow.pipelines import ipar
-    from htsworkflow.pipelines import bustard
-    from htsworkflow.pipelines import gerald
-
     datadir = os.path.join(runfolder, 'Data')
 
     LOGGER.info('Searching for runs in ' + datadir)
@@ -279,18 +352,18 @@ def build_hiseq_runs(image_analysis, runs, datadir, runfolder, flowcell_id):
     matched_paths = hiseq_match_aligned_unaligned(aligned_paths, unaligned_paths)
     LOGGER.debug("Matched HiSeq analysis: %s", str(matched_paths))
 
-    for aligned, unaligned in matched_paths:
+    for aligned, unaligned, suffix in matched_paths:
         if unaligned is None:
             LOGGER.warn("Aligned directory %s without matching unalinged, skipping", aligned)
             continue
 
-        g = gerald.gerald(aligned)
-        print "scan for aligned then remove them from unaligned list"
         try:
             p = PipelineRun(runfolder, flowcell_id)
             p.datadir = datadir
+            p.suffix = suffix
             p.image_analysis = image_analysis
             p.bustard = bustard.bustard(unaligned)
+            assert p.bustard
             if aligned:
                 p.gerald = gerald.gerald(aligned)
             runs.append(p)
@@ -312,7 +385,7 @@ def hiseq_match_aligned_unaligned(aligned, unaligned):
     for key in keys:
         a = aligned_by_suffix.get(key)
         u = unaligned_by_suffix.get(key)
-        matches.append((a, u))
+        matches.append((a, u, key))
     return matches
 
 def build_dir_dict_by_suffix(prefix, dirnames):
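A toy sketch of the suffix pairing changed in this hunk, showing why the suffix itself becomes the new third tuple element (it later feeds PipelineRun.suffix); the directory names are hypothetical and dirs_by_suffix is only a stand-in for build_dir_dict_by_suffix.

    # Hedged sketch: pair Aligned*/Unaligned* directories by their shared suffix.
    import os

    def dirs_by_suffix(prefix, dirnames):
        # stand-in for build_dir_dict_by_suffix(): key each dir by what follows the prefix
        return dict((os.path.basename(d)[len(prefix):], d) for d in dirnames)

    aligned_by_suffix = dirs_by_suffix('Aligned', ['/run/Aligned', '/run/Aligned_6mm'])
    unaligned_by_suffix = dirs_by_suffix('Unaligned', ['/run/Unaligned', '/run/Unaligned_6mm'])
    matches = []
    for key in sorted(set(aligned_by_suffix) | set(unaligned_by_suffix)):
        matches.append((aligned_by_suffix.get(key), unaligned_by_suffix.get(key), key))
    # [('/run/Aligned', '/run/Unaligned', ''),
    #  ('/run/Aligned_6mm', '/run/Unaligned_6mm', '_6mm')]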
@@ -466,11 +539,16 @@ def summary_report(runs):
     Summarize cluster numbers and mapped read counts for a runfolder
     """
     report = []
+    eland_keys = []
     for run in runs:
         # print a run name?
-        report.append('Summary for %s' % (run.name,))
+        report.append('Summary for %s' % (run.serialization_filename,))
        # sort the report
-       eland_keys = sorted(run.gerald.eland_results.keys())
+        if run.gerald:
+            eland_keys = sorted(run.gerald.eland_results.keys())
+        else:
+            report.append("Alignment not done, no report possible")
+
     for lane_id in eland_keys:
         report.extend(summarize_lane(run.gerald, lane_id))
         report.append('---')
@@ -485,14 +563,14 @@ def is_compressed(filename):
     else:
         return False
 
-def save_flowcell_reports(data_dir, cycle_dir):
+def save_flowcell_reports(data_dir, run_dirname):
     """
     Save the flowcell quality reports
     """
     data_dir = os.path.abspath(data_dir)
     status_file = os.path.join(data_dir, 'Status.xml')
     reports_dir = os.path.join(data_dir, 'reports')
-    reports_dest = os.path.join(cycle_dir, 'flowcell-reports.tar.bz2')
+    reports_dest = os.path.join(run_dirname, 'flowcell-reports.tar.bz2')
     if os.path.exists(reports_dir):
         cmd_list = [ 'tar', 'cjvf', reports_dest, 'reports/' ]
         if os.path.exists(status_file):
@@ -505,21 +583,21 @@ def save_flowcell_reports(data_dir, cycle_dir):
         os.chdir(cwd)
 
 
-def save_summary_file(pipeline, cycle_dir):
+def save_summary_file(pipeline, run_dirname):
     # Copy Summary.htm
     gerald_object = pipeline.gerald
     gerald_summary = os.path.join(gerald_object.pathname, 'Summary.htm')
     status_files_summary = os.path.join(pipeline.datadir, 'Status_Files', 'Summary.htm')
     if os.path.exists(gerald_summary):
-        LOGGER.info('Copying %s to %s' % (gerald_summary, cycle_dir))
-        shutil.copy(gerald_summary, cycle_dir)
+        LOGGER.info('Copying %s to %s' % (gerald_summary, run_dirname))
+        shutil.copy(gerald_summary, run_dirname)
     elif os.path.exists(status_files_summary):
-        LOGGER.info('Copying %s to %s' % (status_files_summary, cycle_dir))
-        shutil.copy(status_files_summary, cycle_dir)
+        LOGGER.info('Copying %s to %s' % (status_files_summary, run_dirname))
+        shutil.copy(status_files_summary, run_dirname)
     else:
         LOGGER.info('Summary file %s was not found' % (summary_path,))
 
-def save_ivc_plot(bustard_object, cycle_dir):
+def save_ivc_plot(bustard_object, run_dirname):
     """
     Save the IVC page and its supporting images
     """
@@ -527,12 +605,12 @@ def save_ivc_plot(bustard_object, cycle_dir):
     plot_image_path = os.path.join(bustard_object.pathname, 'Plots')
     plot_images = os.path.join(plot_image_path, 's_?_[a-z]*.png')
 
-    plot_target_path = os.path.join(cycle_dir, 'Plots')
+    plot_target_path = os.path.join(run_dirname, 'Plots')
 
     if os.path.exists(plot_html):
         LOGGER.debug("Saving %s" % (plot_html,))
         LOGGER.debug("Saving %s" % (plot_images,))
-        shutil.copy(plot_html, cycle_dir)
+        shutil.copy(plot_html, run_dirname)
         if not os.path.exists(plot_target_path):
             os.mkdir(plot_target_path)
         for plot_file in glob(plot_images):
@@ -541,7 +619,7 @@ def save_ivc_plot(bustard_object, cycle_dir):
         LOGGER.warning('Missing IVC.html file, not archiving')
 
 
-def compress_score_files(bustard_object, cycle_dir):
+def compress_score_files(bustard_object, run_dirname):
     """
     Compress score files into our result directory
     """
@@ -559,7 +637,7 @@ def compress_score_files(bustard_object, cycle_dir):
 
     tar_cmd = ['tar', 'c'] + score_files
     bzip_cmd = [ 'bzip2', '-9', '-c' ]
-    tar_dest_name = os.path.join(cycle_dir, 'scores.tar.bz2')
+    tar_dest_name = os.path.join(run_dirname, 'scores.tar.bz2')
     tar_dest = open(tar_dest_name, 'w')
     LOGGER.info("Compressing score files from %s" % (scores_path,))
     LOGGER.info("Running tar: " + " ".join(tar_cmd[:10]))
@@ -573,7 +651,7 @@ def compress_score_files(bustard_object, cycle_dir):
     tar.wait()
 
 
-def compress_eland_results(gerald_object, cycle_dir, num_jobs=1):
+def compress_eland_results(gerald_object, run_dirname, num_jobs=1):
     """
     Compress eland result files into the archive directory
     """
@@ -588,7 +666,7 @@ def compress_eland_results(gerald_object, cycle_dir, num_jobs=1):
                 "Lane ID %s does not have a filename." % (eland_lane.lane_id,))
             else:
               path, name = os.path.split(source_name)
-              dest_name = os.path.join(cycle_dir, name)
+              dest_name = os.path.join(run_dirname, name)
               LOGGER.info("Saving eland file %s to %s" % \
                          (source_name, dest_name))
 
@@ -629,52 +707,56 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r
         if not os.path.exists(result_dir):
             os.mkdir(result_dir)
 
-        # create cycle_dir
-        cycle = "C%d-%d" % (r.image_analysis.start, r.image_analysis.stop)
-        LOGGER.info("Filling in %s" % (cycle,))
-        cycle_dir = os.path.join(result_dir, cycle)
-        cycle_dir = os.path.abspath(cycle_dir)
-        if os.path.exists(cycle_dir):
-            LOGGER.error("%s already exists, not overwriting" % (cycle_dir,))
+        # create directory to add this run's results to
+        LOGGER.info("Filling in %s" % (r.run_dirname,))
+        run_dirname = os.path.join(result_dir, r.run_dirname)
+        run_dirname = os.path.abspath(run_dirname)
+        if os.path.exists(run_dirname):
+            LOGGER.error("%s already exists, not overwriting" % (run_dirname,))
             continue
         else:
-            os.mkdir(cycle_dir)
+            os.mkdir(run_dirname)
 
         # save run file
-        r.save(cycle_dir)
+        r.save(run_dirname)
 
         # save illumina flowcell status report
         save_flowcell_reports(os.path.join(r.image_analysis.pathname, '..'),
-                              cycle_dir)
+                              run_dirname)
 
         # save stuff from bustard
         # grab IVC plot
-        save_ivc_plot(r.bustard, cycle_dir)
+        save_ivc_plot(r.bustard, run_dirname)
 
         # build base call saving commands
         if site is not None:
-            save_raw_data(num_jobs, r, site, raw_format, cycle_dir)
+            save_raw_data(num_jobs, r, site, raw_format, run_dirname)
 
         # save stuff from GERALD
         # copy stuff out of the main run
-        g = r.gerald
+        if r.gerald:
+            g = r.gerald
 
-        # save summary file
-        save_summary_file(r, cycle_dir)
+            # save summary file
+            save_summary_file(r, run_dirname)
 
-        # compress eland result files
-        compress_eland_results(g, cycle_dir, num_jobs)
+            # compress eland result files
+            compress_eland_results(g, run_dirname, num_jobs)
 
         # md5 all the compressed files once we're done
-        md5_commands = srf.make_md5_commands(cycle_dir)
-        srf.run_commands(cycle_dir, md5_commands, num_jobs)
+        md5_commands = srf.make_md5_commands(run_dirname)
+        srf.run_commands(run_dirname, md5_commands, num_jobs)
 
-def save_raw_data(num_jobs, r, site, raw_format, cycle_dir):
+def save_raw_data(num_jobs, r, site, raw_format, run_dirname):
     lanes = []
-    for lane in r.gerald.lanes:
-        lane_parameters = r.gerald.lanes.get(lane, None)
-        if lane_parameters is not None:
-            lanes.append(lane)
+    if r.gerald:
+        for lane in r.gerald.lanes:
+            lane_parameters = r.gerald.lanes.get(lane, None)
+            if lane_parameters is not None:
+                lanes.append(lane)
+    else:
+        # assume default list of lanes
+        lanes = LANE_SAMPLE_KEYS
 
     run_name = srf.pathname_to_run_name(r.pathname)
     seq_cmds = []
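A hedged sketch of the lane-selection fallback that this hunk and the new LANE_SAMPLE_KEYS import introduce; the gerald argument here is a stand-in for the real GERALD parser object.

    # Hedged sketch: prefer the lanes GERALD configured when it ran, otherwise fall
    # back to the default lane list so raw data can still be archived.
    from htsworkflow.pipelines.samplekey import LANE_SAMPLE_KEYS

    def lanes_to_save(gerald):
        lanes = []
        if gerald:
            for lane in gerald.lanes:
                if gerald.lanes.get(lane) is not None:
                    lanes.append(lane)
        else:
            lanes = LANE_SAMPLE_KEYS   # assume the default list of lanes
        return lanes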
@@ -683,13 +765,14 @@ def save_raw_data(num_jobs, r, site, raw_format, cycle_dir):
 
     LOGGER.info("Raw Format is: %s" % (raw_format, ))
     if raw_format == 'fastq':
-        rawpath = os.path.join(r.pathname, r.gerald.runfolder_name)
+        LOGGER.info("Reading fastq files from %s", r.bustard.pathname)
+        rawpath = os.path.join(r.pathname, r.bustard.pathname)
         LOGGER.info("raw data = %s" % (rawpath,))
-        srf.copy_hiseq_project_fastqs(run_name, rawpath, site, cycle_dir)
+        srf.copy_hiseq_project_fastqs(run_name, rawpath, site, run_dirname)
     elif raw_format == 'qseq':
-        seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir)
+        seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, run_dirname)
     elif raw_format == 'srf':
-        seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir, 0)
+        seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, run_dirname, 0)
     else:
         raise ValueError('Unknown --raw-format=%s' % (raw_format))
     srf.run_commands(r.bustard.pathname, seq_cmds, num_jobs)