Merge branch 'master' of mus.cacr.caltech.edu:htsworkflow

[htsworkflow.git] / htsworkflow / pipelines / runfolder.py
diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py

index fcee65ce8d32806efbffb5abf610a0bf094d0b5c..4a2b4cdc3704fccb1b7fbdd5f48211c86851b6c4 100644 (file)
--- a/htsworkflow/pipelines/runfolder.py
+++ b/htsworkflow/pipelines/runfolder.py
@@ -17,6 +17,8 @@ try:
  except ImportError, e:
      from elementtree import ElementTree
  
+LOGGER = logging.getLogger(__name__)
+
  EUROPEAN_STRPTIME = "%d-%m-%Y"
  EUROPEAN_DATE_RE = "([0-9]{1,2}-[0-9]{1,2}-[0-9]{4,4})"
  VERSION_RE = "([0-9\.]+)"
@@ -64,14 +66,14 @@ class PipelineRun(object):
                  path_fields = self.pathname.split('_')
                  if len(path_fields) > 0:
                      # guessing last element of filename
-                   flowcell_id = path_fields[-1]
+                   self._flowcell_id = path_fields[-1]
                  else:
-                   flowcell_id = 'unknown'
+                   self._flowcell_id = 'unknown'
+
+                   LOGGER.warning(
+                       "Flowcell id was not found, guessing %s" % (
+                       self._flowcell_id))
  
-                logging.warning(
-                  "Flowcell id was not found, guessing %s" % (
-                     flowcell_id))
-                self._flowcell_id = flowcell_id
          return self._flowcell_id
      flowcell_id = property(_get_flowcell_id)
  
@@ -121,7 +123,7 @@ class PipelineRun(object):
            elif tag == gerald.Gerald.GERALD.lower():
              self.gerald = gerald.Gerald(xml=element)
            else:
-            logging.warn('PipelineRun unrecognized tag %s' % (tag,))
+            LOGGER.warn('PipelineRun unrecognized tag %s' % (tag,))
  
      def _get_run_name(self):
          """
@@ -137,14 +139,14 @@ class PipelineRun(object):
      def save(self, destdir=None):
          if destdir is None:
              destdir = ''
-        logging.info("Saving run report " + self.name)
+        LOGGER.info("Saving run report " + self.name)
          xml = self.get_elements()
          indent(xml)
          dest_pathname = os.path.join(destdir, self.name)
          ElementTree.ElementTree(xml).write(dest_pathname)
  
      def load(self, filename):
-        logging.info("Loading run report from " + filename)
+        LOGGER.info("Loading run report from " + filename)
          tree = ElementTree.parse(filename).getroot()
          self.set_elements(tree)
  
@@ -176,17 +178,17 @@ def get_runs(runfolder, flowcell_id=None):
      from htsworkflow.pipelines import gerald
  
      def scan_post_image_analysis(runs, runfolder, image_analysis, pathname):
-        logging.info("Looking for bustard directories in %s" % (pathname,))
+        LOGGER.info("Looking for bustard directories in %s" % (pathname,))
          bustard_dirs = glob(os.path.join(pathname, "Bustard*"))
          # RTA BaseCalls looks enough like Bustard.
          bustard_dirs.extend(glob(os.path.join(pathname, "BaseCalls")))
          for bustard_pathname in bustard_dirs:
-            logging.info("Found bustard directory %s" % (bustard_pathname,))
+            LOGGER.info("Found bustard directory %s" % (bustard_pathname,))
              b = bustard.bustard(bustard_pathname)
              gerald_glob = os.path.join(bustard_pathname, 'GERALD*')
-            logging.info("Looking for gerald directories in %s" % (pathname,))
+            LOGGER.info("Looking for gerald directories in %s" % (pathname,))
              for gerald_pathname in glob(gerald_glob):
-                logging.info("Found gerald directory %s" % (gerald_pathname,))
+                LOGGER.info("Found gerald directory %s" % (gerald_pathname,))
                  try:
                      g = gerald.gerald(gerald_pathname)
                      p = PipelineRun(runfolder, flowcell_id)
@@ -195,18 +197,18 @@ def get_runs(runfolder, flowcell_id=None):
                      p.gerald = g
                      runs.append(p)
                  except IOError, e:
-                    logging.error("Ignoring " + str(e))
+                    LOGGER.error("Ignoring " + str(e))
  
      datadir = os.path.join(runfolder, 'Data')
  
-    logging.info('Searching for runs in ' + datadir)
+    LOGGER.info('Searching for runs in ' + datadir)
      runs = []
      # scan for firecrest directories
      for firecrest_pathname in glob(os.path.join(datadir, "*Firecrest*")):
-        logging.info('Found firecrest in ' + datadir)
+        LOGGER.info('Found firecrest in ' + datadir)
          image_analysis = firecrest.firecrest(firecrest_pathname)
          if image_analysis is None:
-            logging.warn(
+            LOGGER.warn(
                  "%s is an empty or invalid firecrest directory" % (firecrest_pathname,)
              )
          else:
@@ -218,10 +220,10 @@ def get_runs(runfolder, flowcell_id=None):
      # The Intensities directory from the RTA software looks a lot like IPAR
      ipar_dirs.extend(glob(os.path.join(datadir, 'Intensities')))
      for ipar_pathname in ipar_dirs:
-        logging.info('Found ipar directories in ' + datadir)
+        LOGGER.info('Found ipar directories in ' + datadir)
          image_analysis = ipar.ipar(ipar_pathname)
          if image_analysis is None:
-            logging.warn(
+            LOGGER.warn(
                  "%s is an empty or invalid IPAR directory" % (ipar_pathname,)
              )
          else:
@@ -250,19 +252,19 @@ def get_specific_run(gerald_dir):
  
      runfolder_dir = os.path.abspath(os.path.join(image_dir, '..', '..'))
  
-    logging.info('--- use-run detected options ---')
-    logging.info('runfolder: %s' % (runfolder_dir,))
-    logging.info('image_dir: %s' % (image_dir,))
-    logging.info('bustard_dir: %s' % (bustard_dir,))
-    logging.info('gerald_dir: %s' % (gerald_dir,))
+    LOGGER.info('--- use-run detected options ---')
+    LOGGER.info('runfolder: %s' % (runfolder_dir,))
+    LOGGER.info('image_dir: %s' % (image_dir,))
+    LOGGER.info('bustard_dir: %s' % (bustard_dir,))
+    LOGGER.info('gerald_dir: %s' % (gerald_dir,))
  
      # find our processed image dir
      image_run = None
      # split into parent, and leaf directory
      # leaf directory should be an IPAR or firecrest directory
      data_dir, short_image_dir = os.path.split(image_dir)
-    logging.info('data_dir: %s' % (data_dir,))
-    logging.info('short_iamge_dir: %s' % (short_image_dir,))
+    LOGGER.info('data_dir: %s' % (data_dir,))
+    LOGGER.info('short_iamge_dir: %s' % (short_image_dir,))
  
      # guess which type of image processing directory we have by looking
      # in the leaf directory name
@@ -276,19 +278,19 @@ def get_specific_run(gerald_dir):
      # if we din't find a run, report the error and return
      if image_run is None:
          msg = '%s does not contain an image processing step' % (image_dir,)
-        logging.error(msg)
+        LOGGER.error(msg)
          return None
  
      # find our base calling
      base_calling_run = bustard.bustard(bustard_dir)
      if base_calling_run is None:
-        logging.error('%s does not contain a bustard run' % (bustard_dir,))
+        LOGGER.error('%s does not contain a bustard run' % (bustard_dir,))
          return None
  
      # find alignments
      gerald_run = gerald.gerald(gerald_dir)
      if gerald_run is None:
-        logging.error('%s does not contain a gerald run' % (gerald_dir,))
+        LOGGER.error('%s does not contain a gerald run' % (gerald_dir,))
          return None
  
      p = PipelineRun(runfolder_dir)
@@ -296,7 +298,7 @@ def get_specific_run(gerald_dir):
      p.bustard = base_calling_run
      p.gerald = gerald_run
  
-    logging.info('Constructed PipelineRun from %s' % (gerald_dir,))
+    LOGGER.info('Constructed PipelineRun from %s' % (gerald_dir,))
      return p
  
  def extract_run_parameters(runs):
@@ -397,7 +399,7 @@ def save_flowcell_reports(data_dir, cycle_dir):
          cmd_list = [ 'tar', 'cjvf', reports_dest, 'reports/' ]
          if os.path.exists(status_file):
              cmd_list.extend(['Status.xml', 'Status.xsl'])
-        logging.info("Saving reports from " + reports_dir)
+        LOGGER.info("Saving reports from " + reports_dir)
          cwd = os.getcwd()
          os.chdir(data_dir)
          q = QueueCommands([" ".join(cmd_list)])
@@ -409,10 +411,10 @@ def save_summary_file(gerald_object, cycle_dir):
      # Copy Summary.htm
      summary_path = os.path.join(gerald_object.pathname, 'Summary.htm')
      if os.path.exists(summary_path):
-        logging.info('Copying %s to %s' % (summary_path, cycle_dir))
+        LOGGER.info('Copying %s to %s' % (summary_path, cycle_dir))
          shutil.copy(summary_path, cycle_dir)
      else:
-        logging.info('Summary file %s was not found' % (summary_path,))
+        LOGGER.info('Summary file %s was not found' % (summary_path,))
  
  def save_ivc_plot(bustard_object, cycle_dir):
      """
@@ -425,15 +427,15 @@ def save_ivc_plot(bustard_object, cycle_dir):
      plot_target_path = os.path.join(cycle_dir, 'Plots')
  
      if os.path.exists(plot_html):
-        logging.debug("Saving %s" % (plot_html,))
-        logging.debug("Saving %s" % (plot_images,))
+        LOGGER.debug("Saving %s" % (plot_html,))
+        LOGGER.debug("Saving %s" % (plot_images,))
          shutil.copy(plot_html, cycle_dir)
          if not os.path.exists(plot_target_path):
              os.mkdir(plot_target_path)
          for plot_file in glob(plot_images):
              shutil.copy(plot_file, plot_target_path)
      else:
-        logging.warning('Missing IVC.html file, not archiving')
+        LOGGER.warning('Missing IVC.html file, not archiving')
  
  
  def compress_score_files(bustard_object, cycle_dir):
@@ -456,10 +458,10 @@ def compress_score_files(bustard_object, cycle_dir):
      bzip_cmd = [ 'bzip2', '-9', '-c' ]
      tar_dest_name = os.path.join(cycle_dir, 'scores.tar.bz2')
      tar_dest = open(tar_dest_name, 'w')
-    logging.info("Compressing score files from %s" % (scores_path,))
-    logging.info("Running tar: " + " ".join(tar_cmd[:10]))
-    logging.info("Running bzip2: " + " ".join(bzip_cmd))
-    logging.info("Writing to %s" % (tar_dest_name,))
+    LOGGER.info("Compressing score files from %s" % (scores_path,))
+    LOGGER.info("Running tar: " + " ".join(tar_cmd[:10]))
+    LOGGER.info("Running bzip2: " + " ".join(bzip_cmd))
+    LOGGER.info("Writing to %s" % (tar_dest_name,))
  
      env = {'BZIP': '-9'}
      tar = subprocess.Popen(tar_cmd, stdout=subprocess.PIPE, shell=False, env=env,
@@ -479,26 +481,26 @@ def compress_eland_results(gerald_object, cycle_dir, num_jobs=1):
          for eland_lane in lanes_dictionary.values():
              source_name = eland_lane.pathname
              if source_name is None:
-              logging.info(
+              LOGGER.info(
                  "Lane ID %s does not have a filename." % (eland_lane.lane_id,))
              else:
                path, name = os.path.split(source_name)
                dest_name = os.path.join(cycle_dir, name)
-              logging.info("Saving eland file %s to %s" % \
+              LOGGER.info("Saving eland file %s to %s" % \
                           (source_name, dest_name))
  
                if is_compressed(name):
-                logging.info('Already compressed, Saving to %s' % (dest_name,))
+                LOGGER.info('Already compressed, Saving to %s' % (dest_name,))
                  shutil.copy(source_name, dest_name)
                else:
                  # not compressed
                  dest_name += '.bz2'
                  args = ['bzip2', '-9', '-c', source_name, '>', dest_name ]
                  bz_commands.append(" ".join(args))
-                #logging.info('Running: %s' % ( " ".join(args) ))
+                #LOGGER.info('Running: %s' % ( " ".join(args) ))
                  #bzip_dest = open(dest_name, 'w')
                  #bzip = subprocess.Popen(args, stdout=bzip_dest)
-                #logging.info('Saving to %s' % (dest_name, ))
+                #LOGGER.info('Saving to %s' % (dest_name, ))
                  #bzip.wait()
  
      if len(bz_commands) > 0:
@@ -520,17 +522,17 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r
  
      for r in runs:
        result_dir = os.path.join(output_base_dir, r.flowcell_id)
-      logging.info("Using %s as result directory" % (result_dir,))
+      LOGGER.info("Using %s as result directory" % (result_dir,))
        if not os.path.exists(result_dir):
          os.mkdir(result_dir)
  
        # create cycle_dir
        cycle = "C%d-%d" % (r.image_analysis.start, r.image_analysis.stop)
-      logging.info("Filling in %s" % (cycle,))
+      LOGGER.info("Filling in %s" % (cycle,))
        cycle_dir = os.path.join(result_dir, cycle)
        cycle_dir = os.path.abspath(cycle_dir)
        if os.path.exists(cycle_dir):
-        logging.error("%s already exists, not overwriting" % (cycle_dir,))
+        LOGGER.error("%s already exists, not overwriting" % (cycle_dir,))
          continue
        else:
          os.mkdir(cycle_dir)
@@ -578,24 +580,24 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r
  def rm_list(files, dry_run=True):
      for f in files:
          if os.path.exists(f):
-            logging.info('deleting %s' % (f,))
+            LOGGER.info('deleting %s' % (f,))
              if not dry_run:
                  if os.path.isdir(f):
                      shutil.rmtree(f)
                  else:
                      os.unlink(f)
          else:
-            logging.warn("%s doesn't exist." % (f,))
+            LOGGER.warn("%s doesn't exist." % (f,))
  
  def clean_runs(runs, dry_run=True):
      """
      Clean up run folders to optimize for compression.
      """
      if dry_run:
-        logging.info('In dry-run mode')
+        LOGGER.info('In dry-run mode')
  
      for run in runs:
-        logging.info('Cleaning %s' % (run.pathname,))
+        LOGGER.info('Cleaninging %s' % (run.pathname,))
          # rm RunLog*.xml
          runlogs = glob(os.path.join(run.pathname, 'RunLog*xml'))
          rm_list(runlogs, dry_run)
@@ -611,15 +613,15 @@ def clean_runs(runs, dry_run=True):
          calibration_dir = glob(os.path.join(run.pathname, 'Calibration_*'))
          rm_list(calibration_dir, dry_run)
          # rm Images/L*
-        logging.info("Cleaning images")
+        LOGGER.info("Cleaning images")
          image_dirs = glob(os.path.join(run.pathname, 'Images', 'L*'))
          rm_list(image_dirs, dry_run)
          # rm ReadPrep
-        logging.info("Cleaning ReadPrep*")
+        LOGGER.info("Cleaning ReadPrep*")
          read_prep_dirs = glob(os.path.join(run.pathname, 'ReadPrep*'))
          rm_list(read_prep_dirs, dry_run)
          # rm ReadPrep
-        logging.info("Cleaning Thubmnail_images")
+        LOGGER.info("Cleaning Thubmnail_images")
          thumbnail_dirs = glob(os.path.join(run.pathname, 'Thumbnail_Images'))
          rm_list(thumbnail_dirs, dry_run)