Minimal changes needed to get raw data archived for loxcyc.
author Diane Trout <diane@caltech.edu>
Thu, 10 May 2012 01:45:05 +0000 (18:45 -0700)
committer Diane Trout <diane@caltech.edu>
Thu, 10 May 2012 01:45:05 +0000 (18:45 -0700)
It's probably not properly counting how many reads there are.

htsworkflow/pipelines/gerald.py
htsworkflow/pipelines/runfolder.py

index 06fc94b5433cfb00a131a8aee5e6ab78684e0cc4..9d32a5bfd741739edb4d685c7828180f69ed798d 100644 (file)
@@ -4,6 +4,7 @@ Provide access to information stored in the GERALD directory.
 from datetime import datetime, date
 import logging
 import os
+import stat
 import time
 
 from htsworkflow.pipelines.summary import Summary
@@ -91,6 +92,8 @@ class Gerald(object):
             self._lanes = {}
             tree = self._gerald.tree
             analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
+            if analysis is None:
+                return
             # according to the pipeline specs I think their fields
             # are sampleName_laneID, with sampleName defaulting to s
             # since laneIDs are constant lets just try using
@@ -104,6 +107,10 @@ class Gerald(object):
             if self._lane is None:
                 self._initalize_lanes()
             return self._lanes[key]
+        def get(self, key, default=None):
+            if self._lane is None:
+                self._initalize_lanes()
+            return self._lanes.get(key, default)
         def keys(self):
             if self._lane is None:
                 self._initalize_lanes()
@@ -138,8 +145,13 @@ class Gerald(object):
         if self.tree is None:
             return datetime.today()
         timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
-        epochstamp = time.mktime(time.strptime(timestamp, '%c'))
-        return datetime.fromtimestamp(epochstamp)
+        if timestamp is not None:
+            epochstamp = time.mktime(time.strptime(timestamp, '%c'))
+            return datetime.fromtimestamp(epochstamp)
+        if self.pathname is not None:
+            epochstamp = os.stat(self.pathname)[stat.ST_MTIME]
+            return datetime.fromtimestamp(epochstamp)
+        return datetime.today()
     date = property(_get_date)
 
     def _get_time(self):
@@ -162,10 +174,12 @@ class Gerald(object):
             root = os.path.join(root,'')
 
         experiment_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR')
-        if experiment_dir is None:
-            return None
-        experiment_dir = experiment_dir.replace(root, '')
-        if len(experiment_dir) == 0:
+        if experiment_dir is not None:
+            experiment_dir = experiment_dir.replace(root, '')
+        elif self.tree.findtext('Defaults/EXPT_DIR') is not None:
+            experiment_dir = self.tree.findtext('Defaults/EXPT_DIR')
+            _, experiment_dir = os.path.split(experiment_dir)
+        if experiment_dir is None or len(experiment_dir) == 0:
             return None
 
         dirnames = experiment_dir.split(os.path.sep)
@@ -229,12 +243,18 @@ def gerald(pathname):
     g.tree = ElementTree.parse(config_pathname).getroot()
 
     # parse Summary.htm file
-    summary_pathname = os.path.join(g.pathname, 'Summary.xml')
-    if os.path.exists(summary_pathname):
+    summary_xml = os.path.join(g.pathname, 'Summary.xml')
+    summary_htm = os.path.join(g.pathname, 'Summary.htm')
+    status_files_summary = os.path.join(g.pathname, '..', 'Data', 'Status_Files', 'Summary.htm')
+    if os.path.exists(summary_xml):
         LOGGER.info("Parsing Summary.xml")
-    else:
+        summary_pathname = summary_xml
+    elif os.path.exists(summary_htm):
         summary_pathname = os.path.join(g.pathname, 'Summary.htm')
         LOGGER.info("Parsing Summary.htm")
+    else:
+        summary_pathname = status_files_summary
+        LOGGER.info("Parsing %s" % (status_files_summary,))
     g.summary = Summary(summary_pathname)
     # parse eland files
     g.eland_results = eland(g.pathname, g)
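
For reference, the Summary lookup introduced by the hunk above can be sketched on its own. This is only a minimal illustration of the search order (Summary.xml, then Summary.htm, then the HiSeq-style Data/Status_Files/Summary.htm); the find_summary_file helper is hypothetical and not part of the patch.

import os
import logging

LOGGER = logging.getLogger(__name__)

def find_summary_file(gerald_pathname):
    """Return the first summary file found for a GERALD directory.

    Mirrors the search order above: Summary.xml, then Summary.htm,
    then the HiSeq-style ../Data/Status_Files/Summary.htm.
    """
    candidates = [
        os.path.join(gerald_pathname, 'Summary.xml'),
        os.path.join(gerald_pathname, 'Summary.htm'),
        os.path.join(gerald_pathname, '..', 'Data', 'Status_Files', 'Summary.htm'),
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            LOGGER.info("Parsing %s" % (candidate,))
            return candidate
    # like the patch, fall through to the Status_Files path even if it is
    # missing and let the Summary parser report the problem
    return candidates[-1]
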
index 3389c2d9ab8ad6f3ffc4b2378c0c8fcd7ff8139a..3bfb928d763f442faa4b4b852b8683efabaae93f 100644 (file)
@@ -47,6 +47,7 @@ class PipelineRun(object):
           self.pathname = None
         self._name = None
         self._flowcell_id = flowcell_id
+        self.datadir = None
         self.image_analysis = None
         self.bustard = None
         self.gerald = None
@@ -177,7 +178,7 @@ def get_runs(runfolder, flowcell_id=None):
     from htsworkflow.pipelines import bustard
     from htsworkflow.pipelines import gerald
 
-    def scan_post_image_analysis(runs, runfolder, image_analysis, pathname):
+    def scan_post_image_analysis(runs, runfolder, datadir, image_analysis, pathname):
         LOGGER.info("Looking for bustard directories in %s" % (pathname,))
         bustard_dirs = glob(os.path.join(pathname, "Bustard*"))
         # RTA BaseCalls looks enough like Bustard.
@@ -192,6 +193,21 @@ def get_runs(runfolder, flowcell_id=None):
                 try:
                     g = gerald.gerald(gerald_pathname)
                     p = PipelineRun(runfolder, flowcell_id)
+                    p.datadir = datadir
+                    p.image_analysis = image_analysis
+                    p.bustard = b
+                    p.gerald = g
+                    runs.append(p)
+                except IOError, e:
+                    LOGGER.error("Ignoring " + str(e))
+
+            aligned_glob = os.path.join(runfolder, 'Aligned*')
+            for aligned in glob(aligned_glob):
+                LOGGER.info("Found aligned directory %s" % (aligned,))
+                try:
+                    g = gerald.gerald(aligned)
+                    p = PipelineRun(runfolder, flowcell_id)
+                    p.datadir = datadir
                     p.image_analysis = image_analysis
                     p.bustard = b
                     p.gerald = g
@@ -228,7 +244,7 @@ def get_runs(runfolder, flowcell_id=None):
             )
         else:
             scan_post_image_analysis(
-                runs, runfolder, image_analysis, ipar_pathname
+                runs, runfolder, datadir, image_analysis, ipar_pathname
             )
 
     return runs
@@ -407,12 +423,17 @@ def save_flowcell_reports(data_dir, cycle_dir):
         os.chdir(cwd)
 
 
-def save_summary_file(gerald_object, cycle_dir):
+def save_summary_file(pipeline, cycle_dir):
     # Copy Summary.htm
-    summary_path = os.path.join(gerald_object.pathname, 'Summary.htm')
-    if os.path.exists(summary_path):
-        LOGGER.info('Copying %s to %s' % (summary_path, cycle_dir))
-        shutil.copy(summary_path, cycle_dir)
+    gerald_object = pipeline.gerald
+    gerald_summary = os.path.join(gerald_object.pathname, 'Summary.htm')
+    status_files_summary = os.path.join(pipeline.datadir, 'Status_Files', 'Summary.htm')
+    if os.path.exists(gerald_summary):
+        LOGGER.info('Copying %s to %s' % (gerald_summary, cycle_dir))
+        shutil.copy(gerald_summary, cycle_dir)
+    elif os.path.exists(status_files_summary):
+        LOGGER.info('Copying %s to %s' % (status_files_summary, cycle_dir))
+        shutil.copy(status_files_summary, cycle_dir)
     else:
-        LOGGER.info('Summary file %s was not found' % (summary_path,))
+        LOGGER.info('Summary file %s was not found' % (gerald_summary,))
 
@@ -551,13 +572,17 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r
       if site is not None:
         lanes = []
         for lane in range(1, 9):
-          if r.gerald.lanes[lane].analysis != 'none':
+          lane_parameters = r.gerald.lanes.get(lane, None)
+          if lane_parameters is not None and lane_parameters.analysis != 'none':
             lanes.append(lane)
 
         run_name = srf.pathname_to_run_name(r.pathname)
         seq_cmds = []
+        LOGGER.info("Raw Format is: %s" % (raw_format, ))
         if raw_format == 'fastq':
-            srf.copy_hiseq_project_fastqs(run_name, r.bustard.pathname, site, cycle_dir)
+            rawpath = os.path.join(r.pathname, r.gerald.runfolder_name)
+            LOGGER.info("raw data = %s" % (rawpath,))
+            srf.copy_hiseq_project_fastqs(run_name, rawpath, site, cycle_dir)
         elif raw_format == 'qseq':
             seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir)
         elif raw_format == 'srf':
@@ -571,7 +596,7 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r
       g = r.gerald
 
       # save summary file
-      save_summary_file(g, cycle_dir)
+      save_summary_file(r, cycle_dir)
 
       # compress eland result files
       compress_eland_results(g, cycle_dir, num_jobs)
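
As a rough companion to the extract_results hunk above, the guarded lane selection can be written as a small helper. This is only a sketch assuming a gerald object whose lanes mapping supports the get() method added in this commit; the active_lanes name is hypothetical.

def active_lanes(gerald_object):
    """Return lane numbers whose GERALD ANALYSIS setting is not 'none'.

    Relies on the dict-style get() added to the lanes mapping, so lanes
    missing from the GERALD config are skipped instead of raising KeyError.
    """
    lanes = []
    for lane in range(1, 9):
        lane_parameters = gerald_object.lanes.get(lane, None)
        if lane_parameters is not None and lane_parameters.analysis != 'none':
            lanes.append(lane)
    return lanes
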