Minimal changes needed to get raw data archived for loxcyc.
author Diane Trout <diane@caltech.edu>
Thu, 10 May 2012 01:45:05 +0000 (18:45 -0700)
committer Diane Trout <diane@caltech.edu>
Thu, 10 May 2012 01:45:05 +0000 (18:45 -0700)
It's probably not properly counting how many reads there are.

htsworkflow/pipelines/gerald.py
htsworkflow/pipelines/runfolder.py

index 06fc94b5433cfb00a131a8aee5e6ab78684e0cc4..9d32a5bfd741739edb4d685c7828180f69ed798d 100644 (file)
@@ -4,6 +4,7 @@ Provide access to information stored in the GERALD directory.
 from datetime import datetime, date
 import logging
 import os
+import stat
 import time
 
 from htsworkflow.pipelines.summary import Summary
@@ -91,6 +92,8 @@ class Gerald(object):
             self._lanes = {}
             tree = self._gerald.tree
             analysis = tree.find('LaneSpecificRunParameters/ANALYSIS')
+            if analysis is None:
+                return
             # according to the pipeline specs I think their fields
             # are sampleName_laneID, with sampleName defaulting to s
             # since laneIDs are constant lets just try using
@@ -104,6 +107,10 @@ class Gerald(object):
             if self._lane is None:
                 self._initalize_lanes()
             return self._lanes[key]
+        def get(self, key, default=None):
+            if self._lane is None:
+                self._initalize_lanes()
+            return self._lanes.get(key, default)
         def keys(self):
             if self._lane is None:
                 self._initalize_lanes()
@@ -138,8 +145,13 @@ class Gerald(object):
         if self.tree is None:
             return datetime.today()
         timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP')
-        epochstamp = time.mktime(time.strptime(timestamp, '%c'))
-        return datetime.fromtimestamp(epochstamp)
+        if timestamp is not None:
+            epochstamp = time.mktime(time.strptime(timestamp, '%c'))
+            return datetime.fromtimestamp(epochstamp)
+        if self.pathname is not None:
+            epochstamp = os.stat(self.pathname)[stat.ST_MTIME]
+            return datetime.fromtimestamp(epochstamp)
+        return datetime.today()
     date = property(_get_date)
 
     def _get_time(self):
@@ -162,10 +174,12 @@ class Gerald(object):
             root = os.path.join(root,'')
 
         experiment_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR')
-        if experiment_dir is None:
-            return None
-        experiment_dir = experiment_dir.replace(root, '')
-        if len(experiment_dir) == 0:
+        if experiment_dir is not None:
+            experiment_dir = experiment_dir.replace(root, '')
+        elif self.tree.findtext('Defaults/EXPT_DIR') is not None:
+            experiment_dir = self.tree.findtext('Defaults/EXPT_DIR')
+            _, experiment_dir = os.path.split(experiment_dir)
+        if experiment_dir is None or len(experiment_dir) == 0:
             return None
 
         dirnames = experiment_dir.split(os.path.sep)
@@ -229,12 +243,18 @@ def gerald(pathname):
     g.tree = ElementTree.parse(config_pathname).getroot()
 
     # parse Summary.htm file
-    summary_pathname = os.path.join(g.pathname, 'Summary.xml')
-    if os.path.exists(summary_pathname):
+    summary_xml = os.path.join(g.pathname, 'Summary.xml')
+    summary_htm = os.path.join(g.pathname, 'Summary.htm')
+    status_files_summary = os.path.join(g.pathname, '..', 'Data', 'Status_Files', 'Summary.htm')
+    if os.path.exists(summary_xml):
         LOGGER.info("Parsing Summary.xml")
-    else:
+        summary_pathname = summary_xml
+    elif os.path.exists(summary_htm):
         summary_pathname = os.path.join(g.pathname, 'Summary.htm')
         LOGGER.info("Parsing Summary.htm")
+    else:
+        summary_pathname = status_files_summary
+        LOGGER.info("Parsing %s" % (status_files_summary,))
     g.summary = Summary(summary_pathname)
     # parse eland files
     g.eland_results = eland(g.pathname, g)
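
For reference, the Summary lookup introduced by the hunk above can be sketched on its own. This is only a minimal illustration of the search order (Summary.xml, then Summary.htm, then the HiSeq-style Data/Status_Files/Summary.htm); the find_summary_file helper is hypothetical and not part of the patch.

import os
import logging

LOGGER = logging.getLogger(__name__)

def find_summary_file(gerald_pathname):
    """Return the first summary file found for a GERALD directory.

    Mirrors the search order above: Summary.xml, then Summary.htm,
    then the HiSeq-style ../Data/Status_Files/Summary.htm.
    """
    candidates = [
        os.path.join(gerald_pathname, 'Summary.xml'),
        os.path.join(gerald_pathname, 'Summary.htm'),
        os.path.join(gerald_pathname, '..', 'Data', 'Status_Files', 'Summary.htm'),
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            LOGGER.info("Parsing %s" % (candidate,))
            return candidate
    # like the patch, fall through to the Status_Files path even if it is
    # missing and let the Summary parser report the problem
    return candidates[-1]
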
index 3389c2d9ab8ad6f3ffc4b2378c0c8fcd7ff8139a..3bfb928d763f442faa4b4b852b8683efabaae93f 100644 (file)
@@ -47,6 +47,7 @@ class PipelineRun(object):
           self.pathname = None
         self._name = None
         self._flowcell_id = flowcell_id
+        self.datadir = None
         self.image_analysis = None
         self.bustard = None
         self.gerald = None
@@ -177,7 +178,7 @@ def get_runs(runfolder, flowcell_id=None):
     from htsworkflow.pipelines import bustard
     from htsworkflow.pipelines import gerald
 
-    def scan_post_image_analysis(runs, runfolder, image_analysis, pathname):
+    def scan_post_image_analysis(runs, runfolder, datadir, image_analysis, pathname):
         LOGGER.info("Looking for bustard directories in %s" % (pathname,))
         bustard_dirs = glob(os.path.join(pathname, "Bustard*"))
         # RTA BaseCalls looks enough like Bustard.
@@ -192,6 +193,21 @@ def get_runs(runfolder, flowcell_id=None):
                 try:
                     g = gerald.gerald(gerald_pathname)
                     p = PipelineRun(runfolder, flowcell_id)
+                    p.datadir = datadir
+                    p.image_analysis = image_analysis
+                    p.bustard = b
+                    p.gerald = g
+                    runs.append(p)
+                except IOError, e:
+                    LOGGER.error("Ignoring " + str(e))
+
+            aligned_glob = os.path.join(runfolder, 'Aligned*')
+            for aligned in glob(aligned_glob):
+                LOGGER.info("Found aligned directory %s" % (aligned,))
+                try:
+                    g = gerald.gerald(aligned)
+                    p = PipelineRun(runfolder, flowcell_id)
+                    p.datadir = datadir
                     p.image_analysis = image_analysis
                     p.bustard = b
                     p.gerald = g
@@ -228,7 +244,7 @@ def get_runs(runfolder, flowcell_id=None):
             )
         else:
             scan_post_image_analysis(
-                runs, runfolder, image_analysis, ipar_pathname
+                runs, runfolder, datadir, image_analysis, ipar_pathname
             )
 
     return runs
@@ -407,12 +423,17 @@ def save_flowcell_reports(data_dir, cycle_dir):
         os.chdir(cwd)
 
 
-def save_summary_file(gerald_object, cycle_dir):
+def save_summary_file(pipeline, cycle_dir):
     # Copy Summary.htm
-    summary_path = os.path.join(gerald_object.pathname, 'Summary.htm')
-    if os.path.exists(summary_path):
-        LOGGER.info('Copying %s to %s' % (summary_path, cycle_dir))
-        shutil.copy(summary_path, cycle_dir)
+    gerald_object = pipeline.gerald
+    gerald_summary = os.path.join(gerald_object.pathname, 'Summary.htm')
+    status_files_summary = os.path.join(pipeline.datadir, 'Status_Files', 'Summary.htm')
+    if os.path.exists(gerald_summary):
+        LOGGER.info('Copying %s to %s' % (gerald_summary, cycle_dir))
+        shutil.copy(gerald_summary, cycle_dir)
+    elif os.path.exists(status_files_summary):
+        LOGGER.info('Copying %s to %s' % (status_files_summary, cycle_dir))
+        shutil.copy(status_files_summary, cycle_dir)
     else:
-        LOGGER.info('Summary file %s was not found' % (summary_path,))
+        LOGGER.info('Summary file %s was not found' % (gerald_summary,))
 
@@ -551,13 +572,17 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r
       if site is not None:
         lanes = []
         for lane in range(1, 9):
-          if r.gerald.lanes[lane].analysis != 'none':
+          lane_parameters = r.gerald.lanes.get(lane, None)
+          if lane_parameters is not None and lane_parameters.analysis != 'none':
             lanes.append(lane)
 
         run_name = srf.pathname_to_run_name(r.pathname)
         seq_cmds = []
+        LOGGER.info("Raw Format is: %s" % (raw_format, ))
         if raw_format == 'fastq':
-            srf.copy_hiseq_project_fastqs(run_name, r.bustard.pathname, site, cycle_dir)
+            rawpath = os.path.join(r.pathname, r.gerald.runfolder_name)
+            LOGGER.info("raw data = %s" % (rawpath,))
+            srf.copy_hiseq_project_fastqs(run_name, rawpath, site, cycle_dir)
         elif raw_format == 'qseq':
             seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir)
         elif raw_format == 'srf':
@@ -571,7 +596,7 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r
       g = r.gerald
 
       # save summary file
-      save_summary_file(g, cycle_dir)
+      save_summary_file(r, cycle_dir)
 
       # compress eland result files
       compress_eland_results(g, cycle_dir, num_jobs)
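
As a rough companion to the extract_results hunk above, the guarded lane selection can be written as a small helper. This is only a sketch assuming a gerald object whose lanes mapping supports the get() method added in this commit; the active_lanes name is hypothetical.

def active_lanes(gerald_object):
    """Return lane numbers whose GERALD ANALYSIS setting is not 'none'.

    Relies on the dict-style get() added to the lanes mapping, so lanes
    missing from the GERALD config are skipped instead of raising KeyError.
    """
    lanes = []
    for lane in range(1, 9):
        lane_parameters = gerald_object.lanes.get(lane, None)
        if lane_parameters is not None and lane_parameters.analysis != 'none':
            lanes.append(lane)
    return lanes
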