From: Diane Trout <diane@caltech.edu>
Date: Fri, 30 Jan 2009 02:15:57 +0000 (+0000)
Subject: Try to make runfolder results extraction more robust
X-Git-Tag: 0.2.0.1~30
X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=3d3e70510cff24f388b939f6805add3ac3782dc5

Try to make runfolder results extraction more robust
If an IPAR or firecrest directory is missing some of the important
matrix files, that implies there isn't actually a valid run present.
In that case this patch will (hopefully) issue a warning and skip that
analysis run.

I also added an option to scripts/runfolder to allow a user to specify
where the extracted results should go.

One questionable thing is that for one analysis, some of the lanes
were run as sequence and not as an eland analysis, so where I expected
all the lanes to have an eland genome, these lanes don't have one.
I hope that the code doesn't lose the index after serializing and
deserializing that chunk example.
---

diff --git a/htsworkflow/pipelines/firecrest.py b/htsworkflow/pipelines/firecrest.py
index ee6fded..4fbde5d 100644
--- a/htsworkflow/pipelines/firecrest.py
+++ b/htsworkflow/pipelines/firecrest.py
@@ -115,6 +115,8 @@ def firecrest(pathname):
     # should I parse this deeper than just stashing the 
     # contents of the matrix file?
     matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
+    if not os.path.exists(matrix_pathname):
+        return None
     f.matrix = open(matrix_pathname, 'r').read()
     return f
 
diff --git a/htsworkflow/pipelines/gerald.py b/htsworkflow/pipelines/gerald.py
index a5dd323..cbc5fcb 100644
--- a/htsworkflow/pipelines/gerald.py
+++ b/htsworkflow/pipelines/gerald.py
@@ -41,7 +41,10 @@ class Gerald(object):
             if len(container.getchildren()) > LANES_PER_FLOWCELL:
                 raise RuntimeError('GERALD config.xml file changed')
             lanes = [x.tag.split('_')[1] for x in container.getchildren()]
-            index = lanes.index(self._lane_id)
+            try:
+                index = lanes.index(self._lane_id)
+            except ValueError, e:
+                return None
             element = container[index]
             return element.text
         def _get_analysis(self):
diff --git a/htsworkflow/pipelines/ipar.py b/htsworkflow/pipelines/ipar.py
index a559229..3d90868 100644
--- a/htsworkflow/pipelines/ipar.py
+++ b/htsworkflow/pipelines/ipar.py
@@ -193,6 +193,8 @@ def ipar(pathname):
 
     # contents of the matrix file?
     matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
+    if not os.path.exists(matrix_pathname):
+        return None
     i.matrix = open(matrix_pathname, 'r').read()
 
     # look for parameter xml file
@@ -222,4 +224,4 @@ if __name__ == "__main__":
   print i.tiles.keys()
   print j.tiles.keys()
   print j.tiles.items()
-  print j.file_list()
\ No newline at end of file
+  print j.file_list()
diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py
index f327b78..10eeee1 100644
--- a/htsworkflow/pipelines/runfolder.py
+++ b/htsworkflow/pipelines/runfolder.py
@@ -178,12 +178,26 @@ def get_runs(runfolder):
     for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
         logging.info('Found firecrest in ' + datadir)
         image_analysis = firecrest.firecrest(firecrest_pathname)
-        scan_post_image_analysis(runs, runfolder, image_analysis, firecrest_pathname)
+        if image_analysis is None:
+	    logging.warn(
+                "%s is an empty or invalid firecrest directory" % (firecrest_pathname,)
+            )
+	else:
+            scan_post_image_analysis(
+                runs, runfolder, image_analysis, firecrest_pathname
+            )
     # scan for IPAR directories
     for ipar_pathname in glob(os.path.join(datadir,"IPAR_*")):
         logging.info('Found ipar directories in ' + datadir)
         image_analysis = ipar.ipar(ipar_pathname)
-        scan_post_image_analysis(runs, runfolder, image_analysis, ipar_pathname)
+        if image_analysis is None:
+	    logging.warn(
+                "%s is an empty or invalid IPAR directory" %(ipar_pathname,)
+            )
+	else:
+            scan_post_image_analysis(
+                runs, runfolder, image_analysis, ipar_pathname
+            )
 
     return runs
 
diff --git a/scripts/runfolder b/scripts/runfolder
index 0abbfe1..bf5c5d8 100644
--- a/scripts/runfolder
+++ b/scripts/runfolder
@@ -27,6 +27,7 @@ runfolder.py can also spit out a simple summary report (-s option)
 that contains the per lane post filter cluster numbers and the mapped 
 read counts. (The report isn't currently very pretty)
 """
+from glob import glob
 import logging
 import optparse
 import os
@@ -50,9 +51,13 @@ def make_parser():
     parser.add_option('--extract-results', action='store_true',
            default=False,
            help='extract result files out of runfolder into a simpler archive')
+    parser.add_option('-o', '--output-dir', default=None,
+           help="specify the default output directory for extract results")
+
     parser.add_option('--run-xml', dest='run_xml',
            default=None,
            help='specify a run_<FlowCell>.xml file for summary reports')
+    
 
     return parser
 
@@ -65,14 +70,17 @@ def main(cmdlist=None):
         root_log = logging.getLogger()
         root_log.setLevel(logging.INFO)
 
+    logging.info('Starting htsworkflow illumina runfolder processing tool.')
     runs = []
     if opt.run_xml:
         # handle ~ shortcut
         opt.run_xml = os.path.expanduser(opt.run_xml)
         tree = ElementTree.parse(opt.run_xml).getroot()
         runs.append(runfolder.PipelineRun(xml=tree))
-    for run_dir in args:
-        runs.extend(runfolder.get_runs(run_dir))
+    for run_pattern in args:
+        # expand args on our own if needed
+        for run_dir in glob(run_pattern):
+            runs.extend(runfolder.get_runs(run_dir))
 
     if len(runs) > 0:
         if opt.summary:
@@ -80,7 +88,7 @@ def main(cmdlist=None):
         if opt.archive:
             runfolder.extract_run_parameters(runs)
         if opt.extract_results:
-            runfolder.extract_results(runs)
+            runfolder.extract_results(runs, opt.output_dir)
 
     return 0