If an IPAR or firecrest directory is missing some of the important
matrix files, that implies there isn't actually a valid run present.
This patch will then (hopefully) issue a warning and skip that
analysis run.
I also added an option to scripts/runfolder to allow a user to specify
where the extracted results should go.
One questionable thing: in one analysis, some of the lanes were run as
a sequence analysis rather than an eland analysis, so where I expected
every lane to have an eland genome, these lanes do not.
I hope that the code doesn't lose the lane index after serializing and
deserializing, as in that chunk example.
# should I parse this deeper than just stashing the
# contents of the matrix file?
matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
# should I parse this deeper than just stashing the
# contents of the matrix file?
matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
+ if not os.path.exists(matrix_pathname):
+ return None
f.matrix = open(matrix_pathname, 'r').read()
return f
f.matrix = open(matrix_pathname, 'r').read()
return f
if len(container.getchildren()) > LANES_PER_FLOWCELL:
raise RuntimeError('GERALD config.xml file changed')
lanes = [x.tag.split('_')[1] for x in container.getchildren()]
if len(container.getchildren()) > LANES_PER_FLOWCELL:
raise RuntimeError('GERALD config.xml file changed')
lanes = [x.tag.split('_')[1] for x in container.getchildren()]
- index = lanes.index(self._lane_id)
+ try:
+ index = lanes.index(self._lane_id)
+ except ValueError, e:
+ return None
element = container[index]
return element.text
def _get_analysis(self):
element = container[index]
return element.text
def _get_analysis(self):
# contents of the matrix file?
matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
# contents of the matrix file?
matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
+ if not os.path.exists(matrix_pathname):
+ return None
i.matrix = open(matrix_pathname, 'r').read()
# look for parameter xml file
i.matrix = open(matrix_pathname, 'r').read()
# look for parameter xml file
print i.tiles.keys()
print j.tiles.keys()
print j.tiles.items()
print i.tiles.keys()
print j.tiles.keys()
print j.tiles.items()
- print j.file_list()
\ No newline at end of file
for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
logging.info('Found firecrest in ' + datadir)
image_analysis = firecrest.firecrest(firecrest_pathname)
for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
logging.info('Found firecrest in ' + datadir)
image_analysis = firecrest.firecrest(firecrest_pathname)
- scan_post_image_analysis(runs, runfolder, image_analysis, firecrest_pathname)
+ if image_analysis is None:
+ logging.warn(
+ "%s is an empty or invalid firecrest directory" % (firecrest_pathname,)
+ )
+ else:
+ scan_post_image_analysis(
+ runs, runfolder, image_analysis, firecrest_pathname
+ )
# scan for IPAR directories
for ipar_pathname in glob(os.path.join(datadir,"IPAR_*")):
logging.info('Found ipar directories in ' + datadir)
image_analysis = ipar.ipar(ipar_pathname)
# scan for IPAR directories
for ipar_pathname in glob(os.path.join(datadir,"IPAR_*")):
logging.info('Found ipar directories in ' + datadir)
image_analysis = ipar.ipar(ipar_pathname)
- scan_post_image_analysis(runs, runfolder, image_analysis, ipar_pathname)
+ if image_analysis is None:
+ logging.warn(
+ "%s is an empty or invalid IPAR directory" %(ipar_pathname,)
+ )
+ else:
+ scan_post_image_analysis(
+ runs, runfolder, image_analysis, ipar_pathname
+ )
that contains the per lane post filter cluster numbers and the mapped
read counts. (The report isn't currently very pretty)
"""
that contains the per lane post filter cluster numbers and the mapped
read counts. (The report isn't currently very pretty)
"""
import logging
import optparse
import os
import logging
import optparse
import os
parser.add_option('--extract-results', action='store_true',
default=False,
help='extract result files out of runfolder into a simpler archive')
parser.add_option('--extract-results', action='store_true',
default=False,
help='extract result files out of runfolder into a simpler archive')
+ parser.add_option('-o', '--output-dir', default=None,
+ help="specify the default output directory for extract results")
+
parser.add_option('--run-xml', dest='run_xml',
default=None,
help='specify a run_<FlowCell>.xml file for summary reports')
parser.add_option('--run-xml', dest='run_xml',
default=None,
help='specify a run_<FlowCell>.xml file for summary reports')
root_log = logging.getLogger()
root_log.setLevel(logging.INFO)
root_log = logging.getLogger()
root_log.setLevel(logging.INFO)
+ logging.info('Starting htsworkflow illumina runfolder processing tool.')
runs = []
if opt.run_xml:
# handle ~ shortcut
opt.run_xml = os.path.expanduser(opt.run_xml)
tree = ElementTree.parse(opt.run_xml).getroot()
runs.append(runfolder.PipelineRun(xml=tree))
runs = []
if opt.run_xml:
# handle ~ shortcut
opt.run_xml = os.path.expanduser(opt.run_xml)
tree = ElementTree.parse(opt.run_xml).getroot()
runs.append(runfolder.PipelineRun(xml=tree))
- for run_dir in args:
- runs.extend(runfolder.get_runs(run_dir))
+ for run_pattern in args:
+ # expand args on our own if needed
+ for run_dir in glob(run_pattern):
+ runs.extend(runfolder.get_runs(run_dir))
if len(runs) > 0:
if opt.summary:
if len(runs) > 0:
if opt.summary:
if opt.archive:
runfolder.extract_run_parameters(runs)
if opt.extract_results:
if opt.archive:
runfolder.extract_run_parameters(runs)
if opt.extract_results:
- runfolder.extract_results(runs)
+ runfolder.extract_results(runs, opt.output_dir)