Try to make runfolder results extraction more robust
author    Diane Trout <diane@caltech.edu>
          Fri, 30 Jan 2009 02:15:57 +0000 (02:15 +0000)
committer Diane Trout <diane@caltech.edu>
          Fri, 30 Jan 2009 02:15:57 +0000 (02:15 +0000)
If an IPAR or firecrest directory is missing some of the important
matrix files, that implies there isn't actually a valid run present;
this patch will then (hopefully) issue a warning and skip that
analysis run.
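
From a caller's point of view the change looks roughly like this
(a minimal sketch; the directory path below is made up):

    from htsworkflow.pipelines import firecrest

    # hypothetical Firecrest directory that lacks Matrix/s_matrix.txt
    pathname = 'Data/C1-36_Firecrest1.9.5_30-01-2009_diane'
    image_analysis = firecrest.firecrest(pathname)
    if image_analysis is None:
        # get_runs() now logs a warning and skips this analysis run
        # instead of failing part way through results extraction.
        print 'skipping invalid firecrest directory', pathname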

I also added an option to scripts/runfolder to allow a user to specify
where the extracted results should go.
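
For example, to extract results from a run folder into a separate
output directory (the flowcell directory name below is made up):

    scripts/runfolder --extract-results -o /tmp/FC12345_results \
        090130_HWI-EAS229_0001_FC12345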

One questionable thing is that for one analysis some of the lanes
were run as sequence only and not as an eland analysis, so where I
expected every lane to have an eland genome, these lanes don't.
I hope that the code doesn't lose the index after serializing
and deserializing that chunk, for example.
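
That situation is what the gerald.py change below guards against; a
sketch of the failure mode (the lane list here is hypothetical):

    lanes = ['1', '2', '3', '5', '6', '7', '8']   # lane 4 was sequence only
    lanes.index('4')   # raises ValueError; the getter now returns None instead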

htsworkflow/pipelines/firecrest.py
htsworkflow/pipelines/gerald.py
htsworkflow/pipelines/ipar.py
htsworkflow/pipelines/runfolder.py
scripts/runfolder

diff --git a/htsworkflow/pipelines/firecrest.py b/htsworkflow/pipelines/firecrest.py
index ee6fded6371c45b02c10c40d3c16df76923a52a1..4fbde5d517ac661cfecd68ff8ab2fff2472f5d9f 100644
--- a/htsworkflow/pipelines/firecrest.py
+++ b/htsworkflow/pipelines/firecrest.py
@@ -115,6 +115,8 @@ def firecrest(pathname):
     # should I parse this deeper than just stashing the 
     # contents of the matrix file?
     matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
+    if not os.path.exists(matrix_pathname):
+        return None
     f.matrix = open(matrix_pathname, 'r').read()
     return f
 
diff --git a/htsworkflow/pipelines/gerald.py b/htsworkflow/pipelines/gerald.py
index a5dd323861beb46549fc3b98be0f6af77eec6e84..cbc5fcb92d7fb26948f6137ab4f897ad749e129e 100644
--- a/htsworkflow/pipelines/gerald.py
+++ b/htsworkflow/pipelines/gerald.py
@@ -41,7 +41,10 @@ class Gerald(object):
             if len(container.getchildren()) > LANES_PER_FLOWCELL:
                 raise RuntimeError('GERALD config.xml file changed')
             lanes = [x.tag.split('_')[1] for x in container.getchildren()]
-            index = lanes.index(self._lane_id)
+            try:
+                index = lanes.index(self._lane_id)
+            except ValueError, e:
+                return None
             element = container[index]
             return element.text
         def _get_analysis(self):
diff --git a/htsworkflow/pipelines/ipar.py b/htsworkflow/pipelines/ipar.py
index a559229ee83957068c8c96dcc3f538980e4fc9b4..3d90868910a37d16d4b9b7f70a042294553f0485 100644
--- a/htsworkflow/pipelines/ipar.py
+++ b/htsworkflow/pipelines/ipar.py
@@ -193,6 +193,8 @@ def ipar(pathname):
 
     # contents of the matrix file?
     matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt')
+    if not os.path.exists(matrix_pathname):
+        return None
     i.matrix = open(matrix_pathname, 'r').read()
 
     # look for parameter xml file
@@ -222,4 +224,4 @@ if __name__ == "__main__":
   print i.tiles.keys()
   print j.tiles.keys()
   print j.tiles.items()
-  print j.file_list()
\ No newline at end of file
+  print j.file_list()
diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py
index f327b7891868034ca38c8327bdeed5ed2efdd135..10eeee1ccef5d1e7fda2c4100af01713d74910cd 100644
--- a/htsworkflow/pipelines/runfolder.py
+++ b/htsworkflow/pipelines/runfolder.py
@@ -178,12 +178,26 @@ def get_runs(runfolder):
     for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")):
         logging.info('Found firecrest in ' + datadir)
         image_analysis = firecrest.firecrest(firecrest_pathname)
-        scan_post_image_analysis(runs, runfolder, image_analysis, firecrest_pathname)
+        if image_analysis is None:
+            logging.warn(
+                "%s is an empty or invalid firecrest directory" % (firecrest_pathname,)
+            )
+        else:
+            scan_post_image_analysis(
+                runs, runfolder, image_analysis, firecrest_pathname
+            )
     # scan for IPAR directories
     for ipar_pathname in glob(os.path.join(datadir,"IPAR_*")):
         logging.info('Found ipar directories in ' + datadir)
         image_analysis = ipar.ipar(ipar_pathname)
-        scan_post_image_analysis(runs, runfolder, image_analysis, ipar_pathname)
+        if image_analysis is None:
+            logging.warn(
+                "%s is an empty or invalid IPAR directory" % (ipar_pathname,)
+            )
+        else:
+            scan_post_image_analysis(
+                runs, runfolder, image_analysis, ipar_pathname
+            )
 
     return runs
 
diff --git a/scripts/runfolder b/scripts/runfolder
index 0abbfe1c5128d1693c3443cebf8338d3023f9114..bf5c5d884d2c931a6b96040a477b6bc0577ab30f 100644
--- a/scripts/runfolder
+++ b/scripts/runfolder
@@ -27,6 +27,7 @@ runfolder.py can also spit out a simple summary report (-s option)
 that contains the per lane post filter cluster numbers and the mapped 
 read counts. (The report isn't currently very pretty)
 """
+from glob import glob
 import logging
 import optparse
 import os
@@ -50,9 +51,13 @@ def make_parser():
     parser.add_option('--extract-results', action='store_true',
            default=False,
            help='extract result files out of runfolder into a simpler archive')
+    parser.add_option('-o', '--output-dir', default=None,
+           help="specify the default output directory for extract results")
+
     parser.add_option('--run-xml', dest='run_xml',
            default=None,
            help='specify a run_<FlowCell>.xml file for summary reports')
+    
 
     return parser
 
@@ -65,14 +70,17 @@ def main(cmdlist=None):
         root_log = logging.getLogger()
         root_log.setLevel(logging.INFO)
 
+    logging.info('Starting htsworkflow illumina runfolder processing tool.')
     runs = []
     if opt.run_xml:
         # handle ~ shortcut
         opt.run_xml = os.path.expanduser(opt.run_xml)
         tree = ElementTree.parse(opt.run_xml).getroot()
         runs.append(runfolder.PipelineRun(xml=tree))
-    for run_dir in args:
-        runs.extend(runfolder.get_runs(run_dir))
+    for run_pattern in args:
+        # expand args on our own if needed
+        for run_dir in glob(run_pattern):
+            runs.extend(runfolder.get_runs(run_dir))
 
     if len(runs) > 0:
         if opt.summary:
@@ -80,7 +88,7 @@ def main(cmdlist=None):
         if opt.archive:
             runfolder.extract_run_parameters(runs)
         if opt.extract_results:
-            runfolder.extract_results(runs)
+            runfolder.extract_results(runs, opt.output_dir)
 
     return 0