From: Diane Trout Date: Fri, 30 Jan 2009 02:15:57 +0000 (+0000) Subject: Try to make runfolder results extraction more robust X-Git-Tag: 0.2.0.1~30 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=3d3e70510cff24f388b939f6805add3ac3782dc5 Try to make runfolder results extraction more robust If an IPAR or firecrest directory is missing some of the important matrix files, that implies there isn't actually a valid run present; this patch will then (hopefully) issue a warning and skip that analysis run. I also added an option to scripts/runfolder to allow a user to specify where the extracted results should go. One questionable thing is that for one analysis some of the lanes were run as sequence and not an eland analysis, so where I expected all the lanes to have an eland genome, these lanes don't have one. I hope that the code doesn't lose the index after serializing and deserializing that chunk example. --- diff --git a/htsworkflow/pipelines/firecrest.py b/htsworkflow/pipelines/firecrest.py index ee6fded..4fbde5d 100644 --- a/htsworkflow/pipelines/firecrest.py +++ b/htsworkflow/pipelines/firecrest.py @@ -115,6 +115,8 @@ def firecrest(pathname): # should I parse this deeper than just stashing the # contents of the matrix file? 
matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt') + if not os.path.exists(matrix_pathname): + return None f.matrix = open(matrix_pathname, 'r').read() return f diff --git a/htsworkflow/pipelines/gerald.py b/htsworkflow/pipelines/gerald.py index a5dd323..cbc5fcb 100644 --- a/htsworkflow/pipelines/gerald.py +++ b/htsworkflow/pipelines/gerald.py @@ -41,7 +41,10 @@ class Gerald(object): if len(container.getchildren()) > LANES_PER_FLOWCELL: raise RuntimeError('GERALD config.xml file changed') lanes = [x.tag.split('_')[1] for x in container.getchildren()] - index = lanes.index(self._lane_id) + try: + index = lanes.index(self._lane_id) + except ValueError, e: + return None element = container[index] return element.text def _get_analysis(self): diff --git a/htsworkflow/pipelines/ipar.py b/htsworkflow/pipelines/ipar.py index a559229..3d90868 100644 --- a/htsworkflow/pipelines/ipar.py +++ b/htsworkflow/pipelines/ipar.py @@ -193,6 +193,8 @@ def ipar(pathname): # contents of the matrix file? 
matrix_pathname = os.path.join(pathname, 'Matrix', 's_matrix.txt') + if not os.path.exists(matrix_pathname): + return None i.matrix = open(matrix_pathname, 'r').read() # look for parameter xml file @@ -222,4 +224,4 @@ if __name__ == "__main__": print i.tiles.keys() print j.tiles.keys() print j.tiles.items() - print j.file_list() \ No newline at end of file + print j.file_list() diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py index f327b78..10eeee1 100644 --- a/htsworkflow/pipelines/runfolder.py +++ b/htsworkflow/pipelines/runfolder.py @@ -178,12 +178,26 @@ def get_runs(runfolder): for firecrest_pathname in glob(os.path.join(datadir,"*Firecrest*")): logging.info('Found firecrest in ' + datadir) image_analysis = firecrest.firecrest(firecrest_pathname) - scan_post_image_analysis(runs, runfolder, image_analysis, firecrest_pathname) + if image_analysis is None: + logging.warn( + "%s is an empty or invalid firecrest directory" % (firecrest_pathname,) + ) + else: + scan_post_image_analysis( + runs, runfolder, image_analysis, firecrest_pathname + ) # scan for IPAR directories for ipar_pathname in glob(os.path.join(datadir,"IPAR_*")): logging.info('Found ipar directories in ' + datadir) image_analysis = ipar.ipar(ipar_pathname) - scan_post_image_analysis(runs, runfolder, image_analysis, ipar_pathname) + if image_analysis is None: + logging.warn( + "%s is an empty or invalid IPAR directory" %(ipar_pathname,) + ) + else: + scan_post_image_analysis( + runs, runfolder, image_analysis, ipar_pathname + ) return runs diff --git a/scripts/runfolder b/scripts/runfolder index 0abbfe1..bf5c5d8 100644 --- a/scripts/runfolder +++ b/scripts/runfolder @@ -27,6 +27,7 @@ runfolder.py can also spit out a simple summary report (-s option) that contains the per lane post filter cluster numbers and the mapped read counts. 
(The report isn't currently very pretty) """ +from glob import glob import logging import optparse import os @@ -50,9 +51,13 @@ def make_parser(): parser.add_option('--extract-results', action='store_true', default=False, help='extract result files out of runfolder into a simpler archive') + parser.add_option('-o', '--output-dir', default=None, + help="specify the default output directory for extract results") + parser.add_option('--run-xml', dest='run_xml', default=None, help='specify a run_.xml file for summary reports') + return parser @@ -65,14 +70,17 @@ def main(cmdlist=None): root_log = logging.getLogger() root_log.setLevel(logging.INFO) + logging.info('Starting htsworkflow illumina runfolder processing tool.') runs = [] if opt.run_xml: # handle ~ shortcut opt.run_xml = os.path.expanduser(opt.run_xml) tree = ElementTree.parse(opt.run_xml).getroot() runs.append(runfolder.PipelineRun(xml=tree)) - for run_dir in args: - runs.extend(runfolder.get_runs(run_dir)) + for run_pattern in args: + # expand args on our own if needed + for run_dir in glob(run_pattern): + runs.extend(runfolder.get_runs(run_dir)) if len(runs) > 0: if opt.summary: @@ -80,7 +88,7 @@ def main(cmdlist=None): if opt.archive: runfolder.extract_run_parameters(runs) if opt.extract_results: - runfolder.extract_results(runs) + runfolder.extract_results(runs, opt.output_dir) return 0