From: Diane Trout Date: Thu, 10 May 2012 01:45:05 +0000 (-0700) Subject: Minimal changes needed to get raw data archived for loxcyc. X-Git-Tag: v0.5.5~28^2 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=ea15819e29b32c4dce7231aa3a3ce5dddae39c2d Minimal changes needed to get raw data archived for loxcyc. Its probably not properly counting how many reads there are. --- diff --git a/htsworkflow/pipelines/gerald.py b/htsworkflow/pipelines/gerald.py index 06fc94b..9d32a5b 100644 --- a/htsworkflow/pipelines/gerald.py +++ b/htsworkflow/pipelines/gerald.py @@ -4,6 +4,7 @@ Provide access to information stored in the GERALD directory. from datetime import datetime, date import logging import os +import stat import time from htsworkflow.pipelines.summary import Summary @@ -91,6 +92,8 @@ class Gerald(object): self._lanes = {} tree = self._gerald.tree analysis = tree.find('LaneSpecificRunParameters/ANALYSIS') + if analysis is None: + return # according to the pipeline specs I think their fields # are sampleName_laneID, with sampleName defaulting to s # since laneIDs are constant lets just try using @@ -104,6 +107,10 @@ class Gerald(object): if self._lane is None: self._initalize_lanes() return self._lanes[key] + def get(self, key, default): + if self._lane is None: + self._initalize_lanes() + return self._lanes.get(key, None) def keys(self): if self._lane is None: self._initalize_lanes() @@ -138,8 +145,13 @@ class Gerald(object): if self.tree is None: return datetime.today() timestamp = self.tree.findtext('ChipWideRunParameters/TIME_STAMP') - epochstamp = time.mktime(time.strptime(timestamp, '%c')) - return datetime.fromtimestamp(epochstamp) + if timestamp is not None: + epochstamp = time.mktime(time.strptime(timestamp, '%c')) + return datetime.fromtimestamp(epochstamp) + if self.pathname is not None: + epochstamp = os.stat(self.pathname)[stat.ST_MTIME] + return datetime.fromtimestamp(epochstamp) + return datetime.today() date = property(_get_date) def _get_time(self): @@ -162,10 +174,12 @@ class Gerald(object): root = os.path.join(root,'') experiment_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR') - if experiment_dir is None: - return None - experiment_dir = experiment_dir.replace(root, '') - if len(experiment_dir) == 0: + if experiment_dir is not None: + experiment_dir = experiment_dir.replace(root, '') + experiment_dir = self.tree.findtext('Defaults/EXPT_DIR') + if experiment_dir is not None: + _, experiment_dir = os.path.split(experiment_dir) + if experiment_dir is None or len(experiment_dir) == 0: return None dirnames = experiment_dir.split(os.path.sep) @@ -229,12 +243,18 @@ def gerald(pathname): g.tree = ElementTree.parse(config_pathname).getroot() # parse Summary.htm file - summary_pathname = os.path.join(g.pathname, 'Summary.xml') - if os.path.exists(summary_pathname): + summary_xml = os.path.join(g.pathname, 'Summary.xml') + summary_htm = os.path.join(g.pathname, 'Summary.htm') + status_files_summary = os.path.join(g.pathname, '..', 'Data', 'Status_Files', 'Summary.htm') + if os.path.exists(summary_xml): LOGGER.info("Parsing Summary.xml") - else: + summary_pathname = summary_xml + elif os.path.exists(summary_htm): summary_pathname = os.path.join(g.pathname, 'Summary.htm') LOGGER.info("Parsing Summary.htm") + else: + summary_pathname = status_files_summary + LOGGER.info("Parsing %s" % (status_files_summary,)) g.summary = Summary(summary_pathname) # parse eland files g.eland_results = eland(g.pathname, g) diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py index 3389c2d..3bfb928 100644 --- a/htsworkflow/pipelines/runfolder.py +++ b/htsworkflow/pipelines/runfolder.py @@ -47,6 +47,7 @@ class PipelineRun(object): self.pathname = None self._name = None self._flowcell_id = flowcell_id + self.datadir = None self.image_analysis = None self.bustard = None self.gerald = None @@ -177,7 +178,7 @@ def get_runs(runfolder, flowcell_id=None): from htsworkflow.pipelines import bustard from htsworkflow.pipelines import gerald - def scan_post_image_analysis(runs, runfolder, image_analysis, pathname): + def scan_post_image_analysis(runs, runfolder, datadir, image_analysis, pathname): LOGGER.info("Looking for bustard directories in %s" % (pathname,)) bustard_dirs = glob(os.path.join(pathname, "Bustard*")) # RTA BaseCalls looks enough like Bustard. @@ -192,6 +193,21 @@ def get_runs(runfolder, flowcell_id=None): try: g = gerald.gerald(gerald_pathname) p = PipelineRun(runfolder, flowcell_id) + p.datadir = datadir + p.image_analysis = image_analysis + p.bustard = b + p.gerald = g + runs.append(p) + except IOError, e: + LOGGER.error("Ignoring " + str(e)) + + aligned_glob = os.path.join(runfolder, 'Aligned*') + for aligned in glob(aligned_glob): + LOGGER.info("Found aligned directory %s" % (aligned,)) + try: + g = gerald.gerald(aligned) + p = PipelineRun(runfolder, flowcell_id) + p.datadir = datadir p.image_analysis = image_analysis p.bustard = b p.gerald = g @@ -228,7 +244,7 @@ def get_runs(runfolder, flowcell_id=None): ) else: scan_post_image_analysis( - runs, runfolder, image_analysis, ipar_pathname + runs, runfolder, datadir, image_analysis, ipar_pathname ) return runs @@ -407,12 +423,17 @@ def save_flowcell_reports(data_dir, cycle_dir): os.chdir(cwd) -def save_summary_file(gerald_object, cycle_dir): +def save_summary_file(pipeline, cycle_dir): # Copy Summary.htm - summary_path = os.path.join(gerald_object.pathname, 'Summary.htm') - if os.path.exists(summary_path): - LOGGER.info('Copying %s to %s' % (summary_path, cycle_dir)) - shutil.copy(summary_path, cycle_dir) + gerald_object = pipeline.gerald + gerald_summary = os.path.join(gerald_object.pathname, 'Summary.htm') + status_files_summary = os.path.join(pipeline.datadir, 'Status_Files', 'Summary.htm') + if os.path.exists(gerald_summary): + LOGGER.info('Copying %s to %s' % (gerald_summary, cycle_dir)) + shutil.copy(gerald_summary, cycle_dir) + elif os.path.exists(status_files_summary): + LOGGER.info('Copying %s to %s' % (status_files_summary, cycle_dir)) + shutil.copy(status_files_summary, cycle_dir) else: LOGGER.info('Summary file %s was not found' % (summary_path,)) @@ -551,13 +572,17 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r if site is not None: lanes = [] for lane in range(1, 9): - if r.gerald.lanes[lane].analysis != 'none': + lane_parameters = r.gerald.lanes.get(lane, None) + if lane_parameters is not None and lane_parameters.analysis != 'none': lanes.append(lane) run_name = srf.pathname_to_run_name(r.pathname) seq_cmds = [] + LOGGER.info("Raw Format is: %s" % (raw_format, )) if raw_format == 'fastq': - srf.copy_hiseq_project_fastqs(run_name, r.bustard.pathname, site, cycle_dir) + rawpath = os.path.join(r.pathname, r.gerald.runfolder_name) + LOGGER.info("raw data = %s" % (rawpath,)) + srf.copy_hiseq_project_fastqs(run_name, rawpath, site, cycle_dir) elif raw_format == 'qseq': seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir) elif raw_format == 'srf': @@ -571,7 +596,7 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r g = r.gerald # save summary file - save_summary_file(g, cycle_dir) + save_summary_file(r, cycle_dir) # compress eland result files compress_eland_results(g, cycle_dir, num_jobs)