From: Diane Trout Date: Tue, 15 May 2012 22:10:08 +0000 (-0700) Subject: Fix bugs introduced by the improved HiSeq runfolder scanning. X-Git-Tag: v0.5.5~27 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=ee50e59d11da2c708ca2c31831f795b147ad35b6 Fix bugs introduced by the improved HiSeq runfolder scanning. This makes more progress toward analyzing a HiSeq runfolder, but is currently lacking the ability to process the Aligned reads. This does seem to now process the base call processing information out of the Unaligned tree of hiseq runs. Also all the previous runfolder versions tests pass again. (Some of my introduced logic was a bit off for guessing what type of runfolder it is.) --- diff --git a/htsworkflow/pipelines/bustard.py b/htsworkflow/pipelines/bustard.py index 5ebde39..acdc60e 100644 --- a/htsworkflow/pipelines/bustard.py +++ b/htsworkflow/pipelines/bustard.py @@ -219,6 +219,25 @@ class Bustard(object): if xml is not None: self.set_elements(xml) + def update_attributes_from_pathname(self): + """Update version, date, user from bustard directory names + Obviously this wont work for BaseCalls or later runfolders + """ + if self.pathname is None: + raise ValueError( + "Set pathname before calling update_attributes_from_pathname") + path, name = os.path.split(self.pathname) + + if not re.match('bustard', name, re.IGNORECASE): + return + + groups = name.split("_") + version = re.search(VERSION_RE, groups[0]) + self.version = version.group(1) + t = time.strptime(groups[1], EUROPEAN_STRPTIME) + self.date = date(*t[0:3]) + self.user = groups[2] + def _get_time(self): if self.date is None: return None @@ -290,47 +309,92 @@ def bustard(pathname): :Return: Fully initialized Bustard object.
""" - b = Bustard() pathname = os.path.abspath(pathname) + bustard_filename = os.path.join(pathname, 'config.xml') + demultiplexed_filename = os.path.join(pathname, + 'DemultiplexedBustardConfig.xml') + + if os.path.exists(demultiplexed_filename): + b = bustard_from_hiseq(pathname, demultiplexed_filename) + elif os.path.exists(bustard_filename): + b = bustard_from_ga2(pathname, bustard_filename) + else: + b = bustard_from_ga1(pathname) + + return b + +def bustard_from_ga1(pathname): + """Initialize bustard class from ga1 era runfolders. + """ path, name = os.path.split(pathname) + groups = name.split("_") - if groups[0].lower().startswith('bustard'): - version = re.search(VERSION_RE, groups[0]) - b.version = version.group(1) - t = time.strptime(groups[1], EUROPEAN_STRPTIME) - b.date = date(*t[0:3]) - b.user = groups[2] - elif groups[0] == 'BaseCalls': - # stub values - b.version = None - b.date = None - b.user = None + if len(groups) < 3: + msg = "Not enough information to create attributes"\ + " from directory name: %s" + LOGGER.error(msg % (self.pathname,)) + return None + b = Bustard() b.pathname = pathname - bustard_config_filename = os.path.join(pathname, 'config.xml') - paramfiles = glob(os.path.join(pathname, "params?.xml")) - for paramfile in paramfiles: - phasing = Phasing(paramfile) - assert (phasing.lane >= 1 and phasing.lane <= 8) - b.phasing[phasing.lane] = phasing + b.update_attributes_from_pathname() + version = re.search(VERSION_RE, groups[0]) + b.version = version.group(1) + t = time.strptime(groups[1], EUROPEAN_STRPTIME) + b.date = date(*t[0:3]) + b.user = groups[2] + # I only found these in Bustard1.9.5/1.9.6 directories if b.version in ('1.9.5', '1.9.6'): # at least for our runfolders for 1.9.5 and 1.9.6 matrix[1-8].txt are always the same crosstalk_file = os.path.join(pathname, "matrix1.txt") b.crosstalk = CrosstalkMatrix(crosstalk_file) + + add_phasing(b) + return b + + +def bustard_from_ga2(pathname, config_filename): + """Initialize bustard 
class from ga2-era runfolder + Actually I don't quite remember if it is exactly the GA2s, but + its after the original runfolder style and before the HiSeq. + """ # for version 1.3.2 of the pipeline the bustard version number went down # to match the rest of the pipeline. However there's now a nifty # new (useful) bustard config file. - elif os.path.exists(bustard_config_filename): - bustard_config_root = ElementTree.parse(bustard_config_filename) - b.bustard_config = bustard_config_root.getroot() - b.crosstalk = crosstalk_matrix_from_bustard_config(b.pathname, b.bustard_config) - software = bustard_config_root.find('*/Software') - b.version = software.attrib['Version'] - #b.version = software.attrib['Name'] + "-" + software.attrib['Version'] + + # stub values + b = Bustard() + b.pathname = pathname + b.update_attributes_from_pathname() + bustard_config_root = ElementTree.parse(config_filename) + b.bustard_config = bustard_config_root.getroot() + b.crosstalk = crosstalk_matrix_from_bustard_config(b.pathname, + b.bustard_config) + software = bustard_config_root.find('*/Software') + b.version = software.attrib['Version'] + add_phasing(b) return b +def bustard_from_hiseq(pathname, config_filename): + b = Bustard() + b.pathname = pathname + bustard_config_root = ElementTree.parse(config_filename) + b.bustard_config = bustard_config_root.getroot() + software = bustard_config_root.find('*/Software') + b.version = software.attrib['Version'] + add_phasing(b) + return b + +def add_phasing(bustard_obj): + paramfiles = glob(os.path.join(bustard_obj.pathname, + "params?.xml")) + for paramfile in paramfiles: + phasing = Phasing(paramfile) + assert (phasing.lane >= 1 and phasing.lane <= 8) + bustard_obj.phasing[phasing.lane] = phasing + def fromxml(tree): """ Reconstruct a htsworkflow.pipelines.Bustard object from an xml block diff --git a/htsworkflow/pipelines/gerald.py b/htsworkflow/pipelines/gerald.py index 9d32a5b..feb03e6 100644 --- a/htsworkflow/pipelines/gerald.py +++ 
b/htsworkflow/pipelines/gerald.py @@ -167,29 +167,35 @@ class Gerald(object): if self.tree is None: return None - root = self._get_experiment_root() - if root is None: - root = '' - else: - root = os.path.join(root,'') - - experiment_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR') - if experiment_dir is not None: - experiment_dir = experiment_dir.replace(root, '') - experiment_dir = self.tree.findtext('Defaults/EXPT_DIR') - if experiment_dir is not None: - _, experiment_dir = os.path.split(experiment_dir) + expt_root = os.path.normpath(self._get_experiment_root()) + chip_expt_dir = self.tree.findtext('ChipWideRunParameters/EXPT_DIR') + # hiseqs renamed the experiment dir location + defaults_expt_dir = self.tree.findtext('Defaults/EXPT_DIR') + + experiment_dir = None + if defaults_expt_dir is not None: + _, experiment_dir = os.path.split(defaults_expt_dir) + elif expt_root is not None and chip_expt_dir is not None: + experiment_dir = chip_expt_dir.replace(expt_root+os.path.sep, '') + experiment_dir = experiment_dir.split(os.path.sep)[0] + if experiment_dir is None or len(experiment_dir) == 0: return None + return experiment_dir - dirnames = experiment_dir.split(os.path.sep) - return dirnames[0] runfolder_name = property(_get_runfolder_name) def _get_version(self): if self.tree is None: return None - return self.tree.findtext('ChipWideRunParameters/SOFTWARE_VERSION') + ga_version = self.tree.findtext( + 'ChipWideRunParameters/SOFTWARE_VERSION') + if ga_version is not None: + return ga_version + hiseq_software_node = self.tree.find('Software') + hiseq_version = hiseq_software_node.attrib['Version'] + return hiseq_version + version = property(_get_version) def _get_chip_attribute(self, value): diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py index 3bfb928..9edba31 100644 --- a/htsworkflow/pipelines/runfolder.py +++ b/htsworkflow/pipelines/runfolder.py @@ -186,34 +186,41 @@ def get_runs(runfolder, flowcell_id=None): for 
bustard_pathname in bustard_dirs: LOGGER.info("Found bustard directory %s" % (bustard_pathname,)) b = bustard.bustard(bustard_pathname) - gerald_glob = os.path.join(bustard_pathname, 'GERALD*') - LOGGER.info("Looking for gerald directories in %s" % (pathname,)) - for gerald_pathname in glob(gerald_glob): - LOGGER.info("Found gerald directory %s" % (gerald_pathname,)) - try: - g = gerald.gerald(gerald_pathname) - p = PipelineRun(runfolder, flowcell_id) - p.datadir = datadir - p.image_analysis = image_analysis - p.bustard = b - p.gerald = g - runs.append(p) - except IOError, e: - LOGGER.error("Ignoring " + str(e)) - - aligned_glob = os.path.join(runfolder, 'Aligned*') - for aligned in glob(aligned_glob): - LOGGER.info("Found aligned directory %s" % (aligned,)) - try: - g = gerald.gerald(aligned) - p = PipelineRun(runfolder, flowcell_id) - p.datadir = datadir - p.image_analysis = image_analysis - p.bustard = b - p.gerald = g - runs.append(p) - except IOError, e: - LOGGER.error("Ignoring " + str(e)) + build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder) + + build_aligned_runs(image_analysis, runs, b, datadir, runfolder) + + def build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder): + gerald_glob = os.path.join(bustard_pathname, 'GERALD*') + LOGGER.info("Looking for gerald directories in %s" % (pathname,)) + for gerald_pathname in glob(gerald_glob): + LOGGER.info("Found gerald directory %s" % (gerald_pathname,)) + try: + g = gerald.gerald(gerald_pathname) + p = PipelineRun(runfolder, flowcell_id) + p.datadir = datadir + p.image_analysis = image_analysis + p.bustard = b + p.gerald = g + runs.append(p) + except IOError, e: + LOGGER.error("Ignoring " + str(e)) + + + def build_aligned_runs(image_analysis, runs, b, datadir, runfolder): + aligned_glob = os.path.join(runfolder, 'Aligned*') + for aligned in glob(aligned_glob): + LOGGER.info("Found aligned directory %s" % (aligned,)) + try: + g = 
gerald.gerald(aligned) + p = PipelineRun(runfolder, flowcell_id) + p.datadir = datadir + p.image_analysis = image_analysis + p.bustard = b + p.gerald = g + runs.append(p) + except IOError, e: + LOGGER.error("Ignoring " + str(e)) datadir = os.path.join(runfolder, 'Data') @@ -229,7 +236,7 @@ def get_runs(runfolder, flowcell_id=None): ) else: scan_post_image_analysis( - runs, runfolder, image_analysis, firecrest_pathname + runs, runfolder, datadir, image_analysis, firecrest_pathname ) # scan for IPAR directories ipar_dirs = glob(os.path.join(datadir, "IPAR_*")) diff --git a/htsworkflow/pipelines/test/simulate_runfolder.py b/htsworkflow/pipelines/test/simulate_runfolder.py index 42cf421..aae6078 100644 --- a/htsworkflow/pipelines/test/simulate_runfolder.py +++ b/htsworkflow/pipelines/test/simulate_runfolder.py @@ -67,9 +67,9 @@ def make_unaligned_config_1_12(unaligned_dir): (os.path.join(TESTDATA_DIR, 'demultiplex_1.12.4.2.xml'), os.path.join(unaligned_dir, 'DemultiplexConfig.xml')), (os.path.join(TESTDATA_DIR, 'demultiplexed_bustard_1.12.4.2.xml'), - os.path.join(unaligned_dir, 'DemultiplexConfig.xml')), + os.path.join(unaligned_dir, 'DemultiplexedBustardConfig.xml')), (os.path.join(TESTDATA_DIR, 'demultiplexed_summary_1.12.4.2.xml'), - os.path.join(unaligned_dir, 'DemultiplexConfig.xml')), + os.path.join(unaligned_dir, 'DemultiplexedBustardSummary.xml')), ] for src, dest in demultiplex_pairs: shutil.copy(src, dest) @@ -357,6 +357,10 @@ def make_summary_casava1_7_xml(gerald_dir): destination = os.path.join(gerald_dir, 'Summary.xml') shutil.copy(source, destination) +def make_summary_rta1_12(status_dir): + source = os.path.join(TESTDATA_DIR, 'Summary-rta1_12.htm') + destination = os.path.join(status_dir, 'Summary.htm') + shutil.copy(source, destination) def make_eland_results(gerald_dir): eland_result = """>HWI-EAS229_24_207BTAAXX:1:7:599:759 ACATAGNCACAGACATAAACATAGACATAGAC U0 1 1 3 chrUextra.fa 28189829 R D. 
diff --git a/htsworkflow/pipelines/test/test_runfolder_rta1_12.py b/htsworkflow/pipelines/test/test_runfolder_rta1_12.py index 927cf61..70c11bf 100644 --- a/htsworkflow/pipelines/test/test_runfolder_rta1_12.py +++ b/htsworkflow/pipelines/test/test_runfolder_rta1_12.py @@ -1,6 +1,7 @@ #!/usr/bin/env python from datetime import datetime, date +import logging import os import tempfile import shutil @@ -33,6 +34,10 @@ def make_runfolder(obj=None): intensities_dir = make_rta_intensities_1_12(data_dir) + status_dir = os.path.join(data_dir, 'Status_Files') + os.mkdir(status_dir) + make_summary_rta1_12(status_dir) + basecalls_dir = make_rta_basecalls_1_12(intensities_dir) make_matrix_dir_rta_1_12(basecalls_dir) @@ -70,7 +75,7 @@ class RunfolderTests(unittest.TestCase): def test_bustard(self): """Construct a bustard object""" b = bustard.bustard(self.bustard_dir) - self.failUnlessEqual(b.version, '1.8.70.0') + self.failUnlessEqual(b.version, '1.12.4.2') self.failUnlessEqual(b.date, None) self.failUnlessEqual(b.user, None) self.failUnlessEqual(len(b.phasing), 0) @@ -85,13 +90,10 @@ class RunfolderTests(unittest.TestCase): # need to update gerald and make tests for it g = gerald.gerald(self.gerald_dir) - self.failUnlessEqual(g.version, - '@(#) Id: GERALD.pl,v 1.171 2008/05/19 17:36:14 mzerara Exp') - self.failUnlessEqual(g.date, datetime(2009,2,22,21,15,59)) + self.failUnlessEqual(g.version, 'CASAVA-1.8.1') self.failUnlessEqual(len(g.lanes), len(g.lanes.keys())) self.failUnlessEqual(len(g.lanes), len(g.lanes.items())) - # list of genomes, matches what was defined up in # make_gerald_config. 
# the first None is to offset the genomes list to be 1..9 @@ -283,9 +285,6 @@ def suite(): return unittest.makeSuite(RunfolderTests,'test') if __name__ == "__main__": - #unittest.main(defaultTest="suite") - class Test(object): pass - t = Test() - make_runfolder(t) - print ('path ' + t.runfolder_dir) + logging.basicConfig(level=logging.WARN) + unittest.main(defaultTest="suite") diff --git a/htsworkflow/pipelines/test/testdata/Summary-rta1_12.htm b/htsworkflow/pipelines/test/testdata/Summary-rta1_12.htm new file mode 100755 index 0000000..b11632d --- /dev/null +++ b/htsworkflow/pipelines/test/testdata/Summary-rta1_12.htm @@ -0,0 +1,53 @@ + + + + + + + Status + + + + + + + +
Refresh  + + + +
Total:0Extracted:0Called:0Scored:0Copied:0Err.Rated:0
Loading Run Data ...
+ + + +
+

No data available yet.

+
+
+
+
+
+
+
+
+
+
+ +
+ + + +