From: Diane Trout Date: Tue, 8 Jan 2013 01:46:40 +0000 (-0800) Subject: Try to make Aligned result directories optional in hiseq runs. X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=3ecb135bc504bc9323951cec6c99f1c256b067b1 Try to make Aligned result directories optional in hiseq runs. The previous implementation tried to match Aligned & Unaligned directories by parsing the Aligned directories config file for its unaligned raw sequence directory. Needless to say that didn't work if there wasn't an Aligned directory. This version tries to match them by comparing the suffix in Aligned and Unaligned. Then the runfolder generation code will still generate a runfolder if there's no aligned directory. --- diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py index 18a38e0..a3b7457 100644 --- a/htsworkflow/pipelines/runfolder.py +++ b/htsworkflow/pipelines/runfolder.py @@ -201,60 +201,6 @@ def get_runs(runfolder, flowcell_id=None): from htsworkflow.pipelines import bustard from htsworkflow.pipelines import gerald - def scan_post_image_analysis(runs, runfolder, datadir, image_analysis, - pathname): - added = build_aligned_runs(image_analysis, runs, datadir, runfolder) - # If we're a multiplexed run, don't look for older run type. - if added > 0: - return - - LOGGER.info("Looking for bustard directories in %s" % (pathname,)) - bustard_dirs = glob(os.path.join(pathname, "Bustard*")) - # RTA BaseCalls looks enough like Bustard. 
- bustard_dirs.extend(glob(os.path.join(pathname, "BaseCalls"))) - for bustard_pathname in bustard_dirs: - LOGGER.info("Found bustard directory %s" % (bustard_pathname,)) - b = bustard.bustard(bustard_pathname) - build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder) - - - def build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder): - start = len(runs) - gerald_glob = os.path.join(bustard_pathname, 'GERALD*') - LOGGER.info("Looking for gerald directories in %s" % (pathname,)) - for gerald_pathname in glob(gerald_glob): - LOGGER.info("Found gerald directory %s" % (gerald_pathname,)) - try: - g = gerald.gerald(gerald_pathname) - p = PipelineRun(runfolder, flowcell_id) - p.datadir = datadir - p.image_analysis = image_analysis - p.bustard = b - p.gerald = g - runs.append(p) - except IOError, e: - LOGGER.error("Ignoring " + str(e)) - return len(runs) - start - - - def build_aligned_runs(image_analysis, runs, datadir, runfolder): - start = len(runs) - aligned_glob = os.path.join(runfolder, 'Aligned*') - for aligned in glob(aligned_glob): - LOGGER.info("Found aligned directory %s" % (aligned,)) - try: - g = gerald.gerald(aligned) - p = PipelineRun(runfolder, flowcell_id) - bustard_pathname = os.path.join(runfolder, g.runfolder_name) - - p.datadir = datadir - p.image_analysis = image_analysis - p.bustard = bustard.bustard(bustard_pathname) - p.gerald = g - runs.append(p) - except IOError, e: - LOGGER.error("Ignoring " + str(e)) - return len(runs) - start datadir = os.path.join(runfolder, 'Data') LOGGER.info('Searching for runs in ' + datadir) @@ -269,7 +215,7 @@ def get_runs(runfolder, flowcell_id=None): ) else: scan_post_image_analysis( - runs, runfolder, datadir, image_analysis, firecrest_pathname + runs, runfolder, datadir, image_analysis, firecrest_pathname, flowcell_id ) # scan for IPAR directories ipar_dirs = glob(os.path.join(datadir, "IPAR_*")) @@ -284,11 +230,111 @@ def get_runs(runfolder, 
flowcell_id=None): ) else: scan_post_image_analysis( - runs, runfolder, datadir, image_analysis, ipar_pathname + runs, runfolder, datadir, image_analysis, ipar_pathname, flowcell_id ) return runs +def scan_post_image_analysis(runs, runfolder, datadir, image_analysis, + pathname, flowcell_id): + added = build_hiseq_runs(image_analysis, runs, datadir, runfolder, flowcell_id) + # If we're a multiplexed run, don't look for older run type. + if added > 0: + return + + LOGGER.info("Looking for bustard directories in %s" % (pathname,)) + bustard_dirs = glob(os.path.join(pathname, "Bustard*")) + # RTA BaseCalls looks enough like Bustard. + bustard_dirs.extend(glob(os.path.join(pathname, "BaseCalls"))) + for bustard_pathname in bustard_dirs: + LOGGER.info("Found bustard directory %s" % (bustard_pathname,)) + b = bustard.bustard(bustard_pathname) + build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, + runfolder, flowcell_id) + + +def build_gerald_runs(runs, b, image_analysis, bustard_pathname, datadir, pathname, runfolder, + flowcell_id): + start = len(runs) + gerald_glob = os.path.join(bustard_pathname, 'GERALD*') + LOGGER.info("Looking for gerald directories in %s" % (pathname,)) + for gerald_pathname in glob(gerald_glob): + LOGGER.info("Found gerald directory %s" % (gerald_pathname,)) + try: + g = gerald.gerald(gerald_pathname) + p = PipelineRun(runfolder, flowcell_id) + p.datadir = datadir + p.image_analysis = image_analysis + p.bustard = b + p.gerald = g + runs.append(p) + except IOError, e: + LOGGER.error("Ignoring " + str(e)) + return len(runs) - start + + +def build_hiseq_runs(image_analysis, runs, datadir, runfolder, flowcell_id): + start = len(runs) + aligned_glob = os.path.join(runfolder, 'Aligned*') + unaligned_glob = os.path.join(runfolder, 'Unaligned*') + + aligned_paths = glob(aligned_glob) + unaligned_paths = glob(unaligned_glob) + + matched_paths = hiseq_match_aligned_unaligned(aligned_paths, unaligned_paths) + 
LOGGER.debug("Matched HiSeq analysis: %s", str(matched_paths)) + + for aligned, unaligned in matched_paths: + if unaligned is None: + LOGGER.warn("Aligned directory %s without matching unalinged, skipping", aligned) + continue + + g = gerald.gerald(aligned) + print "scan for aligned then remove them from unaligned list" + try: + p = PipelineRun(runfolder, flowcell_id) + p.datadir = datadir + p.image_analysis = image_analysis + p.bustard = bustard.bustard(unaligned) + if aligned: + p.gerald = gerald.gerald(aligned) + runs.append(p) + except IOError, e: + LOGGER.error("Ignoring " + str(e)) + return len(runs) - start + +def hiseq_match_aligned_unaligned(aligned, unaligned): + """Match aligned and unaligned folders from seperate lists + """ + unaligned_suffix_re = re.compile('Unaligned(?P<suffix>[\w]*)') + + aligned_by_suffix = build_dir_dict_by_suffix('Aligned', aligned) + unaligned_by_suffix = build_dir_dict_by_suffix('Unaligned', unaligned) + + keys = set(aligned_by_suffix.keys()).union(set(unaligned_by_suffix.keys())) + + matches = [] + for key in keys: + a = aligned_by_suffix.get(key) + u = unaligned_by_suffix.get(key) + matches.append((a, u)) + return matches + +def build_dir_dict_by_suffix(prefix, dirnames): + """Build a dictionary indexed by suffix of last directory name. 
+ + It assumes a constant prefix + """ + regex = re.compile('%s(?P<suffix>[\w]*)' % (prefix,)) + + by_suffix = {} + for absname in dirnames: + basename = os.path.basename(absname) + match = regex.match(basename) + if match: + by_suffix[match.group('suffix')] = absname + return by_suffix + def get_specific_run(gerald_dir): """ Given a gerald directory, construct a PipelineRun out of its parents diff --git a/htsworkflow/pipelines/test/test_runfolder_utils.py b/htsworkflow/pipelines/test/test_runfolder_utils.py new file mode 100644 index 0000000..59ee418 --- /dev/null +++ b/htsworkflow/pipelines/test/test_runfolder_utils.py @@ -0,0 +1,36 @@ +from unittest2 import TestCase, TestSuite, defaultTestLoader + +from htsworkflow.pipelines import runfolder +class TestRunfolderUtilities(TestCase): + """Some functions can be tested independently of the runfolder version. + """ + def test_match_aligned_unaligned_abspath(self): + aligned = ['/a/b/c/Aligned', '/a/b/c/Aligned1234', '/a/b/c/Aligned_3mm'] + unaligned = ['/a/b/c/Unaligned', '/a/b/c/Unaligned_3mm', '/a/b/c/Unaligned_6index'] + + matches = set(runfolder.hiseq_match_aligned_unaligned(aligned, unaligned)) + self.assertEqual(len(matches), 4) + self.assertTrue(('/a/b/c/Aligned', '/a/b/c/Unaligned') in matches ) + self.assertTrue(('/a/b/c/Aligned1234', None) in matches ) + self.assertTrue(('/a/b/c/Aligned_3mm', '/a/b/c/Unaligned_3mm') in matches ) + self.assertTrue((None, '/a/b/c/Unaligned_6index') in matches ) + + def test_match_aligned_unaligned_relpath(self): + aligned = ['./Aligned', './Aligned1234', './Aligned_3mm'] + unaligned = ['./Unaligned', './Unaligned_3mm', './Unaligned_6index'] + + matches = set(runfolder.hiseq_match_aligned_unaligned(aligned, unaligned)) + self.assertEqual(len(matches), 4) + self.assertTrue(('./Aligned', './Unaligned') in matches ) + self.assertTrue(('./Aligned1234', None) in matches ) + self.assertTrue(('./Aligned_3mm', './Unaligned_3mm') in matches ) + self.assertTrue((None, './Unaligned_6index') in 
matches ) + +def suite(): + suite = TestSuite() + suite.addTests(defaultTestLoader.loadTestsFromTestCase(RunfolderTests)) + return suite + +if __name__ == "__main__": + from unittest2 import main + main(defaultTest="suite")