From: Diane Trout Date: Fri, 2 Mar 2012 01:26:57 +0000 (-0800) Subject: Core functions for saving and finding fastq files generated by a HiSeq. X-Git-Tag: v0.5.5~60 X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=b95389fe83caeca5fc25e449e3ea8e8d54adf1ec Core functions for saving and finding fastq files generated by a HiSeq. Becuase of the multiple tiles and demultiplexing I'm writing them into the flowcell directory with the Project_NNNNN[_IndexN] directory names. --- diff --git a/htsworkflow/pipelines/runfolder.py b/htsworkflow/pipelines/runfolder.py index 4a2b4cd..4f871a2 100644 --- a/htsworkflow/pipelines/runfolder.py +++ b/htsworkflow/pipelines/runfolder.py @@ -555,7 +555,10 @@ def extract_results(runs, output_base_dir=None, site="individual", num_jobs=1, r lanes.append(lane) run_name = srf.pathname_to_run_name(r.pathname) - if raw_format == 'qseq': + seq_cmds = [] + if raw_format == 'fastq': + srf.copy_hiseq_project_fastqs(run_name, r.bustard.pathname, lanes, site, cycle_dir) + elif raw_format == 'qseq': seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir) elif raw_format == 'srf': seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir, 0) diff --git a/htsworkflow/pipelines/sequences.py b/htsworkflow/pipelines/sequences.py index 993bcc9..f21e48c 100644 --- a/htsworkflow/pipelines/sequences.py +++ b/htsworkflow/pipelines/sequences.py @@ -34,7 +34,9 @@ class SequenceFile(object): Simple container class that holds the path to a sequence archive and basic descriptive information. """ - def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None): + def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None, + project=None, + index=None): self.filetype = filetype self.path = path self.flowcell = flowcell @@ -42,6 +44,8 @@ class SequenceFile(object): self.read = read self.pf = pf self.cycle = cycle + self.project = project + self.index = index def __hash__(self): return hash(self.key()) @@ -56,7 +60,7 @@ class SequenceFile(object): """ Equality is defined if everything but the path matches """ - attributes = ['filetype','flowcell', 'lane', 'read', 'pf', 'cycle'] + attributes = ['filetype','flowcell', 'lane', 'read', 'pf', 'cycle', 'project', 'index'] for a in attributes: if getattr(self, a) != getattr(other, a): return False @@ -114,7 +118,15 @@ def get_flowcell_cycle(path): """ Extract flowcell, cycle from pathname """ - rest, cycle = os.path.split(path) + project = None + rest, tail = os.path.split(path) + if tail.startswith('Project_'): + # we're in a multiplexed sample + project = tail + rest, cycle = os.path.split(rest) + else: + cycle = tail + rest, flowcell = os.path.split(rest) cycle_match = re.match("C(?P[0-9]+)-(?P[0-9]+)", cycle) if cycle_match is None: @@ -128,10 +140,10 @@ def get_flowcell_cycle(path): if stop is not None: stop = int(stop) - return flowcell, start, stop + return flowcell, start, stop, project def parse_srf(path, filename): - flowcell_dir, start, stop = get_flowcell_cycle(path) + flowcell_dir, start, stop, project = get_flowcell_cycle(path) basename, ext = os.path.splitext(filename) records = basename.split('_') flowcell = records[4] @@ -145,7 +157,7 @@ def parse_srf(path, filename): return SequenceFile('srf', fullpath, flowcell, lane, cycle=stop) def parse_qseq(path, filename): - flowcell_dir, start, stop = get_flowcell_cycle(path) + flowcell_dir, start, stop, project = get_flowcell_cycle(path) basename, ext = os.path.splitext(filename) records = basename.split('_') fullpath = os.path.join(path, filename) @@ -162,20 +174,35 @@ def parse_qseq(path, filename): def parse_fastq(path, filename): """Parse fastq names """ - flowcell_dir, start, stop = get_flowcell_cycle(path) + flowcell_dir, start, stop, project = get_flowcell_cycle(path) basename, ext = os.path.splitext(filename) records = basename.split('_') fullpath = os.path.join(path, filename) - flowcell = records[4] - lane = int(records[5][1]) - read = int(records[6][1]) - pf = parse_fastq_pf_flag(records) + if project is not None: + # demultiplexed sample! + flowcell = flowcell_dir + lane = int(records[2][-1]) + read = int(records[3][-1]) + pf = True # as I understand it hiseq runs toss the ones that fail filter + index = records[1] + project_id = records[0] + else: + flowcell = records[4] + lane = int(records[5][1]) + read = int(records[6][1]) + pf = parse_fastq_pf_flag(records) + index = None + project_id = None if flowcell_dir != flowcell: LOGGER.warn("flowcell %s found in wrong directory %s" % \ (flowcell, path)) - return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf, cycle=stop) + return SequenceFile('fastq', fullpath, flowcell, lane, read, + pf=pf, + cycle=stop, + project=project_id, + index=index) def parse_fastq_pf_flag(records): """Take a fastq filename split on _ and look for the pass-filter flag @@ -200,7 +227,7 @@ def parse_eland(path, filename, eland_match=None): if eland_match is None: eland_match = eland_re.match(filename) fullpath = os.path.join(path, filename) - flowcell, start, stop = get_flowcell_cycle(path) + flowcell, start, stop, project = get_flowcell_cycle(path) if eland_match.group('lane'): lane = int(eland_match.group('lane')) else: @@ -233,7 +260,9 @@ def scan_for_sequences(dirs): seq = parse_srf(path, f) elif qseq_re.match(f): seq = parse_qseq(path, f) - elif f.endswith('fastq') or f.endswith('.fastq.bz2'): + elif f.endswith('.fastq') or \ + f.endswith('.fastq.bz2') or \ + f.endswith('.fastq.gz'): seq = parse_fastq(path, f) eland_match = eland_re.match(f) if eland_match: diff --git a/htsworkflow/pipelines/srf.py b/htsworkflow/pipelines/srf.py index c313efe..4f20ce1 100644 --- a/htsworkflow/pipelines/srf.py +++ b/htsworkflow/pipelines/srf.py @@ -1,6 +1,7 @@ from glob import glob import logging import os +import shutil from htsworkflow.util import queuecommands @@ -150,6 +151,32 @@ def make_qseq_commands(run_name, bustard_dir, lanes, site_name, destdir, cmdleve return cmd_list +def copy_hiseq_project_fastqs(run_name, basecall_dir, site_name, destdir): + """ + make a subprocess-friendly list of command line arguments to save HiSeq fastq files + + run_name - most of the file name (run folder name is a good choice) + basecall_dir - location of unaligned files. + site_name - name of your "sequencing site" or "Individual" + destdir - root of where to save fastq files + """ + # clean up pathname + LOGGER.info("run_name %s" % (run_name,)) + + cmd_list = [] + project_dirs = glob(os.path.join(basecall_dir, 'Project_*')) + for project_dir in project_dirs: + _, project_name = os.path.split(project_dir) + sample_files = glob(os.path.join(project_dir, 'Sample*', '*.fastq*')) + project_dest = os.path.join(destdir, project_name) + if not os.path.exists(project_dest): + LOGGER.info("Making: %s" % (project_dest)) + os.mkdir(project_dest) + + for fastq_file in sample_files: + shutil.copy(fastq_file, project_dest) + + def run_commands(new_dir, cmd_list, num_jobs): LOGGER.info("chdir to %s" % (new_dir,)) curdir = os.getcwd() diff --git a/htsworkflow/pipelines/test/test_sequences.py b/htsworkflow/pipelines/test/test_sequences.py index 3587e65..cede8c2 100644 --- a/htsworkflow/pipelines/test/test_sequences.py +++ b/htsworkflow/pipelines/test/test_sequences.py @@ -13,11 +13,27 @@ class SequenceFileTests(unittest.TestCase): Make sure code to parse directory heirarchy works """ path = '/root/42BW9AAXX/C1-152' - flowcell, start, stop = sequences.get_flowcell_cycle(path) + flowcell, start, stop, project = sequences.get_flowcell_cycle(path) self.failUnlessEqual(flowcell, '42BW9AAXX') self.failUnlessEqual(start, 1) self.failUnlessEqual(stop, 152) + self.failUnlessEqual(project, None) + + path = '/root/42BW9AAXX/other' + self.failUnlessRaises(ValueError, sequences.get_flowcell_cycle, path) + + def test_flowcell_project_cycle(self): + """ + Make sure code to parse directory heirarchy works + """ + path = '/root/42BW9AAXX/C1-152/Project_12345_Index1' + flowcell, start, stop, project = sequences.get_flowcell_cycle(path) + + self.failUnlessEqual(flowcell, '42BW9AAXX') + self.failUnlessEqual(start, 1) + self.failUnlessEqual(stop, 152) + self.failUnlessEqual(project, 'Project_12345_Index1') path = '/root/42BW9AAXX/other' self.failUnlessRaises(ValueError, sequences.get_flowcell_cycle, path) @@ -90,6 +106,36 @@ class SequenceFileTests(unittest.TestCase): self.failUnlessEqual(f.pf, False) self.failUnlessEqual(f.cycle, 38) + def test_project_fastq(self): + path = '/root/42BW9AAXX/C1-38/Project_12345' + name = '11111_NoIndex_L001_R1_001.fastq.gz' + pathname = os.path.join(path,name) + f = sequences.parse_fastq(path, name) + + self.failUnlessEqual(f.filetype, 'fastq') + self.failUnlessEqual(f.path, pathname) + self.failUnlessEqual(f.flowcell, '42BW9AAXX') + self.failUnlessEqual(f.lane, 1) + self.failUnlessEqual(f.read, 1) + self.failUnlessEqual(f.pf, True) + self.failUnlessEqual(f.project, '11111') + self.failUnlessEqual(f.index, 'NoIndex') + self.failUnlessEqual(f.cycle, 38) + + name = '11112_AAATTT_L001_R2_003.fastq.gz' + pathname = os.path.join(path,name) + f = sequences.parse_fastq(path, name) + + self.failUnlessEqual(f.filetype, 'fastq') + self.failUnlessEqual(f.path, pathname) + self.failUnlessEqual(f.flowcell, '42BW9AAXX') + self.failUnlessEqual(f.lane, 1) + self.failUnlessEqual(f.read, 2) + self.failUnlessEqual(f.pf, True) + self.failUnlessEqual(f.project, '11112') + self.failUnlessEqual(f.index, 'AAATTT') + self.failUnlessEqual(f.cycle, 38) + def test_eland(self): path = '/root/42BW9AAXX/C1-38' name = 's_4_eland_extended.txt.bz2' @@ -134,7 +180,7 @@ class SequenceFileTests(unittest.TestCase): db = sqlite3.connect(":memory:") c = db.cursor() sequences.create_sequence_table(c) - + data = [('/root/42BW9AAXX/C1-152', 'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2'), ('/root/42BW9AAXX/C1-152', @@ -151,7 +197,7 @@ class SequenceFileTests(unittest.TestCase): count = c.execute("select count(*) from sequences") row = count.fetchone() self.failUnlessEqual(row[0], 4) - + def suite(): return unittest.makeSuite(SequenceFileTests,'test')