lanes.append(lane)
run_name = srf.pathname_to_run_name(r.pathname)
- if raw_format == 'qseq':
+ seq_cmds = []
+ if raw_format == 'fastq':
+ srf.copy_hiseq_project_fastqs(run_name, r.bustard.pathname, lanes, site, cycle_dir)
+ elif raw_format == 'qseq':
seq_cmds = srf.make_qseq_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir)
elif raw_format == 'srf':
seq_cmds = srf.make_srf_commands(run_name, r.bustard.pathname, lanes, site, cycle_dir, 0)
Simple container class that holds the path to a sequence archive
and basic descriptive information.
"""
- def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
+ def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None,
+ project=None,
+ index=None):
self.filetype = filetype
self.path = path
self.flowcell = flowcell
self.read = read
self.pf = pf
self.cycle = cycle
+ self.project = project
+ self.index = index
def __hash__(self):
return hash(self.key())
"""
Equality is defined if everything but the path matches
"""
- attributes = ['filetype','flowcell', 'lane', 'read', 'pf', 'cycle']
+ attributes = ['filetype','flowcell', 'lane', 'read', 'pf', 'cycle', 'project', 'index']
for a in attributes:
if getattr(self, a) != getattr(other, a):
return False
"""
Extract flowcell, cycle from pathname
"""
- rest, cycle = os.path.split(path)
+ project = None
+ rest, tail = os.path.split(path)
+ if tail.startswith('Project_'):
+ # we're in a multiplexed sample
+ project = tail
+ rest, cycle = os.path.split(rest)
+ else:
+ cycle = tail
+
rest, flowcell = os.path.split(rest)
cycle_match = re.match("C(?P<start>[0-9]+)-(?P<stop>[0-9]+)", cycle)
if cycle_match is None:
if stop is not None:
stop = int(stop)
- return flowcell, start, stop
+ return flowcell, start, stop, project
def parse_srf(path, filename):
- flowcell_dir, start, stop = get_flowcell_cycle(path)
+ flowcell_dir, start, stop, project = get_flowcell_cycle(path)
basename, ext = os.path.splitext(filename)
records = basename.split('_')
flowcell = records[4]
return SequenceFile('srf', fullpath, flowcell, lane, cycle=stop)
def parse_qseq(path, filename):
- flowcell_dir, start, stop = get_flowcell_cycle(path)
+ flowcell_dir, start, stop, project = get_flowcell_cycle(path)
basename, ext = os.path.splitext(filename)
records = basename.split('_')
fullpath = os.path.join(path, filename)
def parse_fastq(path, filename):
"""Parse fastq names
"""
- flowcell_dir, start, stop = get_flowcell_cycle(path)
+ flowcell_dir, start, stop, project = get_flowcell_cycle(path)
basename, ext = os.path.splitext(filename)
records = basename.split('_')
fullpath = os.path.join(path, filename)
- flowcell = records[4]
- lane = int(records[5][1])
- read = int(records[6][1])
- pf = parse_fastq_pf_flag(records)
+ if project is not None:
+ # demultiplexed sample!
+ flowcell = flowcell_dir
+ lane = int(records[2][-1])
+ read = int(records[3][-1])
+ pf = True # as I understand it hiseq runs toss the ones that fail filter
+ index = records[1]
+ project_id = records[0]
+ else:
+ flowcell = records[4]
+ lane = int(records[5][1])
+ read = int(records[6][1])
+ pf = parse_fastq_pf_flag(records)
+ index = None
+ project_id = None
if flowcell_dir != flowcell:
LOGGER.warn("flowcell %s found in wrong directory %s" % \
(flowcell, path))
- return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf, cycle=stop)
+ return SequenceFile('fastq', fullpath, flowcell, lane, read,
+ pf=pf,
+ cycle=stop,
+ project=project_id,
+ index=index)
def parse_fastq_pf_flag(records):
"""Take a fastq filename split on _ and look for the pass-filter flag
if eland_match is None:
eland_match = eland_re.match(filename)
fullpath = os.path.join(path, filename)
- flowcell, start, stop = get_flowcell_cycle(path)
+ flowcell, start, stop, project = get_flowcell_cycle(path)
if eland_match.group('lane'):
lane = int(eland_match.group('lane'))
else:
seq = parse_srf(path, f)
elif qseq_re.match(f):
seq = parse_qseq(path, f)
- elif f.endswith('fastq') or f.endswith('.fastq.bz2'):
+ elif f.endswith('.fastq') or \
+ f.endswith('.fastq.bz2') or \
+ f.endswith('.fastq.gz'):
seq = parse_fastq(path, f)
eland_match = eland_re.match(f)
if eland_match:
from glob import glob
import logging
import os
+import shutil
from htsworkflow.util import queuecommands
return cmd_list
+def copy_hiseq_project_fastqs(run_name, basecall_dir, site_name, destdir):
+ """
+ make a subprocess-friendly list of command line arguments to save HiSeq fastq files
+
+ run_name - most of the file name (run folder name is a good choice)
+ basecall_dir - location of unaligned files.
+ site_name - name of your "sequencing site" or "Individual"
+ destdir - root of where to save fastq files
+ """
+ # clean up pathname
+ LOGGER.info("run_name %s" % (run_name,))
+
+ cmd_list = []
+ project_dirs = glob(os.path.join(basecall_dir, 'Project_*'))
+ for project_dir in project_dirs:
+ _, project_name = os.path.split(project_dir)
+ sample_files = glob(os.path.join(project_dir, 'Sample*', '*.fastq*'))
+ project_dest = os.path.join(destdir, project_name)
+ if not os.path.exists(project_dest):
+ LOGGER.info("Making: %s" % (project_dest))
+ os.mkdir(project_dest)
+
+ for fastq_file in sample_files:
+ shutil.copy(fastq_file, project_dest)
+
+
def run_commands(new_dir, cmd_list, num_jobs):
LOGGER.info("chdir to %s" % (new_dir,))
curdir = os.getcwd()
Make sure code to parse directory heirarchy works
"""
path = '/root/42BW9AAXX/C1-152'
- flowcell, start, stop = sequences.get_flowcell_cycle(path)
+ flowcell, start, stop, project = sequences.get_flowcell_cycle(path)
self.failUnlessEqual(flowcell, '42BW9AAXX')
self.failUnlessEqual(start, 1)
self.failUnlessEqual(stop, 152)
+ self.failUnlessEqual(project, None)
+
+ path = '/root/42BW9AAXX/other'
+ self.failUnlessRaises(ValueError, sequences.get_flowcell_cycle, path)
+
+ def test_flowcell_project_cycle(self):
+ """
+ Make sure code to parse directory heirarchy works
+ """
+ path = '/root/42BW9AAXX/C1-152/Project_12345_Index1'
+ flowcell, start, stop, project = sequences.get_flowcell_cycle(path)
+
+ self.failUnlessEqual(flowcell, '42BW9AAXX')
+ self.failUnlessEqual(start, 1)
+ self.failUnlessEqual(stop, 152)
+ self.failUnlessEqual(project, 'Project_12345_Index1')
path = '/root/42BW9AAXX/other'
self.failUnlessRaises(ValueError, sequences.get_flowcell_cycle, path)
self.failUnlessEqual(f.pf, False)
self.failUnlessEqual(f.cycle, 38)
+ def test_project_fastq(self):
+ path = '/root/42BW9AAXX/C1-38/Project_12345'
+ name = '11111_NoIndex_L001_R1_001.fastq.gz'
+ pathname = os.path.join(path,name)
+ f = sequences.parse_fastq(path, name)
+
+ self.failUnlessEqual(f.filetype, 'fastq')
+ self.failUnlessEqual(f.path, pathname)
+ self.failUnlessEqual(f.flowcell, '42BW9AAXX')
+ self.failUnlessEqual(f.lane, 1)
+ self.failUnlessEqual(f.read, 1)
+ self.failUnlessEqual(f.pf, True)
+ self.failUnlessEqual(f.project, '11111')
+ self.failUnlessEqual(f.index, 'NoIndex')
+ self.failUnlessEqual(f.cycle, 38)
+
+ name = '11112_AAATTT_L001_R2_003.fastq.gz'
+ pathname = os.path.join(path,name)
+ f = sequences.parse_fastq(path, name)
+
+ self.failUnlessEqual(f.filetype, 'fastq')
+ self.failUnlessEqual(f.path, pathname)
+ self.failUnlessEqual(f.flowcell, '42BW9AAXX')
+ self.failUnlessEqual(f.lane, 1)
+ self.failUnlessEqual(f.read, 2)
+ self.failUnlessEqual(f.pf, True)
+ self.failUnlessEqual(f.project, '11112')
+ self.failUnlessEqual(f.index, 'AAATTT')
+ self.failUnlessEqual(f.cycle, 38)
+
def test_eland(self):
path = '/root/42BW9AAXX/C1-38'
name = 's_4_eland_extended.txt.bz2'
db = sqlite3.connect(":memory:")
c = db.cursor()
sequences.create_sequence_table(c)
-
+
data = [('/root/42BW9AAXX/C1-152',
'woldlab_090622_HWI-EAS229_0120_42BW9AAXX_l1_r1.tar.bz2'),
('/root/42BW9AAXX/C1-152',
count = c.execute("select count(*) from sequences")
row = count.fetchone()
self.failUnlessEqual(row[0], 4)
-
+
def suite():
return unittest.makeSuite(SequenceFileTests,'test')