"""
Utilities to work with the various eras of sequence archive files
"""
+import collections
import logging
import os
+import types
import re
LOGGER = logging.getLogger(__name__)
""" %( {'table': SEQUENCE_TABLE_NAME} )
return cursor.execute(sql)
+FlowcellPath = collections.namedtuple('FlowcellPath',
+ 'flowcell start stop project')
+
class SequenceFile(object):
"""
Simple container class that holds the path to a sequence archive
and basic descriptive information.
"""
- def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None):
+ def __init__(self, filetype, path, flowcell, lane,
+ read=None,
+ pf=None,
+ cycle=None,
+ project=None,
+ index=None,
+ split=None):
+ """Store various fields used in our sequence files
+
+ filetype is one of 'qseq', 'srf', 'fastq'
+ path = location of file
+ flowcell = files flowcell id
+ lane = which lane
+ read = which sequencer read, usually 1 or 2
+ pf = did it pass filter
+ cycle = cycle dir name e.g. C1-202
+ project = projed name from HiSeq, probably library ID
+ index = HiSeq barcode index sequence
+ split = file fragment from HiSeq (Since one file is split into many)
+ """
self.filetype = filetype
self.path = path
self.flowcell = flowcell
self.read = read
self.pf = pf
self.cycle = cycle
+ self.project = project
+ self.index = index
+ self.split = split
def __hash__(self):
return hash(self.key())
def key(self):
- return (self.flowcell, self.lane)
+ return (self.flowcell, self.lane, self.read, self.project, self.split)
def unicode(self):
return unicode(self.path)
"""
Equality is defined if everything but the path matches
"""
- attributes = ['filetype','flowcell', 'lane', 'read', 'pf', 'cycle']
+ attributes = ['filetype',
+ 'flowcell',
+ 'lane',
+ 'read',
+ 'pf',
+ 'cycle',
+ 'project',
+ 'index',
+ 'split']
for a in attributes:
if getattr(self, a) != getattr(other, a):
return False
"""
Extract flowcell, cycle from pathname
"""
- rest, cycle = os.path.split(path)
+ path = os.path.normpath(path)
+ project = None
+ rest, tail = os.path.split(path)
+ if tail.startswith('Project_'):
+ # we're in a multiplexed sample
+ project = tail
+ rest, cycle = os.path.split(rest)
+ else:
+ cycle = tail
+
rest, flowcell = os.path.split(rest)
cycle_match = re.match("C(?P<start>[0-9]+)-(?P<stop>[0-9]+)", cycle)
if cycle_match is None:
if stop is not None:
stop = int(stop)
- return flowcell, start, stop
+ return FlowcellPath(flowcell, start, stop, project)
def parse_srf(path, filename):
- flowcell_dir, start, stop = get_flowcell_cycle(path)
+ flowcell_dir, start, stop, project = get_flowcell_cycle(path)
basename, ext = os.path.splitext(filename)
records = basename.split('_')
flowcell = records[4]
return SequenceFile('srf', fullpath, flowcell, lane, cycle=stop)
def parse_qseq(path, filename):
- flowcell_dir, start, stop = get_flowcell_cycle(path)
+ flowcell_dir, start, stop, project = get_flowcell_cycle(path)
basename, ext = os.path.splitext(filename)
records = basename.split('_')
fullpath = os.path.join(path, filename)
def parse_fastq(path, filename):
"""Parse fastq names
"""
- flowcell_dir, start, stop = get_flowcell_cycle(path)
- basename, ext = os.path.splitext(filename)
+ flowcell_dir, start, stop, project = get_flowcell_cycle(path)
+ basename = re.sub('\.fastq(\.gz|\.bz2)?$', '', filename)
records = basename.split('_')
fullpath = os.path.join(path, filename)
- flowcell = records[4]
- lane = int(records[5][1])
- read = int(records[6][1])
- pf = parse_fastq_pf_flag(records)
+ if project is not None:
+ # demultiplexed sample!
+ flowcell = flowcell_dir
+ lane = int(records[2][-1])
+ read = int(records[3][-1])
+ pf = True # as I understand it hiseq runs toss the ones that fail filter
+ index = records[1]
+ project_id = records[0]
+ split = records[4]
+ sequence_type = 'split_fastq'
+ else:
+ flowcell = records[4]
+ lane = int(records[5][1])
+ read = int(records[6][1])
+ pf = parse_fastq_pf_flag(records)
+ index = None
+ project_id = None
+ split = None
+ sequence_type = 'fastq'
if flowcell_dir != flowcell:
LOGGER.warn("flowcell %s found in wrong directory %s" % \
(flowcell, path))
- return SequenceFile('fastq', fullpath, flowcell, lane, read, pf=pf, cycle=stop)
+ return SequenceFile(sequence_type, fullpath, flowcell, lane, read,
+ pf=pf,
+ cycle=stop,
+ project=project_id,
+ index=index,
+ split=split)
def parse_fastq_pf_flag(records):
"""Take a fastq filename split on _ and look for the pass-filter flag
if eland_match is None:
eland_match = eland_re.match(filename)
fullpath = os.path.join(path, filename)
- flowcell, start, stop = get_flowcell_cycle(path)
+ flowcell, start, stop, project = get_flowcell_cycle(path)
if eland_match.group('lane'):
lane = int(eland_match.group('lane'))
else:
Scan through a list of directories for sequence like files
"""
sequences = []
+ if type(dirs) in types.StringTypes:
+ raise ValueError("You probably want a list or set, not a string")
+
for d in dirs:
LOGGER.info("Scanning %s for sequences" % (d,))
if not os.path.exists(d):
for f in filenames:
seq = None
# find sequence files
- if raw_seq_re.match(f):
- if f.endswith('.md5'):
- continue
- elif f.endswith('.srf') or f.endswith('.srf.bz2'):
- seq = parse_srf(path, f)
- elif qseq_re.match(f):
- seq = parse_qseq(path, f)
- elif f.endswith('fastq') or f.endswith('.fastq.bz2'):
- seq = parse_fastq(path, f)
+ if f.endswith('.md5'):
+ continue
+ elif f.endswith('.srf') or f.endswith('.srf.bz2'):
+ seq = parse_srf(path, f)
+ elif qseq_re.match(f):
+ seq = parse_qseq(path, f)
+ elif f.endswith('.fastq') or \
+ f.endswith('.fastq.bz2') or \
+ f.endswith('.fastq.gz'):
+ seq = parse_fastq(path, f)
eland_match = eland_re.match(f)
if eland_match:
if f.endswith('.md5'):