import os
import types
import re
+import sys
+from urlparse import urljoin, urlparse
+
+import RDF
+from htsworkflow.util.rdfhelp import libraryOntology as libNS
+from htsworkflow.util.rdfhelp import toTypedNode, fromTypedNode, rdfNS, \
+ strip_namespace, dump_model, simplify_uri
LOGGER = logging.getLogger(__name__)
Simple container class that holds the path to a sequence archive
and basic descriptive information.
"""
- def __init__(self, filetype, path, flowcell, lane, read=None, pf=None, cycle=None,
+ def __init__(self, filetype, path, flowcell, lane,
+ read=None,
+ pf=None,
+ cycle=None,
project=None,
- index=None):
+ index=None,
+ split=None):
+ """Store various fields used in our sequence files
+
+ filetype is one of 'qseq', 'srf', 'fastq'
+ path = location of file
+ flowcell = files flowcell id
+ lane = which lane
+ read = which sequencer read, usually 1 or 2
+ pf = did it pass filter
+ cycle = cycle dir name e.g. C1-202
+ project = projed name from HiSeq, probably library ID
+ index = HiSeq barcode index sequence
+ split = file fragment from HiSeq (Since one file is split into many)
+ """
self.filetype = filetype
self.path = path
self.flowcell = flowcell
self.cycle = cycle
self.project = project
self.index = index
+ self.split = split
def __hash__(self):
return hash(self.key())
def key(self):
- return (self.flowcell, self.lane)
+ return (self.flowcell, self.lane, self.read, self.project, self.split)
- def unicode(self):
+ def __unicode__(self):
return unicode(self.path)
def __eq__(self, other):
"""
Equality is defined if everything but the path matches
"""
- attributes = ['filetype','flowcell', 'lane', 'read', 'pf', 'cycle', 'project', 'index']
+ attributes = ['filetype',
+ 'flowcell',
+ 'lane',
+ 'read',
+ 'pf',
+ 'cycle',
+ 'project',
+ 'index',
+ 'split']
for a in attributes:
if getattr(self, a) != getattr(other, a):
return False
return True
+ def __ne__(self, other):
+ return not self == other
+
def __repr__(self):
return u"<%s %s %s %s>" % (self.filetype, self.flowcell, self.lane, self.path)
# information and thus are unique so we don't have to do anything
return os.path.join(root, basename)
- def save(self, cursor):
+ def save_to_sql(self, cursor):
"""
Add this entry to a DB2.0 database.
"""
return cursor.execute(sql, sql_values)
+ def save_to_model(self, model, base_url=None):
+ def add_lit(model, s, p, o):
+ if o is not None:
+ model.add_statement(RDF.Statement(s, p, toTypedNode(o)))
+ def add(model, s, p, o):
+ model.add_statement(RDF.Statement(s,p,o))
+ # a bit unreliable... assumes filesystem is encoded in utf-8
+ path = os.path.abspath(self.path.encode('utf-8'))
+ fileNode = RDF.Node(RDF.Uri('file://' + path))
+ add(model, fileNode, rdfNS['type'], libNS['IlluminaResult'])
+ add_lit(model, fileNode, libNS['flowcell_id'], self.flowcell)
+ add_lit(model, fileNode, libNS['lane_number'], self.lane)
+ if self.read is not None:
+ add_lit(model, fileNode, libNS['read'], self.read)
+ else:
+ add_lit(model, fileNode, libNS['read'], '')
+
+ add_lit(model, fileNode, libNS['library_id'], self.project)
+ add_lit(model, fileNode, libNS['multiplex_index'], self.index)
+ add_lit(model, fileNode, libNS['split_id'], self.split)
+ add_lit(model, fileNode, libNS['cycle'], self.cycle)
+ add_lit(model, fileNode, libNS['passed_filter'], self.pf)
+ add(model, fileNode, libNS['file_type'], libNS[self.filetype])
+
+ if base_url is not None:
+ flowcell = RDF.Node(RDF.Uri("{base}/flowcell/{flowcell}/".format(
+ base=base_url,
+ flowcell=self.flowcell)))
+ add(model, fileNode, libNS['flowcell'], flowcell)
+ if self.project is not None:
+ library = RDF.Node(RDF.Uri("{base}/library/{library}".format(
+ base=base_url,
+ library=self.project)))
+ add(model, fileNode, libNS['library'], library)
+
+
+ @classmethod
+ def load_from_model(cls, model, seq_id):
+ def get(s, p):
+ values = []
+ stmts = model.find_statements(RDF.Statement(s, p, None))
+ for s in stmts:
+ obj = s.object
+ if not obj.is_resource():
+ values.append(fromTypedNode(obj))
+ else:
+ values.append(obj)
+ return values
+ def get_one(s, p):
+ values = get(s, p)
+ if len(values) > 1:
+ errmsg = u"To many values for %s %s"
+ raise ValueError(errmsg % (unicode(s), unicode(p)))
+ elif len(values) == 1:
+ return values[0]
+ else:
+ return None
+
+ if not isinstance(seq_id, RDF.Node):
+ seq_id = RDF.Node(RDF.Uri(seq_id))
+ result_statement = RDF.Statement(seq_id,
+ rdfNS['type'],
+ libNS['IlluminaResult'])
+ if not model.contains_statement(result_statement):
+ raise KeyError(u"%s not found" % (unicode(seq_id),))
+
+ seq_type_node = model.get_target(seq_id, libNS['file_type'])
+ seq_type = strip_namespace(libNS, seq_type_node)
+
+ path = urlparse(str(seq_id.uri)).path
+ flowcellNode = get_one(seq_id, libNS['flowcell'])
+ flowcell = get_one(seq_id, libNS['flowcell_id'])
+ lane = get_one(seq_id, libNS['lane_number'])
+ read = get_one(seq_id, libNS['read'])
+
+ obj = cls(seq_type, path, flowcell, lane)
+ obj.read = read if read != '' else None
+ obj.project = get_one(seq_id, libNS['library_id'])
+ obj.index = get_one(seq_id, libNS['multiplex_index'])
+ obj.split = get_one(seq_id, libNS['split_id'])
+ obj.cycle = get_one(seq_id, libNS['cycle'] )
+ obj.pf = get_one(seq_id, libNS['passed_filter'])
+ obj.libraryNode = get_one(seq_id, libNS['library'])
+ return obj
+
+
def get_flowcell_cycle(path):
"""
Extract flowcell, cycle from pathname
basename, ext = os.path.splitext(filename)
records = basename.split('_')
flowcell = records[4]
- lane = int(records[5][0])
+ lane = records[5][0]
fullpath = os.path.join(path, filename)
if flowcell_dir != flowcell:
records = basename.split('_')
fullpath = os.path.join(path, filename)
flowcell = records[4]
- lane = int(records[5][1])
+ lane = records[5][1]
read = int(records[6][1])
if flowcell_dir != flowcell:
"""Parse fastq names
"""
flowcell_dir, start, stop, project = get_flowcell_cycle(path)
- basename, ext = os.path.splitext(filename)
+ basename = re.sub('\.fastq(\.gz|\.bz2)?$', '', filename)
records = basename.split('_')
fullpath = os.path.join(path, filename)
if project is not None:
# demultiplexed sample!
flowcell = flowcell_dir
- lane = int(records[2][-1])
+ lane = records[2][-1]
read = int(records[3][-1])
pf = True # as I understand it hiseq runs toss the ones that fail filter
index = records[1]
project_id = records[0]
+ split = records[4]
+ sequence_type = 'split_fastq'
else:
flowcell = records[4]
- lane = int(records[5][1])
+ lane = records[5][1]
read = int(records[6][1])
pf = parse_fastq_pf_flag(records)
index = None
project_id = None
+ split = None
+ sequence_type = 'fastq'
if flowcell_dir != flowcell:
LOGGER.warn("flowcell %s found in wrong directory %s" % \
(flowcell, path))
- return SequenceFile('fastq', fullpath, flowcell, lane, read,
+ return SequenceFile(sequence_type, fullpath, flowcell, lane, read,
pf=pf,
cycle=stop,
project=project_id,
- index=index)
+ index=index,
+ split=split)
def parse_fastq_pf_flag(records):
"""Take a fastq filename split on _ and look for the pass-filter flag
elif fastq_type.startswith('all'):
pf = None
else:
- raise ValueError("Unrecognized fastq name %s at %s" % \
- (records[-1], os.path.join(path,filename)))
+ raise ValueError("Unrecognized fastq name: %s" % (
+ "_".join(records),))
return pf
fullpath = os.path.join(path, filename)
flowcell, start, stop, project = get_flowcell_cycle(path)
if eland_match.group('lane'):
- lane = int(eland_match.group('lane'))
+ lane = eland_match.group('lane')
else:
lane = None
if eland_match.group('read'):
for f in filenames:
seq = None
# find sequence files
- if raw_seq_re.match(f):
- if f.endswith('.md5'):
- continue
- elif f.endswith('.srf') or f.endswith('.srf.bz2'):
- seq = parse_srf(path, f)
- elif qseq_re.match(f):
- seq = parse_qseq(path, f)
- elif f.endswith('.fastq') or \
- f.endswith('.fastq.bz2') or \
- f.endswith('.fastq.gz'):
- seq = parse_fastq(path, f)
+ if f.endswith('.md5'):
+ continue
+ elif f.endswith('.srf') or f.endswith('.srf.bz2'):
+ seq = parse_srf(path, f)
+ elif qseq_re.match(f):
+ seq = parse_qseq(path, f)
+ elif f.endswith('.fastq') or \
+ f.endswith('.fastq.bz2') or \
+ f.endswith('.fastq.gz'):
+ seq = parse_fastq(path, f)
eland_match = eland_re.match(f)
if eland_match:
if f.endswith('.md5'):
LOGGER.debug("Found sequence at %s" % (f,))
return sequences
+
+
+def update_model_sequence_library(model, base_url):
+ """Find sequence objects and add library information if its missing
+ """
+ file_body = """
+ prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+ select ?filenode ?flowcell_id ?lane_id ?library_id ?flowcell ?library
+ where {
+ ?filenode a libns:IlluminaResult ;
+ libns:flowcell_id ?flowcell_id ;
+ libns:lane_number ?lane_id .
+ OPTIONAL { ?filenode libns:flowcell ?flowcell . }
+ OPTIONAL { ?filenode libns:library ?library .}
+ OPTIONAL { ?filenode libns:library_id ?library_id .}
+ }
+ """
+ LOGGER.debug("update_model_sequence_library query %s", file_body)
+ file_query = RDF.SPARQLQuery(file_body)
+ files = file_query.execute(model)
+
+ libraryNS = RDF.NS(urljoin(base_url, 'library/'))
+ flowcellNS = RDF.NS(urljoin(base_url, 'flowcell/'))
+ for f in files:
+ filenode = f['filenode']
+ LOGGER.debug("Updating file node %s", str(filenode))
+ lane_id = fromTypedNode(f['lane_id'])
+ if f['flowcell'] is None:
+ flowcell = flowcellNS[str(f['flowcell_id'])+'/']
+ LOGGER.debug("Adding file (%s) to flowcell (%s) link",
+ str(filenode),
+ str(flowcell))
+ model.add_statement(
+ RDF.Statement(filenode, libNS['flowcell'], flowcell))
+ else:
+ flowcell = f['flowcell']
+
+ if f['library'] is None:
+ if f['library_id'] is not None:
+ library = libraryNS[str(f['library_id']) + '/']
+ else:
+ library = guess_library_from_model(model, base_url,
+ flowcell,
+ lane_id)
+ if library is None:
+ LOGGER.error("Unable to decypher: %s %s",
+ str(flowcell), str(lane_id))
+ continue
+ library_id = toTypedNode(simplify_uri(library))
+ LOGGER.debug("Adding file (%s) to library (%s) link",
+ str(filenode),
+ str(library))
+ model.add_statement(
+ RDF.Statement(filenode, libNS['library_id'], library_id))
+ if library is not None:
+ model.add_statement(
+ RDF.Statement(filenode, libNS['library'], library))
+
+
+def guess_library_from_model(model, base_url, flowcell, lane_id):
+ """Attempt to find library URI
+ """
+ flowcellNode = RDF.Node(flowcell)
+ flowcell = str(flowcell.uri)
+ lane_body = """
+ prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+ prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+ prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+
+ select ?library ?lane
+ where {{
+ <{flowcell}> libns:has_lane ?lane ;
+ a libns:IlluminaFlowcell .
+ ?lane libns:lane_number ?lane_id ;
+ libns:library ?library .
+ FILTER(str(?lane_id) = "{lane_id}")
+ }}
+ """
+ lane_body = lane_body.format(flowcell=flowcell, lane_id=lane_id)
+ LOGGER.debug("guess_library_from_model: %s", lane_body)
+ lanes = []
+ tries = 3
+ while len(lanes) == 0 and tries > 0:
+ tries -= 1
+ lane_query = RDF.SPARQLQuery(lane_body)
+ lanes = [ l for l in lane_query.execute(model)]
+ if len(lanes) > 1:
+ # CONFUSED!
+ errmsg = "Too many libraries for flowcell {flowcell} "\
+ "lane {lane} = {count}"
+ LOGGER.error(errmsg.format(flowcell=flowcell,
+ lane=lane_id,
+ count=len(lanes)))
+ return None
+ elif len(lanes) == 1:
+ # success
+ return lanes[0]['library']
+ else:
+ # try grabbing data
+ model.load(flowcellNode.uri, name="rdfa")