import RDF
from htsworkflow.util.rdfhelp import libraryOntology as libNS
from htsworkflow.util.rdfhelp import toTypedNode, fromTypedNode, rdfNS, \
- stripNamespace, dump_model, simplify_uri
+ strip_namespace, dump_model, simplify_uri
LOGGER = logging.getLogger(__name__)
model.add_statement(RDF.Statement(s, p, toTypedNode(o)))
def add(model, s, p, o):
    """Record the triple (s, p, o) in model.

    The subject, predicate and object are handed to RDF.Statement exactly
    as given; the resulting statement is then added to model.
    """
    triple = RDF.Statement(s, p, o)
    model.add_statement(triple)
- fileNode = RDF.Node(RDF.Uri('file://' + os.path.abspath(self.path)))
- add(model, fileNode, rdfNS['type'], libNS['raw_file'])
+ # a bit unreliable... assumes filesystem is encoded in utf-8
+ path = os.path.abspath(self.path.encode('utf-8'))
+ fileNode = RDF.Node(RDF.Uri('file://' + path))
+ add(model, fileNode, rdfNS['type'], libNS['IlluminaResult'])
add_lit(model, fileNode, libNS['flowcell_id'], self.flowcell)
add_lit(model, fileNode, libNS['lane_number'], self.lane)
if self.read is not None:
add_lit(model, fileNode, libNS['split_id'], self.split)
add_lit(model, fileNode, libNS['cycle'], self.cycle)
add_lit(model, fileNode, libNS['passed_filter'], self.pf)
- add(model, fileNode, rdfNS['type'], libNS[self.filetype])
+ add(model, fileNode, libNS['file_type'], libNS[self.filetype])
if base_url is not None:
flowcell = RDF.Node(RDF.Uri("{base}/flowcell/{flowcell}/".format(
if not isinstance(seq_id, RDF.Node):
seq_id = RDF.Node(RDF.Uri(seq_id))
- seqTypesStmt = RDF.Statement(seq_id, rdfNS['type'], None)
- seqTypes = model.find_statements(seqTypesStmt)
- isSequenceFile = False
- for s in seqTypes:
- if s.object == libNS['raw_file']:
- isSequenceFile = True
- else:
- seq_type = stripNamespace(libNS, s.object)
-
- if not isSequenceFile:
+ result_statement = RDF.Statement(seq_id,
+ rdfNS['type'],
+ libNS['IlluminaResult'])
+ if not model.contains_statement(result_statement):
raise KeyError(u"%s not found" % (unicode(seq_id),))
+ seq_type_node = model.get_target(seq_id, libNS['file_type'])
+ seq_type = strip_namespace(libNS, seq_type_node)
+
path = urlparse(str(seq_id.uri)).path
flowcellNode = get_one(seq_id, libNS['flowcell'])
flowcell = get_one(seq_id, libNS['flowcell_id'])
basename, ext = os.path.splitext(filename)
records = basename.split('_')
flowcell = records[4]
- lane = int(records[5][0])
+ lane = records[5][0]
fullpath = os.path.join(path, filename)
if flowcell_dir != flowcell:
records = basename.split('_')
fullpath = os.path.join(path, filename)
flowcell = records[4]
- lane = int(records[5][1])
+ lane = records[5][1]
read = int(records[6][1])
if flowcell_dir != flowcell:
if project is not None:
# demultiplexed sample!
flowcell = flowcell_dir
- lane = int(records[2][-1])
+ lane = records[2][-1]
read = int(records[3][-1])
pf = True # as I understand it hiseq runs toss the ones that fail filter
index = records[1]
sequence_type = 'split_fastq'
else:
flowcell = records[4]
- lane = int(records[5][1])
+ lane = records[5][1]
read = int(records[6][1])
pf = parse_fastq_pf_flag(records)
index = None
fullpath = os.path.join(path, filename)
flowcell, start, stop, project = get_flowcell_cycle(path)
if eland_match.group('lane'):
- lane = int(eland_match.group('lane'))
+ lane = eland_match.group('lane')
else:
lane = None
if eland_match.group('read'):
"""Find sequence objects and add library information if it is missing
"""
file_body = """
- prefix libNS: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+ prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
select ?filenode ?flowcell_id ?lane_id ?library_id ?flowcell ?library
where {
- ?filenode a libNS:raw_file ;
- libNS:flowcell_id ?flowcell_id ;
- libNS:lane_number ?lane_id .
- OPTIONAL { ?filenode libNS:flowcell ?flowcell . }
- OPTIONAL { ?filenode libNS:library ?library .}
- OPTIONAL { ?filenode libNS:library_id ?library_id .}
+ ?filenode a libns:IlluminaResult ;
+ libns:flowcell_id ?flowcell_id ;
+ libns:lane_number ?lane_id .
+ OPTIONAL { ?filenode libns:flowcell ?flowcell . }
+ OPTIONAL { ?filenode libns:library ?library .}
+ OPTIONAL { ?filenode libns:library_id ?library_id .}
}
"""
+ LOGGER.debug("update_model_sequence_library query %s", file_body)
file_query = RDF.SPARQLQuery(file_body)
files = file_query.execute(model)
flowcellNS = RDF.NS(urljoin(base_url, 'flowcell/'))
for f in files:
filenode = f['filenode']
+ LOGGER.debug("Updating file node %s", str(filenode))
lane_id = fromTypedNode(f['lane_id'])
if f['flowcell'] is None:
flowcell = flowcellNS[str(f['flowcell_id'])+'/']
+ LOGGER.debug("Adding file (%s) to flowcell (%s) link",
+ str(filenode),
+ str(flowcell))
model.add_statement(
RDF.Statement(filenode, libNS['flowcell'], flowcell))
else:
library = guess_library_from_model(model, base_url,
flowcell,
lane_id)
+ if library is None:
+ LOGGER.error("Unable to decypher: %s %s",
+ str(flowcell), str(lane_id))
+ continue
library_id = toTypedNode(simplify_uri(library))
+ LOGGER.debug("Adding file (%s) to library (%s) link",
+ str(filenode),
+ str(library))
model.add_statement(
RDF.Statement(filenode, libNS['library_id'], library_id))
if library is not None:
flowcellNode = RDF.Node(flowcell)
flowcell = str(flowcell.uri)
lane_body = """
- prefix libNS: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
+ prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix xsd: <http://www.w3.org/2001/XMLSchema#>
select ?library ?lane
where {{
- <{flowcell}> libNS:has_lane ?lane ;
- a libNS:illumina_flowcell .
- ?lane libNS:lane_number {lane_id} ;
- libNS:library ?library .
+ <{flowcell}> libns:has_lane ?lane ;
+ a libns:IlluminaFlowcell .
+ ?lane libns:lane_number ?lane_id ;
+ libns:library ?library .
+ FILTER(str(?lane_id) = "{lane_id}")
}}
"""
lane_body = lane_body.format(flowcell=flowcell, lane_id=lane_id)
+ LOGGER.debug("guess_library_from_model: %s", lane_body)
lanes = []
tries = 3
while len(lanes) == 0 and tries > 0:
else:
# try grabbing data
model.load(flowcellNode.uri, name="rdfa")
-
-