From: Diane Trout Date: Fri, 16 Nov 2012 00:01:04 +0000 (-0800) Subject: Further attempts to validate RDF models. X-Git-Url: http://woldlab.caltech.edu/gitweb/?p=htsworkflow.git;a=commitdiff_plain;h=d7e2e918f005746f70ed75891bfd35e4e4750e22 Further attempts to validate RDF models. I had a bug caused by lane numbers being language-tagged strings, and thus not being found by my sparql query. I found a solution to filter based on just the contents of a string ignoring the language tag. However I thought not only should I make it easier to run my RDF model validation code, I should also double check the literal types. Previously I just tagged any literal as rdfs:Literal. For ones that should have a known type, I've changed it to the xmlschema types. This patch doesn't actually fix the bug. Just introduces the diagnostic tool. --- diff --git a/htsworkflow/pipelines/sequences.py b/htsworkflow/pipelines/sequences.py index acd1005..87212dd 100644 --- a/htsworkflow/pipelines/sequences.py +++ b/htsworkflow/pipelines/sequences.py @@ -453,6 +453,10 @@ def update_model_sequence_library(model, base_url): library = guess_library_from_model(model, base_url, flowcell, lane_id) + if library is None: + LOGGER.error("Unable to decypher: %s %s", + str(flowcell), str(lane_id)) + continue library_id = toTypedNode(simplify_uri(library)) LOGGER.debug("Adding file (%s) to library (%s) link", str(filenode), @@ -478,11 +482,13 @@ def guess_library_from_model(model, base_url, flowcell, lane_id): where {{ <{flowcell}> libns:has_lane ?lane ; a libns:IlluminaFlowcell . - ?lane libns:lane_number "{lane_id}" ; + ?lane libns:lane_number ?lane_id ; libns:library ?library . 
+ FILTER(str(?lane_id) = "{lane_id}") }} """ lane_body = lane_body.format(flowcell=flowcell, lane_id=lane_id) + LOGGER.debug("guess_library_from_model: %s", lane_body) lanes = [] tries = 3 while len(lanes) == 0 and tries > 0: @@ -503,5 +509,3 @@ def guess_library_from_model(model, base_url, flowcell, lane_id): else: # try grabbing data model.load(flowcellNode.uri, name="rdfa") - - diff --git a/htsworkflow/util/rdfinfer.py b/htsworkflow/util/rdfinfer.py index b36fe62..baaa2e4 100644 --- a/htsworkflow/util/rdfinfer.py +++ b/htsworkflow/util/rdfinfer.py @@ -8,6 +8,7 @@ from htsworkflow.util.rdfns import * from htsworkflow.util.rdfhelp import SCHEMAS_URL INFER_URL='http://jumpgate.caltech.edu/phony/infer' +LOGGER = logging.getLogger(__name__) class Infer(object): """Provide some simple inference. @@ -36,6 +37,7 @@ class Infer(object): for method_name in dir(self): if method_name.startswith('_rule_'): + LOGGER.info("Running: %s", method_name) method = getattr(self, method_name) method() if self.model.size() == starting_size: @@ -55,6 +57,7 @@ class Infer(object): """ for method_name in dir(self): if method_name.startswith('_validate_'): + LOGGER.info("Running: %s", method_name) method = getattr(self, method_name) for msg in method(): yield msg @@ -143,7 +146,7 @@ class Infer(object): query = RDF.SPARQLQuery(body) errmsg = "Missing type for: {0}" for r in query.execute(self.model): - yield errmsg.format(str(r['subject'].uri)) + yield errmsg.format(str(r['subject'])) def _validate_undefined_properties(self): """Find properties that aren't defined. @@ -185,15 +188,32 @@ class Infer(object): e.g. is a subject (node) the domain (space) of this property and is the object (node) the range of of this property. 
""" + resource_error = "Expected resource for {0} in range {1}" + type_error = "Type of {0} was {1} not {2}" # check domain query = RDF.SPARQLQuery(property_template.format( predicate=predicate.uri, space=space)) - seen = [] + seen = set() for r in query.execute(self.model): + # Make sure we have a resource if we're expecting one if r['type'] == rdfsNS['Resource']: + if not node.is_resource(): + return resource_error.format(str(node), space) continue - seen.append(str(r['type'].uri)) + seen.add(str(r['type'].uri)) + if node.is_literal(): + # literal is a generic type. + nodetype = node.literal_value['datatype'] + if nodetype is None: + # lets default to string + nodetype = xsdNS['string'].uri + if r['type'] == rdfsNS['Literal']: + pass + elif nodetype != r['type'].uri: + return type_error.format( + str(node), nodetype, r['type']) + # check that node is the expetected class type check = RDF.Statement(node, rdfNS['type'], r['type']) if self.model.contains_statement(check): return @@ -221,5 +241,3 @@ class Infer(object): wrong_range_type.format(str(s))) if msg is not None: yield msg return - - diff --git a/htsworkflow/util/schemas/htsworkflow.turtle b/htsworkflow/util/schemas/htsworkflow.turtle index 92ed6e6..12c1f55 100644 --- a/htsworkflow/util/schemas/htsworkflow.turtle +++ b/htsworkflow/util/schemas/htsworkflow.turtle @@ -14,10 +14,12 @@ dc:title "HTS-Workflow ontology" ; a owl:Ontology . -htswlib:Class a rdfs:Class . +htswlib:Class rdfs:subClassOf rdfs:Class ; + a rdfs:Class . +rdfs:Resource a rdfs:Class. htswlib:IlluminaFlowcell - a rdfs:Class, htswlib:Class ; + a rdfs:Class, htswlib:Class; rdfs:comment "information about a illumina flowcell" ; rdfs:label "Flowcell" . @@ -69,7 +71,7 @@ htswlib:date rdfs:label "made on" ; rdfs:domain htswlib:IlluminaFlowcell ; rdfs:domain htswlib:Library ; - rdfs:range rdfs:Literal . + rdfs:range xsd:dateTime . 
htswlib:total_unique_locations a rdf:Property ; @@ -77,7 +79,7 @@ htswlib:total_unique_locations rdfs:label "Unique locations" ; rdfs:domain htswlib:Library ; rdfs:domain htswlib:IlluminaLane ; - rdfs:range rdfs:Literal . + rdfs:range xsd:integer . htswlib:has_mappings a rdf:Property ; @@ -272,7 +274,7 @@ htswlib:gel_cut rdfs:comment "The estimated fragment sizes cut from gel"; rdfs:label "Gel Cut" ; rdfs:domain htswlib:Library ; - rdfs:range rdfs:Literal . + rdfs:range xsd:decimal . htswlib:made_by a rdf:Property ; @@ -330,7 +332,7 @@ htswlib:lane_number rdfs:comment "Which lane were we run in" ; rdfs:label "lane id" ; rdfs:domain htswlib:IlluminaLane ; - rdfs:range rdfs:Literal . + rdfs:range xsd:string . # FIXME: should this be note? htswlib:comment diff --git a/scripts/rdfcheck.py b/scripts/rdfcheck.py new file mode 100644 index 0000000..7f5a6d1 --- /dev/null +++ b/scripts/rdfcheck.py @@ -0,0 +1,30 @@ +from argparse import ArgumentParser +import logging +from htsworkflow.util import rdfhelp, rdfinfer + +def main(cmdline=None): + parser = make_parser() + args = parser.parse_args(cmdline) + + logging.basicConfig(level=logging.INFO) + + validate_urls(args.urls) + +def make_parser(): + parser = ArgumentParser() + parser.add_argument('urls',nargs='*') + return parser + +def validate_urls(urls): + model = rdfhelp.get_model() + rdfhelp.add_default_schemas(model) + + for u in urls: + rdfhelp.load_into_model(model, None, u) + + engine = rdfinfer.Infer(model) + #engine.think() + engine.validate() + +if __name__ == "__main__": + main()