rdf:Resource can be either a resource or a blank node.
[htsworkflow.git] / htsworkflow / util / rdfinfer.py
index 221063fb2efc6b80d93b6c24e5e86f28beba87c9..1a0fb504181cdf4aed6ebf2aadf68271efbb5bb1 100644 (file)
@@ -1,9 +1,14 @@
+import logging
+import os
+import sys
+
 import RDF
 
 from htsworkflow.util.rdfns import *
 from htsworkflow.util.rdfhelp import SCHEMAS_URL
 
 INFER_URL='http://jumpgate.caltech.edu/phony/infer'
+LOGGER = logging.getLogger(__name__)
 
 class Infer(object):
     """Provide some simple inference.
@@ -15,7 +20,7 @@ class Infer(object):
         self._context = RDF.Node(RDF.Uri(INFER_URL))
 
 
-    def update(self, max_iterations=None):
+    def think(self, max_iterations=None):
         """Update model with with inferred statements.
 
         max_iterations puts a limit on the number of times we
@@ -32,12 +37,71 @@ class Infer(object):
 
             for method_name in dir(self):
                 if method_name.startswith('_rule_'):
+                    LOGGER.info("Running: %s", method_name)
                     method = getattr(self, method_name)
                     method()
             if self.model.size() == starting_size:
                 # we didn't add anything new
                 return
 
+    def validate(self, destination=None):
+        """Write each validation message on its own line to destination
+        (any object with write(); defaults to sys.stdout)."""
+        if destination is None:
+            destination = sys.stdout
+        for msg in self.run_validation():
+            destination.write(msg + "\n")  # text mode maps "\n" to os.linesep
+
+    def run_validation(self):
+        """Yield every message produced by this object's ``_validate_*`` methods.
+        """
+        checks = (attr for attr in dir(self) if attr.startswith('_validate_'))
+        for attr in checks:
+            LOGGER.info("Running: %s", attr)
+            # each validator is called with no arguments and iterated
+            for problem in getattr(self, attr)():
+                yield problem
+
+    def _rule_class(self):
+        """Propagate rdf:type through type chains.
+        If x has type B and B itself has type A, also assert x has type A.
+        """
+        sparql = """
+        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+        prefix owl: <http://www.w3.org/2002/07/owl#>
+
+        select ?obj ?class
+        where  {
+          ?alias a ?class .
+          ?obj a ?alias .
+        }"""
+        query = RDF.SPARQLQuery(sparql)
+        for row in query.execute(self.model):
+            inferred = RDF.Statement(row['obj'], rdfNS['type'], row['class'])
+            if inferred in self.model:
+                continue
+            self.model.append(inferred, self._context)
+
+    def _rule_subclass(self):
+        """Assert instances of a subclass are also instances of its parent."""
+        sparql = """
+        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+        prefix owl: <http://www.w3.org/2002/07/owl#>
+
+        select ?obj ?subclass ?parent
+        where  {
+          ?subclass rdfs:subClassOf ?parent .
+          ?obj a ?subclass .
+        }"""
+        query = RDF.SPARQLQuery(sparql)
+        for row in query.execute(self.model):
+            inferred = RDF.Statement(row['obj'], rdfNS['type'], row['parent'])
+            if inferred in self.model:
+                continue
+            self.model.append(inferred, self._context)
+
     def _rule_inverse_of(self):
         """Add statements computed with inverseOf
         """
@@ -118,8 +182,50 @@ class Infer(object):
                         {space} ?type .
         }}"""
 
-        wrong_domain_type = "Domain of {0} {1} {2} not {3}"
-        wrong_range_type = "Range of {0} {1} {2} not {3}"
+        def check_node_space(node, predicate, space, errmsg):
+            """Check that a node conforms to its allowable space of types.
+
+            e.g. is a subject (node) in the domain (space) of this property,
+            or is an object (node) in the range of this property.
+            Returns None when the node is acceptable, else an error string.
+            """
+            resource_error = "Expected resource for {0} in range {1}"
+            type_error = "Type of {0} was {1} not {2}"
+            # find every type the model declares for this predicate's space
+            query = RDF.SPARQLQuery(property_template.format(
+                predicate=predicate.uri,
+                space=space))
+            seen = set()
+            for r in query.execute(self.model):
+                # Make sure we have a resource if we're expecting one
+                if r['type'] == rdfsNS['Resource']:
+                    if node.is_literal():
+                        return resource_error.format(str(node), space)
+                    continue
+                seen.add(str(r['type'].uri))
+                if node.is_literal():
+                    # literal is a generic type; untyped defaults to string
+                    nodetype = node.literal_value['datatype']
+                    if nodetype is None:
+                        nodetype = xsdNS['string'].uri
+                    if r['type'] == rdfsNS['Literal']:
+                        pass
+                    elif nodetype != r['type'].uri:
+                        return type_error.format(str(node), nodetype, r['type'])
+                    # literals can't be rdf:type subjects; the match suffices
+                    return None
+                # check that node is the expected class type
+                check = RDF.Statement(node, rdfNS['type'], r['type'])
+                if self.model.contains_statement(check):
+                    return None
+
+            # seen stays empty when no type other than rdfs:Resource was declared
+            if seen:
+                return errmsg + ",".join(sorted(seen))
+
+
+        wrong_domain_type = "Domain of {0} was not in:"
+        wrong_range_type = "Range of {0} was not in:"
 
         count = 0
         schema = RDF.Node(RDF.Uri(SCHEMAS_URL))
@@ -127,30 +233,11 @@ class Infer(object):
             if context == schema:
                 continue
             # check domain
-            query = RDF.SPARQLQuery(property_template.format(
-                predicate=s.predicate,
-                space='rdfs:domain'))
-            for r in query.execute(self.model):
-                if r['type'] == rdfsNS['Resource']:
-                    continue
-                check = RDF.Statement(s.subject, rdfNS['type'], r['type'])
-                if not self.model.contains_statement(check):
-                    yield wrong_domain_type.format(str(s.subject),
-                                                   str(s.predicate),
-                                                   str(s.object),
-                                                   str(r['type']))
+            msg = check_node_space(s.subject, s.predicate, 'rdfs:domain',
+                                   wrong_domain_type.format(str(s)))
+            if msg is not None: yield msg
             # check range
-            query = RDF.SPARQLQuery(property_template.format(
-                predicate=s.predicate,
-                space='rdfs:range'))
-            for r in query.execute(self.model):
-                if r['type'] == rdfsNS['Resource']:
-                    continue
-                check = RDF.Statement(s.object, rdfNS['type'], r['type'])
-                if not self.model.contains_statement(check):
-                    yield wrong_range_type.format(str(s.subject),
-                                                  str(s.predicate),
-                                                  str(s.object),
-                                                  str(r['type']))
-
+            msg = check_node_space(s.object, s.predicate, 'rdfs:range',
+                                   wrong_range_type.format(str(s)))
+            if msg is not None: yield msg
         return