htsworkflow/util/rdfinfer.py

   1 import logging
   2 import os
   3 import sys
   4
   5 from rdflib import ConjunctiveGraph, BNode, Literal, URIRef
   6 from rdflib.plugins.sparql import prepareQuery
   7
   8 from htsworkflow.util.rdfns import *
   9 from htsworkflow.util.rdfhelp import SCHEMAS_URL
  10
  11 INFER_URL='http://jumpgate.caltech.edu/phony/infer'
  12 LOGGER = logging.getLogger(__name__)
  13
  14 class Infer(object):
  15     """Provide some simple inference.
  16
  17     Provides a few default rules as methods starting with _rule_
  18     """
  19     def __init__(self, model):
  20         if not isinstance(model, ConjunctiveGraph):
  21             raise ValueError("Inferences require a ConjunctiveGraph")
  22
  23         self.model = model
  24         self._context = URIRef(INFER_URL)
  25
  26
  27     def think(self, max_iterations=None):
  28         """Update model with with inferred statements.
  29
  30         max_iterations puts a limit on the number of times we
  31         run through the loop.
  32
  33         it will also try to exit if nothing new has been inferred.
  34
  35         Also this is the naive solution.
  36         There's probably better ones out there.
  37         """
  38         iterations = 0
  39         while max_iterations is None or iterations != max_iterations:
  40             starting_size = self.model.size()
  41
  42             for method_name in dir(self):
  43                 if method_name.startswith('_rule_'):
  44                     LOGGER.info("Running: %s", method_name)
  45                     method = getattr(self, method_name)
  46                     method()
  47             if self.model.size() == starting_size:
  48                 # we didn't add anything new
  49                 return
  50
  51     def validate(self, destination=None):
  52         if destination is None:
  53             destination = sys.stdout
  54
  55         for msg in self.run_validation():
  56             destination.write(msg)
  57             destination.write(os.linesep)
  58
  59     def run_validation(self):
  60         """Apply validation rules to our model.
  61         """
  62         for method_name in dir(self):
  63             if method_name.startswith('_validate_'):
  64                 LOGGER.info("Running: %s", method_name)
  65                 method = getattr(self, method_name)
  66                 for msg in method():
  67                     yield msg
  68
  69     def _rule_class(self):
  70         """resolve class chains.
  71         e.g. if a is an BClass, and a BClass is an AClass
  72         then a is both a BClass and AClass.
  73         """
  74         body = """
  75         prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
  76         prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
  77         prefix owl: <http://www.w3.org/2002/07/owl#>
  78
  79         select ?obj ?class
  80         where  {
  81           ?alias a ?class .
  82           ?obj a ?alias .
  83         }"""
  84         for r in self.model.query(body):
  85             s = (r['obj'], RDF['type'], r['class'], self._context)
  86             if s not in self.model:
  87                 self.model.add(s)
  88
  89     def _rule_subclass(self):
  90         """A subclass is a parent class
  91         """
  92         body = """
  93         prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
  94         prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
  95         prefix owl: <http://www.w3.org/2002/07/owl#>
  96
  97         select ?obj ?subclass ?parent
  98         where  {
  99           ?subclass rdfs:subClassOf ?parent .
 100           ?obj a ?subclass .
 101         }"""
 102         for r in self.model.query(body):
 103             s = (r['obj'], RDF['type'], r['parent'], self._context)
 104             if s not in self.model:
 105                 self.model.add(s)
 106
 107     def _rule_inverse_of(self):
 108         """Add statements computed with inverseOf
 109         """
 110         body = """
 111         prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
 112         prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
 113         prefix owl: <http://www.w3.org/2002/07/owl#>
 114
 115         select ?o ?reverse ?s
 116         where  {
 117             ?s ?term ?o .
 118             ?s a ?subject_type .
 119             ?o a ?object_type .
 120             ?term owl:inverseOf ?reverse .
 121             ?term rdfs:domain ?subject_type ;
 122                   rdfs:range ?object_type .
 123             ?reverse rdfs:domain ?object_type ;
 124                   rdfs:range ?subject_type .
 125         }"""
 126         for r in self.model.query(body):
 127             s = (r['o'], r['reverse'], r['s'], self._context)
 128             if s not in self.model:
 129                 self.model.add(s)
 130
 131     def _validate_types(self):
 132         body = """
 133         prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
 134         prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
 135         prefix owl: <http://www.w3.org/2002/07/owl#>
 136         prefix xhtmlv: <http://www.w3.org/1999/xhtml/vocab#>
 137
 138         select ?subject ?predicate ?object
 139         where {
 140           ?subject ?predicate ?object
 141           OPTIONAL { ?subject a ?class }
 142           FILTER(!bound(?class))
 143           FILTER(?predicate != xhtmlv:stylesheet)
 144         }
 145         """
 146         errmsg = "Missing type for: {0}"
 147         for r in self.model.query(body):
 148             yield errmsg.format(str(r[0]))
 149
 150     def _validate_undefined_properties(self):
 151         """Find properties that aren't defined.
 152         """
 153         body = """
 154         prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
 155         prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
 156         prefix owl: <http://www.w3.org/2002/07/owl#>
 157
 158         select ?subject ?predicate ?object
 159         where {
 160             ?subject ?predicate ?object
 161             OPTIONAL { ?predicate a ?predicate_class }
 162             FILTER(!bound(?predicate_class))
 163         }"""
 164         msg = "Undefined property in {0} {1} {2}"
 165         for r in self.model.query(body):
 166             yield msg.format(r['subject'],
 167                              r['predicate'],
 168                              r['object'])
 169
 170     def _validate_property_types(self):
 171         """Find resources that don't have a type
 172         """
 173         property_query = prepareQuery("""
 174         prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
 175         prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
 176
 177         select ?type ?predicate
 178         where {
 179             ?predicate a rdf:Property ;
 180                         ?space ?type .
 181         }""")
 182
 183         def check_node_space(node, predicate, space, errmsg):
 184             """Check that a node conforms to it's allowable space of types.
 185
 186             e.g. is a subject (node) the domain (space) of this property
 187             and is the object (node) the range of of this property.
 188             """
 189             resource_error = "Expected resource for {0} in range {1}"
 190             type_error = "Type of {0} was {1} not {2}"
 191             # check domain
 192             seen = set()
 193             errors = []
 194             for i, r in enumerate(self.model.query(property_query,
 195                                       initBindings={
 196                                           'predicate': predicate,
 197                                           'space': space})):
 198                 # Make sure we have a resource if we're expecting one
 199                 expected_type = r['type']
 200
 201                 if isinstance(node, Literal):
 202                     if expected_type == RDFS['Literal']:
 203                         return []
 204                     elif node.datatype == expected_type:
 205                         return []
 206                     else:
 207                         # not currently handling type hierarchy.
 208                         # a integer could pass a range of decimal for instance.
 209                         errors.append(
 210                             "Type error: {} was type {}, expected {}".format(
 211                                 str(node),
 212                                 str(node.datatype),
 213                                 str(expected_type)))
 214                 elif expected_type == RDFS['Resource']:
 215                     if isinstance(node, Literal):
 216                         errors.append(resource_error.format(str(node), space))
 217                     else:
 218                         return []
 219                 else:
 220                     check = (node, RDF['type'], expected_type)
 221                     if check not in self.model:
 222                         errors.append(errmsg + str(node) + ' was not a ' + str(expected_type))
 223                     else:
 224                         return []
 225
 226             return errors
 227         ### End nested function
 228
 229         wrong_domain_type = "Domain of {0} was not in:"
 230         wrong_range_type = "Range of {0} was not in:"
 231
 232         count = 0
 233         schema = ConjunctiveGraph(identifier=SCHEMAS_URL)
 234         for subject, predicate, obj, context in self.model.quads():
 235             stmt = (subject, predicate, obj)
 236
 237             if context == schema:
 238                 continue
 239             # check domain
 240             for error in check_node_space(subject, predicate, RDFS.domain,
 241                                           wrong_domain_type.format(str(stmt))):
 242                 yield error
 243             # check range
 244             for error in check_node_space(obj, predicate, RDFS.range,
 245                                           wrong_range_type.format(str(stmt))):
 246                 yield error