Port rdfinfer to rdflib (fixing a hidden bug)
[htsworkflow.git] / htsworkflow / util / rdfinfer.py
1 import logging
2 import os
3 import sys
4
5 from rdflib import ConjunctiveGraph, BNode, Literal, URIRef
6 from rdflib.plugins.sparql import prepareQuery
7
8 from htsworkflow.util.rdfns import *
9 from htsworkflow.util.rdfhelp import SCHEMAS_URL
10
# Identifier used for the context that inferred statements are added to.
INFER_URL = 'http://jumpgate.caltech.edu/phony/infer'

# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
13
class Infer(object):
    """Provide some simple inference.

    Provides a few default rules as methods starting with _rule_
    and a few validation checks as methods starting with _validate_.

    Statements produced by the rules are added to the model using the
    context identified by INFER_URL.
    """
    def __init__(self, model):
        """Attach the inference engine to a model.

        :param model: the ConjunctiveGraph to read from and add
                      inferred statements to.
        :raises ValueError: if model is not a ConjunctiveGraph.
        """
        if not isinstance(model, ConjunctiveGraph):
            raise ValueError("Inferences require a ConjunctiveGraph")

        self.model = model
        self._context = URIRef(INFER_URL)

    def think(self, max_iterations=None):
        """Update model with inferred statements.

        Every method whose name starts with _rule_ is run once per
        pass. Passes repeat until a pass leaves the size of the model
        unchanged, or until max_iterations passes have been made.

        :param max_iterations: upper limit on the number of passes;
                               None (the default) means no limit.

        Also this is the naive solution.
        There's probably better ones out there.
        """
        # The set of rule methods does not change while we are running,
        # so look the names up once instead of on every pass.
        rule_names = [name for name in dir(self)
                      if name.startswith('_rule_')]

        iterations = 0
        while max_iterations is None or iterations < max_iterations:
            # len() is how rdflib graphs report their statement count.
            starting_size = len(self.model)

            for method_name in rule_names:
                LOGGER.info("Running: %s", method_name)
                getattr(self, method_name)()

            if len(self.model) == starting_size:
                # we didn't add anything new
                return

            # Count the completed pass so max_iterations is honoured.
            iterations += 1

    def validate(self, destination=None):
        """Write each validation message on its own line.

        :param destination: object with a write() method;
                            defaults to sys.stdout.
        """
        if destination is None:
            destination = sys.stdout

        for msg in self.run_validation():
            destination.write(msg)
            destination.write(os.linesep)

    def run_validation(self):
        """Apply validation rules to our model.

        Runs every method whose name starts with _validate_ and
        yields each message those methods produce.
        """
        for method_name in dir(self):
            if method_name.startswith('_validate_'):
                LOGGER.info("Running: %s", method_name)
                method = getattr(self, method_name)
                for msg in method():
                    yield msg

    def _add_inferred(self, subject, predicate, obj):
        """Add a statement to the inference context unless already present.
        """
        s = (subject, predicate, obj, self._context)
        if s not in self.model:
            self.model.add(s)

    def _rule_class(self):
        """resolve class chains.
        e.g. if a is an BClass, and a BClass is an AClass
        then a is both a BClass and AClass.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?obj ?class
        where  {
          ?alias a ?class .
          ?obj a ?alias .
        }"""
        for r in self.model.query(body):
            self._add_inferred(r['obj'], RDF['type'], r['class'])

    def _rule_subclass(self):
        """A subclass is a parent class

        Anything typed as a subclass is also given the type of the
        class named by rdfs:subClassOf.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?obj ?subclass ?parent
        where  {
          ?subclass rdfs:subClassOf ?parent .
          ?obj a ?subclass .
        }"""
        for r in self.model.query(body):
            self._add_inferred(r['obj'], RDF['type'], r['parent'])

    def _rule_inverse_of(self):
        """Add statements computed with inverseOf

        For a statement (s term o) where term is owl:inverseOf reverse,
        add (o reverse s). The query only matches when the types of s
        and o agree with the domain and range declared for both
        properties.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?o ?reverse ?s
        where  {
            ?s ?term ?o .
            ?s a ?subject_type .
            ?o a ?object_type .
            ?term owl:inverseOf ?reverse .
            ?term rdfs:domain ?subject_type ;
                  rdfs:range ?object_type .
            ?reverse rdfs:domain ?object_type ;
                  rdfs:range ?subject_type .
        }"""
        for r in self.model.query(body):
            self._add_inferred(r['o'], r['reverse'], r['s'])

    def _validate_types(self):
        """Find subjects that don't have a type.

        Statements whose predicate is xhtmlv:stylesheet are ignored.
        Yields one message for each row the query returns.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>
        prefix xhtmlv: <http://www.w3.org/1999/xhtml/vocab#>

        select ?subject ?predicate ?object
        where {
          ?subject ?predicate ?object
          OPTIONAL { ?subject a ?class }
          FILTER(!bound(?class))
          FILTER(?predicate != xhtmlv:stylesheet)
        }
        """
        errmsg = "Missing type for: {0}"
        for r in self.model.query(body):
            # r[0] is ?subject, the first selected variable.
            yield errmsg.format(str(r[0]))

    def _validate_undefined_properties(self):
        """Find properties that aren't defined.

        A predicate is reported when it has no rdf:type of its own.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?subject ?predicate ?object
        where {
            ?subject ?predicate ?object
            OPTIONAL { ?predicate a ?predicate_class }
            FILTER(!bound(?predicate_class))
        }"""
        msg = "Undefined property in {0} {1} {2}"
        for r in self.model.query(body):
            yield msg.format(r['subject'],
                             r['predicate'],
                             r['object'])

    def _validate_property_types(self):
        """Find statements that violate the domain or range of their predicate.

        For every statement outside the schema context, the subject is
        checked against the predicate's rdfs:domain and the object
        against its rdfs:range. Yields one message for each declared
        type that a node failed to match, but only when the node matched
        none of the declared types.
        """
        property_query = prepareQuery("""
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        select ?type ?predicate
        where {
            ?predicate a rdf:Property ;
                        ?space ?type .
        }""")

        def check_node_space(node, predicate, space, errmsg):
            """Check that a node conforms to its allowable space of types.

            e.g. is a subject (node) the domain (space) of this property
            and is the object (node) the range of this property.

            Returns an empty list as soon as one declared type matches,
            otherwise the list of accumulated error messages.
            """
            errors = []
            for r in self.model.query(property_query,
                                      initBindings={
                                          'predicate': predicate,
                                          'space': space}):
                expected_type = r['type']

                if isinstance(node, Literal):
                    if expected_type == RDFS['Literal']:
                        return []
                    elif node.datatype == expected_type:
                        return []
                    else:
                        # not currently handling type hierarchy.
                        # a integer could pass a range of decimal for instance.
                        errors.append(
                            "Type error: {} was type {}, expected {}".format(
                                str(node),
                                str(node.datatype),
                                str(expected_type)))
                elif expected_type == RDFS['Resource']:
                    # Literals were handled above, so any node that
                    # reaches this branch is accepted.
                    return []
                else:
                    check = (node, RDF['type'], expected_type)
                    if check not in self.model:
                        errors.append(errmsg + str(node) + ' was not a ' + str(expected_type))
                    else:
                        return []

            return errors
        ### End nested function

        wrong_domain_type = "Domain of {0} was not in:"
        wrong_range_type = "Range of {0} was not in:"

        schema = ConjunctiveGraph(identifier=SCHEMAS_URL)
        for subject, predicate, obj, context in self.model.quads():
            stmt = (subject, predicate, obj)

            # Statements whose context compares equal to the schema
            # graph are not validated.
            if context == schema:
                continue
            # check domain
            for error in check_node_space(subject, predicate, RDFS.domain,
                                          wrong_domain_type.format(str(stmt))):
                yield error
            # check range
            for error in check_node_space(obj, predicate, RDFS.range,
                                          wrong_range_type.format(str(stmt))):
                yield error