Further attempts to validate RDF models.
[htsworkflow.git] / htsworkflow / util / rdfinfer.py
1 import logging
2 import os
3 import sys
4
5 import RDF
6
7 from htsworkflow.util.rdfns import *
8 from htsworkflow.util.rdfhelp import SCHEMAS_URL
9
# URI used to build the context node under which Infer appends the
# statements it derives (see Infer.__init__).
INFER_URL='http://jumpgate.caltech.edu/phony/infer'
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
12
class Infer(object):
    """Provide some simple inference and validation over an RDF model.

    Inference rules are the methods whose names start with ``_rule_``;
    validation checks are the generator methods whose names start with
    ``_validate_``.  Both kinds are discovered by name through ``dir()``,
    so adding a method with the right prefix is enough to register it.
    """
    def __init__(self, model):
        """Remember the model to work on.

        model is expected to behave like a Redland RDF.Model: the methods
        below call size(), append(), contains_statement() and
        as_stream_context() on it.

        Statements produced by the rules are appended under a context
        node built from INFER_URL.
        """
        self.model = model
        self._context = RDF.Node(RDF.Uri(INFER_URL))

    def think(self, max_iterations=None):
        """Update model with inferred statements.

        Each pass runs every ``_rule_`` method once.  Passes repeat until
        a pass leaves the size of the model unchanged (nothing new was
        inferred) or until max_iterations passes have been run.

        max_iterations puts a limit on the number of passes; None means
        no limit, and zero or a negative value means no pass is run.

        This is the naive solution; there are probably better ones
        out there.
        """
        iterations = 0
        while max_iterations is None or iterations < max_iterations:
            starting_size = self.model.size()

            for method_name in dir(self):
                if method_name.startswith('_rule_'):
                    LOGGER.info("Running: %s", method_name)
                    method = getattr(self, method_name)
                    method()

            if self.model.size() == starting_size:
                # we didn't add anything new
                return

            # Count the finished pass; without this the max_iterations
            # limit would never be reached.
            iterations += 1

    def validate(self, destination=None):
        """Write every validation message to destination, one per line.

        destination is any object with a write() method; it defaults to
        sys.stdout.  Each message is followed by os.linesep.
        """
        if destination is None:
            destination = sys.stdout

        for msg in self.run_validation():
            destination.write(msg)
            destination.write(os.linesep)

    def run_validation(self):
        """Apply validation rules to our model.

        Runs every ``_validate_`` method, in the order dir() lists them,
        and yields each message they produce.
        """
        for method_name in dir(self):
            if method_name.startswith('_validate_'):
                LOGGER.info("Running: %s", method_name)
                method = getattr(self, method_name)
                for msg in method():
                    yield msg

    def _add_inferred(self, statement):
        """Append statement under the inference context unless the model
        already contains it.
        """
        if statement not in self.model:
            self.model.append(statement, self._context)

    def _rule_class(self):
        """resolve class chains.
        e.g. if a is an BClass, and a BClass is an AClass
        then a is both a BClass and AClass.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?obj ?class
        where  {
          ?alias a ?class .
          ?obj a ?alias .
        }"""
        query = RDF.SPARQLQuery(body)
        for r in query.execute(self.model):
            self._add_inferred(
                RDF.Statement(r['obj'], rdfNS['type'], r['class']))

    def _rule_subclass(self):
        """A subclass is a parent class

        For every ?obj typed as ?subclass where ?subclass is declared
        rdfs:subClassOf ?parent, add the statement that ?obj is also of
        type ?parent.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?obj ?subclass ?parent
        where  {
          ?subclass rdfs:subClassOf ?parent .
          ?obj a ?subclass .
        }"""
        query = RDF.SPARQLQuery(body)
        for r in query.execute(self.model):
            self._add_inferred(
                RDF.Statement(r['obj'], rdfNS['type'], r['parent']))

    def _rule_inverse_of(self):
        """Add statements computed with inverseOf

        For a statement (?s ?term ?o) where ?term is owl:inverseOf
        ?reverse, and the domain/range declarations of both properties
        match the types of ?s and ?o, add (?o ?reverse ?s).
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?o ?reverse ?s
        where  {
            ?s ?term ?o .
            ?s a ?subject_type .
            ?o a ?object_type .
            ?term owl:inverseOf ?reverse .
            ?term rdfs:domain ?subject_type ;
                  rdfs:range ?object_type .
            ?reverse rdfs:domain ?object_type ;
                  rdfs:range ?subject_type .
        }"""
        query = RDF.SPARQLQuery(body)

        for r in query.execute(self.model):
            self._add_inferred(
                RDF.Statement(r['o'], r['reverse'], r['s']))

    def _validate_types(self):
        """Find subjects that have no rdf:type.

        Yields one "Missing type for" message per statement whose
        subject has no type statement in the model.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?subject ?predicate ?object
        where {
          ?subject ?predicate ?object
          OPTIONAL { ?subject a ?class }
          FILTER(!bound(?class))
        }
        """
        query = RDF.SPARQLQuery(body)
        errmsg = "Missing type for: {0}"
        for r in query.execute(self.model):
            yield errmsg.format(str(r['subject']))

    def _validate_undefined_properties(self):
        """Find properties that aren't defined.

        Yields one message per statement whose predicate has no rdf:type
        statement in the model.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?subject ?predicate ?object
        where {
            ?subject ?predicate ?object
            OPTIONAL { ?predicate a ?predicate_class }
            FILTER(!bound(?predicate_class))
        }"""
        query = RDF.SPARQLQuery(body)
        msg = "Undefined property in {0} {1} {2}"
        for r in query.execute(self.model):
            yield msg.format(str(r['subject']),
                             str(r['predicate']),
                             str(r['object']))

    def _validate_property_types(self):
        """Check subjects and objects against rdfs:domain and rdfs:range.

        For every statement that is not in the schema context
        (SCHEMAS_URL), the subject is checked against the rdfs:domain
        declared for the predicate and the object against its
        rdfs:range.  Yields one message per failed check.
        """
        property_template = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        select ?type
        where {{
            <{predicate}> a rdf:Property ;
                        {space} ?type .
        }}"""

        def check_node_space(node, predicate, space, errmsg):
            """Check that a node conforms to its allowable space of types.

            e.g. is a subject (node) the domain (space) of this property
            and is the object (node) the range of this property.

            Returns None when the node is acceptable (or when the
            predicate declares nothing for space), otherwise an error
            message string.
            """
            resource_error = "Expected resource for {0} in range {1}"
            type_error = "Type of {0} was {1} not {2}"
            # look up the types declared for this predicate's domain/range
            query = RDF.SPARQLQuery(property_template.format(
                predicate=predicate.uri,
                space=space))
            seen = set()
            for r in query.execute(self.model):
                # Make sure we have a resource if we're expecting one
                if r['type'] == rdfsNS['Resource']:
                    if not node.is_resource():
                        return resource_error.format(str(node), space)
                    continue
                seen.add(str(r['type'].uri))
                if node.is_literal():
                    # literal is a generic type.
                    nodetype = node.literal_value['datatype']
                    if nodetype is None:
                        # lets default to string
                        nodetype = xsdNS['string'].uri
                    if r['type'] == rdfsNS['Literal']:
                        pass
                    elif nodetype != r['type'].uri:
                        return type_error.format(
                            str(node), nodetype, r['type'])
                    # NOTE(review): a literal that passes the datatype
                    # check still falls through to the rdf:type lookup
                    # below -- confirm that this is intended.
                # check that node is the expected class type
                check = RDF.Statement(node, rdfNS['type'], r['type'])
                if self.model.contains_statement(check):
                    return

            # need the seen check, because we're suppressing checking
            # rdfs:Resource types
            if seen:
                return errmsg + ",".join(seen)

        wrong_domain_type = "Domain of {0} was not in:"
        wrong_range_type = "Range of {0} was not in:"

        schema = RDF.Node(RDF.Uri(SCHEMAS_URL))
        for s, context in self.model.as_stream_context():
            # statements loaded from the schema itself are not validated
            if context == schema:
                continue
            # check domain
            msg = check_node_space(s.subject, s.predicate, 'rdfs:domain',
                                   wrong_domain_type.format(str(s)))
            if msg is not None:
                yield msg
            # check range
            msg = check_node_space(s.object, s.predicate, 'rdfs:range',
                                   wrong_range_type.format(str(s)))
            if msg is not None:
                yield msg