Ignore the missing-type RDF error for the URL of the page under test.
[htsworkflow.git] / htsworkflow / util / rdfinfer.py
1 import logging
2 import os
3 import sys
4
5 import RDF
6
7 from htsworkflow.util.rdfns import *
8 from htsworkflow.util.rdfhelp import SCHEMAS_URL
9
# URI of the context node that Infer passes along with every statement
# it appends to the model.
INFER_URL='http://jumpgate.caltech.edu/phony/infer'
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
12
13 class Infer(object):
14     """Provide some simple inference.
15
16     Provides a few default rules as methods starting with _rule_
17     """
18     def __init__(self, model):
19         self.model = model
20         self._context = RDF.Node(RDF.Uri(INFER_URL))
21
22
23     def think(self, max_iterations=None):
24         """Update model with with inferred statements.
25
26         max_iterations puts a limit on the number of times we
27         run through the loop.
28
29         it will also try to exit if nothing new has been inferred.
30
31         Also this is the naive solution.
32         There's probably better ones out there.
33         """
34         iterations = 0
35         while max_iterations is None or iterations != max_iterations:
36             starting_size = self.model.size()
37
38             for method_name in dir(self):
39                 if method_name.startswith('_rule_'):
40                     LOGGER.info("Running: %s", method_name)
41                     method = getattr(self, method_name)
42                     method()
43             if self.model.size() == starting_size:
44                 # we didn't add anything new
45                 return
46
47     def validate(self, destination=None):
48         if destination is None:
49             destination = sys.stdout
50
51         for msg in self.run_validation():
52             destination.write(msg)
53             destination.write(os.linesep)
54
55     def run_validation(self):
56         """Apply validation rules to our model.
57         """
58         for method_name in dir(self):
59             if method_name.startswith('_validate_'):
60                 LOGGER.info("Running: %s", method_name)
61                 method = getattr(self, method_name)
62                 for msg in method():
63                     yield msg
64
65     def _rule_class(self):
66         """resolve class chains.
67         e.g. if a is an BClass, and a BClass is an AClass
68         then a is both a BClass and AClass.
69         """
70         body = """
71         prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
72         prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
73         prefix owl: <http://www.w3.org/2002/07/owl#>
74
75         select ?obj ?class
76         where  {
77           ?alias a ?class .
78           ?obj a ?alias .
79         }"""
80         query = RDF.SPARQLQuery(body)
81         for r in query.execute(self.model):
82             s = RDF.Statement(r['obj'], rdfNS['type'], r['class'])
83             if s not in self.model:
84                 self.model.append(s, self._context)
85
86     def _rule_subclass(self):
87         """A subclass is a parent class
88         """
89         body = """
90         prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
91         prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
92         prefix owl: <http://www.w3.org/2002/07/owl#>
93
94         select ?obj ?subclass ?parent
95         where  {
96           ?subclass rdfs:subClassOf ?parent .
97           ?obj a ?subclass .
98         }"""
99         query = RDF.SPARQLQuery(body)
100         for r in query.execute(self.model):
101             s = RDF.Statement(r['obj'], rdfNS['type'], r['parent'])
102             if s not in self.model:
103                 self.model.append(s, self._context)
104
105     def _rule_inverse_of(self):
106         """Add statements computed with inverseOf
107         """
108         body = """
109         prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
110         prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
111         prefix owl: <http://www.w3.org/2002/07/owl#>
112
113         select ?o ?reverse ?s
114         where  {
115             ?s ?term ?o .
116             ?s a ?subject_type .
117             ?o a ?object_type .
118             ?term owl:inverseOf ?reverse .
119             ?term rdfs:domain ?subject_type ;
120                   rdfs:range ?object_type .
121             ?reverse rdfs:domain ?object_type ;
122                   rdfs:range ?subject_type .
123         }"""
124         query = RDF.SPARQLQuery(body)
125
126         statements = []
127         for r in query.execute(self.model):
128             s = RDF.Statement(r['o'], r['reverse'], r['s'])
129             if s not in self.model:
130                 self.model.append(s, self._context)
131
132
133     def _validate_types(self):
134         body = """
135         prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
136         prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
137         prefix owl: <http://www.w3.org/2002/07/owl#>
138         prefix xhtmlv: <http://www.w3.org/1999/xhtml/vocab#>
139
140         select ?subject ?predicate ?object
141         where {
142           ?subject ?predicate ?object
143           OPTIONAL { ?subject a ?class }
144           FILTER(!bound(?class))
145           FILTER(?predicate != xhtmlv:stylesheet)
146         }
147         """
148         query = RDF.SPARQLQuery(body)
149         errmsg = "Missing type for: {0}"
150         for r in query.execute(self.model):
151             yield errmsg.format(str(r['subject']))
152
153     def _validate_undefined_properties(self):
154         """Find properties that aren't defined.
155         """
156         body = """
157         prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
158         prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
159         prefix owl: <http://www.w3.org/2002/07/owl#>
160
161         select ?subject ?predicate ?object
162         where {
163             ?subject ?predicate ?object
164             OPTIONAL { ?predicate a ?predicate_class }
165             FILTER(!bound(?predicate_class))
166         }"""
167         query = RDF.SPARQLQuery(body)
168         msg = "Undefined property in {0} {1} {2}"
169         for r in query.execute(self.model):
170             yield msg.format(str(r['subject']),
171                              str(r['predicate']),
172                              str(r['object']))
173
174     def _validate_property_types(self):
175         """Find resources that don't have a type
176         """
177         property_template = """
178         prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
179         prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
180
181         select ?type
182         where {{
183             <{predicate}> a rdf:Property ;
184                         {space} ?type .
185         }}"""
186
187         def check_node_space(node, predicate, space, errmsg):
188             """Check that a node conforms to it's allowable space of types.
189
190             e.g. is a subject (node) the domain (space) of this property
191             and is the object (node) the range of of this property.
192             """
193             resource_error = "Expected resource for {0} in range {1}"
194             type_error = "Type of {0} was {1} not {2}"
195             # check domain
196             query = RDF.SPARQLQuery(property_template.format(
197                 predicate=predicate.uri,
198                 space=space))
199             seen = set()
200             for r in query.execute(self.model):
201                 # Make sure we have a resource if we're expecting one
202                 if r['type'] == rdfsNS['Resource']:
203                     if node.is_literal():
204                         return resource_error.format(str(node), space)
205                     continue
206                 seen.add(str(r['type'].uri))
207                 if node.is_literal():
208                     # literal is a generic type.
209                     nodetype = node.literal_value['datatype']
210                     if nodetype is None:
211                         # lets default to string
212                         nodetype = xsdNS['string'].uri
213                     if r['type'] == rdfsNS['Literal']:
214                         pass
215                     elif nodetype != r['type'].uri:
216                         return type_error.format(
217                             str(node), nodetype, r['type'])
218                 # check that node is the expetected class type
219                 check = RDF.Statement(node, rdfNS['type'], r['type'])
220                 if self.model.contains_statement(check):
221                     return
222
223             # need the seen check, because we're surpressing checking
224             # rdfs:Resource types
225             if len(seen) > 0:
226                 return errmsg + ",".join(seen)
227
228
229         wrong_domain_type = "Domain of {0} was not in:"
230         wrong_range_type = "Range of {0} was not in:"
231
232         count = 0
233         schema = RDF.Node(RDF.Uri(SCHEMAS_URL))
234         for s, context in self.model.as_stream_context():
235             if context == schema:
236                 continue
237             # check domain
238             msg = check_node_space(s.subject, s.predicate, 'rdfs:domain',
239                                    wrong_domain_type.format(str(s)))
240             if msg is not None: yield msg
241             # check range
242             msg = check_node_space(s.object, s.predicate, 'rdfs:range',
243                                    wrong_range_type.format(str(s)))
244             if msg is not None: yield msg
245         return