Improvements to rdfinfer.
[htsworkflow.git] / htsworkflow / util / rdfinfer.py
1 import logging
2 import os
3 import sys
4
5 import RDF
6
7 from htsworkflow.util.rdfns import *
8 from htsworkflow.util.rdfhelp import SCHEMAS_URL
9
# URI of the context node under which Infer stores the statements it derives
# (see Infer.__init__, which wraps this in an RDF.Node).
INFER_URL='http://jumpgate.caltech.edu/phony/infer'
11
class Infer(object):
    """Provide some simple inference and validation over an RDF model.

    Inference rules are methods whose names start with ``_rule_``;
    validation checks are generator methods whose names start with
    ``_validate_``.  Both kinds are discovered by name through
    ``dir(self)``, so a subclass can add more simply by defining
    methods with the matching prefix.
    """
    def __init__(self, model):
        """Remember the model and build the context node for inferences.

        :param model: the model to query and extend; the methods here
            call size(), append(), contains_statement() and
            as_stream_context() on it and run SPARQL queries against it.
        """
        self.model = model
        # Every inferred statement is appended under this context.
        self._context = RDF.Node(RDF.Uri(INFER_URL))

    def think(self, max_iterations=None):
        """Update model with inferred statements.

        Runs every ``_rule_`` method once per pass and repeats until a
        pass leaves the model size unchanged, or until max_iterations
        passes have been made.

        :param max_iterations: upper limit on the number of passes
            through the rules; None (the default) means keep going
            until nothing new is inferred.

        This is the naive solution; there are probably better ones.
        """
        iterations = 0
        while max_iterations is None or iterations != max_iterations:
            starting_size = self.model.size()

            for method_name in dir(self):
                if method_name.startswith('_rule_'):
                    getattr(self, method_name)()

            if self.model.size() == starting_size:
                # we didn't add anything new
                return

            # Count the completed pass; without this the limit was never
            # reached and a rule set that keeps growing looped forever.
            iterations += 1

    def validate(self, destination=None):
        """Write each validation message to destination, one per line.

        :param destination: object with a write() method; defaults to
            sys.stdout when None.
        """
        if destination is None:
            destination = sys.stdout

        for msg in self.run_validation():
            destination.write(msg)
            destination.write(os.linesep)

    def run_validation(self):
        """Apply validation rules to our model.

        Generator yielding every message produced by each
        ``_validate_`` method, in the order dir() lists the methods.
        """
        for method_name in dir(self):
            if method_name.startswith('_validate_'):
                method = getattr(self, method_name)
                for msg in method():
                    yield msg

    def _add_inferred(self, statement):
        """Append statement under the inference context if it is new.

        Deliberately not named with the ``_rule_`` prefix so that
        think() does not try to call it as a rule.
        """
        if statement not in self.model:
            self.model.append(statement, self._context)

    def _rule_class(self):
        """resolve class chains.
        e.g. if a is an BClass, and a BClass is an AClass
        then a is both a BClass and AClass.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?obj ?class
        where  {
          ?alias a ?class .
          ?obj a ?alias .
        }"""
        query = RDF.SPARQLQuery(body)
        for r in query.execute(self.model):
            self._add_inferred(
                RDF.Statement(r['obj'], rdfNS['type'], r['class']))

    def _rule_subclass(self):
        """A subclass is a parent class

        For every ``?subclass rdfs:subClassOf ?parent`` and every
        object typed as the subclass, add the parent type as well.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?obj ?subclass ?parent
        where  {
          ?subclass rdfs:subClassOf ?parent .
          ?obj a ?subclass .
        }"""
        query = RDF.SPARQLQuery(body)
        for r in query.execute(self.model):
            self._add_inferred(
                RDF.Statement(r['obj'], rdfNS['type'], r['parent']))

    def _rule_inverse_of(self):
        """Add statements computed with inverseOf

        The query only matches when the subject and object are typed
        and those types agree with the declared domain and range of
        both the term and its inverse; the reversed statement
        (object, inverse term, subject) is then added.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?o ?reverse ?s
        where  {
            ?s ?term ?o .
            ?s a ?subject_type .
            ?o a ?object_type .
            ?term owl:inverseOf ?reverse .
            ?term rdfs:domain ?subject_type ;
                  rdfs:range ?object_type .
            ?reverse rdfs:domain ?object_type ;
                  rdfs:range ?subject_type .
        }"""
        query = RDF.SPARQLQuery(body)
        for r in query.execute(self.model):
            self._add_inferred(RDF.Statement(r['o'], r['reverse'], r['s']))

    def _validate_types(self):
        """Find subjects that have no type statement.

        Yields one "Missing type for" message per matching query row.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?subject ?predicate ?object
        where {
          ?subject ?predicate ?object
          OPTIONAL { ?subject a ?class }
          FILTER(!bound(?class))
        }
        """
        query = RDF.SPARQLQuery(body)
        errmsg = "Missing type for: {0}"
        for r in query.execute(self.model):
            yield errmsg.format(str(r['subject']))

    def _validate_undefined_properties(self):
        """Find properties that aren't defined.

        A predicate counts as undefined when the model holds no type
        statement for it.  Yields one message per offending statement.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?subject ?predicate ?object
        where {
            ?subject ?predicate ?object
            OPTIONAL { ?predicate a ?predicate_class }
            FILTER(!bound(?predicate_class))
        }"""
        query = RDF.SPARQLQuery(body)
        msg = "Undefined property in {0} {1} {2}"
        for r in query.execute(self.model):
            yield msg.format(str(r['subject']),
                             str(r['predicate']),
                             str(r['object']))

    def _validate_property_types(self):
        """Find statements that violate their predicate's domain or range.

        For every statement outside the schema context, look up the
        rdfs:domain and rdfs:range declared for its predicate and check
        that the subject (for domain) or object (for range) carries
        that type.  rdfs:Resource is skipped.  Yields one message per
        violation, domain problems before range problems for each
        statement.
        """
        property_template = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        select ?type
        where {{
            <{predicate}> a rdf:Property ;
                        {space} ?type .
        }}"""

        wrong_domain_type = "Domain of {0} was not {1}"
        wrong_range_type = "Range of {0} was not {1}"

        schema = RDF.Node(RDF.Uri(SCHEMAS_URL))
        for s, context in self.model.as_stream_context():
            if context == schema:
                # statements in the schema context are not validated
                continue
            # (constraint to look up, node that must carry the type, message)
            checks = (
                ('rdfs:domain', s.subject, wrong_domain_type),
                ('rdfs:range', s.object, wrong_range_type),
            )
            for space, node, template in checks:
                query = RDF.SPARQLQuery(property_template.format(
                    predicate=s.predicate,
                    space=space))
                for r in query.execute(self.model):
                    if r['type'] == rdfsNS['Resource']:
                        continue
                    check = RDF.Statement(node, rdfNS['type'], r['type'])
                    if not self.model.contains_statement(check):
                        yield template.format(str(s), str(r['type']))
215