b36fe6293883a394ada8f005b37d78503acd79e2
[htsworkflow.git] / htsworkflow / util / rdfinfer.py
1 import logging
2 import os
3 import sys
4
5 import RDF
6
7 from htsworkflow.util.rdfns import *
8 from htsworkflow.util.rdfhelp import SCHEMAS_URL
9
# URI of the context node that Infer attaches to the statements it adds.
INFER_URL = 'http://jumpgate.caltech.edu/phony/infer'
11
class Infer(object):
    """Provide some simple inference.

    Inference rules are methods whose names start with _rule_ and
    validation checks are generator methods whose names start with
    _validate_.  Both are discovered by name with dir(), so adding a
    method with the right prefix is enough to have it run.
    """
    def __init__(self, model):
        """Remember the model to reason over.

        model is used through size(), append(), contains_statement(),
        as_stream_context(), ``in`` tests and SPARQL query execution
        (presumably a Redland RDF.Model -- confirm against callers).
        """
        self.model = model
        # context node attached to every statement we infer
        self._context = RDF.Node(RDF.Uri(INFER_URL))

    def think(self, max_iterations=None):
        """Update model with inferred statements.

        Every _rule_ method is run once per pass.  Passes repeat until
        one of them leaves the size of the model unchanged (nothing new
        was inferred) or until max_iterations passes have been run.

        max_iterations puts a limit on the number of passes; None
        (the default) means keep going until nothing new is inferred.

        Also this is the naive solution.
        There's probably better ones out there.
        """
        iterations = 0
        while max_iterations is None or iterations < max_iterations:
            starting_size = self.model.size()

            # dir() returns sorted names, so rules run in a stable order
            for method_name in dir(self):
                if method_name.startswith('_rule_'):
                    getattr(self, method_name)()

            if self.model.size() == starting_size:
                # we didn't add anything new
                return

            # count the pass, otherwise max_iterations is never reached
            iterations += 1

    def validate(self, destination=None):
        """Write every validation message to destination.

        destination is any object with a write() method; it defaults
        to sys.stdout.  Each message is followed by os.linesep.
        """
        if destination is None:
            destination = sys.stdout

        for msg in self.run_validation():
            destination.write(msg)
            destination.write(os.linesep)

    def run_validation(self):
        """Apply validation rules to our model.

        Generator yielding the messages produced by each _validate_
        method in turn.
        """
        for method_name in dir(self):
            if method_name.startswith('_validate_'):
                method = getattr(self, method_name)
                for msg in method():
                    yield msg

    def _add_inferred(self, statement):
        """Append statement to the model, in our context, if missing.
        """
        if statement not in self.model:
            self.model.append(statement, self._context)

    def _rule_class(self):
        """resolve class chains.
        e.g. if a is an BClass, and a BClass is an AClass
        then a is both a BClass and AClass.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?obj ?class
        where  {
          ?alias a ?class .
          ?obj a ?alias .
        }"""
        query = RDF.SPARQLQuery(body)
        for r in query.execute(self.model):
            self._add_inferred(
                RDF.Statement(r['obj'], rdfNS['type'], r['class']))

    def _rule_subclass(self):
        """A subclass is a parent class

        Anything typed as a subclass also gets typed as the parent
        named by rdfs:subClassOf.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?obj ?subclass ?parent
        where  {
          ?subclass rdfs:subClassOf ?parent .
          ?obj a ?subclass .
        }"""
        query = RDF.SPARQLQuery(body)
        for r in query.execute(self.model):
            self._add_inferred(
                RDF.Statement(r['obj'], rdfNS['type'], r['parent']))

    def _rule_inverse_of(self):
        """Add statements computed with inverseOf

        For "s term o" where term is owl:inverseOf reverse, add
        "o reverse s".  The query only matches when the types of s and
        o agree with the declared domain and range of both properties.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?o ?reverse ?s
        where  {
            ?s ?term ?o .
            ?s a ?subject_type .
            ?o a ?object_type .
            ?term owl:inverseOf ?reverse .
            ?term rdfs:domain ?subject_type ;
                  rdfs:range ?object_type .
            ?reverse rdfs:domain ?object_type ;
                  rdfs:range ?subject_type .
        }"""
        query = RDF.SPARQLQuery(body)
        for r in query.execute(self.model):
            self._add_inferred(
                RDF.Statement(r['o'], r['reverse'], r['s']))

    def _validate_types(self):
        """Find subjects that have no rdf:type.

        Yields one message per matching query row.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?subject ?predicate ?object
        where {
          ?subject ?predicate ?object
          OPTIONAL { ?subject a ?class }
          FILTER(!bound(?class))
        }
        """
        query = RDF.SPARQLQuery(body)
        errmsg = "Missing type for: {0}"
        for r in query.execute(self.model):
            # NOTE(review): reads .uri, so this presumably expects every
            # subject to be a URI resource -- confirm blank nodes
            # cannot show up here.
            yield errmsg.format(str(r['subject'].uri))

    def _validate_undefined_properties(self):
        """Find properties that aren't defined.

        A predicate counts as undefined when it has no rdf:type of its
        own in the model.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?subject ?predicate ?object
        where {
            ?subject ?predicate ?object
            OPTIONAL { ?predicate a ?predicate_class }
            FILTER(!bound(?predicate_class))
        }"""
        query = RDF.SPARQLQuery(body)
        msg = "Undefined property in {0} {1} {2}"
        for r in query.execute(self.model):
            yield msg.format(str(r['subject']),
                             str(r['predicate']),
                             str(r['object']))

    def _validate_property_types(self):
        """Find statements that break a property's domain or range.

        For each statement outside the schema context, the subject is
        checked against the rdfs:domain of the predicate and the
        object against its rdfs:range.  Yields one message per failed
        check.
        """
        property_template = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        select ?type
        where {{
            <{predicate}> a rdf:Property ;
                        {space} ?type .
        }}"""

        def check_node_space(node, predicate, space, errmsg):
            """Check that a node conforms to its allowable space of types.

            e.g. is a subject (node) the domain (space) of this property
            and is the object (node) the range of this property.

            Returns None when the node is acceptable, otherwise errmsg
            followed by the comma separated types that were tried.
            """
            query = RDF.SPARQLQuery(property_template.format(
                predicate=predicate.uri,
                space=space))
            seen = []
            for r in query.execute(self.model):
                if r['type'] == rdfsNS['Resource']:
                    # everything is an rdfs:Resource, nothing to check
                    continue
                seen.append(str(r['type'].uri))
                check = RDF.Statement(node, rdfNS['type'], r['type'])
                if self.model.contains_statement(check):
                    return None

            # need the seen check, because we're suppressing checking
            # rdfs:Resource types
            if seen:
                return errmsg + ",".join(seen)
            return None

        wrong_domain_type = "Domain of {0} was not in:"
        wrong_range_type = "Range of {0} was not in:"

        schema = RDF.Node(RDF.Uri(SCHEMAS_URL))
        for s, context in self.model.as_stream_context():
            # statements in the schema context are not validated
            if context == schema:
                continue
            # check domain
            msg = check_node_space(s.subject, s.predicate, 'rdfs:domain',
                                   wrong_domain_type.format(str(s)))
            if msg is not None:
                yield msg
            # check range
            msg = check_node_space(s.object, s.predicate, 'rdfs:range',
                                   wrong_range_type.format(str(s)))
            if msg is not None:
                yield msg
224
225