221063fb2efc6b80d93b6c24e5e86f28beba87c9
[htsworkflow.git] / htsworkflow / util / rdfinfer.py
1 import RDF
2
3 from htsworkflow.util.rdfns import *
4 from htsworkflow.util.rdfhelp import SCHEMAS_URL
5
6 INFER_URL='http://jumpgate.caltech.edu/phony/infer'
7
class Infer(object):
    """Provide some simple inference.

    Provides a few default rules as methods starting with _rule_;
    update() discovers them by name and runs each of them.

    Methods starting with _validate_ are generators that yield one
    human readable message per problem they find in the model.
    """
    def __init__(self, model):
        """Remember the model to work on.

        model is the object the SPARQL queries are executed against and
        that inferred statements are appended to (presumably an
        RDF.Model -- confirm against callers).

        Inferred statements are stored under the INFER_URL context node.
        """
        self.model = model
        self._context = RDF.Node(RDF.Uri(INFER_URL))

    def update(self, max_iterations=None):
        """Update model with inferred statements.

        Every method whose name starts with _rule_ is called once per
        pass.  Passes are repeated until a whole pass leaves
        self.model.size() unchanged.

        max_iterations puts a limit on the number of passes we run.
        None (the default) means no limit; zero or a negative value
        means no pass is run at all.

        Also this is the naive solution.
        There's probably better ones out there.
        """
        rule_names = [name for name in dir(self)
                      if name.startswith('_rule_')]

        iterations = 0
        while max_iterations is None or iterations < max_iterations:
            starting_size = self.model.size()

            for name in rule_names:
                getattr(self, name)()

            # Count the finished pass; without this the max_iterations
            # limit would never be reached.
            iterations += 1

            if self.model.size() == starting_size:
                # we didn't add anything new
                return

    def _rule_inverse_of(self):
        """Add statements computed with inverseOf

        For every statement (s, term, o) where term has an
        owl:inverseOf property and the types of s and o match the
        declared domain and range of both properties, the statement
        (o, reverse, s) is appended to the model in the inference
        context unless the model already contains it.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?o ?reverse ?s
        where  {
            ?s ?term ?o .
            ?s a ?subject_type .
            ?o a ?object_type .
            ?term owl:inverseOf ?reverse .
            ?term rdfs:domain ?subject_type ;
                  rdfs:range ?object_type .
            ?reverse rdfs:domain ?object_type ;
                  rdfs:range ?subject_type .
        }"""
        query = RDF.SPARQLQuery(body)

        for r in query.execute(self.model):
            s = RDF.Statement(r['o'], r['reverse'], r['s'])
            if s not in self.model:
                self.model.append(s, self._context)

    def _validate_types(self):
        """Find statements whose subject has no type.

        Yields one "Missing type for: ..." message per matching
        statement, so a subject that appears in several statements is
        reported several times.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?subject ?predicate ?object
        where {
          ?subject ?predicate ?object
          OPTIONAL { ?subject a ?class }
          FILTER(!bound(?class))
        }
        """
        query = RDF.SPARQLQuery(body)
        errmsg = "Missing type for: {0}"
        for r in query.execute(self.model):
            yield errmsg.format(str(r['subject']))

    def _validate_undefined_properties(self):
        """Find properties that aren't defined.

        Yields one message per statement whose predicate has no type
        statement of its own in the model.
        """
        body = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix owl: <http://www.w3.org/2002/07/owl#>

        select ?subject ?predicate ?object
        where {
            ?subject ?predicate ?object
            OPTIONAL { ?predicate a ?predicate_class }
            FILTER(!bound(?predicate_class))
        }"""
        query = RDF.SPARQLQuery(body)
        msg = "Undefined property in {0} {1} {2}"
        for r in query.execute(self.model):
            yield msg.format(str(r['subject']),
                             str(r['predicate']),
                             str(r['object']))

    def _validate_property_types(self):
        """Find statements that violate the domain or range of their property

        Statements stored in the SCHEMAS_URL context are skipped.  For
        every other statement the rdfs:domain and rdfs:range declared
        for its predicate are looked up; a message is yielded when the
        subject (for the domain) or the object (for the range) is not
        stated to be of the declared type.  rdfs:Resource is accepted
        without a check.
        """
        property_template = """
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        select ?type
        where {{
            <{predicate}> a rdf:Property ;
                        {space} ?type .
        }}"""

        wrong_domain_type = "Domain of {0} {1} {2} not {3}"
        wrong_range_type = "Range of {0} {1} {2} not {3}"

        schema = RDF.Node(RDF.Uri(SCHEMAS_URL))
        for s, context in self.model.as_stream_context():
            if context == schema:
                continue

            # The domain constrains the subject, the range the object.
            # Domain problems are reported before range problems.
            checks = (
                ('rdfs:domain', s.subject, wrong_domain_type),
                ('rdfs:range', s.object, wrong_range_type),
            )
            for space, node, message in checks:
                query = RDF.SPARQLQuery(property_template.format(
                    predicate=s.predicate,
                    space=space))
                for r in query.execute(self.model):
                    if r['type'] == rdfsNS['Resource']:
                        continue
                    check = RDF.Statement(node, rdfNS['type'], r['type'])
                    if not self.model.contains_statement(check):
                        yield message.format(str(s.subject),
                                             str(s.predicate),
                                             str(s.object),
                                             str(r['type']))