1 """Helper features for working with librdf
3 from __future__ import print_function, absolute_import
6 from datetime import datetime
9 from six.moves import urllib
14 from pkg_resources import resource_listdir, resource_string
16 from rdflib import ConjunctiveGraph, Graph, Literal, BNode, URIRef, Namespace
17 from rdflib.namespace import ClosedNamespace
20 import lxml.html.clean
22 from .rdfns import XMLNS
24 logger = logging.getLogger(__name__)
26 from htsworkflow.util.rdfns import *
# Context URI used to label schema statements (see add_schema/remove_schemas).
SCHEMAS_URL = 'http://jumpgate.caltech.edu/phony/schemas'
INFERENCE_URL = 'http://jumpgate.caltech.edu/phony/inference'

# strftime/strptime patterns: ISO 8601 timestamps with and without the
# fractional-seconds (%f) field.
ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
def sparql_query(model, query_filename, output_format='text'):
    """Execute sparql query from file

    :param model: object handed to ``query.execute``.
    :param query_filename: path of a text file holding the query.
    :param output_format: ``'html'`` prints the results through
        html_query_results; any other value uses display_query_results.
    """
    logger.info("Opening: %s" % (query_filename,))
    # Read through a context manager so the file handle is always closed.
    with open(query_filename, 'r') as query_file:
        query_body = query_file.read()
    # NOTE(review): RDF is not defined in the visible part of this file
    # (presumably it arrives via the rdfns star import) -- confirm that it
    # still provides SPARQLQuery.
    query = RDF.SPARQLQuery(query_body)
    results = query.execute(model)
    if output_format == 'html':
        html_query_results(results)
    else:
        display_query_results(results)
def display_query_results(results):
    """A very simple display of sparql query results showing name value pairs

    Prints one ``name: value`` line per item of every row, walking each
    row's items in reverse order.

    :param results: iterable of rows, each providing a dict-style ``items()``.
    """
    for row in results:
        # items() returns a view on Python 3, which cannot be sliced, so
        # build a list before reversing it.
        for k, v in reversed(list(row.items())):
            print("{0}: {1}".format(k, v))
def html_query_results(result_stream):
    """Print query results rendered through the 'rdf_report.html' template

    Every value of every row is wrapped in a ``Simplified`` object before
    the rows are handed to the template under the name ``results``.

    :param result_stream: iterable of rows, each providing ``items()``.
    """
    from django.conf import settings
    from django.template import Context, loader

    # I did this because I couldn't figure out how to
    # get simplify_rdf into the django template as a filter
    class Simplified(object):
        def __init__(self, value):
            self.simple = simplify_rdf(value)
            # NOTE(review): the statements under this test were not visible
            # in the reviewed text; storing the resource as ``url`` is a
            # reconstruction -- confirm against rdf_report.html.
            if value.is_resource():
                self.url = value
            else:
                self.url = None

    template = loader.get_template('rdf_report.html')
    results = []
    for row in result_stream:
        new_row = collections.OrderedDict()
        for k, v in row.items():
            new_row[k] = Simplified(v)
        results.append(new_row)
    context = Context({'results': results,})
    print(template.render(context))
def get_node_type(node):
    """Return just the base name of a XSD datatype:
    e.g. http://www.w3.org/2001/XMLSchema#integer -> integer

    :param node: object with a ``datatype`` attribute.
    """
    # chop off xml schema declaration
    value_type = node.datatype
    if value_type is None:
        # NOTE(review): the value returned for untyped nodes was not
        # visible in the reviewed text; "string" is a reconstruction.
        return "string"
    return value_type.replace(str(XSD), '').lower()
def simplify_rdf(value):
    """Return a short name for a RDF object
    e.g. The last part of a URI or an untyped string.
    """
    # NOTE(review): the right-hand sides of the Literal, BNode and fallback
    # branches were not visible in the reviewed text and are reconstructed.
    if isinstance(value, Literal):
        name = value.value
    elif isinstance(value, BNode):
        name = '<BLANK>'
    elif isinstance(value, URIRef):
        # NOTE(review): split_uri is not defined in the visible part of this
        # file; confirm it is in scope (this module defines simplify_uri).
        name = split_uri(str(value))
    else:
        name = value
    return str(name)
def simplify_uri(uri):
    """Split off the end of a uri

    The query string wins over the fragment, which wins over the last
    non-empty path element.

    >>> simplify_uri('http://asdf.org/foo/bar')
    'bar'
    >>> simplify_uri('http://asdf.org/foo/bar#bleem')
    'bleem'
    >>> simplify_uri('http://asdf.org/foo/bar/')
    'bar'
    >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
    'was=foo'

    :raises ValueError: for a Literal whose datatype is not xsd:anyURI, or
        when the uri has no query, fragment or non-empty path element.
    """
    if isinstance(uri, Literal) and uri.datatype not in (XSD.anyURI,):
        raise ValueError("Literal terms must be of URI type")

    uri = str(uri)

    parsed = urllib.parse.urlparse(uri)
    if parsed.query:
        return parsed.query
    if parsed.fragment:
        return parsed.fragment
    # Walk the path backwards so a trailing '/' is skipped.
    for element in reversed(parsed.path.split('/')):
        if element:
            return element
    raise ValueError("Unable to simplify %s" % (uri,))
def strip_namespace(namespace, term):
    """Remove the namespace portion of a term

    returns None if they aren't in common

    :param namespace: URIRef, Namespace or ClosedNamespace to strip.
    :param term: URIRef, or a Literal whose datatype is xsd:anyURI.
    :raises ValueError: when either argument has an unsupported type.
    """
    if not isinstance(namespace, (URIRef, Namespace, ClosedNamespace)):
        raise ValueError("Requires a URIRef namespace")

    # A Literal only has to pass the datatype check; it must not also be
    # required to be a URIRef, otherwise no Literal could ever be accepted.
    if isinstance(term, Literal):
        if term.datatype not in (XSD.anyURI,):
            raise ValueError("Term literals must be a URI type")
    elif not isinstance(term, URIRef):
        raise ValueError("Term must be a URI type")

    namespace_s = str(namespace)
    term_s = str(term)
    if not term_s.startswith(namespace_s):
        return None
    # Drop the leading namespace only; later occurrences of the same text
    # inside the term are kept.
    return term_s[len(namespace_s):]
def load_into_model(model, parser_name, path, ns=None):
    """Parse the resource at path into model

    :param model: graph whose ``parse`` method is called.
    :param parser_name: parser format name; ``None`` or ``'guess'`` picks
        one with guess_parser_by_extension.
    :param path: filename or url (string or URIRef) to read.
    :param ns: optional namespace passed to ``parse`` as ``publicID``;
        strings are converted to URIRef.
    """
    if isinstance(ns, six.string_types):
        ns = URIRef(ns)

    if isinstance(path, URIRef):
        path = str(path)

    url_parts = list(urllib.parse.urlparse(path))
    # No scheme, or an explicit file scheme: treat as a local file and
    # make its path absolute.
    if len(url_parts[0]) == 0 or url_parts[0] == 'file':
        url_parts[0] = 'file'
        url_parts[2] = os.path.abspath(url_parts[2])
    if parser_name is None or parser_name == 'guess':
        parser_name = guess_parser_by_extension(path)
    url = urllib.parse.urlunparse(url_parts)
    logger.info("Opening {0} with parser {1}".format(url, parser_name))

    model.parse(url, format=parser_name, publicID=ns)
def load_string_into_model(model, parser_name, data, ns=None):
    """Parse in-memory data into model, then resolve its owl:imports

    :param model: graph whose ``parse`` method is called.
    :param parser_name: parser format name handed to ``parse``.
    :param data: serialized content to parse.
    :param ns: namespace normalised by fixup_namespace and used as
        ``publicID``.
    """
    ns = fixup_namespace(ns)
    logger.debug("load_string_into_model parser={0}, len={1}".format(
        parser_name, len(data)))

    model.parse(data=data, format=parser_name, publicID=ns)
    add_imports(model, ns)
def fixup_namespace(ns):
    """Coerce ns into a URIRef

    ``None`` becomes ``http://localhost/`` and strings are wrapped in
    URIRef.

    :raises ValueError: when ns is neither None, a string nor a URIRef.
    """
    if ns is None:
        ns = URIRef("http://localhost/")
    elif isinstance(ns, six.string_types):
        ns = URIRef(ns)
    elif not(isinstance(ns, URIRef)):
        errmsg = "Namespace should be string or uri not {0}"
        raise ValueError(errmsg.format(str(type(ns))))
    return ns
def add_imports(model, ns):
    """Replace each owl:imports statement in model with the imported data

    Every matching statement is removed and its object is loaded with
    load_into_model (parser chosen by file extension).
    """
    # Take a snapshot first: the loop body removes from and adds to the
    # graph that is being searched.
    imports = list(model.triples((None, OWL.imports, None)))
    for s, p, o in imports:
        model.remove((s, p, o))
        load_into_model(model, None, o, ns)
def add_default_schemas(model, schema_path=None):
    """Add default schemas to a model
    Looks for turtle files in either htsworkflow/util/schemas
    or in the list of directories provided in schema_path

    :param model: graph handed to add_schema.
    :param schema_path: optional directory name, or list of directory
        names, searched for ``*.turtle`` files.
    """
    schemas = resource_listdir(__name__, 'schemas')
    for s in schemas:
        schema = resource_string(__name__, 'schemas/' + s)
        # files must be encoded utf-8
        schema = schema.decode('utf-8')
        namespace = 'file://localhost/htsworkflow/schemas/'+s
        add_schema(model, schema, namespace)

    if schema_path:
        # types.StringTypes does not exist on Python 3; six.string_types
        # covers both major versions.
        if isinstance(schema_path, six.string_types):
            schema_path = [schema_path]

        for path in schema_path:
            for pathname in glob(os.path.join(path, '*.turtle')):
                url = 'file://' + os.path.splitext(pathname)[0]
                # add_schema parses its argument as data, so pass the file
                # contents and let the handle close straight away.
                with open(pathname, 'rt') as stream:
                    add_schema(model, stream.read(), url)
def add_schema(model, schema, url):
    """Add a schema to a model.

    Main difference from 'load_into_model' is it tags it with
    a RDFlib context so I can remove them later.

    :param model: ConjunctiveGraph receiving the statements.
    :param schema: turtle text to parse.
    :param url: passed to the parser as ``publicID``.
    :raises ValueError: when model is not a ConjunctiveGraph.
    """
    if not isinstance(model, ConjunctiveGraph):
        raise ValueError("Schemas requires a graph that supports quads")

    context = URIRef(SCHEMAS_URL)
    # Parse into a scratch graph, then copy each statement across with the
    # schema context attached.
    tmpmodel = Graph()
    tmpmodel.parse(data=schema, format='turtle', publicID=url)
    for s, p, o in tmpmodel:
        model.add((s, p, o, context))
def remove_schemas(model):
    """Remove statements labeled with our schema context"""
    context = URIRef(SCHEMAS_URL)
    # Snapshot the matches before deleting so the graph is not modified
    # while it is being iterated.
    labeled = list(model.triples((None, None, None, context)))
    for s, p, o in labeled:
        # Name the context explicitly so only the schema-labeled copy of
        # the statement is removed.
        model.remove((s, p, o, context))
    #model.remove_context(context)
def sanitize_literal(node):
    """Clean up a literal string

    The literal's value is run through lxml's HTML Cleaner and a new
    Literal is returned that keeps the original datatype and language.

    :raises ValueError: when node is not a Literal.
    """
    if not isinstance(node, Literal):
        raise ValueError("sanitize_literal only works on Literals")

    s = node.value
    if len(s) > 0:
        element = lxml.html.fromstring(s)
        cleaner = lxml.html.clean.Cleaner(page_structure=False)
        element = cleaner.clean_html(element)
        if six.PY3:
            text = lxml.html.tostring(element, encoding=str)
        else:
            text = lxml.html.tostring(element)
        # NOTE(review): presumably the serialized text is wrapped in
        # <p>...</p>, which is sliced off here -- confirm for input that
        # is itself a single HTML element.
        p_len = len('<p>')
        slash_p_len = len('</p>')

        value = text[p_len:-slash_p_len]
    else:
        value = ''
    args = {}
    if node.datatype is not None:
        args['datatype'] = node.datatype
    if node.language is not None:
        args['lang'] = node.language
    return Literal(value, **args)
def guess_parser(content_type, pathname):
    """Pick a parser name from a MIME type, falling back to the extension

    :param content_type: MIME type string, or None.
    :param pathname: used by guess_parser_by_extension when content_type
        is None or 'text/plain'.
    :returns: a parser name, or None for an unrecognised content type.
    """
    # NOTE(review): the returned names were not visible in the reviewed
    # text; the values below are the format names rdflib's parse() accepts.
    if content_type in ('application/rdf+xml',):
        return 'xml'
    elif content_type in ('application/x-turtle',):
        return 'turtle'
    elif content_type in ('text/html',):
        return 'rdfa'
    elif content_type is None or content_type in ('text/plain',):
        return guess_parser_by_extension(pathname)
    return None
def guess_parser_by_extension(pathname):
    """Pick a parser name from the file extension of pathname

    :returns: a parser name, or None when the extension is not recognised.
    """
    # NOTE(review): the returned names were not visible in the reviewed
    # text; the values below are the format names rdflib's parse() accepts.
    _, ext = os.path.splitext(pathname)
    if ext in ('.xml', '.rdf'):
        return 'xml'
    elif ext in ('.html',):
        return 'rdfa'
    elif ext in ('.turtle',):
        return 'turtle'
    return None
def add_default_namespaces(model):
    """Bind our standard prefixes on model

    The model is modified in place; nothing is returned.
    """
    model.bind('rdf', RDF)
    model.bind('rdfs', RDFS)
    model.bind('owl', OWL)
    model.bind('xml', XMLNS)
    model.bind('xsd', XSD)
    model.bind('wot', WOT)

    # should these be here, kind of specific to an application
    model.bind('htswlib', libraryOntology)
    model.bind('ucscSubmission', submissionOntology)
    model.bind('ucscDaf', dafTermOntology)
    model.bind('geoSoft', geoSoftNS)
    model.bind('encode3', encode3NS)
def get_turtle_header():
    """Return a turtle header with our typical namespaces

    An empty ConjunctiveGraph with the default prefixes bound is
    serialized as turtle and returned as text.
    """
    empty = ConjunctiveGraph()
    add_default_namespaces(empty)
    header = empty.serialize(format='turtle')
    # serialize() returns bytes on older rdflib releases and str on newer
    # ones; only decode when there is something to decode.
    if isinstance(header, bytes):
        header = header.decode()
    return header
def dump_model(model, destination=None):
    """Serialize model as turtle, after binding our default prefixes

    :param model: graph to serialize; default prefixes are bound on it.
    :param destination: target handed to ``model.serialize``; defaults to
        standard output.
    """
    if destination is None:
        # rdflib writes encoded bytes to stream destinations, so use the
        # binary layer of stdout when it has one (Python 3). Flush first so
        # text already printed stays ahead of the dump.
        sys.stdout.flush()
        destination = getattr(sys.stdout, 'buffer', sys.stdout)
    add_default_namespaces(model)
    model.serialize(destination, format='turtle')