1 """Helper features for working with librdf
3 from __future__ import print_function
6 from datetime import datetime
9 from six.moves import urllib
14 from pkg_resources import resource_listdir, resource_string
17 import lxml.html.clean
20 logger = logging.getLogger(__name__)
22 from htsworkflow.util.rdfns import *
# Context URI used by add_schema()/remove_schemas() to tag schema
# statements so they can later be removed from a model as a group.
SCHEMAS_URL='http://jumpgate.caltech.edu/phony/schemas'
# NOTE(review): presumably the matching context URI for inferred
# statements; nothing in the visible part of this module references it.
INFERENCE_URL='http://jumpgate.caltech.edu/phony/inference'

# strftime/strptime patterns used for xsd:dateTime literals, with and
# without a fractional-seconds component.
ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
def sparql_query(model, query_filename, output_format='text'):
    """Execute the SPARQL query stored in a file and print the results.

    :param model: RDF model the query is executed against
    :param query_filename: path of a text file holding the SPARQL query
    :param output_format: 'html' renders the results through
        html_query_results(); any other value uses
        display_query_results()
    """
    logger.info("Opening: %s" % (query_filename,))
    # Read through a context manager so the file handle is always closed.
    with open(query_filename, 'r') as query_stream:
        query_body = query_stream.read()
    query = RDF.SPARQLQuery(query_body)
    results = query.execute(model)
    if output_format == 'html':
        html_query_results(results)
    else:
        display_query_results(results)
def display_query_results(results):
    """Print SPARQL query results as simple name/value pairs.

    The bindings of each row are printed one per line as "name: value",
    in the reverse of the order the row yields them.

    :param results: iterable of mapping-like result rows
    """
    for row in results:
        # Materialize the items first: on Python 3 ``items()`` returns a
        # view, which cannot be sliced the way a Python 2 list could.
        for name, value in reversed(list(row.items())):
            print("{0}: {1}".format(name, value))
def html_query_results(result_stream):
    """Render SPARQL query results with the 'rdf_report.html' template.

    Every bound value is wrapped in a small object carrying a shortened
    display name, and the rendered template is printed to standard
    output.

    :param result_stream: iterable of mapping-like result rows whose
        values are RDF nodes
    """
    from django.conf import settings
    from django.template import Context, loader

    # simplify_rdf could not easily be exposed to the django template as
    # a filter, so the short name is precomputed on a wrapper object.
    class Simplified(object):
        def __init__(self, value):
            self.simple = simplify_rdf(value)
            # NOTE(review): presumably resources keep the node as their
            # link target and everything else gets no link -- confirm
            # against the template.
            self.url = value if value.is_resource() else None

    template = loader.get_template('rdf_report.html')
    results = []
    for row in result_stream:
        # Keep the column order the query produced.
        wrapped_row = collections.OrderedDict(
            (name, Simplified(node)) for name, node in row.items())
        results.append(wrapped_row)
    context = Context({'results': results,})
    print(template.render(context))
def blankOrUri(value=None):
    """Return a blank node for None or a resource node for strings.

    An existing RDF.Node is handed back unchanged.

    :param value: None, a URI string, or an RDF.Node
    :returns: an RDF.Node, or None for any other kind of value
    """
    if value is None:
        # NOTE(review): an argument-less RDF.Node() is assumed to create
        # the blank node promised by the summary line -- confirm.
        return RDF.Node()
    if isinstance(value, six.string_types):
        return RDF.Node(uri_string=value)
    if isinstance(value, RDF.Node):
        return value
    return None
def toTypedNode(value, language="en"):
    """Convert a python variable to a RDF Node with its closest xsd type

    bool, int, float and datetime values become typed literals; anything
    else becomes a plain literal tagged with ``language``.

    :param value: python value to convert
    :param language: language tag used for untyped literals only
    :returns: an RDF.Node literal
    """
    datatype = None
    if isinstance(value, bool):
        # bool has to be tested before int: bool is a subclass of int.
        datatype = xsdNS['boolean'].uri
        # NOTE(review): '1'/'0' spelling assumed; fromTypedNode accepts it.
        literal = u'1' if value else u'0'
    elif isinstance(value, int):
        datatype = xsdNS['decimal'].uri
        literal = str(value)
    elif isinstance(value, float):
        datatype = xsdNS['float'].uri
        literal = str(value)
    elif isinstance(value, datetime):
        datatype = xsdNS['dateTime'].uri
        # Only emit fractional seconds when there are any.
        pattern = ISOFORMAT_SHORT if value.microsecond == 0 else ISOFORMAT_MS
        literal = value.strftime(pattern)
    elif six.PY3:
        # NOTE(review): Python 3 guard assumed; ``unicode`` below only
        # exists on Python 2.
        literal = str(value)
    else:
        literal = unicode(value).encode('utf-8')

    if datatype is None:
        return RDF.Node(literal=literal, language=language)
    return RDF.Node(literal=literal, datatype=datatype)
def fromTypedNode(node):
    """Convert a typed RDF Node to its closest python equivalent

    Values that are not RDF.Node instances, and resource nodes, are
    returned unchanged.  Literals are converted according to the base
    name of their xsd datatype; unrecognized types yield the literal
    string.

    :param node: value to convert
    :returns: bool, int, float, datetime, string, or the input itself
    :raises ValueError: for an unrecognized boolean spelling, or a
        number/dateTime literal that cannot be parsed
    """
    if not isinstance(node, RDF.Node):
        return node
    if node.is_resource():
        return node

    value_type = get_node_type(node)
    literal = node.literal_value['string']

    if value_type == 'boolean':
        literal_lower = literal.lower()
        if literal_lower in ('1', 'yes', 'true'):
            return True
        if literal_lower in ('0', 'no', 'false'):
            return False
        raise ValueError("Unrecognized boolean %s" % (literal,))
    if value_type == 'integer':
        return int(literal)
    if value_type == 'decimal' and '.' not in literal:
        # toTypedNode stores python ints as xsd:decimal; map them back.
        return int(literal)
    if value_type in ('decimal', 'float', 'double'):
        return float(literal)
    # Compare with ==: ``value_type in ('dateTime')`` is a substring test
    # against the string 'dateTime', so 'date' and 'time' matched too.
    if value_type == 'dateTime':
        try:
            return datetime.strptime(literal, ISOFORMAT_MS)
        except ValueError:
            # No fractional seconds present; retry with the short form.
            return datetime.strptime(literal, ISOFORMAT_SHORT)
    # 'string' and every unrecognized datatype: hand back the raw text.
    return literal
def get_node_type(node):
    """Return just the base name of a XSD datatype:
    e.g. http://www.w3.org/2001/XMLSchema#integer -> integer

    :param node: literal RDF.Node to inspect
    :returns: the datatype URI with the xsd namespace removed
    """
    datatype = node.literal_value['datatype']
    if datatype is None:
        # NOTE(review): untyped literals are assumed to be reported as
        # 'string', which is the name the callers in this module test for.
        return "string"
    # chop off xml schema declaration
    xsd_prefix = str(xsdNS[''].uri)
    return str(datatype).replace(xsd_prefix, '')
def simplify_rdf(value):
    """Return a short name for a RDF object
    e.g. The last part of a URI or an untyped string.

    :param value: RDF.Node, RDF.Uri, or any other value
    :returns: the short name as a string
    """
    if isinstance(value, RDF.Node):
        if value.is_resource():
            name = simplify_uri(str(value.uri))
        elif value.is_blank():
            # NOTE(review): placeholder text for blank nodes is assumed.
            name = '<BLANK>'
        else:
            name = value.literal_value['string']
    elif isinstance(value, RDF.Uri):
        # The visible module defines simplify_uri but no split_uri, so
        # shorten Uri objects the same way resource nodes are shortened.
        name = simplify_uri(str(value))
    else:
        name = value
    return str(name)
def simplify_uri(uri):
    """Split off the end of a uri

    The query string wins over the fragment, which wins over the last
    non-empty path element.

    >>> simplify_uri('http://asdf.org/foo/bar')
    'bar'
    >>> simplify_uri('http://asdf.org/foo/bar#bleem')
    'bleem'
    >>> simplify_uri('http://asdf.org/foo/bar/')
    'bar'
    >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
    'was=foo'

    :param uri: URI string, RDF.Uri, or resource RDF.Node
    :raises ValueError: for a non-resource node, or when the URI has no
        usable query, fragment or path element
    """
    if isinstance(uri, RDF.Node):
        if not uri.is_resource():
            raise ValueError("Can't simplify an RDF literal")
        uri = uri.uri
    if isinstance(uri, RDF.Uri):
        uri = str(uri)

    parsed = urllib.parse.urlparse(uri)
    if parsed.query:
        return parsed.query
    if parsed.fragment:
        return parsed.fragment
    if parsed.path:
        # Walk backwards so a trailing '/' does not yield an empty name.
        for element in reversed(parsed.path.split('/')):
            if element:
                return element
    raise ValueError("Unable to simplify %s" % (uri,))
def strip_namespace(namespace, term):
    """Remove the namespace portion of a term

    returns None if they aren't in common

    :param namespace: namespace object exposing its URI text as
        ``_prefix``
    :param term: RDF.Uri or resource RDF.Node to shorten
    :returns: the part of the term after the namespace prefix, or None
    :raises ValueError: if ``term`` is neither a resource node nor a Uri
    """
    if isinstance(term, RDF.Node):
        if not term.is_resource():
            raise ValueError("This works on resources")
        term = term.uri
    elif not isinstance(term, RDF.Uri):
        raise ValueError("This works on resources")

    term_s = str(term)
    prefix = namespace._prefix
    if not term_s.startswith(prefix):
        return None
    # Slice instead of str.replace(): only the leading prefix may go,
    # not a later repetition of the same text inside the term.
    return term_s[len(prefix):]
def get_model(model_name=None, directory=None, use_contexts=True):
    """Create an RDF model on memory or on-disk hash storage.

    :param model_name: name of the hash storage; None selects an
        in-memory model
    :param directory: directory for the hash storage files, defaulting
        to the current working directory
    :param use_contexts: whether the storage is created with contexts
    :returns: an RDF.Model
    """
    if directory is None:
        directory = os.getcwd()

    contexts = 'yes' if use_contexts else 'no'

    if model_name is not None:
        options = "contexts='{0}',hash-type='bdb',dir='{1}'".format(contexts, directory)
        storage = RDF.HashStorage(model_name, options=options)
        logger.info("Using {0} with options {1}".format(model_name, options))
    else:
        storage = RDF.MemoryStorage(options_string="contexts='{}'".format(contexts))
        logger.info("Using RDF Memory model")
    return RDF.Model(storage)
def load_into_model(model, parser_name, path, ns=None):
    """Parse an RDF document and add its statements to a model.

    :param model: model the parsed statements are added to
    :param parser_name: parser name handed to RDF.Parser; None or
        'guess' picks one from the file extension of ``path``
    :param path: file path, URL string, or resource RDF.Node to load
    :param ns: base namespace, as a string or RDF.Uri
    :raises ValueError: if ``path`` is a node that is not a resource
    """
    if isinstance(ns, six.string_types):
        ns = RDF.Uri(ns)

    if isinstance(path, RDF.Node):
        if not path.is_resource():
            raise ValueError("url to load can't be a RDF literal")
        path = str(path.uri)

    url_parts = list(urllib.parse.urlparse(path))
    if len(url_parts[0]) == 0 or url_parts[0] == 'file':
        # Bare paths and file: URLs are normalized to absolute file URLs.
        url_parts[0] = 'file'
        url_parts[2] = os.path.abspath(url_parts[2])
    if parser_name is None or parser_name == 'guess':
        parser_name = guess_parser_by_extension(path)
    url = urllib.parse.urlunparse(url_parts)
    logger.info("Opening {0} with parser {1}".format(url, parser_name))

    rdf_parser = RDF.Parser(name=parser_name)

    # NOTE(review): the number of attempts (3) is assumed -- confirm.
    statements = []
    retries = 3
    succeeded = False
    while retries > 0:
        retries -= 1
        try:
            statements = rdf_parser.parse_as_stream(url, ns)
        except RDF.RedlandError as e:
            errmsg = "RDF.RedlandError: {0} {1} tries remaining"
            logger.error(errmsg.format(str(e), retries))
        else:
            retries = 0
            succeeded = True

    if not succeeded:
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning("Unable to download %s", url)

    for s in statements:
        conditionally_add_statement(model, s, ns)
def load_string_into_model(model, parser_name, data, ns=None):
    """Parse RDF held in a string and add its statements to a model.

    :param model: model the parsed statements are added to
    :param parser_name: parser name handed to RDF.Parser
    :param data: the serialized RDF text
    :param ns: base namespace; normalized by fixup_namespace()
    """
    base_uri = fixup_namespace(ns)
    message = "load_string_into_model parser={0}, len={1}".format(
        parser_name, len(data))
    logger.debug(message)
    string_parser = RDF.Parser(name=str(parser_name))

    for statement in string_parser.parse_string_as_stream(data, base_uri):
        conditionally_add_statement(model, statement, base_uri)
def fixup_namespace(ns):
    """Normalize a namespace argument to an RDF.Uri.

    :param ns: None, a URI string, or an RDF.Uri
    :returns: an RDF.Uri; None maps to "http://localhost/"
    :raises ValueError: for any other type
    """
    if ns is None:
        return RDF.Uri("http://localhost/")
    if isinstance(ns, six.string_types):
        return RDF.Uri(ns)
    if isinstance(ns, RDF.Uri):
        return ns
    errmsg = "Namespace should be string or uri not {0}"
    raise ValueError(errmsg.format(str(type(ns))))
def conditionally_add_statement(model, s, ns):
    """Add a statement to a model, following imports and cleaning text.

    A statement whose predicate is owl:imports causes its object to be
    loaded into the model as well.  Literal objects whose type is
    reported as 'string' are passed through sanitize_literal() before
    the statement is added.

    :param model: model receiving the statement
    :param s: the RDF statement to add
    :param ns: base namespace forwarded to load_into_model()
    """
    if s.predicate == owlNS['imports']:
        # NOTE(review): the import target is assumed to be the string
        # form of the statement's object -- confirm.
        obj = str(s.object)
        logger.info("Importing %s" % (obj,))
        load_into_model(model, None, obj, ns)
    target = s.object
    if target.is_literal() and get_node_type(target) == 'string':
        s.object = sanitize_literal(target)
    model.add_statement(s)
def add_default_schemas(model, schema_path=None):
    """Add default schemas to a model
    Looks for turtle files in either htsworkflow/util/schemas
    or in the list of directories provided in schema_path

    :param model: model the schema statements are added to
    :param schema_path: optional directory name, or list of directory
        names, searched for additional ``*.turtle`` files
    """
    schemas = resource_listdir(__name__, 'schemas')
    for s in schemas:
        schema = resource_string(__name__, 'schemas/' + s)
        # files must be encoded utf-8
        schema = schema.decode('utf-8')
        namespace = 'file://localhost/htsworkflow/schemas/'+s
        add_schema(model, schema, namespace)

    if schema_path:
        # isinstance with six works on Python 2 and 3; types.StringTypes
        # only exists on Python 2.
        if isinstance(schema_path, six.string_types):
            schema_path = [schema_path]

        for path in schema_path:
            for pathname in glob(os.path.join(path, '*.turtle')):
                url = 'file://' + os.path.splitext(pathname)[0]
                # add_schema parses a string, so hand it the file's text
                # (not the file object) and close the file afterwards.
                with open(pathname, 'rt') as stream:
                    add_schema(model, stream.read(), url)
def add_schema(model, schema, url):
    """Add a schema to a model.

    Main difference from 'load_into_model' is it tags every statement
    with the SCHEMAS_URL context so the schemas can be removed later.
    Statements that fail to append are logged and skipped.

    :param model: model the schema statements are appended to
    :param schema: turtle text of the schema
    :param url: base URI used while parsing
    """
    schema_context = RDF.Node(RDF.Uri(SCHEMAS_URL))
    turtle_parser = RDF.Parser(name='turtle')
    for statement in turtle_parser.parse_string_as_stream(schema, url):
        try:
            model.append(statement, schema_context)
        except RDF.RedlandError as err:
            logger.error("%s with %s", str(err), str(statement))
def remove_schemas(model):
    """Remove statements labeled with our schema context

    :param model: model to strip of statements in the SCHEMAS_URL context
    """
    schema_uri = RDF.Uri(SCHEMAS_URL)
    model.context_remove_statements(RDF.Node(schema_uri))
def sanitize_literal(node):
    """Clean up a literal string

    The literal text is run through lxml's html Cleaner and a new
    literal node is built from the result, keeping the original
    datatype and language when they are set.

    :param node: literal RDF.Node to clean
    :returns: a new RDF.Node holding the cleaned text
    :raises ValueError: if ``node`` is not an RDF.Node
    """
    if not isinstance(node, RDF.Node):
        raise ValueError("sanitize_literal only works on RDF.Nodes")

    literal = node.literal_value
    raw = literal['string']
    cleaned = ''
    if len(raw) > 0:
        element = lxml.html.fromstring(raw)
        cleaner = lxml.html.clean.Cleaner(page_structure=False)
        element = cleaner.clean_html(element)
        if six.PY3:
            text = lxml.html.tostring(element, encoding=str)
        else:
            text = lxml.html.tostring(element)
        # NOTE(review): the slice offsets are assumed to be the lengths
        # of a wrapping '<p>' and '</p>' pair -- confirm.
        p_len = len('<p>')
        slash_p_len = len('</p>')
        cleaned = text[p_len:-slash_p_len]

    args = {'literal': cleaned}
    for key in ('datatype', 'language'):
        if literal[key] is not None:
            args[key] = literal[key]
    return RDF.Node(**args)
def guess_parser(content_type, pathname):
    """Pick a parser name from a content type, or from the file name.

    :param content_type: MIME type of the document, or None
    :param pathname: name used for the extension-based fallback when the
        content type is None or 'text/plain'
    :returns: a parser name, or None for an unrecognized content type
    """
    if content_type is None or content_type == 'text/plain':
        return guess_parser_by_extension(pathname)
    # NOTE(review): the parser names are assumed to be the usual Redland
    # names -- confirm.
    parser_by_type = {
        'application/rdf+xml': 'rdfxml',
        'application/x-turtle': 'turtle',
        'text/html': 'rdfa',
    }
    return parser_by_type.get(content_type)
def guess_parser_by_extension(pathname):
    """Pick a parser name from the extension of a file name.

    :param pathname: file name or path to inspect
    :returns: the parser name for the extension
    """
    # NOTE(review): the parser names and the 'guess' fallback are
    # assumed -- confirm.
    parser_by_extension = {
        '.xml': 'rdfxml',
        '.rdf': 'rdfxml',
        '.html': 'rdfa',
        '.turtle': 'turtle',
    }
    extension = os.path.splitext(pathname)[1]
    return parser_by_extension.get(extension, 'guess')
def get_serializer(name='turtle'):
    """Return a serializer with our standard prefixes loaded

    :param name: serializer name handed to RDF.Serializer
    :returns: the configured RDF.Serializer
    """
    writer = RDF.Serializer(name=name)
    prefixes = (
        # really standard stuff
        ('rdf', rdfNS),
        ('rdfs', rdfsNS),
        ('owl', owlNS),
        ('dc', dcNS),
        ('xml', xmlNS),
        ('xsd', xsdNS),
        ('vs', vsNS),
        ('wot', wotNS),
        # should these be here, kind of specific to an application
        ('htswlib', libraryOntology),
        ('ucscSubmission', submissionOntology),
        ('ucscDaf', dafTermOntology),
        ('geoSoft', geoSoftNS),
        ('encode3', encode3NS),
    )
    for prefix, namespace in prefixes:
        writer.set_namespace(prefix, namespace._prefix)
    return writer
def get_turtle_header():
    """Return a turtle header with our typical namespaces

    :returns: the string produced by serializing an empty model
    """
    turtle_writer = get_serializer()
    # NOTE(review): the empty model is assumed to come from get_model()
    # with its defaults -- confirm.
    empty = get_model()
    return turtle_writer.serialize_model_to_string(empty)
def dump_model(model, destination=None):
    """Write a model, serialized by get_serializer(), to a stream.

    :param model: model to serialize
    :param destination: writable text stream; defaults to sys.stdout
    """
    out = sys.stdout if destination is None else destination
    serialized = get_serializer().serialize_model_to_string(model)
    out.write(serialized)
    out.write(os.linesep)