1 """Helper features for working with librdf
4 from datetime import datetime
6 from urlparse import urlparse, urlunparse
7 from urllib2 import urlopen
14 import lxml.html.clean
17 logger = logging.getLogger(__name__)
19 from htsworkflow.util.rdfns import *
# Placeholder ("phony") URIs under a single base.  SCHEMAS_URL is the context
# that schema statements are tagged with; INFERENCE_URL is presumably the
# matching context for inferred statements -- confirm against its users.
_PHONY_BASE = 'http://jumpgate.caltech.edu/phony/'
SCHEMAS_URL = _PHONY_BASE + 'schemas'
INFERENCE_URL = _PHONY_BASE + 'inference'

# strftime/strptime patterns for ISO date-times, without and with
# fractional seconds.
ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
ISOFORMAT_MS = ISOFORMAT_SHORT + ".%f"
def sparql_query(model, query_filename, output_format='text'):
    """Execute sparql query from file

    The query text is read from ``query_filename``, executed against
    ``model`` and the results are printed.

    :param model: RDF model the query is executed against.
    :param query_filename: path of the file holding the SPARQL query text.
    :param output_format: 'html' prints the results via
        html_query_results(); any other value uses display_query_results().
    """
    logger.info("Opening: %s" % (query_filename,))
    # Read through a context manager so the file handle is closed right away
    # instead of whenever the garbage collector gets to it.
    with open(query_filename, 'r') as query_file:
        query_body = query_file.read()
    query = RDF.SPARQLQuery(query_body)
    results = query.execute(model)
    if output_format == 'html':
        html_query_results(results)
    else:
        display_query_results(results)
def display_query_results(results):
    """A very simple display of sparql query results showing name value pairs

    Every binding of every row is printed on its own line as
    ``name: value``, in the reverse of the order ``row.items()`` yields them.

    :param results: iterable of rows, each offering a dict-style ``items()``.
    """
    for row in results:
        # items() can only be sliced when it returns a list; copying it into
        # a list first keeps the reversal working for dictionary views too.
        for k, v in reversed(list(row.items())):
            # A single parenthesised argument prints identically whether
            # print is a statement or a function.
            print("{0}: {1}".format(k, v))
# Render SPARQL query results with the Django template 'rdf_report.html' and
# print the rendered text.
# NOTE(review): some short lines of this function are not visible in the
# excerpt under review (the bodies following `if value.is_resource():` and
# the statement that creates `results`) -- confirm against the full file.
48 def html_query_results(result_stream):
# Django is imported inside the function rather than at module level.
49 from django.conf import settings
50 from django.template import Context, loader
52 # I did this because I couldn't figure out how to
53 # get simplify_rdf into the django template as a filter
# Wrapper exposing `simple`, the simplify_rdf() short name of a value, as an
# attribute the template can read.
54 class Simplified(object):
55 def __init__(self, value):
56 self.simple = simplify_rdf(value)
57 if value.is_resource():
62 template = loader.get_template('rdf_report.html')
# Copy each row into an OrderedDict of Simplified values, keeping the order
# in which row.items() yields the bindings.
64 for row in result_stream:
65 new_row = collections.OrderedDict()
67 for k,v in row.items():
68 new_row[k] = Simplified(v)
69 results.append(new_row)
# The template receives the converted rows under the name 'results'.
70 context = Context({'results': results,})
71 print template.render(context)
# Turn a loosely typed argument into an RDF node.
# NOTE(review): the lines handling None, the body of the RDF.Node branch and
# the final return are not visible in this excerpt -- confirm against the
# full file.
73 def blankOrUri(value=None):
74 """Return a blank node for None or a resource node for strings.
# A value whose type is one of types.StringTypes is treated as a URI string
# and becomes a resource node.
79 elif type(value) in types.StringTypes:
80 node = RDF.Node(uri_string=value)
# Values that are already RDF.Node instances get their own branch.
81 elif isinstance(value, RDF.Node):
# Wrap a Python value in an RDF literal node carrying the closest XSD
# datatype: bool -> xsd:boolean, int/long -> xsd:decimal,
# float -> xsd:float, datetime -> xsd:dateTime.
# NOTE(review): several short lines are not visible in this excerpt (how a
# boolean is rendered as text, the `else:` lines and the final return) --
# confirm against the full file before relying on the notes below.
87 def toTypedNode(value):
88 """Convert a python variable to a RDF Node with its closest xsd type
90 if type(value) == types.BooleanType:
91 value_type = xsdNS['boolean'].uri
# Integers are tagged xsd:decimal rather than xsd:integer.
96 elif type(value) in (types.IntType, types.LongType):
97 value_type = xsdNS['decimal'].uri
98 value = unicode(value)
99 elif type(value) == types.FloatType:
100 value_type = xsdNS['float'].uri
101 value = unicode(value)
102 elif isinstance(value, datetime):
103 value_type = xsdNS['dateTime'].uri
# Whole-second datetimes are written with the short pattern; the pattern
# with fractional seconds is presumably the alternative branch.
104 if value.microsecond == 0:
105 value = value.strftime(ISOFORMAT_SHORT)
107 value = value.strftime(ISOFORMAT_MS)
# Presumably the fallback for every other type, where no datatype is
# assigned -- confirm.
110 value = unicode(value)
# A typed literal is built when a datatype was chosen; the other constructor
# call produces a literal without datatype from the UTF-8 encoded text.
112 if value_type is not None:
113 node = RDF.Node(literal=value, datatype=value_type)
115 node = RDF.Node(literal=unicode(value).encode('utf-8'))
def fromTypedNode(node):
    """Convert a typed RDF Node to its closest python equivalent

    The conversion is selected by the datatype name that get_node_type()
    reports for the node:

    * boolean -> bool; '1'/'yes'/'true' and '0'/'no'/'false' are accepted
      in any letter case
    * integer, or decimal written without a '.' -> int
    * decimal, float, double -> float
    * dateTime -> datetime, parsed with or without fractional seconds
    * anything else, including string -> the literal text unchanged

    :param node: RDF literal node, or None.
    :returns: the converted value, or None when ``node`` is None.
    :raises ValueError: when the literal text cannot be parsed as the type
        its datatype announces.
    """
    if node is None:
        return None

    value_type = get_node_type(node)
    literal = node.literal_value['string']
    literal_lower = literal.lower()

    if value_type == 'boolean':
        if literal_lower in ('1', 'yes', 'true'):
            return True
        if literal_lower in ('0', 'no', 'false'):
            return False
        raise ValueError("Unrecognized boolean %s" % (literal,))
    if value_type == 'integer':
        return int(literal)
    if value_type == 'decimal' and literal.find('.') == -1:
        # A decimal without a fractional part is returned as an int.
        return int(literal)
    if value_type in ('decimal', 'float', 'double'):
        return float(literal)
    # Compare with ==: ('dateTime') is just a parenthesised string, so `in`
    # performed a substring test and also matched 'date' and 'time', which
    # then failed inside strptime().
    if value_type == 'dateTime':
        try:
            return datetime.strptime(literal, ISOFORMAT_MS)
        except ValueError:
            # No fractional seconds present; retry with the short pattern.
            return datetime.strptime(literal, ISOFORMAT_SHORT)
    # NOTE(review): strings and unrecognised datatypes are assumed to pass
    # through as plain text; the reviewed excerpt did not show the final
    # fallback -- confirm against the full file.
    return literal
# Reduce a literal node's datatype URI to its bare XSD type name.
# NOTE(review): the statement executed when the node has no datatype is not
# visible in this excerpt, so the result for untyped literals cannot be
# stated here -- confirm against the full file.
152 def get_node_type(node):
153 """Return just the base name of a XSD datatype:
154 e.g. http://www.w3.org/2001/XMLSchema#integer -> integer
156 # chop off xml schema declaration
157 value_type = node.literal_value['datatype']
158 if value_type is None:
# The datatype is converted to str and every occurrence of the XSD namespace
# URI is removed from it, leaving the type's local name.
161 value_type = str(value_type)
162 return value_type.replace(str(xsdNS[''].uri), '')
# Produce a short display name for an RDF.Node or RDF.Uri.
# NOTE(review): the name used for blank nodes, the handling of values that
# are neither RDF.Node nor RDF.Uri, and the return statement are not visible
# in this excerpt -- confirm against the full file.
165 def simplify_rdf(value):
166 """Return a short name for a RDF object
167 e.g. The last part of a URI or an untyped string.
169 if isinstance(value, RDF.Node):
# Resource nodes are shortened through simplify_uri().
170 if value.is_resource():
171 name = simplify_uri(str(value.uri))
172 elif value.is_blank():
# Remaining nodes use the string part of their literal value.
175 name = value.literal_value['string']
176 elif isinstance(value, RDF.Uri):
# NOTE(review): `split_uri` is not defined in the visible part of this file,
# while the resource-node branch above calls simplify_uri(); verify the name
# exists, otherwise this branch raises NameError.
177 name = split_uri(str(value))
# Extract a short name from a URI given as a string, RDF.Uri or RDF.Node.
# NOTE(review): the doctest outputs, the conversions applied to RDF.Node and
# RDF.Uri arguments, the value returned for a URI with a query string and
# the body of the path loop are not visible in this excerpt -- confirm
# against the full file.
183 def simplify_uri(uri):
184 """Split off the end of a uri
186 >>> simplify_uri('http://asdf.org/foo/bar')
188 >>> simplify_uri('http://asdf.org/foo/bar#bleem')
190 >>> simplify_uri('http://asdf.org/foo/bar/')
192 >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
# RDF nodes have a resource branch; the ValueError below is presumably
# raised for the non-resource case.
195 if isinstance(uri, RDF.Node):
196 if uri.is_resource():
199 raise ValueError("Can't simplify an RDF literal")
200 if isinstance(uri, RDF.Uri):
# The parsed URL is examined in a fixed order: query first, then fragment,
# then the path elements scanned from the last one backwards.
203 parsed = urlparse(uri)
204 if len(parsed.query) > 0:
206 elif len(parsed.fragment) > 0:
207 return parsed.fragment
208 elif len(parsed.path) > 0:
209 for element in reversed(parsed.path.split('/')):
# Reached when none of the checks above produced a name.
212 raise ValueError("Unable to simplify %s" % (uri,))
def stripNamespace(namespace, term):
    """Remove the namespace portion of a term

    returns None if they aren't in common

    :param namespace: object whose ``_prefix`` attribute holds the namespace
        URI as a string.
    :param term: an RDF.Uri, or an RDF.Node that is a resource.
    :returns: the part of the term's URI after the namespace prefix, or
        None when the URI does not start with that prefix.
    :raises ValueError: if ``term`` is neither an RDF.Uri nor a resource
        node.
    """
    if isinstance(term, RDF.Node):
        if not term.is_resource():
            raise ValueError("This works on resources")
        term = term.uri
    elif not isinstance(term, RDF.Uri):
        raise ValueError("This works on resources")

    term_s = str(term)
    prefix = namespace._prefix
    if not term_s.startswith(prefix):
        return None
    # Slice the prefix off instead of using str.replace(), which would also
    # delete any later occurrence of the prefix text inside the term.
    return term_s[len(prefix):]
def get_model(model_name=None, directory=None):
    """Create an RDF model on top of a memory or BDB hash storage.

    :param model_name: name passed to the hash storage; None selects a
        memory-only storage instead.
    :param directory: directory handed to the hash storage through its
        ``dir`` option; defaults to the current working directory.
    :returns: an RDF.Model wrapping the chosen storage.
    """
    storage_dir = os.getcwd() if directory is None else directory

    if model_name is not None:
        options = "contexts='yes',hash-type='bdb',dir='{0}'".format(storage_dir)
        storage = RDF.HashStorage(model_name, options=options)
        logger.info("Using {0} with options {1}".format(model_name, options))
    else:
        storage = RDF.MemoryStorage(options_string="contexts='yes'")
        logger.info("Using RDF Memory model")

    return RDF.Model(storage)
# Parse the RDF found at `path` and pass every statement to
# conditionally_add_statement() together with `model` and `ns`.
# NOTE(review): many short lines of this function are not visible in this
# excerpt (the namespace conversion, the RDF.Node handling, the retry loop
# around parsing and the loop over the statements) -- confirm the control
# flow against the full file.
248 def load_into_model(model, parser_name, path, ns=None):
249 if type(ns) in types.StringTypes:
# `path` may also be an RDF.Node; the ValueError below is presumably raised
# when that node is not a resource.
252 if isinstance(path, RDF.Node):
253 if path.is_resource():
256 raise ValueError("url to load can't be a RDF literal")
# Index 0 of the parsed URL is the scheme and index 2 the path: a missing or
# 'file' scheme is forced to 'file' and the path is made absolute.
258 url_parts = list(urlparse(path))
259 if len(url_parts[0]) == 0 or url_parts[0] == 'file':
260 url_parts[0] = 'file'
261 url_parts[2] = os.path.abspath(url_parts[2])
# A parser name of None or 'guess' is replaced by one guessed from the
# extension of `path`.
262 if parser_name is None or parser_name == 'guess':
263 parser_name = guess_parser_by_extension(path)
264 url = urlunparse(url_parts)
265 logger.info("Opening {0} with parser {1}".format(url, parser_name))
267 rdf_parser = RDF.Parser(name=parser_name)
274 statements = rdf_parser.parse_as_stream(url, ns)
# Parse failures are logged together with the value of `retries`; where
# `retries` is set and decremented is not visible here.
276 except RDF.RedlandError, e:
277 errmsg = "RDF.RedlandError: {0} {1} tries remaining"
278 logger.error(errmsg.format(str(e), retries))
281 conditionally_add_statement(model, s, ns)
def load_string_into_model(model, parser_name, data, ns=None):
    """Parse RDF held in a string and add its statements to a model.

    :param model: model the parsed statements are added to.
    :param parser_name: name of the RDF parser to instantiate.
    :param data: the serialized RDF text.
    :param ns: base namespace, normalised by fixup_namespace().
    """
    base_uri = fixup_namespace(ns)
    message = "load_string_into_model parser={0}, len={1}".format(
        parser_name, len(data))
    logger.debug(message)

    stream = RDF.Parser(name=parser_name).parse_string_as_stream(data, base_uri)
    for statement in stream:
        conditionally_add_statement(model, statement, base_uri)
def fixup_namespace(ns):
    """Normalise a base namespace argument to an RDF.Uri.

    :param ns: None, a string, or an RDF.Uri.
    :returns: ``RDF.Uri("http://localhost/")`` when ``ns`` is None, an
        RDF.Uri built from ``ns`` when it is a string, and ``ns`` itself
        when it already is an RDF.Uri.
    :raises ValueError: for any other type.
    """
    if ns is None:
        return RDF.Uri("http://localhost/")
    # isinstance() also accepts subclasses of the string types, which the
    # exact type() lookup used to reject with a ValueError.
    if isinstance(ns, types.StringTypes):
        return RDF.Uri(ns)
    if not isinstance(ns, RDF.Uri):
        errmsg = "Namespace should be string or uri not {0}"
        raise ValueError(errmsg.format(str(type(ns))))
    return ns
# Add one statement to the model, first loading anything it imports and
# sanitizing string literals.
# NOTE(review): the line that derives `obj` from the statement is not
# visible in this excerpt -- confirm against the full file.
304 def conditionally_add_statement(model, s, ns):
305 imports = owlNS['imports']
# A statement whose predicate is owl:imports triggers loading of the
# imported document; the parser name is passed as None.
# NOTE(review): no guard against documents importing each other is visible
# in this excerpt.
306 if s.predicate == imports:
308 logger.info("Importing %s" % (obj,))
309 load_into_model(model, None, obj, ns)
# Literal objects whose node type is 'string' are replaced by a sanitized
# copy before the statement is stored.
310 if s.object.is_literal():
311 value_type = get_node_type(s.object)
312 if value_type == 'string':
313 s.object = sanitize_literal(s.object)
314 model.add_statement(s)
def add_default_schemas(model, schema_path=None):
    """Load every ``*.turtle`` schema file found in the schema directories.

    With no ``schema_path`` the ``schemas`` directory next to this module
    is searched; a single string is treated as one directory; anything
    else is iterated as a collection of directories.

    :param model: model the schemas are added to, via add_schema().
    :param schema_path: None, one directory name, or an iterable of them.
    """
    if schema_path is None:
        here = os.path.split(__file__)[0]
        search_dirs = [os.path.join(here, 'schemas')]
    elif type(schema_path) in types.StringTypes:
        search_dirs = [schema_path]
    else:
        search_dirs = schema_path

    for directory in search_dirs:
        pattern = os.path.join(directory, '*.turtle')
        for turtle_file in glob(pattern):
            add_schema(model, turtle_file)
def add_schema(model, filename):
    """Load a turtle schema file into a model under the schema context.

    Unlike 'load_into_model', every statement is appended with the
    SCHEMAS_URL context node, so remove_schemas() can drop them again.

    :param model: model the statements are appended to.
    :param filename: path of the turtle file; it is prefixed with
        'file://' to form the URL that gets parsed.
    """
    turtle_parser = RDF.Parser(name='turtle')
    schema_context = RDF.Node(RDF.Uri(SCHEMAS_URL))
    schema_url = 'file://' + filename

    for statement in turtle_parser.parse_as_stream(schema_url):
        try:
            model.append(statement, schema_context)
        except RDF.RedlandError as err:
            # Log the failing statement and carry on with the next one.
            logger.error("%s with %s", str(err), str(statement))
def remove_schemas(model):
    """Drop every statement stored under the SCHEMAS_URL context."""
    schema_uri = RDF.Uri(SCHEMAS_URL)
    model.context_remove_statements(RDF.Node(schema_uri))
# Build a new RDF.Node from a literal node after passing its text through
# lxml's HTML Cleaner; datatype and language are copied over when present.
# Raises ValueError when the argument is not an RDF.Node.
# NOTE(review): the condition choosing between the cleaned text and the
# empty literal, and the definitions of `p_len` and `slash_p_len`, are not
# visible in this excerpt -- confirm against the full file.
355 def sanitize_literal(node):
356 """Clean up a literal string
358 if not isinstance(node, RDF.Node):
359 raise ValueError("sanitize_literal only works on RDF.Nodes")
361 s = node.literal_value['string']
# Parse the text as HTML, clean it without touching page structure, and
# serialize it again.
363 element = lxml.html.fromstring(s)
364 cleaner = lxml.html.clean.Cleaner(page_structure=False)
365 element = cleaner.clean_html(element)
366 text = lxml.html.tostring(element)
# The serialized text is trimmed at both ends; presumably this strips a
# wrapper element around the cleaned content -- confirm.
370 args = {'literal': text[p_len:-slash_p_len]}
372 args = {'literal': ''}
# Datatype and language are only forwarded when the source node has them.
373 datatype = node.literal_value['datatype']
374 if datatype is not None:
375 args['datatype'] = datatype
376 language = node.literal_value['language']
377 if language is not None:
378 args['language'] = language
379 return RDF.Node(**args)
# Choose an RDF parser from a MIME content type, falling back to the file
# extension when the type is missing or text/plain.
# NOTE(review): the parser names returned for the rdf+xml, turtle and html
# content types are not visible in this excerpt, and no branch is visible
# for any other content type -- confirm against the full file.
382 def guess_parser(content_type, pathname):
383 if content_type in ('application/rdf+xml',):
385 elif content_type in ('application/x-turtle',):
387 elif content_type in ('text/html',):
# No usable content type: decide from the extension of the path instead.
389 elif content_type is None or content_type in ('text/plain',):
390 return guess_parser_by_extension(pathname)
# Choose an RDF parser from the extension of a path; '.xml'/'.rdf', '.html'
# and '.turtle' each have their own branch.
# NOTE(review): the values returned by the branches, and the result for any
# other extension, are not visible in this excerpt -- confirm against the
# full file.
392 def guess_parser_by_extension(pathname):
# The extension is compared exactly as os.path.splitext() returns it, so the
# match is case-sensitive.
393 _, ext = os.path.splitext(pathname)
394 if ext in ('.xml', '.rdf'):
396 elif ext in ('.html',):
398 elif ext in ('.turtle',):
def get_serializer(name='turtle'):
    """Build an RDF serializer that knows our usual namespace prefixes.

    :param name: serializer syntax name handed to RDF.Serializer.
    :returns: the serializer with all prefixes registered.
    """
    writer = RDF.Serializer(name=name)
    prefixes = (
        # really standard stuff
        ('rdf', rdfNS),
        ('rdfs', rdfsNS),
        ('owl', owlNS),
        ('dc', dcNS),
        ('xml', xmlNS),
        ('xsd', xsdNS),
        ('vs', vsNS),
        ('wot', wotNS),
        # should these be here, kind of specific to an application
        ('libraryOntology', libraryOntology),
        ('ucscSubmission', submissionOntology),
        ('ucscDaf', dafTermOntology),
    )
    for short_name, namespace in prefixes:
        writer.set_namespace(short_name, namespace._prefix)
    return writer
def dump_model(model, destination=None):
    """Write a model, serialized by get_serializer(), to a file-like object.

    :param model: model to serialize.
    :param destination: object with a ``write`` method; sys.stdout is used
        when None.  The serialized text is followed by ``os.linesep``.
    """
    out = sys.stdout if destination is None else destination
    serializer = get_serializer()
    out.write(serializer.serialize_model_to_string(model))
    out.write(os.linesep)