1 """Helper features for working with librdf
4 from datetime import datetime
6 from urlparse import urlparse, urlunparse
7 from urllib2 import urlopen
13 import lxml.html.clean
16 logger = logging.getLogger(__name__)
18 from htsworkflow.util.rdfns import *
20 SCHEMAS_URL='http://jumpgate.caltech.edu/phony/schemas'
21 INFERENCE_URL='http://jumpgate.caltech.edu/phony/inference'
23 ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
24 ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
def sparql_query(model, query_filename, output_format='text'):
    """Execute a SPARQL query stored in a file and print the results.

    :param model: model the query is executed against
        (passed to ``RDF.SPARQLQuery.execute``).
    :param query_filename: path of the file holding the query text.
    :param output_format: ``'html'`` sends the results to
        ``html_query_results``; any other value sends them to
        ``display_query_results``.
    """
    logger.info("Opening: %s" % (query_filename,))
    # Read through a context manager so the file handle is closed
    # deterministically rather than whenever it gets garbage collected.
    with open(query_filename, 'r') as query_file:
        query_body = query_file.read()
    query = RDF.SPARQLQuery(query_body)
    results = query.execute(model)
    if output_format == 'html':
        html_query_results(results)
    else:
        display_query_results(results)
39 def display_query_results(results):
40 """A very simple display of sparql query results showing name value pairs
43 for k, v in row.items()[::-1]:
44 print "{0}: {1}".format(k, v)
47 def html_query_results(result_stream):
48 from django.conf import settings
49 from django.template import Context, loader
51 # I did this because I couldn't figure out how to
52 # get simplify_rdf into the django template as a filter
53 class Simplified(object):
54 def __init__(self, value):
55 self.simple = simplify_rdf(value)
56 if value.is_resource():
61 template = loader.get_template('rdf_report.html')
63 for row in result_stream:
64 new_row = collections.OrderedDict()
66 for k,v in row.items():
67 new_row[k] = Simplified(v)
68 results.append(new_row)
69 context = Context({'results': results,})
70 print template.render(context)
72 def blankOrUri(value=None):
73 """Return a blank node for None or a resource node for strings.
78 elif type(value) in types.StringTypes:
79 node = RDF.Node(uri_string=value)
80 elif isinstance(value, RDF.Node):
86 def toTypedNode(value):
87 """Convert a python variable to a RDF Node with its closest xsd type
89 if type(value) == types.BooleanType:
90 value_type = xsdNS['boolean'].uri
95 elif type(value) in (types.IntType, types.LongType):
96 value_type = xsdNS['decimal'].uri
97 value = unicode(value)
98 elif type(value) == types.FloatType:
99 value_type = xsdNS['float'].uri
100 value = unicode(value)
101 elif isinstance(value, datetime):
102 value_type = xsdNS['dateTime'].uri
103 if value.microsecond == 0:
104 value = value.strftime(ISOFORMAT_SHORT)
106 value = value.strftime(ISOFORMAT_MS)
109 value = unicode(value)
111 if value_type is not None:
112 node = RDF.Node(literal=value, datatype=value_type)
114 node = RDF.Node(literal=unicode(value).encode('utf-8'))
118 def fromTypedNode(node):
119 """Convert a typed RDF Node to its closest python equivalent
124 value_type = get_node_type(node)
125 literal = node.literal_value['string']
126 literal_lower = literal.lower()
128 if value_type == 'boolean':
129 if literal_lower in ('1', 'yes', 'true'):
131 elif literal_lower in ('0', 'no', 'false'):
134 raise ValueError("Unrecognized boolean %s" % (literal,))
135 elif value_type == 'integer':
137 elif value_type == 'decimal' and literal.find('.') == -1:
139 elif value_type in ('decimal', 'float', 'double'):
140 return float(literal)
141 elif value_type in ('string'):
143 elif value_type in ('dateTime'):
145 return datetime.strptime(literal, ISOFORMAT_MS)
146 except ValueError, _:
147 return datetime.strptime(literal, ISOFORMAT_SHORT)
151 def get_node_type(node):
152 """Return just the base name of a XSD datatype:
153 e.g. http://www.w3.org/2001/XMLSchema#integer -> integer
155 # chop off xml schema declaration
156 value_type = node.literal_value['datatype']
157 if value_type is None:
160 value_type = str(value_type)
161 return value_type.replace(str(xsdNS[''].uri), '')
164 def simplify_rdf(value):
165 """Return a short name for a RDF object
166 e.g. The last part of a URI or an untyped string.
168 if isinstance(value, RDF.Node):
169 if value.is_resource():
170 name = simplify_uri(str(value.uri))
171 elif value.is_blank():
174 name = value.literal_value['string']
175 elif isinstance(value, RDF.Uri):
176 name = split_uri(str(value))
182 def simplify_uri(uri):
183 """Split off the end of a uri
185 >>> simplify_uri('http://asdf.org/foo/bar')
187 >>> simplify_uri('http://asdf.org/foo/bar#bleem')
189 >>> simplify_uri('http://asdf.org/foo/bar/')
191 >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
194 if isinstance(uri, RDF.Node):
195 if uri.is_resource():
198 raise ValueError("Can't simplify an RDF literal")
199 if isinstance(uri, RDF.Uri):
202 parsed = urlparse(uri)
203 if len(parsed.query) > 0:
205 elif len(parsed.fragment) > 0:
206 return parsed.fragment
207 elif len(parsed.path) > 0:
208 for element in reversed(parsed.path.split('/')):
211 raise ValueError("Unable to simplify %s" % (uri,))
213 def stripNamespace(namespace, term):
214 """Remove the namespace portion of a term
216 returns None if they aren't in common
218 if isinstance(term, RDF.Node):
219 if term.is_resource():
222 raise ValueError("This works on resources")
223 elif not isinstance(term, RDF.Uri):
224 raise ValueError("This works on resources")
226 if not term_s.startswith(namespace._prefix):
228 return term_s.replace(namespace._prefix, "")
def get_model(model_name=None, directory=None):
    """Build an RDF.Model with contexts enabled.

    :param model_name: name handed to ``RDF.HashStorage``; when None an
        ``RDF.MemoryStorage`` is used instead.
    :param directory: value for the hash storage ``dir`` option; defaults
        to the current working directory.
    :returns: the ``RDF.Model`` wrapping the chosen storage.
    """
    if directory is None:
        directory = os.getcwd()

    if model_name is not None:
        options = "contexts='yes',hash-type='bdb',dir='{0}'".format(directory)
        storage = RDF.HashStorage(model_name, options)
        logger.info("Using {0} with options {1}".format(model_name, options))
    else:
        storage = RDF.MemoryStorage(options_string="contexts='yes'")
        logger.info("Using RDF Memory model")

    return RDF.Model(storage)
247 def load_into_model(model, parser_name, path, ns=None):
248 if type(ns) in types.StringTypes:
251 if isinstance(path, RDF.Node):
252 if path.is_resource():
255 raise ValueError("url to load can't be a RDF literal")
257 url_parts = list(urlparse(path))
258 if len(url_parts[0]) == 0 or url_parts[0] == 'file':
259 url_parts[0] = 'file'
260 url_parts[2] = os.path.abspath(url_parts[2])
261 if parser_name is None or parser_name == 'guess':
262 parser_name = guess_parser_by_extension(path)
263 url = urlunparse(url_parts)
264 logger.info("Opening {0} with parser {1}".format(url, parser_name))
266 rdf_parser = RDF.Parser(name=parser_name)
273 statements = rdf_parser.parse_as_stream(url, ns)
275 except RDF.RedlandError, e:
276 errmsg = "RDF.RedlandError: {0} {1} tries remaining"
277 logger.error(errmsg.format(str(e), retries))
280 conditionally_add_statement(model, s, ns)
def load_string_into_model(model, parser_name, data, ns=None):
    """Parse RDF held in a string and add its statements to a model.

    :param model: model that receives the statements.
    :param parser_name: parser name handed to ``RDF.Parser``.
    :param data: the serialized RDF text.
    :param ns: base namespace; normalised by ``fixup_namespace``.
    """
    base_uri = fixup_namespace(ns)
    message = "load_string_into_model parser={0}, len={1}"
    logger.debug(message.format(parser_name, len(data)))
    string_parser = RDF.Parser(name=parser_name)

    statements = string_parser.parse_string_as_stream(data, base_uri)
    for statement in statements:
        conditionally_add_statement(model, statement, base_uri)
def fixup_namespace(ns):
    """Coerce a namespace argument into an RDF.Uri.

    :param ns: None, a string, or an RDF.Uri.
    :returns: ``RDF.Uri("http://localhost/")`` for None, ``RDF.Uri(ns)``
        for a string, or ns itself when it already is an RDF.Uri.
    :raises ValueError: if ns is any other type.
    """
    if ns is None:
        ns = RDF.Uri("http://localhost/")
    # isinstance (rather than an exact type() match) also accepts
    # subclasses of str/unicode instead of rejecting them below.
    elif isinstance(ns, types.StringTypes):
        ns = RDF.Uri(ns)
    elif not isinstance(ns, RDF.Uri):
        errmsg = "Namespace should be string or uri not {0}"
        raise ValueError(errmsg.format(str(type(ns))))
    return ns
303 def conditionally_add_statement(model, s, ns):
304 imports = owlNS['imports']
305 if s.predicate == imports:
307 logger.info("Importing %s" % (obj,))
308 load_into_model(model, None, obj, ns)
309 if s.object.is_literal():
310 value_type = get_node_type(s.object)
311 if value_type == 'string':
312 s.object = sanitize_literal(s.object)
313 model.add_statement(s)
def add_default_schemas(model, schema_path=None):
    """Load every ``*.turtle`` schema file found on the schema path.

    :param model: model passed on to ``add_schema`` for each file.
    :param schema_path: a directory name or a list of directory names;
        when None, the ``schemas`` directory next to this module is used.
    """
    if schema_path is None:
        module_dir = os.path.split(__file__)[0]
        search_dirs = [os.path.join(module_dir, 'schemas')]
    elif type(schema_path) in types.StringTypes:
        # a lone directory name is treated as a one element list
        search_dirs = [schema_path]
    else:
        search_dirs = schema_path

    for directory in search_dirs:
        pattern = os.path.join(directory, '*.turtle')
        for turtle_file in glob(pattern):
            add_schema(model, turtle_file)
def add_schema(model, filename):
    """Add a schema to a model.

    Main difference from 'load_into_model' is that every statement is
    appended under the SCHEMAS_URL context node, so the statements can be
    removed again later by context.

    :param model: model providing ``append(statement, context)``.
    :param filename: path to a turtle file; a relative path is resolved
        against the current working directory.
    """
    parser = RDF.Parser(name='turtle')
    context = RDF.Node(RDF.Uri(SCHEMAS_URL))
    # A file:// URL needs an absolute path: with a relative one the first
    # path component would be read as the host part of the URL.
    url = 'file://' + os.path.abspath(filename)
    for s in parser.parse_as_stream(url):
        try:
            model.append(s, context)
        except RDF.RedlandError as e:
            # Log and keep going so one bad statement does not abort the
            # rest of the schema.
            logger.error("%s with %s", str(e), str(s))
def remove_schemas(model):
    """Drop every statement stored under the SCHEMAS_URL context node."""
    schema_uri = RDF.Uri(SCHEMAS_URL)
    model.context_remove_statements(RDF.Node(schema_uri))
354 def sanitize_literal(node):
355 """Clean up a literal string
357 if not isinstance(node, RDF.Node):
358 raise ValueError("sanitize_literal only works on RDF.Nodes")
360 s = node.literal_value['string']
362 element = lxml.html.fromstring(s)
363 cleaner = lxml.html.clean.Cleaner(page_structure=False)
364 element = cleaner.clean_html(element)
365 text = lxml.html.tostring(element)
369 args = {'literal': text[p_len:-slash_p_len]}
371 args = {'literal': ''}
372 datatype = node.literal_value['datatype']
373 if datatype is not None:
374 args['datatype'] = datatype
375 language = node.literal_value['language']
376 if language is not None:
377 args['language'] = language
378 return RDF.Node(**args)
381 def guess_parser(content_type, pathname):
382 if content_type in ('application/rdf+xml',):
384 elif content_type in ('application/x-turtle',):
386 elif content_type in ('text/html',):
388 elif content_type is None or content_type in ('text/plain',):
389 return guess_parser_by_extension(pathname)
391 def guess_parser_by_extension(pathname):
392 _, ext = os.path.splitext(pathname)
393 if ext in ('.xml', '.rdf'):
395 elif ext in ('.html',):
397 elif ext in ('.turtle',):
def get_serializer(name='turtle'):
    """Create an RDF.Serializer preloaded with our usual namespace prefixes.

    :param name: serializer name handed to ``RDF.Serializer``.
    :returns: the configured serializer.
    """
    writer = RDF.Serializer(name=name)
    prefixes = (
        # really standard stuff
        ('rdf', rdfNS),
        ('rdfs', rdfsNS),
        ('owl', owlNS),
        ('dc', dcNS),
        ('xml', xmlNS),
        ('xsd', xsdNS),
        ('vs', vsNS),
        ('wot', wotNS),
        # should these be here? they are kind of specific to an application
        ('libraryOntology', libraryOntology),
        ('ucscSubmission', submissionOntology),
        ('ucscDaf', dafTermOntology),
    )
    for prefix, namespace in prefixes:
        writer.set_namespace(prefix, namespace._prefix)
    return writer
def dump_model(model, destination=None):
    """Write the serialized model followed by a line separator.

    :param model: model serialized with the serializer returned by
        ``get_serializer()``.
    :param destination: object with a ``write`` method; ``sys.stdout``
        is used when None.
    """
    out = sys.stdout if destination is None else destination
    serialized = get_serializer().serialize_model_to_string(model)
    out.write(serialized)
    out.write(os.linesep)