1 """Helper features for working with librdf
4 from datetime import datetime
6 from urlparse import urlparse, urlunparse
7 from urllib2 import urlopen
14 import lxml.html.clean
17 logger = logging.getLogger(__name__)
19 from htsworkflow.util.rdfns import *
21 SCHEMAS_URL='http://jumpgate.caltech.edu/phony/schemas'
22 INFERENCE_URL='http://jumpgate.caltech.edu/phony/inference'
24 ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
25 ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
def sparql_query(model, query_filename, output_format='text'):
    """Execute the SPARQL query stored in a file and print its results.

    :param model: RDF model the query is executed against.
    :param query_filename: path of a file holding the SPARQL query text.
    :param output_format: 'html' sends the results to html_query_results;
        any other value sends them to display_query_results.
    """
    logger.info("Opening: %s" % (query_filename,))
    # Read through a context manager so the file handle is closed as soon
    # as the query text has been loaded.
    with open(query_filename, 'r') as query_file:
        query_body = query_file.read()
    query = RDF.SPARQLQuery(query_body)
    results = query.execute(model)
    if output_format == 'html':
        html_query_results(results)
    else:
        display_query_results(results)
def display_query_results(results):
    """Print sparql query results as plain "name: value" lines.

    The bindings of each row are printed in the reverse of the order
    given by row.items(); a blank line follows every row.
    """
    for row in results:
        for name, value in reversed(list(row.items())):
            print("{0}: {1}".format(name, value))
        # Blank separator line between rows.
        print("")
def html_query_results(result_stream):
    """Render query results with the 'rdf_report.html' django template.

    Each bound value is wrapped in a small object exposing ``simple``
    (the output of simplify_rdf) and ``url`` (the value itself when it is
    a resource, otherwise None).  The rendered template is printed.
    """
    from django.conf import settings
    from django.template import Context, loader

    # Wrapper object used because simplify_rdf could not easily be made
    # available inside the django template as a filter.
    class Simplified(object):
        def __init__(self, value):
            self.simple = simplify_rdf(value)
            self.url = value if value.is_resource() else None

    template = loader.get_template('rdf_report.html')
    results = []
    for row in result_stream:
        wrapped = collections.OrderedDict()
        for name, node in row.items():
            wrapped[name] = Simplified(node)
        results.append(wrapped)
    rendered = template.render(Context({'results': results,}))
    print(rendered)
def blankOrUri(value=None):
    """Return a blank node for None or a resource node for strings.

    :param value: None, a URI string, or an existing RDF.Node.
    :returns: a new blank RDF.Node for None, a resource node built from a
        string, the node itself when an RDF.Node is passed, and None for
        any other kind of value.
    """
    if value is None:
        return RDF.Node()
    # isinstance rather than an exact type comparison, so subclasses of
    # str/unicode are treated as URI strings too.
    if isinstance(value, types.StringTypes):
        return RDF.Node(uri_string=value)
    if isinstance(value, RDF.Node):
        return value
    return None
def toTypedNode(value):
    """Convert a python variable to a RDF Node with its closest xsd type

    Booleans become xsd:boolean ('1'/'0'), ints and longs xsd:decimal,
    floats xsd:float and datetimes xsd:dateTime (fractional seconds are
    only written when the microsecond field is non-zero).  Anything else
    becomes an untyped, utf-8 encoded literal.
    """
    kind = type(value)
    datatype = None
    if kind == types.BooleanType:
        datatype = xsdNS['boolean'].uri
        text = u'1' if value else u'0'
    elif kind in (types.IntType, types.LongType):
        datatype = xsdNS['decimal'].uri
        text = unicode(value)
    elif kind == types.FloatType:
        datatype = xsdNS['float'].uri
        text = unicode(value)
    elif isinstance(value, datetime):
        datatype = xsdNS['dateTime'].uri
        layout = ISOFORMAT_SHORT if value.microsecond == 0 else ISOFORMAT_MS
        text = value.strftime(layout)
    else:
        text = unicode(value)

    if datatype is None:
        return RDF.Node(literal=unicode(text).encode('utf-8'))
    return RDF.Node(literal=text, datatype=datatype)
def fromTypedNode(node):
    """Convert a typed RDF Node to its closest python equivalent

    :param node: value to convert.  Anything that is not an RDF.Node, and
        any resource node, is returned unchanged.
    :returns: bool for xsd:boolean, int for xsd:integer and for
        xsd:decimal written without a decimal point, float for
        decimal/float/double, datetime for xsd:dateTime, and the literal
        string for every other datatype.
    :raises ValueError: for an unrecognized boolean spelling or an
        xsd:dateTime literal matching neither supported format.
    """
    if not isinstance(node, RDF.Node):
        return node
    if node.is_resource():
        return node

    value_type = get_node_type(node)
    literal = node.literal_value['string']
    literal_lower = literal.lower()

    if value_type == 'boolean':
        if literal_lower in ('1', 'yes', 'true'):
            return True
        elif literal_lower in ('0', 'no', 'false'):
            return False
        else:
            raise ValueError("Unrecognized boolean %s" % (literal,))
    elif value_type == 'integer':
        return int(literal)
    elif value_type == 'decimal' and literal.find('.') == -1:
        return int(literal)
    elif value_type in ('decimal', 'float', 'double'):
        return float(literal)
    elif value_type == 'dateTime':
        # Compare with ==: "x in ('dateTime')" is a substring test on a
        # plain string, so a type named 'date' would also land here and
        # fail to parse.
        try:
            return datetime.strptime(literal, ISOFORMAT_MS)
        except ValueError:
            # No fractional seconds present; retry with the short form.
            return datetime.strptime(literal, ISOFORMAT_SHORT)
    # Strings and every datatype without a dedicated conversion.
    return literal
def get_node_type(node):
    """Return just the base name of a XSD datatype:
    e.g. http://www.w3.org/2001/XMLSchema#integer -> integer

    A literal without a datatype is reported as "string".
    """
    datatype = node.literal_value['datatype']
    if datatype is None:
        return "string"
    # chop off xml schema declaration
    full_name = str(datatype)
    return full_name.replace(str(xsdNS[''].uri), '')
def simplify_rdf(value):
    """Return a short name for a RDF object
    e.g. The last part of a URI or an untyped string.

    :param value: an RDF.Node, an RDF.Uri, or any other object.
    :returns: str -- the simplified URI for resource nodes and RDF.Uri
        values, a placeholder for blank nodes, the literal text for
        literal nodes, and str(value) for anything else.
    """
    if isinstance(value, RDF.Node):
        if value.is_resource():
            name = simplify_uri(str(value.uri))
        elif value.is_blank():
            name = '<BLANK>'
        else:
            name = value.literal_value['string']
    elif isinstance(value, RDF.Uri):
        # Use simplify_uri, the same helper as the resource-node branch.
        name = simplify_uri(str(value))
    else:
        name = value
    return str(name)
def simplify_uri(uri):
    """Split off the end of a uri

    The query string wins if present, then the fragment, then the last
    non-empty path element.

    >>> simplify_uri('http://asdf.org/foo/bar')
    'bar'
    >>> simplify_uri('http://asdf.org/foo/bar#bleem')
    'bleem'
    >>> simplify_uri('http://asdf.org/foo/bar/')
    'bar'
    >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
    'was=foo'
    """
    if isinstance(uri, RDF.Node):
        if not uri.is_resource():
            raise ValueError("Can't simplify an RDF literal")
        uri = uri.uri
    if isinstance(uri, RDF.Uri):
        uri = str(uri)

    parsed = urlparse(uri)
    if parsed.query:
        return parsed.query
    if parsed.fragment:
        return parsed.fragment
    # Empty elements come from leading, trailing or doubled slashes.
    segments = [part for part in parsed.path.split('/') if part]
    if segments:
        return segments[-1]
    raise ValueError("Unable to simplify %s" % (uri,))
def stripNamespace(namespace, term):
    """Remove the namespace portion of a term

    :param namespace: namespace object whose ``_prefix`` attribute holds
        the namespace URI text.
    :param term: an RDF.Uri, or an RDF.Node that is a resource.
    :returns: the part of the term after the namespace, or None if they
        aren't in common.
    :raises ValueError: if term is a non-resource node or any other type.
    """
    if isinstance(term, RDF.Node):
        if not term.is_resource():
            raise ValueError("This works on resources")
        term = term.uri
    elif not isinstance(term, RDF.Uri):
        raise ValueError("This works on resources")

    term_s = str(term)
    prefix = namespace._prefix
    if not term_s.startswith(prefix):
        return None
    # Slice off only the leading namespace; str.replace would also delete
    # any later occurrence of the same text inside the term.
    return term_s[len(prefix):]
def get_model(model_name=None, directory=None):
    """Create an RDF model with contexts enabled.

    Without a model_name the model is backed by in-memory storage;
    otherwise a bdb hash storage named model_name is used, located in
    directory (the current working directory when not given).
    """
    if directory is None:
        directory = os.getcwd()

    if model_name is None:
        storage = RDF.MemoryStorage(options_string="contexts='yes'")
        logger.info("Using RDF Memory model")
        return RDF.Model(storage)

    options = "contexts='yes',hash-type='bdb',dir='{0}'".format(directory)
    storage = RDF.HashStorage(model_name, options=options)
    logger.info("Using {0} with options {1}".format(model_name, options))
    return RDF.Model(storage)
def load_into_model(model, parser_name, path, ns=None):
    """Parse the RDF found at path and add its statements to model.

    path may be a filename, a url, or a resource RDF.Node.  Paths with
    no scheme or with the 'file' scheme are made absolute file urls.
    A parser_name of None or 'guess' is replaced by a guess based on the
    file extension.  Opening the stream is attempted up to three times;
    every RDF.RedlandError is logged with the number of tries remaining.
    """
    if type(ns) in types.StringTypes:
        ns = RDF.Uri(ns)

    if isinstance(path, RDF.Node):
        if not path.is_resource():
            raise ValueError("url to load can't be a RDF literal")
        path = str(path.uri)

    url_parts = list(urlparse(path))
    if len(url_parts[0]) == 0 or url_parts[0] == 'file':
        url_parts[0] = 'file'
        url_parts[2] = os.path.abspath(url_parts[2])
    if parser_name is None or parser_name == 'guess':
        parser_name = guess_parser_by_extension(path)
    url = urlunparse(url_parts)
    logger.info("Opening {0} with parser {1}".format(url, parser_name))

    rdf_parser = RDF.Parser(name=parser_name)

    statements = []
    # The loop variable is the number of attempts left after this one.
    for retries in (2, 1, 0):
        try:
            statements = rdf_parser.parse_as_stream(url, ns)
        except RDF.RedlandError as e:
            errmsg = "RDF.RedlandError: {0} {1} tries remaining"
            logger.error(errmsg.format(str(e), retries))
        else:
            break

    for s in statements:
        conditionally_add_statement(model, s, ns)
def load_string_into_model(model, parser_name, data, ns=None):
    """Parse RDF held in a string and add its statements to model.

    ns is normalized with fixup_namespace before being handed to the
    parser as the base uri.
    """
    ns = fixup_namespace(ns)
    message = "load_string_into_model parser={0}, len={1}".format(
        parser_name, len(data))
    logger.debug(message)
    rdf_parser = RDF.Parser(name=parser_name)

    statements = rdf_parser.parse_string_as_stream(data, ns)
    for s in statements:
        conditionally_add_statement(model, s, ns)
def fixup_namespace(ns):
    """Normalize a namespace argument to an RDF.Uri.

    None becomes http://localhost/, strings are wrapped in RDF.Uri and
    RDF.Uri values are returned as they are.  Any other type raises
    ValueError.
    """
    if ns is None:
        return RDF.Uri("http://localhost/")
    if type(ns) in types.StringTypes:
        return RDF.Uri(ns)
    if isinstance(ns, RDF.Uri):
        return ns
    errmsg = "Namespace should be string or uri not {0}"
    raise ValueError(errmsg.format(str(type(ns))))
def conditionally_add_statement(model, s, ns):
    """Add statement s to model, with two extra steps.

    An owl:imports statement first causes the imported document to be
    loaded into the model.  A literal object whose type is 'string' is
    replaced by its sanitized form before the statement is added.
    """
    if s.predicate == owlNS['imports']:
        target = str(s.object)
        logger.info("Importing %s" % (target,))
        load_into_model(model, None, target, ns)
    if s.object.is_literal() and get_node_type(s.object) == 'string':
        s.object = sanitize_literal(s.object)
    model.add_statement(s)
def add_default_schemas(model, schema_path=None):
    """Add default schemas to a model

    Every '*.turtle' file found in the schema directories is added with
    add_schema.  schema_path may be a single directory name or a list of
    directories; when omitted, the 'schemas' directory next to this
    module is searched.
    """
    if schema_path is None:
        module_dir = os.path.split(__file__)[0]
        schema_path = [os.path.join(module_dir, 'schemas')]
    elif type(schema_path) in types.StringTypes:
        schema_path = [schema_path]

    for directory in schema_path:
        pattern = os.path.join(directory, '*.turtle')
        for schema_file in glob(pattern):
            add_schema(model, schema_file)
def add_schema(model, filename):
    """Add a turtle schema file to a model.

    Unlike load_into_model, every statement is appended under the
    SCHEMAS_URL context node so the schema statements can be removed
    again later.  A statement that fails to append is logged and skipped.
    """
    schema_context = RDF.Node(RDF.Uri(SCHEMAS_URL))
    turtle_parser = RDF.Parser(name='turtle')
    url = 'file://' + filename
    for statement in turtle_parser.parse_as_stream(url):
        try:
            model.append(statement, schema_context)
        except RDF.RedlandError as e:
            logger.error("%s with %s", str(e), str(statement))
def remove_schemas(model):
    """Remove statements labeled with our schema context

    The context is the node built from SCHEMAS_URL.
    """
    schema_context = RDF.Node(RDF.Uri(SCHEMAS_URL))
    model.context_remove_statements(schema_context)
def sanitize_literal(node):
    """Clean up a literal string

    The literal text is parsed as HTML and passed through
    lxml.html.clean.Cleaner(page_structure=False).  The datatype and
    language of the original node are carried over to the new node.

    :param node: RDF.Node holding the literal to clean.
    :returns: a new RDF.Node with the cleaned literal text.
    :raises ValueError: if node is not an RDF.Node.
    """
    if not isinstance(node, RDF.Node):
        raise ValueError("sanitize_literal only works on RDF.Nodes")

    s = node.literal_value['string']
    if len(s) > 0:
        element = lxml.html.fromstring(s)
        cleaner = lxml.html.clean.Cleaner(page_structure=False)
        element = cleaner.clean_html(element)
        text = lxml.html.tostring(element)
        # Remove the enclosing <p>...</p> only when it is really there;
        # slicing a fixed number of characters would cut into markup
        # serialized with any other root element.
        open_p = '<p>'
        close_p = '</p>'
        if text.startswith(open_p) and text.endswith(close_p):
            text = text[len(open_p):-len(close_p)]
        args = {'literal': text}
    else:
        args = {'literal': ''}
    datatype = node.literal_value['datatype']
    if datatype is not None:
        args['datatype'] = datatype
    language = node.literal_value['language']
    if language is not None:
        args['language'] = language
    return RDF.Node(**args)
def guess_parser(content_type, pathname):
    """Pick a parser name from a content type.

    A missing or 'text/plain' content type falls back to guessing from
    the extension of pathname.  Content types that are not recognized
    return None.
    """
    if content_type is None or content_type == 'text/plain':
        return guess_parser_by_extension(pathname)
    if content_type == 'application/rdf+xml':
        return 'rdfxml'
    if content_type == 'application/x-turtle':
        return 'turtle'
    if content_type == 'text/html':
        return 'rdfa'
    return None
def guess_parser_by_extension(pathname):
    """Pick a parser name from the extension of pathname.

    Extensions that are not recognized return 'guess'.
    """
    by_extension = {
        '.xml': 'rdfxml',
        '.rdf': 'rdfxml',
        '.html': 'rdfa',
        '.turtle': 'turtle',
    }
    extension = os.path.splitext(pathname)[1]
    return by_extension.get(extension, 'guess')
def get_serializer(name='turtle'):
    """Return a serializer with our standard prefixes loaded

    :param name: serializer name handed to RDF.Serializer.
    """
    prefixes = (
        # really standard stuff
        ('rdf', rdfNS),
        ('rdfs', rdfsNS),
        ('owl', owlNS),
        ('dc', dcNS),
        ('xml', xmlNS),
        ('xsd', xsdNS),
        ('vs', vsNS),
        ('wot', wotNS),
        # should these be here, kind of specific to an application
        ('libraryOntology', libraryOntology),
        ('ucscSubmission', submissionOntology),
        ('ucscDaf', dafTermOntology),
    )
    writer = RDF.Serializer(name=name)
    for label, namespace in prefixes:
        writer.set_namespace(label, namespace._prefix)
    return writer
def dump_model(model, destination=None):
    """Write model to destination, serialized with get_serializer().

    :param model: RDF model to serialize.
    :param destination: object with a write() method; defaults to
        sys.stdout.
    """
    if destination is None:
        destination = sys.stdout
    serializer = get_serializer()
    destination.write(serializer.serialize_model_to_string(model))
    # Terminate with '\n' rather than os.linesep: a text-mode stream
    # already translates newlines, so writing os.linesep would produce a
    # doubled carriage return on Windows.
    destination.write('\n')