1 """Helper features for working with librdf
4 from datetime import datetime
6 from urlparse import urlparse, urlunparse
7 from urllib2 import urlopen
12 from pkg_resources import resource_listdir, resource_string
15 import lxml.html.clean
# Module-level logger and shared constants.
# SCHEMAS_URL is the URI wrapped in an RDF context node by add_schema() and
# remove_schemas(); no use of INFERENCE_URL is visible in this section.
# ISOFORMAT_MS / ISOFORMAT_SHORT are the strftime/strptime patterns that
# toTypedNode() and fromTypedNode() use for datetimes with and without a
# fractional-seconds part.
18 logger = logging.getLogger(__name__)
20 from htsworkflow.util.rdfns import *
22 SCHEMAS_URL='http://jumpgate.caltech.edu/phony/schemas'
23 INFERENCE_URL='http://jumpgate.caltech.edu/phony/inference'
25 ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
26 ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
# Run the SPARQL query stored in the file `query_filename` against `model`.
# The file contents are wrapped in RDF.SPARQLQuery and executed; when
# output_format == 'html' the results are handed to html_query_results().
# NOTE(review): the statement between the html_query_results() and
# display_query_results() calls is not visible here -- presumably an
# `else:` so that only one renderer runs; confirm.
# NOTE(review): the handle returned by open() is read inline and is never
# explicitly closed.
28 def sparql_query(model, query_filename, output_format='text'):
29 """Execute sparql query from file
31 logger.info("Opening: %s" % (query_filename,))
32 query_body = open(query_filename, 'r').read()
33 query = RDF.SPARQLQuery(query_body)
34 results = query.execute(model)
35 if output_format == 'html':
36 html_query_results(results)
38 display_query_results(results)
# Print the entries of a result row as "key: value" lines, walking
# row.items() in reverse order.
# NOTE(review): the loop that binds `row` is not visible in this excerpt --
# presumably `for row in results:`; confirm.
# Python 2 only: uses the print statement and slices the value returned by
# row.items(), which therefore has to be a list.
41 def display_query_results(results):
42 """A very simple display of sparql query results showing name value pairs
45 for k, v in row.items()[::-1]:
46 print "{0}: {1}".format(k, v)
# Render SPARQL results through the Django template 'rdf_report.html' and
# print the rendered text.
# Every row of `result_stream` becomes an OrderedDict that maps the row's
# keys to Simplified wrappers; a wrapper's `.simple` attribute holds
# simplify_rdf(value). The nested class exists so the template can reach
# the simplified name without a custom template filter (see inline note).
# Django is imported inside the function rather than at module level.
# NOTE(review): the initialisation of `results` and the bodies of the
# `if value.is_resource():` branches are not visible here; confirm.
# Python 2 only: uses the print statement.
49 def html_query_results(result_stream):
50 from django.conf import settings
51 from django.template import Context, loader
53 # I did this because I couldn't figure out how to
54 # get simplify_rdf into the django template as a filter
55 class Simplified(object):
56 def __init__(self, value):
57 self.simple = simplify_rdf(value)
58 if value.is_resource():
63 template = loader.get_template('rdf_report.html')
65 for row in result_stream:
66 new_row = collections.OrderedDict()
68 for k,v in row.items():
69 new_row[k] = Simplified(v)
70 results.append(new_row)
71 context = Context({'results': results,})
72 print template.render(context)
# Build an RDF node from `value`: string values (types.StringTypes) become
# a resource node via RDF.Node(uri_string=value), and values that already
# are RDF.Node instances have their own branch.
# NOTE(review): the branch taken before the string check (the docstring
# says None yields a blank node), the body of the RDF.Node branch and the
# return statement are not visible in this excerpt; confirm.
74 def blankOrUri(value=None):
75 """Return a blank node for None or a resource node for strings.
80 elif type(value) in types.StringTypes:
81 node = RDF.Node(uri_string=value)
82 elif isinstance(value, RDF.Node):
# Convert a Python value into an RDF literal node, picking an xsd datatype
# from the value's Python type:
#   bool       -> xsd:boolean
#   int / long -> xsd:decimal  (value converted with unicode())
#   float      -> xsd:float    (value converted with unicode())
#   datetime   -> xsd:dateTime (formatted with ISOFORMAT_SHORT when
#                 microsecond == 0; ISOFORMAT_MS is used in the other,
#                 presumably `else`, branch)
# When a datatype was chosen the node is RDF.Node(literal=, datatype=);
# the other RDF.Node call UTF-8 encodes the value and tags it with
# `language`.
# NOTE(review): several branch headers, the boolean value conversion and
# the return statement are not visible in this excerpt; confirm.
88 def toTypedNode(value, language="en"):
89 """Convert a python variable to a RDF Node with its closest xsd type
91 if type(value) == types.BooleanType:
92 value_type = xsdNS['boolean'].uri
97 elif type(value) in (types.IntType, types.LongType):
98 value_type = xsdNS['decimal'].uri
99 value = unicode(value)
100 elif type(value) == types.FloatType:
101 value_type = xsdNS['float'].uri
102 value = unicode(value)
103 elif isinstance(value, datetime):
104 value_type = xsdNS['dateTime'].uri
105 if value.microsecond == 0:
106 value = value.strftime(ISOFORMAT_SHORT)
108 value = value.strftime(ISOFORMAT_MS)
111 value = unicode(value)
113 if value_type is not None:
114 node = RDF.Node(literal=value, datatype=value_type)
116 node = RDF.Node(literal=unicode(value).encode('utf-8'), language=language)
# Convert an RDF node to a Python value according to the xsd datatype name
# reported by get_node_type():
#   boolean  -> the lower-cased literal is compared against
#               ('1', 'yes', 'true') and ('0', 'no', 'false'); anything
#               else raises ValueError("Unrecognized boolean ...")
#   decimal / float / double -> float(literal); a 'decimal' literal with no
#               '.' takes an earlier branch whose body is not visible
#   dateTime -> parsed with ISOFORMAT_MS, falling back to ISOFORMAT_SHORT
#               when that raises ValueError
# NOTE(review): `value_type in ('string')` and `value_type in ('dateTime')`
# compare against a plain str (the parentheses do not create a tuple), so
# they are substring tests, not membership tests -- e.g. 'date' would
# satisfy the dateTime branch. Probably meant ('string',) / ('dateTime',).
# NOTE(review): the bodies of the non-Node, resource, integer and string
# branches and the final return are not visible in this excerpt.
# Python 2 only: `except ValueError, _:` syntax.
120 def fromTypedNode(node):
121 """Convert a typed RDF Node to its closest python equivalent
123 if not isinstance(node, RDF.Node):
125 if node.is_resource():
128 value_type = get_node_type(node)
129 literal = node.literal_value['string']
130 literal_lower = literal.lower()
132 if value_type == 'boolean':
133 if literal_lower in ('1', 'yes', 'true'):
135 elif literal_lower in ('0', 'no', 'false'):
138 raise ValueError("Unrecognized boolean %s" % (literal,))
139 elif value_type == 'integer':
141 elif value_type == 'decimal' and literal.find('.') == -1:
143 elif value_type in ('decimal', 'float', 'double'):
144 return float(literal)
145 elif value_type in ('string'):
147 elif value_type in ('dateTime'):
149 return datetime.strptime(literal, ISOFORMAT_MS)
150 except ValueError, _:
151 return datetime.strptime(literal, ISOFORMAT_SHORT)
# Return the short name of a literal node's datatype: the value of
# node.literal_value['datatype'] is converted to str and every occurrence
# of the xsd namespace URI (str(xsdNS[''].uri)) is removed from it.
# NOTE(review): what happens when the datatype is None is not visible in
# this excerpt; confirm.
155 def get_node_type(node):
156 """Return just the base name of a XSD datatype:
157 e.g. http://www.w3.org/2001/XMLSchema#integer -> integer
159 # chop off xml schema declaration
160 value_type = node.literal_value['datatype']
161 if value_type is None:
164 value_type = str(value_type)
165 return value_type.replace(str(xsdNS[''].uri), '')
# Produce a short display name for an RDF value.
#   RDF.Node resource -> simplify_uri(str(value.uri))
#   RDF.Node blank    -> dedicated branch (body not visible)
#   RDF.Node, other   -> value.literal_value['string'] (presumably the
#                        literal case)
#   RDF.Uri           -> split_uri(str(value))
# NOTE(review): `split_uri` is not defined in the visible part of this
# file -- it may come from the wildcard rdfns import or be a stale name for
# simplify_uri; confirm.
# NOTE(review): handling of any other input type and the return statement
# are not visible in this excerpt.
168 def simplify_rdf(value):
169 """Return a short name for a RDF object
170 e.g. The last part of a URI or an untyped string.
172 if isinstance(value, RDF.Node):
173 if value.is_resource():
174 name = simplify_uri(str(value.uri))
175 elif value.is_blank():
178 name = value.literal_value['string']
179 elif isinstance(value, RDF.Uri):
180 name = split_uri(str(value))
# Reduce a URI to a short name. RDF.Node and RDF.Uri inputs have their own
# conversion branches (bodies not visible); an RDF.Node that is not a
# resource appears to raise ValueError("Can't simplify an RDF literal").
# After urlparse():
#   non-empty query    -> branch body not visible
#   non-empty fragment -> the fragment is returned
#   non-empty path     -> path segments are scanned from last to first
#                         (what is returned is not visible)
# ValueError("Unable to simplify ...") is raised when nothing usable is
# found (exact placement not visible).
# NOTE(review): the expected doctest outputs are missing from this excerpt.
186 def simplify_uri(uri):
187 """Split off the end of a uri
189 >>> simplify_uri('http://asdf.org/foo/bar')
191 >>> simplify_uri('http://asdf.org/foo/bar#bleem')
193 >>> simplify_uri('http://asdf.org/foo/bar/')
195 >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
198 if isinstance(uri, RDF.Node):
199 if uri.is_resource():
202 raise ValueError("Can't simplify an RDF literal")
203 if isinstance(uri, RDF.Uri):
206 parsed = urlparse(uri)
207 if len(parsed.query) > 0:
209 elif len(parsed.fragment) > 0:
210 return parsed.fragment
211 elif len(parsed.path) > 0:
212 for element in reversed(parsed.path.split('/')):
215 raise ValueError("Unable to simplify %s" % (uri,))
# Strip `namespace._prefix` from `term`, which must be an RDF.Node
# resource or an RDF.Uri; other inputs raise
# ValueError("This works on resources").
# NOTE(review): term_s.replace(namespace._prefix, "") removes every
# occurrence of the prefix, not just the leading one.
# NOTE(review): how term_s is derived, and the value returned when the term
# does not start with the prefix (the docstring says None), are not visible
# in this excerpt.
217 def stripNamespace(namespace, term):
218 """Remove the namespace portion of a term
220 returns None if they aren't in common
222 if isinstance(term, RDF.Node):
223 if term.is_resource():
226 raise ValueError("This works on resources")
227 elif not isinstance(term, RDF.Uri):
228 raise ValueError("This works on resources")
230 if not term_s.startswith(namespace._prefix):
232 return term_s.replace(namespace._prefix, "")
# Create an RDF.Model. `directory` defaults to os.getcwd(). With
# model_name None an RDF.MemoryStorage with contexts enabled is used;
# the other branch builds an RDF.HashStorage named `model_name` from an
# options string with contexts enabled, hash-type 'bdb' and dir set to
# `directory`.
# NOTE(review): the branch header before the HashStorage code, the rest of
# the HashStorage call and the return statement are not visible here.
235 def get_model(model_name=None, directory=None):
236 if directory is None:
237 directory = os.getcwd()
239 if model_name is None:
240 storage = RDF.MemoryStorage(options_string="contexts='yes'")
241 logger.info("Using RDF Memory model")
243 options = "contexts='yes',hash-type='bdb',dir='{0}'".format(directory)
244 storage = RDF.HashStorage(model_name,
246 logger.info("Using {0} with options {1}".format(model_name, options))
247 model = RDF.Model(storage)
# Parse the RDF document at `path` and feed its statements into `model`
# through conditionally_add_statement().
# `path` may be an RDF.Node; the ValueError("url to load can't be a RDF
# literal") is raised in that handling. URLs with no scheme or the 'file'
# scheme get scheme 'file' and an absolute path. A parser_name of None or
# 'guess' is replaced by guess_parser_by_extension(path).
# RDF.RedlandError raised while parsing is logged together with a
# remaining-retries count; "Unable to download" is logged as a warning.
# NOTE(review): indentation and several lines (string-`ns` handling, the
# retry loop, the try header, the statement loop) are not visible in this
# excerpt, so the exact nesting and retry policy cannot be confirmed here.
# Python 2 only: `except RDF.RedlandError, e:` syntax.
251 def load_into_model(model, parser_name, path, ns=None):
252 if type(ns) in types.StringTypes:
255 if isinstance(path, RDF.Node):
256 if path.is_resource():
259 raise ValueError("url to load can't be a RDF literal")
261 url_parts = list(urlparse(path))
262 if len(url_parts[0]) == 0 or url_parts[0] == 'file':
263 url_parts[0] = 'file'
264 url_parts[2] = os.path.abspath(url_parts[2])
265 if parser_name is None or parser_name == 'guess':
266 parser_name = guess_parser_by_extension(path)
267 url = urlunparse(url_parts)
268 logger.info("Opening {0} with parser {1}".format(url, parser_name))
270 rdf_parser = RDF.Parser(name=parser_name)
278 statements = rdf_parser.parse_as_stream(url, ns)
281 except RDF.RedlandError, e:
282 errmsg = "RDF.RedlandError: {0} {1} tries remaining"
283 logger.error(errmsg.format(str(e), retries))
286 logger.warn("Unable to download %s", url)
289 conditionally_add_statement(model, s, ns)
def load_string_into_model(model, parser_name, data, ns=None):
    """Parse RDF held in the string ``data`` and add it to ``model``.

    ``ns`` is normalised with fixup_namespace() and then passed both to
    the parser and to conditionally_add_statement(), which decides how
    each parsed statement is added.
    """
    namespace = fixup_namespace(ns)
    message = "load_string_into_model parser={0}, len={1}".format(
        parser_name, len(data))
    logger.debug(message)

    parser = RDF.Parser(name=parser_name)
    statements = parser.parse_string_as_stream(data, namespace)
    for statement in statements:
        conditionally_add_statement(model, statement, namespace)
# Normalise a namespace argument. One branch substitutes
# RDF.Uri("http://localhost/") (its condition is not visible -- presumably
# `ns is None`), string values get their own branch (body not visible),
# and any remaining value that is not an RDF.Uri raises
# ValueError("Namespace should be string or uri not <type>").
# NOTE(review): the return statement is not visible in this excerpt;
# load_string_into_model() uses the result as the namespace.
301 def fixup_namespace(ns):
303 ns = RDF.Uri("http://localhost/")
304 elif type(ns) in types.StringTypes:
306 elif not(isinstance(ns, RDF.Uri)):
307 errmsg = "Namespace should be string or uri not {0}"
308 raise ValueError(errmsg.format(str(type(ns))))
# Add statement `s` to `model`, with two special cases:
#  * a statement whose predicate is owl:imports triggers
#    load_into_model(model, None, obj, ns) for the imported object;
#  * a literal object whose datatype name is 'string' is replaced by
#    sanitize_literal(s.object) before the statement is added.
# NOTE(review): the assignment of `obj` is not visible, and because the
# indentation is lost it cannot be confirmed here whether
# model.add_statement(s) runs for every statement; confirm.
312 def conditionally_add_statement(model, s, ns):
313 imports = owlNS['imports']
314 if s.predicate == imports:
316 logger.info("Importing %s" % (obj,))
317 load_into_model(model, None, obj, ns)
318 if s.object.is_literal():
319 value_type = get_node_type(s.object)
320 if value_type == 'string':
321 s.object = sanitize_literal(s.object)
322 model.add_statement(s)
# Load schema files into `model` with add_schema().
# Packaged schemas are listed with resource_listdir(__name__, 'schemas'),
# read with resource_string() and registered under the namespace
# 'file://localhost/htsworkflow/schemas/<name>'. After that every
# '*.turtle' file in the directories of `schema_path` is loaded; a single
# string is wrapped in a list first.
# NOTE(review): add_schema() hands its `schema` argument to
# parse_string_as_stream(), yet the second loop passes the open file object
# `stream` rather than its contents, and that file is never closed --
# confirm whether `.read()` is intended.
# NOTE(review): the loop header over `schemas` and the guard around the
# schema_path handling are not visible in this excerpt.
325 def add_default_schemas(model, schema_path=None):
326 """Add default schemas to a model
327 Looks for turtle files in either htsworkflow/util/schemas
328 or in the list of directories provided in schema_path
331 schemas = resource_listdir(__name__, 'schemas')
333 schema = resource_string(__name__, 'schemas/' + s)
334 namespace = 'file://localhost/htsworkflow/schemas/'+s
335 add_schema(model, schema, namespace)
338 if type(schema_path) in types.StringTypes:
339 schema_path = [schema_path]
341 for path in schema_path:
342 for pathname in glob(os.path.join(path, '*.turtle')):
343 url = 'file://' + os.path.splitext(pathname)[0]
344 stream = open(pathname, 'r')
345 add_schema(model, stream, url)
# Parse `schema` with the turtle parser (second argument `url`) and append
# each statement to `model` under a context node built from SCHEMAS_URL --
# the same context remove_schemas() removes.
# An RDF.RedlandError is logged together with the statement being added.
# NOTE(review): the `try:` that pairs with the except clause is not
# visible in this excerpt; presumably it wraps model.append().
348 def add_schema(model, schema, url):
349 """Add a schema to a model.
351 Main difference from 'load_into_model' is it tags it with
352 a RDFlib context so I can remove them later.
354 parser = RDF.Parser(name='turtle')
355 context = RDF.Node(RDF.Uri(SCHEMAS_URL))
356 for s in parser.parse_string_as_stream(schema, url):
358 model.append(s, context)
359 except RDF.RedlandError as e:
360 logger.error("%s with %s", str(e), str(s))
def remove_schemas(model):
    """Drop every statement stored under the SCHEMAS_URL context node."""
    schema_uri = RDF.Uri(SCHEMAS_URL)
    model.context_remove_statements(RDF.Node(schema_uri))
# Clean the string of a literal node and return a new RDF.Node.
# The literal's text is parsed with lxml.html.fromstring(), passed through
# lxml.html.clean.Cleaner(page_structure=False) and serialised again; the
# slice text[p_len:-slash_p_len] then drops a leading and trailing wrapper
# (presumably the <p>...</p> around the fragment -- the definitions of
# p_len / slash_p_len are not visible). The original datatype and language
# are copied onto the new node when they are not None.
# Raises ValueError when `node` is not an RDF.Node.
# NOTE(review): the condition choosing between the cleaned text and the
# empty literal is not visible in this excerpt.
369 def sanitize_literal(node):
370 """Clean up a literal string
372 if not isinstance(node, RDF.Node):
373 raise ValueError("sanitize_literal only works on RDF.Nodes")
375 s = node.literal_value['string']
377 element = lxml.html.fromstring(s)
378 cleaner = lxml.html.clean.Cleaner(page_structure=False)
379 element = cleaner.clean_html(element)
380 text = lxml.html.tostring(element)
384 args = {'literal': text[p_len:-slash_p_len]}
386 args = {'literal': ''}
387 datatype = node.literal_value['datatype']
388 if datatype is not None:
389 args['datatype'] = datatype
390 language = node.literal_value['language']
391 if language is not None:
392 args['language'] = language
393 return RDF.Node(**args)
# Choose a parser name from a MIME content type. 'application/rdf+xml',
# 'application/x-turtle' and 'text/html' each have their own branch; a
# content type of None or 'text/plain' defers to
# guess_parser_by_extension(pathname).
# NOTE(review): the values returned by the first three branches, and what
# happens for any other content type, are not visible in this excerpt.
396 def guess_parser(content_type, pathname):
397 if content_type in ('application/rdf+xml',):
399 elif content_type in ('application/x-turtle',):
401 elif content_type in ('text/html',):
403 elif content_type is None or content_type in ('text/plain',):
404 return guess_parser_by_extension(pathname)
# Choose a parser name from the extension of `pathname` as returned by
# os.path.splitext(): '.xml' / '.rdf', '.html' and '.turtle' each have
# their own branch. The comparison is case-sensitive.
# NOTE(review): the returned parser names and any fallback for other
# extensions are not visible in this excerpt.
406 def guess_parser_by_extension(pathname):
407 _, ext = os.path.splitext(pathname)
408 if ext in ('.xml', '.rdf'):
410 elif ext in ('.html',):
412 elif ext in ('.turtle',):
# Create an RDF.Serializer (turtle by default) and register the prefixes
# used by this project: the common vocabularies (rdf, rdfs, owl, dc, xml,
# xsd, vs, wot) followed by application-specific ones (htswlib,
# ucscSubmission, ucscDaf, geoSoft, encode3). Each prefix maps to the
# `_prefix` attribute of the matching namespace object.
# NOTE(review): the return statement is not visible in this excerpt;
# get_turtle_header() and dump_model() use the result as a serializer.
416 def get_serializer(name='turtle'):
417 """Return a serializer with our standard prefixes loaded
419 writer = RDF.Serializer(name=name)
420 # really standard stuff
421 writer.set_namespace('rdf', rdfNS._prefix)
422 writer.set_namespace('rdfs', rdfsNS._prefix)
423 writer.set_namespace('owl', owlNS._prefix)
424 writer.set_namespace('dc', dcNS._prefix)
425 writer.set_namespace('xml', xmlNS._prefix)
426 writer.set_namespace('xsd', xsdNS._prefix)
427 writer.set_namespace('vs', vsNS._prefix)
428 writer.set_namespace('wot', wotNS._prefix)
430 # should these be here, kind of specific to an application
431 writer.set_namespace('htswlib', libraryOntology._prefix)
432 writer.set_namespace('ucscSubmission', submissionOntology._prefix)
433 writer.set_namespace('ucscDaf', dafTermOntology._prefix)
434 writer.set_namespace('geoSoft', geoSoftNS._prefix)
435 writer.set_namespace('encode3', encode3NS._prefix)
# Return the string produced by serialising `empty` with the serializer
# from get_serializer() (turtle, with the project prefixes registered).
# NOTE(review): the definition of `empty` is not visible in this excerpt --
# presumably an empty model from get_model(), so that the output contains
# only the prefix declarations; confirm.
438 def get_turtle_header():
439 """Return a turtle header with our typical namespaces
441 serializer = get_serializer()
443 return serializer.serialize_model_to_string(empty)
def dump_model(model, destination=None):
    """Serialize ``model`` and write the text to ``destination``.

    The serializer comes from get_serializer(), i.e. turtle with the
    project's namespace prefixes registered.

    :param model: RDF model passed to serialize_model_to_string().
    :param destination: object with a ``write`` method; ``sys.stdout`` is
        used when this is None.
    """
    if destination is None:
        destination = sys.stdout
    serializer = get_serializer()
    destination.write(serializer.serialize_model_to_string(model))
    # Terminate with '\n' rather than os.linesep: text-mode streams do
    # their own newline translation, so os.linesep would be written as
    # '\r\r\n' on Windows.
    destination.write('\n')