htsworkflow/util/rdfhelp.py

   1 """Helper features for working with librdf
   2 """
   3 from __future__ import print_function
   4
   5 import collections
   6 from datetime import datetime
   7 from glob import glob
   8 import six
   9 from six.moves import urllib
  10 import logging
  11 import os
  12 import sys
  13 import types
  14 from pkg_resources import resource_listdir, resource_string
  15
  16 import lxml.html
  17 import lxml.html.clean
  18 import RDF
  19
  20 logger = logging.getLogger(__name__)
  21
  22 from htsworkflow.util.rdfns import *
  23
# Context URI that add_schema attaches to schema statements so that
# remove_schemas can later delete them as a group.
SCHEMAS_URL='http://jumpgate.caltech.edu/phony/schemas'
# NOTE(review): not referenced in the visible part of this module;
# presumably the context used for inferred statements -- confirm.
INFERENCE_URL='http://jumpgate.caltech.edu/phony/inference'

# strftime/strptime patterns used for xsd:dateTime literals,
# with (MS) and without (SHORT) a microseconds component.
ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
  29
  30 def sparql_query(model, query_filename, output_format='text'):
  31     """Execute sparql query from file
  32     """
  33     logger.info("Opening: %s" % (query_filename,))
  34     query_body = open(query_filename, 'r').read()
  35     query = RDF.SPARQLQuery(query_body)
  36     results = query.execute(model)
  37     if output_format == 'html':
  38         html_query_results(results)
  39     else:
  40         display_query_results(results)
  41
  42
  43 def display_query_results(results):
  44     """A very simple display of sparql query results showing name value pairs
  45     """
  46     for row in results:
  47         for k, v in row.items()[::-1]:
  48             print("{0}: {1}".format(k, v))
  49         print()
  50
  51 def html_query_results(result_stream):
  52     from django.conf import settings
  53     from django.template import Context, loader
  54
  55     # I did this because I couldn't figure out how to
  56     # get simplify_rdf into the django template as a filter
  57     class Simplified(object):
  58         def __init__(self, value):
  59             self.simple = simplify_rdf(value)
  60             if value.is_resource():
  61                 self.url = value
  62             else:
  63                 self.url = None
  64
  65     template = loader.get_template('rdf_report.html')
  66     results = []
  67     for row in result_stream:
  68         new_row = collections.OrderedDict()
  69         row_urls = []
  70         for k,v in row.items():
  71             new_row[k] = Simplified(v)
  72         results.append(new_row)
  73     context = Context({'results': results,})
  74     print(template.render(context))
  75
  76 def blankOrUri(value=None):
  77     """Return a blank node for None or a resource node for strings.
  78     """
  79     node = None
  80     if value is None:
  81         node = RDF.Node()
  82     elif isinstance(value, six.string_types):
  83         node = RDF.Node(uri_string=value)
  84     elif isinstance(value, RDF.Node):
  85         node = value
  86
  87     return node
  88
  89
  90 def toTypedNode(value, language="en"):
  91     """Convert a python variable to a RDF Node with its closest xsd type
  92     """
  93     if isinstance(value, bool):
  94         value_type = xsdNS['boolean'].uri
  95         if value:
  96             value = u'1'
  97         else:
  98             value = u'0'
  99     elif isinstance(value, int):
 100         value_type = xsdNS['decimal'].uri
 101         value = str(value)
 102     elif isinstance(value, float):
 103         value_type = xsdNS['float'].uri
 104         value = str(value)
 105     elif isinstance(value, datetime):
 106         value_type = xsdNS['dateTime'].uri
 107         if value.microsecond == 0:
 108             value = value.strftime(ISOFORMAT_SHORT)
 109         else:
 110             value = value.strftime(ISOFORMAT_MS)
 111     else:
 112         value_type = None
 113         if six.PY3:
 114             value = str(value)
 115         else:
 116             value = unicode(value).encode('utf-8')
 117
 118     if value_type is not None:
 119         node = RDF.Node(literal=value, datatype=value_type)
 120     else:
 121         node = RDF.Node(literal=value, language=language)
 122     return node
 123
 124
 125 def fromTypedNode(node):
 126     """Convert a typed RDF Node to its closest python equivalent
 127     """
 128     if not isinstance(node, RDF.Node):
 129         return node
 130     if node.is_resource():
 131         return node
 132
 133     value_type = get_node_type(node)
 134     literal = node.literal_value['string']
 135     literal_lower = literal.lower()
 136
 137     if value_type == 'boolean':
 138         if literal_lower in ('1', 'yes', 'true'):
 139             return True
 140         elif literal_lower in ('0', 'no', 'false'):
 141             return False
 142         else:
 143             raise ValueError("Unrecognized boolean %s" % (literal,))
 144     elif value_type == 'integer':
 145         return int(literal)
 146     elif value_type == 'decimal' and literal.find('.') == -1:
 147         return int(literal)
 148     elif value_type in ('decimal', 'float', 'double'):
 149         return float(literal)
 150     elif value_type in ('string'):
 151         return literal
 152     elif value_type in ('dateTime'):
 153         try:
 154             return datetime.strptime(literal, ISOFORMAT_MS)
 155         except ValueError:
 156             return datetime.strptime(literal, ISOFORMAT_SHORT)
 157     return literal
 158
 159
 160 def get_node_type(node):
 161     """Return just the base name of a XSD datatype:
 162     e.g. http://www.w3.org/2001/XMLSchema#integer -> integer
 163     """
 164     # chop off xml schema declaration
 165     value_type = node.literal_value['datatype']
 166     if value_type is None:
 167         return "string"
 168     else:
 169         value_type = str(value_type)
 170         return value_type.replace(str(xsdNS[''].uri), '')
 171
 172
 173 def simplify_rdf(value):
 174     """Return a short name for a RDF object
 175     e.g. The last part of a URI or an untyped string.
 176     """
 177     if isinstance(value, RDF.Node):
 178         if value.is_resource():
 179             name = simplify_uri(str(value.uri))
 180         elif value.is_blank():
 181             name = '<BLANK>'
 182         else:
 183             name = value.literal_value['string']
 184     elif isinstance(value, RDF.Uri):
 185         name = split_uri(str(value))
 186     else:
 187         name = value
 188     return str(name)
 189
 190
 191 def simplify_uri(uri):
 192     """Split off the end of a uri
 193
 194     >>> simplify_uri('http://asdf.org/foo/bar')
 195     'bar'
 196     >>> simplify_uri('http://asdf.org/foo/bar#bleem')
 197     'bleem'
 198     >>> simplify_uri('http://asdf.org/foo/bar/')
 199     'bar'
 200     >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
 201     'was=foo'
 202     """
 203     if isinstance(uri, RDF.Node):
 204         if uri.is_resource():
 205             uri = uri.uri
 206         else:
 207             raise ValueError("Can't simplify an RDF literal")
 208     if isinstance(uri, RDF.Uri):
 209         uri = str(uri)
 210
 211     parsed = urllib.parse.urlparse(uri)
 212     if len(parsed.query) > 0:
 213         return parsed.query
 214     elif len(parsed.fragment) > 0:
 215         return parsed.fragment
 216     elif len(parsed.path) > 0:
 217         for element in reversed(parsed.path.split('/')):
 218             if len(element) > 0:
 219                 return element
 220     raise ValueError("Unable to simplify %s" % (uri,))
 221
 222 def strip_namespace(namespace, term):
 223     """Remove the namespace portion of a term
 224
 225     returns None if they aren't in common
 226     """
 227     if isinstance(term, RDF.Node):
 228         if  term.is_resource():
 229             term = term.uri
 230         else:
 231             raise ValueError("This works on resources")
 232     elif not isinstance(term, RDF.Uri):
 233         raise ValueError("This works on resources")
 234     term_s = str(term)
 235     if not term_s.startswith(namespace._prefix):
 236         return None
 237     return term_s.replace(namespace._prefix, "")
 238
 239
 240 def get_model(model_name=None, directory=None, use_contexts=True):
 241     if directory is None:
 242         directory = os.getcwd()
 243
 244     contexts = 'yes' if use_contexts else 'no'
 245
 246     if model_name is None:
 247         storage = RDF.MemoryStorage(options_string="contexts='{}'".format(contexts))
 248         logger.info("Using RDF Memory model")
 249     else:
 250         options = "contexts='{0}',hash-type='bdb',dir='{1}'".format(contexts, directory)
 251         storage = RDF.HashStorage(model_name,
 252                       options=options)
 253         logger.info("Using {0} with options {1}".format(model_name, options))
 254     model = RDF.Model(storage)
 255     return model
 256
 257
 258 def load_into_model(model, parser_name, path, ns=None):
 259     if isinstance(ns, six.string_types):
 260         ns = RDF.Uri(ns)
 261
 262     if isinstance(path, RDF.Node):
 263         if path.is_resource():
 264             path = str(path.uri)
 265         else:
 266             raise ValueError("url to load can't be a RDF literal")
 267
 268     url_parts = list(urllib.parse.urlparse(path))
 269     if len(url_parts[0]) == 0 or url_parts[0] == 'file':
 270         url_parts[0] = 'file'
 271         url_parts[2] = os.path.abspath(url_parts[2])
 272     if parser_name is None or parser_name == 'guess':
 273         parser_name = guess_parser_by_extension(path)
 274     url = urllib.parse.urlunparse(url_parts)
 275     logger.info("Opening {0} with parser {1}".format(url, parser_name))
 276
 277     rdf_parser = RDF.Parser(name=parser_name)
 278
 279     statements = []
 280     retries = 3
 281     succeeded = False
 282     while retries > 0:
 283         try:
 284             retries -= 1
 285             statements = rdf_parser.parse_as_stream(url, ns)
 286             retries = 0
 287             succeeded = True
 288         except RDF.RedlandError as e:
 289             errmsg = "RDF.RedlandError: {0} {1} tries remaining"
 290             logger.error(errmsg.format(str(e), retries))
 291
 292     if not succeeded:
 293         logger.warn("Unable to download %s", url)
 294
 295     for s in statements:
 296         conditionally_add_statement(model, s, ns)
 297
 298 def load_string_into_model(model, parser_name, data, ns=None):
 299     ns = fixup_namespace(ns)
 300     logger.debug("load_string_into_model parser={0}, len={1}".format(
 301         parser_name, len(data)))
 302     rdf_parser = RDF.Parser(name=str(parser_name))
 303
 304     for s in rdf_parser.parse_string_as_stream(data, ns):
 305         conditionally_add_statement(model, s, ns)
 306
 307
 308 def fixup_namespace(ns):
 309     if ns is None:
 310         ns = RDF.Uri("http://localhost/")
 311     elif isinstance(ns, six.string_types):
 312         ns = RDF.Uri(ns)
 313     elif not(isinstance(ns, RDF.Uri)):
 314         errmsg = "Namespace should be string or uri not {0}"
 315         raise ValueError(errmsg.format(str(type(ns))))
 316     return ns
 317
 318
 319 def conditionally_add_statement(model, s, ns):
 320     imports = owlNS['imports']
 321     if s.predicate == imports:
 322         obj = str(s.object)
 323         logger.info("Importing %s" % (obj,))
 324         load_into_model(model, None, obj, ns)
 325     if s.object.is_literal():
 326         value_type = get_node_type(s.object)
 327         if value_type == 'string':
 328             s.object = sanitize_literal(s.object)
 329     model.add_statement(s)
 330
 331
 332 def add_default_schemas(model, schema_path=None):
 333     """Add default schemas to a model
 334     Looks for turtle files in either htsworkflow/util/schemas
 335     or in the list of directories provided in schema_path
 336     """
 337
 338     schemas = resource_listdir(__name__, 'schemas')
 339     for s in schemas:
 340         schema = resource_string(__name__,  'schemas/' + s)
 341         if six.PY3:
 342             # files must be encoded utf-8
 343             schema = schema.decode('utf-8')
 344         namespace = 'file://localhost/htsworkflow/schemas/'+s
 345         add_schema(model, schema, namespace)
 346
 347     if schema_path:
 348         if type(schema_path) in types.StringTypes:
 349             schema_path = [schema_path]
 350
 351         for path in schema_path:
 352             for pathname in glob(os.path.join(path, '*.turtle')):
 353                 url = 'file://' + os.path.splitext(pathname)[0]
 354                 stream = open(pathname, 'rt')
 355                 add_schema(model, stream, url)
 356                 stream.close()
 357
 358 def add_schema(model, schema, url):
 359     """Add a schema to a model.
 360
 361     Main difference from 'load_into_model' is it tags it with
 362     a RDFlib context so I can remove them later.
 363     """
 364     parser = RDF.Parser(name='turtle')
 365     context = RDF.Node(RDF.Uri(SCHEMAS_URL))
 366     for s in parser.parse_string_as_stream(schema, url):
 367         try:
 368             model.append(s, context)
 369         except RDF.RedlandError as e:
 370             logger.error("%s with %s", str(e), str(s))
 371
 372
 373 def remove_schemas(model):
 374     """Remove statements labeled with our schema context"""
 375     context = RDF.Node(RDF.Uri(SCHEMAS_URL))
 376     model.context_remove_statements(context)
 377
 378
 379 def sanitize_literal(node):
 380     """Clean up a literal string
 381     """
 382     if not isinstance(node, RDF.Node):
 383         raise ValueError("sanitize_literal only works on RDF.Nodes")
 384
 385     s = node.literal_value['string']
 386     if len(s) > 0:
 387         element = lxml.html.fromstring(s)
 388         cleaner = lxml.html.clean.Cleaner(page_structure=False)
 389         element = cleaner.clean_html(element)
 390         if six.PY3:
 391             text = lxml.html.tostring(element, encoding=str)
 392         else:
 393             text = lxml.html.tostring(element)
 394         p_len = 3
 395         slash_p_len = 4
 396
 397         args = {'literal': text[p_len:-slash_p_len]}
 398     else:
 399         args = {'literal': ''}
 400     datatype = node.literal_value['datatype']
 401     if datatype is not None:
 402         args['datatype'] = datatype
 403     language = node.literal_value['language']
 404     if language is not None:
 405         args['language'] = language
 406     return RDF.Node(**args)
 407
 408
 409 def guess_parser(content_type, pathname):
 410     if content_type in ('application/rdf+xml',):
 411         return 'rdfxml'
 412     elif content_type in ('application/x-turtle',):
 413         return 'turtle'
 414     elif content_type in ('text/html',):
 415         return 'rdfa'
 416     elif content_type is None or content_type in ('text/plain',):
 417         return guess_parser_by_extension(pathname)
 418
 419 def guess_parser_by_extension(pathname):
 420     _, ext = os.path.splitext(pathname)
 421     if ext in ('.xml', '.rdf'):
 422         return 'rdfxml'
 423     elif ext in ('.html',):
 424         return 'rdfa'
 425     elif ext in ('.turtle',):
 426         return 'turtle'
 427     return 'guess'
 428
 429 def get_serializer(name='turtle'):
 430     """Return a serializer with our standard prefixes loaded
 431     """
 432     writer = RDF.Serializer(name=name)
 433     # really standard stuff
 434     writer.set_namespace('rdf', rdfNS._prefix)
 435     writer.set_namespace('rdfs', rdfsNS._prefix)
 436     writer.set_namespace('owl', owlNS._prefix)
 437     writer.set_namespace('dc', dcNS._prefix)
 438     writer.set_namespace('xml', xmlNS._prefix)
 439     writer.set_namespace('xsd', xsdNS._prefix)
 440     writer.set_namespace('vs', vsNS._prefix)
 441     writer.set_namespace('wot', wotNS._prefix)
 442
 443     # should these be here, kind of specific to an application
 444     writer.set_namespace('htswlib', libraryOntology._prefix)
 445     writer.set_namespace('ucscSubmission', submissionOntology._prefix)
 446     writer.set_namespace('ucscDaf', dafTermOntology._prefix)
 447     writer.set_namespace('geoSoft', geoSoftNS._prefix)
 448     writer.set_namespace('encode3', encode3NS._prefix)
 449     return writer
 450
 451 def get_turtle_header():
 452     """Return a turtle header with our typical namespaces
 453     """
 454     serializer = get_serializer()
 455     empty = get_model()
 456     return serializer.serialize_model_to_string(empty)
 457
 458 def dump_model(model, destination=None):
 459     if destination is None:
 460         destination = sys.stdout
 461     serializer = get_serializer()
 462     destination.write(serializer.serialize_model_to_string(model))
 463     destination.write(os.linesep)