htsworkflow/util/rdfhelp.py

   1 """Helper features for working with librdf
   2 """
   3 from __future__ import print_function
   4
import collections
from datetime import datetime
from glob import glob
import logging
import os
import sys
import types

import lxml.html
import lxml.html.clean
from pkg_resources import resource_listdir, resource_string
import RDF
import six
from six.moves import urllib
  18
  19 logger = logging.getLogger(__name__)
  20
  21 from htsworkflow.util.rdfns import *
  22
# Context URI that add_schema() attaches to schema statements and that
# remove_schemas() uses to delete them again.
SCHEMAS_URL='http://jumpgate.caltech.edu/phony/schemas'
# Not referenced in the visible part of this module.
INFERENCE_URL='http://jumpgate.caltech.edu/phony/inference'

# strftime/strptime patterns used for xsd:dateTime literals,
# with (MS) and without (SHORT) a microsecond component.
ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
  28
  29 def sparql_query(model, query_filename, output_format='text'):
  30     """Execute sparql query from file
  31     """
  32     logger.info("Opening: %s" % (query_filename,))
  33     query_body = open(query_filename, 'r').read()
  34     query = RDF.SPARQLQuery(query_body)
  35     results = query.execute(model)
  36     if output_format == 'html':
  37         html_query_results(results)
  38     else:
  39         display_query_results(results)
  40
  41
  42 def display_query_results(results):
  43     """A very simple display of sparql query results showing name value pairs
  44     """
  45     for row in results:
  46         for k, v in row.items()[::-1]:
  47             print("{0}: {1}".format(k, v))
  48         print()
  49
  50 def html_query_results(result_stream):
  51     from django.conf import settings
  52     from django.template import Context, loader
  53
  54     # I did this because I couldn't figure out how to
  55     # get simplify_rdf into the django template as a filter
  56     class Simplified(object):
  57         def __init__(self, value):
  58             self.simple = simplify_rdf(value)
  59             if value.is_resource():
  60                 self.url = value
  61             else:
  62                 self.url = None
  63
  64     template = loader.get_template('rdf_report.html')
  65     results = []
  66     for row in result_stream:
  67         new_row = collections.OrderedDict()
  68         row_urls = []
  69         for k,v in row.items():
  70             new_row[k] = Simplified(v)
  71         results.append(new_row)
  72     context = Context({'results': results,})
  73     print(template.render(context))
  74
  75 def blankOrUri(value=None):
  76     """Return a blank node for None or a resource node for strings.
  77     """
  78     node = None
  79     if value is None:
  80         node = RDF.Node()
  81     elif type(value) in types.StringTypes:
  82         node = RDF.Node(uri_string=value)
  83     elif isinstance(value, RDF.Node):
  84         node = value
  85
  86     return node
  87
  88
  89 def toTypedNode(value, language="en"):
  90     """Convert a python variable to a RDF Node with its closest xsd type
  91     """
  92     if type(value) == types.BooleanType:
  93         value_type = xsdNS['boolean'].uri
  94         if value:
  95             value = u'1'
  96         else:
  97             value = u'0'
  98     elif type(value) in (types.IntType, types.LongType):
  99         value_type = xsdNS['decimal'].uri
 100         value = unicode(value)
 101     elif type(value) == types.FloatType:
 102         value_type = xsdNS['float'].uri
 103         value = unicode(value)
 104     elif isinstance(value, datetime):
 105         value_type = xsdNS['dateTime'].uri
 106         if value.microsecond == 0:
 107             value = value.strftime(ISOFORMAT_SHORT)
 108         else:
 109             value = value.strftime(ISOFORMAT_MS)
 110     else:
 111         value_type = None
 112         value = unicode(value)
 113
 114     if value_type is not None:
 115         node = RDF.Node(literal=value, datatype=value_type)
 116     else:
 117         node = RDF.Node(literal=unicode(value).encode('utf-8'), language=language)
 118     return node
 119
 120
 121 def fromTypedNode(node):
 122     """Convert a typed RDF Node to its closest python equivalent
 123     """
 124     if not isinstance(node, RDF.Node):
 125         return node
 126     if node.is_resource():
 127         return node
 128
 129     value_type = get_node_type(node)
 130     literal = node.literal_value['string']
 131     literal_lower = literal.lower()
 132
 133     if value_type == 'boolean':
 134         if literal_lower in ('1', 'yes', 'true'):
 135             return True
 136         elif literal_lower in ('0', 'no', 'false'):
 137             return False
 138         else:
 139             raise ValueError("Unrecognized boolean %s" % (literal,))
 140     elif value_type == 'integer':
 141         return int(literal)
 142     elif value_type == 'decimal' and literal.find('.') == -1:
 143         return int(literal)
 144     elif value_type in ('decimal', 'float', 'double'):
 145         return float(literal)
 146     elif value_type in ('string'):
 147         return literal
 148     elif value_type in ('dateTime'):
 149         try:
 150             return datetime.strptime(literal, ISOFORMAT_MS)
 151         except ValueError:
 152             return datetime.strptime(literal, ISOFORMAT_SHORT)
 153     return literal
 154
 155
 156 def get_node_type(node):
 157     """Return just the base name of a XSD datatype:
 158     e.g. http://www.w3.org/2001/XMLSchema#integer -> integer
 159     """
 160     # chop off xml schema declaration
 161     value_type = node.literal_value['datatype']
 162     if value_type is None:
 163         return "string"
 164     else:
 165         value_type = str(value_type)
 166         return value_type.replace(str(xsdNS[''].uri), '')
 167
 168
 169 def simplify_rdf(value):
 170     """Return a short name for a RDF object
 171     e.g. The last part of a URI or an untyped string.
 172     """
 173     if isinstance(value, RDF.Node):
 174         if value.is_resource():
 175             name = simplify_uri(str(value.uri))
 176         elif value.is_blank():
 177             name = '<BLANK>'
 178         else:
 179             name = value.literal_value['string']
 180     elif isinstance(value, RDF.Uri):
 181         name = split_uri(str(value))
 182     else:
 183         name = value
 184     return str(name)
 185
 186
 187 def simplify_uri(uri):
 188     """Split off the end of a uri
 189
 190     >>> simplify_uri('http://asdf.org/foo/bar')
 191     'bar'
 192     >>> simplify_uri('http://asdf.org/foo/bar#bleem')
 193     'bleem'
 194     >>> simplify_uri('http://asdf.org/foo/bar/')
 195     'bar'
 196     >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
 197     'was=foo'
 198     """
 199     if isinstance(uri, RDF.Node):
 200         if uri.is_resource():
 201             uri = uri.uri
 202         else:
 203             raise ValueError("Can't simplify an RDF literal")
 204     if isinstance(uri, RDF.Uri):
 205         uri = str(uri)
 206
 207     parsed = urllib.parse.urlparse(uri)
 208     if len(parsed.query) > 0:
 209         return parsed.query
 210     elif len(parsed.fragment) > 0:
 211         return parsed.fragment
 212     elif len(parsed.path) > 0:
 213         for element in reversed(parsed.path.split('/')):
 214             if len(element) > 0:
 215                 return element
 216     raise ValueError("Unable to simplify %s" % (uri,))
 217
 218 def strip_namespace(namespace, term):
 219     """Remove the namespace portion of a term
 220
 221     returns None if they aren't in common
 222     """
 223     if isinstance(term, RDF.Node):
 224         if  term.is_resource():
 225             term = term.uri
 226         else:
 227             raise ValueError("This works on resources")
 228     elif not isinstance(term, RDF.Uri):
 229         raise ValueError("This works on resources")
 230     term_s = str(term)
 231     if not term_s.startswith(namespace._prefix):
 232         return None
 233     return term_s.replace(namespace._prefix, "")
 234
 235
 236 def get_model(model_name=None, directory=None, use_contexts=True):
 237     if directory is None:
 238         directory = os.getcwd()
 239
 240     contexts = 'yes' if use_contexts else 'no'
 241
 242     if model_name is None:
 243         storage = RDF.MemoryStorage(options_string="contexts='{}'".format(contexts))
 244         logger.info("Using RDF Memory model")
 245     else:
 246         options = "contexts='{0}',hash-type='bdb',dir='{1}'".format(contexts, directory)
 247         storage = RDF.HashStorage(model_name,
 248                       options=options)
 249         logger.info("Using {0} with options {1}".format(model_name, options))
 250     model = RDF.Model(storage)
 251     return model
 252
 253
 254 def load_into_model(model, parser_name, path, ns=None):
 255     if type(ns) in types.StringTypes:
 256         ns = RDF.Uri(ns)
 257
 258     if isinstance(path, RDF.Node):
 259         if path.is_resource():
 260             path = str(path.uri)
 261         else:
 262             raise ValueError("url to load can't be a RDF literal")
 263
 264     url_parts = list(urllib.parse.urlparse(path))
 265     if len(url_parts[0]) == 0 or url_parts[0] == 'file':
 266         url_parts[0] = 'file'
 267         url_parts[2] = os.path.abspath(url_parts[2])
 268     if parser_name is None or parser_name == 'guess':
 269         parser_name = guess_parser_by_extension(path)
 270     url = urllib.parse.urlunparse(url_parts)
 271     logger.info("Opening {0} with parser {1}".format(url, parser_name))
 272
 273     rdf_parser = RDF.Parser(name=parser_name)
 274
 275     statements = []
 276     retries = 3
 277     succeeded = False
 278     while retries > 0:
 279         try:
 280             retries -= 1
 281             statements = rdf_parser.parse_as_stream(url, ns)
 282             retries = 0
 283             succeeded = True
 284         except RDF.RedlandError as e:
 285             errmsg = "RDF.RedlandError: {0} {1} tries remaining"
 286             logger.error(errmsg.format(str(e), retries))
 287
 288     if not succeeded:
 289         logger.warn("Unable to download %s", url)
 290
 291     for s in statements:
 292         conditionally_add_statement(model, s, ns)
 293
 294 def load_string_into_model(model, parser_name, data, ns=None):
 295     ns = fixup_namespace(ns)
 296     logger.debug("load_string_into_model parser={0}, len={1}".format(
 297         parser_name, len(data)))
 298     rdf_parser = RDF.Parser(name=str(parser_name))
 299
 300     for s in rdf_parser.parse_string_as_stream(data, ns):
 301         conditionally_add_statement(model, s, ns)
 302
 303
 304 def fixup_namespace(ns):
 305     if ns is None:
 306         ns = RDF.Uri("http://localhost/")
 307     elif type(ns) in types.StringTypes:
 308         ns = RDF.Uri(ns)
 309     elif not(isinstance(ns, RDF.Uri)):
 310         errmsg = "Namespace should be string or uri not {0}"
 311         raise ValueError(errmsg.format(str(type(ns))))
 312     return ns
 313
 314
 315 def conditionally_add_statement(model, s, ns):
 316     imports = owlNS['imports']
 317     if s.predicate == imports:
 318         obj = str(s.object)
 319         logger.info("Importing %s" % (obj,))
 320         load_into_model(model, None, obj, ns)
 321     if s.object.is_literal():
 322         value_type = get_node_type(s.object)
 323         if value_type == 'string':
 324             s.object = sanitize_literal(s.object)
 325     model.add_statement(s)
 326
 327
 328 def add_default_schemas(model, schema_path=None):
 329     """Add default schemas to a model
 330     Looks for turtle files in either htsworkflow/util/schemas
 331     or in the list of directories provided in schema_path
 332     """
 333
 334     schemas = resource_listdir(__name__, 'schemas')
 335     for s in schemas:
 336         schema = resource_string(__name__,  'schemas/' + s)
 337         namespace = 'file://localhost/htsworkflow/schemas/'+s
 338         add_schema(model, schema, namespace)
 339
 340     if schema_path:
 341         if type(schema_path) in types.StringTypes:
 342             schema_path = [schema_path]
 343
 344         for path in schema_path:
 345             for pathname in glob(os.path.join(path, '*.turtle')):
 346                 url = 'file://' + os.path.splitext(pathname)[0]
 347                 stream = open(pathname, 'r')
 348                 add_schema(model, stream, url)
 349                 stream.close()
 350
 351 def add_schema(model, schema, url):
 352     """Add a schema to a model.
 353
 354     Main difference from 'load_into_model' is it tags it with
 355     a RDFlib context so I can remove them later.
 356     """
 357     parser = RDF.Parser(name='turtle')
 358     context = RDF.Node(RDF.Uri(SCHEMAS_URL))
 359     for s in parser.parse_string_as_stream(schema, url):
 360         try:
 361             model.append(s, context)
 362         except RDF.RedlandError as e:
 363             logger.error("%s with %s", str(e), str(s))
 364
 365
 366 def remove_schemas(model):
 367     """Remove statements labeled with our schema context"""
 368     context = RDF.Node(RDF.Uri(SCHEMAS_URL))
 369     model.context_remove_statements(context)
 370
 371
 372 def sanitize_literal(node):
 373     """Clean up a literal string
 374     """
 375     if not isinstance(node, RDF.Node):
 376         raise ValueError("sanitize_literal only works on RDF.Nodes")
 377
 378     s = node.literal_value['string']
 379     if len(s) > 0:
 380         element = lxml.html.fromstring(s)
 381         cleaner = lxml.html.clean.Cleaner(page_structure=False)
 382         element = cleaner.clean_html(element)
 383         text = lxml.html.tostring(element)
 384         p_len = 3
 385         slash_p_len = 4
 386
 387         args = {'literal': text[p_len:-slash_p_len]}
 388     else:
 389         args = {'literal': ''}
 390     datatype = node.literal_value['datatype']
 391     if datatype is not None:
 392         args['datatype'] = datatype
 393     language = node.literal_value['language']
 394     if language is not None:
 395         args['language'] = language
 396     return RDF.Node(**args)
 397
 398
 399 def guess_parser(content_type, pathname):
 400     if content_type in ('application/rdf+xml',):
 401         return 'rdfxml'
 402     elif content_type in ('application/x-turtle',):
 403         return 'turtle'
 404     elif content_type in ('text/html',):
 405         return 'rdfa'
 406     elif content_type is None or content_type in ('text/plain',):
 407         return guess_parser_by_extension(pathname)
 408
 409 def guess_parser_by_extension(pathname):
 410     _, ext = os.path.splitext(pathname)
 411     if ext in ('.xml', '.rdf'):
 412         return 'rdfxml'
 413     elif ext in ('.html',):
 414         return 'rdfa'
 415     elif ext in ('.turtle',):
 416         return 'turtle'
 417     return 'guess'
 418
 419 def get_serializer(name='turtle'):
 420     """Return a serializer with our standard prefixes loaded
 421     """
 422     writer = RDF.Serializer(name=name)
 423     # really standard stuff
 424     writer.set_namespace('rdf', rdfNS._prefix)
 425     writer.set_namespace('rdfs', rdfsNS._prefix)
 426     writer.set_namespace('owl', owlNS._prefix)
 427     writer.set_namespace('dc', dcNS._prefix)
 428     writer.set_namespace('xml', xmlNS._prefix)
 429     writer.set_namespace('xsd', xsdNS._prefix)
 430     writer.set_namespace('vs', vsNS._prefix)
 431     writer.set_namespace('wot', wotNS._prefix)
 432
 433     # should these be here, kind of specific to an application
 434     writer.set_namespace('htswlib', libraryOntology._prefix)
 435     writer.set_namespace('ucscSubmission', submissionOntology._prefix)
 436     writer.set_namespace('ucscDaf', dafTermOntology._prefix)
 437     writer.set_namespace('geoSoft', geoSoftNS._prefix)
 438     writer.set_namespace('encode3', encode3NS._prefix)
 439     return writer
 440
 441 def get_turtle_header():
 442     """Return a turtle header with our typical namespaces
 443     """
 444     serializer = get_serializer()
 445     empty = get_model()
 446     return serializer.serialize_model_to_string(empty)
 447
 448 def dump_model(model, destination=None):
 449     if destination is None:
 450         destination = sys.stdout
 451     serializer = get_serializer()
 452     destination.write(serializer.serialize_model_to_string(model))
 453     destination.write(os.linesep)