htsworkflow/util/rdfhelp.py

   1 """Helper features for working with librdf
   2 """
   3 from __future__ import print_function
   4
   5 import collections
   6 from datetime import datetime
   7 from glob import glob
   8 import six
   9 from six.moves import urllib
  10 import logging
  11 import os
  12 import sys
  13 import types
  14 from pkg_resources import resource_listdir, resource_string
  15
  16 from rdflib import ConjunctiveGraph, Graph, Literal, BNode, URIRef, Namespace
  17 from rdflib.namespace import ClosedNamespace
  18
  19 import lxml.html
  20 import lxml.html.clean
  21
  22 logger = logging.getLogger(__name__)
  23
  24 from htsworkflow.util.rdfns import *
  25
# Context URI that add_schema() attaches to every schema statement so that
# remove_schemas() can locate those statements again.
SCHEMAS_URL='http://jumpgate.caltech.edu/phony/schemas'
# NOTE(review): not referenced in the visible part of this module; presumably
# the context used for inferred statements -- confirm against callers.
INFERENCE_URL='http://jumpgate.caltech.edu/phony/inference'

# strftime/strptime patterns for ISO 8601 style timestamps, with (%f) and
# without fractional seconds.
ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
  31
  32 def sparql_query(model, query_filename, output_format='text'):
  33     """Execute sparql query from file
  34     """
  35     logger.info("Opening: %s" % (query_filename,))
  36     query_body = open(query_filename, 'r').read()
  37     query = RDF.SPARQLQuery(query_body)
  38     results = query.execute(model)
  39     if output_format == 'html':
  40         html_query_results(results)
  41     else:
  42         display_query_results(results)
  43
  44
  45 def display_query_results(results):
  46     """A very simple display of sparql query results showing name value pairs
  47     """
  48     for row in results:
  49         for k, v in row.items()[::-1]:
  50             print("{0}: {1}".format(k, v))
  51         print()
  52
  53 def html_query_results(result_stream):
  54     from django.conf import settings
  55     from django.template import Context, loader
  56
  57     # I did this because I couldn't figure out how to
  58     # get simplify_rdf into the django template as a filter
  59     class Simplified(object):
  60         def __init__(self, value):
  61             self.simple = simplify_rdf(value)
  62             if value.is_resource():
  63                 self.url = value
  64             else:
  65                 self.url = None
  66
  67     template = loader.get_template('rdf_report.html')
  68     results = []
  69     for row in result_stream:
  70         new_row = collections.OrderedDict()
  71         row_urls = []
  72         for k,v in row.items():
  73             new_row[k] = Simplified(v)
  74         results.append(new_row)
  75     context = Context({'results': results,})
  76     print(template.render(context))
  77
  78
  79 def get_node_type(node):
  80     """Return just the base name of a XSD datatype:
  81     e.g. http://www.w3.org/2001/XMLSchema#integer -> integer
  82     """
  83     # chop off xml schema declaration
  84     value_type = node.datatype
  85     if value_type is None:
  86         return "string"
  87     else:
  88         return value_type.replace(str(XSD), '').lower()
  89
  90
  91 def simplify_rdf(value):
  92     """Return a short name for a RDF object
  93     e.g. The last part of a URI or an untyped string.
  94     """
  95     if isinstance(value, Literal):
  96         name = value.value
  97     elif isinstance(value, BNode):
  98         name = '<BLANK>'
  99     elif isinstance(value, URIRef):
 100         name = split_uri(str(value))
 101     else:
 102         name = value
 103     return str(name)
 104
 105
 106 def simplify_uri(uri):
 107     """Split off the end of a uri
 108
 109     >>> simplify_uri('http://asdf.org/foo/bar')
 110     'bar'
 111     >>> simplify_uri('http://asdf.org/foo/bar#bleem')
 112     'bleem'
 113     >>> simplify_uri('http://asdf.org/foo/bar/')
 114     'bar'
 115     >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
 116     'was=foo'
 117     """
 118     if isinstance(uri, Literal) and uri.datatype not in (XSD.anyURI,):
 119         raise ValueError("Literal terms must be of URI type")
 120
 121     uri = str(uri)
 122
 123     parsed = urllib.parse.urlparse(uri)
 124     if len(parsed.query) > 0:
 125         return parsed.query
 126     elif len(parsed.fragment) > 0:
 127         return parsed.fragment
 128     elif len(parsed.path) > 0:
 129         for element in reversed(parsed.path.split('/')):
 130             if len(element) > 0:
 131                 return element
 132     raise ValueError("Unable to simplify %s" % (uri,))
 133
 134 def strip_namespace(namespace, term):
 135     """Remove the namespace portion of a term
 136
 137     returns None if they aren't in common
 138     """
 139     if not isinstance(namespace, (URIRef, Namespace, ClosedNamespace)):
 140         raise ValueError("Requires a URIRef namespace")
 141
 142     if isinstance(term, Literal) and term.datatype not in (XSD.anyURI,):
 143         raise ValueError("Term literals must be a URI type")
 144     elif not isinstance(term, URIRef):
 145         raise ValueError("Term must be a URI type")
 146
 147     term_s = str(term)
 148     if not term_s.startswith(str(namespace)):
 149         return None
 150     return term_s.replace(str(namespace), "")
 151
 152
 153 def load_into_model(model, parser_name, path, ns=None):
 154     if isinstance(ns, six.string_types):
 155         ns = URIRef(ns)
 156
 157     if isinstance(path, URIRef):
 158         path = str(path)
 159
 160     url_parts = list(urllib.parse.urlparse(path))
 161     if len(url_parts[0]) == 0 or url_parts[0] == 'file':
 162         url_parts[0] = 'file'
 163         url_parts[2] = os.path.abspath(url_parts[2])
 164     if parser_name is None or parser_name == 'guess':
 165         parser_name = guess_parser_by_extension(path)
 166     url = urllib.parse.urlunparse(url_parts)
 167     logger.info("Opening {0} with parser {1}".format(url, parser_name))
 168
 169     model.parse(url, format=parser_name, publicID=ns)
 170
 171
 172 def load_string_into_model(model, parser_name, data, ns=None):
 173     ns = fixup_namespace(ns)
 174     logger.debug("load_string_into_model parser={0}, len={1}".format(
 175         parser_name, len(data)))
 176
 177     model.parse(data=data, format=parser_name, publicID=ns)
 178     add_imports(model, ns)
 179
 180
 181 def fixup_namespace(ns):
 182     if ns is None:
 183         ns = URIRef("http://localhost/")
 184     elif isinstance(ns, six.string_types):
 185         ns = URIRef(ns)
 186     elif not(isinstance(ns, URIRef)):
 187         errmsg = "Namespace should be string or uri not {0}"
 188         raise ValueError(errmsg.format(str(type(ns))))
 189     return ns
 190
 191
 192 def add_imports(model, ns):
 193     for s, p, o in model.triples((None, OWL.imports, None)):
 194         if p == OWL.imports:
 195             model.remove((s, p, o))
 196             load_into_model(model, None, o, ns)
 197
 198 def add_default_schemas(model, schema_path=None):
 199     """Add default schemas to a model
 200     Looks for turtle files in either htsworkflow/util/schemas
 201     or in the list of directories provided in schema_path
 202     """
 203     schemas = resource_listdir(__name__, 'schemas')
 204     for s in schemas:
 205         schema = resource_string(__name__,  'schemas/' + s)
 206         if six.PY3:
 207             # files must be encoded utf-8
 208             schema = schema.decode('utf-8')
 209         namespace = 'file://localhost/htsworkflow/schemas/'+s
 210         add_schema(model, schema, namespace)
 211
 212     if schema_path:
 213         if type(schema_path) in types.StringTypes:
 214             schema_path = [schema_path]
 215
 216         for path in schema_path:
 217             for pathname in glob(os.path.join(path, '*.turtle')):
 218                 url = 'file://' + os.path.splitext(pathname)[0]
 219                 stream = open(pathname, 'rt')
 220                 add_schema(model, stream, url)
 221                 stream.close()
 222
 223 def add_schema(model, schema, url):
 224     """Add a schema to a model.
 225
 226     Main difference from 'load_into_model' is it tags it with
 227     a RDFlib context so I can remove them later.
 228     """
 229     if not isinstance(model, ConjunctiveGraph):
 230         raise ValueError("Schemas requires a graph that supports quads")
 231
 232     context = URIRef(SCHEMAS_URL)
 233     tmpmodel = Graph()
 234     tmpmodel.parse(data=schema, format='turtle', publicID=url)
 235     for s, p, o in tmpmodel:
 236         model.add((s, p, o, context))
 237
 238 def remove_schemas(model):
 239     """Remove statements labeled with our schema context"""
 240     context = URIRef(SCHEMAS_URL)
 241     for quad in model.triples((None, None, None, context)):
 242         model.remove(quad)
 243         #model.remove_context(context)
 244
 245 def sanitize_literal(node):
 246     """Clean up a literal string
 247     """
 248     if not isinstance(node, Literal):
 249         raise ValueError("sanitize_literal only works on Literals")
 250
 251     s = node.value
 252     if len(s) > 0:
 253         element = lxml.html.fromstring(s)
 254         cleaner = lxml.html.clean.Cleaner(page_structure=False)
 255         element = cleaner.clean_html(element)
 256         if six.PY3:
 257             text = lxml.html.tostring(element, encoding=str)
 258         else:
 259             text = lxml.html.tostring(element)
 260         p_len = 3
 261         slash_p_len = 4
 262
 263         value = text[p_len:-slash_p_len]
 264     else:
 265         value = ''
 266     args = {}
 267     if node.datatype is not None:
 268         args['datatype'] = node.datatype
 269     if node.language is not None:
 270         args['lang'] = node.language
 271     return Literal(value, **args)
 272
 273
 274 def guess_parser(content_type, pathname):
 275     if content_type in ('application/rdf+xml',):
 276         return 'rdfxml'
 277     elif content_type in ('application/x-turtle',):
 278         return 'turtle'
 279     elif content_type in ('text/html',):
 280         return 'rdfa'
 281     elif content_type is None or content_type in ('text/plain',):
 282         return guess_parser_by_extension(pathname)
 283
 284 def guess_parser_by_extension(pathname):
 285     _, ext = os.path.splitext(pathname)
 286     if ext in ('.xml', '.rdf'):
 287         return 'rdfxml'
 288     elif ext in ('.html',):
 289         return 'rdfa'
 290     elif ext in ('.turtle',):
 291         return 'turtle'
 292     return 'guess'
 293
 294 def add_default_namespaces(model):
 295     """Return a serializer with our standard prefixes loaded
 296     """
 297     model.bind('rdf', RDF)
 298     model.bind('rdfs', RDFS)
 299     model.bind('owl', OWL)
 300     model.bind('dc', DC)
 301     model.bind('xml', XML)
 302     model.bind('xsd', XSD)
 303     model.bind('vs', VS)
 304     model.bind('wot', WOT)
 305
 306     # should these be here, kind of specific to an application
 307     model.bind('htswlib', libraryOntology)
 308     model.bind('ucscSubmission', submissionOntology)
 309     model.bind('ucscDaf', dafTermOntology)
 310     model.bind('geoSoft', geoSoftNS)
 311     model.bind('encode3', encode3NS)
 312     return model
 313
 314 def get_turtle_header():
 315     """Return a turtle header with our typical namespaces
 316     """
 317     empty = get_model()
 318     add_default_namespaces(model)
 319     return model.serialize(format='turtle')
 320
 321 def dump_model(model, destination=None):
 322     if destination is None:
 323         destination = sys.stdout
 324     add_default_namespaces(model)
 325     model.serialize(destination, format='turtle')