htsworkflow/util/rdfhelp.py

   1 """Helper features for working with librdf
   2 """
   3 from __future__ import print_function, absolute_import
   4
   5 import collections
   6 from datetime import datetime
   7 from glob import glob
   8 import six
   9 from six.moves import urllib
  10 import logging
  11 import os
  12 import sys
  13 import types
  14 from pkg_resources import resource_listdir, resource_string
  15
  16 from rdflib import ConjunctiveGraph, Graph, Literal, BNode, URIRef, Namespace
  17 from rdflib.namespace import ClosedNamespace
  18
  19 import lxml.html
  20 import lxml.html.clean
  21
  22 from .rdfns import XMLNS
  23
  24 logger = logging.getLogger(__name__)
  25
  26 from htsworkflow.util.rdfns import *
  27
  28 SCHEMAS_URL='http://jumpgate.caltech.edu/phony/schemas'
  29 INFERENCE_URL='http://jumpgate.caltech.edu/phony/inference'
  30
  31 ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
  32 ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
  33
  34 def sparql_query(model, query_filename, output_format='text'):
  35     """Execute sparql query from file
  36     """
  37     logger.info("Opening: %s" % (query_filename,))
  38     query_body = open(query_filename, 'r').read()
  39     query = RDF.SPARQLQuery(query_body)
  40     results = query.execute(model)
  41     if output_format == 'html':
  42         html_query_results(results)
  43     else:
  44         display_query_results(results)
  45
  46
  47 def display_query_results(results):
  48     """A very simple display of sparql query results showing name value pairs
  49     """
  50     for row in results:
  51         for k, v in row.items()[::-1]:
  52             print("{0}: {1}".format(k, v))
  53         print()
  54
  55 def html_query_results(result_stream):
  56     from django.conf import settings
  57     from django.template import Context, loader
  58
  59     # I did this because I couldn't figure out how to
  60     # get simplify_rdf into the django template as a filter
  61     class Simplified(object):
  62         def __init__(self, value):
  63             self.simple = simplify_rdf(value)
  64             if value.is_resource():
  65                 self.url = value
  66             else:
  67                 self.url = None
  68
  69     template = loader.get_template('rdf_report.html')
  70     results = []
  71     for row in result_stream:
  72         new_row = collections.OrderedDict()
  73         row_urls = []
  74         for k,v in row.items():
  75             new_row[k] = Simplified(v)
  76         results.append(new_row)
  77     context = Context({'results': results,})
  78     print(template.render(context))
  79
  80
  81 def get_node_type(node):
  82     """Return just the base name of a XSD datatype:
  83     e.g. http://www.w3.org/2001/XMLSchema#integer -> integer
  84     """
  85     # chop off xml schema declaration
  86     value_type = node.datatype
  87     if value_type is None:
  88         return "string"
  89     else:
  90         return value_type.replace(str(XSD), '').lower()
  91
  92
  93 def simplify_rdf(value):
  94     """Return a short name for a RDF object
  95     e.g. The last part of a URI or an untyped string.
  96     """
  97     if isinstance(value, Literal):
  98         name = value.value
  99     elif isinstance(value, BNode):
 100         name = '<BLANK>'
 101     elif isinstance(value, URIRef):
 102         name = split_uri(str(value))
 103     else:
 104         name = value
 105     return str(name)
 106
 107
 108 def simplify_uri(uri):
 109     """Split off the end of a uri
 110
 111     >>> simplify_uri('http://asdf.org/foo/bar')
 112     'bar'
 113     >>> simplify_uri('http://asdf.org/foo/bar#bleem')
 114     'bleem'
 115     >>> simplify_uri('http://asdf.org/foo/bar/')
 116     'bar'
 117     >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
 118     'was=foo'
 119     """
 120     if isinstance(uri, Literal) and uri.datatype not in (XSD.anyURI,):
 121         raise ValueError("Literal terms must be of URI type")
 122
 123     uri = str(uri)
 124
 125     parsed = urllib.parse.urlparse(uri)
 126     if len(parsed.query) > 0:
 127         return parsed.query
 128     elif len(parsed.fragment) > 0:
 129         return parsed.fragment
 130     elif len(parsed.path) > 0:
 131         for element in reversed(parsed.path.split('/')):
 132             if len(element) > 0:
 133                 return element
 134     raise ValueError("Unable to simplify %s" % (uri,))
 135
 136 def strip_namespace(namespace, term):
 137     """Remove the namespace portion of a term
 138
 139     returns None if they aren't in common
 140     """
 141     if not isinstance(namespace, (URIRef, Namespace, ClosedNamespace)):
 142         raise ValueError("Requires a URIRef namespace")
 143
 144     if isinstance(term, Literal) and term.datatype not in (XSD.anyURI,):
 145         raise ValueError("Term literals must be a URI type")
 146     elif not isinstance(term, URIRef):
 147         raise ValueError("Term must be a URI type")
 148
 149     term_s = str(term)
 150     if not term_s.startswith(str(namespace)):
 151         return None
 152     return term_s.replace(str(namespace), "")
 153
 154
 155 def load_into_model(model, parser_name, path, ns=None):
 156     if isinstance(ns, six.string_types):
 157         ns = URIRef(ns)
 158
 159     if isinstance(path, URIRef):
 160         path = str(path)
 161
 162     url_parts = list(urllib.parse.urlparse(path))
 163     if len(url_parts[0]) == 0 or url_parts[0] == 'file':
 164         url_parts[0] = 'file'
 165         url_parts[2] = os.path.abspath(url_parts[2])
 166     if parser_name is None or parser_name == 'guess':
 167         parser_name = guess_parser_by_extension(path)
 168     url = urllib.parse.urlunparse(url_parts)
 169     logger.info("Opening {0} with parser {1}".format(url, parser_name))
 170
 171     model.parse(url, format=parser_name, publicID=ns)
 172
 173
 174 def load_string_into_model(model, parser_name, data, ns=None):
 175     ns = fixup_namespace(ns)
 176     logger.debug("load_string_into_model parser={0}, len={1}".format(
 177         parser_name, len(data)))
 178
 179     model.parse(data=data, format=parser_name, publicID=ns)
 180     add_imports(model, ns)
 181
 182
 183 def fixup_namespace(ns):
 184     if ns is None:
 185         ns = URIRef("http://localhost/")
 186     elif isinstance(ns, six.string_types):
 187         ns = URIRef(ns)
 188     elif not(isinstance(ns, URIRef)):
 189         errmsg = "Namespace should be string or uri not {0}"
 190         raise ValueError(errmsg.format(str(type(ns))))
 191     return ns
 192
 193
 194 def add_imports(model, ns):
 195     for s, p, o in model.triples((None, OWL.imports, None)):
 196         if p == OWL.imports:
 197             model.remove((s, p, o))
 198             load_into_model(model, None, o, ns)
 199
 200 def add_default_schemas(model, schema_path=None):
 201     """Add default schemas to a model
 202     Looks for turtle files in either htsworkflow/util/schemas
 203     or in the list of directories provided in schema_path
 204     """
 205     schemas = resource_listdir(__name__, 'schemas')
 206     for s in schemas:
 207         schema = resource_string(__name__,  'schemas/' + s)
 208         if six.PY3:
 209             # files must be encoded utf-8
 210             schema = schema.decode('utf-8')
 211         namespace = 'file://localhost/htsworkflow/schemas/'+s
 212         add_schema(model, schema, namespace)
 213
 214     if schema_path:
 215         if type(schema_path) in types.StringTypes:
 216             schema_path = [schema_path]
 217
 218         for path in schema_path:
 219             for pathname in glob(os.path.join(path, '*.turtle')):
 220                 url = 'file://' + os.path.splitext(pathname)[0]
 221                 stream = open(pathname, 'rt')
 222                 add_schema(model, stream, url)
 223                 stream.close()
 224
 225 def add_schema(model, schema, url):
 226     """Add a schema to a model.
 227
 228     Main difference from 'load_into_model' is it tags it with
 229     a RDFlib context so I can remove them later.
 230     """
 231     if not isinstance(model, ConjunctiveGraph):
 232         raise ValueError("Schemas requires a graph that supports quads")
 233
 234     context = URIRef(SCHEMAS_URL)
 235     tmpmodel = Graph()
 236     tmpmodel.parse(data=schema, format='turtle', publicID=url)
 237     for s, p, o in tmpmodel:
 238         model.add((s, p, o, context))
 239
 240 def remove_schemas(model):
 241     """Remove statements labeled with our schema context"""
 242     context = URIRef(SCHEMAS_URL)
 243     for quad in model.triples((None, None, None, context)):
 244         model.remove(quad)
 245         #model.remove_context(context)
 246
 247 def sanitize_literal(node):
 248     """Clean up a literal string
 249     """
 250     if not isinstance(node, Literal):
 251         raise ValueError("sanitize_literal only works on Literals")
 252
 253     s = node.value
 254     if len(s) > 0:
 255         element = lxml.html.fromstring(s)
 256         cleaner = lxml.html.clean.Cleaner(page_structure=False)
 257         element = cleaner.clean_html(element)
 258         if six.PY3:
 259             text = lxml.html.tostring(element, encoding=str)
 260         else:
 261             text = lxml.html.tostring(element)
 262         p_len = 3
 263         slash_p_len = 4
 264
 265         value = text[p_len:-slash_p_len]
 266     else:
 267         value = ''
 268     args = {}
 269     if node.datatype is not None:
 270         args['datatype'] = node.datatype
 271     if node.language is not None:
 272         args['lang'] = node.language
 273     return Literal(value, **args)
 274
 275
 276 def guess_parser(content_type, pathname):
 277     if content_type in ('application/rdf+xml',):
 278         return 'rdfxml'
 279     elif content_type in ('application/x-turtle',):
 280         return 'turtle'
 281     elif content_type in ('text/html',):
 282         return 'rdfa'
 283     elif content_type is None or content_type in ('text/plain',):
 284         return guess_parser_by_extension(pathname)
 285
 286 def guess_parser_by_extension(pathname):
 287     _, ext = os.path.splitext(pathname)
 288     if ext in ('.xml', '.rdf'):
 289         return 'rdfxml'
 290     elif ext in ('.html',):
 291         return 'rdfa'
 292     elif ext in ('.turtle',):
 293         return 'turtle'
 294     return 'guess'
 295
 296 def add_default_namespaces(model):
 297     """Return a serializer with our standard prefixes loaded
 298     """
 299     model.bind('rdf', RDF)
 300     model.bind('rdfs', RDFS)
 301     model.bind('owl', OWL)
 302     model.bind('dc', DC)
 303     model.bind('xml', XMLNS)
 304     model.bind('xsd', XSD)
 305     model.bind('vs', VS)
 306     model.bind('wot', WOT)
 307
 308     # should these be here, kind of specific to an application
 309     model.bind('htswlib', libraryOntology)
 310     model.bind('ucscSubmission', submissionOntology)
 311     model.bind('ucscDaf', dafTermOntology)
 312     model.bind('geoSoft', geoSoftNS)
 313     model.bind('encode3', encode3NS)
 314     return model
 315
 316 def get_turtle_header():
 317     """Return a turtle header with our typical namespaces
 318     """
 319     empty = get_model()
 320     add_default_namespaces(model)
 321     return model.serialize(format='turtle')
 322
 323 def dump_model(model, destination=None):
 324     if destination is None:
 325         destination = sys.stdout
 326     add_default_namespaces(model)
 327     model.serialize(destination, format='turtle')