htsworkflow/util/rdfhelp.py

   1 """Helper features for working with librdf
   2 """
   3 import collections
   4 from datetime import datetime
   5 from glob import glob
   6 from urlparse import urlparse, urlunparse
   7 from urllib2 import urlopen
   8 import logging
   9 import os
  10 import sys
  11 import types
  12 from pkg_resources import resource_listdir, resource_string
  13
  14 import lxml.html
  15 import lxml.html.clean
  16 import RDF
  17
  18 logger = logging.getLogger(__name__)
  19
  20 from htsworkflow.util.rdfns import *
  21
# URI of the context node that add_schema() attaches to schema statements
# and that remove_schemas() uses to delete them again.
SCHEMAS_URL='http://jumpgate.caltech.edu/phony/schemas'
# NOTE(review): not referenced in the code visible here; presumably the
# matching context URI for inferred statements -- confirm against callers.
INFERENCE_URL='http://jumpgate.caltech.edu/phony/inference'

# strftime/strptime patterns used for xsd:dateTime literals:
# with microseconds (tried first when parsing) ...
ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
# ... and without microseconds.
ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
  27
  28 def sparql_query(model, query_filename, output_format='text'):
  29     """Execute sparql query from file
  30     """
  31     logger.info("Opening: %s" % (query_filename,))
  32     query_body = open(query_filename, 'r').read()
  33     query = RDF.SPARQLQuery(query_body)
  34     results = query.execute(model)
  35     if output_format == 'html':
  36         html_query_results(results)
  37     else:
  38         display_query_results(results)
  39
  40
  41 def display_query_results(results):
  42     """A very simple display of sparql query results showing name value pairs
  43     """
  44     for row in results:
  45         for k, v in row.items()[::-1]:
  46             print "{0}: {1}".format(k, v)
  47         print
  48
  49 def html_query_results(result_stream):
  50     from django.conf import settings
  51     from django.template import Context, loader
  52
  53     # I did this because I couldn't figure out how to
  54     # get simplify_rdf into the django template as a filter
  55     class Simplified(object):
  56         def __init__(self, value):
  57             self.simple = simplify_rdf(value)
  58             if value.is_resource():
  59                 self.url = value
  60             else:
  61                 self.url = None
  62
  63     template = loader.get_template('rdf_report.html')
  64     results = []
  65     for row in result_stream:
  66         new_row = collections.OrderedDict()
  67         row_urls = []
  68         for k,v in row.items():
  69             new_row[k] = Simplified(v)
  70         results.append(new_row)
  71     context = Context({'results': results,})
  72     print template.render(context)
  73
  74 def blankOrUri(value=None):
  75     """Return a blank node for None or a resource node for strings.
  76     """
  77     node = None
  78     if value is None:
  79         node = RDF.Node()
  80     elif type(value) in types.StringTypes:
  81         node = RDF.Node(uri_string=value)
  82     elif isinstance(value, RDF.Node):
  83         node = value
  84
  85     return node
  86
  87
  88 def toTypedNode(value, language="en"):
  89     """Convert a python variable to a RDF Node with its closest xsd type
  90     """
  91     if type(value) == types.BooleanType:
  92         value_type = xsdNS['boolean'].uri
  93         if value:
  94             value = u'1'
  95         else:
  96             value = u'0'
  97     elif type(value) in (types.IntType, types.LongType):
  98         value_type = xsdNS['decimal'].uri
  99         value = unicode(value)
 100     elif type(value) == types.FloatType:
 101         value_type = xsdNS['float'].uri
 102         value = unicode(value)
 103     elif isinstance(value, datetime):
 104         value_type = xsdNS['dateTime'].uri
 105         if value.microsecond == 0:
 106             value = value.strftime(ISOFORMAT_SHORT)
 107         else:
 108             value = value.strftime(ISOFORMAT_MS)
 109     else:
 110         value_type = None
 111         value = unicode(value)
 112
 113     if value_type is not None:
 114         node = RDF.Node(literal=value, datatype=value_type)
 115     else:
 116         node = RDF.Node(literal=unicode(value).encode('utf-8'), language=language)
 117     return node
 118
 119
 120 def fromTypedNode(node):
 121     """Convert a typed RDF Node to its closest python equivalent
 122     """
 123     if not isinstance(node, RDF.Node):
 124         return node
 125     if node.is_resource():
 126         return node
 127
 128     value_type = get_node_type(node)
 129     literal = node.literal_value['string']
 130     literal_lower = literal.lower()
 131
 132     if value_type == 'boolean':
 133         if literal_lower in ('1', 'yes', 'true'):
 134             return True
 135         elif literal_lower in ('0', 'no', 'false'):
 136             return False
 137         else:
 138             raise ValueError("Unrecognized boolean %s" % (literal,))
 139     elif value_type == 'integer':
 140         return int(literal)
 141     elif value_type == 'decimal' and literal.find('.') == -1:
 142         return int(literal)
 143     elif value_type in ('decimal', 'float', 'double'):
 144         return float(literal)
 145     elif value_type in ('string'):
 146         return literal
 147     elif value_type in ('dateTime'):
 148         try:
 149             return datetime.strptime(literal, ISOFORMAT_MS)
 150         except ValueError, _:
 151             return datetime.strptime(literal, ISOFORMAT_SHORT)
 152     return literal
 153
 154
 155 def get_node_type(node):
 156     """Return just the base name of a XSD datatype:
 157     e.g. http://www.w3.org/2001/XMLSchema#integer -> integer
 158     """
 159     # chop off xml schema declaration
 160     value_type = node.literal_value['datatype']
 161     if value_type is None:
 162         return "string"
 163     else:
 164         value_type = str(value_type)
 165         return value_type.replace(str(xsdNS[''].uri), '')
 166
 167
 168 def simplify_rdf(value):
 169     """Return a short name for a RDF object
 170     e.g. The last part of a URI or an untyped string.
 171     """
 172     if isinstance(value, RDF.Node):
 173         if value.is_resource():
 174             name = simplify_uri(str(value.uri))
 175         elif value.is_blank():
 176             name = '<BLANK>'
 177         else:
 178             name = value.literal_value['string']
 179     elif isinstance(value, RDF.Uri):
 180         name = split_uri(str(value))
 181     else:
 182         name = value
 183     return str(name)
 184
 185
 186 def simplify_uri(uri):
 187     """Split off the end of a uri
 188
 189     >>> simplify_uri('http://asdf.org/foo/bar')
 190     'bar'
 191     >>> simplify_uri('http://asdf.org/foo/bar#bleem')
 192     'bleem'
 193     >>> simplify_uri('http://asdf.org/foo/bar/')
 194     'bar'
 195     >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
 196     'was=foo'
 197     """
 198     if isinstance(uri, RDF.Node):
 199         if uri.is_resource():
 200             uri = uri.uri
 201         else:
 202             raise ValueError("Can't simplify an RDF literal")
 203     if isinstance(uri, RDF.Uri):
 204         uri = str(uri)
 205
 206     parsed = urlparse(uri)
 207     if len(parsed.query) > 0:
 208         return parsed.query
 209     elif len(parsed.fragment) > 0:
 210         return parsed.fragment
 211     elif len(parsed.path) > 0:
 212         for element in reversed(parsed.path.split('/')):
 213             if len(element) > 0:
 214                 return element
 215     raise ValueError("Unable to simplify %s" % (uri,))
 216
 217 def strip_namespace(namespace, term):
 218     """Remove the namespace portion of a term
 219
 220     returns None if they aren't in common
 221     """
 222     if isinstance(term, RDF.Node):
 223         if  term.is_resource():
 224             term = term.uri
 225         else:
 226             raise ValueError("This works on resources")
 227     elif not isinstance(term, RDF.Uri):
 228         raise ValueError("This works on resources")
 229     term_s = str(term)
 230     if not term_s.startswith(namespace._prefix):
 231         return None
 232     return term_s.replace(namespace._prefix, "")
 233
 234
 235 def get_model(model_name=None, directory=None, use_contexts=True):
 236     if directory is None:
 237         directory = os.getcwd()
 238
 239     contexts = 'yes' if use_contexts else 'no'
 240
 241     if model_name is None:
 242         storage = RDF.MemoryStorage(options_string="contexts='{}'".format(contexts))
 243         logger.info("Using RDF Memory model")
 244     else:
 245         options = "contexts='{0}',hash-type='bdb',dir='{1}'".format(contexts, directory)
 246         storage = RDF.HashStorage(model_name,
 247                       options=options)
 248         logger.info("Using {0} with options {1}".format(model_name, options))
 249     model = RDF.Model(storage)
 250     return model
 251
 252
 253 def load_into_model(model, parser_name, path, ns=None):
 254     if type(ns) in types.StringTypes:
 255         ns = RDF.Uri(ns)
 256
 257     if isinstance(path, RDF.Node):
 258         if path.is_resource():
 259             path = str(path.uri)
 260         else:
 261             raise ValueError("url to load can't be a RDF literal")
 262
 263     url_parts = list(urlparse(path))
 264     if len(url_parts[0]) == 0 or url_parts[0] == 'file':
 265         url_parts[0] = 'file'
 266         url_parts[2] = os.path.abspath(url_parts[2])
 267     if parser_name is None or parser_name == 'guess':
 268         parser_name = guess_parser_by_extension(path)
 269     url = urlunparse(url_parts)
 270     logger.info("Opening {0} with parser {1}".format(url, parser_name))
 271
 272     rdf_parser = RDF.Parser(name=parser_name)
 273
 274     statements = []
 275     retries = 3
 276     succeeded = False
 277     while retries > 0:
 278         try:
 279             retries -= 1
 280             statements = rdf_parser.parse_as_stream(url, ns)
 281             retries = 0
 282             succeeded = True
 283         except RDF.RedlandError, e:
 284             errmsg = "RDF.RedlandError: {0} {1} tries remaining"
 285             logger.error(errmsg.format(str(e), retries))
 286
 287     if not succeeded:
 288         logger.warn("Unable to download %s", url)
 289
 290     for s in statements:
 291         conditionally_add_statement(model, s, ns)
 292
 293 def load_string_into_model(model, parser_name, data, ns=None):
 294     ns = fixup_namespace(ns)
 295     logger.debug("load_string_into_model parser={0}, len={1}".format(
 296         parser_name, len(data)))
 297     rdf_parser = RDF.Parser(name=parser_name)
 298
 299     for s in rdf_parser.parse_string_as_stream(data, ns):
 300         conditionally_add_statement(model, s, ns)
 301
 302
 303 def fixup_namespace(ns):
 304     if ns is None:
 305         ns = RDF.Uri("http://localhost/")
 306     elif type(ns) in types.StringTypes:
 307         ns = RDF.Uri(ns)
 308     elif not(isinstance(ns, RDF.Uri)):
 309         errmsg = "Namespace should be string or uri not {0}"
 310         raise ValueError(errmsg.format(str(type(ns))))
 311     return ns
 312
 313
 314 def conditionally_add_statement(model, s, ns):
 315     imports = owlNS['imports']
 316     if s.predicate == imports:
 317         obj = str(s.object)
 318         logger.info("Importing %s" % (obj,))
 319         load_into_model(model, None, obj, ns)
 320     if s.object.is_literal():
 321         value_type = get_node_type(s.object)
 322         if value_type == 'string':
 323             s.object = sanitize_literal(s.object)
 324     model.add_statement(s)
 325
 326
 327 def add_default_schemas(model, schema_path=None):
 328     """Add default schemas to a model
 329     Looks for turtle files in either htsworkflow/util/schemas
 330     or in the list of directories provided in schema_path
 331     """
 332
 333     schemas = resource_listdir(__name__, 'schemas')
 334     for s in schemas:
 335         schema = resource_string(__name__,  'schemas/' + s)
 336         namespace = 'file://localhost/htsworkflow/schemas/'+s
 337         add_schema(model, schema, namespace)
 338
 339     if schema_path:
 340         if type(schema_path) in types.StringTypes:
 341             schema_path = [schema_path]
 342
 343         for path in schema_path:
 344             for pathname in glob(os.path.join(path, '*.turtle')):
 345                 url = 'file://' + os.path.splitext(pathname)[0]
 346                 stream = open(pathname, 'r')
 347                 add_schema(model, stream, url)
 348                 stream.close()
 349
 350 def add_schema(model, schema, url):
 351     """Add a schema to a model.
 352
 353     Main difference from 'load_into_model' is it tags it with
 354     a RDFlib context so I can remove them later.
 355     """
 356     parser = RDF.Parser(name='turtle')
 357     context = RDF.Node(RDF.Uri(SCHEMAS_URL))
 358     for s in parser.parse_string_as_stream(schema, url):
 359         try:
 360             model.append(s, context)
 361         except RDF.RedlandError as e:
 362             logger.error("%s with %s", str(e), str(s))
 363
 364
 365 def remove_schemas(model):
 366     """Remove statements labeled with our schema context"""
 367     context = RDF.Node(RDF.Uri(SCHEMAS_URL))
 368     model.context_remove_statements(context)
 369
 370
 371 def sanitize_literal(node):
 372     """Clean up a literal string
 373     """
 374     if not isinstance(node, RDF.Node):
 375         raise ValueError("sanitize_literal only works on RDF.Nodes")
 376
 377     s = node.literal_value['string']
 378     if len(s) > 0:
 379         element = lxml.html.fromstring(s)
 380         cleaner = lxml.html.clean.Cleaner(page_structure=False)
 381         element = cleaner.clean_html(element)
 382         text = lxml.html.tostring(element)
 383         p_len = 3
 384         slash_p_len = 4
 385
 386         args = {'literal': text[p_len:-slash_p_len]}
 387     else:
 388         args = {'literal': ''}
 389     datatype = node.literal_value['datatype']
 390     if datatype is not None:
 391         args['datatype'] = datatype
 392     language = node.literal_value['language']
 393     if language is not None:
 394         args['language'] = language
 395     return RDF.Node(**args)
 396
 397
 398 def guess_parser(content_type, pathname):
 399     if content_type in ('application/rdf+xml',):
 400         return 'rdfxml'
 401     elif content_type in ('application/x-turtle',):
 402         return 'turtle'
 403     elif content_type in ('text/html',):
 404         return 'rdfa'
 405     elif content_type is None or content_type in ('text/plain',):
 406         return guess_parser_by_extension(pathname)
 407
 408 def guess_parser_by_extension(pathname):
 409     _, ext = os.path.splitext(pathname)
 410     if ext in ('.xml', '.rdf'):
 411         return 'rdfxml'
 412     elif ext in ('.html',):
 413         return 'rdfa'
 414     elif ext in ('.turtle',):
 415         return 'turtle'
 416     return 'guess'
 417
 418 def get_serializer(name='turtle'):
 419     """Return a serializer with our standard prefixes loaded
 420     """
 421     writer = RDF.Serializer(name=name)
 422     # really standard stuff
 423     writer.set_namespace('rdf', rdfNS._prefix)
 424     writer.set_namespace('rdfs', rdfsNS._prefix)
 425     writer.set_namespace('owl', owlNS._prefix)
 426     writer.set_namespace('dc', dcNS._prefix)
 427     writer.set_namespace('xml', xmlNS._prefix)
 428     writer.set_namespace('xsd', xsdNS._prefix)
 429     writer.set_namespace('vs', vsNS._prefix)
 430     writer.set_namespace('wot', wotNS._prefix)
 431
 432     # should these be here, kind of specific to an application
 433     writer.set_namespace('htswlib', libraryOntology._prefix)
 434     writer.set_namespace('ucscSubmission', submissionOntology._prefix)
 435     writer.set_namespace('ucscDaf', dafTermOntology._prefix)
 436     writer.set_namespace('geoSoft', geoSoftNS._prefix)
 437     writer.set_namespace('encode3', encode3NS._prefix)
 438     return writer
 439
 440 def get_turtle_header():
 441     """Return a turtle header with our typical namespaces
 442     """
 443     serializer = get_serializer()
 444     empty = get_model()
 445     return serializer.serialize_model_to_string(empty)
 446
 447 def dump_model(model, destination=None):
 448     if destination is None:
 449         destination = sys.stdout
 450     serializer = get_serializer()
 451     destination.write(serializer.serialize_model_to_string(model))
 452     destination.write(os.linesep)