htsworkflow/util/rdfhelp.py

   1 """Helper features for working with librdf
   2 """
   3 import collections
   4 from datetime import datetime
   5 from glob import glob
   6 from urlparse import urlparse, urlunparse
   7 from urllib2 import urlopen
   8 import logging
   9 import os
  10 import sys
  11 import types
  12
  13 import lxml.html
  14 import lxml.html.clean
  15 import RDF
  16
  17 logger = logging.getLogger(__name__)
  18
  19 from htsworkflow.util.rdfns import *
  20
  21 SCHEMAS_URL='http://jumpgate.caltech.edu/phony/schemas'
  22 INFERENCE_URL='http://jumpgate.caltech.edu/phony/inference'
  23
  24 ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
  25 ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
  26
  27 def sparql_query(model, query_filename, output_format='text'):
  28     """Execute sparql query from file
  29     """
  30     logger.info("Opening: %s" % (query_filename,))
  31     query_body = open(query_filename, 'r').read()
  32     query = RDF.SPARQLQuery(query_body)
  33     results = query.execute(model)
  34     if output_format == 'html':
  35         html_query_results(results)
  36     else:
  37         display_query_results(results)
  38
  39
  40 def display_query_results(results):
  41     """A very simple display of sparql query results showing name value pairs
  42     """
  43     for row in results:
  44         for k, v in row.items()[::-1]:
  45             print "{0}: {1}".format(k, v)
  46         print
  47
  48 def html_query_results(result_stream):
  49     from django.conf import settings
  50     from django.template import Context, loader
  51
  52     # I did this because I couldn't figure out how to
  53     # get simplify_rdf into the django template as a filter
  54     class Simplified(object):
  55         def __init__(self, value):
  56             self.simple = simplify_rdf(value)
  57             if value.is_resource():
  58                 self.url = value
  59             else:
  60                 self.url = None
  61
  62     template = loader.get_template('rdf_report.html')
  63     results = []
  64     for row in result_stream:
  65         new_row = collections.OrderedDict()
  66         row_urls = []
  67         for k,v in row.items():
  68             new_row[k] = Simplified(v)
  69         results.append(new_row)
  70     context = Context({'results': results,})
  71     print template.render(context)
  72
  73 def blankOrUri(value=None):
  74     """Return a blank node for None or a resource node for strings.
  75     """
  76     node = None
  77     if value is None:
  78         node = RDF.Node()
  79     elif type(value) in types.StringTypes:
  80         node = RDF.Node(uri_string=value)
  81     elif isinstance(value, RDF.Node):
  82         node = value
  83
  84     return node
  85
  86
  87 def toTypedNode(value):
  88     """Convert a python variable to a RDF Node with its closest xsd type
  89     """
  90     if type(value) == types.BooleanType:
  91         value_type = xsdNS['boolean'].uri
  92         if value:
  93             value = u'1'
  94         else:
  95             value = u'0'
  96     elif type(value) in (types.IntType, types.LongType):
  97         value_type = xsdNS['decimal'].uri
  98         value = unicode(value)
  99     elif type(value) == types.FloatType:
 100         value_type = xsdNS['float'].uri
 101         value = unicode(value)
 102     elif isinstance(value, datetime):
 103         value_type = xsdNS['dateTime'].uri
 104         if value.microsecond == 0:
 105             value = value.strftime(ISOFORMAT_SHORT)
 106         else:
 107             value = value.strftime(ISOFORMAT_MS)
 108     else:
 109         value_type = None
 110         value = unicode(value)
 111
 112     if value_type is not None:
 113         node = RDF.Node(literal=value, datatype=value_type)
 114     else:
 115         node = RDF.Node(literal=unicode(value).encode('utf-8'))
 116     return node
 117
 118
 119 def fromTypedNode(node):
 120     """Convert a typed RDF Node to its closest python equivalent
 121     """
 122     if not isinstance(node, RDF.Node):
 123         return node
 124     if node.is_resource():
 125         return node
 126
 127     value_type = get_node_type(node)
 128     literal = node.literal_value['string']
 129     literal_lower = literal.lower()
 130
 131     if value_type == 'boolean':
 132         if literal_lower in ('1', 'yes', 'true'):
 133             return True
 134         elif literal_lower in ('0', 'no', 'false'):
 135             return False
 136         else:
 137             raise ValueError("Unrecognized boolean %s" % (literal,))
 138     elif value_type == 'integer':
 139         return int(literal)
 140     elif value_type == 'decimal' and literal.find('.') == -1:
 141         return int(literal)
 142     elif value_type in ('decimal', 'float', 'double'):
 143         return float(literal)
 144     elif value_type in ('string'):
 145         return literal
 146     elif value_type in ('dateTime'):
 147         try:
 148             return datetime.strptime(literal, ISOFORMAT_MS)
 149         except ValueError, _:
 150             return datetime.strptime(literal, ISOFORMAT_SHORT)
 151     return literal
 152
 153
 154 def get_node_type(node):
 155     """Return just the base name of a XSD datatype:
 156     e.g. http://www.w3.org/2001/XMLSchema#integer -> integer
 157     """
 158     # chop off xml schema declaration
 159     value_type = node.literal_value['datatype']
 160     if value_type is None:
 161         return "string"
 162     else:
 163         value_type = str(value_type)
 164         return value_type.replace(str(xsdNS[''].uri), '')
 165
 166
 167 def simplify_rdf(value):
 168     """Return a short name for a RDF object
 169     e.g. The last part of a URI or an untyped string.
 170     """
 171     if isinstance(value, RDF.Node):
 172         if value.is_resource():
 173             name = simplify_uri(str(value.uri))
 174         elif value.is_blank():
 175             name = '<BLANK>'
 176         else:
 177             name = value.literal_value['string']
 178     elif isinstance(value, RDF.Uri):
 179         name = split_uri(str(value))
 180     else:
 181         name = value
 182     return str(name)
 183
 184
 185 def simplify_uri(uri):
 186     """Split off the end of a uri
 187
 188     >>> simplify_uri('http://asdf.org/foo/bar')
 189     'bar'
 190     >>> simplify_uri('http://asdf.org/foo/bar#bleem')
 191     'bleem'
 192     >>> simplify_uri('http://asdf.org/foo/bar/')
 193     'bar'
 194     >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
 195     'was=foo'
 196     """
 197     if isinstance(uri, RDF.Node):
 198         if uri.is_resource():
 199             uri = uri.uri
 200         else:
 201             raise ValueError("Can't simplify an RDF literal")
 202     if isinstance(uri, RDF.Uri):
 203         uri = str(uri)
 204
 205     parsed = urlparse(uri)
 206     if len(parsed.query) > 0:
 207         return parsed.query
 208     elif len(parsed.fragment) > 0:
 209         return parsed.fragment
 210     elif len(parsed.path) > 0:
 211         for element in reversed(parsed.path.split('/')):
 212             if len(element) > 0:
 213                 return element
 214     raise ValueError("Unable to simplify %s" % (uri,))
 215
 216 def stripNamespace(namespace, term):
 217     """Remove the namespace portion of a term
 218
 219     returns None if they aren't in common
 220     """
 221     if isinstance(term, RDF.Node):
 222         if  term.is_resource():
 223             term = term.uri
 224         else:
 225             raise ValueError("This works on resources")
 226     elif not isinstance(term, RDF.Uri):
 227         raise ValueError("This works on resources")
 228     term_s = str(term)
 229     if not term_s.startswith(namespace._prefix):
 230         return None
 231     return term_s.replace(namespace._prefix, "")
 232
 233
 234 def get_model(model_name=None, directory=None):
 235     if directory is None:
 236         directory = os.getcwd()
 237
 238     if model_name is None:
 239         storage = RDF.MemoryStorage(options_string="contexts='yes'")
 240         logger.info("Using RDF Memory model")
 241     else:
 242         options = "contexts='yes',hash-type='bdb',dir='{0}'".format(directory)
 243         storage = RDF.HashStorage(model_name,
 244                       options=options)
 245         logger.info("Using {0} with options {1}".format(model_name, options))
 246     model = RDF.Model(storage)
 247     return model
 248
 249
 250 def load_into_model(model, parser_name, path, ns=None):
 251     if type(ns) in types.StringTypes:
 252         ns = RDF.Uri(ns)
 253
 254     if isinstance(path, RDF.Node):
 255         if path.is_resource():
 256             path = str(path.uri)
 257         else:
 258             raise ValueError("url to load can't be a RDF literal")
 259
 260     url_parts = list(urlparse(path))
 261     if len(url_parts[0]) == 0 or url_parts[0] == 'file':
 262         url_parts[0] = 'file'
 263         url_parts[2] = os.path.abspath(url_parts[2])
 264     if parser_name is None or parser_name == 'guess':
 265         parser_name = guess_parser_by_extension(path)
 266     url = urlunparse(url_parts)
 267     logger.info("Opening {0} with parser {1}".format(url, parser_name))
 268
 269     rdf_parser = RDF.Parser(name=parser_name)
 270
 271     statements = []
 272     retries = 3
 273     while retries > 0:
 274         try:
 275             retries -= 1
 276             statements = rdf_parser.parse_as_stream(url, ns)
 277             retries = 0
 278         except RDF.RedlandError, e:
 279             errmsg = "RDF.RedlandError: {0} {1} tries remaining"
 280             logger.error(errmsg.format(str(e), retries))
 281
 282     for s in statements:
 283         conditionally_add_statement(model, s, ns)
 284
 285 def load_string_into_model(model, parser_name, data, ns=None):
 286     ns = fixup_namespace(ns)
 287     logger.debug("load_string_into_model parser={0}, len={1}".format(
 288         parser_name, len(data)))
 289     rdf_parser = RDF.Parser(name=parser_name)
 290
 291     for s in rdf_parser.parse_string_as_stream(data, ns):
 292         conditionally_add_statement(model, s, ns)
 293
 294
 295 def fixup_namespace(ns):
 296     if ns is None:
 297         ns = RDF.Uri("http://localhost/")
 298     elif type(ns) in types.StringTypes:
 299         ns = RDF.Uri(ns)
 300     elif not(isinstance(ns, RDF.Uri)):
 301         errmsg = "Namespace should be string or uri not {0}"
 302         raise ValueError(errmsg.format(str(type(ns))))
 303     return ns
 304
 305
 306 def conditionally_add_statement(model, s, ns):
 307     imports = owlNS['imports']
 308     if s.predicate == imports:
 309         obj = str(s.object)
 310         logger.info("Importing %s" % (obj,))
 311         load_into_model(model, None, obj, ns)
 312     if s.object.is_literal():
 313         value_type = get_node_type(s.object)
 314         if value_type == 'string':
 315             s.object = sanitize_literal(s.object)
 316     model.add_statement(s)
 317
 318
 319 def add_default_schemas(model, schema_path=None):
 320     """Add default schemas to a model
 321     Looks for turtle files in either htsworkflow/util/schemas
 322     or in the list of directories provided in schema_path
 323     """
 324
 325     if schema_path is None:
 326         path, _ = os.path.split(__file__)
 327         schema_path = [os.path.join(path, 'schemas')]
 328     elif type(schema_path) in types.StringTypes:
 329         schema_path = [schema_path]
 330
 331     for p in schema_path:
 332         for f in glob(os.path.join(p, '*.turtle')):
 333             add_schema(model, f)
 334
 335 def add_schema(model, filename):
 336     """Add a schema to a model.
 337
 338     Main difference from 'load_into_model' is it tags it with
 339     a RDFlib context so I can remove them later.
 340     """
 341     parser = RDF.Parser(name='turtle')
 342     context = RDF.Node(RDF.Uri(SCHEMAS_URL))
 343     url = 'file://' + filename
 344     for s in parser.parse_as_stream(url):
 345         try:
 346             model.append(s, context)
 347         except RDF.RedlandError as e:
 348             logger.error("%s with %s", str(e), str(s))
 349
 350
 351 def remove_schemas(model):
 352     """Remove statements labeled with our schema context"""
 353     context = RDF.Node(RDF.Uri(SCHEMAS_URL))
 354     model.context_remove_statements(context)
 355
 356
 357 def sanitize_literal(node):
 358     """Clean up a literal string
 359     """
 360     if not isinstance(node, RDF.Node):
 361         raise ValueError("sanitize_literal only works on RDF.Nodes")
 362
 363     s = node.literal_value['string']
 364     if len(s) > 0:
 365         element = lxml.html.fromstring(s)
 366         cleaner = lxml.html.clean.Cleaner(page_structure=False)
 367         element = cleaner.clean_html(element)
 368         text = lxml.html.tostring(element)
 369         p_len = 3
 370         slash_p_len = 4
 371
 372         args = {'literal': text[p_len:-slash_p_len]}
 373     else:
 374         args = {'literal': ''}
 375     datatype = node.literal_value['datatype']
 376     if datatype is not None:
 377         args['datatype'] = datatype
 378     language = node.literal_value['language']
 379     if language is not None:
 380         args['language'] = language
 381     return RDF.Node(**args)
 382
 383
 384 def guess_parser(content_type, pathname):
 385     if content_type in ('application/rdf+xml',):
 386         return 'rdfxml'
 387     elif content_type in ('application/x-turtle',):
 388         return 'turtle'
 389     elif content_type in ('text/html',):
 390         return 'rdfa'
 391     elif content_type is None or content_type in ('text/plain',):
 392         return guess_parser_by_extension(pathname)
 393
 394 def guess_parser_by_extension(pathname):
 395     _, ext = os.path.splitext(pathname)
 396     if ext in ('.xml', '.rdf'):
 397         return 'rdfxml'
 398     elif ext in ('.html',):
 399         return 'rdfa'
 400     elif ext in ('.turtle',):
 401         return 'turtle'
 402     return 'guess'
 403
 404 def get_serializer(name='turtle'):
 405     """Return a serializer with our standard prefixes loaded
 406     """
 407     writer = RDF.Serializer(name=name)
 408     # really standard stuff
 409     writer.set_namespace('rdf', rdfNS._prefix)
 410     writer.set_namespace('rdfs', rdfsNS._prefix)
 411     writer.set_namespace('owl', owlNS._prefix)
 412     writer.set_namespace('dc', dcNS._prefix)
 413     writer.set_namespace('xml', xmlNS._prefix)
 414     writer.set_namespace('xsd', xsdNS._prefix)
 415     writer.set_namespace('vs', vsNS._prefix)
 416     writer.set_namespace('wot', wotNS._prefix)
 417
 418     # should these be here, kind of specific to an application
 419     writer.set_namespace('libraryOntology', libraryOntology._prefix)
 420     writer.set_namespace('ucscSubmission', submissionOntology._prefix)
 421     writer.set_namespace('ucscDaf', dafTermOntology._prefix)
 422     return writer
 423
 424
 425 def dump_model(model, destination=None):
 426     if destination is None:
 427         destination = sys.stdout
 428     serializer = get_serializer()
 429     destination.write(serializer.serialize_model_to_string(model))
 430     destination.write(os.linesep)