htsworkflow/util/rdfhelp.py

   1 """Helper features for working with librdf
   2 """
   3 import collections
   4 from datetime import datetime
   5 from glob import glob
   6 from urlparse import urlparse, urlunparse
   7 from urllib2 import urlopen
   8 import logging
   9 import os
  10 import types
  11
  12 import lxml.html
  13 import lxml.html.clean
  14 import RDF
  15
  16 logger = logging.getLogger(__name__)
  17
  18 from htsworkflow.util.rdfns import *
  19
  20 SCHEMAS_URL='http://jumpgate.caltech.edu/phony/schemas'
  21 INFERENCE_URL='http://jumpgate.caltech.edu/phony/inference'
  22
  23 ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
  24 ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
  25
  26 def sparql_query(model, query_filename, output_format='text'):
  27     """Execute sparql query from file
  28     """
  29     logger.info("Opening: %s" % (query_filename,))
  30     query_body = open(query_filename, 'r').read()
  31     query = RDF.SPARQLQuery(query_body)
  32     results = query.execute(model)
  33     if output_format == 'html':
  34         html_query_results(results)
  35     else:
  36         display_query_results(results)
  37
  38
  39 def display_query_results(results):
  40     """A very simple display of sparql query results showing name value pairs
  41     """
  42     for row in results:
  43         for k, v in row.items()[::-1]:
  44             print "{0}: {1}".format(k, v)
  45         print
  46
  47 def html_query_results(result_stream):
  48     from django.conf import settings
  49     from django.template import Context, loader
  50
  51     # I did this because I couldn't figure out how to
  52     # get simplify_rdf into the django template as a filter
  53     class Simplified(object):
  54         def __init__(self, value):
  55             self.simple = simplify_rdf(value)
  56             if value.is_resource():
  57                 self.url = value
  58             else:
  59                 self.url = None
  60
  61     template = loader.get_template('rdf_report.html')
  62     results = []
  63     for row in result_stream:
  64         new_row = collections.OrderedDict()
  65         row_urls = []
  66         for k,v in row.items():
  67             new_row[k] = Simplified(v)
  68         results.append(new_row)
  69     context = Context({'results': results,})
  70     print template.render(context)
  71
  72 def blankOrUri(value=None):
  73     """Return a blank node for None or a resource node for strings.
  74     """
  75     node = None
  76     if value is None:
  77         node = RDF.Node()
  78     elif type(value) in types.StringTypes:
  79         node = RDF.Node(uri_string=value)
  80     elif isinstance(value, RDF.Node):
  81         node = value
  82
  83     return node
  84
  85
  86 def toTypedNode(value):
  87     """Convert a python variable to a RDF Node with its closest xsd type
  88     """
  89     if type(value) == types.BooleanType:
  90         value_type = xsdNS['boolean'].uri
  91         if value:
  92             value = u'1'
  93         else:
  94             value = u'0'
  95     elif type(value) in (types.IntType, types.LongType):
  96         value_type = xsdNS['decimal'].uri
  97         value = unicode(value)
  98     elif type(value) == types.FloatType:
  99         value_type = xsdNS['float'].uri
 100         value = unicode(value)
 101     elif isinstance(value, datetime):
 102         value_type = xsdNS['dateTime'].uri
 103         if value.microsecond == 0:
 104             value = value.strftime(ISOFORMAT_SHORT)
 105         else:
 106             value = value.strftime(ISOFORMAT_MS)
 107     else:
 108         value_type = None
 109         value = unicode(value)
 110
 111     if value_type is not None:
 112         node = RDF.Node(literal=value, datatype=value_type)
 113     else:
 114         node = RDF.Node(literal=unicode(value).encode('utf-8'))
 115     return node
 116
 117
 118 def fromTypedNode(node):
 119     """Convert a typed RDF Node to its closest python equivalent
 120     """
 121     if node is None:
 122         return None
 123
 124     value_type = get_node_type(node)
 125     literal = node.literal_value['string']
 126     literal_lower = literal.lower()
 127
 128     if value_type == 'boolean':
 129         if literal_lower in ('1', 'yes', 'true'):
 130             return True
 131         elif literal_lower in ('0', 'no', 'false'):
 132             return False
 133         else:
 134             raise ValueError("Unrecognized boolean %s" % (literal,))
 135     elif value_type == 'integer':
 136         return int(literal)
 137     elif value_type == 'decimal' and literal.find('.') == -1:
 138         return int(literal)
 139     elif value_type in ('decimal', 'float', 'double'):
 140         return float(literal)
 141     elif value_type in ('string'):
 142         return literal
 143     elif value_type in ('dateTime'):
 144         try:
 145             return datetime.strptime(literal, ISOFORMAT_MS)
 146         except ValueError, _:
 147             return datetime.strptime(literal, ISOFORMAT_SHORT)
 148     return literal
 149
 150
 151 def get_node_type(node):
 152     """Return just the base name of a XSD datatype:
 153     e.g. http://www.w3.org/2001/XMLSchema#integer -> integer
 154     """
 155     # chop off xml schema declaration
 156     value_type = node.literal_value['datatype']
 157     if value_type is None:
 158         return "string"
 159     else:
 160         value_type = str(value_type)
 161         return value_type.replace(str(xsdNS[''].uri), '')
 162
 163
 164 def simplify_rdf(value):
 165     """Return a short name for a RDF object
 166     e.g. The last part of a URI or an untyped string.
 167     """
 168     if isinstance(value, RDF.Node):
 169         if value.is_resource():
 170             name = simplify_uri(str(value.uri))
 171         elif value.is_blank():
 172             name = '<BLANK>'
 173         else:
 174             name = value.literal_value['string']
 175     elif isinstance(value, RDF.Uri):
 176         name = split_uri(str(value))
 177     else:
 178         name = value
 179     return str(name)
 180
 181
 182 def simplify_uri(uri):
 183     """Split off the end of a uri
 184
 185     >>> simplify_uri('http://asdf.org/foo/bar')
 186     'bar'
 187     >>> simplify_uri('http://asdf.org/foo/bar#bleem')
 188     'bleem'
 189     >>> simplify_uri('http://asdf.org/foo/bar/')
 190     'bar'
 191     >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
 192     'was=foo'
 193     """
 194     if isinstance(uri, RDF.Node):
 195         if uri.is_resource():
 196             uri = uri.uri
 197         else:
 198             raise ValueError("Can't simplify an RDF literal")
 199     if isinstance(uri, RDF.Uri):
 200         uri = str(uri)
 201
 202     parsed = urlparse(uri)
 203     if len(parsed.query) > 0:
 204         return parsed.query
 205     elif len(parsed.fragment) > 0:
 206         return parsed.fragment
 207     elif len(parsed.path) > 0:
 208         for element in reversed(parsed.path.split('/')):
 209             if len(element) > 0:
 210                 return element
 211     raise ValueError("Unable to simplify %s" % (uri,))
 212
 213 def stripNamespace(namespace, term):
 214     """Remove the namespace portion of a term
 215
 216     returns None if they aren't in common
 217     """
 218     if isinstance(term, RDF.Node):
 219         if  term.is_resource():
 220             term = term.uri
 221         else:
 222             raise ValueError("This works on resources")
 223     elif not isinstance(term, RDF.Uri):
 224         raise ValueError("This works on resources")
 225     term_s = str(term)
 226     if not term_s.startswith(namespace._prefix):
 227         return None
 228     return term_s.replace(namespace._prefix, "")
 229
 230
 231 def get_model(model_name=None, directory=None):
 232     if directory is None:
 233         directory = os.getcwd()
 234
 235     if model_name is None:
 236         storage = RDF.MemoryStorage(options_string="contexts='yes'")
 237         logger.info("Using RDF Memory model")
 238     else:
 239         options = "contexts='yes',hash-type='bdb',dir='{0}'".format(directory)
 240         storage = RDF.HashStorage(model_name,
 241                       options=options)
 242         logger.info("Using {0} with options {1}".format(model_name, options))
 243     model = RDF.Model(storage)
 244     return model
 245
 246
 247 def load_into_model(model, parser_name, path, ns=None):
 248     if type(ns) in types.StringTypes:
 249         ns = RDF.Uri(ns)
 250
 251     if isinstance(path, RDF.Node):
 252         if path.is_resource():
 253             path = str(path.uri)
 254         else:
 255             raise ValueError("url to load can't be a RDF literal")
 256
 257     url_parts = list(urlparse(path))
 258     if len(url_parts[0]) == 0 or url_parts[0] == 'file':
 259         url_parts[0] = 'file'
 260         url_parts[2] = os.path.abspath(url_parts[2])
 261     if parser_name is None or parser_name == 'guess':
 262         parser_name = guess_parser_by_extension(path)
 263     url = urlunparse(url_parts)
 264     logger.info("Opening {0} with parser {1}".format(url, parser_name))
 265
 266     rdf_parser = RDF.Parser(name=parser_name)
 267
 268     statements = []
 269     retries = 3
 270     while retries > 0:
 271         try:
 272             retries -= 1
 273             statements = rdf_parser.parse_as_stream(url, ns)
 274             retries = 0
 275         except RDF.RedlandError, e:
 276             errmsg = "RDF.RedlandError: {0} {1} tries remaining"
 277             logger.error(errmsg.format(str(e), retries))
 278
 279     for s in statements:
 280         conditionally_add_statement(model, s, ns)
 281
 282 def load_string_into_model(model, parser_name, data, ns=None):
 283     ns = fixup_namespace(ns)
 284     logger.debug("load_string_into_model parser={0}, len={1}".format(
 285         parser_name, len(data)))
 286     rdf_parser = RDF.Parser(name=parser_name)
 287
 288     for s in rdf_parser.parse_string_as_stream(data, ns):
 289         conditionally_add_statement(model, s, ns)
 290
 291
 292 def fixup_namespace(ns):
 293     if ns is None:
 294         ns = RDF.Uri("http://localhost/")
 295     elif type(ns) in types.StringTypes:
 296         ns = RDF.Uri(ns)
 297     elif not(isinstance(ns, RDF.Uri)):
 298         errmsg = "Namespace should be string or uri not {0}"
 299         raise ValueError(errmsg.format(str(type(ns))))
 300     return ns
 301
 302
 303 def conditionally_add_statement(model, s, ns):
 304     imports = owlNS['imports']
 305     if s.predicate == imports:
 306         obj = str(s.object)
 307         logger.info("Importing %s" % (obj,))
 308         load_into_model(model, None, obj, ns)
 309     if s.object.is_literal():
 310         value_type = get_node_type(s.object)
 311         if value_type == 'string':
 312             s.object = sanitize_literal(s.object)
 313     model.add_statement(s)
 314
 315
 316 def add_default_schemas(model, schema_path=None):
 317     """Add default schemas to a model
 318     Looks for turtle files in either htsworkflow/util/schemas
 319     or in the list of directories provided in schema_path
 320     """
 321
 322     if schema_path is None:
 323         path, _ = os.path.split(__file__)
 324         schema_path = [os.path.join(path, 'schemas')]
 325     elif type(schema_path) in types.StringTypes:
 326         schema_path = [schema_path]
 327
 328     for p in schema_path:
 329         for f in glob(os.path.join(p, '*.turtle')):
 330             add_schema(model, f)
 331
 332 def add_schema(model, filename):
 333     """Add a schema to a model.
 334
 335     Main difference from 'load_into_model' is it tags it with
 336     a RDFlib context so I can remove them later.
 337     """
 338     parser = RDF.Parser(name='turtle')
 339     context = RDF.Node(RDF.Uri(SCHEMAS_URL))
 340     url = 'file://' + filename
 341     for s in parser.parse_as_stream(url):
 342         try:
 343             model.append(s, context)
 344         except RDF.RedlandError as e:
 345             logger.error("%s with %s", str(e), str(s))
 346
 347
 348 def remove_schemas(model):
 349     """Remove statements labeled with our schema context"""
 350     context = RDF.Node(RDF.Uri(SCHEMAS_URL))
 351     model.context_remove_statements(context)
 352
 353
 354 def sanitize_literal(node):
 355     """Clean up a literal string
 356     """
 357     if not isinstance(node, RDF.Node):
 358         raise ValueError("sanitize_literal only works on RDF.Nodes")
 359
 360     s = node.literal_value['string']
 361     if len(s) > 0:
 362         element = lxml.html.fromstring(s)
 363         cleaner = lxml.html.clean.Cleaner(page_structure=False)
 364         element = cleaner.clean_html(element)
 365         text = lxml.html.tostring(element)
 366         p_len = 3
 367         slash_p_len = 4
 368
 369         args = {'literal': text[p_len:-slash_p_len]}
 370     else:
 371         args = {'literal': ''}
 372     datatype = node.literal_value['datatype']
 373     if datatype is not None:
 374         args['datatype'] = datatype
 375     language = node.literal_value['language']
 376     if language is not None:
 377         args['language'] = language
 378     return RDF.Node(**args)
 379
 380
 381 def guess_parser(content_type, pathname):
 382     if content_type in ('application/rdf+xml',):
 383         return 'rdfxml'
 384     elif content_type in ('application/x-turtle',):
 385         return 'turtle'
 386     elif content_type in ('text/html',):
 387         return 'rdfa'
 388     elif content_type is None or content_type in ('text/plain',):
 389         return guess_parser_by_extension(pathname)
 390
 391 def guess_parser_by_extension(pathname):
 392     _, ext = os.path.splitext(pathname)
 393     if ext in ('.xml', '.rdf'):
 394         return 'rdfxml'
 395     elif ext in ('.html',):
 396         return 'rdfa'
 397     elif ext in ('.turtle',):
 398         return 'turtle'
 399     return 'guess'
 400
 401 def get_serializer(name='turtle'):
 402     """Return a serializer with our standard prefixes loaded
 403     """
 404     writer = RDF.Serializer(name=name)
 405     # really standard stuff
 406     writer.set_namespace('rdf', rdfNS._prefix)
 407     writer.set_namespace('rdfs', rdfsNS._prefix)
 408     writer.set_namespace('owl', owlNS._prefix)
 409     writer.set_namespace('dc', dcNS._prefix)
 410     writer.set_namespace('xml', xmlNS._prefix)
 411     writer.set_namespace('xsd', xsdNS._prefix)
 412     writer.set_namespace('vs', vsNS._prefix)
 413     writer.set_namespace('wot', wotNS._prefix)
 414
 415     # should these be here, kind of specific to an application
 416     writer.set_namespace('libraryOntology', libraryOntology._prefix)
 417     writer.set_namespace('ucscSubmission', submissionOntology._prefix)
 418     writer.set_namespace('ucscDaf', dafTermOntology._prefix)
 419     return writer
 420
 421
 422 def dump_model(model):
 423     serializer = get_serializer()
 424     print serializer.serialize_model_to_string(model)