htsworkflow/util/rdfhelp.py

   1 """Helper features for working with librdf
   2 """
   3 import collections
   4 from datetime import datetime
   5 from urlparse import urlparse, urlunparse
   6 from urllib2 import urlopen
   7 import logging
   8 import os
   9 import types
  10
  11 import lxml.html
  12 import lxml.html.clean
  13 import RDF
  14
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# standard ontology namespaces
# Each RDF.NS object is indexed with a term name elsewhere in this module
# (e.g. xsdNS['boolean'], owlNS['imports']) to get the node for that term.
owlNS = RDF.NS('http://www.w3.org/2002/07/owl#')
dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
rdfsNS = RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")

# internal ontologies
submissionOntology = RDF.NS(
    "http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#")
dafTermOntology = RDF.NS("http://jumpgate.caltech.edu/wiki/UcscDaf#")
libraryOntology = RDF.NS("http://jumpgate.caltech.edu/wiki/LibraryOntology#")
inventoryOntology = RDF.NS(
    "http://jumpgate.caltech.edu/wiki/InventoryOntology#")
submissionLog = RDF.NS("http://jumpgate.caltech.edu/wiki/SubmissionsLog/")
geoSoftNS = RDF.NS('http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#')

# strftime/strptime patterns used for xsd:dateTime literals:
# ISOFORMAT_MS includes fractional seconds, ISOFORMAT_SHORT does not.
ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
  36
  37
  38 def sparql_query(model, query_filename, output_format='text'):
  39     """Execute sparql query from file
  40     """
  41     logger.info("Opening: %s" % (query_filename,))
  42     query_body = open(query_filename, 'r').read()
  43     query = RDF.SPARQLQuery(query_body)
  44     results = query.execute(model)
  45     if output_format == 'html':
  46         html_query_results(results)
  47     else:
  48         display_query_results(results)
  49
  50
  51 def display_query_results(results):
  52     """A very simple display of sparql query results showing name value pairs
  53     """
  54     for row in results:
  55         for k, v in row.items()[::-1]:
  56             print "{0}: {1}".format(k, v)
  57         print
  58
  59 def html_query_results(result_stream):
  60     from django.conf import settings
  61     from django.template import Context, loader
  62
  63     # I did this because I couldn't figure out how to
  64     # get simplify_rdf into the django template as a filter
  65     class Simplified(object):
  66         def __init__(self, value):
  67             self.simple = simplify_rdf(value)
  68             if value.is_resource():
  69                 self.url = value
  70             else:
  71                 self.url = None
  72
  73     template = loader.get_template('rdf_report.html')
  74     results = []
  75     for row in result_stream:
  76         new_row = collections.OrderedDict()
  77         row_urls = []
  78         for k,v in row.items():
  79             new_row[k] = Simplified(v)
  80         results.append(new_row)
  81     context = Context({'results': results,})
  82     print template.render(context)
  83
  84 def blankOrUri(value=None):
  85     """Return a blank node for None or a resource node for strings.
  86     """
  87     node = None
  88     if value is None:
  89         node = RDF.Node()
  90     elif type(value) in types.StringTypes:
  91         node = RDF.Node(uri_string=value)
  92     elif isinstance(value, RDF.Node):
  93         node = value
  94
  95     return node
  96
  97
  98 def toTypedNode(value):
  99     """Convert a python variable to a RDF Node with its closest xsd type
 100     """
 101     if type(value) == types.BooleanType:
 102         value_type = xsdNS['boolean'].uri
 103         if value:
 104             value = u'1'
 105         else:
 106             value = u'0'
 107     elif type(value) in (types.IntType, types.LongType):
 108         value_type = xsdNS['decimal'].uri
 109         value = unicode(value)
 110     elif type(value) == types.FloatType:
 111         value_type = xsdNS['float'].uri
 112         value = unicode(value)
 113     elif isinstance(value, datetime):
 114         value_type = xsdNS['dateTime'].uri
 115         if value.microsecond == 0:
 116             value = value.strftime(ISOFORMAT_SHORT)
 117         else:
 118             value = value.strftime(ISOFORMAT_MS)
 119     else:
 120         value_type = None
 121         value = unicode(value)
 122
 123     if value_type is not None:
 124         node = RDF.Node(literal=value, datatype=value_type)
 125     else:
 126         node = RDF.Node(literal=unicode(value).encode('utf-8'))
 127     return node
 128
 129
 130 def fromTypedNode(node):
 131     """Convert a typed RDF Node to its closest python equivalent
 132     """
 133     if node is None:
 134         return None
 135
 136     value_type = get_node_type(node)
 137     literal = node.literal_value['string']
 138     literal_lower = literal.lower()
 139
 140     if value_type == 'boolean':
 141         if literal_lower in ('1', 'yes', 'true'):
 142             return True
 143         elif literal_lower in ('0', 'no', 'false'):
 144             return False
 145         else:
 146             raise ValueError("Unrecognized boolean %s" % (literal,))
 147     elif value_type == 'integer':
 148         return int(literal)
 149     elif value_type == 'decimal' and literal.find('.') == -1:
 150         return int(literal)
 151     elif value_type in ('decimal', 'float', 'double'):
 152         return float(literal)
 153     elif value_type in ('string'):
 154         return literal
 155     elif value_type in ('dateTime'):
 156         try:
 157             return datetime.strptime(literal, ISOFORMAT_MS)
 158         except ValueError, _:
 159             return datetime.strptime(literal, ISOFORMAT_SHORT)
 160     return literal
 161
 162
 163 def get_node_type(node):
 164     """Return just the base name of a XSD datatype:
 165     e.g. http://www.w3.org/2001/XMLSchema#integer -> integer
 166     """
 167     # chop off xml schema declaration
 168     value_type = node.literal_value['datatype']
 169     if value_type is None:
 170         return "string"
 171     else:
 172         value_type = str(value_type)
 173         return value_type.replace(str(xsdNS[''].uri), '')
 174
 175
 176 def simplify_rdf(value):
 177     """Return a short name for a RDF object
 178     e.g. The last part of a URI or an untyped string.
 179     """
 180     if isinstance(value, RDF.Node):
 181         if value.is_resource():
 182             name = simplify_uri(str(value.uri))
 183         elif value.is_blank():
 184             name = '<BLANK>'
 185         else:
 186             name = value.literal_value['string']
 187     elif isinstance(value, RDF.Uri):
 188         name = split_uri(str(value))
 189     else:
 190         name = value
 191     return str(name)
 192
 193
 194 def simplify_uri(uri):
 195     """Split off the end of a uri
 196
 197     >>> simplify_uri('http://asdf.org/foo/bar')
 198     'bar'
 199     >>> simplify_uri('http://asdf.org/foo/bar#bleem')
 200     'bleem'
 201     >>> simplify_uri('http://asdf.org/foo/bar/')
 202     'bar'
 203     >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
 204     'was=foo'
 205     """
 206     if isinstance(uri, RDF.Node):
 207         if uri.is_resource():
 208             uri = uri.uri
 209         else:
 210             raise ValueError("Can't simplify an RDF literal")
 211     if isinstance(uri, RDF.Uri):
 212         uri = str(uri)
 213
 214     parsed = urlparse(uri)
 215     if len(parsed.query) > 0:
 216         return parsed.query
 217     elif len(parsed.fragment) > 0:
 218         return parsed.fragment
 219     elif len(parsed.path) > 0:
 220         for element in reversed(parsed.path.split('/')):
 221             if len(element) > 0:
 222                 return element
 223     raise ValueError("Unable to simplify %s" % (uri,))
 224
 225 def stripNamespace(namespace, term):
 226     """Remove the namespace portion of a term
 227
 228     returns None if they aren't in common
 229     """
 230     if isinstance(term, RDF.Node):
 231         if  term.is_resource():
 232             term = term.uri
 233         else:
 234             raise ValueError("This works on resources")
 235     elif not isinstance(term, RDF.Uri):
 236         raise ValueError("This works on resources")
 237     term_s = str(term)
 238     if not term_s.startswith(namespace._prefix):
 239         return None
 240     return term_s.replace(namespace._prefix, "")
 241
 242
 243 def get_model(model_name=None, directory=None):
 244     if directory is None:
 245         directory = os.getcwd()
 246
 247     if model_name is None:
 248         storage = RDF.MemoryStorage()
 249         logger.info("Using RDF Memory model")
 250     else:
 251         options = "hash-type='bdb',dir='{0}'".format(directory)
 252         storage = RDF.HashStorage(model_name,
 253                       options=options)
 254         logger.info("Using {0} with options {1}".format(model_name, options))
 255     model = RDF.Model(storage)
 256     return model
 257
 258
 259 def load_into_model(model, parser_name, path, ns=None):
 260     if type(ns) in types.StringTypes:
 261         ns = RDF.Uri(ns)
 262
 263     if isinstance(path, RDF.Node):
 264         if path.is_resource():
 265             path = str(path.uri)
 266         else:
 267             raise ValueError("url to load can't be a RDF literal")
 268
 269     url_parts = list(urlparse(path))
 270     if len(url_parts[0]) == 0 or url_parts[0] == 'file':
 271         url_parts[0] = 'file'
 272         url_parts[2] = os.path.abspath(url_parts[2])
 273         if parser_name is None or parser_name == 'guess':
 274             parser_name = guess_parser_by_extension(path)
 275     url = urlunparse(url_parts)
 276     logger.info("Opening {0} with parser {1}".format(url, parser_name))
 277
 278     rdf_parser = RDF.Parser(name=parser_name)
 279
 280     retries = 3
 281     while retries > 0:
 282         try:
 283             retries -= 1
 284             statements = rdf_parser.parse_as_stream(url, ns)
 285             retries = 0
 286         except RDF.RedlandError, e:
 287             errmsg = "RDF.RedlandError: {0} {1} tries remaining"
 288             logger.error(errmsg.format(str(e), tries))
 289
 290     for s in statements:
 291         conditionally_add_statement(model, s, ns)
 292
 293 def load_string_into_model(model, parser_name, data, ns=None):
 294     ns = fixup_namespace(ns)
 295     logger.debug("load_string_into_model parser={0}, len={1}".format(
 296         parser_name, len(data)))
 297     rdf_parser = RDF.Parser(name=parser_name)
 298
 299     for s in rdf_parser.parse_string_as_stream(data, ns):
 300         conditionally_add_statement(model, s, ns)
 301
 302 def fixup_namespace(ns):
 303     if ns is None:
 304         ns = RDF.Uri("http://localhost/")
 305     elif type(ns) in types.StringTypes:
 306         ns = RDF.Uri(ns)
 307     elif not(isinstance(ns, RDF.Uri)):
 308         errmsg = "Namespace should be string or uri not {0}"
 309         raise ValueError(errmsg.format(str(type(ns))))
 310     return ns
 311
 312 def conditionally_add_statement(model, s, ns):
 313     imports = owlNS['imports']
 314     if s.predicate == imports:
 315         obj = str(s.object)
 316         logger.info("Importing %s" % (obj,))
 317         load_into_model(model, None, obj, ns)
 318     if s.object.is_literal():
 319         value_type = get_node_type(s.object)
 320         if value_type == 'string':
 321             s.object = sanitize_literal(s.object)
 322     model.add_statement(s)
 323
 324 def sanitize_literal(node):
 325     """Clean up a literal string
 326     """
 327     if not isinstance(node, RDF.Node):
 328         raise ValueError("sanitize_literal only works on RDF.Nodes")
 329
 330     s = node.literal_value['string']
 331     if len(s) > 0:
 332         element = lxml.html.fromstring(s)
 333         cleaner = lxml.html.clean.Cleaner(page_structure=False)
 334         element = cleaner.clean_html(element)
 335         text = lxml.html.tostring(element)
 336         p_len = 3
 337         slash_p_len = 4
 338
 339         args = {'literal': text[p_len:-slash_p_len]}
 340     else:
 341         args = {'literal': ''}
 342     datatype = node.literal_value['datatype']
 343     if datatype is not None:
 344         args['datatype'] = datatype
 345     language = node.literal_value['language']
 346     if language is not None:
 347         args['language'] = language
 348     return RDF.Node(**args)
 349
 350
 351 def guess_parser(content_type, pathname):
 352     if content_type in ('application/rdf+xml',):
 353         return 'rdfxml'
 354     elif content_type in ('application/x-turtle',):
 355         return 'turtle'
 356     elif content_type in ('text/html',):
 357         return 'rdfa'
 358     elif content_type is None:
 359         return guess_parser_by_extension(pathname)
 360
 361 def guess_parser_by_extension(pathname):
 362     _, ext = os.path.splitext(pathname)
 363     if ext in ('.xml', '.rdf'):
 364         return 'rdfxml'
 365     elif ext in ('.html'):
 366         return 'rdfa'
 367     elif ext in ('.turtle'):
 368         return 'turtle'
 369     return 'guess'
 370
 371 def get_serializer(name='turtle'):
 372     """Return a serializer with our standard prefixes loaded
 373     """
 374     writer = RDF.Serializer(name=name)
 375     # really standard stuff
 376     writer.set_namespace('owl', owlNS._prefix)
 377     writer.set_namespace('rdf', rdfNS._prefix)
 378     writer.set_namespace('rdfs', rdfsNS._prefix)
 379     writer.set_namespace('xsd', xsdNS._prefix)
 380
 381     # should these be here, kind of specific to an application
 382     writer.set_namespace('libraryOntology', libraryOntology._prefix)
 383     writer.set_namespace('ucscSubmission', submissionOntology._prefix)
 384     writer.set_namespace('ucscDaf', dafTermOntology._prefix)
 385     return writer
 386
 387
 388 def dump_model(model):
 389     serializer = get_serializer()
 390     print serializer.serialize_model_to_string(model)