htsworkflow/util/rdfhelp.py

   1 """Helper features for working with librdf
   2 """
   3 import collections
   4 from datetime import datetime
   5 from urlparse import urlparse, urlunparse
   6 from urllib2 import urlopen
   7 import logging
   8 import os
   9 import types
  10
  11 import lxml.html
  12 import lxml.html.clean
  13 import RDF
  14
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# standard ontology namespaces
# Each RDF.NS object is indexed with a term name elsewhere in this module
# (e.g. xsdNS['boolean']) to obtain the node for that term.
owlNS = RDF.NS('http://www.w3.org/2002/07/owl#')
dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
rdfsNS = RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")

# internal ontologies
submissionOntology = RDF.NS(
    "http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#")
dafTermOntology = RDF.NS("http://jumpgate.caltech.edu/wiki/UcscDaf#")
libraryOntology = RDF.NS("http://jumpgate.caltech.edu/wiki/LibraryOntology#")
inventoryOntology = RDF.NS(
    "http://jumpgate.caltech.edu/wiki/InventoryOntology#")
submissionLog = RDF.NS("http://jumpgate.caltech.edu/wiki/SubmissionsLog/")
geoSoftNS = RDF.NS('http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#')

# strftime/strptime patterns used when converting datetimes to and from
# literals: with and without a fractional-seconds component.
ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
  36
  37
  38 def sparql_query(model, query_filename, output_format='text'):
  39     """Execute sparql query from file
  40     """
  41     logger.info("Opening: %s" % (query_filename,))
  42     query_body = open(query_filename, 'r').read()
  43     query = RDF.SPARQLQuery(query_body)
  44     results = query.execute(model)
  45     if output_format == 'html':
  46         html_query_results(results)
  47     else:
  48         display_query_results(results)
  49
  50
  51 def display_query_results(results):
  52     """A very simple display of sparql query results showing name value pairs
  53     """
  54     for row in results:
  55         for k, v in row.items()[::-1]:
  56             print "{0}: {1}".format(k, v)
  57         print
  58
  59 def html_query_results(result_stream):
  60     from django.conf import settings
  61     from django.template import Context, loader
  62
  63     # I did this because I couldn't figure out how to
  64     # get simplify_rdf into the django template as a filter
  65     class Simplified(object):
  66         def __init__(self, value):
  67             self.simple = simplify_rdf(value)
  68             if value.is_resource():
  69                 self.url = value
  70             else:
  71                 self.url = None
  72
  73     template = loader.get_template('rdf_report.html')
  74     results = []
  75     for row in result_stream:
  76         new_row = collections.OrderedDict()
  77         row_urls = []
  78         for k,v in row.items():
  79             new_row[k] = Simplified(v)
  80         results.append(new_row)
  81     context = Context({'results': results,})
  82     print template.render(context)
  83
  84 def blankOrUri(value=None):
  85     """Return a blank node for None or a resource node for strings.
  86     """
  87     node = None
  88     if value is None:
  89         node = RDF.Node()
  90     elif type(value) in types.StringTypes:
  91         node = RDF.Node(uri_string=value)
  92     elif isinstance(value, RDF.Node):
  93         node = value
  94
  95     return node
  96
  97
  98 def toTypedNode(value):
  99     """Convert a python variable to a RDF Node with its closest xsd type
 100     """
 101     if type(value) == types.BooleanType:
 102         value_type = xsdNS['boolean'].uri
 103         if value:
 104             value = u'1'
 105         else:
 106             value = u'0'
 107     elif type(value) in (types.IntType, types.LongType):
 108         value_type = xsdNS['decimal'].uri
 109         value = unicode(value)
 110     elif type(value) == types.FloatType:
 111         value_type = xsdNS['float'].uri
 112         value = unicode(value)
 113     elif isinstance(value, datetime):
 114         value_type = xsdNS['dateTime'].uri
 115         if value.microsecond == 0:
 116             value = value.strftime(ISOFORMAT_SHORT)
 117         else:
 118             value = value.strftime(ISOFORMAT_MS)
 119     else:
 120         value_type = None
 121         value = unicode(value)
 122
 123     if value_type is not None:
 124         node = RDF.Node(literal=value, datatype=value_type)
 125     else:
 126         node = RDF.Node(literal=unicode(value).encode('utf-8'))
 127     return node
 128
 129
 130 def fromTypedNode(node):
 131     """Convert a typed RDF Node to its closest python equivalent
 132     """
 133     if node is None:
 134         return None
 135
 136     value_type = get_node_type(node)
 137     literal = node.literal_value['string']
 138     literal_lower = literal.lower()
 139
 140     if value_type == 'boolean':
 141         if literal_lower in ('1', 'yes', 'true'):
 142             return True
 143         elif literal_lower in ('0', 'no', 'false'):
 144             return False
 145         else:
 146             raise ValueError("Unrecognized boolean %s" % (literal,))
 147     elif value_type == 'integer':
 148         return int(literal)
 149     elif value_type == 'decimal' and literal.find('.') == -1:
 150         return int(literal)
 151     elif value_type in ('decimal', 'float', 'double'):
 152         return float(literal)
 153     elif value_type in ('string'):
 154         return literal
 155     elif value_type in ('dateTime'):
 156         try:
 157             return datetime.strptime(literal, ISOFORMAT_MS)
 158         except ValueError, _:
 159             return datetime.strptime(literal, ISOFORMAT_SHORT)
 160     return literal
 161
 162
 163 def get_node_type(node):
 164     """Return just the base name of a XSD datatype:
 165     e.g. http://www.w3.org/2001/XMLSchema#integer -> integer
 166     """
 167     # chop off xml schema declaration
 168     value_type = node.literal_value['datatype']
 169     if value_type is None:
 170         return "string"
 171     else:
 172         value_type = str(value_type)
 173         return value_type.replace(str(xsdNS[''].uri), '')
 174
 175
 176 def simplify_rdf(value):
 177     """Return a short name for a RDF object
 178     e.g. The last part of a URI or an untyped string.
 179     """
 180     if isinstance(value, RDF.Node):
 181         if value.is_resource():
 182             name = simplify_uri(str(value.uri))
 183         elif value.is_blank():
 184             name = '<BLANK>'
 185         else:
 186             name = value.literal_value['string']
 187     elif isinstance(value, RDF.Uri):
 188         name = split_uri(str(value))
 189     else:
 190         name = value
 191     return str(name)
 192
 193
 194 def simplify_uri(uri):
 195     """Split off the end of a uri
 196
 197     >>> simplify_uri('http://asdf.org/foo/bar')
 198     'bar'
 199     >>> simplify_uri('http://asdf.org/foo/bar#bleem')
 200     'bleem'
 201     >>> simplify_uri('http://asdf.org/foo/bar/')
 202     'bar'
 203     >>> simplify_uri('http://asdf.org/foo/bar?was=foo')
 204     'was=foo'
 205     """
 206     if isinstance(uri, RDF.Node):
 207         if uri.is_resource():
 208             uri = uri.uri
 209         else:
 210             raise ValueError("Can't simplify an RDF literal")
 211     if isinstance(uri, RDF.Uri):
 212         uri = str(uri)
 213
 214     parsed = urlparse(uri)
 215     if len(parsed.query) > 0:
 216         return parsed.query
 217     elif len(parsed.fragment) > 0:
 218         return parsed.fragment
 219     elif len(parsed.path) > 0:
 220         for element in reversed(parsed.path.split('/')):
 221             if len(element) > 0:
 222                 return element
 223     raise ValueError("Unable to simplify %s" % (uri,))
 224
 225 def stripNamespace(namespace, term):
 226     """Remove the namespace portion of a term
 227
 228     returns None if they aren't in common
 229     """
 230     if isinstance(term, RDF.Node):
 231         if  term.is_resource():
 232             term = term.uri
 233         else:
 234             raise ValueError("This works on resources")
 235     elif not isinstance(term, RDF.Uri):
 236         raise ValueError("This works on resources")
 237     term_s = str(term)
 238     if not term_s.startswith(namespace._prefix):
 239         return None
 240     return term_s.replace(namespace._prefix, "")
 241
 242
 243 def get_model(model_name=None, directory=None):
 244     if directory is None:
 245         directory = os.getcwd()
 246
 247     if model_name is None:
 248         storage = RDF.MemoryStorage()
 249         logger.info("Using RDF Memory model")
 250     else:
 251         options = "hash-type='bdb',dir='{0}'".format(directory)
 252         storage = RDF.HashStorage(model_name,
 253                       options=options)
 254         logger.info("Using {0} with options {1}".format(model_name, options))
 255     model = RDF.Model(storage)
 256     return model
 257
 258
 259 def load_into_model(model, parser_name, path, ns=None):
 260     if isinstance(path, RDF.Node):
 261         if path.is_resource():
 262             path = str(path.uri)
 263         else:
 264             raise ValueError("url to load can't be a RDF literal")
 265
 266     url_parts = list(urlparse(path))
 267     if len(url_parts[0]) == 0:
 268         url_parts[0] = 'file'
 269         url_parts[2] = os.path.abspath(url_parts[2])
 270     url = urlunparse(url_parts)
 271     logger.info("Opening %s" % (url,))
 272     req = urlopen(url)
 273     logger.debug("request status: %s" % (req.code,))
 274     if parser_name is None:
 275         content_type = req.headers.get('Content-Type', None)
 276         parser_name = guess_parser(content_type, path)
 277         logger.debug("Guessed parser: %s" % (parser_name,))
 278     data = req.read()
 279     load_string_into_model(model, parser_name, data, ns)
 280
 281
 282 def load_string_into_model(model, parser_name, data, ns=None):
 283     if ns is None:
 284         ns = RDF.Uri("http://localhost/")
 285     imports = owlNS['imports']
 286     rdf_parser = RDF.Parser(name=parser_name)
 287     for s in rdf_parser.parse_string_as_stream(data, ns):
 288         if s.predicate == imports:
 289             obj = str(s.object)
 290             logger.info("Importing %s" % (obj,))
 291             load_into_model(model, None, obj, ns)
 292         if s.object.is_literal():
 293             value_type = get_node_type(s.object)
 294             if value_type == 'string':
 295                 s.object = sanitize_literal(s.object)
 296         model.add_statement(s)
 297
 298
 299 def sanitize_literal(node):
 300     """Clean up a literal string
 301     """
 302     if not isinstance(node, RDF.Node):
 303         raise ValueError("sanitize_literal only works on RDF.Nodes")
 304
 305     s = node.literal_value['string']
 306     if len(s) > 0:
 307         element = lxml.html.fromstring(s)
 308         cleaner = lxml.html.clean.Cleaner(page_structure=False)
 309         element = cleaner.clean_html(element)
 310         text = lxml.html.tostring(element)
 311         p_len = 3
 312         slash_p_len = 4
 313
 314         args = {'literal': text[p_len:-slash_p_len]}
 315     else:
 316         args = {'literal': ''}
 317     datatype = node.literal_value['datatype']
 318     if datatype is not None:
 319         args['datatype'] = datatype
 320     language = node.literal_value['language']
 321     if language is not None:
 322         args['language'] = language
 323     return RDF.Node(**args)
 324
 325
 326 def guess_parser(content_type, pathname):
 327     if content_type in ('application/rdf+xml'):
 328         return 'rdfxml'
 329     elif content_type in ('application/x-turtle'):
 330         return 'turtle'
 331     elif content_type in ('text/html'):
 332         return 'rdfa'
 333     elif content_type is None:
 334         _, ext = os.path.splitext(pathname)
 335         if ext in ('xml', 'rdf'):
 336             return 'rdfxml'
 337         elif ext in ('html'):
 338             return 'rdfa'
 339         elif ext in ('turtle'):
 340             return 'turtle'
 341     return 'guess'
 342
 343 def get_serializer(name='turtle'):
 344     """Return a serializer with our standard prefixes loaded
 345     """
 346     writer = RDF.Serializer(name=name)
 347     # really standard stuff
 348     writer.set_namespace('owl', owlNS._prefix)
 349     writer.set_namespace('rdf', rdfNS._prefix)
 350     writer.set_namespace('rdfs', rdfsNS._prefix)
 351     writer.set_namespace('xsd', xsdNS._prefix)
 352
 353     # should these be here, kind of specific to an application
 354     writer.set_namespace('libraryOntology', libraryOntology._prefix)
 355     writer.set_namespace('ucscSubmission', submissionOntology._prefix)
 356     writer.set_namespace('ucscDaf', dafTermOntology._prefix)
 357     return writer
 358
 359
 360 def dump_model(model):
 361     serializer = get_serializer()
 362     print serializer.serialize_model_to_string(model)