htsworkflow/util/rdfhelp.py

   1 """Helper features for working with librdf
   2 """
   3 from datetime import datetime
   4 from urlparse import urlparse, urlunparse
   5 from urllib2 import urlopen
   6 import logging
   7 import os
   8 import types
   9
  10 import lxml.html
  11 import lxml.html.clean
  12 import RDF
  13
# Module-level logger used by the helpers below.
logger = logging.getLogger(__name__)

# standard ontology namespaces
# Each RDF.NS wraps a URI prefix; indexing it with a term name
# (e.g. xsdNS['boolean'], owlNS['imports']) yields the node for that term.
owlNS = RDF.NS('http://www.w3.org/2002/07/owl#')
dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
rdfsNS = RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")

# internal ontologies
submissionOntology = RDF.NS(
    "http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#")
dafTermOntology = RDF.NS("http://jumpgate.caltech.edu/wiki/UcscDaf#")
libraryOntology = RDF.NS("http://jumpgate.caltech.edu/wiki/LibraryOntology#")
inventoryOntology = RDF.NS(
    "http://jumpgate.caltech.edu/wiki/InventoryOntology#")
submissionLog = RDF.NS("http://jumpgate.caltech.edu/wiki/SubmissionsLog/")
geoSoftNS = RDF.NS('http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#')

# strftime/strptime patterns used for xsd:dateTime literals:
# ISOFORMAT_MS includes fractional seconds (%f), ISOFORMAT_SHORT does not.
ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
  35
  36
  37 def sparql_query(model, query_filename):
  38     """Execute sparql query from file
  39     """
  40     logger.info("Opening: %s" % (query_filename,))
  41     query_body = open(query_filename, 'r').read()
  42     query = RDF.SPARQLQuery(query_body)
  43     results = query.execute(model)
  44     display_query_results(results)
  45
  46
  47 def display_query_results(results):
  48     """A very simple display of sparql query results showing name value pairs
  49     """
  50     for row in results:
  51         for k, v in row.items()[::-1]:
  52             print "{0}: {1}".format(k, v)
  53         print
  54
  55
  56 def blankOrUri(value=None):
  57     """Return a blank node for None or a resource node for strings.
  58     """
  59     node = None
  60     if value is None:
  61         node = RDF.Node()
  62     elif type(value) in types.StringTypes:
  63         node = RDF.Node(uri_string=value)
  64     elif isinstance(value, RDF.Node):
  65         node = value
  66
  67     return node
  68
  69
  70 def toTypedNode(value):
  71     """Convert a python variable to a RDF Node with its closest xsd type
  72     """
  73     if type(value) == types.BooleanType:
  74         value_type = xsdNS['boolean'].uri
  75         if value:
  76             value = u'1'
  77         else:
  78             value = u'0'
  79     elif type(value) in (types.IntType, types.LongType):
  80         value_type = xsdNS['decimal'].uri
  81         value = unicode(value)
  82     elif type(value) == types.FloatType:
  83         value_type = xsdNS['float'].uri
  84         value = unicode(value)
  85     elif isinstance(value, datetime):
  86         value_type = xsdNS['dateTime'].uri
  87         if value.microsecond == 0:
  88             value = value.strftime(ISOFORMAT_SHORT)
  89         else:
  90             value = value.strftime(ISOFORMAT_MS)
  91     else:
  92         value_type = None
  93         value = unicode(value)
  94
  95     if value_type is not None:
  96         node = RDF.Node(literal=value, datatype=value_type)
  97     else:
  98         node = RDF.Node(literal=unicode(value).encode('utf-8'))
  99     return node
 100
 101
 102 def fromTypedNode(node):
 103     """Convert a typed RDF Node to its closest python equivalent
 104     """
 105     if node is None:
 106         return None
 107
 108     value_type = get_node_type(node)
 109     literal = node.literal_value['string']
 110     literal_lower = literal.lower()
 111
 112     if value_type == 'boolean':
 113         if literal_lower in ('1', 'yes', 'true'):
 114             return True
 115         elif literal_lower in ('0', 'no', 'false'):
 116             return False
 117         else:
 118             raise ValueError("Unrecognized boolean %s" % (literal,))
 119     elif value_type == 'integer':
 120         return int(literal)
 121     elif value_type == 'decimal' and literal.find('.') == -1:
 122         return int(literal)
 123     elif value_type in ('decimal', 'float', 'double'):
 124         return float(literal)
 125     elif value_type in ('string'):
 126         return literal
 127     elif value_type in ('dateTime'):
 128         try:
 129             return datetime.strptime(literal, ISOFORMAT_MS)
 130         except ValueError, _:
 131             return datetime.strptime(literal, ISOFORMAT_SHORT)
 132     return literal
 133
 134
 135 def get_node_type(node):
 136     """Return just the base name of a XSD datatype:
 137     e.g. http://www.w3.org/2001/XMLSchema#integer -> integer
 138     """
 139     # chop off xml schema declaration
 140     value_type = node.literal_value['datatype']
 141     if value_type is None:
 142         return "string"
 143     else:
 144         value_type = str(value_type)
 145         return value_type.replace(str(xsdNS[''].uri), '')
 146
 147
 148 def simplifyUri(namespace, term):
 149     """Remove the namespace portion of a term
 150
 151     returns None if they aren't in common
 152     """
 153     if isinstance(term, RDF.Node):
 154         if  term.is_resource():
 155             term = term.uri
 156         else:
 157             raise ValueError("This works on resources")
 158     elif not isinstance(term, RDF.Uri):
 159         raise ValueError("This works on resources")
 160     term_s = str(term)
 161     if not term_s.startswith(namespace._prefix):
 162         return None
 163     return term_s.replace(namespace._prefix, "")
 164
 165
 166 def get_model(model_name=None, directory=None):
 167     if directory is None:
 168         directory = os.getcwd()
 169
 170     if model_name is None:
 171         storage = RDF.MemoryStorage()
 172         logger.info("Using RDF Memory model")
 173     else:
 174         options = "hash-type='bdb',dir='{0}'".format(directory)
 175         storage = RDF.HashStorage(model_name,
 176                       options=options)
 177         logger.info("Using {0} with options {1}".format(model_name, options))
 178     model = RDF.Model(storage)
 179     return model
 180
 181
 182 def load_into_model(model, parser_name, path, ns=None):
 183     url_parts = list(urlparse(path))
 184     if len(url_parts[0]) == 0:
 185         url_parts[0] = 'file'
 186         url_parts[2] = os.path.abspath(url_parts[2])
 187     url = urlunparse(url_parts)
 188     logger.info("Opening %s" % (url,))
 189     req = urlopen(url)
 190     logger.debug("request status: %s" % (req.code,))
 191     if parser_name is None:
 192         content_type = req.headers.get('Content-Type', None)
 193         parser_name = guess_parser(content_type, path)
 194         logger.debug("Guessed parser: %s" % (parser_name,))
 195     data = req.read()
 196     load_string_into_model(model, parser_name, data, ns)
 197
 198
 199 def load_string_into_model(model, parser_name, data, ns=None):
 200     if ns is None:
 201         ns = RDF.NS("http://localhost/")
 202     imports = owlNS['imports']
 203     rdf_parser = RDF.Parser(name=parser_name)
 204     for s in rdf_parser.parse_string_as_stream(data, ns):
 205         if s.predicate == imports:
 206             obj = str(s.object)
 207             logger.info("Importing %s" % (obj,))
 208             load_into_model(model, None, obj, ns)
 209         if s.object.is_literal():
 210             value_type = get_node_type(s.object)
 211             if value_type == 'string':
 212                 s.object = sanitize_literal(s.object)
 213         model.add_statement(s)
 214
 215
 216 def sanitize_literal(node):
 217     """Clean up a literal string
 218     """
 219     if not isinstance(node, RDF.Node):
 220         raise ValueError("sanitize_literal only works on RDF.Nodes")
 221
 222     element = lxml.html.fromstring(node.literal_value['string'])
 223     cleaner = lxml.html.clean.Cleaner(page_structure=False)
 224     element = cleaner.clean_html(element)
 225     text = lxml.html.tostring(element)
 226     p_len = 3
 227     slash_p_len = 4
 228
 229     args = {'literal': text[p_len:-slash_p_len]}
 230     datatype = node.literal_value['datatype']
 231     if datatype is not None:
 232         args['datatype'] = datatype
 233     language = node.literal_value['language']
 234     if language is not None:
 235         args['language'] = language
 236     return RDF.Node(**args)
 237
 238
 239 def guess_parser(content_type, pathname):
 240     if content_type in ('application/rdf+xml'):
 241         return 'rdfxml'
 242     elif content_type in ('application/x-turtle'):
 243         return 'turtle'
 244     elif content_type in ('text/html'):
 245         return 'rdfa'
 246     elif content_type is None:
 247         _, ext = os.path.splitext(pathname)
 248         if ext in ('xml', 'rdf'):
 249             return 'rdfxml'
 250         elif ext in ('html'):
 251             return 'rdfa'
 252         elif ext in ('turtle'):
 253             return 'turtle'
 254     return 'guess'
 255
 256
 257 def get_serializer(name='turtle'):
 258     """Return a serializer with our standard prefixes loaded
 259     """
 260     writer = RDF.Serializer(name=name)
 261     # really standard stuff
 262     writer.set_namespace('owl', owlNS._prefix)
 263     writer.set_namespace('rdf', rdfNS._prefix)
 264     writer.set_namespace('rdfs', rdfsNS._prefix)
 265     writer.set_namespace('xsd', xsdNS._prefix)
 266
 267     # should these be here, kind of specific to an application
 268     writer.set_namespace('libraryOntology', libraryOntology._prefix)
 269     writer.set_namespace('ucscSubmission', submissionOntology._prefix)
 270     writer.set_namespace('ucscDaf', dafTermOntology._prefix)
 271     return writer
 272
 273
 274 def dump_model(model):
 275     serializer = get_serializer()
 276     print serializer.serialize_model_to_string(model)