"""Helper features for working with librdf."""
import logging
import os
import types
from datetime import datetime
from urllib2 import urlopen
from urlparse import urlparse, urlunparse

import RDF
import lxml.html.clean
logger = logging.getLogger(__name__)

# Standard ontology namespaces.
owlNS = RDF.NS('http://www.w3.org/2002/07/owl#')
dublinCoreNS = RDF.NS("http://purl.org/dc/elements/1.1/")
rdfNS = RDF.NS("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
rdfsNS = RDF.NS("http://www.w3.org/2000/01/rdf-schema#")
xsdNS = RDF.NS("http://www.w3.org/2001/XMLSchema#")

# Application-specific namespaces (jumpgate wiki ontologies and GEO SOFT).
submissionOntology = RDF.NS(
    "http://jumpgate.caltech.edu/wiki/UcscSubmissionOntology#")
dafTermOntology = RDF.NS("http://jumpgate.caltech.edu/wiki/UcscDaf#")
libraryOntology = RDF.NS("http://jumpgate.caltech.edu/wiki/LibraryOntology#")
inventoryOntology = RDF.NS(
    "http://jumpgate.caltech.edu/wiki/InventoryOntology#")
submissionLog = RDF.NS("http://jumpgate.caltech.edu/wiki/SubmissionsLog/")
geoSoftNS = RDF.NS('http://www.ncbi.nlm.nih.gov/geo/info/soft2.html#')

# strftime/strptime patterns for ISO 8601 timestamps, with and without
# a fractional-seconds part.
ISOFORMAT_MS = "%Y-%m-%dT%H:%M:%S.%f"
ISOFORMAT_SHORT = "%Y-%m-%dT%H:%M:%S"
def sparql_query(model, query_filename):
    """Execute sparql query from file

    Reads the SPARQL text stored in query_filename, runs it against model
    and prints every result row with display_query_results.
    """
    logger.info("Opening: %s" % (query_filename,))
    # Use a context manager so the file handle is closed as soon as the
    # query text has been read instead of whenever it gets collected.
    with open(query_filename, 'r') as query_stream:
        query_body = query_stream.read()
    query = RDF.SPARQLQuery(query_body)
    results = query.execute(model)
    display_query_results(results)
def display_query_results(results):
    """A very simple display of sparql query results showing name value pairs

    results is an iterable of rows, each offering a dict-style items().
    Every binding is printed as "name: value" with the bindings of a row
    in reverse order.
    """
    for row in results:
        # list() first: items() is not sliceable on every mapping type.
        for k, v in reversed(list(row.items())):
            print("{0}: {1}".format(k, v))
        # NOTE(review): the blank separator line after each row was restored
        # from a truncated listing -- confirm it is wanted.
        print("")
def blankOrUri(value=None):
    """Return a blank node for None or a resource node for strings.

    An RDF.Node argument is handed back as is; any other kind of value
    yields None.
    """
    if value is None:
        return RDF.Node()
    if isinstance(value, types.StringTypes):
        return RDF.Node(uri_string=value)
    if isinstance(value, RDF.Node):
        return value
    return None
def toTypedNode(value):
    """Convert a python variable to a RDF Node with its closest xsd type

    Booleans, integers, floats and datetimes become typed literals; any
    other value is converted with unicode() and stored as an untyped,
    utf-8 encoded literal.
    """
    # bool has to be tested before the integer types: bool is a subclass
    # of int and would otherwise be written out as a number.
    if isinstance(value, bool):
        value_type = xsdNS['boolean'].uri
        # NOTE(review): the '1'/'0' lexical forms were restored from a
        # truncated listing; both are accepted by fromTypedNode.
        value = u'1' if value else u'0'
    elif isinstance(value, (types.IntType, types.LongType)):
        # Integers are tagged xsd:decimal; fromTypedNode turns a decimal
        # without a '.' back into an int.
        value_type = xsdNS['decimal'].uri
        value = unicode(value)
    elif isinstance(value, float):
        value_type = xsdNS['float'].uri
        value = unicode(value)
    elif isinstance(value, datetime):
        value_type = xsdNS['dateTime'].uri
        # Only emit the fractional seconds when there are any.
        if value.microsecond == 0:
            value = value.strftime(ISOFORMAT_SHORT)
        else:
            value = value.strftime(ISOFORMAT_MS)
    else:
        value_type = None
        value = unicode(value)

    if value_type is None:
        return RDF.Node(literal=value.encode('utf-8'))
    return RDF.Node(literal=value, datatype=value_type)
def fromTypedNode(node):
    """Convert a typed RDF Node to its closest python equivalent

    Returns None for a None node; a bool, int, float or datetime for the
    boolean, integer, decimal/float/double and dateTime xsd types; and the
    literal string unchanged for every other type.

    Raises ValueError for a boolean whose text is not recognised or a
    dateTime that matches neither ISO format.
    """
    if node is None:
        return None

    value_type = get_node_type(node)
    literal = node.literal_value['string']

    if value_type == 'boolean':
        literal_lower = literal.lower()
        if literal_lower in ('1', 'yes', 'true'):
            return True
        if literal_lower in ('0', 'no', 'false'):
            return False
        raise ValueError("Unrecognized boolean %s" % (literal,))
    if value_type == 'integer':
        return int(literal)
    # toTypedNode stores python integers as xsd:decimal, so a decimal
    # without a fractional part is read back as an int.
    if value_type == 'decimal' and '.' not in literal:
        return int(literal)
    if value_type in ('decimal', 'float', 'double'):
        return float(literal)
    # Compare with ==: ``value_type in ('dateTime')`` is a substring test
    # against a plain string, so e.g. 'date' would be parsed as a dateTime
    # and fail.
    if value_type == 'dateTime':
        try:
            return datetime.strptime(literal, ISOFORMAT_MS)
        except ValueError:
            # No fractional seconds present; retry with the short format.
            return datetime.strptime(literal, ISOFORMAT_SHORT)
    # 'string' and every unrecognised type: hand back the raw text.
    return literal
def get_node_type(node):
    """Return just the base name of a XSD datatype:
    e.g. http://www.w3.org/2001/XMLSchema#integer -> integer

    A literal without a datatype is reported as "string"; a datatype that
    is not in the XML Schema namespace is returned as its full URI string.
    """
    value_type = node.literal_value['datatype']
    if value_type is None:
        return "string"

    value_type = str(value_type)
    # chop off xml schema declaration -- only when it is the leading part,
    # str.replace would also remove occurrences further into the URI
    xsd_prefix = str(xsdNS[''].uri)
    if value_type.startswith(xsd_prefix):
        return value_type[len(xsd_prefix):]
    return value_type
def simplifyUri(namespace, term):
    """Remove the namespace portion of a term

    term may be a resource RDF.Node or an RDF.Uri.

    returns None if they aren't in common
    raises ValueError if term is a non-resource node or any other type
    """
    if isinstance(term, RDF.Node):
        if not term.is_resource():
            raise ValueError("This works on resources")
        term = term.uri
    elif not isinstance(term, RDF.Uri):
        raise ValueError("This works on resources")

    term_s = str(term)
    prefix = namespace._prefix
    if not term_s.startswith(prefix):
        return None
    # Slice off the leading namespace only; str.replace would also delete
    # any later occurrence of the prefix text inside the term.
    return term_s[len(prefix):]
def get_model(model_name=None, directory=None):
    """Create an RDF model on top of memory or BDB hash storage.

    model_name -- name handed to RDF.HashStorage; None selects
                  RDF.MemoryStorage instead
    directory -- value of the hash storage 'dir' option; defaults to the
                 current working directory (unused for memory storage)
    """
    if directory is None:
        directory = os.getcwd()

    if model_name is None:
        storage = RDF.MemoryStorage()
        logger.info("Using RDF Memory model")
    else:
        options = "hash-type='bdb',dir='{0}'".format(directory)
        storage = RDF.HashStorage(model_name, options=options)
        logger.info("Using {0} with options {1}".format(model_name, options))
    return RDF.Model(storage)
def load_into_model(model, parser_name, path, ns=None):
    """Fetch an RDF document and add its statements to model.

    model -- model receiving the statements
    parser_name -- parser to use; when None it is picked by guess_parser
                   from the response Content-Type header and path
    path -- URL, or a filesystem path (anything without a URL scheme)
    ns -- passed through to load_string_into_model
    """
    url_parts = list(urlparse(path))
    if not url_parts[0]:
        # No scheme: treat path as a local file and make it absolute so it
        # forms a valid file: URL.
        url_parts[0] = 'file'
        url_parts[2] = os.path.abspath(url_parts[2])
    url = urlunparse(url_parts)
    logger.info("Opening %s" % (url,))
    req = urlopen(url)
    try:
        logger.debug("request status: %s" % (req.code,))
        if parser_name is None:
            content_type = req.headers.get('Content-Type', None)
            parser_name = guess_parser(content_type, path)
            logger.debug("Guessed parser: %s" % (parser_name,))
        data = req.read()
    finally:
        # Release the connection / file handle even if reading fails.
        req.close()
    load_string_into_model(model, parser_name, data, ns)
def load_string_into_model(model, parser_name, data, ns=None):
    """Parse RDF text and add every statement to model.

    model -- model receiving the statements
    parser_name -- name handed to RDF.Parser
    data -- the serialized RDF to parse
    ns -- base namespace for the parser; defaults to http://localhost/

    Objects of owl:imports statements are loaded as well via
    load_into_model, and untyped/string literals are passed through
    sanitize_literal before being stored.
    """
    if ns is None:
        ns = RDF.NS("http://localhost/")
    imports = owlNS['imports']
    rdf_parser = RDF.Parser(name=parser_name)
    for s in rdf_parser.parse_string_as_stream(data, ns):
        if s.predicate == imports:
            # NOTE: imports are followed recursively and nothing here
            # detects documents that import each other.
            obj = str(s.object)
            logger.info("Importing %s" % (obj,))
            load_into_model(model, None, obj, ns)
        if s.object.is_literal():
            value_type = get_node_type(s.object)
            if value_type == 'string':
                s.object = sanitize_literal(s.object)
        model.add_statement(s)
def sanitize_literal(node):
    """Clean up a literal string

    Runs the literal text through lxml's HTML Cleaner and returns a new
    literal node carrying the cleaned text plus the original datatype and
    language. A literal that is empty or only whitespace is returned
    unchanged.

    Raises ValueError if node is not an RDF.Node.
    """
    if not isinstance(node, RDF.Node):
        raise ValueError("sanitize_literal only works on RDF.Nodes")

    raw_text = node.literal_value['string']
    if not raw_text.strip():
        # lxml refuses to parse an empty document and there is nothing to
        # clean in it anyway.
        return node

    element = lxml.html.fromstring(raw_text)
    cleaner = lxml.html.clean.Cleaner(page_structure=False)
    element = cleaner.clean_html(element)
    text = lxml.html.tostring(element)

    # NOTE(review): the slice presumably strips the "<p>"/"</p>" wrapper
    # lxml puts around bare text; the two lengths were restored from a
    # truncated listing. Input whose root element is not <p> would be cut
    # in the wrong place -- confirm against the callers' data.
    p_len = len("<p>")
    slash_p_len = len("</p>")

    args = {'literal': text[p_len:-slash_p_len]}
    datatype = node.literal_value['datatype']
    if datatype is not None:
        args['datatype'] = datatype
    language = node.literal_value['language']
    if language is not None:
        args['language'] = language
    return RDF.Node(**args)
def guess_parser(content_type, pathname):
    """Pick a librdf parser name from a Content-Type or a file extension.

    content_type -- value of a Content-Type header, or None when unknown;
                    parameters such as "; charset=utf-8" are ignored
    pathname -- path or URL whose extension is consulted only when
                content_type is None

    Returns 'rdfxml', 'turtle' or 'rdfa' for recognised inputs and 'guess'
    (let the library sniff the content) for everything else.
    """
    # NOTE(review): the parser names were restored from a truncated
    # listing; they are the standard Raptor parser names.
    parser_by_mime = {
        'application/rdf+xml': 'rdfxml',
        'application/x-turtle': 'turtle',
        'text/turtle': 'turtle',
        'text/html': 'rdfa',
    }
    # os.path.splitext keeps the leading dot on the extension.
    parser_by_ext = {
        '.xml': 'rdfxml',
        '.rdf': 'rdfxml',
        '.html': 'rdfa',
        '.turtle': 'turtle',
        '.ttl': 'turtle',
    }

    if content_type is not None:
        # Exact match on the bare media type; ``x in ('text/html')`` would
        # be a substring test against a plain string.
        mime_type = content_type.split(';', 1)[0].strip().lower()
        return parser_by_mime.get(mime_type, 'guess')

    _, ext = os.path.splitext(pathname)
    return parser_by_ext.get(ext.lower(), 'guess')
def get_serializer(name='turtle'):
    """Return a serializer with our standard prefixes loaded

    name -- serializer name handed to RDF.Serializer
    """
    writer = RDF.Serializer(name=name)
    # really standard stuff
    writer.set_namespace('owl', owlNS._prefix)
    writer.set_namespace('rdf', rdfNS._prefix)
    writer.set_namespace('rdfs', rdfsNS._prefix)
    writer.set_namespace('xsd', xsdNS._prefix)

    # should these be here, kind of specific to an application
    writer.set_namespace('libraryOntology', libraryOntology._prefix)
    writer.set_namespace('ucscSubmission', submissionOntology._prefix)
    writer.set_namespace('ucscDaf', dafTermOntology._prefix)
    return writer
def dump_model(model):
    """Print model to stdout using the serializer from get_serializer()."""
    serializer = get_serializer()
    # Single-argument print(...) behaves the same as a statement on
    # Python 2 and as a function call on Python 3.
    print(serializer.serialize_model_to_string(model))