sanitize_literal had trouble with empty strings.
author: Diane Trout <diane@caltech.edu>
Tue, 28 Aug 2012 22:56:22 +0000 (15:56 -0700)
committer: Diane Trout <diane@caltech.edu>
Tue, 28 Aug 2012 22:56:22 +0000 (15:56 -0700)
This commit fixes that by special-casing the empty string, and adds a test for that edge case.

htsworkflow/util/rdfhelp.py
htsworkflow/util/test/test_rdfhelp.py

index 3ca27fb97315b968644724a977e55aedd1bb5443..29df21ec492beeaee42b54dc65e7216753f6a3a7 100644 (file)
@@ -267,7 +267,7 @@ def load_into_model(model, parser_name, path, ns=None):
 
 def load_string_into_model(model, parser_name, data, ns=None):
     if ns is None:
-        ns = RDF.NS("http://localhost/")
+        ns = RDF.Uri("http://localhost/")
     imports = owlNS['imports']
     rdf_parser = RDF.Parser(name=parser_name)
     for s in rdf_parser.parse_string_as_stream(data, ns):
@@ -288,14 +288,18 @@ def sanitize_literal(node):
     if not isinstance(node, RDF.Node):
         raise ValueError("sanitize_literal only works on RDF.Nodes")
 
-    element = lxml.html.fromstring(node.literal_value['string'])
-    cleaner = lxml.html.clean.Cleaner(page_structure=False)
-    element = cleaner.clean_html(element)
-    text = lxml.html.tostring(element)
-    p_len = 3
-    slash_p_len = 4
+    s = node.literal_value['string']
+    if len(s) > 0:
+        element = lxml.html.fromstring(s)
+        cleaner = lxml.html.clean.Cleaner(page_structure=False)
+        element = cleaner.clean_html(element)
+        text = lxml.html.tostring(element)
+        p_len = 3
+        slash_p_len = 4
 
-    args = {'literal': text[p_len:-slash_p_len]}
+        args = {'literal': text[p_len:-slash_p_len]}
+    else:
+        args = {'literal': ''}
     datatype = node.literal_value['datatype']
     if datatype is not None:
         args['datatype'] = datatype
index 4fe6769f827b0636cccfdbdd63307981018e7423..34c3200a909de310c8fb9dd48bcb04605aa8a3c9 100644 (file)
@@ -150,6 +150,11 @@ _:a owl:imports "{loc}extra.turtle" .
             self.failUnlessEqual(hello_clean.literal_value['string'],
                                  hello_text)
 
+        def test_sanitize_literal_empty_string(self):
+            value = ""
+            value_node = RDF.Node(value)
+            self.assertEqual(str(sanitize_literal(value_node)), value)
+
         def test_sanitize_literal_html(self):
             hello = "hello <a onload='javascript:alert(\"foo\");' href='http://google.com'>google.com</a>, whats up?"
             hello_clean = 'hello <a href="http://google.com">google.com</a>, whats up?'