Restrict normalization to unicode-compatible values

This allows us to avoid accidentally overriding our initial lexical value with one that is not Unicode-compatible after normalization. This is particularly relevant for arbitrary binary data containing bytes outside of the defined Unicode range.
RDFLib · Jan 23, 2017 · e4f3f20 · e4f3f20
1 parent 14243b6
commit e4f3f20
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 12 deletions.
diff --git a/rdflib/term.py b/rdflib/term.py
@@ -50,7 +50,6 @@
 
 from isodate import parse_time, parse_date, parse_datetime
 
-
 try:
     from hashlib import md5
     assert md5
@@ -63,7 +62,6 @@
 from rdflib.compat import numeric_greater
 
 
-
 b = py3compat.b
 
 skolem_genid = "/.well-known/genid/"
@@ -83,6 +81,24 @@ def _is_valid_uri(uri):
 def _is_valid_langtag(tag):
     return bool(_lang_tag_regex.match(tag))
 
+def _is_valid_unicode(value):
+    """
+    Verify that the provided value can be converted into a Python
+    unicode object.
+    """
+    if isinstance(value, bytes):
+        coding_func, param = getattr(value, 'decode'), 'utf-8'
+    elif py3compat.PY3:
+        coding_func, param = str, value
+    else:
+        coding_func, param = unicode, value
+
+    # Try to encode/decode data into ascii
+    try:
+        coding_func(param)
+    except UnicodeError:
+        return False
+    return True
 
 class Node(object):
     """
@@ -568,10 +584,9 @@ def __new__(cls, lexical_or_value, lang=None, datatype=None, normalize=None):
                 # passed a string
                 # try parsing lexical form of datatyped literal
                 value = _castLexicalToPython(lexical_or_value, datatype)
-
                 if value is not None and normalize:
                     _value, _datatype = _castPythonToLiteral(value)
-                    if _value is not None:
+                    if _value is not None and _is_valid_unicode(_value):
                         lexical_or_value = _value
 
         else:
@@ -585,7 +600,6 @@ def __new__(cls, lexical_or_value, lang=None, datatype=None, normalize=None):
             if datatype:
                 lang = None
 
-
         if py3compat.PY3 and isinstance(lexical_or_value, bytes):
             lexical_or_value = lexical_or_value.decode('utf-8')
 
@@ -1495,8 +1509,7 @@ def _castPythonToLiteral(obj):
     URIRef(_XSD_PFX + 'unsignedByte'): int,
     URIRef(_XSD_PFX + 'float'): float,
     URIRef(_XSD_PFX + 'double'): float,
-    URIRef(
-        _XSD_PFX + 'base64Binary'): lambda s: base64.b64decode(py3compat.b(s)),
+    URIRef(_XSD_PFX + 'base64Binary'): lambda s: base64.b64decode(s),
     URIRef(_XSD_PFX + 'anyURI'): None,
     _RDF_XMLLITERAL: _parseXML,
     _RDF_HTMLLITERAL: _parseHTML

diff --git a/test/test_term.py b/test/test_term.py
@@ -3,9 +3,12 @@
 """
 
 import unittest
+import base64
+
 from rdflib.py3compat import format_doctest_out as uformat
-from rdflib.term import URIRef, BNode
+from rdflib.term import URIRef, BNode, Literal, _is_valid_unicode
 from rdflib.graph import QuotedGraph, Graph
+from rdflib.namespace import XSD
 
 class TestURIRefRepr(unittest.TestCase):
     """
@@ -25,10 +28,7 @@ def testGracefulOrdering(self):
         a = u>BNode()
         a = u>QuotedGraph(g.store, u)
         a = u>g
-
-
-
-
+
 
 class TestBNodeRepr(unittest.TestCase):
 
@@ -37,3 +37,27 @@ class MyBNode(BNode):
             pass
         x = MyBNode()
         self.assert_(repr(x).startswith("MyBNode("))
+
+class TestLiteral(unittest.TestCase):
+
+    def test_base64_values(self):
+        b64msg = 'cmRmbGliIGlzIGNvb2whIGFsc28gaGVyZSdzIHNvbWUgYmluYXJ5IAAR83UC'
+        decoded_b64msg = base64.b64decode(b64msg)
+        lit = Literal(b64msg, datatype=XSD.base64Binary)
+        self.assertEqual(lit.value, decoded_b64msg)
+        self.assertEqual(str(lit), b64msg)
+
+class TestValidityFunctions(unittest.TestCase):
+
+    def test_is_valid_unicode(self):
+        testcase_list = (
+            (None, True),
+            (1, True),
+            (['foo'], True),
+            ({'foo': b'bar'}, True),
+            ('foo', True),
+            (b'foo\x00', True),
+            (b'foo\xf3\x02', False)
+        )
+        for val, expected in testcase_list:
+            self.assertEqual(_is_valid_unicode(val), expected)