From e798da687bccfa97c7a57d6b158e252ee9e310b5 Mon Sep 17 00:00:00 2001
From: Nate Prewitt <Nate.Prewitt@gmail.com>
Date: Tue, 24 Jan 2017 07:06:26 -0700
Subject: [PATCH] Restrict normalization to unicode-compatible values

This allows us to avoid accidentally overriding our initial lexical
value with one that is not unicode-compatible after normalization.
This is specifically relevant for arbitrary binary data with bytes
outside of the defined unicode range.
---
 rdflib/term.py    | 26 ++++++++++++++++++++------
 test/test_term.py | 38 ++++++++++++++++++++++++++++++++------
 2 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/rdflib/term.py b/rdflib/term.py
index 37c0e84a9..37fa4ac7a 100644
--- a/rdflib/term.py
+++ b/rdflib/term.py
@@ -50,7 +50,6 @@
 
 from isodate import parse_time, parse_date, parse_datetime
 
-
 try:
     from hashlib import md5
     assert md5
@@ -63,7 +62,6 @@
 from rdflib.compat import numeric_greater
 
 
-
 b = py3compat.b
 
 skolem_genid = "/.well-known/genid/"
@@ -83,6 +81,24 @@ def _is_valid_uri(uri):
 def _is_valid_langtag(tag):
     return bool(_lang_tag_regex.match(tag))
 
+def _is_valid_unicode(value):
+    """
+    Verify that the provided value can be converted into a Python
+    unicode object.
+    """
+    if isinstance(value, bytes):
+        coding_func, param = getattr(value, 'decode'), 'utf-8'
+    elif py3compat.PY3:
+        coding_func, param = str, value
+    else:
+        coding_func, param = unicode, value
+
+    # try to convert value into unicode
+    try:
+        coding_func(param)
+    except UnicodeError:
+        return False
+    return True
 
 class Node(object):
     """
@@ -571,7 +587,7 @@ def __new__(cls, lexical_or_value, lang=None, datatype=None, normalize=None):
 
                 if value is not None and normalize:
                     _value, _datatype = _castPythonToLiteral(value)
-                    if _value is not None:
+                    if _value is not None and _is_valid_unicode(_value):
                         lexical_or_value = _value
 
         else:
@@ -585,7 +601,6 @@ def __new__(cls, lexical_or_value, lang=None, datatype=None, normalize=None):
             if datatype:
                 lang = None
 
-
         if py3compat.PY3 and isinstance(lexical_or_value, bytes):
             lexical_or_value = lexical_or_value.decode('utf-8')
 
@@ -1495,8 +1510,7 @@ def _castPythonToLiteral(obj):
     URIRef(_XSD_PFX + 'unsignedByte'): int,
     URIRef(_XSD_PFX + 'float'): float,
     URIRef(_XSD_PFX + 'double'): float,
-    URIRef(
-        _XSD_PFX + 'base64Binary'): lambda s: base64.b64decode(py3compat.b(s)),
+    URIRef(_XSD_PFX + 'base64Binary'): lambda s: base64.b64decode(s),
     URIRef(_XSD_PFX + 'anyURI'): None,
     _RDF_XMLLITERAL: _parseXML,
     _RDF_HTMLLITERAL: _parseHTML
diff --git a/test/test_term.py b/test/test_term.py
index fdbea4d53..78fa7a034 100644
--- a/test/test_term.py
+++ b/test/test_term.py
@@ -3,9 +3,12 @@
 """
 
 import unittest
+import base64
+
 from rdflib.py3compat import format_doctest_out as uformat
-from rdflib.term import URIRef, BNode
+from rdflib.term import URIRef, BNode, Literal, _is_valid_unicode
 from rdflib.graph import QuotedGraph, Graph
+from rdflib.namespace import XSD
 
 class TestURIRefRepr(unittest.TestCase):
     """
@@ -25,15 +28,38 @@ def testGracefulOrdering(self):
         a = u>BNode()
         a = u>QuotedGraph(g.store, u)
         a = u>g
-        
-        
-        
-        
+
 
 class TestBNodeRepr(unittest.TestCase):
-   
+
     def testSubclassNameAppearsInRepr(self):
         class MyBNode(BNode):
             pass
         x = MyBNode()
         self.assertTrue(repr(x).startswith("MyBNode("))
+
+
+class TestLiteral(unittest.TestCase):
+
+    def test_base64_values(self):
+        b64msg = 'cmRmbGliIGlzIGNvb2whIGFsc28gaGVyZSdzIHNvbWUgYmluYXJ5IAAR83UC'
+        decoded_b64msg = base64.b64decode(b64msg)
+        lit = Literal(b64msg, datatype=XSD.base64Binary)
+        self.assertEqual(lit.value, decoded_b64msg)
+        self.assertEqual(str(lit), b64msg)
+
+
+class TestValidityFunctions(unittest.TestCase):
+
+    def test_is_valid_unicode(self):
+        testcase_list = (
+            (None, True),
+            (1, True),
+            (['foo'], True),
+            ({'foo': b'bar'}, True),
+            ('foo', True),
+            (b'foo\x00', True),
+            (b'foo\xf3\x02', False)
+        )
+        for val, expected in testcase_list:
+            self.assertEqual(_is_valid_unicode(val), expected)