Skip to content

Commit

Permalink
Restrict normalization to unicode-compatible values
Browse files Browse the repository at this point in the history
This allows us to avoid accidentally overriding our initial lexical
value with one that is not unicode compatible after normalization.
This is specifically relevant for arbitrary binary data with bytes
outside of the defined unicode range.
  • Loading branch information
nateprewitt committed Jan 23, 2017
1 parent 14243b6 commit e4f3f20
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 12 deletions.
27 changes: 20 additions & 7 deletions rdflib/term.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@

from isodate import parse_time, parse_date, parse_datetime


try:
from hashlib import md5
assert md5
Expand All @@ -63,7 +62,6 @@
from rdflib.compat import numeric_greater



b = py3compat.b

skolem_genid = "/.well-known/genid/"
Expand All @@ -83,6 +81,24 @@ def _is_valid_uri(uri):
def _is_valid_langtag(tag):
    """
    Report whether ``tag`` is matched by ``_lang_tag_regex``.

    Returns ``True`` when the regex's ``match`` call yields a truthy
    result, ``False`` otherwise.
    """
    matched = _lang_tag_regex.match(tag)
    return bool(matched)

def _is_valid_unicode(value):
"""
Verify that the provided value can be converted into a Python
unicode object.
"""
if isinstance(value, bytes):
coding_func, param = getattr(value, 'decode'), 'utf-8'
elif py3compat.PY3:
coding_func, param = str, value
else:
coding_func, param = unicode, value

# Try to encode/decode data into ascii
try:
coding_func(param)
except UnicodeError:
return False
return True

class Node(object):
"""
Expand Down Expand Up @@ -568,10 +584,9 @@ def __new__(cls, lexical_or_value, lang=None, datatype=None, normalize=None):
# passed a string
# try parsing lexical form of datatyped literal
value = _castLexicalToPython(lexical_or_value, datatype)

if value is not None and normalize:
_value, _datatype = _castPythonToLiteral(value)
if _value is not None:
if _value is not None and _is_valid_unicode(_value):
lexical_or_value = _value

else:
Expand All @@ -585,7 +600,6 @@ def __new__(cls, lexical_or_value, lang=None, datatype=None, normalize=None):
if datatype:
lang = None


if py3compat.PY3 and isinstance(lexical_or_value, bytes):
lexical_or_value = lexical_or_value.decode('utf-8')

Expand Down Expand Up @@ -1495,8 +1509,7 @@ def _castPythonToLiteral(obj):
URIRef(_XSD_PFX + 'unsignedByte'): int,
URIRef(_XSD_PFX + 'float'): float,
URIRef(_XSD_PFX + 'double'): float,
URIRef(
_XSD_PFX + 'base64Binary'): lambda s: base64.b64decode(py3compat.b(s)),
URIRef(_XSD_PFX + 'base64Binary'): lambda s: base64.b64decode(s),
URIRef(_XSD_PFX + 'anyURI'): None,
_RDF_XMLLITERAL: _parseXML,
_RDF_HTMLLITERAL: _parseHTML
Expand Down
34 changes: 29 additions & 5 deletions test/test_term.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
"""

import unittest
import base64

from rdflib.py3compat import format_doctest_out as uformat
from rdflib.term import URIRef, BNode
from rdflib.term import URIRef, BNode, Literal, _is_valid_unicode
from rdflib.graph import QuotedGraph, Graph
from rdflib.namespace import XSD

class TestURIRefRepr(unittest.TestCase):
"""
Expand All @@ -25,10 +28,7 @@ def testGracefulOrdering(self):
a = u>BNode()
a = u>QuotedGraph(g.store, u)
a = u>g






class TestBNodeRepr(unittest.TestCase):

Expand All @@ -37,3 +37,27 @@ class MyBNode(BNode):
pass
x = MyBNode()
self.assert_(repr(x).startswith("MyBNode("))

class TestLiteral(unittest.TestCase):
    """Tests for constructing ``Literal`` terms."""

    def test_base64_values(self):
        # A base64Binary literal built from a base64 string should expose
        # the decoded payload as ``.value`` while its string form remains
        # the original base64 text.
        encoded = 'cmRmbGliIGlzIGNvb2whIGFsc28gaGVyZSdzIHNvbWUgYmluYXJ5IAAR83UC'
        expected_value = base64.b64decode(encoded)

        literal = Literal(encoded, datatype=XSD.base64Binary)

        self.assertEqual(literal.value, expected_value)
        self.assertEqual(str(literal), encoded)

class TestValidityFunctions(unittest.TestCase):
    """Tests for the validity helper functions imported from rdflib.term."""

    def test_is_valid_unicode(self):
        # Each entry is (input value, expected result of _is_valid_unicode).
        testcase_list = (
            (None, True),
            (1, True),
            (['foo'], True),
            ({'foo': b'bar'}, True),
            ('foo', True),
            (b'foo\x00', True),
            (b'foo\xf3\x02', False)
        )
        for val, expected in testcase_list:
            # Include the offending input in the failure message so a
            # broken case can be identified without re-running by hand.
            self.assertEqual(
                _is_valid_unicode(val), expected,
                "_is_valid_unicode(%r) should be %r" % (val, expected))

0 comments on commit e4f3f20

Please sign in to comment.