From dff5a56a5dc2d300124cf56e75656f02d6cea924 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 24 Nov 2017 22:51:59 +0900 Subject: [PATCH 1/4] Issue #144 and #145: add encoding-related tests for S3 functionality Assume that if mode is binary and encoding is explicitly specified, the user actually wanted text. --- smart_open/smart_open_lib.py | 12 ++++++++- smart_open/tests/test_smart_open.py | 40 +++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 23d6f63b..2ac3118a 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -140,11 +140,21 @@ def smart_open(uri, mode="rb", **kw): """ logger.debug('%r', locals()) + # + # This is a work-around for the problem described in Issue #144. + # If the user has explicitly specified an encoding, then assume they want + # us to open the destination in text mode, instead of the default binary. + # + # If we change the default mode to be text, and match the normal behavior + # of Py2 and 3, then the above assumption will be unnecessary. + # + if kw.get('encoding') is not None and 'b' in mode: + mode = mode.replace('b', '') + # validate mode parameter if not isinstance(mode, six.string_types): raise TypeError('mode should be a string') - if isinstance(uri, six.string_types): # this method just routes the request to classes handling the specific storage # schemes, depending on the URI protocol in `uri` diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py index eb9bdf1c..2f28062a 100644 --- a/smart_open/tests/test_smart_open.py +++ b/smart_open/tests/test_smart_open.py @@ -1015,6 +1015,46 @@ def test_gzip_read_mode(self): smart_open.s3_open_uri(uri, "r") mock_open.assert_called_with('bucket', 'key.gz', 'rb') + @mock_s3 + def test_read_encoding(self): + """Should open the file with the correct encoding, explicit text read.""" + conn = boto.connect_s3() + conn.create_bucket('test-bucket') + key = "s3://bucket/key.txt" + text = u'это знала ева, это знал адам, колеса любви едут прямо по нам' + with smart_open.smart_open(key, 'wb') as fout: + fout.write(text.encode('koi8-r')) + with smart_open.smart_open(key, 'r', encoding='koi8-r') as fin: + actual = fin.read() + self.assertEqual(text, actual) + + @mock_s3 + def test_read_encoding_implicit_text(self): + """Should open the file with the correct encoding, implicit text read.""" + conn = boto.connect_s3() + conn.create_bucket('test-bucket') + key = "s3://bucket/key.txt" + text = u'это знала ева, это знал адам, колеса любви едут прямо по нам' + with smart_open.smart_open(key, 'wb') as fout: + fout.write(text.encode('koi8-r')) + with smart_open.smart_open(key, encoding='koi8-r') as fin: + actual = fin.read() + self.assertEqual(text, actual) + + @mock_s3 + def test_write_encoding(self): + """Should open the file for writing with the correct encoding.""" + conn = boto.connect_s3() + conn.create_bucket('test-bucket') + key = "s3://bucket/key.txt" + text = u'какая боль, какая боль, аргентина - ямайка, 5-0' + + with smart_open.smart_open(key, 'w', encoding='koi8-r') as fout: + fout.write(text) + with smart_open.smart_open(key, encoding='koi8-r') as fin: + actual = fin.read() + self.assertEqual(text, actual) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From aab4a9a4b7f697dd0f71be04052270d137ef6f08 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 24 Nov 2017 23:39:25 +0900 Subject: [PATCH 2/4] Issue #146: add warnings when encoding is quietly ignored --- smart_open/smart_open_lib.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 2ac3118a..18b142ff 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -28,6 +28,7 @@ import sys import requests import io +import warnings from boto.compat import BytesIO, urlsplit, six import boto.s3.connection @@ -167,6 +168,8 @@ def smart_open(uri, mode="rb", **kw): elif parsed_uri.scheme in ("s3", "s3n", 's3u'): return s3_open_uri(parsed_uri, mode, **kw) elif parsed_uri.scheme in ("hdfs", ): + if kw.get('encoding') is not None: + warnings.warn('Issue #146: encoding is not supported for HDFS, ignoring') if mode in ('r', 'rb'): return HdfsOpenRead(parsed_uri, **kw) if mode in ('w', 'wb'): @@ -174,6 +177,8 @@ def smart_open(uri, mode="rb", **kw): else: raise NotImplementedError("file mode %s not supported for %r scheme", mode, parsed_uri.scheme) elif parsed_uri.scheme in ("webhdfs", ): + if kw.get('encoding') is not None: + warnings.warn('Issue #146: encoding is not supported for WebHDFS, ignoring') if mode in ('r', 'rb'): return WebHdfsOpenRead(parsed_uri, **kw) elif mode in ('w', 'wb'): @@ -181,6 +186,8 @@ def smart_open(uri, mode="rb", **kw): else: raise NotImplementedError("file mode %s not supported for %r scheme", mode, parsed_uri.scheme) elif parsed_uri.scheme.startswith('http'): + if kw.get('encoding') is not None: + warnings.warn('Issue #146: encoding is not supported for HTTP, ignoring') if mode in ('r', 'rb'): return HttpOpenRead(parsed_uri, **kw) else: From 09c14f648a80fa2102f5d9c1f25638e764630422 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 25 Nov 2017 02:08:42 +0900 Subject: [PATCH 3/4] make warning message more verbose, add tests --- smart_open/smart_open_lib.py | 21 +++++++++++++++------ smart_open/tests/test_smart_open.py | 23 +++++++++++++++++++++++ 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 18b142ff..a1653cd0 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -77,6 +77,12 @@ SYSTEM_ENCODING = sys.getdefaultencoding() +_ISSUE_146_FSTR = ( + "You have explicitly specified encoding=%(encoding)s, but smart_open does " + "not currently support decoding text via the %(scheme)s scheme. " + "Re-open the file without specifying an encoding to suppress this warning." +) + def smart_open(uri, mode="rb", **kw): """ @@ -168,8 +174,9 @@ def smart_open(uri, mode="rb", **kw): elif parsed_uri.scheme in ("s3", "s3n", 's3u'): return s3_open_uri(parsed_uri, mode, **kw) elif parsed_uri.scheme in ("hdfs", ): - if kw.get('encoding') is not None: - warnings.warn('Issue #146: encoding is not supported for HDFS, ignoring') + encoding = kw.pop('encoding') + if encoding is not None: + warnings.warn(_ISSUE_146_FSTR % {'encoding': encoding, 'scheme': parsed_uri.scheme}) if mode in ('r', 'rb'): return HdfsOpenRead(parsed_uri, **kw) if mode in ('w', 'wb'): @@ -177,8 +184,9 @@ def smart_open(uri, mode="rb", **kw): else: raise NotImplementedError("file mode %s not supported for %r scheme", mode, parsed_uri.scheme) elif parsed_uri.scheme in ("webhdfs", ): - if kw.get('encoding') is not None: - warnings.warn('Issue #146: encoding is not supported for WebHDFS, ignoring') + encoding = kw.pop('encoding') + if encoding is not None: + warnings.warn(_ISSUE_146_FSTR % {'encoding': encoding, 'scheme': parsed_uri.scheme}) if mode in ('r', 'rb'): return WebHdfsOpenRead(parsed_uri, **kw) elif mode in ('w', 'wb'): @@ -186,8 +194,9 @@ def smart_open(uri, mode="rb", **kw): else: raise NotImplementedError("file mode %s not supported for %r scheme", mode, parsed_uri.scheme) elif parsed_uri.scheme.startswith('http'): - if kw.get('encoding') is not None: - warnings.warn('Issue #146: encoding is not supported for HTTP, ignoring') + encoding = kw.pop('encoding') + if encoding is not None: + warnings.warn(_ISSUE_146_FSTR % {'encoding': encoding, 'scheme': parsed_uri.scheme}) if mode in ('r', 'rb'): return HttpOpenRead(parsed_uri, **kw) else: diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py index 2f28062a..8f866cb0 100644 --- a/smart_open/tests/test_smart_open.py +++ b/smart_open/tests/test_smart_open.py @@ -333,6 +333,17 @@ def test_hdfs(self, mock_subprocess): smart_open_object.__iter__() mock_subprocess.Popen.assert_called_with(["hdfs", "dfs", "-text", "/tmp/test.txt"], stdout=mock_subprocess.PIPE) + @mock.patch('smart_open.smart_open_lib.subprocess') + def test_hdfs_encoding(self, mock_subprocess): + """Is HDFS line iterator called correctly?""" + mock_subprocess.PIPE.return_value = "test" + with mock.patch('warnings.warn') as warn: + smart_open.smart_open("hdfs:///tmp/test.txt", encoding='utf-8') + expected = smart_open.smart_open_lib._ISSUE_146_FSTR % { + 'encoding': 'utf-8', 'scheme': 'hdfs' + } + warn.assert_called_with(expected) + @responses.activate def test_webhdfs(self): """Is webhdfs line iterator called correctly""" @@ -342,6 +353,18 @@ def test_webhdfs(self): self.assertEqual(next(iterator).decode("utf-8"), "line1") self.assertEqual(next(iterator).decode("utf-8"), "line2") + @mock.patch('smart_open.smart_open_lib.subprocess') + def test_webhdfs_encoding(self, mock_subprocess): + """Is HDFS line iterator called correctly?""" + url = "webhdfs://127.0.0.1:8440/path/file" + mock_subprocess.PIPE.return_value = "test" + with mock.patch('warnings.warn') as warn: + smart_open.smart_open(url, encoding='utf-8') + expected = smart_open.smart_open_lib._ISSUE_146_FSTR % { + 'encoding': 'utf-8', 'scheme': 'webhdfs' + } + warn.assert_called_with(expected) + @responses.activate def test_webhdfs_read(self): """Does webhdfs read method work correctly""" From 3b506f1c8929ddcc8a9a267a36e79f1120bb6427 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 25 Nov 2017 02:34:59 +0900 Subject: [PATCH 4/4] fixup for 09c14f6: allow encoding to be unspecified --- smart_open/smart_open_lib.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index a1653cd0..ec71f87d 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -174,7 +174,7 @@ def smart_open(uri, mode="rb", **kw): elif parsed_uri.scheme in ("s3", "s3n", 's3u'): return s3_open_uri(parsed_uri, mode, **kw) elif parsed_uri.scheme in ("hdfs", ): - encoding = kw.pop('encoding') + encoding = kw.pop('encoding', None) if encoding is not None: warnings.warn(_ISSUE_146_FSTR % {'encoding': encoding, 'scheme': parsed_uri.scheme}) if mode in ('r', 'rb'): @@ -184,7 +184,7 @@ def smart_open(uri, mode="rb", **kw): else: raise NotImplementedError("file mode %s not supported for %r scheme", mode, parsed_uri.scheme) elif parsed_uri.scheme in ("webhdfs", ): - encoding = kw.pop('encoding') + encoding = kw.pop('encoding', None) if encoding is not None: warnings.warn(_ISSUE_146_FSTR % {'encoding': encoding, 'scheme': parsed_uri.scheme}) if mode in ('r', 'rb'): @@ -194,7 +194,7 @@ def smart_open(uri, mode="rb", **kw): else: raise NotImplementedError("file mode %s not supported for %r scheme", mode, parsed_uri.scheme) elif parsed_uri.scheme.startswith('http'): - encoding = kw.pop('encoding') + encoding = kw.pop('encoding', None) if encoding is not None: warnings.warn(_ISSUE_146_FSTR % {'encoding': encoding, 'scheme': parsed_uri.scheme}) if mode in ('r', 'rb'):