From fb289ab9dce911c74e52f473ce240a9cab5a9c52 Mon Sep 17 00:00:00 2001 From: Michael Penkov <misha.penkov@gmail.com> Date: Sun, 14 Apr 2019 11:36:32 +0900 Subject: [PATCH 1/8] expose __version__, fix #289 --- MANIFEST.in | 1 + setup.py | 13 ++++++++++++- smart_open/VERSION | 1 + smart_open/__init__.py | 10 ++++++++++ 4 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 smart_open/VERSION diff --git a/MANIFEST.in b/MANIFEST.in index c60c7b57..e593af0d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,3 +2,4 @@ include LICENSE include README.rst include CHANGELOG.rst include setup.py +include smart_open/VERSION diff --git a/setup.py b/setup.py index 78c5a4e4..819f714b 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,17 @@ def read(fname): return io.open(os.path.join(os.path.dirname(__file__), fname), encoding='utf-8').read() +# +# This code intentially duplicates a similar function in __init__.py. The +# alternative would be to somehow import that module to access the function, +# which would be too messy for a setup.py script. +# +def _get_version(): + curr_dir = os.path.dirname(os.path.abspath(__file__)) + with open(os.path.join(curr_dir, 'smart_open', 'VERSION')) as fin: + return fin.read().strip() + + tests_require = [ 'mock', 'moto==1.3.4', @@ -32,7 +43,7 @@ def read(fname): setup( name='smart_open', - version='1.8.1', + version=_get_version(), description='Utils for streaming large files (S3, HDFS, gzip, bz2...)', long_description=read('README.rst'), diff --git a/smart_open/VERSION b/smart_open/VERSION new file mode 100644 index 00000000..a8fdfda1 --- /dev/null +++ b/smart_open/VERSION @@ -0,0 +1 @@ +1.8.1 diff --git a/smart_open/__init__.py b/smart_open/__init__.py index ca425aa4..db77ee84 100644 --- a/smart_open/__init__.py +++ b/smart_open/__init__.py @@ -23,10 +23,20 @@ """ import logging +import os.path from .smart_open_lib import open, smart_open, register_compressor from .s3 import iter_bucket as s3_iter_bucket __all__ = ['open', 'smart_open', 's3_iter_bucket', 'register_compressor'] + +def _get_version(): + curr_dir = os.path.dirname(os.path.abspath(__file__)) + with open(os.path.join(curr_dir, 'VERSION')) as fin: + return fin.read().strip() + + +__version__ = _get_version() + logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) From 1724ebab5f7c2af504d5035926e91674ba1f9972 Mon Sep 17 00:00:00 2001 From: Michael Penkov <misha.penkov@gmail.com> Date: Sun, 14 Apr 2019 11:39:04 +0900 Subject: [PATCH 2/8] fix warning, fix #289 --- smart_open/ssh.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/smart_open/ssh.py b/smart_open/ssh.py index 1310ca3f..fbd37e49 100644 --- a/smart_open/ssh.py +++ b/smart_open/ssh.py @@ -24,13 +24,14 @@ import getpass import logging +import warnings logger = logging.getLogger(__name__) try: import paramiko except ImportError: - logger.warning('paramiko missing, opening SSH/SCP/SFTP paths will be disabled. `pip install paramiko` to suppress') + warnings.warn('paramiko missing, opening SSH/SCP/SFTP paths will be disabled. `pip install paramiko` to suppress') # # Global storage for SSH connections. From f6d9b006d2b74febe429dc0f25a4deb9a262f415 Mon Sep 17 00:00:00 2001 From: Michael Penkov <misha.penkov@gmail.com> Date: Sun, 14 Apr 2019 14:28:36 +0900 Subject: [PATCH 3/8] include VERSIOn in package_data --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 819f714b..ea467d25 100644 --- a/setup.py +++ b/setup.py @@ -89,4 +89,5 @@ def _get_version(): 'Topic :: System :: Distributed Computing', 'Topic :: Database :: Front-Ends', ], + package_data={'smart_open': ['VERSION']}, ) From 01155d4c405667dcfcf6e1e4384e236b3a18f8c7 Mon Sep 17 00:00:00 2001 From: Michael Penkov <misha.penkov@gmail.com> Date: Sun, 14 Apr 2019 14:46:20 +0900 Subject: [PATCH 4/8] fixup --- setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ea467d25..dca6c905 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,10 @@ def _get_version(): long_description=read('README.rst'), packages=find_packages(), - package_data={"smart_open.tests": ["test_data/*gz"]}, + package_data={ + "smart_open": ["VERSION"]}, + "smart_open.tests": ["test_data/*gz"] + }, author='Radim Rehurek', author_email='me@radimrehurek.com', @@ -89,5 +92,4 @@ def _get_version(): 'Topic :: System :: Distributed Computing', 'Topic :: Database :: Front-Ends', ], - package_data={'smart_open': ['VERSION']}, ) From efc1093ef0972828cd692825604bad764e59ee5e Mon Sep 17 00:00:00 2001 From: Michael Penkov <misha.penkov@gmail.com> Date: Sun, 14 Apr 2019 14:54:33 +0900 Subject: [PATCH 5/8] fixup --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index dca6c905..02323da1 100644 --- a/setup.py +++ b/setup.py @@ -49,8 +49,8 @@ def _get_version(): packages=find_packages(), package_data={ - "smart_open": ["VERSION"]}, - "smart_open.tests": ["test_data/*gz"] + "smart_open": ["VERSION"], + "smart_open.tests": ["test_data/*gz"], }, author='Radim Rehurek', From fcffbe8662bdf791cba58efe5ea5e25ca3f084f6 Mon Sep 17 00:00:00 2001 From: Michael Penkov <misha.penkov@gmail.com> Date: Sun, 14 Apr 2019 21:59:29 +0900 Subject: [PATCH 6/8] fix #285 --- smart_open/smart_open_lib.py | 29 ++++++++++++++++++++++++++++- smart_open/tests/test_smart_open.py | 10 ++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index e3fd04e5..8eb6699f 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -563,6 +563,32 @@ def _s3_open_uri(parsed_uri, mode, transport_params): return smart_open_s3.open(parsed_uri.bucket_id, parsed_uri.key_id, mode, **kwargs) +def _my_urlsplit(url): + """This is a hack to prevent the regular urlsplit from splitting around question marks. + + A question mark (?) in a URL typically indicates the start of a + querystring, and the standard library's urlparse function handles the + querystring separately. Unfortunately, question marks can also appear + _inside_ the actual URL for some schemas like S3. + + Replaces question marks with newlines prior to splitting. This is safe because: + + 1. The standard library's urlsplit completely ignores newlines + 2. Raw newlines will never occur in innocuous URLs. They are always URL-encoded. + + See Also + -------- + https://github.com/python/cpython/blob/3.7/Lib/urllib/parse.py + https://github.com/RaRe-Technologies/smart_open/issues/285 + """ + if '?' not in url: + return urlsplit(url, allow_fragments=False) + + sr = urlsplit(url.replace('?', '\n'), allow_fragments=False) + SplitResult = collections.namedtuple('SplitResult', 'scheme netloc path query fragment') + return SplitResult(sr.scheme, sr.netloc, sr.path.replace('\n', '?'), '', '') + + def _parse_uri(uri_as_string): """ Parse the given URI from a string. @@ -604,7 +630,8 @@ def _parse_uri(uri_as_string): if '://' not in uri_as_string: # no protocol given => assume a local file uri_as_string = 'file://' + uri_as_string - parsed_uri = urlsplit(uri_as_string, allow_fragments=False) + + parsed_uri = _my_urlsplit(uri_as_string) if parsed_uri.scheme == "hdfs": return _parse_uri_hdfs(parsed_uri) diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py index 5ccf38ff..e622a041 100644 --- a/smart_open/tests/test_smart_open.py +++ b/smart_open/tests/test_smart_open.py @@ -104,6 +104,16 @@ def test_s3_uri_has_atmark_in_key_name2(self): self.assertEqual(parsed_uri.host, "hostname") self.assertEqual(parsed_uri.port, 1234) + def test_s3_handles_fragments(self): + uri_str = 's3://bucket-name/folder/picture #1.jpg' + parsed_uri = smart_open_lib._parse_uri(uri_str) + self.assertEqual(parsed_uri.key_id, "folder/picture #1.jpg") + + def test_s3_handles_querystring(self): + uri_str = 's3://bucket-name/folder/picture1.jpg?bar' + parsed_uri = smart_open_lib._parse_uri(uri_str) + self.assertEqual(parsed_uri.key_id, "folder/picture1.jpg?bar") + def test_s3_invalid_url_atmark_in_bucket_name(self): self.assertRaises(ValueError, smart_open_lib._parse_uri, "s3://access_id:access_secret@my@bucket@port/mykey") From 7edc1bd6b60a895531217187635252181b272f82 Mon Sep 17 00:00:00 2001 From: Michael Penkov <misha.penkov@gmail.com> Date: Mon, 15 Apr 2019 09:57:53 +0900 Subject: [PATCH 7/8] more logging -> warnings changes --- smart_open/s3.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/smart_open/s3.py b/smart_open/s3.py index 7f31a2b8..6434ec7d 100644 --- a/smart_open/s3.py +++ b/smart_open/s3.py @@ -5,6 +5,7 @@ import contextlib import functools import logging +import warnings import boto3 import botocore.client @@ -20,7 +21,7 @@ import multiprocessing.pool _MULTIPROCESSING = True except ImportError: - logger.warning("multiprocessing could not be imported and won't be used") + warnings.warn("multiprocessing could not be imported and won't be used") DEFAULT_MIN_PART_SIZE = 50 * 1024**2 From 45bd792327c58c7efee4fb08a88c8c9ac9d07eec Mon Sep 17 00:00:00 2001 From: Michael Penkov <misha.penkov@gmail.com> Date: Mon, 15 Apr 2019 09:58:33 +0900 Subject: [PATCH 8/8] add unit test, fix #47 --- smart_open/tests/test_s3.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/smart_open/tests/test_s3.py b/smart_open/tests/test_s3.py index 50ca7893..dd50fee4 100644 --- a/smart_open/tests/test_s3.py +++ b/smart_open/tests/test_s3.py @@ -446,6 +446,25 @@ def test_old(self): self.assertEqual(result, expected) +@maybe_mock_s3 +class IterBucketSingleProcessTest(unittest.TestCase): + def setUp(self): + self.old_flag = smart_open.s3._MULTIPROCESSING + smart_open.s3._MULTIPROCESSING = False + + def tearDown(self): + smart_open.s3._MULTIPROCESSING = self.old_flag + + def test(self): + num_keys = 101 + populate_bucket(num_keys=num_keys) + keys = list(smart_open.s3.iter_bucket(BUCKET_NAME)) + self.assertEqual(len(keys), num_keys) + + expected = [('key_%d' % x, b'%d' % x) for x in range(num_keys)] + self.assertEqual(sorted(keys), sorted(expected)) + + @maybe_mock_s3 class DownloadKeyTest(unittest.TestCase):