From fb289ab9dce911c74e52f473ce240a9cab5a9c52 Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Sun, 14 Apr 2019 11:36:32 +0900
Subject: [PATCH 1/8] expose __version__, fix #289

---
 MANIFEST.in            |  1 +
 setup.py               | 13 ++++++++++++-
 smart_open/VERSION     |  1 +
 smart_open/__init__.py | 10 ++++++++++
 4 files changed, 24 insertions(+), 1 deletion(-)
 create mode 100644 smart_open/VERSION

diff --git a/MANIFEST.in b/MANIFEST.in
index c60c7b57..e593af0d 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,3 +2,4 @@ include LICENSE
 include README.rst
 include CHANGELOG.rst
 include setup.py
+include smart_open/VERSION
diff --git a/setup.py b/setup.py
index 78c5a4e4..819f714b 100644
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,17 @@ def read(fname):
     return io.open(os.path.join(os.path.dirname(__file__), fname), encoding='utf-8').read()
 
 
+#
+# This code intentially duplicates a similar function in __init__.py.  The
+# alternative would be to somehow import that module to access the function,
+# which would be too messy for a setup.py script.
+#
+def _get_version():
+    curr_dir = os.path.dirname(os.path.abspath(__file__))
+    with open(os.path.join(curr_dir, 'smart_open', 'VERSION')) as fin:
+        return fin.read().strip()
+
+
 tests_require = [
     'mock',
     'moto==1.3.4',
@@ -32,7 +43,7 @@ def read(fname):
 
 setup(
     name='smart_open',
-    version='1.8.1',
+    version=_get_version(),
     description='Utils for streaming large files (S3, HDFS, gzip, bz2...)',
     long_description=read('README.rst'),
 
diff --git a/smart_open/VERSION b/smart_open/VERSION
new file mode 100644
index 00000000..a8fdfda1
--- /dev/null
+++ b/smart_open/VERSION
@@ -0,0 +1 @@
+1.8.1
diff --git a/smart_open/__init__.py b/smart_open/__init__.py
index ca425aa4..db77ee84 100644
--- a/smart_open/__init__.py
+++ b/smart_open/__init__.py
@@ -23,10 +23,20 @@
 """
 
 import logging
+import os.path
 
 from .smart_open_lib import open, smart_open, register_compressor
 from .s3 import iter_bucket as s3_iter_bucket
 __all__ = ['open', 'smart_open', 's3_iter_bucket', 'register_compressor']
 
+
+def _get_version():
+    curr_dir = os.path.dirname(os.path.abspath(__file__))
+    with open(os.path.join(curr_dir, 'VERSION')) as fin:
+        return fin.read().strip()
+
+
+__version__ = _get_version()
+
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())

From 1724ebab5f7c2af504d5035926e91674ba1f9972 Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Sun, 14 Apr 2019 11:39:04 +0900
Subject: [PATCH 2/8] fix warning, fix #289

---
 smart_open/ssh.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/smart_open/ssh.py b/smart_open/ssh.py
index 1310ca3f..fbd37e49 100644
--- a/smart_open/ssh.py
+++ b/smart_open/ssh.py
@@ -24,13 +24,14 @@
 
 import getpass
 import logging
+import warnings
 
 logger = logging.getLogger(__name__)
 
 try:
     import paramiko
 except ImportError:
-    logger.warning('paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress')
+    warnings.warn('paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress')
 
 #
 # Global storage for SSH connections.

From f6d9b006d2b74febe429dc0f25a4deb9a262f415 Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Sun, 14 Apr 2019 14:28:36 +0900
Subject: [PATCH 3/8] include VERSIOn in package_data

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 819f714b..ea467d25 100644
--- a/setup.py
+++ b/setup.py
@@ -89,4 +89,5 @@ def _get_version():
         'Topic :: System :: Distributed Computing',
         'Topic :: Database :: Front-Ends',
     ],
+    package_data={'smart_open': ['VERSION']},
 )

From 01155d4c405667dcfcf6e1e4384e236b3a18f8c7 Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Sun, 14 Apr 2019 14:46:20 +0900
Subject: [PATCH 4/8] fixup

---
 setup.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index ea467d25..dca6c905 100644
--- a/setup.py
+++ b/setup.py
@@ -48,7 +48,10 @@ def _get_version():
     long_description=read('README.rst'),
 
     packages=find_packages(),
-    package_data={"smart_open.tests": ["test_data/*gz"]},
+    package_data={
+        "smart_open": ["VERSION"]},
+        "smart_open.tests": ["test_data/*gz"]
+    },
 
     author='Radim Rehurek',
     author_email='me@radimrehurek.com',
@@ -89,5 +92,4 @@ def _get_version():
         'Topic :: System :: Distributed Computing',
         'Topic :: Database :: Front-Ends',
     ],
-    package_data={'smart_open': ['VERSION']},
 )

From efc1093ef0972828cd692825604bad764e59ee5e Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Sun, 14 Apr 2019 14:54:33 +0900
Subject: [PATCH 5/8] fixup

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index dca6c905..02323da1 100644
--- a/setup.py
+++ b/setup.py
@@ -49,8 +49,8 @@ def _get_version():
 
     packages=find_packages(),
     package_data={
-        "smart_open": ["VERSION"]},
-        "smart_open.tests": ["test_data/*gz"]
+        "smart_open": ["VERSION"],
+        "smart_open.tests": ["test_data/*gz"],
     },
 
     author='Radim Rehurek',

From fcffbe8662bdf791cba58efe5ea5e25ca3f084f6 Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Sun, 14 Apr 2019 21:59:29 +0900
Subject: [PATCH 6/8] fix #285

---
 smart_open/smart_open_lib.py        | 29 ++++++++++++++++++++++++++++-
 smart_open/tests/test_smart_open.py | 10 ++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py
index e3fd04e5..8eb6699f 100644
--- a/smart_open/smart_open_lib.py
+++ b/smart_open/smart_open_lib.py
@@ -563,6 +563,32 @@ def _s3_open_uri(parsed_uri, mode, transport_params):
     return smart_open_s3.open(parsed_uri.bucket_id, parsed_uri.key_id, mode, **kwargs)
 
 
+def _my_urlsplit(url):
+    """This is a hack to prevent the regular urlsplit from splitting around question marks.
+
+    A question mark (?) in a URL typically indicates the start of a
+    querystring, and the standard library's urlparse function handles the
+    querystring separately.  Unfortunately, question marks can also appear
+    _inside_ the actual URL for some schemas like S3.
+
+    Replaces question marks with newlines prior to splitting.  This is safe because:
+
+    1. The standard library's urlsplit completely ignores newlines
+    2. Raw newlines will never occur in innocuous URLs.  They are always URL-encoded.
+
+    See Also
+    --------
+    https://github.com/python/cpython/blob/3.7/Lib/urllib/parse.py
+    https://github.com/RaRe-Technologies/smart_open/issues/285
+    """
+    if '?' not in url:
+        return urlsplit(url, allow_fragments=False)
+
+    sr = urlsplit(url.replace('?', '\n'), allow_fragments=False)
+    SplitResult = collections.namedtuple('SplitResult', 'scheme netloc path query fragment')
+    return SplitResult(sr.scheme, sr.netloc, sr.path.replace('\n', '?'), '', '')
+
+
 def _parse_uri(uri_as_string):
     """
     Parse the given URI from a string.
@@ -604,7 +630,8 @@ def _parse_uri(uri_as_string):
         if '://' not in uri_as_string:
             # no protocol given => assume a local file
             uri_as_string = 'file://' + uri_as_string
-    parsed_uri = urlsplit(uri_as_string, allow_fragments=False)
+
+    parsed_uri = _my_urlsplit(uri_as_string)
 
     if parsed_uri.scheme == "hdfs":
         return _parse_uri_hdfs(parsed_uri)
diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py
index 5ccf38ff..e622a041 100644
--- a/smart_open/tests/test_smart_open.py
+++ b/smart_open/tests/test_smart_open.py
@@ -104,6 +104,16 @@ def test_s3_uri_has_atmark_in_key_name2(self):
         self.assertEqual(parsed_uri.host, "hostname")
         self.assertEqual(parsed_uri.port, 1234)
 
+    def test_s3_handles_fragments(self):
+        uri_str = 's3://bucket-name/folder/picture #1.jpg'
+        parsed_uri = smart_open_lib._parse_uri(uri_str)
+        self.assertEqual(parsed_uri.key_id, "folder/picture #1.jpg")
+
+    def test_s3_handles_querystring(self):
+        uri_str = 's3://bucket-name/folder/picture1.jpg?bar'
+        parsed_uri = smart_open_lib._parse_uri(uri_str)
+        self.assertEqual(parsed_uri.key_id, "folder/picture1.jpg?bar")
+
     def test_s3_invalid_url_atmark_in_bucket_name(self):
         self.assertRaises(ValueError, smart_open_lib._parse_uri, "s3://access_id:access_secret@my@bucket@port/mykey")
 

From 7edc1bd6b60a895531217187635252181b272f82 Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Mon, 15 Apr 2019 09:57:53 +0900
Subject: [PATCH 7/8] more logging -> warnings changes

---
 smart_open/s3.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/smart_open/s3.py b/smart_open/s3.py
index 7f31a2b8..6434ec7d 100644
--- a/smart_open/s3.py
+++ b/smart_open/s3.py
@@ -5,6 +5,7 @@
 import contextlib
 import functools
 import logging
+import warnings
 
 import boto3
 import botocore.client
@@ -20,7 +21,7 @@
     import multiprocessing.pool
     _MULTIPROCESSING = True
 except ImportError:
-    logger.warning("multiprocessing could not be imported and won't be used")
+    warnings.warn("multiprocessing could not be imported and won't be used")
 
 
 DEFAULT_MIN_PART_SIZE = 50 * 1024**2

From 45bd792327c58c7efee4fb08a88c8c9ac9d07eec Mon Sep 17 00:00:00 2001
From: Michael Penkov <misha.penkov@gmail.com>
Date: Mon, 15 Apr 2019 09:58:33 +0900
Subject: [PATCH 8/8] add unit test, fix #47

---
 smart_open/tests/test_s3.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/smart_open/tests/test_s3.py b/smart_open/tests/test_s3.py
index 50ca7893..dd50fee4 100644
--- a/smart_open/tests/test_s3.py
+++ b/smart_open/tests/test_s3.py
@@ -446,6 +446,25 @@ def test_old(self):
             self.assertEqual(result, expected)
 
 
+@maybe_mock_s3
+class IterBucketSingleProcessTest(unittest.TestCase):
+    def setUp(self):
+        self.old_flag = smart_open.s3._MULTIPROCESSING
+        smart_open.s3._MULTIPROCESSING = False
+
+    def tearDown(self):
+        smart_open.s3._MULTIPROCESSING = self.old_flag
+
+    def test(self):
+        num_keys = 101
+        populate_bucket(num_keys=num_keys)
+        keys = list(smart_open.s3.iter_bucket(BUCKET_NAME))
+        self.assertEqual(len(keys), num_keys)
+
+        expected = [('key_%d' % x, b'%d' % x) for x in range(num_keys)]
+        self.assertEqual(sorted(keys), sorted(expected))
+
+
 @maybe_mock_s3
 class DownloadKeyTest(unittest.TestCase):