piskvorky · mpenkov · Aug 18, 2021 · Aug 3, 2021 · Aug 15, 2021 · Aug 15, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ You may continue to use `ignore_ext` parameter for now, but it will be deprecate
 - Add new top-level compression parameter (PR [#609](https://github.com/RaRe-Technologies/smart_open/pull/609), [@dmcguire81](https://github.com/dmcguire81))
 - Drop mock dependency; standardize on unittest.mock (PR [#621](https://github.com/RaRe-Technologies/smart_open/pull/621), [@musicinmybrain](https://github.com/musicinmybrain))
 - Fix to_boto3 method (PR [#619](https://github.com/RaRe-Technologies/smart_open/pull/619), [@mpenkov](https://github.com/mpenkov))
+- Work around changes to `urllib.parse.urlsplit` (PR [#633](https://github.com/RaRe-Technologies/smart_open/pull/633), [@judahrand](https://github.com/judahrand))
 
 # 5.0.0, 30 Mar 2021
 

diff --git a/smart_open/tests/test_utils.py b/smart_open/tests/test_utils.py
@@ -5,8 +5,10 @@
 # This code is distributed under the terms and conditions
 # from the MIT License (MIT).
 #
-
 import unittest
+import urllib.parse
+
+import pytest
 
 import smart_open.utils
 
@@ -28,3 +30,17 @@ def test_check_kwargs():
     kwargs = {'client': 'foo', 'unsupported': 'bar', 'client_kwargs': 'boaz'}
     supported = smart_open.utils.check_kwargs(kallable, kwargs)
     assert supported == {'client': 'foo', 'client_kwargs': 'boaz'}
+
+
+@pytest.mark.parametrize(
+    'url,expected',
+    [
+        ('s3://bucket/key', ('s3', 'bucket', '/key', '', '')),
+        ('s3://bucket/key?', ('s3', 'bucket', '/key?', '', '')),
+        ('s3://bucket/???', ('s3', 'bucket', '/???', '', '')),
+        ('https://host/path?foo=bar', ('https', 'host', '/path', 'foo=bar', '')),
+    ]
+)
+def test_safe_urlsplit(url, expected):
+    actual = smart_open.utils.safe_urlsplit(url)
+    assert actual == urllib.parse.SplitResult(*expected)
diff --git a/smart_open/utils.py b/smart_open/utils.py
@@ -14,6 +14,9 @@
 
 logger = logging.getLogger(__name__)
 
+WORKAROUND_SCHEMES = ['s3', 's3n', 's3u', 's3a', 'gs']
+QUESTION_MARK_PLACEHOLDER = '///smart_open.utils.QUESTION_MARK_PLACEHOLDER///'
+
 
 def inspect_kwargs(kallable):
     #
@@ -150,16 +153,34 @@ def safe_urlsplit(url):
     querystring separately.  Unfortunately, question marks can also appear
     _inside_ the actual URL for some schemas like S3, GS.
 
-    Replaces question marks with newlines prior to splitting.  This is safe because:
-
-    1. The standard library's urlsplit completely ignores newlines
-    2. Raw newlines will never occur in innocuous URLs.  They are always URL-encoded.
+    Replaces question marks with a special placeholder substring prior to
+    splitting.  This work-around behavior is disabled in the unlikely event the
+    placeholder is already part of the URL.  If this affects you, consider
+    changing the value of QUESTION_MARK_PLACEHOLDER to something more suitable.
 
     See Also
     --------
+    https://bugs.python.org/issue43882
     https://github.com/python/cpython/blob/3.7/Lib/urllib/parse.py
     https://github.com/RaRe-Technologies/smart_open/issues/285
     https://github.com/RaRe-Technologies/smart_open/issues/458
+    smart_open/utils.py:QUESTION_MARK_PLACEHOLDER
     """
-    sr = urllib.parse.urlsplit(url.replace('?', '\n'), allow_fragments=False)
-    return urllib.parse.SplitResult(sr.scheme, sr.netloc, sr.path.replace('\n', '?'), '', '')
+    sr = urllib.parse.urlsplit(url, allow_fragments=False)
+
+    placeholder = None
+    if sr.scheme in WORKAROUND_SCHEMES and '?' in url and QUESTION_MARK_PLACEHOLDER not in url:
+        #
+        # This is safe because people will _almost never_ use the below
+        # substring in a URL.  If they do, then they're asking for trouble,
+        # and this special handling will simply not happen for them.
+        #
+        placeholder = QUESTION_MARK_PLACEHOLDER
+        url = url.replace('?', placeholder)
+        sr = urllib.parse.urlsplit(url, allow_fragments=False)
+
+    if placeholder is None:
+        return sr
+
+    path = sr.path.replace(placeholder, '?')
+    return urllib.parse.SplitResult(sr.scheme, sr.netloc, path, '', '')