diff --git a/CHANGELOG.md b/CHANGELOG.md index ed14e6ff..5ebeba0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,30 @@ # Unreleased +This release modifies the behavior of setup.py with respect to dependencies. +Previously, `boto3` and other AWS-related packages were installed by default. +Now, in order to install them, you need to run either: + + pip install smart_open[aws] + +to install the AWS dependencies only, or + + pip install smart_open[all] + +to install all dependencies, including AWS, GCS, etc. + +Summary of changes: + - Correctly pass `newline` parameter to built-in `open` function (PR [#478](https://github.com/RaRe-Technologies/smart_open/pull/478), [@burkovae](https://github.com/burkovae)) - Remove boto as a dependency (PR [#523](https://github.com/RaRe-Technologies/smart_open/pull/523), [@isobit](https://github.com/isobit)) - Performance improvement: avoid redundant GetObject API queries in s3.Reader (PR [#495](https://github.com/RaRe-Technologies/smart_open/pull/495), [@jcushman](https://github.com/jcushman)) + - Support installing smart_open without AWS dependencies (PR [#534](https://github.com/RaRe-Technologies/smart_open/pull/534), [@justindujardin](https://github.com/justindujardin)) + +## Deprecations + +Functionality on the left hand side will be removed in future releases. +Use the functions on the right hand side instead. + +- `smart_open.s3_iter_bucket` → `smart_open.s3.iter_bucket` # 2.1.1, 27 Aug 2020 diff --git a/README.rst b/README.rst index 0050710f..3922b961 100644 --- a/README.rst +++ b/README.rst @@ -113,7 +113,7 @@ Installation :: pip install smart_open // Install with no cloud dependencies - pip install smart_open[aws] // Install AWS deps + pip install smart_open[s3] // Install S3 deps pip install smart_open[gcp] // Install GCP deps pip install smart_open[all] // Installs all cloud dependencies @@ -129,7 +129,7 @@ If you're upgrading from ``smart_open`` versions 1.8.0 and below, please check o Version ``2.0`` will introduce a backwards incompatible installation method with regards to the cloud dependencies. A migration path to minimize breaking was introduced in version ``x.x.x``. If you want to maintain backwards compatibility (installing all dependencies) install this package via ``smart_open[all]`` now -and once the change is made you should not have any issues. If all you care about is AWS dependencies for example you can install via ``smart_open[aws]`` and +and once the change is made you should not have any issues. If all you care about is AWS dependencies for example you can install via ``smart_open[s3]`` and once the dependency change is made you will simply drop the unwanted dependencies. You can read more about the motivations `here `_ @@ -311,16 +311,16 @@ Port your old ``boto`` settings to ``boto3`` in order to use them with ``smart_o Iterating Over an S3 Bucket's Contents -------------------------------------- -Since going over all (or select) keys in an S3 bucket is a very common operation, there's also an extra function ``smart_open.s3_iter_bucket()`` that does this efficiently, **processing the bucket keys in parallel** (using multiprocessing): +Since going over all (or select) keys in an S3 bucket is a very common operation, there's also an extra function ``smart_open.s3.iter_bucket()`` that does this efficiently, **processing the bucket keys in parallel** (using multiprocessing): .. code-block:: python - >>> from smart_open import s3_iter_bucket + >>> from smart_open import s3 >>> # get data corresponding to 2010 and later under "silo-open-data/annual/monthly_rain" >>> # we use workers=1 for reproducibility; you should use as many workers as you have cores >>> bucket = 'silo-open-data' >>> prefix = 'annual/monthly_rain/' - >>> for key, content in s3_iter_bucket(bucket, prefix=prefix, accept_key=lambda key: '/201' in key, workers=1, key_limit=3): + >>> for key, content in s3.iter_bucket(bucket, prefix=prefix, accept_key=lambda key: '/201' in key, workers=1, key_limit=3): ... print(key, round(len(content) / 2**20)) annual/monthly_rain/2010.monthly_rain.nc 13 annual/monthly_rain/2011.monthly_rain.nc 13 diff --git a/extending.md b/extending.md index b205b2b7..3ad83bc2 100644 --- a/extending.md +++ b/extending.md @@ -30,6 +30,13 @@ This is the part that goes before the `://` in a URL, e.g. `s3://`.""" URI_EXAMPLES = ('xxx://foo/bar', 'zzz://baz/boz') """This will appear in the documentation of the the `parse_uri` function.""" +MISSING_DEPS = False +"""Wrap transport-specific imports in a try/catch and set this to True if +any imports are not found. Seting MISSING_DEPS to True will cause the library +to suggest installing its dependencies with an example pip command. + +If your transport has no external dependencies, you can omit this variable. +""" def parse_uri(uri_as_str): """Parse the specified URI into a dict. diff --git a/setup.py b/setup.py index 93307c6d..81bde0ab 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ def read(fname): 'paramiko', 'parameterizedtestcase', 'pytest', - 'pytest-rerunfailures', + 'pytest-rerunfailures' ] install_requires = [ @@ -82,14 +82,11 @@ def read(fname): license='MIT', platforms='any', - # Concatenating the lists together is temporary and will - # eventually simply be install_requires dropping the cloud - # dependencies from being installed without explicitly being declared. - install_requires=install_requires + aws_deps, + install_requires=install_requires, tests_require=tests_require, extras_require={ 'test': tests_require, - 'aws': aws_deps, + 's3': aws_deps, 'gcp': gcp_deps, 'azure': azure_deps, 'all': all_deps, diff --git a/smart_open/__init__.py b/smart_open/__init__.py index ec75474b..e7c8218d 100644 --- a/smart_open/__init__.py +++ b/smart_open/__init__.py @@ -27,11 +27,45 @@ # # Prevent regression of #474 and #475 # -logging.getLogger(__name__).addHandler(logging.NullHandler()) +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) from smart_open import version # noqa: E402 from .smart_open_lib import open, parse_uri, smart_open, register_compressor # noqa: E402 -from .s3 import iter_bucket as s3_iter_bucket # noqa: E402 + +_WARNING = """smart_open.s3_iter_bucket is deprecated and will stop functioning +in a future version. Please import iter_bucket from the smart_open.s3 module instead: + + from smart_open.s3 import iter_bucket as s3_iter_bucket + +""" +_WARNED = False + + +def s3_iter_bucket( + bucket_name, + prefix='', + accept_key=None, + key_limit=None, + workers=16, + retries=3, + **session_kwargs +): + global _WARNED + from .s3 import iter_bucket + if not _WARNED: + logger.warn(_WARNING) + _WARNED = True + return iter_bucket( + bucket_name=bucket_name, + prefix=prefix, + accept_key=accept_key, + key_limit=key_limit, + workers=workers, + retries=retries, + session_kwargs=session_kwargs + ) + __all__ = [ 'open', diff --git a/smart_open/azure.py b/smart_open/azure.py index 09541ac6..a2ec54fc 100644 --- a/smart_open/azure.py +++ b/smart_open/azure.py @@ -15,8 +15,11 @@ import smart_open.bytebuffer import smart_open.constants -import azure.storage.blob -import azure.core.exceptions +try: + import azure.storage.blob + import azure.core.exceptions +except ImportError: + MISSING_DEPS = True logger = logging.getLogger(__name__) diff --git a/smart_open/gcs.py b/smart_open/gcs.py index 7c882e40..bf027482 100644 --- a/smart_open/gcs.py +++ b/smart_open/gcs.py @@ -11,9 +11,12 @@ import io import logging -import google.cloud.exceptions -import google.cloud.storage -import google.auth.transport.requests +try: + import google.cloud.exceptions + import google.cloud.storage + import google.auth.transport.requests +except ImportError: + MISSING_DEPS = True import smart_open.bytebuffer import smart_open.utils diff --git a/smart_open/s3.py b/smart_open/s3.py index a48ab7d8..1fe9b65b 100644 --- a/smart_open/s3.py +++ b/smart_open/s3.py @@ -12,9 +12,12 @@ import logging import time -import boto3 -import botocore.client -import botocore.exceptions +try: + import boto3 + import botocore.client + import botocore.exceptions +except ImportError: + MISSING_DEPS = True import smart_open.bytebuffer import smart_open.concurrency @@ -961,7 +964,9 @@ def _retry_if_failed( partial, attempts=_UPLOAD_ATTEMPTS, sleep_seconds=_SLEEP_SECONDS, - exceptions=(botocore.exceptions.EndpointConnectionError, )): + exceptions=None): + if exceptions is None: + exceptions = (botocore.exceptions.EndpointConnectionError, ) for attempt in range(attempts): try: return partial() diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 72316c05..d1ebe2da 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -25,8 +25,6 @@ import warnings import sys -import boto3 - # # This module defines a function called smart_open so we cannot use # smart_open.submodule to reference to the submodules. @@ -289,6 +287,8 @@ def smart_open(uri, mode="rb", **kw): logger.error('profile_name and s3_session are mutually exclusive, ignoring the former') if 'profile_name' in kw: + import boto3 + transport_params['session'] = boto3.Session(profile_name=kw.pop('profile_name')) if 's3_session' in kw: diff --git a/smart_open/tests/fixtures/__init__.py b/smart_open/tests/fixtures/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/smart_open/tests/fixtures/good_transport.py b/smart_open/tests/fixtures/good_transport.py new file mode 100644 index 00000000..0a9b66c7 --- /dev/null +++ b/smart_open/tests/fixtures/good_transport.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- +"""A no-op transport that registers scheme 'foo'""" +import io + +SCHEME = "foo" +open = io.open + + +def parse_uri(uri_as_string): # pragma: no cover + ... + + +def open_uri(uri_as_string, mode, transport_params): # pragma: no cover + ... diff --git a/smart_open/tests/fixtures/missing_deps_transport.py b/smart_open/tests/fixtures/missing_deps_transport.py new file mode 100644 index 00000000..ea686598 --- /dev/null +++ b/smart_open/tests/fixtures/missing_deps_transport.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +"""Transport that has missing deps""" +import io + + +try: + import this_module_does_not_exist_but_we_need_it # noqa +except ImportError: + MISSING_DEPS = True + +SCHEME = "missing" +open = io.open + + +def parse_uri(uri_as_string): # pragma: no cover + ... + + +def open_uri(uri_as_string, mode, transport_params): # pragma: no cover + ... diff --git a/smart_open/tests/fixtures/no_schemes_transport.py b/smart_open/tests/fixtures/no_schemes_transport.py new file mode 100644 index 00000000..9c46efb8 --- /dev/null +++ b/smart_open/tests/fixtures/no_schemes_transport.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +"""A transport that is missing the required SCHEME/SCHEMAS attributes""" +import io + +open = io.open + + +def parse_uri(uri_as_string): # pragma: no cover + ... + + +def open_uri(uri_as_string, mode, transport_params): # pragma: no cover + ... diff --git a/smart_open/tests/test_package.py b/smart_open/tests/test_package.py new file mode 100644 index 00000000..948bb4a8 --- /dev/null +++ b/smart_open/tests/test_package.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +import os +import unittest +import pytest + +from smart_open import open + +skip_tests = "SMART_OPEN_TEST_MISSING_DEPS" not in os.environ + + +class PackageTests(unittest.TestCase): + + @pytest.mark.skipif(skip_tests, reason="requires missing dependencies") + def test_azure_raises_helpful_error_with_missing_deps(self): + with pytest.raises(ImportError, match=r"pip install smart_open\[azure\]"): + open("azure://foo/bar") + + @pytest.mark.skipif(skip_tests, reason="requires missing dependencies") + def test_aws_raises_helpful_error_with_missing_deps(self): + match = r"pip install smart_open\[s3\]" + with pytest.raises(ImportError, match=match): + open("s3://foo/bar") + # With the DRY errors in transport, this no longer gets a nice error message + with pytest.raises(ImportError): + from smart_open import smart_open + smart_open('fake-name', profile_name="will produce an error importing s3") + + @pytest.mark.skipif(skip_tests, reason="requires missing dependencies") + def test_gcs_raises_helpful_error_with_missing_deps(self): + with pytest.raises(ImportError, match=r"pip install smart_open\[gcs\]"): + open("gs://foo/bar") diff --git a/smart_open/tests/test_s3.py b/smart_open/tests/test_s3.py index fc4591c7..2a2720b3 100644 --- a/smart_open/tests/test_s3.py +++ b/smart_open/tests/test_s3.py @@ -604,6 +604,18 @@ def test_iter_bucket(self): results = list(smart_open.s3.iter_bucket(BUCKET_NAME)) self.assertEqual(len(results), 10) + def test_deprecated_top_level_s3_iter_bucket(self): + populate_bucket() + with self.assertLogs(smart_open.logger.name, level='WARN') as cm: + # invoking once will generate a warning + smart_open.s3_iter_bucket(BUCKET_NAME) + # invoking again will not (to reduce spam) + smart_open.s3_iter_bucket(BUCKET_NAME) + # verify only one output + assert len(cm.output) == 1 + # verify the suggested new import is in the warning + assert "from smart_open.s3 import iter_bucket as s3_iter_bucket" in cm.output[0] + def test_accepts_boto3_bucket(self): populate_bucket() bucket = boto3.resource('s3').Bucket(BUCKET_NAME) diff --git a/smart_open/tests/test_transport.py b/smart_open/tests/test_transport.py new file mode 100644 index 00000000..c44b04ab --- /dev/null +++ b/smart_open/tests/test_transport.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +import pytest +import unittest + +from smart_open.transport import register_transport, get_transport + + +class TransportTest(unittest.TestCase): + + def test_registry_requires_declared_schemes(self): + with pytest.raises(ValueError): + register_transport('smart_open.tests.fixtures.no_schemes_transport') + + def test_registry_errors_on_double_register_scheme(self): + register_transport('smart_open.tests.fixtures.good_transport') + with pytest.raises(AssertionError): + register_transport('smart_open.tests.fixtures.good_transport') + + def test_registry_errors_get_transport_for_module_with_missing_deps(self): + register_transport('smart_open.tests.fixtures.missing_deps_transport') + with pytest.raises(ImportError): + get_transport("missing") diff --git a/smart_open/transport.py b/smart_open/transport.py index d88cc908..00fb27d7 100644 --- a/smart_open/transport.py +++ b/smart_open/transport.py @@ -20,6 +20,13 @@ NO_SCHEME = '' _REGISTRY = {NO_SCHEME: smart_open.local_file} +_ERRORS = {} +_MISSING_DEPS_ERROR = """You are trying to use the %(module)s functionality of smart_open +but you do not have the correct %(module)s dependencies installed. Try: + + pip install smart_open[%(module)s] + +""" def register_transport(submodule): @@ -35,12 +42,17 @@ def register_transport(submodule): Once registered, you can get the submodule by calling :func:`get_transport`. """ - global _REGISTRY + global _REGISTRY, _ERRORS + module_name = submodule if isinstance(submodule, str): try: submodule = importlib.import_module(submodule) except ImportError: return + else: + module_name = submodule.__name__ + # Save only the last module name piece + module_name = module_name.rsplit(".")[-1] if hasattr(submodule, 'SCHEME'): schemes = [submodule.SCHEME] @@ -54,7 +66,10 @@ def register_transport(submodule): for scheme in schemes: assert scheme not in _REGISTRY - _REGISTRY[scheme] = submodule + if getattr(submodule, "MISSING_DEPS", False): + _ERRORS[scheme] = module_name + else: + _REGISTRY[scheme] = submodule def get_transport(scheme): @@ -63,6 +78,7 @@ def get_transport(scheme): This submodule must have been previously registered via :func:`register_transport`. """ + global _ERRORS, _MISSING_DEPS_ERROR, _REGISTRY, SUPPORTED_SCHEMES expected = SUPPORTED_SCHEMES readme_url = 'https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst' message = ( @@ -70,12 +86,11 @@ def get_transport(scheme): "Extra dependencies required by %(scheme)r may be missing. " "See <%(readme_url)s> for details." % locals() ) - try: - submodule = _REGISTRY[scheme] - except KeyError as e: - raise NotImplementedError(message) from e - else: - return submodule + if scheme in _ERRORS: + raise ImportError(_MISSING_DEPS_ERROR % dict(module=_ERRORS[scheme])) + if scheme in _REGISTRY: + return _REGISTRY[scheme] + raise NotImplementedError(message) register_transport(smart_open.local_file) diff --git a/tox.ini b/tox.ini index bc77896d..1bdd4c84 100644 --- a/tox.ini +++ b/tox.ini @@ -75,13 +75,13 @@ commands = ./travis_ci_helpers.sh disable_moto_server [testenv:test_coverage] skip_install = True -recreate = False +recreate = True deps = - .[all] - .[test] + . pytest-cov coveralls - commands = - pytest smart_open -v --cov smart_open --cov-report term-missing + ./travis_ci_helpers.sh dependencies + pip install .[all,test] + pytest smart_open -v --cov smart_open --cov-report term-missing --cov-append coveralls diff --git a/travis_ci_helpers.sh b/travis_ci_helpers.sh index 801d803d..5213970c 100755 --- a/travis_ci_helpers.sh +++ b/travis_ci_helpers.sh @@ -3,9 +3,6 @@ set -e set -x -export PYTEST_ADDOPTS="--reruns 3 --reruns-delay 1" - - is_travis_secure_vars_available(){ if [[ "${TRAVIS_SECURE_ENV_VARS}" == "true" ]]; then return 0 @@ -20,6 +17,7 @@ benchmark(){ return 0 fi + export PYTEST_ADDOPTS="--reruns 3 --reruns-delay 1" SO_S3_URL="${SO_S3_URL}"/`python -c "from uuid import uuid4;print(uuid4())"`; COMMIT_HASH=`git rev-parse HEAD`; @@ -30,6 +28,7 @@ benchmark(){ } integration(){ + export PYTEST_ADDOPTS="--reruns 3 --reruns-delay 1" pytest integration-tests/test_http.py integration-tests/test_207.py if ! is_travis_secure_vars_available; then return 0 @@ -38,6 +37,10 @@ integration(){ pytest integration-tests/test_s3_ported.py; } +dependencies(){ + SMART_OPEN_TEST_MISSING_DEPS=1 pytest smart_open/tests/test_package.py -v --cov smart_open --cov-report term-missing; +} + doctest(){ if ! is_travis_secure_vars_available; then return 0