diff --git a/cloudpathlib/azure/azblobclient.py b/cloudpathlib/azure/azblobclient.py index 1f738ef0..aaa45e4c 100644 --- a/cloudpathlib/azure/azblobclient.py +++ b/cloudpathlib/azure/azblobclient.py @@ -73,6 +73,8 @@ def __init__( content_type_method (Optional[Callable]): Function to call to guess media type (mimetype) when writing a file to the cloud. Defaults to `mimetypes.guess_type`. Must return a tuple (content type, content encoding). """ + super().__init__(local_cache_dir=local_cache_dir, content_type_method=content_type_method) + if connection_string is None: connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING", None) @@ -90,8 +92,6 @@ def __init__( "Credentials are required; see docs for options." ) - super().__init__(local_cache_dir=local_cache_dir, content_type_method=content_type_method) - def _get_metadata(self, cloud_path: AzureBlobPath) -> Union["BlobProperties", Dict[str, Any]]: blob = self.service_client.get_blob_client( container=cloud_path.container, blob=cloud_path.blob @@ -151,6 +151,19 @@ def _exists(self, cloud_path: AzureBlobPath) -> bool: def _list_dir( self, cloud_path: AzureBlobPath, recursive: bool = False ) -> Iterable[Tuple[AzureBlobPath, bool]]: + # shortcut if listing all available containers + if not cloud_path.container: + if recursive: + raise NotImplementedError( + "Cannot recursively list all containers and contents; you can get all the containers then recursively list each separately." + ) + + yield from ( + (self.CloudPath(f"az://{c.name}"), True) + for c in self.service_client.list_containers() + ) + return + container_client = self.service_client.get_container_client(cloud_path.container) prefix = cloud_path.blob diff --git a/cloudpathlib/gs/gsclient.py b/cloudpathlib/gs/gsclient.py index 8b66d737..099fa5f1 100644 --- a/cloudpathlib/gs/gsclient.py +++ b/cloudpathlib/gs/gsclient.py @@ -148,6 +148,18 @@ def _exists(self, cloud_path: GSPath) -> bool: return self._is_file_or_dir(cloud_path) in ["file", "dir"] def _list_dir(self, cloud_path: GSPath, recursive=False) -> Iterable[Tuple[GSPath, bool]]: + # shortcut if listing all available buckets + if not cloud_path.bucket: + if recursive: + raise NotImplementedError( + "Cannot recursively list all buckets and contents; you can get all the buckets then recursively list each separately." + ) + + yield from ( + (self.CloudPath(f"gs://{str(b)}"), True) for b in self.client.list_buckets() + ) + return + bucket = self.client.bucket(cloud_path.bucket) prefix = cloud_path.blob diff --git a/cloudpathlib/local/implementations/azure.py b/cloudpathlib/local/implementations/azure.py index 270a5f3a..519924d0 100644 --- a/cloudpathlib/local/implementations/azure.py +++ b/cloudpathlib/local/implementations/azure.py @@ -25,12 +25,13 @@ def __init__(self, *args, **kwargs): kwargs.get("account_url", None), os.getenv("AZURE_STORAGE_CONNECTION_STRING", None), ] + super().__init__(*args, **kwargs) + if all(opt is None for opt in cred_opts): raise MissingCredentialsError( "AzureBlobClient does not support anonymous instantiation. " "Credentials are required; see docs for options." 
) - super().__init__(*args, **kwargs) LocalAzureBlobClient.AzureBlobPath = LocalAzureBlobClient.CloudPath # type: ignore diff --git a/cloudpathlib/s3/s3client.py b/cloudpathlib/s3/s3client.py index 72ac8573..1b1db036 100644 --- a/cloudpathlib/s3/s3client.py +++ b/cloudpathlib/s3/s3client.py @@ -3,6 +3,8 @@ from pathlib import Path, PurePosixPath from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union +from cloudpathlib.exceptions import CloudPathException + from ..client import Client, register_client_class from ..cloudpath import implementation_registry @@ -10,7 +12,7 @@ try: from boto3.session import Session - from boto3.s3.transfer import TransferConfig + from boto3.s3.transfer import TransferConfig, S3Transfer from botocore.config import Config from botocore.exceptions import ClientError import botocore.session @@ -37,6 +39,7 @@ def __init__( endpoint_url: Optional[str] = None, boto3_transfer_config: Optional["TransferConfig"] = None, content_type_method: Optional[Callable] = mimetypes.guess_type, + extra_args: Optional[dict] = None, ): """Class constructor. Sets up a boto3 [`Session`]( https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html). @@ -52,9 +55,9 @@ def __init__( aws_secret_access_key (Optional[str]): AWS secret access key. aws_session_token (Optional[str]): Session key for your AWS account. This is only needed when you are using temporarycredentials. - no_sign_request: (Optional[bool]): If `True`, credentials are not looked for and we use unsigned + no_sign_request (Optional[bool]): If `True`, credentials are not looked for and we use unsigned requests to fetch resources. This will only allow access to public resources. This is equivalent - to `--no-sign-request` in the AWS CLI (https://docs.aws.amazon.com/cli/latest/reference/). + to `--no-sign-request` in the [AWS CLI](https://docs.aws.amazon.com/cli/latest/reference/). botocore_session (Optional[botocore.session.Session]): An already instantiated botocore Session. profile_name (Optional[str]): Profile name of a profile in a shared credentials file. @@ -63,10 +66,14 @@ def __init__( for downloaded files. If None, will use a temporary directory. endpoint_url (Optional[str]): S3 server endpoint URL to use for the constructed boto3 S3 resource and client. Parameterize it to access a customly deployed S3-compatible object store such as MinIO, Ceph or any other. - boto3_transfer_config (Optional[dict]): Instantiated TransferConfig for managing s3 transfers. - (https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig) + boto3_transfer_config (Optional[dict]): Instantiated TransferConfig for managing + [s3 transfers](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig) content_type_method (Optional[Callable]): Function to call to guess media type (mimetype) when writing a file to the cloud. Defaults to `mimetypes.guess_type`. Must return a tuple (content type, content encoding). + extra_args (Optional[dict]): A dictionary of extra args passed to download, upload, and list functions as relevant. You + can include any keys supported by upload or download, and we will pass on only the relevant args. To see the extra + args that are supported look at the upload and download lists in the + [boto3 docs](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.S3Transfer). 
""" endpoint_url = endpoint_url or os.getenv("AWS_ENDPOINT_URL") if boto3_session is not None: @@ -97,10 +104,32 @@ def __init__( self.boto3_transfer_config = boto3_transfer_config + if extra_args is None: + extra_args = {} + + self._extra_args = extra_args + self.boto3_dl_extra_args = { + k: v for k, v in extra_args.items() if k in S3Transfer.ALLOWED_DOWNLOAD_ARGS + } + self.boto3_ul_extra_args = { + k: v for k, v in extra_args.items() if k in S3Transfer.ALLOWED_UPLOAD_ARGS + } + + # listing ops (list_objects_v2, filter, delete) only accept these extras: + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html + self.boto3_list_extra_args = { + k: self._extra_args[k] + for k in ["RequestPayer", "ExpectedBucketOwner"] + if k in self._extra_args + } + super().__init__(local_cache_dir=local_cache_dir, content_type_method=content_type_method) def _get_metadata(self, cloud_path: S3Path) -> Dict[str, Any]: - data = self.s3.ObjectSummary(cloud_path.bucket, cloud_path.key).get() + # get accepts all download extra args + data = self.s3.ObjectSummary(cloud_path.bucket, cloud_path.key).get( + **self.boto3_dl_extra_args + ) return { "last_modified": data["LastModified"], @@ -114,7 +143,9 @@ def _download_file(self, cloud_path: S3Path, local_path: Union[str, os.PathLike] local_path = Path(local_path) obj = self.s3.Object(cloud_path.bucket, cloud_path.key) - obj.download_file(str(local_path), Config=self.boto3_transfer_config) + obj.download_file( + str(local_path), Config=self.boto3_transfer_config, ExtraArgs=self.boto3_dl_extra_args + ) return local_path def _is_file_or_dir(self, cloud_path: S3Path) -> Optional[str]: @@ -123,30 +154,17 @@ def _is_file_or_dir(self, cloud_path: S3Path) -> Optional[str]: return "dir" # get first item by listing at least one key - s3_obj = self._s3_file_query(cloud_path) - - if s3_obj is None: - return None - - # since S3 only returns files when filtering objects: - # if the first item key is equal to the path key, this is a file - if s3_obj.key == cloud_path.key: - - # "fake" directories on S3 can be created in the console UI - # these are 0-size keys that end in `/` - # Ref: https://github.com/boto/boto3/issues/377 - if s3_obj.key.endswith("/") and s3_obj.content_length == 0: - return "dir" - else: - return "file" - else: - return "dir" + return self._s3_file_query(cloud_path) def _exists(self, cloud_path: S3Path) -> bool: # check if this is a bucket if not cloud_path.key: + extra = { + k: self._extra_args[k] for k in ["ExpectedBucketOwner"] if k in self._extra_args + } + try: - self.client.head_bucket(Bucket=cloud_path.bucket) + self.client.head_bucket(Bucket=cloud_path.bucket, **extra) return True except ClientError: return False @@ -157,25 +175,44 @@ def _s3_file_query(self, cloud_path: S3Path): """Boto3 query used for quick checks of existence and if path is file/dir""" # check if this is an object that we can access directly try: - obj = self.s3.Object(cloud_path.bucket, cloud_path.key) - obj.load() - return obj + # head_object accepts all download extra args (note: Object.load does not accept extra args so we do not use it for this check) + self.client.head_object( + Bucket=cloud_path.bucket, + Key=cloud_path.key.rstrip("/"), + **self.boto3_dl_extra_args, + ) + return "file" - # else, confirm it is a dir by filtering to the first item under the prefix - except ClientError: + # else, confirm it is a dir by filtering to the first item under the prefix plus a "/" + except (ClientError, self.client.exceptions.NoSuchKey): key = 
cloud_path.key.rstrip("/") + "/" return next( ( - obj + "dir" # always a dir if we find anything with this query for obj in ( - self.s3.Bucket(cloud_path.bucket).objects.filter(Prefix=key).limit(1) + self.s3.Bucket(cloud_path.bucket) + .objects.filter(Prefix=key, **self.boto3_list_extra_args) + .limit(1) ) ), None, ) def _list_dir(self, cloud_path: S3Path, recursive=False) -> Iterable[Tuple[S3Path, bool]]: + # shortcut if listing all available buckets + if not cloud_path.bucket: + if recursive: + raise NotImplementedError( + "Cannot recursively list all buckets and contents; you can get all the buckets then recursively list each separately." + ) + + yield from ( + (self.CloudPath(f"s3://{b['Name']}"), True) + for b in self.client.list_buckets().get("Buckets", []) + ) + return + prefix = cloud_path.key if prefix and not prefix.endswith("/"): prefix += "/" @@ -185,7 +222,10 @@ def _list_dir(self, cloud_path: S3Path, recursive=False) -> Iterable[Tuple[S3Pat paginator = self.client.get_paginator("list_objects_v2") for result in paginator.paginate( - Bucket=cloud_path.bucket, Prefix=prefix, Delimiter=("" if recursive else "/") + Bucket=cloud_path.bucket, + Prefix=prefix, + Delimiter=("" if recursive else "/"), + **self.boto3_list_extra_args, ): # yield everything in common prefixes as directories for result_prefix in result.get("CommonPrefixes", []): @@ -238,27 +278,33 @@ def _move_file(self, src: S3Path, dst: S3Path, remove_src: bool = True) -> S3Pat CopySource={"Bucket": src.bucket, "Key": src.key}, Metadata=self._get_metadata(src).get("extra", {}), MetadataDirective="REPLACE", + **self.boto3_ul_extra_args, ) else: target = self.s3.Object(dst.bucket, dst.key) - target.copy({"Bucket": src.bucket, "Key": src.key}) + target.copy( + {"Bucket": src.bucket, "Key": src.key}, + ExtraArgs=self.boto3_dl_extra_args, + Config=self.boto3_transfer_config, + ) if remove_src: self._remove(src) return dst def _remove(self, cloud_path: S3Path, missing_ok: bool = True) -> None: - try: - obj = self.s3.Object(cloud_path.bucket, cloud_path.key) - - # will throw if not a file - obj.load() - - resp = obj.delete() - assert resp.get("ResponseMetadata").get("HTTPStatusCode") == 204 + file_or_dir = self._is_file_or_dir(cloud_path=cloud_path) + if file_or_dir == "file": + resp = self.s3.Object(cloud_path.bucket, cloud_path.key).delete( + **self.boto3_list_extra_args + ) + if resp.get("ResponseMetadata").get("HTTPStatusCode") not in (204, 200): + raise CloudPathException( + f"Delete operation failed for {cloud_path} with response: {resp}" + ) - except ClientError: + elif file_or_dir == "dir": # try to delete as a direcotry instead bucket = self.s3.Bucket(cloud_path.bucket) @@ -266,20 +312,24 @@ def _remove(self, cloud_path: S3Path, missing_ok: bool = True) -> None: if prefix and not prefix.endswith("/"): prefix += "/" - resp = bucket.objects.filter(Prefix=prefix).delete() + resp = bucket.objects.filter(Prefix=prefix, **self.boto3_list_extra_args).delete( + **self.boto3_list_extra_args + ) + if resp[0].get("ResponseMetadata").get("HTTPStatusCode") not in (204, 200): + raise CloudPathException( + f"Delete operation failed for {cloud_path} with response: {resp}" + ) - # ensure directory deleted; if cloud_path did not exist at all - # resp will be [], so no need to check success - if resp: - assert resp[0].get("ResponseMetadata").get("HTTPStatusCode") == 200 - else: - if not missing_ok: - raise FileNotFoundError(f"File does not exist: {cloud_path}") + else: + if not missing_ok: + raise FileNotFoundError( + f"Cannot delete 
file that does not exist: {cloud_path} (consider passing missing_ok=True)"
+                )
 
     def _upload_file(self, local_path: Union[str, os.PathLike], cloud_path: S3Path) -> S3Path:
         obj = self.s3.Object(cloud_path.bucket, cloud_path.key)
 
-        extra_args = {}
+        extra_args = self.boto3_ul_extra_args.copy()
 
         if self.content_type_method is not None:
             content_type, content_encoding = self.content_type_method(str(local_path))
diff --git a/docs/docs/authentication.md b/docs/docs/authentication.md
index 849c23a6..76c7b1b3 100644
--- a/docs/docs/authentication.md
+++ b/docs/docs/authentication.md
@@ -60,6 +60,134 @@ S3Client.get_default_client()
 #> 
 ```
 
+
+## Accessing public S3 buckets without credentials
+
+For most operations, you will need to have your S3 credentials configured. However, for buckets that provide public access, you can use `cloudpathlib` without credentials. To do so, you need to instantiate a client and pass the kwarg `no_sign_request=True`. Failure to do so will result in a `NoCredentialsError` being raised.
+
+```python
+from cloudpathlib import CloudPath
+
+# this file definitely exists, but credentials are not configured
+CloudPath("s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0001_5a63d42e-27c6-448a-84f1-bfc632125b8e.jpg").exists()
+
+#> NoCredentialsError
+```
+
+Instead, you must either configure credentials or instantiate a client object using `no_sign_request=True`:
+
+```python
+from cloudpathlib import S3Client
+
+c = S3Client(no_sign_request=True)
+
+# use this client object to create the CloudPath
+c.CloudPath("s3://ladi/Images/FEMA_CAP/2020/70349/DSC_0001_5a63d42e-27c6-448a-84f1-bfc632125b8e.jpg").exists()
+#> True
+```
+
+**Note:** Many public buckets _do not_ allow listing of the bucket contents by anonymous users. If this is the case, any listing operation on a directory will fail with an error like `ClientError: An error occurred (AccessDenied) when calling the ListObjectsV2 operation: Access Denied`, even with `no_sign_request=True`. In this case, you can generally only work with `CloudPath` objects that refer to the files themselves (instead of directories). You can contact the bucket owner to request that they allow listing, or write your code in a way that only references files you know will exist.
+
+As noted above, you can also call `.set_as_default_client()` on the client object that you create; it will then be used by default without your having to explicitly use the client object you created.
+
+
+## Requester Pays buckets on S3
+
+S3 supports [Requester Pays](https://docs.aws.amazon.com/AmazonS3/latest/userguide/RequesterPaysBuckets.html) buckets, where you must have credentials to access the bucket and any request costs are passed on to you rather than to the bucket owner.
+
+For a Requester Pays bucket, you need to pass extra args telling cloudpathlib that you will pay for any operations.
+
+For example, on the Requester Pays bucket `arxiv`, just trying to list the contents will result in a `ClientError`:
+
+```python
+from cloudpathlib import CloudPath
+
+tars = list(CloudPath("s3://arxiv/src/").iterdir())
+print(tars)
+
+#> ClientError: An error occurred (AccessDenied) ...
+```
+
+To indicate that the request payer will be the "requester," pass the extra args to an `S3Client` and use that client to instantiate paths:
+
+```python
+from cloudpathlib import S3Client
+
+c = S3Client(extra_args={"RequestPayer": "requester"})
+
+# use the client we created to build the path
+tars = list(c.CloudPath("s3://arxiv/src/").iterdir())
+print(tars)
+```
+
+As noted above, you can also call `.set_as_default_client()` on the client object that you create; it will then be used by default without your having to explicitly use the client object you created.
+
+
+## Other S3 `ExtraArgs` in `boto3`
+
+The S3 SDK, `boto3`, supports a set of `ExtraArgs` for uploads, downloads, and listing operations. When you instantiate a client, you can pass the `extra_args` keyword argument with any of those extra args that you want to set. We will pass these on to the upload, download, and list methods insofar as those methods support the specific args.
+
+The args supported for uploads are the same as `boto3.s3.transfer.S3Transfer.ALLOWED_UPLOAD_ARGS`; see the [`boto3` documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.S3Transfer) for the latest list, but as of the time of writing, these are:
+
+ - `ACL`
+ - `CacheControl`
+ - `ChecksumAlgorithm`
+ - `ContentDisposition`
+ - `ContentEncoding`
+ - `ContentLanguage`
+ - `ContentType`
+ - `ExpectedBucketOwner`
+ - `Expires`
+ - `GrantFullControl`
+ - `GrantRead`
+ - `GrantReadACP`
+ - `GrantWriteACP`
+ - `Metadata`
+ - `ObjectLockLegalHoldStatus`
+ - `ObjectLockMode`
+ - `ObjectLockRetainUntilDate`
+ - `RequestPayer`
+ - `ServerSideEncryption`
+ - `StorageClass`
+ - `SSECustomerAlgorithm`
+ - `SSECustomerKey`
+ - `SSECustomerKeyMD5`
+ - `SSEKMSKeyId`
+ - `SSEKMSEncryptionContext`
+ - `Tagging`
+ - `WebsiteRedirectLocation`
+
+The args supported for downloads are the same as `boto3.s3.transfer.S3Transfer.ALLOWED_DOWNLOAD_ARGS`; see the [`boto3` documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.S3Transfer) for the latest list, but as of the time of writing, these are:
+
+ - `ChecksumMode`
+ - `VersionId`
+ - `SSECustomerAlgorithm`
+ - `SSECustomerKey`
+ - `SSECustomerKeyMD5`
+ - `RequestPayer`
+ - `ExpectedBucketOwner`
+
+To use any of these extra args, pass them as a dict to `extra_args` when instantiating an `S3Client`.
+
+```python
+from cloudpathlib import S3Client
+
+c = S3Client(extra_args={
+    "ChecksumMode": "ENABLED",  # download extra arg, only used when downloading
+    "ACL": "public-read",  # upload extra arg, only used when uploading
+})
+
+# use these extras for all CloudPaths
+c.set_as_default_client()
+```
+
+**Note:** The `extra_args` kwarg accepts the union of upload and download args, and will only pass on the relevant subset to the `boto3` method that is called by the internals of `S3Client`.
+
+**Note:** The `ExtraArgs` on the client will be used for every call that client makes. If you need to set different `ExtraArgs` in different code paths, we recommend creating separate explicit client objects and using each of those to create and manage the `CloudPath` objects with different needs.
+
+**Note:** To explicitly set the `ContentType` and `ContentEncoding`, we recommend using the `content_type_method` kwarg when instantiating the client.
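+
+For example, here is a minimal sketch that sets a fixed content type for every upload; the `always_json` helper is hypothetical, and any callable that returns a `(content type, content encoding)` tuple (like `mimetypes.guess_type`) works:
+
+```python
+from cloudpathlib import S3Client
+
+
+# hypothetical content type method: label every uploaded file as JSON;
+# it must return a (content type, content encoding) tuple
+def always_json(path):
+    return ("application/json", None)
+
+
+c = S3Client(content_type_method=always_json)
+
+# paths created after this will upload with ContentType "application/json"
+c.set_as_default_client()
+```
+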
If instead you want to set this for all uploads via the extras, you must additionally pass `content_type_method=None` to the `S3Client` so we don't try to guess these automatically. + + ## Accessing custom S3-compatible object stores It might happen so that you need to access a customly deployed S3 object store ([MinIO](https://min.io/), [Ceph](https://ceph.io/ceph-storage/object-storage/) or any other). In such cases, the service endpoint will be different from the AWS object store endpoints (used by default). diff --git a/requirements-dev.txt b/requirements-dev.txt index 32382c91..115aa945 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,6 @@ -e .[all] black -coverage<7 flake8 ipytest ipython diff --git a/setup.cfg b/setup.cfg index 128298a7..6cddd4c6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,4 +24,4 @@ testpaths = tests/ addopts = --cov=cloudpathlib --cov-report=term --cov-report=html --cov-report=xml -n=auto [coverage:report] -include = cloudpathlib/**.py +include = cloudpathlib/**/*.py diff --git a/tests/mock_clients/mock_azureblob.py b/tests/mock_clients/mock_azureblob.py index 281562fb..198badae 100644 --- a/tests/mock_clients/mock_azureblob.py +++ b/tests/mock_clients/mock_azureblob.py @@ -1,3 +1,4 @@ +from collections import namedtuple from datetime import datetime from pathlib import Path, PurePosixPath import shutil @@ -38,6 +39,10 @@ def get_blob_client(self, container, blob): def get_container_client(self, container): return MockContainerClient(self.tmp_path, container_name=container) + def list_containers(self): + Container = namedtuple("Container", "name") + return [Container(name=DEFAULT_CONTAINER_NAME)] + return MockBlobServiceClient diff --git a/tests/mock_clients/mock_gs.py b/tests/mock_clients/mock_gs.py index ccf675bd..b2cb8b9d 100644 --- a/tests/mock_clients/mock_gs.py +++ b/tests/mock_clients/mock_gs.py @@ -35,6 +35,9 @@ def __del__(self): def bucket(self, bucket): return MockBucket(self.tmp_path, bucket, client=self) + def list_buckets(self): + return [DEFAULT_GS_BUCKET_NAME] + return MockClient diff --git a/tests/mock_clients/mock_s3.py b/tests/mock_clients/mock_s3.py index 994b24a4..a6eb5685 100644 --- a/tests/mock_clients/mock_s3.py +++ b/tests/mock_clients/mock_s3.py @@ -88,11 +88,12 @@ def copy_from(self, CopySource=None, Metadata=None, MetadataDirective=None): else: self.path.write_bytes((self.root / Path(CopySource["Key"])).read_bytes()) - def download_file(self, to_path, Config=None): + def download_file(self, to_path, Config=None, ExtraArgs=None): to_path = Path(to_path) to_path.write_bytes(self.path.read_bytes()) # track config to make sure it's used in tests self.resource.download_config = Config + self.resource.download_extra_args = ExtraArgs def upload_file(self, from_path, Config=None, ExtraArgs=None): self.path.parent.mkdir(parents=True, exist_ok=True) @@ -107,7 +108,7 @@ def delete(self): delete_empty_parents_up_to_root(self.path, self.root) return {"ResponseMetadata": {"HTTPStatusCode": 204}} - def copy(self, source): + def copy(self, source, ExtraArgs=None, Config=None): # boto3 is more like "copy from" source = self.root / source["Key"] self.path.parent.mkdir(parents=True, exist_ok=True) @@ -214,6 +215,19 @@ def head_bucket(self, Bucket): {}, ) + def list_buckets(self): + return {"Buckets": [{"Name": DEFAULT_S3_BUCKET_NAME}]} + + def head_object(self, Bucket, Key, **kwargs): + if ( + not (self.root / Key).exists() + or (self.root / Key).is_dir() + or Bucket != DEFAULT_S3_BUCKET_NAME + ): + raise ClientError({}, {}) + 
+        else:
+            return {"key": Key}
+
     @property
     def exceptions(self):
         Ex = collections.namedtuple("Ex", "NoSuchKey")
diff --git a/tests/test_cloudpath_file_io.py b/tests/test_cloudpath_file_io.py
index 96ff1c1d..3592d49c 100644
--- a/tests/test_cloudpath_file_io.py
+++ b/tests/test_cloudpath_file_io.py
@@ -138,6 +138,17 @@ def test_iterdir(glob_test_dirs):
     )
 
 
+def test_list_buckets(rig):
+    # test we can list buckets
+    buckets = list(rig.path_class(f"{rig.path_class.cloud_prefix}").iterdir())
+    assert len(buckets) > 0
+
+    for b in buckets:
+        assert isinstance(b, rig.path_class)
+        assert b.drive != ""
+        assert b._no_prefix_no_drive == ""
+
+
 def test_glob(glob_test_dirs):
     cloud_root, local_root = glob_test_dirs
 
diff --git a/tests/test_s3_specific.py b/tests/test_s3_specific.py
index 4c9ac851..c529a4f9 100644
--- a/tests/test_s3_specific.py
+++ b/tests/test_s3_specific.py
@@ -1,4 +1,5 @@
 from concurrent.futures import ProcessPoolExecutor
+from itertools import islice
 from time import sleep
 
 import pytest
@@ -206,6 +207,32 @@ def test_no_sign_request(s3_rig, tmp_path):
         p.exists()
 
 
+def test_extra_args_via_requester_pays(s3_rig, tmp_path):
+    """Tests that we can pass extra args to the S3Client and that they are
+    used by the client, by leveraging the RequestPayer extra arg.
+
+    NOTE: Requires AWS S3 credentials where you will pay for
+    the requests here (which are a minimal amount).
+    """
+    if not s3_rig.live_server:
+        pytest.skip("This test only runs against live servers.")
+
+    # without the extras, this should raise
+    with pytest.raises(botocore.exceptions.ClientError):
+        list(s3_rig.path_class("s3://arxiv/pdf/").iterdir())
+
+    client = s3_rig.client_class(extra_args={"RequestPayer": "requester"})
+
+    # with the RequestPayer extra, we can list the requester pays arxiv bucket
+    paths = list(islice(client.CloudPath("s3://arxiv/pdf/").iterdir(), 0, 5))
+    assert len(paths) > 0
+
+    # download a file to the local directory
+    local = client.CloudPath("s3://arxiv/pdf/arXiv_pdf_manifest.xml").download_to(tmp_path)
+
+    assert local.stat().st_size > 0
+
+
 def test_aws_endpoint_url_env(monkeypatch):
     """Allows setting endpoint_url from env variable
     until upstream boto3 PR is merged.