Make HNS check and client instantiation more reliable #478

Merged · 8 commits · Oct 18, 2024
6 changes: 6 additions & 0 deletions .env.example
@@ -30,3 +30,9 @@ LIVE_GS_BUCKET=a-bucket-you-can-access
# Custom S3, e.g. MinIO
CUSTOM_S3_BUCKET=a-bucket-you-can-access
CUSTOM_S3_ENDPOINT=your_custom_s3_endpoint

# From a registered Azure App used as a service principal; currently used only for
# live tests with DefaultAzureCredential
AZURE_CLIENT_ID=
AZURE_TENANT_ID=
AZURE_CLIENT_SECRET=
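A rough sketch of how these variables get used: `DefaultAzureCredential` (via its environment-credential step) reads `AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, and `AZURE_CLIENT_SECRET` from the environment and authenticates as the service principal, and the resulting credential can be passed to `AzureBlobClient` together with an account URL. The account URL and container below are placeholders, not values from this PR.

```python
from azure.identity import DefaultAzureCredential
from cloudpathlib import AzureBlobClient

# Reads AZURE_CLIENT_ID / AZURE_TENANT_ID / AZURE_CLIENT_SECRET from the
# environment (among other fallbacks) and authenticates as the service principal.
credential = DefaultAzureCredential()

client = AzureBlobClient(
    account_url="https://<storage-account>.blob.core.windows.net",  # placeholder
    credential=credential,
)
path = client.CloudPath("az://<container>/some/file.txt")  # placeholder container
```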
3 changes: 3 additions & 0 deletions .github/workflows/tests.yml
@@ -103,6 +103,9 @@ jobs:
LIVE_AZURE_CONTAINER: ${{ secrets.LIVE_AZURE_CONTAINER }}
AZURE_STORAGE_CONNECTION_STRING: ${{ secrets.AZURE_STORAGE_CONNECTION_STRING }}
AZURE_STORAGE_GEN2_CONNECTION_STRING: ${{ secrets.AZURE_STORAGE_GEN2_CONNECTION_STRING }}
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
LIVE_GS_BUCKET: ${{ secrets.LIVE_GS_BUCKET }}
LIVE_S3_BUCKET: ${{ secrets.LIVE_S3_BUCKET }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
3 changes: 2 additions & 1 deletion HISTORY.md
@@ -1,8 +1,9 @@
 # cloudpathlib Changelog
 
-## UNRELEASED
+## v0.20.0 (2024-10-18)
 
 - Added support for custom schemes in CloudPath and Client subclasses. (Issue [#466](https://github.com/drivendataorg/cloudpathlib/issues/466), PR [#467](https://github.com/drivendataorg/cloudpathlib/pull/467))
+- Added check for HNS that works in more scenarios and with different kinds of credentials. (Issue [#470](https://github.com/drivendataorg/cloudpathlib/issues/470), Issue [#476](https://github.com/drivendataorg/cloudpathlib/issues/476), PR [#478](https://github.com/drivendataorg/cloudpathlib/pull/478))
 
 ## v0.19.0 (2024-08-29)
 
78 changes: 59 additions & 19 deletions cloudpathlib/azure/azblobclient.py
@@ -15,6 +15,7 @@
 try:
     from azure.core.exceptions import ResourceNotFoundError
     from azure.core.credentials import AzureNamedKeyCredential
+
     from azure.storage.blob import (
         BlobPrefix,
         BlobSasPermissions,
@@ -24,7 +25,15 @@
         generate_blob_sas,
     )
 
+    from azure.storage.blob._shared.authentication import (
+        SharedKeyCredentialPolicy as BlobSharedKeyCredentialPolicy,
+    )
+
     from azure.storage.filedatalake import DataLakeServiceClient, FileProperties
+    from azure.storage.filedatalake._shared.authentication import (
+        SharedKeyCredentialPolicy as DataLakeSharedKeyCredentialPolicy,
+    )
+
 except ModuleNotFoundError:
     implementation_registry["azure"].dependencies_loaded = False
@@ -104,19 +113,29 @@
         if connection_string is None:
             connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING", None)
 
-        self.data_lake_client = None  # only needs to end up being set if HNS is enabled
+        self.data_lake_client: Optional[DataLakeServiceClient] = (
+            None  # only needs to end up being set if HNS is enabled
+        )
 
         if blob_service_client is not None:
             self.service_client = blob_service_client
 
             # create from blob service client if not passed
             if data_lake_client is None:
-                self.data_lake_client = DataLakeServiceClient(
-                    account_url=self.service_client.url.replace(".blob.", ".dfs.", 1),
-                    credential=AzureNamedKeyCredential(
-                        blob_service_client.credential.account_name,
-                        blob_service_client.credential.account_key,
-                    ),
-                )
+                credential = (
+                    blob_service_client.credential
+                    if not isinstance(
+                        blob_service_client.credential, BlobSharedKeyCredentialPolicy
+                    )
+                    else AzureNamedKeyCredential(
+                        blob_service_client.credential.account_name,
+                        blob_service_client.credential.account_key,
+                    )
+                )
+
+                self.data_lake_client = DataLakeServiceClient(
+                    account_url=self.service_client.url.replace(".blob.", ".dfs.", 1),
+                    credential=credential,
+                )
             else:
                 self.data_lake_client = data_lake_client
@@ -125,12 +144,21 @@
             self.data_lake_client = data_lake_client
 
             if blob_service_client is None:
-                self.service_client = BlobServiceClient(
-                    account_url=self.data_lake_client.url.replace(".dfs.", ".blob.", 1),
-                    credential=AzureNamedKeyCredential(
-                        data_lake_client.credential.account_name,
-                        data_lake_client.credential.account_key,
-                    ),
-                )
+
+                credential = (
+                    data_lake_client.credential
+                    if not isinstance(
+                        data_lake_client.credential, DataLakeSharedKeyCredentialPolicy
+                    )
+                    else AzureNamedKeyCredential(
+                        data_lake_client.credential.account_name,
+                        data_lake_client.credential.account_key,
+                    )
+                )
+
+                self.service_client = BlobServiceClient(
+                    account_url=self.data_lake_client.url.replace(".dfs.", ".blob.", 1),
+                    credential=credential,
+                )
 
         elif connection_string is not None:
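Both instantiation branches above follow the same pattern: reuse the existing client's credential when building the sibling service client, unless that credential is the SDK's internal `SharedKeyCredentialPolicy`, in which case the account name and key are re-wrapped in an `AzureNamedKeyCredential`. A condensed sketch of that selection follows; the helper function is illustrative only and not part of the diff.

```python
from azure.core.credentials import AzureNamedKeyCredential
from azure.storage.blob._shared.authentication import (
    SharedKeyCredentialPolicy as BlobSharedKeyCredentialPolicy,
)
from azure.storage.filedatalake._shared.authentication import (
    SharedKeyCredentialPolicy as DataLakeSharedKeyCredentialPolicy,
)

_SHARED_KEY_POLICIES = (BlobSharedKeyCredentialPolicy, DataLakeSharedKeyCredentialPolicy)


def reusable_credential(source_credential):
    """Pick a credential for building the sibling (blob <-> data lake) client.

    Token/identity credentials pass through unchanged; the SDK's internal
    shared-key policy is unwrapped into an AzureNamedKeyCredential instead.
    """
    if isinstance(source_credential, _SHARED_KEY_POLICIES):
        return AzureNamedKeyCredential(
            source_credential.account_name, source_credential.account_key
        )
    return source_credential
```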
@@ -167,19 +195,31 @@
                 "Credentials are required; see docs for options."
             )
 
-        self._hns_enabled = None
+        self._hns_enabled: Optional[bool] = None
 
-    def _check_hns(self) -> Optional[bool]:
+    def _check_hns(self, cloud_path: AzureBlobPath) -> Optional[bool]:
         if self._hns_enabled is None:
-            account_info = self.service_client.get_account_information()  # type: ignore
-            self._hns_enabled = account_info.get("is_hns_enabled", False)  # type: ignore
+            try:
+                account_info = self.service_client.get_account_information()  # type: ignore
+                self._hns_enabled = account_info.get("is_hns_enabled", False)  # type: ignore
+            except ResourceNotFoundError:
+                # get_account_information() is not supported with this credential; fall back to
+                # checking whether the root directory exists and has 'metadata': {'hdi_isfolder': 'true'}
+                root_dir = self.service_client.get_blob_client(
+                    container=cloud_path.container, blob="/"
+                )
+                self._hns_enabled = (
+                    root_dir.exists()
+                    and root_dir.get_blob_properties().metadata.get("hdi_isfolder", False)
+                    == "true"
+                )
 
         return self._hns_enabled
 
     def _get_metadata(
         self, cloud_path: AzureBlobPath
     ) -> Union["BlobProperties", "FileProperties", Dict[str, Any]]:
-        if self._check_hns():
+        if self._check_hns(cloud_path):
 
             # works on both files and directories
             fsc = self.data_lake_client.get_file_system_client(cloud_path.container)  # type: ignore
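When the credential is not allowed to call `get_account_information()` (the `ResourceNotFoundError` branch above), hierarchical namespace is inferred from the container itself: on an ADLS Gen2 account the container root is exposed as a blob named `"/"` whose metadata includes `{"hdi_isfolder": "true"}`. A standalone sketch of that probe, assuming an already-constructed `BlobServiceClient` and a placeholder container name (not part of the diff):

```python
from azure.core.exceptions import ResourceNotFoundError
from azure.storage.blob import BlobServiceClient


def hns_enabled(service_client: BlobServiceClient, container: str) -> bool:
    """Best-effort HNS detection mirroring the fallback used above."""
    try:
        # Works when the credential has account-level permissions.
        return service_client.get_account_information().get("is_hns_enabled", False)
    except ResourceNotFoundError:
        # Fall back to inspecting the container's root directory marker blob.
        root_dir = service_client.get_blob_client(container=container, blob="/")
        return (
            root_dir.exists()
            and root_dir.get_blob_properties().metadata.get("hdi_isfolder", False) == "true"
        )
```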
@@ -292,7 +332,7 @@
         if prefix and not prefix.endswith("/"):
             prefix += "/"
 
-        if self._check_hns():
+        if self._check_hns(cloud_path):
             file_system_client = self.data_lake_client.get_file_system_client(cloud_path.container)  # type: ignore
             paths = file_system_client.get_paths(path=cloud_path.blob, recursive=recursive)
 
@@ -334,7 +374,7 @@
             )
 
         # we can use rename API when the same account on adls gen2
-        elif remove_src and (src.client is dst.client) and self._check_hns():
+        elif remove_src and (src.client is dst.client) and self._check_hns(src):
             fsc = self.data_lake_client.get_file_system_client(src.container)  # type: ignore
 
             if src.is_dir():
@@ -358,7 +398,7 @@
     def _mkdir(
         self, cloud_path: AzureBlobPath, parents: bool = False, exist_ok: bool = False
     ) -> None:
-        if self._check_hns():
+        if self._check_hns(cloud_path):
             file_system_client = self.data_lake_client.get_file_system_client(cloud_path.container)  # type: ignore
             directory_client = file_system_client.get_directory_client(cloud_path.blob)
 
@@ -379,7 +419,7 @@
     def _remove(self, cloud_path: AzureBlobPath, missing_ok: bool = True) -> None:
         file_or_dir = self._is_file_or_dir(cloud_path)
         if file_or_dir == "dir":
-            if self._check_hns():
+            if self._check_hns(cloud_path):
                 _hns_rmtree(self.data_lake_client, cloud_path.container, cloud_path.blob)
                 return
 
@@ -432,7 +472,7 @@
         self, cloud_path: AzureBlobPath, expire_seconds: int = 60 * 60
     ) -> str:
         sas_token = generate_blob_sas(
-            self.service_client.account_name,
+            self.service_client.account_name,  # type: ignore[arg-type]
             container_name=cloud_path.container,
             blob_name=cloud_path.blob,
             account_key=self.service_client.credential.account_key,
2 changes: 1 addition & 1 deletion cloudpathlib/azure/azblobpath.py
@@ -91,7 +91,7 @@ def replace(self, target: "AzureBlobPath") -> "AzureBlobPath":
 
         # we can rename directories on ADLS Gen2
         except CloudPathIsADirectoryError:
-            if self.client._check_hns():
+            if self.client._check_hns(self):
                 return self.client._move_file(self, target)
             else:
                 raise
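For context, this is the path exercised when `replace()` is called on a directory: with hierarchical namespace (Gen2) it delegates to the Data Lake rename API, while on a flat-namespace account the original `CloudPathIsADirectoryError` is re-raised. A minimal usage sketch with placeholder container and directory names:

```python
from cloudpathlib import AzureBlobPath
from cloudpathlib.exceptions import CloudPathIsADirectoryError

src = AzureBlobPath("az://<container>/some_dir")      # placeholder paths
dst = AzureBlobPath("az://<container>/renamed_dir")

try:
    # On an ADLS Gen2 (HNS-enabled) account this uses the rename API.
    src.replace(dst)
except CloudPathIsADirectoryError:
    # On a flat-namespace account, directory renames are still unsupported.
    print("directory rename requires hierarchical namespace")
```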
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
 
 [project]
 name = "cloudpathlib"
-version = "0.19.0"
+version = "0.20.0"
 description = "pathlib-style classes for cloud storage services."
 readme = "README.md"
 authors = [{ name = "DrivenData", email = "[email protected]" }]
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -1,5 +1,6 @@
-e .[all]

azure-identity
black[jupyter]>=24.1.0;python_version>='3.8'
build
flake8
15 changes: 15 additions & 0 deletions tests/test_azure_specific.py
@@ -1,6 +1,7 @@
import os

from azure.core.credentials import AzureNamedKeyCredential
from azure.identity import DefaultAzureCredential
from azure.storage.blob import (
BlobServiceClient,
StorageStreamDownloader,
@@ -134,6 +135,20 @@ def _check_access(az_client, gen2=False):
)
_check_access(cl, gen2=azure_rigs.is_adls_gen2)

# discover and use credentials for service principal by having set:
# AZURE_CLIENT_ID, AZURE_CLIENT_SECRET, AZURE_TENANT_ID
credential = DefaultAzureCredential()
cl: AzureBlobClient = azure_rigs.client_class(credential=credential, account_url=bsc.url)
_check_access(cl, gen2=azure_rigs.is_adls_gen2)

# add basic checks for gen2 to exercise limited-privilege access scenarios
p = azure_rigs.create_cloud_path("new_dir/new_file.txt", client=cl)
assert cl._check_hns(p) == azure_rigs.is_adls_gen2
p.write_text("Hello")
assert p.exists()
assert p.read_text() == "Hello"
assert list(p.parent.iterdir()) == [p]


def test_adls_gen2_mkdir(azure_gen2_rig):
"""Since directories can be created on gen2, we should test mkdir, rmdir, rmtree, and unlink