From 9dbfd3db3e9154d2a2c87e721de9c6aef8264db0 Mon Sep 17 00:00:00 2001 From: Srinivas Lade Date: Wed, 12 Jul 2023 12:53:59 -0400 Subject: [PATCH] GH-35903: [C++] Skeleton for Azure Blob Storage filesystem implementation (#35701) ### What changes are included in this PR? This PR splits out the overall skeleton of #12914 in order to make merging of the overall Azure Filesystem easier to do. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * Closes: #35903 Authored-by: Srinivas Lade Signed-off-by: Antoine Pitrou --- .github/workflows/cpp.yml | 1 + ci/docker/ubuntu-20.04-cpp.dockerfile | 1 + ci/docker/ubuntu-22.04-cpp.dockerfile | 1 + ci/scripts/cpp_build.sh | 1 + cpp/CMakePresets.json | 1 + cpp/cmake_modules/DefineOptions.cmake | 3 + cpp/src/arrow/CMakeLists.txt | 6 + cpp/src/arrow/filesystem/CMakeLists.txt | 9 ++ cpp/src/arrow/filesystem/azurefs.cc | 154 ++++++++++++++++++++++ cpp/src/arrow/filesystem/azurefs.h | 159 +++++++++++++++++++++++ cpp/src/arrow/filesystem/azurefs_test.cc | 46 +++++++ 11 files changed, 382 insertions(+) create mode 100644 cpp/src/arrow/filesystem/azurefs.cc create mode 100644 cpp/src/arrow/filesystem/azurefs.h create mode 100644 cpp/src/arrow/filesystem/azurefs_test.cc diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 297a9664e35ce..67435566ce305 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -151,6 +151,7 @@ jobs: if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 75 env: + ARROW_AZURE: ON ARROW_BUILD_TESTS: ON ARROW_DATASET: ON ARROW_FLIGHT: ON diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index a5c1f0cdc1822..f94494177e8ee 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -141,6 +141,7 @@ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin # static Arrow to run Flight/Flight SQL tests ENV absl_SOURCE=BUNDLED \ 
ARROW_ACERO=ON \ + ARROW_AZURE=ON \ ARROW_BUILD_STATIC=ON \ ARROW_BUILD_TESTS=ON \ ARROW_DEPENDENCY_SOURCE=SYSTEM \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index 764e9fd4f9ded..e773c6f1ee659 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -164,6 +164,7 @@ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin # - libgtest-dev only provide sources ENV absl_SOURCE=BUNDLED \ ARROW_ACERO=ON \ + ARROW_AZURE=ON \ ARROW_BUILD_STATIC=ON \ ARROW_BUILD_TESTS=ON \ ARROW_DEPENDENCY_SOURCE=SYSTEM \ diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 91a570be977a2..fd682d0e2a62a 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -84,6 +84,7 @@ pushd ${build_dir} cmake \ -Dabsl_SOURCE=${absl_SOURCE:-} \ -DARROW_ACERO=${ARROW_ACERO:-ON} \ + -DARROW_AZURE=${ARROW_AZURE:-OFF} \ -DARROW_BOOST_USE_SHARED=${ARROW_BOOST_USE_SHARED:-ON} \ -DARROW_BUILD_BENCHMARKS_REFERENCE=${ARROW_BUILD_BENCHMARKS:-OFF} \ -DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \ diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index 7882be57a0534..94141d693be8f 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -97,6 +97,7 @@ "inherits": "features-basic", "hidden": true, "cacheVariables": { + "ARROW_AZURE": "ON", "ARROW_GCS": "ON", "ARROW_HDFS": "ON", "ARROW_S3": "ON" diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 8601184309f34..d20af060453b0 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -301,6 +301,9 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_COMPUTE ARROW_IPC) + define_option(ARROW_AZURE + "Build Arrow with Azure support (requires the Azure SDK for C++)" OFF) + define_option(ARROW_BUILD_UTILITIES "Build Arrow commandline utilities" OFF) define_option(ARROW_COMPUTE "Build all Arrow Compute 
kernels" OFF) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index e283fd7ee92b1..fccff6c8cf1a9 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -481,6 +481,12 @@ if(ARROW_FILESYSTEM) filesystem/path_util.cc filesystem/util_internal.cc) + if(ARROW_AZURE) + list(APPEND ARROW_SRCS filesystem/azurefs.cc) + set_source_files_properties(filesystem/azurefs.cc + PROPERTIES SKIP_PRECOMPILE_HEADERS ON + SKIP_UNITY_BUILD_INCLUSION ON) + endif() if(ARROW_GCS) list(APPEND ARROW_SRCS filesystem/gcsfs.cc filesystem/gcsfs_internal.cc) set_source_files_properties(filesystem/gcsfs.cc filesystem/gcsfs_internal.cc diff --git a/cpp/src/arrow/filesystem/CMakeLists.txt b/cpp/src/arrow/filesystem/CMakeLists.txt index 97aa01ea9f995..b997ca0a387a6 100644 --- a/cpp/src/arrow/filesystem/CMakeLists.txt +++ b/cpp/src/arrow/filesystem/CMakeLists.txt @@ -47,6 +47,15 @@ if(ARROW_GCS) Boost::system) endif() +if(ARROW_AZURE) + add_arrow_test(azurefs_test + EXTRA_LABELS + filesystem + EXTRA_LINK_LIBS + Boost::filesystem + Boost::system) +endif() + if(ARROW_S3) add_arrow_test(s3fs_test SOURCES diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc new file mode 100644 index 0000000000000..0158c0cec74e1 --- /dev/null +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/filesystem/azurefs.h" + +#include "arrow/result.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { +namespace fs { + +// ----------------------------------------------------------------------- +// AzureOptions Implementation + +AzureOptions::AzureOptions() {} + +bool AzureOptions::Equals(const AzureOptions& other) const { + return (account_dfs_url == other.account_dfs_url && + account_blob_url == other.account_blob_url && + credentials_kind == other.credentials_kind); +} + +// ----------------------------------------------------------------------- +// AzureFilesystem Implementation + +class AzureFileSystem::Impl { + public: + io::IOContext io_context_; + bool is_hierarchical_namespace_enabled_; + AzureOptions options_; + + explicit Impl(AzureOptions options, io::IOContext io_context) + : io_context_(io_context), options_(std::move(options)) {} + + Status Init() { + if (options_.backend == AzureBackend::Azurite) { + // gen1Client_->GetAccountInfo().Value.IsHierarchicalNamespaceEnabled + // throws error in azurite + is_hierarchical_namespace_enabled_ = false; + } + return Status::OK(); + } + + const AzureOptions& options() const { return options_; } +}; + +const AzureOptions& AzureFileSystem::options() const { return impl_->options(); } + +bool AzureFileSystem::Equals(const FileSystem& other) const { + if (this == &other) { + return true; + } + if (other.type_name() != type_name()) { + return false; + } + const auto& azure_fs = ::arrow::internal::checked_cast(other); + return options().Equals(azure_fs.options()); +} + 
+Result AzureFileSystem::GetFileInfo(const std::string& path) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result AzureFileSystem::GetFileInfo(const FileSelector& select) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Status AzureFileSystem::DeleteDir(const std::string& path) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Status AzureFileSystem::DeleteDirContents(const std::string& path, bool missing_dir_ok) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Status AzureFileSystem::DeleteRootDirContents() { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Status AzureFileSystem::DeleteFile(const std::string& path) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Status AzureFileSystem::Move(const std::string& src, const std::string& dest) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Status AzureFileSystem::CopyFile(const std::string& src, const std::string& dest) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result> AzureFileSystem::OpenInputStream( + const std::string& path) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result> AzureFileSystem::OpenInputStream( + const FileInfo& info) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result> AzureFileSystem::OpenInputFile( + const std::string& path) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result> AzureFileSystem::OpenInputFile( + const FileInfo& info) { + return 
Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result> AzureFileSystem::OpenOutputStream( + const std::string& path, const std::shared_ptr& metadata) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result> AzureFileSystem::OpenAppendStream( + const std::string&, const std::shared_ptr&) { + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); +} + +Result> AzureFileSystem::Make( + const AzureOptions& options, const io::IOContext& io_context) { + std::shared_ptr ptr(new AzureFileSystem(options, io_context)); + RETURN_NOT_OK(ptr->impl_->Init()); + return ptr; +} + +AzureFileSystem::AzureFileSystem(const AzureOptions& options, + const io::IOContext& io_context) + : FileSystem(io_context), impl_(std::make_unique(options, io_context)) { + default_async_is_sync_ = false; +} + +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h new file mode 100644 index 0000000000000..e5af4d23aabe5 --- /dev/null +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -0,0 +1,159 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "arrow/filesystem/filesystem.h" +#include "arrow/util/macros.h" +#include "arrow/util/uri.h" + +namespace Azure { +namespace Core { +namespace Credentials { + +class TokenCredential; + +} // namespace Credentials +} // namespace Core +namespace Storage { + +class StorageSharedKeyCredential; + +} // namespace Storage +} // namespace Azure + +namespace arrow { +namespace fs { + +enum class AzureCredentialsKind : int8_t { + /// Anonymous access (no credentials used), public + Anonymous, + /// Use explicitly-provided access key pair + StorageCredentials, + /// Use ServicePrincipleCredentials + ServicePrincipleCredentials, + /// Use Sas Token to authenticate + Sas, + /// Use Connection String + ConnectionString +}; + +enum class AzureBackend : bool { + /// Official Azure Remote Backend + Azure, + /// Local Simulated Storage + Azurite +}; + +/// Options for the AzureFileSystem implementation. +struct ARROW_EXPORT AzureOptions { + std::string account_dfs_url; + std::string account_blob_url; + AzureBackend backend = AzureBackend::Azure; + AzureCredentialsKind credentials_kind = AzureCredentialsKind::Anonymous; + + std::string sas_token; + std::string connection_string; + std::shared_ptr + storage_credentials_provider; + std::shared_ptr + service_principle_credentials_provider; + + AzureOptions(); + + bool Equals(const AzureOptions& other) const; +}; + +/// \brief Azure-backed FileSystem implementation for ABFS and ADLS. +/// +/// ABFS (Azure Blob Storage - https://azure.microsoft.com/en-us/products/storage/blobs/) +/// is an object-based cloud storage system. +/// +/// ADLS (Azure Data Lake Storage - +/// https://azure.microsoft.com/en-us/products/storage/data-lake-storage/) +/// is a scalable data storage system designed for big-data applications. +/// ADLS provides filesystem semantics, file-level security, and Hadoop +/// compatibility. Gen1 exists as a separate object that will be retired +/// on Feb 29, 2024.
New ADLS accounts will use Gen2 instead, which is +/// implemented on top of ABFS. +/// +/// TODO: GH-18014 Complete the internal implementation +/// and review the documentation +class ARROW_EXPORT AzureFileSystem : public FileSystem { + public: + ~AzureFileSystem() override = default; + + std::string type_name() const override { return "abfs"; } + + /// Return the original Azure options when constructing the filesystem + const AzureOptions& options() const; + + bool Equals(const FileSystem& other) const override; + + Result GetFileInfo(const std::string& path) override; + + Result GetFileInfo(const FileSelector& select) override; + + Status CreateDir(const std::string& path, bool recursive = true) override; + + Status DeleteDir(const std::string& path) override; + + Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override; + + Status DeleteRootDirContents() override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Result> OpenInputStream( + const std::string& path) override; + + Result> OpenInputStream(const FileInfo& info) override; + + Result> OpenInputFile( + const std::string& path) override; + + Result> OpenInputFile( + const FileInfo& info) override; + + Result> OpenOutputStream( + const std::string& path, + const std::shared_ptr& metadata = {}) override; + + Result> OpenAppendStream( + const std::string& path, + const std::shared_ptr& metadata = {}) override; + + static Result> Make( + const AzureOptions& options, const io::IOContext& = io::default_io_context()); + + private: + explicit AzureFileSystem(const AzureOptions& options, const io::IOContext& io_context); + + class Impl; + std::unique_ptr impl_; +}; + +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc new file mode 
100644 index 0000000000000..0f03e88393aeb --- /dev/null +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/filesystem/azurefs.h" + +#include +#include +#include + +#include + +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" + +namespace arrow { +namespace fs { +namespace { + +using ::testing::IsEmpty; +using ::testing::Not; +using ::testing::NotNull; + +// Placeholder test for file structure +// TODO: GH-18014 Remove once a proper test is added +TEST(AzureFileSystem, OptionsCompare) { + AzureOptions options; + EXPECT_TRUE(options.Equals(options)); +} + +} // namespace +} // namespace fs +} // namespace arrow