From 2790bfcbed5e141001a9c221056980fa746af8cd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 17 Nov 2022 19:07:32 -0800 Subject: [PATCH 1/9] Replace calls to dlsym functions with actual functions. --- cpp/include/raft/comms/detail/ucp_helper.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/comms/detail/ucp_helper.hpp b/cpp/include/raft/comms/detail/ucp_helper.hpp index 668acafae4..3c0f7adbd6 100644 --- a/cpp/include/raft/comms/detail/ucp_helper.hpp +++ b/cpp/include/raft/comms/detail/ucp_helper.hpp @@ -170,7 +170,7 @@ class comms_ucp_handler { } public: - int ucp_progress(ucp_worker_h worker) const { return (*(worker_progress_func))(worker); } + int ucp_progress(ucp_worker_h worker) const { return ucp_worker_progress(worker); } /** * @brief Frees any memory underlying the given ucp request object @@ -179,7 +179,7 @@ class comms_ucp_handler { { if (request->needs_release) { request->req->completed = 0; - (*(req_free_func))(request->req); + ucp_request_free(request->req); } free(request); } @@ -198,7 +198,7 @@ class comms_ucp_handler { ucp_tag_t ucp_tag = build_message_tag(rank, tag); ucs_status_ptr_t send_result = - (*(send_func))(ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); + ucp_tag_send_nb(ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); struct ucx_context* ucp_req = (struct ucx_context*)send_result; if (UCS_PTR_IS_ERR(send_result)) { @@ -240,7 +240,7 @@ class comms_ucp_handler { ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag); ucs_status_ptr_t recv_result = - (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, tag_mask, recv_callback); + ucp_tag_recv_nb(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, tag_mask, recv_callback); struct ucx_context* ucp_req = (struct ucx_context*)recv_result; From 7164f9a72dc835c4465b96778e8763f8c907c6e4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 17 Nov 2022 19:11:07 -0800 Subject: [PATCH 2/9] Get rid of internal dlopen-based APIs. --- cpp/include/raft/comms/detail/ucp_helper.hpp | 68 -------------------- 1 file changed, 68 deletions(-) diff --git a/cpp/include/raft/comms/detail/ucp_helper.hpp b/cpp/include/raft/comms/detail/ucp_helper.hpp index 3c0f7adbd6..f0ce8e6c05 100644 --- a/cpp/include/raft/comms/detail/ucp_helper.hpp +++ b/cpp/include/raft/comms/detail/ucp_helper.hpp @@ -94,75 +94,7 @@ static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_ * interacting with ucp. */ class comms_ucp_handler { - public: - comms_ucp_handler() - { - load_ucp_handle(); - load_send_func(); - load_recv_func(); - load_free_req_func(); - load_print_info_func(); - load_worker_progress_func(); - } - - ~comms_ucp_handler() { dlclose(ucp_handle); } - private: - void* ucp_handle; - - dlsym_print_info print_info_func; - dlsym_rec_free req_free_func; - dlsym_worker_progress worker_progress_func; - dlsym_send send_func; - dlsym_recv recv_func; - - void load_ucp_handle() - { - ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NOLOAD | RTLD_NODELETE); - if (!ucp_handle) { - ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NODELETE); - ASSERT(ucp_handle, "Cannot open UCX library: %s\n", dlerror()); - } - // Reset any potential error - dlerror(); - } - - void assert_dlerror() - { - char* error = dlerror(); - ASSERT(error == NULL, "Error loading function symbol: %s\n", error); - } - - void load_send_func() - { - send_func = (dlsym_send)dlsym(ucp_handle, "ucp_tag_send_nb"); - assert_dlerror(); - } - - void load_free_req_func() - { - req_free_func = (dlsym_rec_free)dlsym(ucp_handle, "ucp_request_free"); - assert_dlerror(); - } - - void load_print_info_func() - { - print_info_func = (dlsym_print_info)dlsym(ucp_handle, "ucp_ep_print_info"); - assert_dlerror(); - } - - void load_worker_progress_func() - { - worker_progress_func = (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress"); - assert_dlerror(); - } - - void load_recv_func() - { - recv_func = (dlsym_recv)dlsym(ucp_handle, "ucp_tag_recv_nb"); - assert_dlerror(); - } - ucp_tag_t build_message_tag(int rank, int tag) const { // keeping the rank in the lower bits enables debugging. From 2c92fe6258cc4bdbafc0b90f5401e43827b06515 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 17 Nov 2022 19:13:27 -0800 Subject: [PATCH 3/9] Inline trivial ucp function call. --- cpp/include/raft/comms/detail/std_comms.hpp | 2 +- cpp/include/raft/comms/detail/ucp_helper.hpp | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp index ed9e9e78f0..33892597d8 100644 --- a/cpp/include/raft/comms/detail/std_comms.hpp +++ b/cpp/include/raft/comms/detail/std_comms.hpp @@ -266,7 +266,7 @@ class std_comms : public comms_iface { bool restart = false; // resets the timeout when any progress was made // Causes UCP to progress through the send/recv message queue - while (ucp_handler_.ucp_progress(ucp_worker_) != 0) { + while (ucp_worker_progress(ucp_worker_) != 0) { restart = true; } diff --git a/cpp/include/raft/comms/detail/ucp_helper.hpp b/cpp/include/raft/comms/detail/ucp_helper.hpp index f0ce8e6c05..533f080a69 100644 --- a/cpp/include/raft/comms/detail/ucp_helper.hpp +++ b/cpp/include/raft/comms/detail/ucp_helper.hpp @@ -102,8 +102,6 @@ class comms_ucp_handler { } public: - int ucp_progress(ucp_worker_h worker) const { return ucp_worker_progress(worker); } - /** * @brief Frees any memory underlying the given ucp request object */ From 84f83c9b3d53ada3f7ef6b39047365245e1719eb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 17 Nov 2022 19:14:04 -0800 Subject: [PATCH 4/9] Remove now extraneous typedefs. --- cpp/include/raft/comms/detail/ucp_helper.hpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/cpp/include/raft/comms/detail/ucp_helper.hpp b/cpp/include/raft/comms/detail/ucp_helper.hpp index 533f080a69..82740bc8e4 100644 --- a/cpp/include/raft/comms/detail/ucp_helper.hpp +++ b/cpp/include/raft/comms/detail/ucp_helper.hpp @@ -26,23 +26,6 @@ namespace raft { namespace comms { namespace detail { -typedef void (*dlsym_print_info)(ucp_ep_h, FILE*); - -typedef void (*dlsym_rec_free)(void*); - -typedef int (*dlsym_worker_progress)(ucp_worker_h); - -typedef ucs_status_ptr_t (*dlsym_send)( - ucp_ep_h, const void*, size_t, ucp_datatype_t, ucp_tag_t, ucp_send_callback_t); - -typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, - void*, - size_t count, - ucp_datatype_t datatype, - ucp_tag_t, - ucp_tag_t, - ucp_tag_recv_callback_t); - /** * Standard UCX request object that will be passed * around asynchronously. This object is really From 617059f79a43e009c01cb820ae5b0b7007ba5217 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 17 Nov 2022 19:14:59 -0800 Subject: [PATCH 5/9] Update comment. --- cpp/include/raft/comms/detail/ucp_helper.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/raft/comms/detail/ucp_helper.hpp b/cpp/include/raft/comms/detail/ucp_helper.hpp index 82740bc8e4..8fc657fa5b 100644 --- a/cpp/include/raft/comms/detail/ucp_helper.hpp +++ b/cpp/include/raft/comms/detail/ucp_helper.hpp @@ -73,8 +73,7 @@ static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_ } /** - * Helper class for managing `dlopen` state and - * interacting with ucp. + * Helper class for interacting with ucp. */ class comms_ucp_handler { private: From d8490007281530ea104b49f43190e06c3cb28ca5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 17 Nov 2022 19:15:09 -0800 Subject: [PATCH 6/9] Remove dlfcn header. --- cpp/include/raft/comms/detail/ucp_helper.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/include/raft/comms/detail/ucp_helper.hpp b/cpp/include/raft/comms/detail/ucp_helper.hpp index 8fc657fa5b..9479bc24f9 100644 --- a/cpp/include/raft/comms/detail/ucp_helper.hpp +++ b/cpp/include/raft/comms/detail/ucp_helper.hpp @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include From f3466c7e4367227ba3436d554a30dee5734b1138 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 18 Nov 2022 07:28:48 -0800 Subject: [PATCH 7/9] Add ucx to the link interface of libraft. --- cpp/CMakeLists.txt | 7 +++++++ python/raft-dask/raft_dask/common/CMakeLists.txt | 3 +-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 94e693f861..5f37bacd97 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -205,6 +205,12 @@ endif() add_library(raft INTERFACE) add_library(raft::raft ALIAS raft) +rapids_find_package( + ucx REQUIRED + BUILD_EXPORT_SET raft-exports + INSTALL_EXPORT_SET raft-exports +) + target_include_directories( raft INTERFACE "$" "$" ) @@ -213,6 +219,7 @@ target_include_directories( target_link_libraries( raft INTERFACE rmm::rmm + ucx::ucp CUDA::cublas${_ctk_static_suffix} CUDA::curand${_ctk_static_suffix} CUDA::cusolver${_ctk_static_suffix} diff --git a/python/raft-dask/raft_dask/common/CMakeLists.txt b/python/raft-dask/raft_dask/common/CMakeLists.txt index 77b6695118..bbff2f39cb 100644 --- a/python/raft-dask/raft_dask/common/CMakeLists.txt +++ b/python/raft-dask/raft_dask/common/CMakeLists.txt @@ -13,10 +13,9 @@ # ============================================================================= include(${raft-dask-python_SOURCE_DIR}/cmake/thirdparty/get_nccl.cmake) -find_package(ucx REQUIRED) set(cython_sources comms_utils.pyx nccl.pyx) -set(linked_libraries raft::raft NCCL::NCCL ucx::ucp) +set(linked_libraries raft::raft NCCL::NCCL) rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" CXX ) From 889b78a9eca16747b0fba1139d7af560d40bc45d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 18 Nov 2022 07:29:13 -0800 Subject: [PATCH 8/9] Fix typo in project name. --- python/raft-dask/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt index 868010d27b..c3a222f5f2 100644 --- a/python/raft-dask/CMakeLists.txt +++ b/python/raft-dask/CMakeLists.txt @@ -47,7 +47,7 @@ if(NOT raft_FOUND) enable_language(CUDA) # Since raft-dask only enables CUDA optionally we need to manually include the file that # rapids_cuda_init_architectures relies on `project` including. - include("${CMAKE_PROJECT_raft_dask_INCLUDE}") + include("${CMAKE_PROJECT_raft-dask_INCLUDE}") # raft-dask doesn't actually use raft libraries, it just needs the headers, so we can turn off all # library compilation and we don't need to install anything here. From 6709291012c894501156bbb97e5c67a8e9c3befa Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 18 Nov 2022 11:33:32 -0800 Subject: [PATCH 9/9] Add a new distributed component for ucx. --- cpp/CMakeLists.txt | 60 +++++++++++++++---- python/raft-dask/CMakeLists.txt | 3 +- .../raft-dask/raft_dask/common/CMakeLists.txt | 2 +- 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5f37bacd97..603b090d45 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -205,12 +205,6 @@ endif() add_library(raft INTERFACE) add_library(raft::raft ALIAS raft) -rapids_find_package( - ucx REQUIRED - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports -) - target_include_directories( raft INTERFACE "$" "$" ) @@ -219,7 +213,6 @@ target_include_directories( target_link_libraries( raft INTERFACE rmm::rmm - ucx::ucp CUDA::cublas${_ctk_static_suffix} CUDA::curand${_ctk_static_suffix} CUDA::cusolver${_ctk_static_suffix} @@ -475,6 +468,21 @@ target_link_libraries( raft_nn INTERFACE raft::raft $ nvidia::cutlass::cutlass ) +# ################################################################################################## +# * raft_distributed ------------------------------------------------------------------------------- +add_library(raft_distributed INTERFACE) + +if(TARGET raft_distributed AND (NOT TARGET raft::distributed)) + add_library(raft::distributed ALIAS raft_distributed) +endif() + +set_target_properties(raft_distributed PROPERTIES EXPORT_NAME distributed) + +rapids_export_package(BUILD ucx raft-distributed-exports) +rapids_export_package(INSTALL ucx raft-distributed-exports) + +target_link_libraries(raft_distributed INTERFACE ucx::ucp) + # ################################################################################################## # * install targets----------------------------------------------------------- rapids_cmake_install_lib_dir(lib_dir) @@ -525,6 +533,13 @@ if(TARGET raft_nn_lib) ) endif() +install( + TARGETS raft_distributed + DESTINATION ${lib_dir} + COMPONENT distributed + EXPORT raft-distributed-exports +) + install( DIRECTORY include/raft COMPONENT raft @@ -549,8 +564,8 @@ install( include("${rapids-cmake-dir}/export/write_dependencies.cmake") -set(raft_components distance nn) -set(raft_install_comp raft raft) +set(raft_components distance nn distributed) +set(raft_install_comp raft raft raft) if(TARGET raft_distance_lib) list(APPEND raft_components distance-lib) list(APPEND raft_install_comp distance) @@ -595,11 +610,13 @@ for data science and machine learning. Optional Components: - nn - distance + - distributed Imported Targets: - raft::raft - raft::nn brought in by the `nn` optional component - raft::distance brought in by the `distance` optional component + - raft::distributed brought in by the `distributed` optional component ]=] ) @@ -641,15 +658,32 @@ endif() # Use `rapids_export` for 22.04 as it will have COMPONENT support include(cmake/modules/raft_export.cmake) raft_export( - INSTALL raft COMPONENTS nn distance EXPORT_SET raft-exports GLOBAL_TARGETS raft nn distance - NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string + INSTALL raft COMPONENTS nn distance distributed EXPORT_SET raft-exports GLOBAL_TARGETS raft nn + distance distributed NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string ) # ################################################################################################## # * build export ------------------------------------------------------------- raft_export( - BUILD raft EXPORT_SET raft-exports COMPONENTS nn distance GLOBAL_TARGETS raft raft_distance - raft_nn DOCUMENTATION doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string + BUILD + raft + EXPORT_SET + raft-exports + COMPONENTS + nn + distance + distributed + GLOBAL_TARGETS + raft + raft_distance + distributed + raft_nn + DOCUMENTATION + doc_string + NAMESPACE + raft:: + FINAL_CODE_BLOCK + code_string ) # ################################################################################################## diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt index c3a222f5f2..4e66b40aeb 100644 --- a/python/raft-dask/CMakeLists.txt +++ b/python/raft-dask/CMakeLists.txt @@ -34,7 +34,7 @@ option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulti # If the user requested it we attempt to find RAFT. if(FIND_RAFT_CPP) - find_package(raft ${raft_dask_version} REQUIRED) + find_package(raft ${raft_dask_version} REQUIRED COMPONENTS distributed) else() set(raft_FOUND OFF) endif() @@ -48,6 +48,7 @@ if(NOT raft_FOUND) # Since raft-dask only enables CUDA optionally we need to manually include the file that # rapids_cuda_init_architectures relies on `project` including. include("${CMAKE_PROJECT_raft-dask_INCLUDE}") + find_package(ucx REQUIRED) # raft-dask doesn't actually use raft libraries, it just needs the headers, so we can turn off all # library compilation and we don't need to install anything here. diff --git a/python/raft-dask/raft_dask/common/CMakeLists.txt b/python/raft-dask/raft_dask/common/CMakeLists.txt index bbff2f39cb..e58f81e023 100644 --- a/python/raft-dask/raft_dask/common/CMakeLists.txt +++ b/python/raft-dask/raft_dask/common/CMakeLists.txt @@ -15,7 +15,7 @@ include(${raft-dask-python_SOURCE_DIR}/cmake/thirdparty/get_nccl.cmake) set(cython_sources comms_utils.pyx nccl.pyx) -set(linked_libraries raft::raft NCCL::NCCL) +set(linked_libraries raft::raft raft::distributed NCCL::NCCL) rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" CXX )