Multi gpu sample edges utilities (#2064)
Add utilities to enable multi gpu gathering to be used for mnmg sampling.

Authors:
  - Kumar Aatish (https://github.com/kaatish)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Seunghwa Kang (https://github.com/seunghwak)

URL: #2064
Showing 9 changed files with 1,643 additions and 144 deletions.
@@ -0,0 +1,192 @@
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <cugraph/matrix_partition_device_view.cuh>
#include <cugraph/partition_manager.hpp>
#include <cugraph/utilities/device_comm.cuh>
#include <cugraph/utilities/host_scalar_comm.cuh>

#include <raft/handle.hpp>

#include <thrust/binary_search.h>
#include <thrust/distance.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/tabulate.h>

#include <rmm/device_uvector.hpp>

#include <numeric>
#include <vector>

namespace cugraph {

namespace detail {

/**
 * @brief Compute local out degrees of the sources belonging to the adjacency matrices
 * stored on each gpu
 *
 * Iterate through partitions and store their local degrees
 *
 * @tparam GraphViewType Type of the passed non-owning graph object.
 * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
 * handles to various CUDA libraries) to run graph algorithms.
 * @param graph_view Non-owning graph object.
 * @return A single vector containing the local out degrees of the sources belonging to the
 * adjacency matrices
 */
template <typename GraphViewType>
rmm::device_uvector<typename GraphViewType::edge_type> compute_local_major_degrees(
  raft::handle_t const& handle, GraphViewType const& graph_view);
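
// Illustrative sketch only (not part of this header's API): for a CSR-like adjacency partition,
// a source's local out degree is the difference between consecutive row offsets, which is what
// the device-side routine computes per partition. Host-side stand-in; names are hypothetical.
inline std::vector<int> example_local_degrees_from_offsets(std::vector<long> const& row_offsets)
{
  std::vector<int> degrees(row_offsets.empty() ? 0 : row_offsets.size() - 1);
  for (std::size_t i = 0; i < degrees.size(); ++i) {
    // degree of source i = number of column indices stored for row i
    degrees[i] = static_cast<int>(row_offsets[i + 1] - row_offsets[i]);
  }
  return degrees;
}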

/**
 * @brief Calculate global degree information for all vertices represented by current gpu
 *
 * Calculate local degree and perform a row-wise exclusive scan over all gpus in the column
 * communicator.
 *
 * @tparam GraphViewType Type of the passed non-owning graph object.
 * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
 * handles to various CUDA libraries) to run graph algorithms.
 * @param graph_view Non-owning graph object.
 * @return Tuple of two device vectors. The first one contains the per-source edge count
 * encountered by gpus in the column communicator before the current gpu. The second device
 * vector contains the global out degree for every source represented by the current gpu
 */
template <typename GraphViewType>
std::tuple<rmm::device_uvector<typename GraphViewType::edge_type>,
           rmm::device_uvector<typename GraphViewType::edge_type>>
get_global_degree_information(raft::handle_t const& handle, GraphViewType const& graph_view);
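
// Illustrative sketch only: the two vectors returned above behave like an exclusive scan and a
// reduction of the per-gpu local degrees of one source across the column communicator. Host-side
// analogue with hypothetical values for a single source spread over three gpus.
inline std::vector<long> example_degree_offsets_sketch()
{
  std::vector<long> local_degrees{4, 2, 5};  // local out degree of the source on gpus 0, 1, 2
  std::vector<long> offsets(local_degrees.size(), 0);
  // offsets[i] = edges of this source held by gpus 0..i-1, i.e. the edge count "encountered
  // before" gpu i; the global out degree is simply the sum (4 + 2 + 5 = 11).
  std::exclusive_scan(local_degrees.begin(), local_degrees.end(), offsets.begin(), 0L);
  return offsets;  // {0, 4, 6}
}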

/**
 * @brief Gather active sources and associated client gpu ids across gpus in a
 * column communicator
 *
 * Collect all the vertex ids and client gpu ids to be processed by every gpu in
 * the column communicator and call sort on the list.
 *
 * @tparam GraphViewType Type of the passed non-owning graph object.
 * @tparam VertexIterator Type of the iterator for vertex identifiers.
 * @tparam GPUIdIterator Type of the iterator for gpu id identifiers.
 * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
 * handles to various CUDA libraries) to run graph algorithms.
 * @param graph_view Non-owning graph object.
 * @param vertex_input_first Iterator pointing to the first vertex id to be processed
 * @param vertex_input_last Iterator pointing to the last (exclusive) vertex id to be processed
 * @param gpu_id_first Iterator pointing to the first gpu id to be processed
 * @return Tuple of device vectors containing all the vertices and their associated client gpu
 * ids that are to be processed by every gpu in the column communicator
 */
template <typename GraphViewType, typename VertexIterator, typename GPUIdIterator>
std::tuple<rmm::device_uvector<typename GraphViewType::vertex_type>,
           rmm::device_uvector<typename std::iterator_traits<GPUIdIterator>::value_type>>
gather_active_sources_in_row(raft::handle_t const& handle,
                             GraphViewType const& graph_view,
                             VertexIterator vertex_input_first,
                             VertexIterator vertex_input_last,
                             GPUIdIterator gpu_id_first);
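
// Illustrative sketch only: after the gather, every gpu in the column communicator holds all
// (vertex id, requesting gpu id) pairs, and the two lists are sorted together so each vertex
// stays paired with the gpu that asked for it. Host-side stand-in reusing the thrust sort header
// included above; sorting by vertex id here is purely for illustration.
inline void example_sort_gathered_sources(std::vector<int>& gathered_vertices,
                                          std::vector<int>& gathered_gpu_ids)
{
  // sort the keys (vertex ids) and carry the client gpu ids along as values
  thrust::sort_by_key(
    gathered_vertices.begin(), gathered_vertices.end(), gathered_gpu_ids.begin());
}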

/**
 * @brief Return global out degrees of active sources
 *
 * Get partition information of all graph partitions on the gpu and select
 * global degrees of all active sources
 *
 * @tparam GraphViewType Type of the passed non-owning graph object.
 * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
 * handles to various CUDA libraries) to run graph algorithms.
 * @param graph_view Non-owning graph object.
 * @param active_majors Device vector containing all the vertex ids that are processed by
 * gpus in the column communicator
 * @param global_out_degrees Global out degrees for every source represented by the current gpu
 * @return Global out degrees of all sources in active_majors
 */
template <typename GraphViewType>
rmm::device_uvector<typename GraphViewType::edge_type> get_active_major_global_degrees(
  raft::handle_t const& handle,
  GraphViewType const& graph_view,
  const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors,
  const rmm::device_uvector<typename GraphViewType::edge_type>& global_out_degrees);
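
// Illustrative sketch only: once a major's offset within its local vertex range is known,
// selecting its global out degree is a gather from the global out degree array. Host-side
// stand-in; 'first_local_vertex' and all values are hypothetical.
inline std::vector<long> example_select_active_degrees(
  std::vector<int> const& active_majors,
  std::vector<long> const& global_out_degrees,
  int first_local_vertex)
{
  std::vector<long> selected(active_majors.size());
  for (std::size_t i = 0; i < active_majors.size(); ++i) {
    // position of this major inside the locally stored degree array
    selected[i] = global_out_degrees[active_majors[i] - first_local_vertex];
  }
  return selected;
}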

/**
 * @brief Return partition information of all vertex ids of all the partitions belonging to a gpu
 *
 * Iterate through partitions and store their starting vertex ids, the exclusive scan of their
 * vertex counts, and the offsets and indices of each partition's CSR structure
 *
 * @tparam GraphViewType Type of the passed non-owning graph object.
 * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
 * handles to various CUDA libraries) to run graph algorithms.
 * @param graph_view Non-owning graph object.
 * @return Tuple of device vectors. The first vector contains all the partitions related to the
 * gpu. The second and third vectors contain the starting and ending vertex ids of all the
 * partitions belonging to the gpu. The fourth vector contains the starting vertex id of the
 * hypersparse region in each partition. The fifth vector denotes the vertex count offset (how
 * many vertices are dealt with by the previous partitions).
 */
template <typename GraphViewType>
std::tuple<rmm::device_uvector<matrix_partition_device_view_t<typename GraphViewType::vertex_type,
                                                              typename GraphViewType::edge_type,
                                                              typename GraphViewType::weight_type,
                                                              GraphViewType::is_multi_gpu>>,
           rmm::device_uvector<typename GraphViewType::vertex_type>,
           rmm::device_uvector<typename GraphViewType::vertex_type>,
           rmm::device_uvector<typename GraphViewType::vertex_type>,
           rmm::device_uvector<typename GraphViewType::vertex_type>>
partition_information(raft::handle_t const& handle, GraphViewType const& graph_view);
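
// Illustrative sketch only: the fifth vector above (the vertex count offset) behaves like an
// exclusive scan of the per-partition vertex counts, so partition p knows how many vertices the
// partitions before it already cover. Host-side analogue with hypothetical counts.
inline std::vector<int> example_vertex_count_offsets(
  std::vector<int> const& partition_vertex_counts)
{
  std::vector<int> offsets(partition_vertex_counts.size(), 0);
  // offsets[p] = number of vertices handled by partitions 0..p-1
  std::exclusive_scan(
    partition_vertex_counts.begin(), partition_vertex_counts.end(), offsets.begin(), 0);
  return offsets;
}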

/**
 * @brief Gather valid edges present on the current gpu
 *
 * Collect all the edges that are present in the adjacency lists on the current gpu
 *
 * @tparam GraphViewType Type of the passed non-owning graph object.
 * @tparam EdgeIndexIterator Type of the iterator for edge indices.
 * @tparam gpu_t Type used to store gpu ids.
 * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
 * handles to various CUDA libraries) to run graph algorithms.
 * @param graph_view Non-owning graph object.
 * @param active_majors_in_row Device vector containing all the vertex ids that are processed by
 * gpus in the column communicator
 * @param active_major_gpu_ids Device vector containing the gpu id associated with every vertex
 * present in active_majors_in_row
 * @param edge_index_first Iterator pointing to the first destination index
 * @param indices_per_major Number of indices supplied for every source in active_majors_in_row
 * @param global_degree_offsets Global degree offset to the local adjacency list for every source
 * represented by the current gpu
 * @return A tuple of device vectors containing the majors, minors and gpu_ids gathered locally
 */
template <typename GraphViewType, typename EdgeIndexIterator, typename gpu_t>
std::tuple<rmm::device_uvector<typename GraphViewType::vertex_type>,
           rmm::device_uvector<typename GraphViewType::vertex_type>,
           rmm::device_uvector<gpu_t>>
gather_local_edges(
  raft::handle_t const& handle,
  GraphViewType const& graph_view,
  const rmm::device_uvector<typename GraphViewType::vertex_type>& active_majors_in_row,
  const rmm::device_uvector<gpu_t>& active_major_gpu_ids,
  EdgeIndexIterator edge_index_first,
  typename GraphViewType::edge_type indices_per_major,
  const rmm::device_uvector<typename GraphViewType::edge_type>& global_degree_offsets);
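
// Illustrative sketch only: an edge index sampled for a major is kept on this gpu when, after
// subtracting the major's global degree offset, it lands inside the locally stored adjacency
// list; the matching minor is then read from the local column indices. Hypothetical host-side
// stand-in for a single major; names and the dropping rule shown here are an interpretation of
// the comment above, not the exact device-side code.
inline std::vector<int> example_pick_local_minors(
  std::vector<long> const& sampled_edge_indices,  // edge indices drawn against the global degree
  long global_degree_offset,                      // edges of this major held by earlier gpus
  std::vector<int> const& local_minor_indices)    // this gpu's adjacency list for the major
{
  std::vector<int> minors;
  for (long idx : sampled_edge_indices) {
    long local_idx = idx - global_degree_offset;
    if (local_idx >= 0 && local_idx < static_cast<long>(local_minor_indices.size())) {
      minors.push_back(local_minor_indices[static_cast<std::size_t>(local_idx)]);
    }
    // indices outside this window belong to the copy of the major on a different gpu
  }
  return minors;
}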

}  // namespace detail

}  // namespace cugraph