diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 1997fd75dab..5a3cb65caa5 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -432,6 +432,7 @@ add_library(cugraph SHARED
   src/experimental/graph_view.cu
   src/experimental/coarsen_graph.cu
   src/experimental/renumber_edgelist.cu
+  src/experimental/renumber_utils.cu
   src/experimental/relabel.cu
   src/experimental/induced_subgraph.cu
   src/experimental/bfs.cu
diff --git a/cpp/include/dendrogram.hpp b/cpp/include/dendrogram.hpp
index bb9ba470a52..aa0802e80b3 100644
--- a/cpp/include/dendrogram.hpp
+++ b/cpp/include/dendrogram.hpp
@@ -27,7 +27,7 @@ class Dendrogram {
  public:
   void add_level(vertex_t first_index,
                  vertex_t num_verts,
-                 cudaStream_t stream = 0,
+                 cudaStream_t stream,
                  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
   {
     level_ptr_.push_back(std::make_unique<rmm::device_uvector<vertex_t>>(num_verts, stream, mr));
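With the default argument gone, call sites must now pass a stream explicitly. A minimal caller-side sketch; the wrapper name, sizes, and include paths are illustrative only:

#include <dendrogram.hpp>
#include <raft/handle.hpp>

// Hypothetical helper: builds a two-level dendrogram using the handle's stream.
void build_levels(raft::handle_t const& handle)
{
  cugraph::Dendrogram<int32_t> dendrogram;
  dendrogram.add_level(0, 1000, handle.get_stream());  // stream argument is now mandatory
  dendrogram.add_level(0, 100, handle.get_stream());
}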
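The two functors above are meant to agree: the GPU that compute_gpu_id_from_edge_t picks for an edge must own the matrix partition that compute_partition_id_from_edge_t assigns to it. A small host-side consistency check, assuming a stand-in hash h(v) = v in place of MurmurHash3_32 and a 2 x 3 process grid; the decode of a partition ID back to a GPU below is derived from the two formulas, not taken from any cuGraph API:

#include <cassert>
#include <cstdint>

// Host re-implementation of the functors' arithmetic with a stand-in hash (h(v) = v).
int gpu_id(int64_t major, int64_t minor, int comm_size, int row_comm_size)
{
  auto major_comm_rank = static_cast<int>(major % comm_size);
  auto minor_comm_rank = static_cast<int>(minor % comm_size);
  return (minor_comm_rank / row_comm_size) * row_comm_size + (major_comm_rank % row_comm_size);
}

int partition_id(int64_t major, int64_t minor, int comm_size, int row_comm_size, int col_comm_size)
{
  auto major_comm_rank = static_cast<int>(major % comm_size);
  auto minor_comm_rank = static_cast<int>(minor % comm_size);
  return major_comm_rank * col_comm_size + minor_comm_rank / row_comm_size;
}

int main()
{
  int const row_comm_size = 2;
  int const col_comm_size = 3;
  int const comm_size     = row_comm_size * col_comm_size;  // P = 6
  for (int64_t major = 0; major < 100; ++major) {
    for (int64_t minor = 0; minor < 100; ++minor) {
      auto g = gpu_id(major, minor, comm_size, row_comm_size);
      auto p = partition_id(major, minor, comm_size, row_comm_size, col_comm_size);
      // Partition IDs lie in [0, P * col_comm_size); partition p belongs to the GPU with
      // row_comm_rank = (p % P) / col_comm_size and col_comm_rank = p % col_comm_size.
      auto row_rank = (p % comm_size) / col_comm_size;
      auto col_rank = p % col_comm_size;
      assert(g == col_rank * row_comm_size + row_rank);
    }
  }
  return 0;
}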
diff --git a/cpp/include/experimental/graph.hpp b/cpp/include/experimental/graph.hpp
index 6a10256e6f4..a380200ea1f 100644
--- a/cpp/include/experimental/graph.hpp
+++ b/cpp/include/experimental/graph.hpp
@@ -188,6 +188,20 @@ template <typename edge_t>
 struct invalid_edge_id : invalid_idx<edge_t> {
 };

+template <typename vertex_t>
+__host__ __device__ std::enable_if_t<std::is_signed<vertex_t>::value, bool> is_valid_vertex(
+  vertex_t num_vertices, vertex_t v)
+{
+  return (v >= 0) && (v < num_vertices);
+}
+
+template <typename vertex_t>
+__host__ __device__ std::enable_if_t<std::is_unsigned<vertex_t>::value, bool> is_valid_vertex(
+  vertex_t num_vertices, vertex_t v)
+{
+  return v < num_vertices;
+}
+
 }  // namespace experimental
 }  // namespace cugraph
diff --git a/cpp/include/experimental/graph_functions.hpp b/cpp/include/experimental/graph_functions.hpp
index 7b4bb466b97..100742adccd 100644
--- a/cpp/include/experimental/graph_functions.hpp
+++ b/cpp/include/experimental/graph_functions.hpp
@@ -17,13 +17,13 @@
 #include
 #include
-#include
 #include
 #include
 #include
 #include
+#include <vector>

 namespace cugraph {
 namespace experimental {
@@ -40,19 +40,24 @@ namespace experimental {
  * or multi-GPU (true).
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
- * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as
+ * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as
  * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
- * IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t functor to
- * every (major, minor) pair should return the local GPU ID for this function to work (edges should
- * be pre-shuffled).
- * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
- * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
- * Vertex IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t
- * functor to every (major, minor) pair should return the local GPU ID for this function to work
- * (edges should be pre-shuffled).
- * @param num_edgelist_edges Number of edges in the edgelist.
- * @param is_hypergraph_partitioned Flag indicating whether we are assuming hypergraph partitioning
- * (this flag will be removed in the future).
+ * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix
+ * partition should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored
+ * as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs
+ * are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix
+ * partition should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition
+ * assigned to this process).
  * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
  * @return std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>
  * Quadruplet of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to
@@ -63,10 +68,9 @@ template <typename vertex_t, typename edge_t, bool multi_gpu>
 std::enable_if_t<multi_gpu,
                  std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>>
 renumber_edgelist(raft::handle_t const& handle,
-                  vertex_t* edgelist_major_vertices /* [INOUT] */,
-                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
-                  edge_t num_edgelist_edges,
-                  bool is_hypergraph_partitioned,
+                  std::vector<vertex_t*> const& edgelist_major_vertices /* [INOUT] */,
+                  std::vector<vertex_t*> const& edgelist_minor_vertices /* [INOUT] */,
+                  std::vector<edge_t> const& edgelist_edge_counts,
                   bool do_expensive_check = false);

 /**
@@ -115,19 +119,24 @@ std::enable_if_t<multi_gpu, ...> renumber_edgelist(
  * the compute_gpu_id_from_vertex_t to every vertex should return the local GPU ID for this function
  * to work (vertices should be pre-shuffled).
  * @param num_local_vertices Number of local vertices.
- * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as
+ * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as
  * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
- * IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t functor to
- * every (major, minor) pair should return the local GPU ID for this function to work (edges should
- * be pre-shuffled).
- * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
- * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
- * Vertex IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t
- * functor to every (major, minor) pair should return the local GPU ID for this function to work
- * (edges should be pre-shuffled).
- * @param num_edgelist_edges Number of edges in the edgelist.
- * @param is_hypergraph_partitioned Flag indicating whether we are assuming hypergraph partitioning
- * (this flag will be removed in the future).
+ * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix
+ * partition should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored
+ * as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs
+ * are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix
+ * partition should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition
+ * assigned to this process).
  * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
  * @return std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>
  * Quadruplet of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to
@@ -140,10 +149,9 @@ std::enable_if_t<multi_gpu, ...>
 renumber_edgelist(raft::handle_t const& handle,
                   vertex_t const* local_vertices,
                   vertex_t num_local_vertices,
-                  vertex_t* edgelist_major_vertices /* [INOUT] */,
-                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
-                  edge_t num_edgelist_edges,
-                  bool is_hypergraph_partitioned,
+                  std::vector<vertex_t*> const& edgelist_major_vertices /* [INOUT] */,
+                  std::vector<vertex_t*> const& edgelist_minor_vertices /* [INOUT] */,
+                  std::vector<edge_t> const& edgelist_edge_counts,
                   bool do_expensive_check = false);

 /**
@@ -181,6 +189,102 @@ std::enable_if_t<!multi_gpu, ...> renumber_edgelist(
   edge_t num_edgelist_edges,
   bool do_expensive_check = false);
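For contrast with the multi-GPU overloads, a sketch of driving the unchanged single-GPU overload whose declaration forms the context above. This assumes the `!multi_gpu` specialization returns the renumber map labels as an rmm::device_uvector (as suggested by the garbled hunk header); the wrapper name is made up:

#include <experimental/graph_functions.hpp>
#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>

// Hypothetical wrapper: srcs/dsts hold external vertex IDs on device and are
// overwritten in-place with internal IDs; the result maps internal -> external.
rmm::device_uvector<int32_t> renumber(raft::handle_t const& handle,
                                      rmm::device_uvector<int32_t>& srcs,
                                      rmm::device_uvector<int32_t>& dsts)
{
  return cugraph::experimental::renumber_edgelist<int32_t, int32_t, false>(
    handle, srcs.data(), dsts.data(), static_cast<int32_t>(srcs.size()));
}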
+/**
+ * @brief Renumber external vertices to internal vertices based on the provided @p
+ * renumber_map_labels.
+ *
+ * Note cugraph::experimental::invalid_id<vertex_t>::value remains unchanged.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the vertices to be renumbered. The input external vertices are
+ * renumbered to internal vertices in-place.
+ * @param num_vertices Number of vertices to be renumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename vertex_t, bool multi_gpu>
+void renumber_ext_vertices(raft::handle_t const& handle,
+                           vertex_t* vertices /* [INOUT] */,
+                           size_t num_vertices,
+                           vertex_t const* renumber_map_labels,
+                           vertex_t local_int_vertex_first,
+                           vertex_t local_int_vertex_last,
+                           bool do_expensive_check = false);
+
+/**
+ * @brief Unrenumber local internal vertices to external vertices based on the provided @p
+ * renumber_map_labels.
+ *
+ * Note cugraph::experimental::invalid_id<vertex_t>::value remains unchanged.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the local internal vertices to be unrenumbered. Each input element
+ * should be in [@p local_int_vertex_first, @p local_int_vertex_last). The input internal vertices
+ * are renumbered to external vertices in-place.
+ * @param num_vertices Number of vertices to be unrenumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename vertex_t>
+void unrenumber_local_int_vertices(
+  raft::handle_t const& handle,
+  vertex_t* vertices /* [INOUT] */,
+  size_t num_vertices,
+  vertex_t const* renumber_map_labels /* size = local_int_vertex_last - local_int_vertex_first */,
+  vertex_t local_int_vertex_first,
+  vertex_t local_int_vertex_last,
+  bool do_expensive_check = false);
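A sketch of the intended round trip on a single GPU, where the local internal range [local_int_vertex_first, local_int_vertex_last) spans all of [0, V); the function names are the ones declared above, everything else is illustrative:

#include <experimental/graph_functions.hpp>
#include <raft/handle.hpp>

// Hypothetical single-GPU round trip: external -> internal -> external should be
// the identity for valid vertices (invalid IDs pass through unchanged).
void round_trip(raft::handle_t const& handle,
                int32_t* verts /* device pointer holding external vertex IDs */,
                size_t num_verts,
                int32_t const* renumber_map_labels,
                int32_t num_vertices /* V */)
{
  cugraph::experimental::renumber_ext_vertices<int32_t, false>(
    handle, verts, num_verts, renumber_map_labels, 0, num_vertices);
  cugraph::experimental::unrenumber_local_int_vertices(
    handle, verts, num_verts, renumber_map_labels, 0, num_vertices);
}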
+
+/**
+ * @brief Unrenumber (possibly non-local) internal vertices to external vertices based on the
+ * provided @p renumber_map_labels.
+ *
+ * Note cugraph::experimental::invalid_id<vertex_t>::value remains unchanged.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the internal vertices to be unrenumbered. The input internal vertices
+ * are renumbered to external vertices in-place.
+ * @param num_vertices Number of vertices to be unrenumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param vertex_partition_lasts Last local internal vertices (exclusive, assigned to each process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename vertex_t, bool multi_gpu>
+void unrenumber_int_vertices(raft::handle_t const& handle,
+                             vertex_t* vertices /* [INOUT] */,
+                             size_t num_vertices,
+                             vertex_t const* renumber_map_labels,
+                             vertex_t local_int_vertex_first,
+                             vertex_t local_int_vertex_last,
+                             std::vector<vertex_t>& vertex_partition_lasts,
+                             bool do_expensive_check = false);
+
 /**
  * @brief Compute the coarsened graph.
  *
diff --git a/cpp/include/experimental/graph_view.hpp b/cpp/include/experimental/graph_view.hpp
index 5d3d09bb087..47c93b42ca9 100644
--- a/cpp/include/experimental/graph_view.hpp
+++ b/cpp/include/experimental/graph_view.hpp
@@ -40,32 +40,11 @@ namespace experimental {
  *
  * We need to partition 1D vertex arrays (storing per vertex values) and the 2D graph adjacency
  * matrix (or transposed 2D graph adjacency matrix) of G. An 1D vertex array of size V is divided to
- * P linear partitions; each partition has the size close to V / P. We consider two different
- * strategies to partition the 2D matrix: the default strategy and the hypergraph partitioning based
- * strategy (the latter is for future extension).
- * FIXME: in the future we may use the latter for both as this leads to simpler communication
- * patterns and better control over parallelism vs memory footprint trade-off.
+ * P linear partitions; each partition has the size close to V / P.
  *
- * In the default case, one GPU will be responsible for 1 rectangular partition. The matrix will be
- * horizontally partitioned first to P_row slabs. Each slab will be further vertically partitioned
- * to P_col rectangles. Each rectangular partition will have the size close to V / P_row by V /
- * P_col.
- *
- * To be more specific, a GPU with (col_comm_rank, row_comm_rank) will be responsible for one
- * rectangular partition [a,b) by [c,d) where a = vertex_partition_offsets[row_comm_size *
- * col_comm_rank], b = vertex_partition_offsets[row_comm_size * (col_comm_rank + 1)], c =
- * vertex_partition_offsets[col_comm_size * row_comm_rank], and d =
- * vertex_partition_offsets[col_comm_size * (row_comm_rank + 1)].
- *
- * In the future, we may apply hyper-graph partitioning to divide V vertices to P groups minimizing
- * edge cuts across groups while balancing the number of vertices in each group. We will also
- * renumber vertices so the vertices in each group are mapped to consecutive integers. Then, there
- * will be more non-zeros in the diagonal partitions of the 2D graph adjacency matrix (or the
- * transposed 2D graph adjacency matrix) than the off-diagonal partitions. The default strategy does
- * not balance the number of nonzeros if hyper-graph partitioning is applied. To solve this problem,
- * the matrix is first horizontally partitioned to P slabs, then each slab will be further
- * vertically partitioned to P_row (instead of P_col in the default case) rectangles. One GPU will
- * be responsible col_comm_size rectangular partitions in this case.
+ * The 2D graph adjacency matrix is first horizontally partitioned to P slabs, then each slab will
+ * be further vertically partitioned to P_row rectangles. One GPU will be responsible for
+ * col_comm_size rectangular partitions.
  *
  * To be more specific, a GPU with (col_comm_rank, row_comm_rank) will be responsible for
  * col_comm_size rectangular partitions [a_i,b_i) by [c,d) where a_i =
@@ -85,13 +64,11 @@ class partition_t {
   partition_t() = default;

   partition_t(std::vector<vertex_t> const& vertex_partition_offsets,
-              bool hypergraph_partitioned,
               int row_comm_size,
               int col_comm_size,
               int row_comm_rank,
               int col_comm_rank)
     : vertex_partition_offsets_(vertex_partition_offsets),
-      hypergraph_partitioned_(hypergraph_partitioned),
       comm_rank_(col_comm_rank * row_comm_size + row_comm_rank),
       row_comm_size_(row_comm_size),
       col_comm_size_(col_comm_size),
@@ -159,10 +136,7 @@ class partition_t {
            get_vertex_partition_first(vertex_partition_idx);
   }

-  size_t get_number_of_matrix_partitions() const
-  {
-    return hypergraph_partitioned_ ? col_comm_size_ : 1;
-  }
+  size_t get_number_of_matrix_partitions() const { return col_comm_size_; }

   // major: row of the graph adjacency matrix (if the graph adjacency matrix is stored as is) or
   // column of the graph adjacency matrix (if the transposed graph adjacency matrix is stored).
@@ -175,16 +149,18 @@ class partition_t {

   vertex_t get_matrix_partition_major_first(size_t partition_idx) const
   {
-    return hypergraph_partitioned_
-             ? vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_]
-             : vertex_partition_offsets_[col_comm_rank_ * row_comm_size_];
+    return vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_];
   }

   vertex_t get_matrix_partition_major_last(size_t partition_idx) const
   {
-    return hypergraph_partitioned_
-             ? vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_ + 1]
-             : vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_];
+    return vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_ + 1];
+  }
+
+  vertex_t get_matrix_partition_major_size(size_t partition_idx) const
+  {
+    return get_matrix_partition_major_last(partition_idx) -
+           get_matrix_partition_major_first(partition_idx);
   }

   vertex_t get_matrix_partition_major_value_start_offset(size_t partition_idx) const
@@ -204,24 +180,21 @@ class partition_t {

   vertex_t get_matrix_partition_minor_first() const
   {
-    return hypergraph_partitioned_ ? vertex_partition_offsets_[col_comm_rank_ * row_comm_size_]
-                                   : vertex_partition_offsets_[row_comm_rank_ * col_comm_size_];
+    return vertex_partition_offsets_[col_comm_rank_ * row_comm_size_];
   }

   vertex_t get_matrix_partition_minor_last() const
   {
-    return hypergraph_partitioned_
-             ? vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_]
-             : vertex_partition_offsets_[(row_comm_rank_ + 1) * col_comm_size_];
+    return vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_];
   }

-  // FIXME: this function may be removed if we use the same partitioning strategy whether hypergraph
-  // partitioning is applied or not
-  bool is_hypergraph_partitioned() const { return hypergraph_partitioned_; }
+  vertex_t get_matrix_partition_minor_size() const
+  {
+    return get_matrix_partition_minor_last() - get_matrix_partition_minor_first();
+  }

  private:
   std::vector<vertex_t> vertex_partition_offsets_{};  // size = P + 1
-  bool hypergraph_partitioned_{false};

   int comm_rank_{0};
   int row_comm_size_{0};
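As a sanity check of the arithmetic above, a host-only sketch with toy numbers (no cuGraph types): it prints the col_comm_size rectangles one GPU owns under this scheme, mirroring the get_matrix_partition_major_first/last and minor_first/last formulas.

#include <cstdio>
#include <vector>

int main()
{
  int const row_comm_size = 2;
  int const col_comm_size = 3;                       // P = 6 GPUs, 2 x 3 grid
  std::vector<int> offsets{0, 2, 4, 6, 8, 10, 12};   // vertex_partition_offsets, size P + 1

  int const row_comm_rank = 1;  // this GPU's coordinates in the grid
  int const col_comm_rank = 2;

  // Partition i spans majors [offsets[row_comm_size * i + row_comm_rank],
  //                           offsets[row_comm_size * i + row_comm_rank + 1]).
  for (int i = 0; i < col_comm_size; ++i) {
    std::printf("partition %d: majors [%d, %d)\n",
                i,
                offsets[row_comm_size * i + row_comm_rank],
                offsets[row_comm_size * i + row_comm_rank + 1]);
  }
  // All partitions share the minor range
  // [offsets[col_comm_rank * row_comm_size], offsets[(col_comm_rank + 1) * row_comm_size]).
  std::printf("minors [%d, %d)\n",
              offsets[col_comm_rank * row_comm_size],
              offsets[(col_comm_rank + 1) * row_comm_size]);
  return 0;
}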
@@ -236,6 +209,7 @@ class partition_t {
 struct graph_properties_t {
   bool is_symmetric{false};
   bool is_multigraph{false};
+  bool is_weighted{false};
 };

 namespace detail {
@@ -277,6 +251,7 @@ class graph_base_t {

   bool is_symmetric() const { return properties_.is_symmetric; }
   bool is_multigraph() const { return properties_.is_multigraph; }
+  bool is_weighted() const { return properties_.is_weighted; }

  protected:
   raft::handle_t const* get_handle_ptr() const { return handle_ptr_; };
@@ -334,11 +309,6 @@ class graph_view_t<vertex_t,
-  bool is_weighted() const { return adj_matrix_partition_weights_.size() > 0; }
-
-  // FIXME: this should be removed once MNMG Louvain is updated to use graph primitives
-  partition_t<vertex_t> get_partition() const { return partition_; }
-
   vertex_t get_number_of_local_vertices() const
   {
     return partition_.get_local_vertex_last() - partition_.get_local_vertex_first();
@@ -421,6 +391,12 @@ class graph_view_t<vertex_t,
   rmm::device_uvector<weight_t> compute_in_weight_sums(raft::handle_t const& handle) const;
   rmm::device_uvector<weight_t> compute_out_weight_sums(raft::handle_t const& handle) const;

+  edge_t compute_max_in_degree(raft::handle_t const& handle) const;
+  edge_t compute_max_out_degree(raft::handle_t const& handle) const;
+
+  weight_t compute_max_in_weight_sum(raft::handle_t const& handle) const;
+  weight_t compute_max_out_weight_sum(raft::handle_t const& handle) const;
+
  private:
   std::vector<edge_t const*> adj_matrix_partition_offsets_{};
   std::vector<vertex_t const*> adj_matrix_partition_indices_{};
@@ -549,8 +535,6 @@ class graph_view_t<vertex_t,
   vertex_t get_number_of_local_vertices() const { return this->get_number_of_vertices(); }

   constexpr vertex_t get_local_vertex_first() const { return vertex_t{0}; }
@@ -628,8 +612,6 @@ class graph_view_t<vertex_t,
   rmm::device_uvector<weight_t> compute_in_weight_sums(raft::handle_t const& handle) const;
   rmm::device_uvector<weight_t> compute_out_weight_sums(raft::handle_t const& handle) const;

+  edge_t compute_max_in_degree(raft::handle_t const& handle) const;
+  edge_t compute_max_out_degree(raft::handle_t const& handle) const;
+
+  weight_t compute_max_in_weight_sum(raft::handle_t const& handle) const;
+  weight_t compute_max_out_weight_sum(raft::handle_t const& handle) const;
+
  private:
   edge_t const* offsets_{nullptr};
   vertex_t const* indices_{nullptr};
diff --git a/cpp/include/matrix_partition_device.cuh b/cpp/include/matrix_partition_device.cuh
index b41119e7be6..30d6540bcfe 100644
--- a/cpp/include/matrix_partition_device.cuh
+++ b/cpp/include/matrix_partition_device.cuh
@@ -192,7 +192,7 @@ class matrix_partition_device_t
[...]
diff --git a/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh b/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh
[...]
-      std::vector<size_t> rx_counts(row_comm_size, size_t{0});
-      std::vector<size_t> displacements(row_comm_size, size_t{0});
-      for (int i = 0; i < row_comm_size; ++i) {
-        rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i);
-        displacements[i] = (i == 0) ?
0 : displacements[i - 1] + rx_counts[i - 1]; - } - device_allgatherv(row_comm, - vertex_value_input_first, - matrix_major_value_output_first, - rx_counts, - displacements, - handle.get_stream()); + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + std::vector rx_counts(col_comm_size, size_t{0}); + std::vector displacements(col_comm_size, size_t{0}); + for (int i = 0; i < col_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(i * row_comm_size + row_comm_rank); + displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; } + device_allgatherv(col_comm, + vertex_value_input_first, + matrix_major_value_output_first, + rx_counts, + displacements, + handle.get_stream()); } else { assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols() @@ -101,80 +97,78 @@ void copy_to_matrix_major(raft::handle_t const& handle, using vertex_t = typename GraphViewType::vertex_type; if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - auto rx_counts = - host_scalar_allgather(row_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - - matrix_partition_device_t matrix_partition(graph_view, 0); - for (int i = 0; i < row_comm_size; ++i) { - rmm::device_uvector rx_vertices(row_comm_rank == i ? size_t{0} : rx_counts[i], - handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_counts[i], - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(rx_tmp_buffer); + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + auto rx_counts = + host_scalar_allgather(col_comm, + static_cast(thrust::distance(vertex_first, vertex_last)), + handle.get_stream()); + + for (int i = 0; i < col_comm_size; ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); + + rmm::device_uvector rx_vertices(col_comm_rank == i ? 
size_t{0} : rx_counts[i], + handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(rx_counts[i], + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); - if (row_comm_rank == i) { - vertex_partition_device_t vertex_partition(graph_view); - auto map_first = - thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { - return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); - }); - // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a - // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_value_input_first, - rx_value_first); - } + if (col_comm_rank == i) { + vertex_partition_device_t vertex_partition(graph_view); + auto map_first = + thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { + return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); + }); + // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a + // permutation iterator (and directly gathers to the internal buffer) + thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + map_first, + map_first + thrust::distance(vertex_first, vertex_last), + vertex_value_input_first, + rx_value_first); + } - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - device_bcast( - row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); - device_bcast( - row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + device_bcast( + col_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(col_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - if (row_comm_rank == i) { - auto map_first = - thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { - return matrix_partition.get_major_offset_from_major_nocheck(v); - }); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_major_value_output_first); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_major_offset_from_major_nocheck(v); - }); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_major_value_output_first); - } + if (col_comm_rank == i) { + auto map_first = + thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { + return matrix_partition.get_major_offset_from_major_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the 
internal buffer) + thrust::scatter( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_major_value_output_first + matrix_partition.get_major_value_start_offset()); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), [matrix_partition] __device__(auto v) { + return matrix_partition.get_major_offset_from_major_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_major_value_output_first + matrix_partition.get_major_value_start_offset()); } } } else { @@ -199,59 +193,27 @@ void copy_to_matrix_minor(raft::handle_t const& handle, MatrixMinorValueOutputIterator matrix_minor_value_output_first) { if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph - // partitioning - auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; - auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - // FIXME: this branch may be no longer necessary with NCCL backend - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_value_input_first, - vertex_value_input_first + graph_view.get_number_of_local_vertices(), - matrix_minor_value_output_first + - (graph_view.get_vertex_partition_first(comm_src_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size))); - } else { - device_sendrecv( - comm, - vertex_value_input_first, - static_cast(graph_view.get_number_of_local_vertices()), - comm_dst_rank, - matrix_minor_value_output_first + - (graph_view.get_vertex_partition_first(comm_src_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)), - static_cast(graph_view.get_vertex_partition_size(comm_src_rank)), - comm_src_rank, - handle.get_stream()); - } - - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - for (int i = 0; i < col_comm_size; ++i) { - auto offset = graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size); - auto count = graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + i); - device_bcast(col_comm, - matrix_minor_value_output_first + offset, - matrix_minor_value_output_first + offset, - count, - i, - handle.get_stream()); - } + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = 
row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + std::vector rx_counts(row_comm_size, size_t{0}); + std::vector displacements(row_comm_size, size_t{0}); + for (int i = 0; i < row_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i); + displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; } + device_allgatherv(row_comm, + vertex_value_input_first, + matrix_minor_value_output_first, + rx_counts, + displacements, + handle.get_stream()); } else { assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows() @@ -277,143 +239,75 @@ void copy_to_matrix_minor(raft::handle_t const& handle, using vertex_t = typename GraphViewType::vertex_type; if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - // FIXME: this P2P is unnecessary if apply the same partitioning scheme regardless of - // hypergraph partitioning is applied or not - auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; - auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - size_t tx_count = thrust::distance(vertex_first, vertex_last); - size_t rx_count{}; - // FIXME: it seems like raft::isend and raft::irecv do not properly handle the destination (or - // source) == self case. Need to double check and fix this if this is indeed the case (or RAFT - // may use ncclSend/ncclRecv instead of UCX for device data). 
- if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - rx_count = tx_count; - } else { - std::vector count_requests(2); - comm.isend(&tx_count, 1, comm_dst_rank, 0 /* tag */, count_requests.data()); - comm.irecv(&rx_count, 1, comm_src_rank, 0 /* tag */, count_requests.data() + 1); - comm.waitall(count_requests.size(), count_requests.data()); - } - - vertex_partition_device_t vertex_partition(graph_view); - rmm::device_uvector dst_vertices(rx_count, handle.get_stream()); - auto dst_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_count, + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + auto rx_counts = + host_scalar_allgather(row_comm, + static_cast(thrust::distance(vertex_first, vertex_last)), + handle.get_stream()); + + matrix_partition_device_t matrix_partition(graph_view, 0); + for (int i = 0; i < row_comm_size; ++i) { + rmm::device_uvector rx_vertices(row_comm_rank == i ? size_t{0} : rx_counts[i], + handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(rx_counts[i], handle.get_stream()); - auto dst_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(dst_tmp_buffer); - if (comm_src_rank == comm_rank) { - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_first, - vertex_last, - dst_vertices.begin()); - auto map_first = - thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { - return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); - }); - thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_value_input_first, - dst_value_first); - } else { - auto src_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(tx_count, - handle.get_stream()); - auto src_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(src_tmp_buffer); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); + if (row_comm_rank == i) { + vertex_partition_device_t vertex_partition(graph_view); auto map_first = thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); }); + // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a + // permutation iterator (and directly gathers to the internal buffer) thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), map_first, map_first + thrust::distance(vertex_first, vertex_last), vertex_value_input_first, - src_value_first); - - device_sendrecv( - comm, - vertex_first, - tx_count, - comm_dst_rank, - dst_vertices.begin(), - rx_count, - comm_src_rank, - handle.get_stream()); - - device_sendrecv(comm, - src_value_first, - tx_count, - comm_dst_rank, - dst_value_first, - rx_count, - comm_src_rank, - handle.get_stream()); + rx_value_first); } - // FIXME: now we can clear 
tx_tmp_buffer - - auto rx_counts = host_scalar_allgather(col_comm, rx_count, handle.get_stream()); - - matrix_partition_device_t matrix_partition(graph_view, 0); - for (int i = 0; i < col_comm_size; ++i) { - rmm::device_uvector rx_vertices(col_comm_rank == i ? size_t{0} : rx_counts[i], - handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_counts[i], - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(rx_tmp_buffer); - - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - device_bcast(col_comm, - dst_vertices.begin(), - rx_vertices.begin(), - rx_counts[i], - i, - handle.get_stream()); - device_bcast( - col_comm, dst_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - - if (col_comm_rank == i) { - auto map_first = thrust::make_transform_iterator( - dst_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_minor_offset_from_minor_nocheck(v); - }); - - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - dst_value_first, - dst_value_first + rx_counts[i], - map_first, - matrix_minor_value_output_first); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_minor_offset_from_minor_nocheck(v); - }); + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + device_bcast( + row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_minor_value_output_first); - } + if (row_comm_rank == i) { + auto map_first = + thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { + return matrix_partition.get_minor_offset_from_minor_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_minor_value_output_first); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), [matrix_partition] __device__(auto v) { + return matrix_partition.get_minor_offset_from_minor_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_minor_value_output_first); } } } else { diff --git a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh index 3059cf95852..e6a73a874ae 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh @@ -362,16 +362,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - auto loop_count = size_t{1}; - if (GraphViewType::is_multi_gpu) { - 
auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } - auto comm_rank = handle.comms_initialized() ? handle.get_comms().get_rank() : int{0}; - auto minor_tmp_buffer_size = (GraphViewType::is_multi_gpu && (in != GraphViewType::is_adj_matrix_transposed)) ? GraphViewType::is_adj_matrix_transposed @@ -386,10 +376,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, if (GraphViewType::is_multi_gpu) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto const row_comm_rank = row_comm.get_rank(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - minor_init = graph_view.is_hypergraph_partitioned() ? (row_comm_rank == 0) ? init : T{} - : (col_comm_rank == 0) ? init : T{}; + minor_init = (row_comm_rank == 0) ? init : T{}; } if (GraphViewType::is_multi_gpu) { @@ -407,24 +394,13 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, assert(minor_tmp_buffer_size == 0); } - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); - auto major_tmp_buffer_size = vertex_t{0}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - - major_tmp_buffer_size = - (in == GraphViewType::is_adj_matrix_transposed) - ? graph_view.is_hypergraph_partitioned() - ? matrix_partition.get_major_size() - : graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i) - : vertex_t{0}; - } + auto major_tmp_buffer_size = + GraphViewType::is_multi_gpu && (in == GraphViewType::is_adj_matrix_transposed) + ? matrix_partition.get_major_size() + : vertex_t{0}; auto major_tmp_buffer = allocate_dataframe_buffer(major_tmp_buffer_size, handle.get_stream()); auto major_buffer_first = get_dataframe_buffer_begin(major_tmp_buffer); @@ -432,12 +408,9 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto major_init = T{}; if (in == GraphViewType::is_adj_matrix_transposed) { if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - major_init = graph_view.is_hypergraph_partitioned() ? (col_comm_rank == 0) ? init : T{} - : (row_comm_rank == 0) ? init : T{}; + major_init = (col_comm_rank == 0) ? init : T{}; } else { major_init = init; } @@ -450,8 +423,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const row_comm_size = row_comm.get_size(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - comm_root_rank = graph_view.is_hypergraph_partitioned() ? 
i * row_comm_size + row_comm_rank - : col_comm_rank * row_comm_size + i; + comm_root_rank = i * row_comm_size + row_comm_rank; } if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) { @@ -505,25 +477,13 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - if (graph_view.is_hypergraph_partitioned()) { - device_reduce( - col_comm, - major_buffer_first, - vertex_value_output_first, - static_cast(graph_view.get_vertex_partition_size(i * row_comm_size + i)), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } else { - device_reduce(row_comm, - major_buffer_first, - vertex_value_output_first, - static_cast( - graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } + device_reduce(col_comm, + major_buffer_first, + vertex_value_output_first, + matrix_partition.get_major_size(), + raft::comms::op_t::SUM, + i, + handle.get_stream()); } } @@ -537,53 +497,17 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - for (int i = 0; i < col_comm_size; ++i) { - auto offset = (graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)); - auto size = static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + i)); - device_reduce(col_comm, - minor_buffer_first + offset, - minor_buffer_first + offset, - size, - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } - - // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph - // partitioning - auto comm_src_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - auto comm_dst_rank = row_comm_rank * col_comm_size + col_comm_rank; - // FIXME: this branch may no longer necessary with NCCL backend - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - auto offset = - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + col_comm_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size); - auto size = static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + col_comm_rank)); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - minor_buffer_first + offset, - minor_buffer_first + offset + size, - vertex_value_output_first); - } else { - device_sendrecv( - comm, - minor_buffer_first + - (graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + col_comm_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)), - static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + col_comm_rank)), - comm_dst_rank, - vertex_value_output_first, - static_cast(graph_view.get_vertex_partition_size(comm_rank)), - comm_src_rank, - handle.get_stream()); - } + for (int i = 0; i < row_comm_size; ++i) { + auto offset = (graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size + i) - + graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size)); + device_reduce(row_comm, + minor_buffer_first + offset, + vertex_value_output_first, + static_cast( + graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)), + 
raft::comms::op_t::SUM, + i, + handle.get_stream()); } } } diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 19a5f67c9de..22dc2041793 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -170,8 +171,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( */ template ::value_type, + static_assert(std::is_same::value_type, typename GraphViewType::vertex_type>::value); + static_assert(std::is_same::value_type, + typename std::iterator_traits::value_type>::value); static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); using vertex_t = typename GraphViewType::vertex_type; @@ -206,64 +209,113 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( // 1. build a cuco::static_map object for the k, v pairs. auto kv_map_ptr = std::make_unique>( - static_cast(static_cast(thrust::distance(map_key_first, map_key_last)) / - load_factor), - invalid_vertex_id::value, - invalid_vertex_id::value); - auto pair_first = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)), - [] __device__(auto val) { - return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); - }); - kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); - - // 2. aggregate each vertex out-going edges based on keys and transform-reduce. - - auto loop_count = size_t{1}; + size_t{0}, invalid_vertex_id::value, invalid_vertex_id::value); if (GraphViewType::is_multi_gpu) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? 
graph_view.get_number_of_local_adj_matrix_partitions() : static_cast<size_t>(row_comm_size);
+
+    auto map_counts =
+      host_scalar_allgather(row_comm,
+                            static_cast<size_t>(thrust::distance(map_key_first, map_key_last)),
+                            handle.get_stream());
+    std::vector<size_t> map_displacements(row_comm_size, size_t{0});
+    std::partial_sum(map_counts.begin(), map_counts.end() - 1, map_displacements.begin() + 1);
+    rmm::device_uvector<vertex_t> map_keys(map_displacements.back() + map_counts.back(),
+                                           handle.get_stream());
+    auto map_value_buffer =
+      allocate_dataframe_buffer<value_t>(map_keys.size(), handle.get_stream());
+    for (int i = 0; i < row_comm_size; ++i) {
+      device_bcast(row_comm,
+                   map_key_first,
+                   map_keys.begin() + map_displacements[i],
+                   map_counts[i],
+                   i,
+                   handle.get_stream());
+      device_bcast(row_comm,
+                   map_value_first,
+                   get_dataframe_buffer_begin<value_t>(map_value_buffer) + map_displacements[i],
+                   map_counts[i],
+                   i,
+                   handle.get_stream());
+    }
+    // FIXME: these copies are unnecessary, better fix RAFT comm's bcast to take separate input &
+    // output pointers
+    thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                 map_key_first,
+                 map_key_last,
+                 map_keys.begin() + map_displacements[row_comm_rank]);
+    thrust::copy(
+      rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+      map_value_first,
+      map_value_first + thrust::distance(map_key_first, map_key_last),
+      get_dataframe_buffer_begin<value_t>(map_value_buffer) + map_displacements[row_comm_rank]);
+
+    handle.get_stream_view().synchronize();  // cuco::static_map currently does not take stream
+
+    kv_map_ptr.reset();
+
+    kv_map_ptr = std::make_unique<cuco::static_map<vertex_t, value_t>>(
+      // FIXME: std::max(..., ...) as a temporary workaround for
+      // https://github.com/NVIDIA/cuCollections/issues/72 and
+      // https://github.com/NVIDIA/cuCollections/issues/73
+      std::max(static_cast<size_t>(static_cast<double>(map_keys.size()) / load_factor),
+               static_cast<size_t>(thrust::distance(map_key_first, map_key_last)) + 1),
+      invalid_vertex_id<vertex_t>::value,
+      invalid_vertex_id<vertex_t>::value);
+
+    auto pair_first = thrust::make_transform_iterator(
+      thrust::make_zip_iterator(thrust::make_tuple(
+        map_keys.begin(), get_dataframe_buffer_begin<value_t>(map_value_buffer))),
+      [] __device__(auto val) {
+        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+      });
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (map_keys.size()) { kv_map_ptr->insert(pair_first, pair_first + map_keys.size()); }
+  } else {
+    handle.get_stream_view().synchronize();  // cuco::static_map currently does not take stream
+
+    kv_map_ptr.reset();
+
+    kv_map_ptr = std::make_unique<cuco::static_map<vertex_t, value_t>>(
+      // FIXME: std::max(..., ...) as a temporary workaround for
+      // https://github.com/NVIDIA/cuCollections/issues/72 and
+      // https://github.com/NVIDIA/cuCollections/issues/73
+      std::max(static_cast<size_t>(
+                 static_cast<double>(thrust::distance(map_key_first, map_key_last)) / load_factor),
+               static_cast<size_t>(thrust::distance(map_key_first, map_key_last)) + 1),
+      invalid_vertex_id<vertex_t>::value,
+      invalid_vertex_id<vertex_t>::value);
+
+    auto pair_first = thrust::make_transform_iterator(
+      thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)),
+      [] __device__(auto val) {
+        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+      });
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+ if (thrust::distance(map_key_first, map_key_last) > 0) { + kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + } } + // 2. aggregate each vertex out-going edges based on keys and transform-reduce. + rmm::device_uvector major_vertices(0, handle.get_stream()); auto e_op_result_buffer = allocate_dataframe_buffer(0, handle.get_stream()); - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); - - int comm_root_rank = 0; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - comm_root_rank = graph_view.is_hypergraph_partitioned() ? i * row_comm_size + row_comm_rank - : col_comm_rank * row_comm_size + i; - } + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); - auto num_edges = thrust::transform_reduce( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - thrust::make_counting_iterator(graph_view.get_vertex_partition_first(comm_root_rank)), - thrust::make_counting_iterator(graph_view.get_vertex_partition_last(comm_root_rank)), - [matrix_partition] __device__(auto row) { - auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); - return matrix_partition.get_local_degree(row_offset); - }, - edge_t{0}, - thrust::plus()); - - rmm::device_uvector tmp_major_vertices(num_edges, handle.get_stream()); + rmm::device_uvector tmp_major_vertices(matrix_partition.get_number_of_edges(), + handle.get_stream()); rmm::device_uvector tmp_minor_keys(tmp_major_vertices.size(), handle.get_stream()); rmm::device_uvector tmp_key_aggregated_edge_weights(tmp_major_vertices.size(), handle.get_stream()); - if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) { + if (matrix_partition.get_major_size() > 0) { raft::grid_1d_thread_t update_grid( - graph_view.get_vertex_partition_size(comm_root_rank), + matrix_partition.get_major_size(), detail::copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -277,8 +329,8 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( 0, handle.get_stream()>>>( matrix_partition, - graph_view.get_vertex_partition_first(comm_root_rank), - graph_view.get_vertex_partition_last(comm_root_rank), + matrix_partition.get_major_first(), + matrix_partition.get_major_last(), adj_matrix_col_key_first, tmp_major_vertices.data(), tmp_minor_keys.data(), @@ -300,10 +352,14 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( tmp_key_aggregated_edge_weights.resize(tmp_major_vertices.size(), handle.get_stream()); if (GraphViewType::is_multi_gpu) { - auto& sub_comm = handle.get_subcomm(graph_view.is_hypergraph_partitioned() - ? 
cugraph::partition_2d::key_naming_t().col_name() - : cugraph::partition_2d::key_naming_t().row_name()); - auto const sub_comm_size = sub_comm.get_size(); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); triplet_first = thrust::make_zip_iterator(thrust::make_tuple(tmp_major_vertices.begin(), @@ -315,11 +371,13 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( std::forward_as_tuple( std::tie(rx_major_vertices, rx_minor_keys, rx_key_aggregated_edge_weights), std::ignore) = groupby_gpuid_and_shuffle_values( - sub_comm, + col_comm, triplet_first, triplet_first + tmp_major_vertices.size(), - [key_func = detail::compute_gpu_id_from_vertex_t{sub_comm_size}] __device__( - auto val) { return key_func(thrust::get<1>(val)); }, + [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}, + row_comm_size] __device__(auto val) { + return key_func(thrust::get<1>(val)) / row_comm_size; + }, handle.get_stream()); auto pair_first = thrust::make_zip_iterator( @@ -355,56 +413,52 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( triplet_first = thrust::make_zip_iterator(thrust::make_tuple( tmp_major_vertices.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_weights.begin())); - thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - triplet_first, - triplet_first + tmp_major_vertices.size(), - tmp_e_op_result_buffer_first, - [adj_matrix_row_value_input_first, - key_aggregated_e_op, - matrix_partition, - kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { - auto major = thrust::get<0>(val); - auto key = thrust::get<1>(val); - auto w = thrust::get<2>(val); - return key_aggregated_e_op( - major, - key, - w, - *(adj_matrix_row_value_input_first + - matrix_partition.get_major_offset_from_major_nocheck(major)), - kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); - }); + thrust::transform( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + triplet_first, + triplet_first + tmp_major_vertices.size(), + tmp_e_op_result_buffer_first, + [adj_matrix_row_value_input_first = + adj_matrix_row_value_input_first + matrix_partition.get_major_value_start_offset(), + key_aggregated_e_op, + matrix_partition, + kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { + auto major = thrust::get<0>(val); + auto key = thrust::get<1>(val); + auto w = thrust::get<2>(val); + return key_aggregated_e_op(major, + key, + w, + *(adj_matrix_row_value_input_first + + matrix_partition.get_major_offset_from_major_nocheck(major)), + kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); + }); tmp_minor_keys.resize(0, handle.get_stream()); tmp_key_aggregated_edge_weights.resize(0, handle.get_stream()); tmp_minor_keys.shrink_to_fit(handle.get_stream()); tmp_key_aggregated_edge_weights.shrink_to_fit(handle.get_stream()); if (GraphViewType::is_multi_gpu) { - auto& sub_comm = handle.get_subcomm(graph_view.is_hypergraph_partitioned() - ? 
cugraph::partition_2d::key_naming_t().col_name() - : cugraph::partition_2d::key_naming_t().row_name()); - auto const sub_comm_rank = sub_comm.get_rank(); - auto const sub_comm_size = sub_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); // FIXME: additional optimization is possible if reduce_op is a pure function (and reduce_op // can be mapped to ncclRedOp_t). auto rx_sizes = - host_scalar_gather(sub_comm, tmp_major_vertices.size(), i, handle.get_stream()); - std::vector rx_displs( - static_cast(sub_comm_rank) == i ? sub_comm_size : int{0}, size_t{0}); - if (static_cast(sub_comm_rank) == i) { + host_scalar_gather(col_comm, tmp_major_vertices.size(), i, handle.get_stream()); + std::vector rx_displs{}; + rmm::device_uvector rx_major_vertices(0, handle.get_stream()); + if (static_cast(col_comm_rank) == i) { + rx_displs.assign(col_comm_size, size_t{0}); std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); + rx_major_vertices.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); } - rmm::device_uvector rx_major_vertices( - static_cast(sub_comm_rank) == i - ? std::accumulate(rx_sizes.begin(), rx_sizes.end(), size_t{0}) - : size_t{0}, - handle.get_stream()); auto rx_tmp_e_op_result_buffer = allocate_dataframe_buffer(rx_major_vertices.size(), handle.get_stream()); - device_gatherv(sub_comm, + device_gatherv(col_comm, tmp_major_vertices.data(), rx_major_vertices.data(), tmp_major_vertices.size(), @@ -412,7 +466,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( rx_displs, i, handle.get_stream()); - device_gatherv(sub_comm, + device_gatherv(col_comm, tmp_e_op_result_buffer_first, get_dataframe_buffer_begin(rx_tmp_e_op_result_buffer), tmp_major_vertices.size(), @@ -421,7 +475,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( i, handle.get_stream()); - if (static_cast(sub_comm_rank) == i) { + if (static_cast(col_comm_rank) == i) { major_vertices = std::move(rx_major_vertices); e_op_result_buffer = std::move(rx_tmp_e_op_result_buffer); } diff --git a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh index e621ed91ddb..34721c75e31 100644 --- a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -179,20 +179,10 @@ transform_reduce_by_adj_matrix_row_col_key_e( using edge_t = typename GraphViewType::edge_type; using weight_t = typename GraphViewType::weight_type; - auto loop_count = size_t{1}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } - rmm::device_uvector keys(0, handle.get_stream()); auto value_buffer = allocate_dataframe_buffer(0, handle.get_stream()); - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 
0 : i); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); int comm_root_rank = 0; if (GraphViewType::is_multi_gpu) { @@ -201,8 +191,7 @@ transform_reduce_by_adj_matrix_row_col_key_e( auto const row_comm_size = row_comm.get_size(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - comm_root_rank = graph_view.is_hypergraph_partitioned() ? i * row_comm_size + row_comm_rank - : col_comm_rank * row_comm_size + i; + comm_root_rank = i * row_comm_size + row_comm_rank; } auto num_edges = thrust::transform_reduce( @@ -224,6 +213,13 @@ transform_reduce_by_adj_matrix_row_col_key_e( detail::transform_reduce_by_key_e_for_all_block_size, handle.get_device_properties().maxGridSize[0]); + auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? vertex_t{0} + : matrix_partition.get_major_value_start_offset(); + auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_value_start_offset() + : vertex_t{0}; + // FIXME: This is highly inefficient for graphs with high-degree vertices. If we renumber // vertices to insure that rows within a partition are sorted by their out-degree in // decreasing order, we will apply this kernel only to low out-degree vertices. @@ -232,9 +228,10 @@ transform_reduce_by_adj_matrix_row_col_key_e( matrix_partition, graph_view.get_vertex_partition_first(comm_root_rank), graph_view.get_vertex_partition_last(comm_root_rank), - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_row_col_key_first, + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + adj_matrix_row_col_key_first + + (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); diff --git a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh index 4efd32bcac7..4d557b97a30 100644 --- a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh +++ b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh @@ -25,12 +25,14 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -115,12 +117,10 @@ __global__ void for_all_frontier_row_for_all_nbr_low_degree( static_assert(sizeof(unsigned long long int) == sizeof(size_t)); auto buffer_idx = atomicAdd(reinterpret_cast(buffer_idx_ptr), static_cast(1)); - *(buffer_key_output_first + buffer_idx) = col; - *(buffer_payload_output_first + buffer_idx) = - remove_first_thrust_tuple_element()(e_op_result); + *(buffer_key_output_first + buffer_idx) = col; + *(buffer_payload_output_first + buffer_idx) = thrust::get<1>(e_op_result); } } - idx += gridDim.x * blockDim.x; } } @@ -155,8 +155,8 @@ size_t reduce_buffer_elements(raft::handle_t const& handle, // temporary buffer size exceeds the maximum buffer size (may be definied as percentage of the // system HBM size or a function of the maximum number of threads in the system)) // FIXME: actually, we can find how many unique keys are here by now. - // FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding the - // vertex unless reduce_op is a pure function. 
+ // FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding + // the vertex unless reduce_op is a pure function. rmm::device_uvector keys(num_buffer_elements, handle.get_stream()); auto value_buffer = allocate_dataframe_buffer(num_buffer_elements, handle.get_stream()); @@ -234,8 +234,7 @@ __global__ void update_frontier_and_vertex_output_values( auto v_op_result = v_op(v_val, payload); selected_bucket_idx = thrust::get<0>(v_op_result); if (selected_bucket_idx != invalid_bucket_idx) { - *(vertex_value_output_first + key_offset) = - remove_first_thrust_tuple_element()(v_op_result); + *(vertex_value_output_first + key_offset) = thrust::get<1>(v_op_result); bucket_block_local_offsets[selected_bucket_idx] = 1; } } @@ -349,25 +348,18 @@ void update_frontier_v_push_if_out_nbr( static_assert(!GraphViewType::is_adj_matrix_transposed, "GraphViewType should support the push model."); - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using payload_t = typename ReduceOp::type; // 1. fill the buffer - vertex_frontier.set_buffer_idx_value(0); - - auto loop_count = size_t{1}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } - - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); + rmm::device_uvector keys(size_t{0}, handle.get_stream()); + auto payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + rmm::device_scalar buffer_idx(size_t{0}, handle.get_stream()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); rmm::device_uvector frontier_rows( 0, handle.get_stream()); // relevant only if GraphViewType::is_multi_gpu is true @@ -380,22 +372,18 @@ void update_frontier_v_push_if_out_nbr( auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - auto sub_comm_rank = graph_view.is_hypergraph_partitioned() ? col_comm_rank : row_comm_rank; - frontier_size = host_scalar_bcast( - graph_view.is_hypergraph_partitioned() ? col_comm : row_comm, - (static_cast(sub_comm_rank) == i) ? thrust::distance(vertex_first, vertex_last) - : size_t{0}, - i, - handle.get_stream()); + auto sub_comm_rank = col_comm_rank; + frontier_size = host_scalar_bcast(col_comm, + (static_cast(sub_comm_rank) == i) + ? thrust::distance(vertex_first, vertex_last) + : size_t{0}, + i, + handle.get_stream()); if (static_cast(sub_comm_rank) != i) { frontier_rows.resize(frontier_size, handle.get_stream()); } - device_bcast(graph_view.is_hypergraph_partitioned() ? 
col_comm : row_comm, - vertex_first, - frontier_rows.begin(), - frontier_size, - i, - handle.get_stream()); + device_bcast( + col_comm, vertex_first, frontier_rows.begin(), frontier_size, i, handle.get_stream()); } else { frontier_size = thrust::distance(vertex_first, vertex_last); } @@ -439,10 +427,8 @@ void update_frontier_v_push_if_out_nbr( // locking. // FIXME: if i != 0, this will require costly reallocation if we don't use the new CUDA feature // to reserve address space. - vertex_frontier.resize_buffer(vertex_frontier.get_buffer_idx_value() + max_pushes); - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first); - auto buffer_payload_first = std::get<1>(buffer_first); + keys.resize(buffer_idx.value(handle.get_stream()) + max_pushes, handle.get_stream()); + resize_dataframe_buffer(payload_buffer, keys.size(), handle.get_stream()); auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed ? vertex_t{0} @@ -467,9 +453,9 @@ void update_frontier_v_push_if_out_nbr( frontier_rows.end(), adj_matrix_row_value_input_first + row_value_input_offset, adj_matrix_col_value_input_first, - buffer_key_first, - buffer_payload_first, - vertex_frontier.get_buffer_idx_ptr(), + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + buffer_idx.data(), e_op); } else { detail::for_all_frontier_row_for_all_nbr_low_degree<<(payload_buffer), + buffer_idx.data(), e_op); } } @@ -491,18 +477,12 @@ void update_frontier_v_push_if_out_nbr( // 2. reduce the buffer - auto num_buffer_offset = edge_t{0}; - - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - - auto num_buffer_elements = detail::reduce_buffer_elements(handle, - buffer_key_first, - buffer_payload_first, - vertex_frontier.get_buffer_idx_value(), - reduce_op); - + auto num_buffer_elements = + detail::reduce_buffer_elements(handle, + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + buffer_idx.value(handle.get_stream()), + reduce_op); if (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); @@ -513,12 +493,9 @@ void update_frontier_v_push_if_out_nbr( auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - std::vector h_vertex_lasts(graph_view.is_hypergraph_partitioned() ? row_comm_size - : col_comm_size); + std::vector h_vertex_lasts(row_comm_size); for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { - h_vertex_lasts[i] = graph_view.get_vertex_partition_last( - graph_view.is_hypergraph_partitioned() ? 
col_comm_rank * row_comm_size + i - : row_comm_rank * col_comm_size + i); + h_vertex_lasts[i] = graph_view.get_vertex_partition_last(col_comm_rank * row_comm_size + i); } rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); @@ -527,8 +504,8 @@ void update_frontier_v_push_if_out_nbr( rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), handle.get_stream()); thrust::lower_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - buffer_key_first, - buffer_key_first + num_buffer_elements, + keys.begin(), + keys.begin() + num_buffer_elements, d_vertex_lasts.begin(), d_vertex_lasts.end(), d_tx_buffer_last_boundaries.begin()); @@ -537,122 +514,35 @@ void update_frontier_v_push_if_out_nbr( d_tx_buffer_last_boundaries.data(), d_tx_buffer_last_boundaries.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + handle.get_stream_view().synchronize(); std::vector tx_counts(h_tx_buffer_last_boundaries.size()); std::adjacent_difference( h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); - std::vector rx_counts(graph_view.is_hypergraph_partitioned() ? row_comm_size - : col_comm_size); - std::vector count_requests(tx_counts.size() + rx_counts.size()); - size_t tx_self_i = std::numeric_limits::max(); - for (size_t i = 0; i < tx_counts.size(); ++i) { - auto comm_dst_rank = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : row_comm_rank * col_comm_size + static_cast(i); - if (comm_dst_rank == comm_rank) { - tx_self_i = i; - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms - count_requests[i] = std::numeric_limits::max(); - } else { - comm.isend(&tx_counts[i], 1, comm_dst_rank, 0 /* tag */, count_requests.data() + i); - } - } - for (size_t i = 0; i < rx_counts.size(); ++i) { - auto comm_src_rank = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : static_cast(i) * row_comm_size + comm_rank / col_comm_size; - if (comm_src_rank == comm_rank) { - assert(tx_self_i != std::numeric_limits::max()); - rx_counts[i] = tx_counts[tx_self_i]; - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms - count_requests[tx_counts.size() + i] = std::numeric_limits::max(); - } else { - comm.irecv(&rx_counts[i], - 1, - comm_src_rank, - 0 /* tag */, - count_requests.data() + tx_counts.size() + i); - } - } - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms, if - // raft::comms::wait immediately returns on seeing request_null, this remove is unnecessary - count_requests.erase(std::remove(count_requests.begin(), - count_requests.end(), - std::numeric_limits::max()), - count_requests.end()); - comm.waitall(count_requests.size(), count_requests.data()); - - std::vector tx_offsets(tx_counts.size() + 1, edge_t{0}); - std::partial_sum(tx_counts.begin(), tx_counts.end(), tx_offsets.begin() + 1); - std::vector rx_offsets(rx_counts.size() + 1, edge_t{0}); - std::partial_sum(rx_counts.begin(), rx_counts.end(), rx_offsets.begin() + 1); - - // FIXME: this will require costly reallocation if we don't use the new CUDA feature to reserve - // address space. 
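// NOTE: the hand-rolled count exchange removed here (isend/irecv of per-rank counts, manual
// tx/rx offset bookkeeping, then device_multicast_sendrecv) is superseded by shuffle_values(),
// which exchanges the per-rank counts internally and returns the received buffer together with
// the rx counts. A sketch of the calling pattern, mirroring the replacement code further down
// in this hunk:
//
//   // tx_counts: one send count per rank of row_comm, derived from the lower_bound boundaries
//   rmm::device_uvector<vertex_t> rx_keys(size_t{0}, handle.get_stream());
//   std::tie(rx_keys, std::ignore) =
//     shuffle_values(row_comm, keys.begin(), tx_counts, handle.get_stream());
//   keys = std::move(rx_keys);
//
// leaving only the tx_counts computation on the host side.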
- // FIXME: std::max(actual size, 1) as ncclRecv currently hangs if recvuff is nullptr even if - // count is 0 - vertex_frontier.resize_buffer(std::max(num_buffer_elements + rx_offsets.back(), size_t(1))); - - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - - std::vector tx_dst_ranks(tx_counts.size()); - std::vector rx_src_ranks(rx_counts.size()); - for (size_t i = 0; i < tx_dst_ranks.size(); ++i) { - tx_dst_ranks[i] = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : row_comm_rank * col_comm_size + static_cast(i); - } - for (size_t i = 0; i < rx_src_ranks.size(); ++i) { - rx_src_ranks[i] = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : static_cast(i) * row_comm_size + comm_rank / col_comm_size; - } - - device_multicast_sendrecv( - comm, - buffer_key_first, - tx_counts, - tx_offsets, - tx_dst_ranks, - buffer_key_first + num_buffer_elements, - rx_counts, - rx_offsets, - rx_src_ranks, - handle.get_stream()); - device_multicast_sendrecv( - comm, - buffer_payload_first, - tx_counts, - tx_offsets, - tx_dst_ranks, - buffer_payload_first + num_buffer_elements, - rx_counts, - rx_offsets, - rx_src_ranks, - handle.get_stream()); - - // FIXME: this does not exploit the fact that each segment is sorted. Lost performance - // optimization opportunities. - // FIXME: we can use [vertex_frontier.buffer_begin(), vertex_frontier.buffer_begin() + - // num_buffer_elements) as temporary buffer inside reduce_buffer_elements(). - num_buffer_offset = num_buffer_elements; - num_buffer_elements = detail::reduce_buffer_elements(handle, - buffer_key_first + num_buffer_elements, - buffer_payload_first + num_buffer_elements, - rx_offsets.back(), - reduce_op); + rmm::device_uvector rx_keys(size_t{0}, handle.get_stream()); + std::tie(rx_keys, std::ignore) = + shuffle_values(row_comm, keys.begin(), tx_counts, handle.get_stream()); + keys = std::move(rx_keys); + + auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, std::ignore) = + shuffle_values(row_comm, + get_dataframe_buffer_begin(payload_buffer), + tx_counts, + handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + + num_buffer_elements = + detail::reduce_buffer_elements(handle, + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + keys.size(), + reduce_op); } // 3. 
update vertex properties if (num_buffer_elements > 0) { - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - raft::grid_1d_thread_t update_grid(num_buffer_elements, detail::update_frontier_v_push_if_out_nbr_update_block_size, handle.get_device_properties().maxGridSize[0]); @@ -666,8 +556,8 @@ void update_frontier_v_push_if_out_nbr( detail::update_frontier_and_vertex_output_values <<>>( vertex_partition, - buffer_key_first, - buffer_payload_first, + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), num_buffer_elements, vertex_value_input_first, vertex_value_output_first, @@ -690,21 +580,5 @@ void update_frontier_v_push_if_out_nbr( } } -/* - -FIXME: - -iterating over lower triangular (or upper triangular) : triangle counting -LRB might be necessary if the cost of processing an edge (i, j) is a function of degree(i) and -degree(j) : triangle counting -push-pull switching support (e.g. DOBFS), in this case, we need both -CSR & CSC (trade-off execution time vs memory requirement, unless graph is symmetric) -if graph is symmetric, there will be additional optimization opportunities (e.g. in-degree == -out-degree) For BFS, sending a bit vector (for the entire set of dest vertices per partitoin may -work better we can use thrust::set_intersection for triangle counting think about adding thrust -wrappers for reduction functions. Can I pass nullptr for dummy -instead of thrust::make_counting_iterator(0)? -*/ - } // namespace experimental } // namespace cugraph diff --git a/cpp/include/patterns/vertex_frontier.cuh b/cpp/include/patterns/vertex_frontier.cuh index c11142d3cf7..375ec097850 100644 --- a/cpp/include/patterns/vertex_frontier.cuh +++ b/cpp/include/patterns/vertex_frontier.cuh @@ -48,26 +48,6 @@ inline size_t round_up(size_t number_to_round, size_t modulus) return ((number_to_round + (modulus - 1)) / modulus) * modulus; } -template -auto make_buffer_zip_iterator_impl(std::vector& buffer_ptrs, - size_t offset, - std::index_sequence) -{ - auto key_ptr = reinterpret_cast(buffer_ptrs[0]) + offset; - auto payload_it = thrust::make_zip_iterator( - thrust::make_tuple(reinterpret_cast::type*>( - buffer_ptrs[1 + Is])...)); - return std::make_tuple(key_ptr, payload_it); -} - -template -auto make_buffer_zip_iterator(std::vector& buffer_ptrs, size_t offset) -{ - size_t constexpr tuple_size = thrust::tuple_size::value; - return make_buffer_zip_iterator_impl( - buffer_ptrs, offset, std::make_index_sequence()); -} - template __global__ void move_and_invalidate_if(RowIterator row_first, RowIterator row_last, @@ -199,10 +179,7 @@ class Bucket { size_t size_{0}; }; -template +template class VertexFrontier { public: static size_t constexpr kNumBuckets = num_buckets; @@ -211,9 +188,7 @@ class VertexFrontier { VertexFrontier(raft::handle_t const& handle, std::vector bucket_capacities) : handle_ptr_(&handle), tmp_bucket_ptrs_(num_buckets, handle.get_stream()), - tmp_bucket_sizes_(num_buckets, handle.get_stream()), - buffer_ptrs_(kReduceInputTupleSize + 1 /* to store destination column number */, nullptr), - buffer_idx_(0, handle_ptr_->get_stream()) + tmp_bucket_sizes_(num_buckets, handle.get_stream()) { CUGRAPH_EXPECTS(bucket_capacities.size() == num_buckets, "invalid input argument bucket_capacities (size mismatch)"); @@ -228,7 +203,6 @@ class VertexFrontier { for (size_t i = 0; i < num_buckets; ++i) { buckets_.emplace_back(handle, 
bucket_capacities[i]); } - buffer_.set_stream(handle_ptr_->get_stream()); } Bucket& get_bucket(size_t bucket_idx) { return buckets_[bucket_idx]; } @@ -311,90 +285,11 @@ class VertexFrontier { return std::make_tuple(tmp_bucket_ptrs_.data(), tmp_bucket_sizes_.data()); } - void resize_buffer(size_t size) - { - // FIXME: rmm::device_buffer resize incurs copy if memory is reallocated, which is unnecessary - // in this case. - buffer_.resize(compute_aggregate_buffer_size_in_bytes(size), handle_ptr_->get_stream()); - if (size > buffer_capacity_) { - buffer_capacity_ = size; - update_buffer_ptrs(); - } - buffer_size_ = size; - } - - void clear_buffer() { resize_buffer(0); } - - void shrink_to_fit_buffer() - { - if (buffer_size_ != buffer_capacity_) { - // FIXME: rmm::device_buffer shrink_to_fit incurs copy if memory is reallocated, which is - // unnecessary in this case. - buffer_.shrink_to_fit(handle_ptr_->get_stream()); - update_buffer_ptrs(); - buffer_capacity_ = buffer_size_; - } - } - - auto buffer_begin() - { - return detail::make_buffer_zip_iterator(buffer_ptrs_, 0); - } - - auto buffer_end() - { - return detail::make_buffer_zip_iterator(buffer_ptrs_, - buffer_size_); - } - - auto get_buffer_idx_ptr() { return buffer_idx_.data(); } - - size_t get_buffer_idx_value() { return buffer_idx_.value(handle_ptr_->get_stream()); } - - void set_buffer_idx_value(size_t value) - { - buffer_idx_.set_value(value, handle_ptr_->get_stream()); - } - private: - static size_t constexpr kReduceInputTupleSize = thrust::tuple_size::value; - static size_t constexpr kBufferAlignment = 128; - raft::handle_t const* handle_ptr_{nullptr}; std::vector> buckets_{}; rmm::device_uvector tmp_bucket_ptrs_; rmm::device_uvector tmp_bucket_sizes_; - - std::array tuple_element_sizes_ = - compute_thrust_tuple_element_sizes()(); - std::vector buffer_ptrs_{}; - rmm::device_buffer buffer_{}; - size_t buffer_size_{0}; - size_t buffer_capacity_{0}; - rmm::device_scalar buffer_idx_{}; - - // FIXME: better pick between this apporach or the approach used in allocate_comm_buffer - size_t compute_aggregate_buffer_size_in_bytes(size_t size) - { - size_t aggregate_buffer_size_in_bytes = - detail::round_up(sizeof(vertex_t) * size, kBufferAlignment); - for (size_t i = 0; i < kReduceInputTupleSize; ++i) { - aggregate_buffer_size_in_bytes += - detail::round_up(tuple_element_sizes_[i] * size, kBufferAlignment); - } - return aggregate_buffer_size_in_bytes; - } - - void update_buffer_ptrs() - { - uintptr_t ptr = reinterpret_cast(buffer_.data()); - buffer_ptrs_[0] = reinterpret_cast(ptr); - ptr += detail::round_up(sizeof(vertex_t) * buffer_capacity_, kBufferAlignment); - for (size_t i = 0; i < kReduceInputTupleSize; ++i) { - buffer_ptrs_[1 + i] = reinterpret_cast(ptr); - ptr += detail::round_up(tuple_element_sizes_[i] * buffer_capacity_, kBufferAlignment); - } - } }; } // namespace experimental diff --git a/cpp/include/utilities/collect_comm.cuh b/cpp/include/utilities/collect_comm.cuh index 8d2227c0f60..481717d7c38 100644 --- a/cpp/include/utilities/collect_comm.cuh +++ b/cpp/include/utilities/collect_comm.cuh @@ -58,13 +58,18 @@ collect_values_for_keys(raft::comms::comms_t const &comm, double constexpr load_factor = 0.7; // FIXME: we may compare the performance & memory footprint of this hash based approach vs binary - // search based approach + // search based approach (especially when thrust::distance(collect_key_first, collect_key_last) << + // thrust::distance(map_key_first, map_key_last) // 1. 
build a cuco::static_map object for the map k, v pairs. auto kv_map_ptr = std::make_unique>( - static_cast(static_cast(thrust::distance(map_key_first, map_key_last)) / - load_factor), + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast( + static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), + static_cast(thrust::distance(map_key_first, map_key_last)) + 1), invalid_vertex_id::value, invalid_vertex_id::value); { @@ -73,7 +78,11 @@ collect_values_for_keys(raft::comms::comms_t const &comm, [] __device__(auto val) { return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); - kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (thrust::distance(map_key_first, map_key_last) > 0) { + kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + } } // 2. collect values for the unique keys in [collect_key_first, collect_key_last) @@ -82,9 +91,6 @@ collect_values_for_keys(raft::comms::comms_t const &comm, stream); thrust::copy( rmm::exec_policy(stream)->on(stream), collect_key_first, collect_key_last, unique_keys.begin()); - // FIXME: sort and unique are unnecessary if the keys in [collect_key_first, collect_key_last) are - // already unique, if this cost becomes a performance bottlenec, we may add - // collect_values_for_unique_keys in the future thrust::sort(rmm::exec_policy(stream)->on(stream), unique_keys.begin(), unique_keys.end()); unique_keys.resize( thrust::distance( @@ -107,8 +113,12 @@ collect_values_for_keys(raft::comms::comms_t const &comm, CUDA_TRY(cudaStreamSynchronize(stream)); // cuco::static_map currently does not take stream - kv_map_ptr->find( - rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin()); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (rx_unique_keys.size() > 0) { + kv_map_ptr->find( + rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin()); + } rmm::device_uvector rx_values_for_unique_keys(0, stream); std::tie(rx_values_for_unique_keys, std::ignore) = @@ -125,7 +135,11 @@ collect_values_for_keys(raft::comms::comms_t const &comm, kv_map_ptr.reset(); kv_map_ptr = std::make_unique>( - static_cast(static_cast(unique_keys.size()) / load_factor), + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(unique_keys.size()) / load_factor), + unique_keys.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value); { @@ -136,15 +150,154 @@ collect_values_for_keys(raft::comms::comms_t const &comm, return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); - kv_map_ptr->insert(pair_first, pair_first + unique_keys.size()); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (unique_keys.size() > 0) { kv_map_ptr->insert(pair_first, pair_first + unique_keys.size()); } } // 4. 
find values for [collect_key_first, collect_key_last) auto value_buffer = allocate_dataframe_buffer( thrust::distance(collect_key_first, collect_key_last), stream); - kv_map_ptr->find( - collect_key_first, collect_key_last, get_dataframe_buffer_begin(value_buffer)); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (thrust::distance(collect_key_first, collect_key_last) > 0) { + kv_map_ptr->find( + collect_key_first, collect_key_last, get_dataframe_buffer_begin(value_buffer)); + } + + return value_buffer; +} + +// for key = [map_key_first, map_key_last), key_to_gpu_id_op(key) should coincide with +// comm.get_rank() +template +decltype(allocate_dataframe_buffer::value_type>( + 0, cudaStream_t{nullptr})) +collect_values_for_unique_keys(raft::comms::comms_t const &comm, + VertexIterator0 map_key_first, + VertexIterator0 map_key_last, + ValueIterator map_value_first, + VertexIterator1 collect_unique_key_first, + VertexIterator1 collect_unique_key_last, + KeyToGPUIdOp key_to_gpu_id_op, + cudaStream_t stream) +{ + using vertex_t = typename std::iterator_traits::value_type; + static_assert( + std::is_same::value_type, vertex_t>::value); + using value_t = typename std::iterator_traits::value_type; + + double constexpr load_factor = 0.7; + + // FIXME: we may compare the performance & memory footprint of this hash based approach vs binary + // search based approach (especially when thrust::distance(collect_unique_key_first, + // collect_unique_key_last) << thrust::distance(map_key_first, map_key_last)) + + // 1. build a cuco::static_map object for the map k, v pairs. + + auto kv_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast( + static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), + static_cast(thrust::distance(map_key_first, map_key_last)) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + { + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (thrust::distance(map_key_first, map_key_last)) { + kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + } + } + + // 2. 
collect values for the unique keys in [collect_unique_key_first, collect_unique_key_last) + + rmm::device_uvector unique_keys( + thrust::distance(collect_unique_key_first, collect_unique_key_last), stream); + thrust::copy(rmm::exec_policy(stream)->on(stream), + collect_unique_key_first, + collect_unique_key_last, + unique_keys.begin()); + + rmm::device_uvector values_for_unique_keys(0, stream); + { + rmm::device_uvector rx_unique_keys(0, stream); + std::vector rx_value_counts{}; + std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values( + comm, + unique_keys.begin(), + unique_keys.end(), + [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, + stream); + + rmm::device_uvector values_for_rx_unique_keys(rx_unique_keys.size(), stream); + + CUDA_TRY(cudaStreamSynchronize(stream)); // cuco::static_map currently does not take stream + + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (rx_unique_keys.size() > 0) { + kv_map_ptr->find( + rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin()); + } + + rmm::device_uvector rx_values_for_unique_keys(0, stream); + std::tie(rx_values_for_unique_keys, std::ignore) = + shuffle_values(comm, values_for_rx_unique_keys.begin(), rx_value_counts, stream); + + values_for_unique_keys = std::move(rx_values_for_unique_keys); + } + + // 3. re-build a cuco::static_map object for the k, v pairs in unique_keys, + // values_for_unique_keys. + + CUDA_TRY(cudaStreamSynchronize(stream)); // cuco::static_map currently does not take stream + + kv_map_ptr.reset(); + + kv_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(unique_keys.size()) / load_factor), + unique_keys.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + { + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(unique_keys.begin(), values_for_unique_keys.begin())), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (unique_keys.size() > 0) { kv_map_ptr->insert(pair_first, pair_first + unique_keys.size()); } + } + + // 4. find values for [collect_unique_key_first, collect_unique_key_last) + + auto value_buffer = allocate_dataframe_buffer( + thrust::distance(collect_unique_key_first, collect_unique_key_last), stream); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. 
+ if (thrust::distance(collect_unique_key_first, collect_unique_key_last)) { + kv_map_ptr->find(collect_unique_key_first, + collect_unique_key_last, + get_dataframe_buffer_begin(value_buffer)); + } return value_buffer; } diff --git a/cpp/include/utilities/cython.hpp b/cpp/include/utilities/cython.hpp index a58331d465a..d8c476760f0 100644 --- a/cpp/include/utilities/cython.hpp +++ b/cpp/include/utilities/cython.hpp @@ -93,7 +93,7 @@ struct graph_container_t { void* weights; void* vertex_partition_offsets; - size_t num_partition_edges; + size_t num_local_edges; size_t num_global_vertices; size_t num_global_edges; numberTypeEnum vertexType; @@ -103,7 +103,6 @@ struct graph_container_t { bool is_multi_gpu; bool sorted_by_degree; bool do_expensive_check; - bool hypergraph_partitioned; int row_comm_size; int col_comm_size; int row_comm_rank; @@ -147,7 +146,7 @@ struct cy_multi_edgelists_t { // replacement for std::tuple<,,>, since std::tuple is not // supported in cython // -template +template struct major_minor_weights_t { explicit major_minor_weights_t(raft::handle_t const& handle) : shuffled_major_vertices_(0, handle.get_stream()), @@ -155,12 +154,15 @@ struct major_minor_weights_t { shuffled_weights_(0, handle.get_stream()) { } + rmm::device_uvector& get_major(void) { return shuffled_major_vertices_; } rmm::device_uvector& get_minor(void) { return shuffled_minor_vertices_; } rmm::device_uvector& get_weights(void) { return shuffled_weights_; } + std::vector& get_edge_counts(void) { return edge_counts_; } + std::pair, size_t> get_major_wrap( void) // const: triggers errors in Cython autogen-ed C++ { @@ -180,10 +182,16 @@ struct major_minor_weights_t { sizeof(weight_t)); } + std::unique_ptr> get_edge_counts_wrap(void) // const + { + return std::make_unique>(edge_counts_); + } + private: rmm::device_uvector shuffled_major_vertices_; rmm::device_uvector shuffled_minor_vertices_; rmm::device_uvector shuffled_weights_; + std::vector edge_counts_{}; }; // aggregate for random_walks() return type @@ -353,6 +361,9 @@ struct renum_quad_t { // The number of vertices and edges respectively in the graph represented by // the above arrays. 
// +// bool is_weighted +// true if the resulting graph object should store edge weights +// // bool transposed // true if the resulting graph object should store a transposed adjacency // matrix @@ -369,10 +380,11 @@ void populate_graph_container(graph_container_t& graph_container, numberTypeEnum vertexType, numberTypeEnum edgeType, numberTypeEnum weightType, - size_t num_partition_edges, + size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, bool sorted_by_degree, + bool is_weighted, bool transposed, bool multi_gpu); @@ -470,14 +482,13 @@ call_random_walks(raft::handle_t const& handle, // wrapper for shuffling: // template -std::unique_ptr> call_shuffle( +std::unique_ptr> call_shuffle( raft::handle_t const& handle, vertex_t* edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place vertex_t* edgelist_minor_vertices, // [IN / OUT] weight_t* edgelist_weights, // [IN / OUT] - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned); // = false + edge_t num_edgelist_edges); // Wrapper for calling renumber_edeglist() inplace: // @@ -486,8 +497,7 @@ std::unique_ptr> call_renumber( raft::handle_t const& handle, vertex_t* shuffled_edgelist_major_vertices /* [INOUT] */, vertex_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu); diff --git a/cpp/include/utilities/dataframe_buffer.cuh b/cpp/include/utilities/dataframe_buffer.cuh index 06352b8e217..e59b12f2a80 100644 --- a/cpp/include/utilities/dataframe_buffer.cuh +++ b/cpp/include/utilities/dataframe_buffer.cuh @@ -47,21 +47,19 @@ auto allocate_dataframe_buffer_tuple_impl(std::index_sequence, } template -void resize_dataframe_buffer_tuple_element_impl(BufferType& buffer, - size_t new_buffer_size, - cudaStream_t stream) -{ - std::get(buffer).resize(new_buffer_size, stream); - resize_dataframe_buffer_tuple_element_impl( - buffer, new_buffer_size, stream); -} +struct resize_dataframe_buffer_tuple_iterator_element_impl { + void run(BufferType& buffer, size_t new_buffer_size, cudaStream_t stream) + { + std::get(buffer).resize(new_buffer_size, stream); + resize_dataframe_buffer_tuple_iterator_element_impl().run( + buffer, new_buffer_size, stream); + } +}; template -void resize_dataframe_buffer_tuple_impl(BufferType& buffer, - size_t new_buffer_size, - cudaStream_t stream) -{ -} +struct resize_dataframe_buffer_tuple_iterator_element_impl { + void run(BufferType& buffer, size_t new_buffer_size, cudaStream_t stream) {} +}; template auto get_dataframe_buffer_begin_tuple_element_impl(BufferType& buffer) @@ -108,8 +106,9 @@ template ::value; - detail::resize_dataframe_buffer_tuple_impl( - buffer, new_buffer_size, stream); + detail:: + resize_dataframe_buffer_tuple_iterator_element_impl() + .run(buffer, new_buffer_size, stream); } template +std::enable_if_t::value, void> +device_allreduce_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + // no-op +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allreduce_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + static_assert(std::is_same::value_type, + typename std::iterator_traits::value_type>::value); + comm.allreduce(iter_to_raw_ptr(input_first), 
iter_to_raw_ptr(output_first), count, op, stream); +} + +template +struct device_allreduce_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) const + { + device_allreduce_impl(comm, + thrust::get(input_first.get_iterator_tuple()), + thrust::get(output_first.get_iterator_tuple()), + count, + op, + stream); + device_allreduce_tuple_iterator_element_impl( + comm, input_first, output_first, count, op, stream); + } +}; + +template +struct device_allreduce_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) const + { + } +}; + template std::enable_if_t::value, void> device_reduce_impl(raft::comms::comms_t const& comm, @@ -856,6 +916,46 @@ device_bcast(raft::comms::comms_t const& comm, comm, input_first, output_first, count, root, stream); } +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allreduce(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + detail::device_allreduce_impl(comm, input_first, output_first, count, op, stream); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_allreduce(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + detail::device_allreduce_tuple_iterator_element_impl( + comm, input_first, output_first, count, op, stream); +} + template std::enable_if_t< std::is_arithmetic::value_type>::value, diff --git a/cpp/include/utilities/shuffle_comm.cuh b/cpp/include/utilities/shuffle_comm.cuh index 8c363c9a346..b318009d9bf 100644 --- a/cpp/include/utilities/shuffle_comm.cuh +++ b/cpp/include/utilities/shuffle_comm.cuh @@ -22,6 +22,12 @@ #include #include +#include +#include +#include +#include +#include + #include #include #include @@ -31,89 +37,6 @@ namespace experimental { namespace detail { -template -rmm::device_uvector sort_and_count(raft::comms::comms_t const &comm, - ValueIterator tx_value_first /* [INOUT */, - ValueIterator tx_value_last /* [INOUT */, - ValueToGPUIdOp value_to_gpu_id_op, - cudaStream_t stream) -{ - auto const comm_size = comm.get_size(); - - thrust::sort(rmm::exec_policy(stream)->on(stream), - tx_value_first, - tx_value_last, - [value_to_gpu_id_op] __device__(auto lhs, auto rhs) { - return value_to_gpu_id_op(lhs) < value_to_gpu_id_op(rhs); - }); - - auto gpu_id_first = thrust::make_transform_iterator( - tx_value_first, - [value_to_gpu_id_op] __device__(auto value) { return value_to_gpu_id_op(value); }); - rmm::device_uvector d_tx_dst_ranks(comm_size, stream); - rmm::device_uvector d_tx_value_counts(comm_size, stream); - auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), - gpu_id_first, - gpu_id_first + thrust::distance(tx_value_first, tx_value_last), - thrust::make_constant_iterator(size_t{1}), - d_tx_dst_ranks.begin(), - d_tx_value_counts.begin()); - if (thrust::distance(d_tx_value_counts.begin(), thrust::get<1>(last)) < 
comm_size) { - rmm::device_uvector d_counts(comm_size, stream); - thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); - thrust::scatter(rmm::exec_policy(stream)->on(stream), - d_tx_value_counts.begin(), - thrust::get<1>(last), - d_tx_dst_ranks.begin(), - d_counts.begin()); - d_tx_value_counts = std::move(d_counts); - } - - return d_tx_value_counts; -} - -template -rmm::device_uvector sort_and_count(raft::comms::comms_t const &comm, - VertexIterator tx_key_first /* [INOUT */, - VertexIterator tx_key_last /* [INOUT */, - ValueIterator tx_value_first /* [INOUT */, - KeyToGPUIdOp key_to_gpu_id_op, - cudaStream_t stream) -{ - auto const comm_size = comm.get_size(); - - thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), - tx_key_first, - tx_key_last, - tx_value_first, - [key_to_gpu_id_op] __device__(auto lhs, auto rhs) { - return key_to_gpu_id_op(lhs) < key_to_gpu_id_op(rhs); - }); - - auto gpu_id_first = thrust::make_transform_iterator( - tx_key_first, [key_to_gpu_id_op] __device__(auto key) { return key_to_gpu_id_op(key); }); - rmm::device_uvector d_tx_dst_ranks(comm_size, stream); - rmm::device_uvector d_tx_value_counts(comm_size, stream); - auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), - gpu_id_first, - gpu_id_first + thrust::distance(tx_key_first, tx_key_last), - thrust::make_constant_iterator(size_t{1}), - d_tx_dst_ranks.begin(), - d_tx_value_counts.begin()); - if (thrust::distance(d_tx_value_counts.begin(), thrust::get<1>(last)) < comm_size) { - rmm::device_uvector d_counts(comm_size, stream); - thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); - thrust::scatter(rmm::exec_policy(stream)->on(stream), - d_tx_value_counts.begin(), - thrust::get<1>(last), - d_tx_dst_ranks.begin(), - d_counts.begin()); - d_tx_value_counts = std::move(d_counts); - } - - return d_tx_value_counts; -} - // inline to suppress a complaint about ODR violation inline std::tuple, std::vector, @@ -187,6 +110,86 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const &comm, } // namespace detail +template +rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [INOUT */, + ValueIterator tx_value_last /* [INOUT */, + ValueToGPUIdOp value_to_group_id_op, + int num_groups, + cudaStream_t stream) +{ + thrust::sort(rmm::exec_policy(stream)->on(stream), + tx_value_first, + tx_value_last, + [value_to_group_id_op] __device__(auto lhs, auto rhs) { + return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); + }); + + auto group_id_first = thrust::make_transform_iterator( + tx_value_first, + [value_to_group_id_op] __device__(auto value) { return value_to_group_id_op(value); }); + rmm::device_uvector d_tx_dst_ranks(num_groups, stream); + rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream); + auto last = + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + group_id_first, + group_id_first + thrust::distance(tx_value_first, tx_value_last), + thrust::make_constant_iterator(size_t{1}), + d_tx_dst_ranks.begin(), + d_tx_value_counts.begin()); + if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) { + rmm::device_uvector d_counts(num_groups, stream); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); + thrust::scatter(rmm::exec_policy(stream)->on(stream), + d_tx_value_counts.begin(), + thrust::get<1>(last), + d_tx_dst_ranks.begin(), + d_counts.begin()); + d_tx_value_counts = 
std::move(d_counts); + } + + return d_tx_value_counts; +} + +template +rmm::device_uvector groupby_and_count(VertexIterator tx_key_first /* [INOUT */, + VertexIterator tx_key_last /* [INOUT */, + ValueIterator tx_value_first /* [INOUT */, + KeyToGPUIdOp key_to_group_id_op, + int num_groups, + cudaStream_t stream) +{ + thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), + tx_key_first, + tx_key_last, + tx_value_first, + [key_to_group_id_op] __device__(auto lhs, auto rhs) { + return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); + }); + + auto group_id_first = thrust::make_transform_iterator( + tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); }); + rmm::device_uvector d_tx_dst_ranks(num_groups, stream); + rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream); + auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + group_id_first, + group_id_first + thrust::distance(tx_key_first, tx_key_last), + thrust::make_constant_iterator(size_t{1}), + d_tx_dst_ranks.begin(), + d_tx_value_counts.begin()); + if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) { + rmm::device_uvector d_counts(num_groups, stream); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); + thrust::scatter(rmm::exec_policy(stream)->on(stream), + d_tx_value_counts.begin(), + thrust::get<1>(last), + d_tx_dst_ranks.begin(), + d_counts.begin()); + d_tx_value_counts = std::move(d_counts); + } + + return d_tx_value_counts; +} + template auto shuffle_values(raft::comms::comms_t const &comm, TxValueIterator tx_value_first, @@ -250,7 +253,7 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const &comm, auto const comm_size = comm.get_size(); auto d_tx_value_counts = - detail::sort_and_count(comm, tx_value_first, tx_value_last, value_to_gpu_id_op, stream); + groupby_and_count(tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream); std::vector tx_counts{}; std::vector tx_offsets{}; @@ -301,8 +304,8 @@ auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const &comm, { auto const comm_size = comm.get_size(); - auto d_tx_value_counts = detail::sort_and_count( - comm, tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, stream); + auto d_tx_value_counts = groupby_and_count( + tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, comm.get_size(), stream); std::vector tx_counts{}; std::vector tx_offsets{}; diff --git a/cpp/include/utilities/thrust_tuple_utils.cuh b/cpp/include/utilities/thrust_tuple_utils.cuh index 01843a583eb..d5ce6ff1a29 100644 --- a/cpp/include/utilities/thrust_tuple_utils.cuh +++ b/cpp/include/utilities/thrust_tuple_utils.cuh @@ -61,13 +61,6 @@ struct compute_thrust_tuple_element_sizes_impl { void compute(std::array::value>& arr) const {} }; -template -__device__ constexpr auto remove_first_thrust_tuple_element_impl(TupleType const& tuple, - std::index_sequence) -{ - return thrust::make_tuple(thrust::get<1 + Is>(tuple)...); -} - template struct plus_thrust_tuple_impl { __host__ __device__ constexpr void compute(TupleType& lhs, TupleType const& rhs) const @@ -200,16 +193,6 @@ struct compute_thrust_tuple_element_sizes { } }; -template -struct remove_first_thrust_tuple_element { - __device__ constexpr auto operator()(TupleType const& tuple) const - { - size_t constexpr tuple_size = thrust::tuple_size::value; - return detail::remove_first_thrust_tuple_element_impl( - tuple, std::make_index_sequence()); - } -}; - 
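// NOTE: remove_first_thrust_tuple_element is dropped because the (key, payload) push buffer is no
// longer one zipped tuple buffer: keys now live in an rmm::device_uvector and payloads in a
// separate dataframe buffer, so the payload is read with thrust::get<1>(...) directly. A minimal
// sketch of the simplification (variable names are illustrative only):
//
//   auto e_op_result = thrust::make_tuple(bucket_idx, payload);  // (key part, payload part)
//   auto payload_out = thrust::get<1>(e_op_result);
//   // previously: remove_first_thrust_tuple_element<decltype(e_op_result)>{}(e_op_result)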
template struct plus_thrust_tuple { __host__ __device__ constexpr TupleType operator()(TupleType const& lhs, diff --git a/cpp/src/community/ecg.cu b/cpp/src/community/ecg.cu index 45f7d723191..a176dfbd1c8 100644 --- a/cpp/src/community/ecg.cu +++ b/cpp/src/community/ecg.cu @@ -117,7 +117,7 @@ class EcgLouvain : public cugraph::Louvain { void initialize_dendrogram_level(vertex_t num_vertices) override { - this->dendrogram_->add_level(0, num_vertices); + this->dendrogram_->add_level(0, num_vertices, this->stream_); get_permutation_vector( num_vertices, seed_, this->dendrogram_->current_level_begin(), this->stream_); diff --git a/cpp/src/community/leiden.cuh b/cpp/src/community/leiden.cuh index aae2d3712b5..4ffb7c20eb2 100644 --- a/cpp/src/community/leiden.cuh +++ b/cpp/src/community/leiden.cuh @@ -132,7 +132,7 @@ class Leiden : public Louvain { // // Initialize every cluster to reference each vertex to itself // - this->dendrogram_->add_level(0, current_graph.number_of_vertices); + this->dendrogram_->add_level(0, current_graph.number_of_vertices, this->stream_); thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_), this->dendrogram_->current_level_begin(), diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 0862bbc62a9..e3569d4c850 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -210,7 +210,7 @@ class Louvain { virtual void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(0, num_vertices); + dendrogram_->add_level(0, num_vertices, stream_); thrust::sequence(rmm::exec_policy(stream_)->on(stream_), dendrogram_->current_level_begin(), diff --git a/cpp/src/experimental/bfs.cu b/cpp/src/experimental/bfs.cu index 7adfbd7fbd7..9145e3737b6 100644 --- a/cpp/src/experimental/bfs.cu +++ b/cpp/src/experimental/bfs.cu @@ -93,10 +93,7 @@ void bfs(raft::handle_t const &handle, enum class Bucket { cur, num_buckets }; std::vector bucket_sizes(static_cast(Bucket::num_buckets), push_graph_view.get_number_of_local_vertices()); - VertexFrontier, - vertex_t, - GraphViewType::is_multi_gpu, - static_cast(Bucket::num_buckets)> + VertexFrontier(Bucket::num_buckets)> vertex_frontier(handle, bucket_sizes); if (push_graph_view.is_local_vertex_nocheck(source_vertex)) { @@ -133,19 +130,16 @@ void bfs(raft::handle_t const &handle, *(distances + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(dst)); if (distance != invalid_distance) { push = false; } } - // FIXME: need to test this works properly if payload size is 0 (returns a tuple of size - // 1) return thrust::make_tuple(push, src); }, - reduce_op::any>(), + reduce_op::any(), distances, thrust::make_zip_iterator(thrust::make_tuple(distances, predecessor_first)), vertex_frontier, [depth] __device__(auto v_val, auto pushed_val) { - auto idx = (v_val == invalid_distance) - ? static_cast(Bucket::cur) - : VertexFrontier, vertex_t>::kInvalidBucketIdx; - return thrust::make_tuple(idx, depth + 1, thrust::get<0>(pushed_val)); + auto idx = (v_val == invalid_distance) ? 
static_cast(Bucket::cur) + : VertexFrontier::kInvalidBucketIdx; + return thrust::make_tuple(idx, thrust::make_tuple(depth + 1, pushed_val)); }); auto new_vertex_frontier_aggregate_size = diff --git a/cpp/src/experimental/coarsen_graph.cu b/cpp/src/experimental/coarsen_graph.cu index 0cd551b0d73..1eccbd23584 100644 --- a/cpp/src/experimental/coarsen_graph.cu +++ b/cpp/src/experimental/coarsen_graph.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -49,6 +50,7 @@ std:: weight_t const *compressed_sparse_weights, vertex_t major_first, vertex_t major_last, + bool is_weighted, cudaStream_t stream) { edge_t number_of_edges{0}; @@ -57,8 +59,7 @@ std:: CUDA_TRY(cudaStreamSynchronize(stream)); rmm::device_uvector edgelist_major_vertices(number_of_edges, stream); rmm::device_uvector edgelist_minor_vertices(number_of_edges, stream); - rmm::device_uvector edgelist_weights( - compressed_sparse_weights != nullptr ? number_of_edges : 0, stream); + rmm::device_uvector edgelist_weights(is_weighted ? number_of_edges : 0, stream); // FIXME: this is highly inefficient for very high-degree vertices, for better performance, we can // fill high-degree vertices using one CUDA block per vertex, mid-degree vertices using one CUDA @@ -77,7 +78,7 @@ std:: compressed_sparse_indices, compressed_sparse_indices + number_of_edges, edgelist_minor_vertices.begin()); - if (compressed_sparse_weights != nullptr) { + if (is_weighted) { thrust::copy(rmm::exec_policy(stream)->on(stream), compressed_sparse_weights, compressed_sparse_weights + number_of_edges, @@ -89,62 +90,62 @@ std:: std::move(edgelist_weights)); } -template -void sort_and_coarsen_edgelist(rmm::device_uvector &edgelist_major_vertices /* [INOUT] */, - rmm::device_uvector &edgelist_minor_vertices /* [INOUT] */, - rmm::device_uvector &edgelist_weights /* [INOUT] */, - cudaStream_t stream) +template +edge_t groupby_e_and_coarsen_edgelist(vertex_t *edgelist_major_vertices /* [INOUT] */, + vertex_t *edgelist_minor_vertices /* [INOUT] */, + weight_t *edgelist_weights /* [INOUT] */, + edge_t number_of_edges, + bool is_weighted, + cudaStream_t stream) { - auto pair_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices)); - size_t number_of_edges{0}; - if (edgelist_weights.size() > 0) { + if (is_weighted) { thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), pair_first, - pair_first + edgelist_major_vertices.size(), - edgelist_weights.begin()); + pair_first + number_of_edges, + edgelist_weights); - rmm::device_uvector tmp_edgelist_major_vertices(edgelist_major_vertices.size(), - stream); + rmm::device_uvector tmp_edgelist_major_vertices(number_of_edges, stream); rmm::device_uvector tmp_edgelist_minor_vertices(tmp_edgelist_major_vertices.size(), stream); rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream); auto it = thrust::reduce_by_key( rmm::exec_policy(stream)->on(stream), pair_first, - pair_first + edgelist_major_vertices.size(), - edgelist_weights.begin(), + pair_first + number_of_edges, + edgelist_weights, thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), tmp_edgelist_minor_vertices.begin())), tmp_edgelist_weights.begin()); - number_of_edges = thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it)); + auto ret = + static_cast(thrust::distance(tmp_edgelist_weights.begin(), 
thrust::get<1>(it))); - edgelist_major_vertices = std::move(tmp_edgelist_major_vertices); - edgelist_minor_vertices = std::move(tmp_edgelist_minor_vertices); - edgelist_weights = std::move(tmp_edgelist_weights); + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), + tmp_edgelist_minor_vertices.begin(), + tmp_edgelist_weights.begin())); + thrust::copy(rmm::exec_policy(stream)->on(stream), + edge_first, + edge_first + ret, + thrust::make_zip_iterator(thrust::make_tuple( + edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights))); + + return ret; } else { - thrust::sort(rmm::exec_policy(stream)->on(stream), - pair_first, - pair_first + edgelist_major_vertices.size()); - auto it = thrust::unique(rmm::exec_policy(stream)->on(stream), - pair_first, - pair_first + edgelist_major_vertices.size()); - number_of_edges = thrust::distance(pair_first, it); + thrust::sort(rmm::exec_policy(stream)->on(stream), pair_first, pair_first + number_of_edges); + return static_cast(thrust::distance( + pair_first, + thrust::unique( + rmm::exec_policy(stream)->on(stream), pair_first, pair_first + number_of_edges))); } - - edgelist_major_vertices.resize(number_of_edges, stream); - edgelist_minor_vertices.resize(number_of_edges, stream); - edgelist_weights.resize(number_of_edges, stream); - edgelist_major_vertices.shrink_to_fit(stream); - edgelist_minor_vertices.shrink_to_fit(stream); - edgelist_weights.shrink_to_fit(stream); } template std:: tuple, rmm::device_uvector, rmm::device_uvector> - compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist( + compressed_sparse_to_relabeled_and_grouped_and_coarsened_edgelist( edge_t const *compressed_sparse_offsets, vertex_t const *compressed_sparse_indices, weight_t const *compressed_sparse_weights, @@ -154,6 +155,7 @@ std:: vertex_t major_last, vertex_t minor_first, vertex_t minor_last, + bool is_weighted, cudaStream_t stream) { // FIXME: it might be possible to directly create relabled & coarsened edgelist from the @@ -168,6 +170,7 @@ std:: compressed_sparse_weights, major_first, major_last, + is_weighted, stream); auto pair_first = thrust::make_zip_iterator( @@ -182,8 +185,21 @@ std:: p_minor_labels[thrust::get<1>(val) - minor_first]); }); - sort_and_coarsen_edgelist( - edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights, stream); + auto number_of_edges = + groupby_e_and_coarsen_edgelist(edgelist_major_vertices.data(), + edgelist_minor_vertices.data(), + edgelist_weights.data(), + static_cast(edgelist_major_vertices.size()), + is_weighted, + stream); + edgelist_major_vertices.resize(number_of_edges, stream); + edgelist_major_vertices.shrink_to_fit(stream); + edgelist_minor_vertices.resize(number_of_edges, stream); + edgelist_minor_vertices.shrink_to_fit(stream); + if (is_weighted) { + edgelist_weights.resize(number_of_edges, stream); + edgelist_weights.shrink_to_fit(stream); + } return std::make_tuple(std::move(edgelist_major_vertices), std::move(edgelist_minor_vertices), @@ -220,48 +236,66 @@ coarsen_graph( // currently, nothing to do } - // 1. locally construct coarsened edge list + // 1. construct coarsened edge list - // FIXME: we don't need adj_matrix_major_labels if we apply the same partitioning scheme - // regardless of hypergraph partitioning is applied or not - rmm::device_uvector adj_matrix_major_labels( - store_transposed ? 
graph_view.get_number_of_local_adj_matrix_partition_cols()
-                     : graph_view.get_number_of_local_adj_matrix_partition_rows(),
-    handle.get_stream());
   rmm::device_uvector adj_matrix_minor_labels(
     store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows()
                      : graph_view.get_number_of_local_adj_matrix_partition_cols(),
     handle.get_stream());
   if (store_transposed) {
-    copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_major_labels.data());
     copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_minor_labels.data());
   } else {
-    copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_major_labels.data());
     copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels.data());
   }
 
-  rmm::device_uvector coarsened_edgelist_major_vertices(0, handle.get_stream());
-  rmm::device_uvector coarsened_edgelist_minor_vertices(0, handle.get_stream());
-  rmm::device_uvector coarsened_edgelist_weights(0, handle.get_stream());
+  std::vector> coarsened_edgelist_major_vertices{};
+  std::vector> coarsened_edgelist_minor_vertices{};
+  std::vector> coarsened_edgelist_weights{};
+  coarsened_edgelist_major_vertices.reserve(graph_view.get_number_of_local_adj_matrix_partitions());
+  coarsened_edgelist_minor_vertices.reserve(coarsened_edgelist_major_vertices.size());
+  coarsened_edgelist_weights.reserve(
+    graph_view.is_weighted() ? coarsened_edgelist_major_vertices.size() : size_t{0});
+  for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) {
+    coarsened_edgelist_major_vertices.emplace_back(0, handle.get_stream());
+    coarsened_edgelist_minor_vertices.emplace_back(0, handle.get_stream());
+    if (graph_view.is_weighted()) {
+      coarsened_edgelist_weights.emplace_back(0, handle.get_stream());
+    }
+  }
 
   // FIXME: we may compare performance/memory footprint with the hash_based approach especially when
   // cuco::dynamic_map becomes available (so we don't need to preallocate memory assuming the worst
   // case). We may be able to limit the memory requirement close to the final coarsened edgelist
   // with the hash based approach.
   for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) {
-    // get edge list
+    // 1-1. locally construct coarsened edge list
+
+    rmm::device_uvector major_labels(
+      store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols(i)
+                       : graph_view.get_number_of_local_adj_matrix_partition_rows(i),
+      handle.get_stream());
+    // FIXME: this copy is unnecessary, better fix RAFT comm's bcast to take const iterators for
+    // input
+    thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                 labels,
+                 labels + major_labels.size(),
+                 major_labels.begin());
+    device_bcast(col_comm,
+                 major_labels.data(),
+                 major_labels.data(),
+                 major_labels.size(),
+                 static_cast(i),
+                 handle.get_stream());
 
     rmm::device_uvector edgelist_major_vertices(0, handle.get_stream());
     rmm::device_uvector edgelist_minor_vertices(0, handle.get_stream());
     rmm::device_uvector edgelist_weights(0, handle.get_stream());
     std::tie(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights) =
-      compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist(
+      compressed_sparse_to_relabeled_and_grouped_and_coarsened_edgelist(
         graph_view.offsets(i),
         graph_view.indices(i),
         graph_view.weights(i),
-        adj_matrix_major_labels.begin() +
-          (store_transposed ?
graph_view.get_local_adj_matrix_partition_col_value_start_offset(i) - : graph_view.get_local_adj_matrix_partition_row_value_start_offset(i)), - adj_matrix_minor_labels.begin(), + major_labels.data(), + adj_matrix_minor_labels.data(), store_transposed ? graph_view.get_local_adj_matrix_partition_col_first(i) : graph_view.get_local_adj_matrix_partition_row_first(i), store_transposed ? graph_view.get_local_adj_matrix_partition_col_last(i) @@ -270,86 +304,159 @@ coarsen_graph( : graph_view.get_local_adj_matrix_partition_col_first(i), store_transposed ? graph_view.get_local_adj_matrix_partition_row_last(i) : graph_view.get_local_adj_matrix_partition_col_last(i), + graph_view.is_weighted(), handle.get_stream()); - auto cur_size = coarsened_edgelist_major_vertices.size(); - // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we can - // reserve address space to avoid expensive reallocation. - // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management - coarsened_edgelist_major_vertices.resize(cur_size + edgelist_major_vertices.size(), - handle.get_stream()); - coarsened_edgelist_minor_vertices.resize(coarsened_edgelist_major_vertices.size(), - handle.get_stream()); - coarsened_edgelist_weights.resize( - graph_view.is_weighted() ? coarsened_edgelist_major_vertices.size() : 0, handle.get_stream()); - - if (graph_view.is_weighted()) { - auto src_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(), - edgelist_minor_vertices.begin(), - edgelist_weights.begin())); - auto dst_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(), - coarsened_edgelist_minor_vertices.begin(), - coarsened_edgelist_weights.begin())) + - cur_size; - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - src_edge_first, - src_edge_first + edgelist_major_vertices.size(), - dst_edge_first); - } else { - auto src_edge_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); - auto dst_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(), - coarsened_edgelist_minor_vertices.begin())) + - cur_size; - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - src_edge_first, - src_edge_first + edgelist_major_vertices.size(), - dst_edge_first); + // 1-2. 
globally shuffle
+
+    {
+      rmm::device_uvector rx_edgelist_major_vertices(0, handle.get_stream());
+      rmm::device_uvector rx_edgelist_minor_vertices(0, handle.get_stream());
+      rmm::device_uvector rx_edgelist_weights(0, handle.get_stream());
+      if (graph_view.is_weighted()) {
+        auto edge_first =
+          thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(),
+                                                       edgelist_minor_vertices.begin(),
+                                                       edgelist_weights.begin()));
+        std::forward_as_tuple(
+          std::tie(rx_edgelist_major_vertices, rx_edgelist_minor_vertices, rx_edgelist_weights),
+          std::ignore) =
+          groupby_gpuid_and_shuffle_values(
+            handle.get_comms(),
+            edge_first,
+            edge_first + edgelist_major_vertices.size(),
+            [key_func =
+               detail::compute_gpu_id_from_edge_t{
+                 comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
+              return key_func(thrust::get<0>(val), thrust::get<1>(val));
+            },
+            handle.get_stream());
+      } else {
+        auto edge_first = thrust::make_zip_iterator(
+          thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin()));
+        std::forward_as_tuple(std::tie(rx_edgelist_major_vertices, rx_edgelist_minor_vertices),
+                              std::ignore) =
+          groupby_gpuid_and_shuffle_values(
+            handle.get_comms(),
+            edge_first,
+            edge_first + edgelist_major_vertices.size(),
+            [key_func =
+               detail::compute_gpu_id_from_edge_t{
+                 comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
+              return key_func(thrust::get<0>(val), thrust::get<1>(val));
+            },
+            handle.get_stream());
+      }
+
+      edgelist_major_vertices = std::move(rx_edgelist_major_vertices);
+      edgelist_minor_vertices = std::move(rx_edgelist_minor_vertices);
+      edgelist_weights = std::move(rx_edgelist_weights);
+    }
-  }
 
-  sort_and_coarsen_edgelist(coarsened_edgelist_major_vertices,
-                            coarsened_edgelist_minor_vertices,
-                            coarsened_edgelist_weights,
-                            handle.get_stream());
-
-  // 2. globally shuffle edge list and re-coarsen
-
-  {
-    auto edge_first =
-      thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(),
-                                                   coarsened_edgelist_minor_vertices.begin(),
-                                                   coarsened_edgelist_weights.begin()));
-    rmm::device_uvector rx_edgelist_major_vertices(0, handle.get_stream());
-    rmm::device_uvector rx_edgelist_minor_vertices(0, handle.get_stream());
-    rmm::device_uvector rx_edgelist_weights(0, handle.get_stream());
-    std::forward_as_tuple(
-      std::tie(rx_edgelist_major_vertices, rx_edgelist_minor_vertices, rx_edgelist_weights),
-      std::ignore) =
-      groupby_gpuid_and_shuffle_values(
-        handle.get_comms(),
-        edge_first,
-        edge_first + coarsened_edgelist_major_vertices.size(),
-        [key_func =
-           detail::compute_gpu_id_from_edge_t{graph_view.is_hypergraph_partitioned(),
-                                              comm.get_size(),
-                                              row_comm.get_size(),
-                                              col_comm.get_size()}] __device__(auto val) {
-          return key_func(thrust::get<0>(val), thrust::get<1>(val));
-        },
+    // 1-3. append data to local adjacency matrix partitions
+
+    // FIXME: we can skip this if groupby_gpuid_and_shuffle_values is updated to return sorted edge
+    // list based on the final matrix partition (maybe add
+    // groupby_adj_matrix_partition_and_shuffle_values).
+
+    auto local_partition_id_op =
+      [comm_size,
+       key_func = detail::compute_partition_id_from_edge_t{
+         comm_size, row_comm_size, col_comm_size}] __device__(auto pair) {
+        return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) /
+               comm_size;  // global partition id to local partition id
+      };
+    auto pair_first = thrust::make_zip_iterator(
+      thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin()));
+    auto counts = graph_view.is_weighted()
+                    ?
groupby_and_count(pair_first,
+                                     pair_first + edgelist_major_vertices.size(),
+                                     edgelist_weights.begin(),
+                                     local_partition_id_op,
+                                     graph_view.get_number_of_local_adj_matrix_partitions(),
+                                     handle.get_stream())
+                  : groupby_and_count(pair_first,
+                                      pair_first + edgelist_major_vertices.size(),
+                                      local_partition_id_op,
+                                      graph_view.get_number_of_local_adj_matrix_partitions(),
+                                      handle.get_stream());
+
+    std::vector h_counts(counts.size());
+    raft::update_host(h_counts.data(), counts.data(), counts.size(), handle.get_stream());
+    handle.get_stream_view().synchronize();
+
+    std::vector h_displacements(h_counts.size(), size_t{0});
+    std::partial_sum(h_counts.begin(), h_counts.end() - 1, h_displacements.begin() + 1);
+
+    for (int j = 0; j < col_comm_size; ++j) {
+      auto number_of_partition_edges = groupby_e_and_coarsen_edgelist(
+        edgelist_major_vertices.begin() + h_displacements[j],
+        edgelist_minor_vertices.begin() + h_displacements[j],
+        graph_view.is_weighted() ? edgelist_weights.begin() + h_displacements[j]
+                                 : static_cast(nullptr),
+        h_counts[j],
+        graph_view.is_weighted(),
+        handle.get_stream());
-    sort_and_coarsen_edgelist(rx_edgelist_major_vertices,
-                              rx_edgelist_minor_vertices,
-                              rx_edgelist_weights,
-                              handle.get_stream());
+      auto cur_size = coarsened_edgelist_major_vertices[j].size();
+      // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we
+      // can reserve address space to avoid expensive reallocation.
+      // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management
+      coarsened_edgelist_major_vertices[j].resize(cur_size + number_of_partition_edges,
+                                                  handle.get_stream());
+      coarsened_edgelist_minor_vertices[j].resize(coarsened_edgelist_major_vertices[j].size(),
+                                                  handle.get_stream());
+      if (graph_view.is_weighted()) {
+        coarsened_edgelist_weights[j].resize(coarsened_edgelist_major_vertices[j].size(),
+                                             handle.get_stream());
+
+        auto src_edge_first =
+          thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(),
+                                                       edgelist_minor_vertices.begin(),
+                                                       edgelist_weights.begin())) +
+          h_displacements[j];
+        auto dst_edge_first =
+          thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(),
+                                                       coarsened_edgelist_minor_vertices[j].begin(),
+                                                       coarsened_edgelist_weights[j].begin())) +
+          cur_size;
+        thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                     src_edge_first,
+                     src_edge_first + number_of_partition_edges,
+                     dst_edge_first);
+      } else {
+        auto src_edge_first = thrust::make_zip_iterator(thrust::make_tuple(
+                                edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())) +
+                              h_displacements[j];
+        auto dst_edge_first = thrust::make_zip_iterator(
+                                thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(),
+                                                   coarsened_edgelist_minor_vertices[j].begin())) +
+                              cur_size;
+        thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                     src_edge_first,
+                     src_edge_first + number_of_partition_edges,
+                     dst_edge_first);
+      }
+    }
+  }
 
-    coarsened_edgelist_major_vertices = std::move(rx_edgelist_major_vertices);
-    coarsened_edgelist_minor_vertices = std::move(rx_edgelist_minor_vertices);
-    coarsened_edgelist_weights = std::move(rx_edgelist_weights);
+  for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) {
+    auto number_of_partition_edges = groupby_e_and_coarsen_edgelist(
+      coarsened_edgelist_major_vertices[i].data(),
+      coarsened_edgelist_minor_vertices[i].data(),
+      graph_view.is_weighted() ?
coarsened_edgelist_weights[i].data() + : static_cast(nullptr), + static_cast(coarsened_edgelist_major_vertices[i].size()), + graph_view.is_weighted(), + handle.get_stream()); + coarsened_edgelist_major_vertices[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_major_vertices[i].shrink_to_fit(handle.get_stream()); + coarsened_edgelist_minor_vertices[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_minor_vertices[i].shrink_to_fit(handle.get_stream()); + if (coarsened_edgelist_weights.size() > 0) { + coarsened_edgelist_weights[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_weights[i].shrink_to_fit(handle.get_stream()); + } } // 3. find unique labels for this GPU @@ -395,37 +502,43 @@ coarsen_graph( rmm::device_uvector renumber_map_labels(0, handle.get_stream()); partition_t partition(std::vector(comm_size + 1, 0), - graph_view.is_hypergraph_partitioned(), row_comm_size, col_comm_size, row_comm_rank, col_comm_rank); vertex_t number_of_vertices{}; edge_t number_of_edges{}; - std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) = - renumber_edgelist( - handle, - unique_labels.data(), - static_cast(unique_labels.size()), - coarsened_edgelist_major_vertices.data(), - coarsened_edgelist_minor_vertices.data(), - static_cast(coarsened_edgelist_major_vertices.size()), - graph_view.is_hypergraph_partitioned(), - do_expensive_check); + { + std::vector major_ptrs(coarsened_edgelist_major_vertices.size()); + std::vector minor_ptrs(major_ptrs.size()); + std::vector counts(major_ptrs.size()); + for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { + major_ptrs[i] = coarsened_edgelist_major_vertices[i].data(); + minor_ptrs[i] = coarsened_edgelist_minor_vertices[i].data(); + counts[i] = static_cast(coarsened_edgelist_major_vertices[i].size()); + } + std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) = + renumber_edgelist(handle, + unique_labels.data(), + static_cast(unique_labels.size()), + major_ptrs, + minor_ptrs, + counts, + do_expensive_check); + } // 5. build a graph std::vector> edgelists{}; - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - edgelists.resize(1); - edgelists[0].p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices.data() - : coarsened_edgelist_major_vertices.data(); - edgelists[0].p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices.data() - : coarsened_edgelist_minor_vertices.data(); - edgelists[0].p_edge_weights = coarsened_edgelist_weights.data(); - edgelists[0].number_of_edges = static_cast(coarsened_edgelist_major_vertices.size()); + edgelists.resize(graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < edgelists.size(); ++i) { + edgelists[i].p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices[i].data() + : coarsened_edgelist_major_vertices[i].data(); + edgelists[i].p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices[i].data() + : coarsened_edgelist_minor_vertices[i].data(); + edgelists[i].p_edge_weights = graph_view.is_weighted() ? 
coarsened_edgelist_weights[i].data() + : static_cast(nullptr); + edgelists[i].number_of_edges = static_cast(coarsened_edgelist_major_vertices[i].size()); } return std::make_tuple( @@ -435,7 +548,7 @@ coarsen_graph( partition, number_of_vertices, number_of_edges, - graph_properties_t{graph_view.is_symmetric(), false}, + graph_properties_t{graph_view.is_symmetric(), false, graph_view.is_weighted()}, true), std::move(renumber_map_labels)); } @@ -466,7 +579,7 @@ coarsen_graph( std::tie(coarsened_edgelist_major_vertices, coarsened_edgelist_minor_vertices, coarsened_edgelist_weights) = - compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist( + compressed_sparse_to_relabeled_and_grouped_and_coarsened_edgelist( graph_view.offsets(), graph_view.indices(), graph_view.weights(), @@ -476,6 +589,7 @@ coarsen_graph( graph_view.get_number_of_vertices(), vertex_t{0}, graph_view.get_number_of_vertices(), + graph_view.is_weighted(), handle.get_stream()); rmm::device_uvector unique_labels(graph_view.get_number_of_vertices(), @@ -516,7 +630,7 @@ coarsen_graph( handle, edgelist, static_cast(renumber_map_labels.size()), - graph_properties_t{graph_view.is_symmetric(), false}, + graph_properties_t{graph_view.is_symmetric(), false, graph_view.is_weighted()}, true), std::move(renumber_map_labels)); } diff --git a/cpp/src/experimental/generate_rmat_edgelist.cu b/cpp/src/experimental/generate_rmat_edgelist.cu index 185fa837a70..d75a4654a15 100644 --- a/cpp/src/experimental/generate_rmat_edgelist.cu +++ b/cpp/src/experimental/generate_rmat_edgelist.cu @@ -46,13 +46,13 @@ std::tuple, rmm::device_uvector> generat bool clip_and_flip, bool scramble_vertex_ids) { - CUGRAPH_EXPECTS(size_t{1} << scale <= std::numeric_limits::max(), + CUGRAPH_EXPECTS((size_t{1} << scale) <= static_cast(std::numeric_limits::max()), "Invalid input argument: scale too large for vertex_t."); CUGRAPH_EXPECTS((a >= 0.0) && (b >= 0.0) && (c >= 0.0) && (a + b + c <= 1.0), "Invalid input argument: a, b, c should be non-negative and a + b + c should not " "be larger than 1.0."); - raft::random::Rng rng(seed + 10); + raft::random::Rng rng(seed); // to limit memory footprint (1024 is a tuning parameter) auto max_edges_to_generate_per_iteration = static_cast(handle.get_device_properties().multiProcessorCount) * 1024; diff --git a/cpp/src/experimental/graph.cu b/cpp/src/experimental/graph.cu index 5abe141dafd..47c41cb3426 100644 --- a/cpp/src/experimental/graph.cu +++ b/cpp/src/experimental/graph.cu @@ -67,12 +67,12 @@ std:: vertex_t major_last, vertex_t minor_first, vertex_t minor_last, + bool is_weighted, cudaStream_t stream) { rmm::device_uvector offsets((major_last - major_first) + 1, stream); rmm::device_uvector indices(edgelist.number_of_edges, stream); - rmm::device_uvector weights( - edgelist.p_edge_weights != nullptr ? edgelist.number_of_edges : 0, stream); + rmm::device_uvector weights(is_weighted ? edgelist.number_of_edges : 0, stream); thrust::fill(rmm::exec_policy(stream)->on(stream), offsets.begin(), offsets.end(), edge_t{0}); thrust::fill(rmm::exec_policy(stream)->on(stream), indices.begin(), indices.end(), vertex_t{0}); @@ -89,8 +89,7 @@ std:: auto p_offsets = offsets.data(); auto p_indices = indices.data(); - auto p_weights = - edgelist.p_edge_weights != nullptr ? weights.data() : static_cast(nullptr); + auto p_weights = is_weighted ? weights.data() : static_cast(nullptr); thrust::for_each(rmm::exec_policy(stream)->on(stream), store_transposed ? 
edgelist.p_dst_vertices : edgelist.p_src_vertices,
@@ -103,7 +102,7 @@ std::
   thrust::exclusive_scan(
     rmm::exec_policy(stream)->on(stream), offsets.begin(), offsets.end(), offsets.begin());
 
-  if (edgelist.p_edge_weights != nullptr) {
+  if (is_weighted) {
     auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(
       edgelist.p_src_vertices, edgelist.p_dst_vertices, edgelist.p_edge_weights));
     thrust::for_each(rmm::exec_policy(stream)->on(stream),
@@ -191,24 +190,22 @@ graph_t 0, "Invalid input argument: edgelists.size() should be non-zero.");
 
-  bool is_weighted = edgelists[0].p_edge_weights != nullptr;
-
   CUGRAPH_EXPECTS(
     std::any_of(edgelists.begin() + 1,
                 edgelists.end(),
-                [is_weighted](auto edgelist) {
-                  return (edgelist.p_src_vertices == nullptr) ||
-                         (edgelist.p_dst_vertices == nullptr) ||
-                         (is_weighted && (edgelist.p_edge_weights == nullptr)) ||
+                [is_weighted = properties.is_weighted](auto edgelist) {
+                  return ((edgelist.number_of_edges > 0) && (edgelist.p_src_vertices == nullptr)) ||
+                         ((edgelist.number_of_edges > 0) && (edgelist.p_dst_vertices == nullptr)) ||
+                         (is_weighted && (edgelist.number_of_edges > 0) &&
+                          (edgelist.p_edge_weights == nullptr)) ||
                          (!is_weighted && (edgelist.p_edge_weights != nullptr));
                 }) == false,
     "Invalid input argument: edgelists[].p_src_vertices and edgelists[].p_dst_vertices should not "
-    "be nullptr and edgelists[].p_edge_weights should be nullptr (if edgelists[0].p_edge_weights "
-    "is nullptr) or should not be nullptr (otherwise).");
+    "be nullptr if edgelists[].number_of_edges > 0 and edgelists[].p_edge_weights should be "
+    "nullptr if unweighted or should not be nullptr if weighted and edgelists[].number_of_edges > "
+    "0.");
 
-  CUGRAPH_EXPECTS((partition.is_hypergraph_partitioned() &&
-                   (edgelists.size() == static_cast(col_comm_size))) ||
-                  (!(partition.is_hypergraph_partitioned()) && (edgelists.size() == 1)),
+  CUGRAPH_EXPECTS(edgelists.size() == static_cast(col_comm_size),
                   "Invalid input argument: erroneous edgelists.size().");
 
   // optional expensive checks (part 1/3)
@@ -251,7 +248,7 @@ graph_tget_handle_ptr()->get_stream());
     adj_matrix_partition_offsets_.push_back(std::move(offsets));
     adj_matrix_partition_indices_.push_back(std::move(indices));
-    if (is_weighted) { adj_matrix_partition_weights_.push_back(std::move(weights)); }
+    if (properties.is_weighted) { adj_matrix_partition_weights_.push_back(std::move(weights)); }
   }
 
   // update degree-based segment offsets (to be used for graph analytics kernel optimization)
@@ -321,22 +319,12 @@ graph_t aggregate_segment_offsets(0, default_stream);
-    if (partition.is_hypergraph_partitioned()) {
-      rmm::device_uvector aggregate_segment_offsets(
-        col_comm_size * segment_offsets.size(), default_stream);
-      col_comm.allgather(segment_offsets.data(),
-                         aggregate_segment_offsets.data(),
-                         segment_offsets.size(),
-                         default_stream);
-    } else {
-      rmm::device_uvector aggregate_segment_offsets(
-        row_comm_size * segment_offsets.size(), default_stream);
-      row_comm.allgather(segment_offsets.data(),
-                         aggregate_segment_offsets.data(),
-                         segment_offsets.size(),
-                         default_stream);
-    }
+    rmm::device_uvector aggregate_segment_offsets(col_comm_size * segment_offsets.size(),
+                                                  default_stream);
+    col_comm.allgather(segment_offsets.data(),
+                       aggregate_segment_offsets.data(),
+                       segment_offsets.size(),
+                       default_stream);
 
     vertex_partition_segment_offsets_.resize(aggregate_segment_offsets.size());
     raft::update_host(vertex_partition_segment_offsets_.data(),
@@ -344,18 +332,10 @@ graph_tget_handle_ptr()->get_stream();
 
   CUGRAPH_EXPECTS(
-
(edgelist.p_src_vertices != nullptr) && (edgelist.p_dst_vertices != nullptr), + ((edgelist.number_of_edges == 0) || (edgelist.p_src_vertices != nullptr)) && + ((edgelist.number_of_edges == 0) || (edgelist.p_dst_vertices != nullptr)) && + ((properties.is_weighted && + ((edgelist.number_of_edges == 0) || (edgelist.p_edge_weights != nullptr))) || + (!properties.is_weighted && (edgelist.p_edge_weights == nullptr))), "Invalid input argument: edgelist.p_src_vertices and edgelist.p_dst_vertices should " - "not be nullptr."); + "not be nullptr if edgelist.number_of_edges > 0 and edgelist.p_edge_weights should be nullptr " + "if unweighted or should not be nullptr if weighted and edgelist.number_of_edges > 0."); // optional expensive checks (part 1/2) @@ -427,6 +412,7 @@ graph_tget_number_of_vertices(), vertex_t{0}, this->get_number_of_vertices(), + properties.is_weighted, this->get_handle_ptr()->get_stream()); // update degree-based segment offsets (to be used for graph analytics kernel optimization) diff --git a/cpp/src/experimental/graph_view.cu b/cpp/src/experimental/graph_view.cu index f443608e424..c6f39a44333 100644 --- a/cpp/src/experimental/graph_view.cu +++ b/cpp/src/experimental/graph_view.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -195,16 +196,12 @@ graph_view_t(row_comm_size))) || - (!(partition.is_hypergraph_partitioned()) && (adj_matrix_partition_offsets.size() == 1)), - "Internal Error: erroneous adj_matrix_partition_offsets.size()."); + CUGRAPH_EXPECTS(adj_matrix_partition_offsets.size() == static_cast(col_comm_size), + "Internal Error: erroneous adj_matrix_partition_offsets.size()."); CUGRAPH_EXPECTS((sorted_by_global_degree_within_vertex_partition && (vertex_partition_segment_offsets.size() == - (partition.is_hypergraph_partitioned() ? 
col_comm_size : row_comm_size) * - (detail::num_segments_per_vertex_partition + 1))) || + col_comm_size * (detail::num_segments_per_vertex_partition + 1))) || (!sorted_by_global_degree_within_vertex_partition && (vertex_partition_segment_offsets.size() == 0)), "Internal Error: vertex_partition_segment_offsets.size() does not match " @@ -267,8 +264,7 @@ graph_view_t graph_view_t< } } +template +edge_t +graph_view_t>:: + compute_max_in_degree(raft::handle_t const& handle) const +{ + auto in_degrees = compute_in_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_degrees.begin(), + in_degrees.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +edge_t graph_view_t>::compute_max_in_degree(raft::handle_t const& + handle) const +{ + auto in_degrees = compute_in_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_degrees.begin(), + in_degrees.end()); + edge_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + +template +edge_t +graph_view_t>:: + compute_max_out_degree(raft::handle_t const& handle) const +{ + auto out_degrees = compute_out_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_degrees.begin(), + out_degrees.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +edge_t graph_view_t>::compute_max_out_degree(raft::handle_t const& + handle) const +{ + auto out_degrees = compute_out_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_degrees.begin(), + out_degrees.end()); + edge_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + +template +weight_t +graph_view_t>:: + compute_max_in_weight_sum(raft::handle_t const& handle) const +{ + auto in_weight_sums = compute_in_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_weight_sums.begin(), + in_weight_sums.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +weight_t graph_view_t>::compute_max_in_weight_sum(raft::handle_t const& + handle) const +{ + auto in_weight_sums = compute_in_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_weight_sums.begin(), + in_weight_sums.end()); + weight_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + +template +weight_t +graph_view_t>:: + compute_max_out_weight_sum(raft::handle_t const& handle) const +{ + auto out_weight_sums = compute_out_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_weight_sums.begin(), + out_weight_sums.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, 
ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +weight_t graph_view_t< + vertex_t, + edge_t, + weight_t, + store_transposed, + multi_gpu, + std::enable_if_t>::compute_max_out_weight_sum(raft::handle_t const& handle) const +{ + auto out_weight_sums = compute_out_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_weight_sums.begin(), + out_weight_sums.end()); + weight_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + // explicit instantiation template class graph_view_t; diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 3136515faa6..24914fb028b 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -151,7 +151,8 @@ class Louvain { protected: void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(current_graph_view_.get_local_vertex_first(), num_vertices); + dendrogram_->add_level( + current_graph_view_.get_local_vertex_first(), num_vertices, handle_.get_stream()); thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), dendrogram_->current_level_begin(), @@ -369,8 +370,6 @@ class Louvain { current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); rmm::device_uvector src_cluster_weights_v(next_cluster_v.size(), handle_.get_stream()); - rmm::device_uvector dst_cluster_weights_v(next_cluster_v.size(), - handle_.get_stream()); compute_cluster_sum_and_subtract(old_cluster_sum_v, cluster_subtract_v); @@ -396,19 +395,9 @@ class Louvain { vertex_to_gpu_id_op, handle_.get_stream()); - dst_cluster_weights_v = cugraph::experimental::collect_values_for_keys( - handle_.get_comms(), - cluster_keys_v_.begin(), - cluster_keys_v_.end(), - cluster_weights_v_.data(), - d_dst_cluster_cache_, - d_dst_cluster_cache_ + dst_cluster_cache_v_.size(), - vertex_to_gpu_id_op, - handle_.get_stream()); - - map_key_first = d_dst_cluster_cache_; - map_key_last = d_dst_cluster_cache_ + dst_cluster_cache_v_.size(); - map_value_first = dst_cluster_weights_v.begin(); + map_key_first = cluster_keys_v_.begin(); + map_key_last = cluster_keys_v_.end(); + map_value_first = cluster_weights_v_.begin(); } else { thrust::sort_by_key(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), cluster_keys_v_.begin(), @@ -432,12 +421,21 @@ class Louvain { map_value_first = src_cluster_weights_v.begin(); } + rmm::device_uvector src_old_cluster_sum_v( + current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), handle_.get_stream()); + rmm::device_uvector src_cluster_subtract_v( + current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), handle_.get_stream()); + copy_to_adj_matrix_row( + handle_, current_graph_view_, old_cluster_sum_v.begin(), src_old_cluster_sum_v.begin()); + copy_to_adj_matrix_row( + handle_, current_graph_view_, cluster_subtract_v.begin(), src_cluster_subtract_v.begin()); + copy_v_transform_reduce_key_aggregated_out_nbr( handle_, current_graph_view_, - thrust::make_zip_iterator(thrust::make_tuple(old_cluster_sum_v.begin(), + thrust::make_zip_iterator(thrust::make_tuple(src_old_cluster_sum_v.begin(), d_src_vertex_weights_cache_, - cluster_subtract_v.begin(), + src_cluster_subtract_v.begin(), d_src_cluster_cache_, src_cluster_weights_v.begin())), diff --git a/cpp/src/experimental/relabel.cu b/cpp/src/experimental/relabel.cu index 
62bd6951f71..8d8fb0322a8 100644
--- a/cpp/src/experimental/relabel.cu
+++ b/cpp/src/experimental/relabel.cu
@@ -42,6 +42,7 @@
 namespace cugraph {
 namespace experimental {
 
+// FIXME: think about requiring old_new_label_pairs to be pre-shuffled
 template
 void relabel(raft::handle_t const& handle,
              std::tuple old_new_label_pairs,
@@ -120,7 +121,12 @@ void relabel(raft::handle_t const& handle,
         handle.get_stream()));  // cuco::static_map currently does not take stream
 
       cuco::static_map relabel_map{
-        static_cast(static_cast(rx_label_pair_old_labels.size()) / load_factor),
+        // FIXME: std::max(..., ...) as a temporary workaround for
+        // https://github.com/NVIDIA/cuCollections/issues/72 and
+        // https://github.com/NVIDIA/cuCollections/issues/73
+        std::max(
+          static_cast(static_cast(rx_label_pair_old_labels.size()) / load_factor),
+          rx_label_pair_old_labels.size() + 1),
         invalid_vertex_id::value,
         invalid_vertex_id::value};
 
@@ -130,7 +136,11 @@ void relabel(raft::handle_t const& handle,
         [] __device__(auto val) {
           return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
         });
-      relabel_map.insert(pair_first, pair_first + rx_label_pair_old_labels.size());
+      // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the
+      // grid size is 0; this leads to cudaErrorInvalidConfiguration.
+      if (rx_label_pair_old_labels.size() > 0) {
+        relabel_map.insert(pair_first, pair_first + rx_label_pair_old_labels.size());
+      }
 
       rx_label_pair_old_labels.resize(0, handle.get_stream());
       rx_label_pair_new_labels.resize(0, handle.get_stream());
@@ -152,19 +162,29 @@ void relabel(raft::handle_t const& handle,
       CUDA_TRY(cudaStreamSynchronize(
         handle.get_stream()));  // cuco::static_map currently does not take stream
 
-      relabel_map.find(
-        rx_unique_old_labels.begin(),
-        rx_unique_old_labels.end(),
-        rx_unique_old_labels
-          .begin());  // now rx_unique_old_labels hold new labels for the corresponding old labels
+      // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the
+      // grid size is 0; this leads to cudaErrorInvalidConfiguration.
+      if (rx_unique_old_labels.size() > 0) {
+        relabel_map.find(
+          rx_unique_old_labels.begin(),
+          rx_unique_old_labels.end(),
+          rx_unique_old_labels.begin());  // now rx_unique_old_labels hold new labels for the
+                                          // corresponding old labels
+      }
 
       std::tie(new_labels_for_unique_old_labels, std::ignore) = shuffle_values(
         handle.get_comms(), rx_unique_old_labels.begin(), rx_value_counts, handle.get_stream());
     }
   }
 
+  handle.get_stream_view().synchronize();  // cuco::static_map currently does not take stream
+
   cuco::static_map relabel_map(
-    static_cast(static_cast(unique_old_labels.size()) / load_factor),
+    // FIXME: std::max(..., ...) as a temporary workaround for
+    // https://github.com/NVIDIA/cuCollections/issues/72 and
+    // https://github.com/NVIDIA/cuCollections/issues/73
+    std::max(static_cast(static_cast(unique_old_labels.size()) / load_factor),
+             unique_old_labels.size() + 1),
     invalid_vertex_id::value,
     invalid_vertex_id::value);
 
@@ -175,11 +195,21 @@ void relabel(raft::handle_t const& handle,
       return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
     });
 
-    relabel_map.insert(pair_first, pair_first + unique_old_labels.size());
-    relabel_map.find(labels, labels + num_labels, labels);
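The capacity bump and the zero-size guards recur at every cuco::static_map call site in this file, so until the upstream cuCollections issues are fixed they could be factored into small helpers. A hedged sketch under that assumption follows: checked_capacity, checked_insert, and checked_find are hypothetical names (not existing cugraph or cuco API), and the only map members called are the insert/find overloads already used in this file.

#include <thrust/distance.h>
#include <algorithm>
#include <cstddef>

// Workaround for https://github.com/NVIDIA/cuCollections/issues/72 and
// https://github.com/NVIDIA/cuCollections/issues/73: keep the capacity strictly
// greater than the number of keys even when the load-factor division truncates
// to a too-small (or zero) value.
inline std::size_t checked_capacity(std::size_t num_keys, double load_factor)
{
  return std::max(static_cast<std::size_t>(static_cast<double>(num_keys) / load_factor),
                  num_keys + 1);
}

// cuco::static_map currently launches a kernel even for an empty input range,
// which fails with cudaErrorInvalidConfiguration; skip the call instead.
template <typename map_t, typename PairIterator>
void checked_insert(map_t& map, PairIterator first, PairIterator last)
{
  if (thrust::distance(first, last) > 0) { map.insert(first, last); }
}

template <typename map_t, typename KeyIterator, typename ResultIterator>
void checked_find(map_t& map, KeyIterator first, KeyIterator last, ResultIterator result)
{
  if (thrust::distance(first, last) > 0) { map.find(first, last, result); }
}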
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (unique_old_labels.size() > 0) {
+      relabel_map.insert(pair_first, pair_first + unique_old_labels.size());
+    }
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (num_labels > 0) { relabel_map.find(labels, labels + num_labels, labels); }
   } else {
     cuco::static_map relabel_map(
-      static_cast(static_cast(num_label_pairs) / load_factor),
+      // FIXME: std::max(..., ...) as a temporary workaround for
+      // https://github.com/NVIDIA/cuCollections/issues/72 and
+      // https://github.com/NVIDIA/cuCollections/issues/73
+      std::max(static_cast(static_cast(num_label_pairs) / load_factor),
+               static_cast(num_label_pairs) + 1),
       invalid_vertex_id::value,
       invalid_vertex_id::value);
 
@@ -190,8 +220,12 @@ void relabel(raft::handle_t const& handle,
       return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
     });
 
-    relabel_map.insert(pair_first, pair_first + num_label_pairs);
-    relabel_map.find(labels, labels + num_labels, labels);
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (num_label_pairs > 0) { relabel_map.insert(pair_first, pair_first + num_label_pairs); }
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (num_labels > 0) { relabel_map.find(labels, labels + num_labels, labels); }
   }
 
   if (do_expensive_check) {
diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu
index a8847167b87..127bd507271 100644
--- a/cpp/src/experimental/renumber_edgelist.cu
+++ b/cpp/src/experimental/renumber_edgelist.cu
@@ -50,62 +50,153 @@ rmm::device_uvector compute_renumber_map(
   raft::handle_t const& handle,
   vertex_t const* vertices,
   vertex_t num_local_vertices /* relevant only if vertices != nullptr */,
-  vertex_t const* edgelist_major_vertices,
-  vertex_t const* edgelist_minor_vertices,
-  edge_t num_edgelist_edges)
+  std::vector const& edgelist_major_vertices,
+  std::vector const& edgelist_minor_vertices,
+  std::vector const& edgelist_edge_counts)
 {
   // FIXME: compare this sort based approach with hash based approach in both speed and memory
   // footprint
 
   // 1.
acquire (unique major label, count) pairs - rmm::device_uvector tmp_labels(num_edgelist_edges, handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - tmp_labels.begin()); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - tmp_labels.begin(), - tmp_labels.end()); - rmm::device_uvector major_labels(tmp_labels.size(), handle.get_stream()); - rmm::device_uvector major_counts(major_labels.size(), handle.get_stream()); - auto major_pair_it = - thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - tmp_labels.begin(), - tmp_labels.end(), - thrust::make_constant_iterator(edge_t{1}), - major_labels.begin(), - major_counts.begin()); - tmp_labels.resize(0, handle.get_stream()); - tmp_labels.shrink_to_fit(handle.get_stream()); - major_labels.resize(thrust::distance(major_labels.begin(), thrust::get<0>(major_pair_it)), - handle.get_stream()); - major_counts.resize(major_labels.size(), handle.get_stream()); - major_labels.shrink_to_fit(handle.get_stream()); - major_counts.shrink_to_fit(handle.get_stream()); + rmm::device_uvector major_labels(0, handle.get_stream()); + rmm::device_uvector major_counts(0, handle.get_stream()); + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + rmm::device_uvector tmp_major_labels(0, handle.get_stream()); + rmm::device_uvector tmp_major_counts(0, handle.get_stream()); + { + rmm::device_uvector sorted_major_labels(edgelist_edge_counts[i], + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_major_vertices[i], + edgelist_major_vertices[i] + edgelist_edge_counts[i], + sorted_major_labels.begin()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_major_labels.begin(), + sorted_major_labels.end()); + auto num_unique_labels = + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(sorted_major_labels.size()), + [labels = sorted_major_labels.data()] __device__(auto i) { + return (i == 0) || (labels[i - 1] != labels[i]); + }); + tmp_major_labels.resize(num_unique_labels, handle.get_stream()); + tmp_major_counts.resize(tmp_major_labels.size(), handle.get_stream()); + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_major_labels.begin(), + sorted_major_labels.end(), + thrust::make_constant_iterator(edge_t{1}), + tmp_major_labels.begin(), + tmp_major_counts.begin()); + } + + if (multi_gpu) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + rmm::device_uvector rx_major_labels(0, handle.get_stream()); + rmm::device_uvector rx_major_counts(0, handle.get_stream()); + auto rx_sizes = host_scalar_gather( + col_comm, tmp_major_labels.size(), static_cast(i), handle.get_stream()); + std::vector rx_displs{}; + if (static_cast(i) == col_comm_rank) { + rx_displs.assign(col_comm_size, size_t{0}); + std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); + rx_major_labels.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); + rx_major_counts.resize(rx_major_labels.size(), handle.get_stream()); + } + device_gatherv(col_comm, + thrust::make_zip_iterator( + 
thrust::make_tuple(tmp_major_labels.begin(), tmp_major_counts.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(rx_major_labels.begin(), rx_major_counts.begin())), + tmp_major_labels.size(), + rx_sizes, + rx_displs, + static_cast(i), + handle.get_stream()); + if (static_cast(i) == col_comm_rank) { + thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_major_labels.begin(), + rx_major_labels.end(), + rx_major_counts.begin()); + major_labels.resize(rx_major_labels.size(), handle.get_stream()); + major_counts.resize(major_labels.size(), handle.get_stream()); + auto pair_it = + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_major_labels.begin(), + rx_major_labels.end(), + rx_major_counts.begin(), + major_labels.begin(), + major_counts.begin()); + major_labels.resize(thrust::distance(major_labels.begin(), thrust::get<0>(pair_it)), + handle.get_stream()); + major_counts.resize(major_labels.size(), handle.get_stream()); + major_labels.shrink_to_fit(handle.get_stream()); + major_counts.shrink_to_fit(handle.get_stream()); + } + } else { + tmp_major_labels.shrink_to_fit(handle.get_stream()); + tmp_major_counts.shrink_to_fit(handle.get_stream()); + major_labels = std::move(tmp_major_labels); + major_counts = std::move(tmp_major_counts); + } + } // 2. acquire unique minor labels - rmm::device_uvector minor_labels(num_edgelist_edges, handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_minor_vertices, - edgelist_minor_vertices + num_edgelist_edges, - minor_labels.begin()); + std::vector minor_displs(edgelist_minor_vertices.size(), edge_t{0}); + std::partial_sum( + edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1); + rmm::device_uvector minor_labels(minor_displs.back() + edgelist_edge_counts.back(), + handle.get_stream()); + for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) { + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_minor_vertices[i], + edgelist_minor_vertices[i] + edgelist_edge_counts[i], + minor_labels.begin() + minor_displs[i]); + } thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), minor_labels.begin(), minor_labels.end()); - auto minor_label_it = - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - minor_labels.begin(), - minor_labels.end()); - minor_labels.resize(thrust::distance(minor_labels.begin(), minor_label_it), handle.get_stream()); + minor_labels.resize( + thrust::distance(minor_labels.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + minor_labels.begin(), + minor_labels.end())), + handle.get_stream()); + if (multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + + rmm::device_uvector rx_minor_labels(0, handle.get_stream()); + std::tie(rx_minor_labels, std::ignore) = groupby_gpuid_and_shuffle_values( + row_comm, + minor_labels.begin(), + minor_labels.end(), + [key_func = detail::compute_gpu_id_from_vertex_t{row_comm_size}] __device__( + auto val) { return key_func(val); }, + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_minor_labels.begin(), + rx_minor_labels.end()); + rx_minor_labels.resize( + thrust::distance( + rx_minor_labels.begin(), + 
thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_minor_labels.begin(), + rx_minor_labels.end())), + handle.get_stream()); + minor_labels = std::move(rx_minor_labels); + } minor_labels.shrink_to_fit(handle.get_stream()); // 3. merge major and minor labels and vertex labels rmm::device_uvector merged_labels(major_labels.size() + minor_labels.size(), handle.get_stream()); - rmm::device_uvector merged_counts(merged_labels.size(), handle.get_stream()); thrust::merge_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), major_labels.begin(), @@ -142,47 +233,7 @@ rmm::device_uvector compute_renumber_map( labels.shrink_to_fit(handle.get_stream()); counts.shrink_to_fit(handle.get_stream()); - // 4. if multi-GPU, shuffle and reduce (label, count) pairs - - if (multi_gpu) { - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - - auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(labels.begin(), counts.begin())); - rmm::device_uvector rx_labels(0, handle.get_stream()); - rmm::device_uvector rx_counts(0, handle.get_stream()); - std::forward_as_tuple(std::tie(rx_labels, rx_counts), std::ignore) = - groupby_gpuid_and_shuffle_values( - comm, - pair_first, - pair_first + labels.size(), - [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}] __device__( - auto val) { return key_func(thrust::get<0>(val)); }, - handle.get_stream()); - - labels.resize(rx_labels.size(), handle.get_stream()); - counts.resize(labels.size(), handle.get_stream()); - thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_labels.begin(), - rx_labels.end(), - rx_counts.begin()); - pair_it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_labels.begin(), - rx_labels.end(), - rx_counts.begin(), - labels.begin(), - counts.begin()); - rx_labels.resize(0, handle.get_stream()); - rx_counts.resize(0, handle.get_stream()); - rx_labels.shrink_to_fit(handle.get_stream()); - rx_counts.shrink_to_fit(handle.get_stream()); - labels.resize(thrust::distance(labels.begin(), thrust::get<0>(pair_it)), handle.get_stream()); - counts.resize(labels.size(), handle.get_stream()); - labels.shrink_to_fit(handle.get_stream()); - labels.shrink_to_fit(handle.get_stream()); - } - - // 5. if vertices != nullptr, add isolated vertices + // 4. if vertices != nullptr, add isolated vertices rmm::device_uvector isolated_vertices(0, handle.get_stream()); if (vertices != nullptr) { @@ -232,10 +283,9 @@ void expensive_check_edgelist( raft::handle_t const& handle, vertex_t const* local_vertices, vertex_t num_local_vertices /* relevant only if local_vertices != nullptr */, - vertex_t const* edgelist_major_vertices, - vertex_t const* edgelist_minor_vertices, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned /* relevant only if multi_gpu == true */) + std::vector const& edgelist_major_vertices, + std::vector const& edgelist_minor_vertices, + std::vector const& edgelist_edge_counts) { rmm::device_uvector sorted_local_vertices( local_vertices != nullptr ? 
num_local_vertices : vertex_t{0}, handle.get_stream()); @@ -246,6 +296,12 @@ void expensive_check_edgelist( thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), sorted_local_vertices.begin(), sorted_local_vertices.end()); + CUGRAPH_EXPECTS(static_cast(thrust::distance( + sorted_local_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_local_vertices.begin(), + sorted_local_vertices.end()))) == sorted_local_vertices.size(), + "Invalid input argument: local_vertices should not have duplicates."); if (multi_gpu) { auto& comm = handle.get_comms(); @@ -253,8 +309,15 @@ void expensive_check_edgelist( auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto const row_comm_size = row_comm.get_size(); + auto const row_comm_rank = row_comm.get_rank(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); + auto const col_comm_rank = col_comm.get_rank(); + + CUGRAPH_EXPECTS((edgelist_major_vertices.size() == edgelist_minor_vertices.size()) && + (edgelist_major_vertices.size() == static_cast(col_comm_size)), + "Invalid input argument: both edgelist_major_vertices.size() & " + "edgelist_minor_vertices.size() should coincide with col_comm_size."); CUGRAPH_EXPECTS( thrust::count_if( @@ -268,95 +331,127 @@ void expensive_check_edgelist( }) == 0, "Invalid input argument: local_vertices should be pre-shuffled."); - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices)); - CUGRAPH_EXPECTS( - thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edge_first, - edge_first + num_edgelist_edges, - [comm_rank, - key_func = - detail::compute_gpu_id_from_edge_t{is_hypergraph_partitioned, - comm_size, - row_comm_size, - col_comm_size}] __device__(auto edge) { - return key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != comm_rank; - }) == 0, - "Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be " - "pre-shuffled."); - - if (local_vertices != nullptr) { - rmm::device_uvector unique_edge_vertices(num_edgelist_edges * 2, - handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - unique_edge_vertices.begin()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_minor_vertices, - edgelist_minor_vertices + num_edgelist_edges, - unique_edge_vertices.begin() + num_edgelist_edges); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_edge_vertices.begin(), - unique_edge_vertices.end()); - unique_edge_vertices.resize( - thrust::distance( - unique_edge_vertices.begin(), - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_edge_vertices.begin(), - unique_edge_vertices.end())), - handle.get_stream()); - - rmm::device_uvector rx_unique_edge_vertices(0, handle.get_stream()); - std::tie(rx_unique_edge_vertices, std::ignore) = groupby_gpuid_and_shuffle_values( - handle.get_comms(), - unique_edge_vertices.begin(), - unique_edge_vertices.end(), - [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}] __device__( - auto val) { return key_func(val); }, - handle.get_stream()); - - unique_edge_vertices = std::move(rx_unique_edge_vertices); - + for 
(size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i])); CUGRAPH_EXPECTS( thrust::count_if( rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_edge_vertices.begin(), - unique_edge_vertices.end(), - [num_local_vertices, - sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) { - return !thrust::binary_search( - thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v); + edge_first, + edge_first + edgelist_edge_counts[i], + [comm_size, + comm_rank, + row_comm_rank, + col_comm_size, + col_comm_rank, + i, + gpu_id_key_func = + detail::compute_gpu_id_from_edge_t{comm_size, row_comm_size, col_comm_size}, + partition_id_key_func = + detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto edge) { + return (gpu_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != comm_rank) || + (partition_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != + row_comm_rank * col_comm_size + col_comm_rank + i * comm_size); }) == 0, - "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " - "invalid vertex ID(s)."); + "Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be " + "pre-shuffled."); + + auto aggregate_vertexlist_size = host_scalar_allreduce( + comm, + local_vertices != nullptr ? num_local_vertices : vertex_t{0}, + handle.get_stream()); // local_vertices != nullptr is insufficient in multi-GPU as only a + // subset of GPUs may have a non-zero number of vertices + if (aggregate_vertexlist_size > 0) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + + rmm::device_uvector sorted_major_vertices(0, handle.get_stream()); + { + auto recvcounts = + host_scalar_allgather(col_comm, sorted_local_vertices.size(), handle.get_stream()); + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + sorted_major_vertices.resize(displacements.back() + recvcounts.back(), + handle.get_stream()); + device_allgatherv(col_comm, + sorted_local_vertices.data(), + sorted_major_vertices.data(), + recvcounts, + displacements, + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_major_vertices.begin(), + sorted_major_vertices.end()); + } + + rmm::device_uvector sorted_minor_vertices(0, handle.get_stream()); + { + auto recvcounts = + host_scalar_allgather(row_comm, sorted_local_vertices.size(), handle.get_stream()); + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + sorted_minor_vertices.resize(displacements.back() + recvcounts.back(), + handle.get_stream()); + device_allgatherv(row_comm, + sorted_local_vertices.data(), + sorted_minor_vertices.data(), + recvcounts, + displacements, + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_minor_vertices.begin(), + sorted_minor_vertices.end()); + } + + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i])); + CUGRAPH_EXPECTS( + thrust::count_if( + 
rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + edgelist_edge_counts[i], + [num_major_vertices = static_cast(sorted_major_vertices.size()), + sorted_major_vertices = sorted_major_vertices.data(), + num_minor_vertices = static_cast(sorted_minor_vertices.size()), + sorted_minor_vertices = sorted_minor_vertices.data()] __device__(auto e) { + return !thrust::binary_search(thrust::seq, + sorted_major_vertices, + sorted_major_vertices + num_major_vertices, + thrust::get<0>(e)) || + !thrust::binary_search(thrust::seq, + sorted_minor_vertices, + sorted_minor_vertices + num_minor_vertices, + thrust::get<1>(e)); + }) == 0, + "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " + "invalid vertex ID(s)."); + } } } else { - if (local_vertices != nullptr) { - CUGRAPH_EXPECTS( - thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - [num_local_vertices, - sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) { - return !thrust::binary_search( - thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v); - }) == 0, - "Invalid input argument: edgelist_major_vertices has invalid vertex ID(s)."); + assert(edgelist_major_vertices.size() == 1); + assert(edgelist_minor_vertices.size() == 1); + if (local_vertices != nullptr) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[0], edgelist_minor_vertices[0])); CUGRAPH_EXPECTS( - thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - [num_local_vertices, - sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) { - return !thrust::binary_search( - thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v); - }) == 0, - "Invalid input argument: edgelist_major_vertices has invalid vertex ID(s)."); + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + edgelist_edge_counts[0], + [num_local_vertices, + sorted_local_vertices = sorted_local_vertices.data()] __device__(auto e) { + return !thrust::binary_search(thrust::seq, + sorted_local_vertices, + sorted_local_vertices + num_local_vertices, + thrust::get<0>(e)) || + !thrust::binary_search(thrust::seq, + sorted_local_vertices, + sorted_local_vertices + num_local_vertices, + thrust::get<1>(e)); + }) == 0, + "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " + "invalid vertex ID(s)."); } } } @@ -368,15 +463,15 @@ std::enable_if_t const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); #ifdef CUCO_STATIC_MAP_DEFINED auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); @@ -389,14 +484,20 @@ renumber_edgelist(raft::handle_t const& handle, auto const col_comm_size = col_comm.get_size(); auto const col_comm_rank = col_comm.get_rank(); + std::vector 
edgelist_const_major_vertices(edgelist_major_vertices.size()); + std::vector edgelist_const_minor_vertices(edgelist_const_major_vertices.size()); + for (size_t i = 0; i < edgelist_const_major_vertices.size(); ++i) { + edgelist_const_major_vertices[i] = edgelist_major_vertices[i]; + edgelist_const_minor_vertices[i] = edgelist_minor_vertices[i]; + } + if (do_expensive_check) { expensive_check_edgelist(handle, local_vertices, num_local_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned); + edgelist_const_major_vertices, + edgelist_const_minor_vertices, + edgelist_edge_counts); } // 1. compute renumber map @@ -405,142 +506,129 @@ renumber_edgelist(raft::handle_t const& handle, detail::compute_renumber_map(handle, local_vertices, num_local_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, - num_edgelist_edges); + edgelist_const_major_vertices, + edgelist_const_minor_vertices, + edgelist_edge_counts); // 2. initialize partition_t object, number_of_vertices, and number_of_edges for the renumbered // graph - auto vertex_partition_counts = host_scalar_allgather( + auto vertex_counts = host_scalar_allgather( comm, static_cast(renumber_map_labels.size()), handle.get_stream()); std::vector vertex_partition_offsets(comm_size + 1, 0); - std::partial_sum(vertex_partition_counts.begin(), - vertex_partition_counts.end(), - vertex_partition_offsets.begin() + 1); + std::partial_sum( + vertex_counts.begin(), vertex_counts.end(), vertex_partition_offsets.begin() + 1); - partition_t partition(vertex_partition_offsets, - is_hypergraph_partitioned, - row_comm_size, - col_comm_size, - row_comm_rank, - col_comm_rank); + partition_t partition( + vertex_partition_offsets, row_comm_size, col_comm_size, row_comm_rank, col_comm_rank); auto number_of_vertices = vertex_partition_offsets.back(); - auto number_of_edges = host_scalar_allreduce(comm, num_edgelist_edges, handle.get_stream()); + auto number_of_edges = host_scalar_allreduce( + comm, + std::accumulate(edgelist_edge_counts.begin(), edgelist_edge_counts.end(), edge_t{0}), + handle.get_stream()); // 3. 
renumber edges - if (is_hypergraph_partitioned) { - CUGRAPH_FAIL("unimplemented."); - } else { - double constexpr load_factor = 0.7; + double constexpr load_factor = 0.7; - // FIXME: compare this hash based approach with a binary search based approach in both memory - // footprint and execution time + // FIXME: compare this hash based approach with a binary search based approach in both memory + // footprint and execution time - { - vertex_t major_first{}; - vertex_t major_last{}; - std::tie(major_first, major_last) = partition.get_matrix_partition_major_range(0); - rmm::device_uvector renumber_map_major_labels(major_last - major_first, - handle.get_stream()); - std::vector recvcounts(row_comm_size); - for (int i = 0; i < row_comm_size; ++i) { - recvcounts[i] = partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i); - } - std::vector displacements(row_comm_size, 0); - std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); - device_allgatherv(row_comm, - renumber_map_labels.begin(), - renumber_map_major_labels.begin(), - recvcounts, - displacements, - handle.get_stream()); - - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // cuco::static_map currently does not take stream - - cuco::static_map renumber_map{ - static_cast(static_cast(renumber_map_major_labels.size()) / load_factor), - invalid_vertex_id::value, - invalid_vertex_id::value}; - auto pair_first = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(renumber_map_major_labels.begin(), - thrust::make_counting_iterator(major_first))), - [] __device__(auto val) { - return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); - }); - renumber_map.insert(pair_first, pair_first + renumber_map_major_labels.size()); - renumber_map.find(edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - edgelist_major_vertices); + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + rmm::device_uvector renumber_map_major_labels( + col_comm_rank == static_cast(i) ? vertex_t{0} + : partition.get_matrix_partition_major_size(i), + handle.get_stream()); + device_bcast(col_comm, + renumber_map_labels.data(), + renumber_map_major_labels.data(), + partition.get_matrix_partition_major_size(i), + i, + handle.get_stream()); + + CUDA_TRY(cudaStreamSynchronize( + handle.get_stream())); // cuco::static_map currently does not take stream + + cuco::static_map renumber_map{ + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast( + static_cast(partition.get_matrix_partition_major_size(i)) / load_factor), + static_cast(partition.get_matrix_partition_major_size(i)) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value}; + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple( + col_comm_rank == static_cast(i) ? renumber_map_labels.begin() + : renumber_map_major_labels.begin(), + thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i)))), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. 
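The FIXME near the top of this hunk asks for a comparison with a binary-search-based approach. For reference, a minimal sketch of what that alternative might look like — hypothetical code, not part of this change, reusing the same RMM/Thrust idioms as the surrounding file (the `rmm::exec_policy(stream)->on(stream)` form and include paths assume the RMM version this PR builds against) — replaces the cuco::static_map insert/find pair with a sort plus `thrust::lower_bound`/`thrust::gather`:

```cpp
#include <rmm/device_uvector.hpp>
#include <rmm/thrust_rmm_allocator.h>  // rmm::exec_policy (assumed include path)

#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/gather.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

// Hypothetical binary-search-based renumbering (sketch only): labels[j] holds the
// external label of internal vertex (major_first + j); majors is renumbered in place.
// Every element of majors is assumed to appear in labels.
template <typename vertex_t>
void renumber_majors_by_binary_search(cudaStream_t stream,
                                      rmm::device_uvector<vertex_t> const& labels,
                                      vertex_t major_first,
                                      vertex_t* majors,
                                      size_t num_edges)
{
  // sort (label, internal ID) pairs by label once
  rmm::device_uvector<vertex_t> sorted_labels(labels.size(), stream);
  rmm::device_uvector<vertex_t> sorted_ids(labels.size(), stream);
  thrust::copy(
    rmm::exec_policy(stream)->on(stream), labels.begin(), labels.end(), sorted_labels.begin());
  thrust::sequence(
    rmm::exec_policy(stream)->on(stream), sorted_ids.begin(), sorted_ids.end(), major_first);
  thrust::sort_by_key(rmm::exec_policy(stream)->on(stream),
                      sorted_labels.begin(),
                      sorted_labels.end(),
                      sorted_ids.begin());

  // for each edge endpoint, binary-search its position among the sorted labels, then
  // gather the matching internal ID; O(|E| log |V|) lookups, but no hash table storage
  rmm::device_uvector<vertex_t> positions(num_edges, stream);
  thrust::lower_bound(rmm::exec_policy(stream)->on(stream),
                      sorted_labels.begin(),
                      sorted_labels.end(),
                      majors,
                      majors + num_edges,
                      positions.begin());
  thrust::gather(rmm::exec_policy(stream)->on(stream),
                 positions.begin(),
                 positions.end(),
                 sorted_ids.begin(),
                 majors);
}
```

The trade-off the FIXME alludes to: the hash map costs extra memory (capacity / load factor) for O(1) expected lookups, while the sketch above costs only the sorted copies but pays a log factor per lookup.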
+ if (partition.get_matrix_partition_major_size(i) > 0) { + renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i)); } + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (edgelist_edge_counts[i]) { + renumber_map.find(edgelist_major_vertices[i], + edgelist_major_vertices[i] + edgelist_edge_counts[i], + edgelist_major_vertices[i]); + } + } - { - vertex_t minor_first{}; - vertex_t minor_last{}; - std::tie(minor_first, minor_last) = partition.get_matrix_partition_minor_range(); - rmm::device_uvector renumber_map_minor_labels(minor_last - minor_first, - handle.get_stream()); - - // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph - // partitioning - auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; - auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - // FIXME: this branch may be no longer necessary with NCCL backend - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - renumber_map_labels.begin(), - renumber_map_labels.end(), - renumber_map_minor_labels.begin() + - (partition.get_vertex_partition_first(comm_src_rank) - - partition.get_vertex_partition_first(row_comm_rank * col_comm_size))); - } else { - device_sendrecv(comm, - renumber_map_labels.begin(), - renumber_map_labels.size(), - comm_dst_rank, - renumber_map_minor_labels.begin() + - (partition.get_vertex_partition_first(comm_src_rank) - - partition.get_vertex_partition_first(row_comm_rank * col_comm_size)), - static_cast(partition.get_vertex_partition_size(comm_src_rank)), - comm_src_rank, - handle.get_stream()); - } - - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - for (int i = 0; i < col_comm_size; ++i) { - auto offset = partition.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - - partition.get_vertex_partition_first(row_comm_rank * col_comm_size); - auto count = partition.get_vertex_partition_size(row_comm_rank * col_comm_size + i); - device_bcast(col_comm, - renumber_map_minor_labels.begin() + offset, - renumber_map_minor_labels.begin() + offset, - count, - i, - handle.get_stream()); - } + { + rmm::device_uvector renumber_map_minor_labels( + partition.get_matrix_partition_minor_size(), handle.get_stream()); + std::vector recvcounts(row_comm_size); + for (int i = 0; i < row_comm_size; ++i) { + recvcounts[i] = partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i); + } + std::vector displacements(recvcounts.size(), 0); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + device_allgatherv(row_comm, + renumber_map_labels.begin(), + renumber_map_minor_labels.begin(), + recvcounts, + displacements, + handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // cuco::static_map currently does not take stream + CUDA_TRY(cudaStreamSynchronize( + handle.get_stream())); // cuco::static_map currently does not take stream - cuco::static_map renumber_map{ + cuco::static_map renumber_map{ + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max( static_cast(static_cast(renumber_map_minor_labels.size()) / load_factor), - invalid_vertex_id::value, - invalid_vertex_id::value}; - auto pair_first = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(renumber_map_minor_labels.begin(), - thrust::make_counting_iterator(minor_first))), - [] __device__(auto val) { - return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); - }); + renumber_map_minor_labels.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value}; + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple( + renumber_map_minor_labels.begin(), + thrust::make_counting_iterator(partition.get_matrix_partition_minor_first()))), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (renumber_map_minor_labels.size()) { renumber_map.insert(pair_first, pair_first + renumber_map_minor_labels.size()); - renumber_map.find(edgelist_minor_vertices, - edgelist_minor_vertices + num_edgelist_edges, - edgelist_minor_vertices); + } + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the + // grid size is 0; this leads to cudaErrorInvalidConfiguration. + if (edgelist_edge_counts[i]) { + renumber_map.find(edgelist_minor_vertices[i], + edgelist_minor_vertices[i] + edgelist_edge_counts[i], + edgelist_minor_vertices[i]); + } } } @@ -565,27 +653,28 @@ std::enable_if_t> renumber_edgelist( bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); #ifdef CUCO_STATIC_MAP_DEFINED if (do_expensive_check) { - expensive_check_edgelist(handle, - vertices, - num_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, - num_edgelist_edges, - false); + expensive_check_edgelist( + handle, + vertices, + num_vertices, + std::vector{edgelist_major_vertices}, + std::vector{edgelist_minor_vertices}, + std::vector{num_edgelist_edges}); } - auto renumber_map_labels = - detail::compute_renumber_map(handle, - vertices, - num_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, - num_edgelist_edges); + auto renumber_map_labels = detail::compute_renumber_map( + handle, + vertices, + num_vertices, + std::vector{edgelist_major_vertices}, + std::vector{edgelist_minor_vertices}, + std::vector{num_edgelist_edges}); double constexpr load_factor = 0.7; @@ -593,7 +682,11 @@ std::enable_if_t> renumber_edgelist( // footprint and execution time cuco::static_map renumber_map{ - static_cast(static_cast(renumber_map_labels.size()) / load_factor), + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(renumber_map_labels.size()) / load_factor), + renumber_map_labels.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value}; auto pair_first = thrust::make_transform_iterator( @@ -602,11 +695,21 @@ std::enable_if_t> renumber_edgelist( [] __device__(auto val) { return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); - renumber_map.insert(pair_first, pair_first + renumber_map_labels.size()); - renumber_map.find( - edgelist_major_vertices, edgelist_major_vertices + num_edgelist_edges, edgelist_major_vertices); - renumber_map.find( - edgelist_minor_vertices, edgelist_minor_vertices + num_edgelist_edges, edgelist_minor_vertices); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (renumber_map_labels.size()) { + renumber_map.insert(pair_first, pair_first + renumber_map_labels.size()); + } + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (num_edgelist_edges > 0) { + renumber_map.find(edgelist_major_vertices, + edgelist_major_vertices + num_edgelist_edges, + edgelist_major_vertices); + renumber_map.find(edgelist_minor_vertices, + edgelist_minor_vertices + num_edgelist_edges, + edgelist_minor_vertices); + } return renumber_map_labels; #else @@ -620,22 +723,21 @@ template std::enable_if_t, partition_t, vertex_t, edge_t>> renumber_edgelist(raft::handle_t const& handle, - vertex_t* edgelist_major_vertices /* [INOUT] */, - vertex_t* edgelist_minor_vertices /* [INOUT] */, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, static_cast(nullptr), vertex_t{0}, edgelist_major_vertices, edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned, + edgelist_edge_counts, do_expensive_check); } @@ -648,8 +750,9 @@ std::enable_if_t> renumber_edgelist( bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, static_cast(nullptr), vertex_t{0} /* dummy */, @@ -665,22 +768,21 @@ std::enable_if_t const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + 
handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, local_vertices, num_local_vertices, edgelist_major_vertices, edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned, + edgelist_edge_counts, do_expensive_check); } @@ -695,8 +797,9 @@ std::enable_if_t> renumber_edgelist( bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, vertices, num_vertices, @@ -711,12 +814,12 @@ std::enable_if_t> renumber_edgelist( // instantiations for // template std::tuple, partition_t, int32_t, int32_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -726,14 +829,14 @@ template rmm::device_uvector renumber_edgelist bool do_expensive_check); template std::tuple, partition_t, int32_t, int32_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t const* local_vertices, - int32_t num_local_vertices, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + int32_t const* local_vertices, + int32_t num_local_vertices, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -747,12 +850,12 @@ template rmm::device_uvector renumber_edgelist // instantiations for // template std::tuple, partition_t, int32_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -762,14 +865,14 @@ template rmm::device_uvector renumber_edgelist bool do_expensive_check); template std::tuple, partition_t, int32_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t const* local_vertices, - int32_t num_local_vertices, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + 
raft::handle_t const& handle, + int32_t const* local_vertices, + int32_t num_local_vertices, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -783,12 +886,12 @@ template rmm::device_uvector renumber_edgelist // instantiations for // template std::tuple, partition_t, int64_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int64_t* edgelist_major_vertices /* [INOUT] */, - int64_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -798,14 +901,14 @@ template rmm::device_uvector renumber_edgelist bool do_expensive_check); template std::tuple, partition_t, int64_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int64_t const* local_vertices, - int64_t num_local_vertices, - int64_t* edgelist_major_vertices /* [INOUT] */, - int64_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + int64_t const* local_vertices, + int64_t num_local_vertices, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, diff --git a/cpp/src/experimental/renumber_utils.cu b/cpp/src/experimental/renumber_utils.cu new file mode 100644 index 00000000000..8f59683d9d6 --- /dev/null +++ b/cpp/src/experimental/renumber_utils.cu @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { + +template +void renumber_ext_vertices(raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + bool do_expensive_check) +{ + double constexpr load_factor = 0.7; + + // FIXME: remove this check once we drop Pascal support + CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, + "renumber_vertices() not supported on Pascal and older architectures."); + +#ifdef CUCO_STATIC_MAP_DEFINED + if (do_expensive_check) { + rmm::device_uvector labels(local_int_vertex_last - local_int_vertex_first, + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + renumber_map_labels, + renumber_map_labels + labels.size(), + labels.begin()); + thrust::sort( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), labels.begin(), labels.end()); + CUGRAPH_EXPECTS(thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + labels.begin(), + labels.end()) == labels.end(), + "Invalid input arguments: renumber_map_labels have duplicate elements."); + } + + auto renumber_map_ptr = std::make_unique>( + size_t{0}, invalid_vertex_id::value, invalid_vertex_id::value); + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector sorted_unique_ext_vertices(num_vertices, handle.get_stream()); + sorted_unique_ext_vertices.resize( + thrust::distance( + sorted_unique_ext_vertices.begin(), + thrust::copy_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + sorted_unique_ext_vertices.begin(), + [] __device__(auto v) { return v != invalid_vertex_id::value; })), + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_ext_vertices.begin(), + sorted_unique_ext_vertices.end()); + sorted_unique_ext_vertices.resize( + thrust::distance( + sorted_unique_ext_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_ext_vertices.begin(), + sorted_unique_ext_vertices.end())), + handle.get_stream()); + + auto int_vertices_for_sorted_unique_ext_vertices = collect_values_for_unique_keys( + comm, + renumber_map_labels, + renumber_map_labels + (local_int_vertex_last - local_int_vertex_first), + thrust::make_counting_iterator(local_int_vertex_first), + sorted_unique_ext_vertices.begin(), + sorted_unique_ext_vertices.end(), + detail::compute_gpu_id_from_vertex_t{comm_size}, + handle.get_stream()); + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + renumber_map_ptr.reset(); + + renumber_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max( + static_cast(static_cast(sorted_unique_ext_vertices.size()) / load_factor), + sorted_unique_ext_vertices.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto kv_pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple( + sorted_unique_ext_vertices.begin(), int_vertices_for_sorted_unique_ext_vertices.begin())), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (sorted_unique_ext_vertices.size()) { + renumber_map_ptr->insert(kv_pair_first, kv_pair_first + sorted_unique_ext_vertices.size()); + } + } else { + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + renumber_map_ptr.reset(); + + renumber_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast( + static_cast(local_int_vertex_last - local_int_vertex_first) / load_factor), + static_cast(local_int_vertex_last - local_int_vertex_first) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(renumber_map_labels, thrust::make_counting_iterator(vertex_t{0}))), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if ((local_int_vertex_last - local_int_vertex_first) > 0) { + renumber_map_ptr->insert(pair_first, + pair_first + (local_int_vertex_last - local_int_vertex_first)); + } + } + + if (do_expensive_check) { + rmm::device_uvector contains(num_vertices, handle.get_stream()); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (num_vertices > 0) { + renumber_map_ptr->contains(vertices, vertices + num_vertices, contains.begin()); + } + auto vc_pair_first = thrust::make_zip_iterator(thrust::make_tuple(vertices, contains.begin())); + CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vc_pair_first, + vc_pair_first + num_vertices, + [] __device__(auto pair) { + auto v = thrust::get<0>(pair); + auto c = thrust::get<1>(pair); + return v == invalid_vertex_id::value + ? (c == true) + : (c == false); + }) == 0, + "Invalid input arguments: vertices have elements that are missing in " + "(aggregate) renumber_map_labels."); + } + + // FIXME: a temporary workaround for https://github.com/NVIDIA/cuCollections/issues/74 +#if 1 + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + vertices, + [view = renumber_map_ptr->get_device_view()] __device__(auto v) { + return v != invalid_vertex_id::value + ? view.find(v)->second.load(cuda::std::memory_order_relaxed) + : invalid_vertex_id::value; + }); +#else + // FIXME: a temporary workaround. 
cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (num_vertices > 0) { renumber_map_ptr->find(vertices, vertices + num_vertices, vertices); } +#endif +#endif +} + +template +void unrenumber_local_int_vertices( + raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels /* size = local_int_vertex_last - local_int_vertex_first */, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + bool do_expensive_check) +{ + // FIXME: remove this check once we drop Pascal support + CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, + "unrenumber_local_vertices() not supported on Pascal and older architectures."); + +#ifdef CUCO_STATIC_MAP_DEFINED + if (do_expensive_check) { + CUGRAPH_EXPECTS( + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + [local_int_vertex_first, local_int_vertex_last] __device__(auto v) { + return v != invalid_vertex_id::value && + (v < local_int_vertex_first || v >= local_int_vertex_last); + }) == 0, + "Invalid input arguments: there are non-local vertices in [vertices, vertices " + "+ num_vertices)."); + } + + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + vertices, + [renumber_map_labels, local_int_vertex_first] __device__(auto v) { + return v == invalid_vertex_id::value + ? v + : renumber_map_labels[v - local_int_vertex_first]; + }); +#endif +} + +template +void unrenumber_int_vertices(raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check) +{ + double constexpr load_factor = 0.7; + + // FIXME: remove this check once we drop Pascal support + CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, + "unrenumber_vertices() not supported on Pascal and older architectures."); + +#ifdef CUCO_STATIC_MAP_DEFINED + if (do_expensive_check) { + CUGRAPH_EXPECTS( + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + [int_vertex_last = vertex_partition_lasts.back()] __device__(auto v) { + return v != invalid_vertex_id::value && + !is_valid_vertex(int_vertex_last, v); + }) == 0, + "Invalid input arguments: there are out-of-range vertices in [vertices, vertices " + "+ num_vertices)."); + } + + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector sorted_unique_int_vertices(num_vertices, handle.get_stream()); + sorted_unique_int_vertices.resize( + thrust::distance( + sorted_unique_int_vertices.begin(), + thrust::copy_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + sorted_unique_int_vertices.begin(), + [] __device__(auto v) { return v != invalid_vertex_id::value; })), + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end()); + sorted_unique_int_vertices.resize( + thrust::distance( + sorted_unique_int_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_int_vertices.begin(), + 
sorted_unique_int_vertices.end())), + handle.get_stream()); + + rmm::device_uvector d_vertex_partition_lasts(vertex_partition_lasts.size(), + handle.get_stream()); + raft::update_device(d_vertex_partition_lasts.data(), + vertex_partition_lasts.data(), + vertex_partition_lasts.size(), + handle.get_stream()); + rmm::device_uvector d_tx_int_vertex_offsets(d_vertex_partition_lasts.size(), + handle.get_stream()); + thrust::lower_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end(), + d_vertex_partition_lasts.begin(), + d_vertex_partition_lasts.end(), + d_tx_int_vertex_offsets.begin()); + std::vector h_tx_int_vertex_counts(d_tx_int_vertex_offsets.size()); + raft::update_host(h_tx_int_vertex_counts.data(), + d_tx_int_vertex_offsets.data(), + d_tx_int_vertex_offsets.size(), + handle.get_stream()); + handle.get_stream_view().synchronize(); + std::adjacent_difference( + h_tx_int_vertex_counts.begin(), h_tx_int_vertex_counts.end(), h_tx_int_vertex_counts.begin()); + + rmm::device_uvector rx_int_vertices(0, handle.get_stream()); + std::vector rx_int_vertex_counts{}; + std::tie(rx_int_vertices, rx_int_vertex_counts) = shuffle_values( + comm, sorted_unique_int_vertices.begin(), h_tx_int_vertex_counts, handle.get_stream()); + + auto tx_ext_vertices = std::move(rx_int_vertices); + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + tx_ext_vertices.begin(), + tx_ext_vertices.end(), + tx_ext_vertices.begin(), + [renumber_map_labels, local_int_vertex_first] __device__(auto v) { + return renumber_map_labels[v - local_int_vertex_first]; + }); + + rmm::device_uvector rx_ext_vertices_for_sorted_unique_int_vertices( + 0, handle.get_stream()); + std::tie(rx_ext_vertices_for_sorted_unique_int_vertices, std::ignore) = + shuffle_values(comm, tx_ext_vertices.begin(), rx_int_vertex_counts, handle.get_stream()); + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + cuco::static_map unrenumber_map( + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max( + static_cast(static_cast(sorted_unique_int_vertices.size()) / load_factor), + sorted_unique_int_vertices.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(sorted_unique_int_vertices.begin(), + rx_ext_vertices_for_sorted_unique_int_vertices.begin())), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (sorted_unique_int_vertices.size()) { + unrenumber_map.insert(pair_first, pair_first + sorted_unique_int_vertices.size()); + } + // FIXME: a temporary workaround for https://github.com/NVIDIA/cuCollections/issues/74 +#if 1 + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + vertices, + [view = unrenumber_map.get_device_view()] __device__(auto v) { + return v != invalid_vertex_id::value + ? view.find(v)->second.load(cuda::std::memory_order_relaxed) + : invalid_vertex_id::value; + }); +#else + // FIXME: a temporary workaround. 
cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (num_vertices > 0) { unrenumber_map.find(vertices, vertices + num_vertices, vertices); } +#endif + } else { + unrenumber_local_int_vertices(handle, + vertices, + num_vertices, + renumber_map_labels, + local_int_vertex_first, + local_int_vertex_last, + do_expensive_check); + } +#endif +} + +// explicit instantiation + +template void renumber_ext_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_local_int_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_local_int_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/experimental/sssp.cu b/cpp/src/experimental/sssp.cu index 4996b3734cb..373444cb0a2 100644 --- a/cpp/src/experimental/sssp.cu +++ b/cpp/src/experimental/sssp.cu @@ -70,6 +70,9 @@ void sssp(raft::handle_t const &handle, CUGRAPH_EXPECTS(push_graph_view.is_valid_vertex(source_vertex), "Invalid input argument: source vertex out-of-range."); + CUGRAPH_EXPECTS(push_graph_view.is_weighted(), + 
"Invalid input argument: an unweighted graph is passed to SSSP, BFS is more " + "efficient for unweighted graphs."); if (do_expensive_check) { auto num_negative_edge_weights = @@ -126,10 +129,7 @@ void sssp(raft::handle_t const &handle, // FIXME: need to double check the bucket sizes are sufficient std::vector bucket_sizes(static_cast(Bucket::num_buckets), push_graph_view.get_number_of_local_vertices()); - VertexFrontier, - vertex_t, - GraphViewType::is_multi_gpu, - static_cast(Bucket::num_buckets)> + VertexFrontier(Bucket::num_buckets)> vertex_frontier(handle, bucket_sizes); // 5. SSSP iteration @@ -188,7 +188,7 @@ void sssp(raft::handle_t const &handle, threshold = old_distance < threshold ? old_distance : threshold; } if (new_distance >= threshold) { push = false; } - return thrust::make_tuple(push, new_distance, src); + return thrust::make_tuple(push, thrust::make_tuple(new_distance, src)); }, reduce_op::min>(), distances, @@ -199,8 +199,8 @@ void sssp(raft::handle_t const &handle, auto idx = new_dist < v_val ? (new_dist < near_far_threshold ? static_cast(Bucket::new_near) : static_cast(Bucket::far)) - : VertexFrontier, vertex_t>::kInvalidBucketIdx; - return thrust::make_tuple(idx, thrust::get<0>(pushed_val), thrust::get<1>(pushed_val)); + : VertexFrontier::kInvalidBucketIdx; + return thrust::make_tuple(idx, pushed_val); }); vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).clear(); @@ -222,7 +222,7 @@ void sssp(raft::handle_t const &handle, auto dist = *(distances + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v)); if (dist < old_near_far_threshold) { - return VertexFrontier, vertex_t>::kInvalidBucketIdx; + return VertexFrontier::kInvalidBucketIdx; } else if (dist < near_far_threshold) { return static_cast(Bucket::cur_near); } else { diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index a9e3146bbcd..4a2b98ea815 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -20,22 +20,101 @@ #include #include #include -#include #include #include #include #include +#include +#include + #include +#include +#include #include +#include #include +#include +#include + +#include +#include namespace cugraph { namespace cython { namespace detail { -// FIXME: Add description of this function +// workaround for CUDA extended lambda restrictions +template +struct compute_local_partition_id_t { + vertex_t const* lasts{nullptr}; + size_t num_local_partitions{0}; + + __device__ size_t operator()(vertex_t v) + { + for (size_t i = 0; i < num_local_partitions; ++i) { + if (v < lasts[i]) { return i; } + } + return num_local_partitions; + } +}; + +// FIXME: this is unnecessary if edge_counts_ in the major_minor_weights_t object returned by +// call_shuffle() is passed back, better be fixed. this code assumes that the entire set of edges +// for each partition are consecutively stored. 
+template +std::vector compute_edge_counts(raft::handle_t const& handle, + graph_container_t const& graph_container) +{ + auto num_local_partitions = static_cast(graph_container.col_comm_size); + + std::vector partition_offsets_vector( + reinterpret_cast(graph_container.vertex_partition_offsets), + reinterpret_cast(graph_container.vertex_partition_offsets) + + (graph_container.row_comm_size * graph_container.col_comm_size) + 1); + + std::vector h_lasts(num_local_partitions); + for (size_t i = 0; i < h_lasts.size(); ++i) { + h_lasts[i] = partition_offsets_vector[graph_container.row_comm_size * (i + 1)]; + } + rmm::device_uvector d_lasts(h_lasts.size(), handle.get_stream()); + raft::update_device(d_lasts.data(), h_lasts.data(), h_lasts.size(), handle.get_stream()); + auto major_vertices = transposed + ? reinterpret_cast(graph_container.dst_vertices) + : reinterpret_cast(graph_container.src_vertices); + auto key_first = thrust::make_transform_iterator( + major_vertices, compute_local_partition_id_t{d_lasts.data(), num_local_partitions}); + rmm::device_uvector d_local_partition_ids(num_local_partitions, handle.get_stream()); + rmm::device_uvector d_edge_counts(d_local_partition_ids.size(), handle.get_stream()); + auto it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + key_first, + key_first + graph_container.num_local_edges, + thrust::make_constant_iterator(edge_t{1}), + d_local_partition_ids.begin(), + d_edge_counts.begin()); + if (static_cast(thrust::distance(d_local_partition_ids.begin(), thrust::get<0>(it))) < + num_local_partitions) { + rmm::device_uvector d_counts(num_local_partitions, handle.get_stream()); + thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_counts.begin(), + d_counts.end(), + edge_t{0}); + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_edge_counts.begin(), + thrust::get<1>(it), + d_local_partition_ids.begin(), + d_counts.begin()); + d_edge_counts = std::move(d_counts); + } + std::vector h_edge_counts(num_local_partitions, 0); + raft::update_host( + h_edge_counts.data(), d_edge_counts.data(), d_edge_counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + return h_edge_counts; +} + template > create_graph(raft::handle_t const& handle, graph_container_t const& graph_container) { - std::vector> edgelist( - {{reinterpret_cast(graph_container.src_vertices), - reinterpret_cast(graph_container.dst_vertices), - reinterpret_cast(graph_container.weights), - static_cast(graph_container.num_partition_edges)}}); + auto num_local_partitions = static_cast(graph_container.col_comm_size); std::vector partition_offsets_vector( reinterpret_cast(graph_container.vertex_partition_offsets), reinterpret_cast(graph_container.vertex_partition_offsets) + (graph_container.row_comm_size * graph_container.col_comm_size) + 1); + auto edge_counts = compute_edge_counts(handle, graph_container); + + std::vector displacements(edge_counts.size(), 0); + std::partial_sum(edge_counts.begin(), edge_counts.end() - 1, displacements.begin() + 1); + + std::vector> edgelists( + num_local_partitions); + for (size_t i = 0; i < edgelists.size(); ++i) { + edgelists[i] = cugraph::experimental::edgelist_t{ + reinterpret_cast(graph_container.src_vertices) + displacements[i], + reinterpret_cast(graph_container.dst_vertices) + displacements[i], + graph_container.graph_props.is_weighted + ? 
reinterpret_cast(graph_container.weights) + displacements[i] + : static_cast(nullptr), + edge_counts[i]}; + } + experimental::partition_t partition(partition_offsets_vector, - graph_container.hypergraph_partitioned, graph_container.row_comm_size, graph_container.col_comm_size, graph_container.row_comm_rank, @@ -65,14 +156,12 @@ create_graph(raft::handle_t const& handle, graph_container_t const& graph_contai return std::make_unique>( handle, - edgelist, + edgelists, partition, static_cast(graph_container.num_global_vertices), static_cast(graph_container.num_global_edges), graph_container.graph_props, - // FIXME: This currently fails if sorted_by_degree is true... - // graph_container.sorted_by_degree, - false, + true, graph_container.do_expensive_check); } @@ -89,7 +178,7 @@ create_graph(raft::handle_t const& handle, graph_container_t const& graph_contai reinterpret_cast(graph_container.src_vertices), reinterpret_cast(graph_container.dst_vertices), reinterpret_cast(graph_container.weights), - static_cast(graph_container.num_partition_edges)}; + static_cast(graph_container.num_local_edges)}; return std::make_unique>( handle, edgelist, @@ -113,10 +202,11 @@ void populate_graph_container(graph_container_t& graph_container, numberTypeEnum vertexType, numberTypeEnum edgeType, numberTypeEnum weightType, - size_t num_partition_edges, + size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, bool sorted_by_degree, + bool is_weighted, bool transposed, bool multi_gpu) { @@ -124,7 +214,6 @@ void populate_graph_container(graph_container_t& graph_container, "populate_graph_container() can only be called on an empty container."); bool do_expensive_check{true}; - bool hypergraph_partitioned{false}; if (multi_gpu) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -143,7 +232,7 @@ void populate_graph_container(graph_container_t& graph_container, graph_container.src_vertices = src_vertices; graph_container.dst_vertices = dst_vertices; graph_container.weights = weights; - graph_container.num_partition_edges = num_partition_edges; + graph_container.num_local_edges = num_local_edges; graph_container.num_global_vertices = num_global_vertices; graph_container.num_global_edges = num_global_edges; graph_container.vertexType = vertexType; @@ -151,11 +240,11 @@ void populate_graph_container(graph_container_t& graph_container, graph_container.weightType = weightType; graph_container.transposed = transposed; graph_container.is_multi_gpu = multi_gpu; - graph_container.hypergraph_partitioned = hypergraph_partitioned; graph_container.sorted_by_degree = sorted_by_degree; graph_container.do_expensive_check = do_expensive_check; - experimental::graph_properties_t graph_props{.is_symmetric = false, .is_multigraph = false}; + experimental::graph_properties_t graph_props{ + .is_symmetric = false, .is_multigraph = false, .is_weighted = is_weighted}; graph_container.graph_props = graph_props; graph_container.graph_type = graphTypeEnum::graph_t; @@ -177,7 +266,7 @@ void populate_graph_container_legacy(graph_container_t& graph_container, int* local_offsets) { CUGRAPH_EXPECTS(graph_container.graph_type == graphTypeEnum::null, - "populate_graph_container() can only be called on an empty container."); + "populate_graph_container_legacy() can only be called on an empty container."); // FIXME: This is soon-to-be legacy code left in place until the new graph_t // class is supported everywhere else. 
Remove everything down to the comment @@ -802,23 +891,23 @@ void call_sssp(raft::handle_t const& handle, // wrapper for shuffling: // template -std::unique_ptr> call_shuffle( +std::unique_ptr> call_shuffle( raft::handle_t const& handle, vertex_t* edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place vertex_t* edgelist_minor_vertices, // [IN / OUT] weight_t* edgelist_weights, // [IN / OUT] - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned) // = false + edge_t num_edgelist_edges) { - auto& comm = handle.get_comms(); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - - std::unique_ptr> ptr_ret = - std::make_unique>(handle); + std::unique_ptr> ptr_ret = + std::make_unique>(handle); if (edgelist_weights != nullptr) { auto zip_edge = thrust::make_zip_iterator( @@ -833,10 +922,7 @@ std::unique_ptr> call_shuffle( zip_edge + num_edgelist_edges, [key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - is_hypergraph_partitioned, - comm.get_size(), - row_comm.get_size(), - col_comm.get_size()}] __device__(auto val) { + comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) { return key_func(thrust::get<0>(val), thrust::get<1>(val)); }, handle.get_stream()); @@ -852,15 +938,46 @@ std::unique_ptr> call_shuffle( zip_edge + num_edgelist_edges, [key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - is_hypergraph_partitioned, - comm.get_size(), - row_comm.get_size(), - col_comm.get_size()}] __device__(auto val) { + comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) { return key_func(thrust::get<0>(val), thrust::get<1>(val)); }, handle.get_stream()); } + auto local_partition_id_op = + [comm_size, + key_func = cugraph::experimental::detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto pair) { + return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) / + comm_size; // global partition id to local partition id + }; + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(ptr_ret->get_major().data(), ptr_ret->get_minor().data())); + + auto edge_counts = + (edgelist_weights != nullptr) + ? 
cugraph::experimental::groupby_and_count(pair_first, + pair_first + ptr_ret->get_major().size(), + ptr_ret->get_weights().data(), + local_partition_id_op, + col_comm_size, + handle.get_stream()) + : cugraph::experimental::groupby_and_count(pair_first, + pair_first + ptr_ret->get_major().size(), + local_partition_id_op, + col_comm_size, + handle.get_stream()); + + std::vector h_edge_counts(edge_counts.size()); + raft::update_host( + h_edge_counts.data(), edge_counts.data(), edge_counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + ptr_ret->get_edge_counts().resize(h_edge_counts.size()); + for (size_t i = 0; i < h_edge_counts.size(); ++i) { + ptr_ret->get_edge_counts()[i] = static_cast(h_edge_counts[i]); + } + return ptr_ret; // RVO-ed } @@ -872,8 +989,7 @@ std::unique_ptr> call_renumber( raft::handle_t const& handle, vertex_t* shuffled_edgelist_major_vertices /* [INOUT] */, vertex_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu) // bc. cython cannot take non-type template params { @@ -883,33 +999,31 @@ std::unique_ptr> call_renumber( std::make_unique>(handle); if (multi_gpu) { + std::vector displacements(edge_counts.size(), edge_t{0}); + std::partial_sum(edge_counts.begin(), edge_counts.end() - 1, displacements.begin() + 1); + std::vector major_ptrs(edge_counts.size()); + std::vector minor_ptrs(major_ptrs.size()); + for (size_t i = 0; i < edge_counts.size(); ++i) { + major_ptrs[i] = shuffled_edgelist_major_vertices + displacements[i]; + minor_ptrs[i] = shuffled_edgelist_minor_vertices + displacements[i]; + } + std::tie( p_ret->get_dv(), p_ret->get_partition(), p_ret->get_num_vertices(), p_ret->get_num_edges()) = cugraph::experimental::renumber_edgelist( - handle, - shuffled_edgelist_major_vertices, - shuffled_edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned, - do_expensive_check); + handle, major_ptrs, minor_ptrs, edge_counts, do_expensive_check); } else { - auto ret_f = cugraph::experimental::renumber_edgelist( + p_ret->get_dv() = cugraph::experimental::renumber_edgelist( handle, shuffled_edgelist_major_vertices, shuffled_edgelist_minor_vertices, - num_edgelist_edges, + edge_counts[0], do_expensive_check); - auto tot_vertices = static_cast(ret_f.size()); - - p_ret->get_dv() = std::move(ret_f); - cugraph::experimental::partition_t part_sg( - std::vector{0, tot_vertices}, false, 1, 1, 0, 0); - - p_ret->get_partition() = std::move(part_sg); + p_ret->get_partition() = cugraph::experimental::partition_t{}; // dummy - p_ret->get_num_vertices() = tot_vertices; - p_ret->get_num_edges() = num_edgelist_edges; + p_ret->get_num_vertices() = static_cast(p_ret->get_dv().size()); + p_ret->get_num_edges() = edge_counts[0]; } return p_ret; // RVO-ed (copy elision) @@ -1142,53 +1256,47 @@ template void call_sssp(raft::handle_t const& handle, int64_t* predecessors, const int64_t source_vertex); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, float* edgelist_weights, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int32_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, float* edgelist_weights, - int64_t 
num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, double* edgelist_weights, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int32_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, double* edgelist_weights, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int64_t* edgelist_major_vertices, int64_t* edgelist_minor_vertices, float* edgelist_weights, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int64_t* edgelist_major_vertices, int64_t* edgelist_minor_vertices, double* edgelist_weights, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); // TODO: add the remaining relevant EIDIr's: // @@ -1196,8 +1304,7 @@ template std::unique_ptr> call_renumber( raft::handle_t const& handle, int32_t* shuffled_edgelist_major_vertices /* [INOUT] */, int32_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu); @@ -1205,8 +1312,7 @@ template std::unique_ptr> call_renumber( raft::handle_t const& handle, int32_t* shuffled_edgelist_major_vertices /* [INOUT] */, int32_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu); @@ -1214,8 +1320,7 @@ template std::unique_ptr> call_renumber( raft::handle_t const& handle, int64_t* shuffled_edgelist_major_vertices /* [INOUT] */, int64_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3b65b0edb29..89975f673ae 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -20,9 +20,10 @@ # - common test utils ----------------------------------------------------------------------------- add_library(cugraphtestutil STATIC - "${CMAKE_CURRENT_SOURCE_DIR}/utilities/generate_graph_from_edgelist.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/matrix_market_file_utilities.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/rmat_utilities.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/utilities/generate_graph_from_edgelist.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/utilities/thrust_wrapper.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/misc_utilities.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c") @@ -445,7 +446,34 @@ if(BUILD_CUGRAPH_MG_TESTS) target_link_libraries(MG_PAGERANK_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) ########################################################################################### - # - MG LOUVAIN tests --------------------------------------------------------------------- + # - MG KATZ CENTRALITY tests 
-------------------------------------------------------------- + + set(MG_KATZ_CENTRALITY_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/mg_katz_centrality_test.cpp") + + ConfigureTest(MG_KATZ_CENTRALITY_TEST "${MG_KATZ_CENTRALITY_TEST_SRCS}") + target_link_libraries(MG_KATZ_CENTRALITY_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + + ########################################################################################### + # - MG BFS tests -------------------------------------------------------------------------- + + set(MG_BFS_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/mg_bfs_test.cpp") + + ConfigureTest(MG_BFS_TEST "${MG_BFS_TEST_SRCS}") + target_link_libraries(MG_BFS_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + + ########################################################################################### + # - MG SSSP tests ------------------------------------------------------------------------- + + set(MG_SSSP_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/mg_sssp_test.cpp") + + ConfigureTest(MG_SSSP_TEST "${MG_SSSP_TEST_SRCS}") + target_link_libraries(MG_SSSP_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + + ########################################################################################### + # - MG LOUVAIN tests ---------------------------------------------------------------------- set(MG_LOUVAIN_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/community/mg_louvain_helper.cu" @@ -453,7 +481,6 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureTest(MG_LOUVAIN_TEST "${MG_LOUVAIN_TEST_SRCS}") target_link_libraries(MG_LOUVAIN_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) - target_link_libraries(MG_LOUVAIN_TEST PRIVATE cugraph) else(MPI_CXX_FOUND) message(FATAL_ERROR "OpenMPI NOT found, cannot build MG tests.") diff --git a/cpp/tests/community/egonet_test.cu b/cpp/tests/community/egonet_test.cu index e7fea43be42..d61080c685e 100644 --- a/cpp/tests/community/egonet_test.cu +++ b/cpp/tests/community/egonet_test.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -129,8 +128,10 @@ class Tests_InducedEgo : public ::testing::TestWithParam { ASSERT_TRUE(h_cugraph_ego_edge_offsets[i] <= h_cugraph_ego_edge_offsets[i + 1]); auto n_vertices = graph_view.get_number_of_vertices(); for (size_t i = 0; i < d_ego_edgelist_src.size(); i++) { - ASSERT_TRUE(cugraph::test::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_src[i])); - ASSERT_TRUE(cugraph::test::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_dst[i])); + ASSERT_TRUE( + cugraph::experimental::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_src[i])); + ASSERT_TRUE( + cugraph::experimental::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_dst[i])); } /* diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu index a7f95e6d718..661065ca65b 100644 --- a/cpp/tests/community/mg_louvain_helper.cu +++ b/cpp/tests/community/mg_louvain_helper.cu @@ -323,7 +323,8 @@ coarsen_graph( handle, edgelist, new_number_of_vertices, - cugraph::experimental::graph_properties_t{graph_view.is_symmetric(), false}, + cugraph::experimental::graph_properties_t{ + graph_view.is_symmetric(), false, graph_view.is_weighted()}, true); } diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp index f6596a6b59a..8a1a3010a6f 100644 --- a/cpp/tests/community/mg_louvain_test.cpp +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -31,10 +31,13 @@ #include -void compare(float modularity, float sg_modularity) { ASSERT_FLOAT_EQ(modularity, sg_modularity); } -void compare(double 
modularity, double sg_modularity) +void compare(float mg_modularity, float sg_modularity) { - ASSERT_DOUBLE_EQ(modularity, sg_modularity); + ASSERT_FLOAT_EQ(mg_modularity, sg_modularity); +} +void compare(double mg_modularity, double sg_modularity) +{ + ASSERT_DOUBLE_EQ(mg_modularity, sg_modularity); } //////////////////////////////////////////////////////////////////////////////// @@ -90,13 +93,13 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam cugraph::Dendrogram const& dendrogram, weight_t resolution, int rank, - weight_t modularity) + weight_t mg_modularity) { auto sg_graph = std::make_unique>( handle); rmm::device_uvector d_clustering_v(0, handle.get_stream()); - weight_t sg_modularity; + weight_t sg_modularity{-1.0}; if (rank == 0) { // Create initial SG graph, renumbered according to the MNMG renumber map @@ -160,7 +163,7 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam } }); - if (rank == 0) compare(modularity, sg_modularity); + if (rank == 0) compare(mg_modularity, sg_modularity); } // Compare the results of running louvain on multiple GPUs to that of a @@ -197,9 +200,9 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam auto mg_graph_view = mg_graph.view(); std::unique_ptr> dendrogram; - weight_t modularity; + weight_t mg_modularity; - std::tie(dendrogram, modularity) = + std::tie(dendrogram, mg_modularity) = cugraph::louvain(handle, mg_graph_view, param.max_level, param.resolution); SCOPED_TRACE("compare modularity input: " + param.graph_file_full_path); @@ -213,7 +216,7 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam *dendrogram, param.resolution, comm_rank, - modularity); + mg_modularity); } }; diff --git a/cpp/tests/experimental/bfs_test.cpp b/cpp/tests/experimental/bfs_test.cpp index ad9ece99ef9..8fce9488d8a 100644 --- a/cpp/tests/experimental/bfs_test.cpp +++ b/cpp/tests/experimental/bfs_test.cpp @@ -16,9 +16,11 @@ #include #include +#include #include #include +#include #include #include @@ -28,10 +30,16 @@ #include +#include #include #include #include +// do the perf measurements +// enabled by command line parameter '--perf' +// +static int PERF = 0; + template void bfs_reference(edge_t const* offsets, vertex_t const* indices, @@ -74,9 +82,12 @@ void bfs_reference(edge_t const* offsets, typedef struct BFS_Usecase_t { cugraph::test::input_graph_specifier_t input_graph_specifier{}; - size_t source{false}; - BFS_Usecase_t(std::string const& graph_file_path, size_t source) : source(source) + size_t source{0}; + bool check_correctness{false}; + + BFS_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true) + : source(source), check_correctness(check_correctness) { std::string graph_file_full_path{}; if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -88,13 +99,43 @@ typedef struct BFS_Usecase_t { input_graph_specifier.graph_file_full_path = graph_file_full_path; }; - BFS_Usecase_t(cugraph::test::rmat_params_t rmat_params, size_t source) : source(source) + BFS_Usecase_t(cugraph::test::rmat_params_t rmat_params, + size_t source, + bool check_correctness = true) + : source(source), check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } BFS_Usecase; +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, BFS_Usecase const& configuration, bool renumber) +{ + return configuration.input_graph_specifier.tag == 
cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, false, renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + false, + renumber, + std::vector{0}, + size_t{1}); +} + class Tests_BFS : public ::testing::TestWithParam { public: Tests_BFS() {} @@ -107,58 +148,21 @@ class Tests_BFS : public ::testing::TestWithParam { template void run_current_test(BFS_Usecase const& configuration) { + constexpr bool renumber = true; + using weight_t = float; raft::handle_t handle{}; cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? cugraph::test:: - read_graph_from_matrix_market_file( - handle, configuration.input_graph_specifier.graph_file_full_path, false, false) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - false, - false); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - ASSERT_TRUE(configuration.source >= 0 && - configuration.source <= graph_view.get_number_of_vertices()) - << "Starting sources should be >= 0 and" - << " less than the number of vertices in the graph."; - - std::vector h_reference_distances(graph_view.get_number_of_vertices()); - std::vector h_reference_predecessors(graph_view.get_number_of_vertices()); - - bfs_reference(h_offsets.data(), - h_indices.data(), - h_reference_distances.data(), - h_reference_predecessors.data(), - graph_view.get_number_of_vertices(), - static_cast(configuration.source), - std::numeric_limits::max()); + ASSERT_TRUE(static_cast(configuration.source) >= 0 && + static_cast(configuration.source) < graph_view.get_number_of_vertices()) + << "Invalid starting source."; rmm::device_uvector d_distances(graph_view.get_number_of_vertices(), handle.get_stream()); @@ -169,46 +173,120 @@ class Tests_BFS : public ::testing::TestWithParam { 
cugraph::experimental::bfs(handle, graph_view, - d_distances.begin(), - d_predecessors.begin(), + d_distances.data(), + d_predecessors.data(), static_cast(configuration.source), false, - std::numeric_limits::max(), - false); + std::numeric_limits::max()); CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); - std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); - raft::update_host(h_cugraph_predecessors.data(), - d_predecessors.data(), - d_predecessors.size(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - ASSERT_TRUE(std::equal( - h_reference_distances.begin(), h_reference_distances.end(), h_cugraph_distances.begin())) - << "distances do not match with the reference values."; - - for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { - auto i = std::distance(h_cugraph_predecessors.begin(), it); - if (*it == cugraph::invalid_vertex_id::value) { - ASSERT_TRUE(h_reference_predecessors[i] == *it) - << "vertex reachability do not match with the reference."; + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto unrenumbered_source = static_cast(configuration.source); + if (renumber) { + std::vector h_renumber_map_labels(d_renumber_map_labels.size()); + raft::update_host(h_renumber_map_labels.data(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + unrenumbered_source = h_renumber_map_labels[configuration.source]; + } + + std::vector h_reference_distances(unrenumbered_graph_view.get_number_of_vertices()); + std::vector h_reference_predecessors( + unrenumbered_graph_view.get_number_of_vertices()); + + bfs_reference(h_offsets.data(), + h_indices.data(), + h_reference_distances.data(), + h_reference_predecessors.data(), + unrenumbered_graph_view.get_number_of_vertices(), + unrenumbered_source, + std::numeric_limits::max()); + + std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); + std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); + if (renumber) { + cugraph::experimental::unrenumber_local_int_vertices(handle, + d_predecessors.data(), + d_predecessors.size(), + d_renumber_map_labels.data(), + vertex_t{0}, + graph_view.get_number_of_vertices(), + true); + + auto d_unrenumbered_distances = cugraph::test::sort_by_key( + handle, d_renumber_map_labels.data(), d_distances.data(), d_renumber_map_labels.size()); + auto d_unrenumbered_predecessors = cugraph::test::sort_by_key(handle, + d_renumber_map_labels.data(), + d_predecessors.data(), + 
d_renumber_map_labels.size()); + raft::update_host(h_cugraph_distances.data(), + d_unrenumbered_distances.data(), + d_unrenumbered_distances.size(), + handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_unrenumbered_predecessors.data(), + d_unrenumbered_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); } else { - ASSERT_TRUE(h_reference_distances[*it] + 1 == h_reference_distances[i]) - << "distance to this vertex != distance to the predecessor vertex + 1."; - bool found{false}; - for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { - if (h_indices[j] == i) { - found = true; - break; + raft::update_host( + h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_predecessors.data(), + d_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + } + + ASSERT_TRUE(std::equal( + h_reference_distances.begin(), h_reference_distances.end(), h_cugraph_distances.begin())) + << "distances do not match with the reference values."; + + for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { + auto i = std::distance(h_cugraph_predecessors.begin(), it); + if (*it == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_reference_predecessors[i] == *it) + << "vertex reachability does not match with the reference."; + } else { + ASSERT_TRUE(h_reference_distances[*it] + 1 == h_reference_distances[i]) + << "distance to this vertex != distance to the predecessor vertex + 1."; + bool found{false}; + for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { + if (h_indices[j] == i) { + found = true; + break; + } } + ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; } - ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; } } } @@ -221,12 +299,17 @@ INSTANTIATE_TEST_CASE_P( simple_test, Tests_BFS, ::testing::Values( + // enable correctness checks BFS_Usecase("test/datasets/karate.mtx", 0), BFS_Usecase("test/datasets/polbooks.mtx", 0), BFS_Usecase("test/datasets/netscience.mtx", 0), BFS_Usecase("test/datasets/netscience.mtx", 100), BFS_Usecase("test/datasets/wiki2003.mtx", 1000), BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000), - BFS_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0))); + BFS_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), + // disable correctness checks for large graphs + BFS_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + 0, + false))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/coarsen_graph_test.cpp b/cpp/tests/experimental/coarsen_graph_test.cpp index 789619f2cd9..0fc0634bbbc 100644 --- a/cpp/tests/experimental/coarsen_graph_test.cpp +++ b/cpp/tests/experimental/coarsen_graph_test.cpp @@ -54,13 +54,14 @@ void check_coarsened_graph_results(edge_t* org_offsets, ASSERT_TRUE(std::count_if(org_indices, org_indices + org_offsets[num_org_vertices], [num_org_vertices](auto nbr) { - return !cugraph::test::is_valid_vertex(num_org_vertices, nbr); + return !cugraph::experimental::is_valid_vertex(num_org_vertices, nbr); }) == 0); ASSERT_TRUE(std::is_sorted(coarse_offsets, coarse_offsets + num_coarse_vertices)); ASSERT_TRUE(std::count_if(coarse_indices, coarse_indices + coarse_offsets[num_coarse_vertices], [num_coarse_vertices](auto nbr) { - return !cugraph::test::is_valid_vertex(num_coarse_vertices, 
nbr); + return !cugraph::experimental::is_valid_vertex(num_coarse_vertices, + nbr); }) == 0); ASSERT_TRUE(num_coarse_vertices <= num_org_vertices); diff --git a/cpp/tests/experimental/generate_rmat_test.cpp b/cpp/tests/experimental/generate_rmat_test.cpp index 666106d62ca..221accea4f7 100644 --- a/cpp/tests/experimental/generate_rmat_test.cpp +++ b/cpp/tests/experimental/generate_rmat_test.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -201,17 +202,19 @@ class Tests_GenerateRmat : public ::testing::TestWithParam (h_cugraph_srcs.size() == (size_t{1} << configuration.scale) * configuration.edge_factor) && (h_cugraph_dsts.size() == (size_t{1} << configuration.scale) * configuration.edge_factor)) << "Returned an invalid number of R-mat graph edges."; - ASSERT_TRUE( - std::count_if(h_cugraph_srcs.begin(), - h_cugraph_srcs.end(), - [num_vertices = static_cast(size_t{1} << configuration.scale)]( - auto v) { return !cugraph::test::is_valid_vertex(num_vertices, v); }) == 0) + ASSERT_TRUE(std::count_if(h_cugraph_srcs.begin(), + h_cugraph_srcs.end(), + [num_vertices = static_cast( + size_t{1} << configuration.scale)](auto v) { + return !cugraph::experimental::is_valid_vertex(num_vertices, v); + }) == 0) << "Returned R-mat graph edges have invalid source vertex IDs."; - ASSERT_TRUE( - std::count_if(h_cugraph_dsts.begin(), - h_cugraph_dsts.end(), - [num_vertices = static_cast(size_t{1} << configuration.scale)]( - auto v) { return !cugraph::test::is_valid_vertex(num_vertices, v); }) == 0) + ASSERT_TRUE(std::count_if(h_cugraph_dsts.begin(), + h_cugraph_dsts.end(), + [num_vertices = static_cast( + size_t{1} << configuration.scale)](auto v) { + return !cugraph::experimental::is_valid_vertex(num_vertices, v); + }) == 0) << "Returned R-mat graph edges have invalid destination vertex IDs."; if (!scramble) { diff --git a/cpp/tests/experimental/graph_test.cpp b/cpp/tests/experimental/graph_test.cpp index 949f6d2e08e..6ce32e0c836 100644 --- a/cpp/tests/experimental/graph_test.cpp +++ b/cpp/tests/experimental/graph_test.cpp @@ -139,7 +139,7 @@ class Tests_Graph : public ::testing::TestWithParam { handle, edgelist, number_of_vertices, - cugraph::experimental::graph_properties_t{is_symmetric, false}, + cugraph::experimental::graph_properties_t{is_symmetric, false, configuration.test_weighted}, false, true); diff --git a/cpp/tests/experimental/katz_centrality_test.cpp b/cpp/tests/experimental/katz_centrality_test.cpp index 776bb60716c..71011f3d018 100644 --- a/cpp/tests/experimental/katz_centrality_test.cpp +++ b/cpp/tests/experimental/katz_centrality_test.cpp @@ -16,9 +16,11 @@ #include #include +#include #include #include +#include #include #include @@ -34,6 +36,11 @@ #include #include +// do the perf measurements +// enabled by command line parameter '--perf' +// +static int PERF = 0; + template void katz_centrality_reference(edge_t const* offsets, vertex_t const* indices, @@ -92,9 +99,12 @@ typedef struct KatzCentrality_Usecase_t { cugraph::test::input_graph_specifier_t input_graph_specifier{}; bool test_weighted{false}; + bool check_correctness{false}; - KatzCentrality_Usecase_t(std::string const& graph_file_path, bool test_weighted) - : test_weighted(test_weighted) + KatzCentrality_Usecase_t(std::string const& graph_file_path, + bool test_weighted, + bool check_correctness = true) + : test_weighted(test_weighted), check_correctness(check_correctness) { std::string graph_file_full_path{}; if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -107,15 +117,45 @@ typedef 
struct KatzCentrality_Usecase_t { }; KatzCentrality_Usecase_t(cugraph::test::rmat_params_t rmat_params, - double personalization_ratio, - bool test_weighted) - : test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : test_weighted(test_weighted), check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } KatzCentrality_Usecase; +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, KatzCentrality_Usecase const& configuration, bool renumber) +{ + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber, + std::vector{0}, + size_t{1}); +} + class Tests_KatzCentrality : public ::testing::TestWithParam { public: Tests_KatzCentrality() {} @@ -128,76 +168,26 @@ class Tests_KatzCentrality : public ::testing::TestWithParam void run_current_test(KatzCentrality_Usecase const& configuration) { + constexpr bool renumber = true; + raft::handle_t handle{}; cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? 
cugraph::test:: - read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - false) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - false); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - std::vector h_weights{}; - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - if (graph_view.is_weighted()) { - h_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0}); - raft::update_host(h_weights.data(), - graph_view.weights(), - graph_view.get_number_of_edges(), - handle.get_stream()); - } - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - std::vector h_reference_katz_centralities(graph_view.get_number_of_vertices()); - - std::vector tmps(h_offsets.size()); - std::adjacent_difference(h_offsets.begin(), h_offsets.end(), tmps.begin()); - auto max_it = std::max_element(tmps.begin(), tmps.end()); + auto degrees = graph_view.compute_in_degrees(handle); + std::vector h_degrees(degrees.size()); + raft::update_host(h_degrees.data(), degrees.data(), degrees.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + auto max_it = std::max_element(h_degrees.begin(), h_degrees.end()); result_t const alpha = result_t{1.0} / static_cast(*max_it + 1); result_t constexpr beta{1.0}; result_t constexpr epsilon{1e-6}; - katz_centrality_reference( - h_offsets.data(), - h_indices.data(), - h_weights.size() > 0 ? 
h_weights.data() : static_cast(nullptr), - static_cast(nullptr), - h_reference_katz_centralities.data(), - graph_view.get_number_of_vertices(), - alpha, - beta, - epsilon, - std::numeric_limits::max(), - false, - true); - rmm::device_uvector d_katz_centralities(graph_view.get_number_of_vertices(), handle.get_stream()); @@ -206,39 +196,98 @@ class Tests_KatzCentrality : public ::testing::TestWithParam(nullptr), - d_katz_centralities.begin(), + d_katz_centralities.data(), alpha, beta, epsilon, std::numeric_limits::max(), false, - true, - false); + true); CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - std::vector h_cugraph_katz_centralities(graph_view.get_number_of_vertices()); - - raft::update_host(h_cugraph_katz_centralities.data(), - d_katz_centralities.data(), - d_katz_centralities.size(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - auto threshold_ratio = 1e-3; - auto threshold_magnitude = - (1.0 / static_cast(graph_view.get_number_of_vertices())) * - threshold_ratio; // skip comparison for low Katz Centrality verties (lowly ranked vertices) - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - return std::abs(lhs - rhs) < - std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); - }; - - ASSERT_TRUE(std::equal(h_reference_katz_centralities.begin(), - h_reference_katz_centralities.end(), - h_cugraph_katz_centralities.begin(), - nearly_equal)) - << "Katz centrality values do not match with the reference values."; + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + std::vector h_weights{}; + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + if (unrenumbered_graph_view.is_weighted()) { + h_weights.assign(unrenumbered_graph_view.get_number_of_edges(), weight_t{0.0}); + raft::update_host(h_weights.data(), + unrenumbered_graph_view.weights(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + } + + handle.get_stream_view().synchronize(); + + std::vector h_reference_katz_centralities( + unrenumbered_graph_view.get_number_of_vertices()); + + katz_centrality_reference( + h_offsets.data(), + h_indices.data(), + h_weights.size() > 0 ? 
h_weights.data() : static_cast(nullptr), + static_cast(nullptr), + h_reference_katz_centralities.data(), + unrenumbered_graph_view.get_number_of_vertices(), + alpha, + beta, + epsilon, + std::numeric_limits::max(), + false, + true); + + std::vector h_cugraph_katz_centralities(graph_view.get_number_of_vertices()); + if (renumber) { + auto d_unrenumbered_katz_centralities = + cugraph::test::sort_by_key(handle, + d_renumber_map_labels.data(), + d_katz_centralities.data(), + d_renumber_map_labels.size()); + raft::update_host(h_cugraph_katz_centralities.data(), + d_unrenumbered_katz_centralities.data(), + d_unrenumbered_katz_centralities.size(), + handle.get_stream()); + } else { + raft::update_host(h_cugraph_katz_centralities.data(), + d_katz_centralities.data(), + d_katz_centralities.size(), + handle.get_stream()); + } + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for low Katz Centrality vertices (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + ASSERT_TRUE(std::equal(h_reference_katz_centralities.begin(), + h_reference_katz_centralities.end(), + h_cugraph_katz_centralities.begin(), + nearly_equal)) + << "Katz centrality values do not match with the reference values."; + } } }; @@ -252,6 +301,7 @@ INSTANTIATE_TEST_CASE_P( simple_test, Tests_KatzCentrality, ::testing::Values( + // enable correctness checks KatzCentrality_Usecase("test/datasets/karate.mtx", false), KatzCentrality_Usecase("test/datasets/karate.mtx", true), KatzCentrality_Usecase("test/datasets/web-Google.mtx", false), @@ -261,16 +311,15 @@ INSTANTIATE_TEST_CASE_P( KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", false), KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", true), KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, - 0.0, - false), - KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, - 0.5, false), KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, - 0.0, true), - KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, - 0.5, - true))); + // disable correctness checks for large graphs + KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + false, + false), + KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + true, + false))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/mg_bfs_test.cpp b/cpp/tests/experimental/mg_bfs_test.cpp new file mode 100644 index 00000000000..76ccb5d9de3 --- /dev/null +++ b/cpp/tests/experimental/mg_bfs_test.cpp @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +typedef struct BFS_Usecase_t { + cugraph::test::input_graph_specifier_t input_graph_specifier{}; + + size_t source{0}; + bool check_correctness{false}; + + BFS_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true) + : source(source), check_correctness(check_correctness) + { + std::string graph_file_full_path{}; + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH; + input_graph_specifier.graph_file_full_path = graph_file_full_path; + }; + + BFS_Usecase_t(cugraph::test::rmat_params_t rmat_params, + size_t source, + bool check_correctness = true) + : source(source), check_correctness(check_correctness) + { + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; + input_graph_specifier.rmat_params = rmat_params; + } +} BFS_Usecase; + +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, BFS_Usecase const& configuration, bool renumber) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size)); + std::iota(partition_ids.begin(), + partition_ids.end(), + multi_gpu ? static_cast(comm_rank) : size_t{0}); + + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, false, renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + false, + renumber, + partition_ids, + static_cast(comm_size)); +} + +class Tests_MGBFS : public ::testing::TestWithParam { + public: + Tests_MGBFS() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of running BFS on multiple GPUs to that of a single-GPU run + template + void run_current_test(BFS_Usecase const& configuration) + { + using weight_t = float; + + // 1. initialize handle + + raft::handle_t handle{}; + + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + auto row_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % row_comm_size != 0) { --row_comm_size; } + cugraph::partition_2d::subcomm_factory_t + subcomm_factory(handle, row_comm_size); + + // 2. 
create MG graph + + cugraph::experimental::graph_t mg_graph(handle); + rmm::device_uvector d_mg_renumber_map_labels(0, handle.get_stream()); + std::tie(mg_graph, d_mg_renumber_map_labels) = + read_graph(handle, configuration, true); + + auto mg_graph_view = mg_graph.view(); + + ASSERT_TRUE(static_cast(configuration.source) >= 0 && + static_cast(configuration.source) < + mg_graph_view.get_number_of_vertices()) + << "Invalid starting source."; + + // 3. run MG BFS + + rmm::device_uvector d_mg_distances(mg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + rmm::device_uvector d_mg_predecessors(mg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + cugraph::experimental::bfs(handle, + mg_graph_view, + d_mg_distances.data(), + d_mg_predecessors.data(), + static_cast(configuration.source), + false, + std::numeric_limits::max(), + true); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + // 5. compare SG & MG results + + if (configuration.check_correctness) { + // 5-1. create SG graph + + cugraph::experimental::graph_t sg_graph(handle); + std::tie(sg_graph, std::ignore) = + read_graph(handle, configuration, false); + + auto sg_graph_view = sg_graph.view(); + + std::vector vertex_partition_lasts(comm_size); + for (size_t i = 0; i < vertex_partition_lasts.size(); ++i) { + vertex_partition_lasts[i] = mg_graph_view.get_vertex_partition_last(i); + } + + rmm::device_scalar d_source(static_cast(configuration.source), + handle.get_stream()); + cugraph::experimental::unrenumber_int_vertices( + handle, + d_source.data(), + size_t{1}, + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + true); + auto unrenumbered_source = d_source.value(handle.get_stream()); + + // 5-2. run SG BFS + + rmm::device_uvector d_sg_distances(sg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + rmm::device_uvector d_sg_predecessors(sg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + + cugraph::experimental::bfs(handle, + sg_graph_view, + d_sg_distances.data(), + d_sg_predecessors.data(), + unrenumbered_source, + false, + std::numeric_limits::max(), + true); + + // 5-3. 
compare + + std::vector h_sg_offsets(sg_graph_view.get_number_of_vertices() + 1); + std::vector h_sg_indices(sg_graph_view.get_number_of_edges()); + raft::update_host(h_sg_offsets.data(), + sg_graph_view.offsets(), + sg_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_sg_indices.data(), + sg_graph_view.indices(), + sg_graph_view.get_number_of_edges(), + handle.get_stream()); + + std::vector h_sg_distances(sg_graph_view.get_number_of_vertices()); + std::vector h_sg_predecessors(sg_graph_view.get_number_of_vertices()); + raft::update_host( + h_sg_distances.data(), d_sg_distances.data(), d_sg_distances.size(), handle.get_stream()); + raft::update_host(h_sg_predecessors.data(), + d_sg_predecessors.data(), + d_sg_predecessors.size(), + handle.get_stream()); + + std::vector h_mg_distances(mg_graph_view.get_number_of_local_vertices()); + std::vector h_mg_predecessors(mg_graph_view.get_number_of_local_vertices()); + raft::update_host( + h_mg_distances.data(), d_mg_distances.data(), d_mg_distances.size(), handle.get_stream()); + cugraph::experimental::unrenumber_int_vertices( + handle, + d_mg_predecessors.data(), + d_mg_predecessors.size(), + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + true); + raft::update_host(h_mg_predecessors.data(), + d_mg_predecessors.data(), + d_mg_predecessors.size(), + handle.get_stream()); + + std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size()); + raft::update_host(h_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { + auto mapped_vertex = h_mg_renumber_map_labels[i]; + ASSERT_TRUE(h_mg_distances[i] == h_sg_distances[mapped_vertex]) + << "MG BFS distance for vertex: " << mapped_vertex << " in rank: " << comm_rank + << " has value: " << h_mg_distances[i] + << " different from the corresponding SG value: " << h_sg_distances[mapped_vertex]; + if (h_mg_predecessors[i] == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_sg_predecessors[mapped_vertex] == h_mg_predecessors[i]) + << "vertex reachability does not match with the SG result."; + } else { + ASSERT_TRUE(h_sg_distances[h_mg_predecessors[i]] + 1 == h_sg_distances[mapped_vertex]) + << "distance to this vertex != distance to the predecessor vertex + 1."; + bool found{false}; + for (auto j = h_sg_offsets[h_mg_predecessors[i]]; + j < h_sg_offsets[h_mg_predecessors[i] + 1]; + ++j) { + if (h_sg_indices[j] == mapped_vertex) { + found = true; + break; + } + } + ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; + } + } + } + } +}; + +TEST_P(Tests_MGBFS, CheckInt32Int32) { run_current_test(GetParam()); } + +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_MGBFS, + ::testing::Values( + // enable correctness checks + BFS_Usecase("test/datasets/karate.mtx", 0), + BFS_Usecase("test/datasets/web-Google.mtx", 0), + BFS_Usecase("test/datasets/ljournal-2008.mtx", 0), + BFS_Usecase("test/datasets/webbase-1M.mtx", 0), + BFS_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), + // disable correctness checks for large graphs + BFS_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + 0, + false))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/mg_katz_centrality_test.cpp 
b/cpp/tests/experimental/mg_katz_centrality_test.cpp new file mode 100644 index 00000000000..e3033af3771 --- /dev/null +++ b/cpp/tests/experimental/mg_katz_centrality_test.cpp @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +typedef struct KatzCentrality_Usecase_t { + cugraph::test::input_graph_specifier_t input_graph_specifier{}; + + bool test_weighted{false}; + bool check_correctness{false}; + + KatzCentrality_Usecase_t(std::string const& graph_file_path, + bool test_weighted, + bool check_correctness = true) + : test_weighted(test_weighted), check_correctness(check_correctness) + { + std::string graph_file_full_path{}; + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH; + input_graph_specifier.graph_file_full_path = graph_file_full_path; + }; + + KatzCentrality_Usecase_t(cugraph::test::rmat_params_t rmat_params, + bool test_weighted, + bool check_correctness = true) + : test_weighted(test_weighted), check_correctness(check_correctness) + { + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; + input_graph_specifier.rmat_params = rmat_params; + } +} KatzCentrality_Usecase; + +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, KatzCentrality_Usecase const& configuration, bool renumber) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size)); + std::iota(partition_ids.begin(), + partition_ids.end(), + multi_gpu ? static_cast(comm_rank) : size_t{0}); + + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? 
cugraph::test:: + read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber, + partition_ids, + static_cast(comm_size)); +} + +class Tests_MGKatzCentrality : public ::testing::TestWithParam { + public: + Tests_MGKatzCentrality() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of running Katz Centrality on multiple GPUs to that of a single-GPU run + template + void run_current_test(KatzCentrality_Usecase const& configuration) + { + // 1. initialize handle + + raft::handle_t handle{}; + + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + auto row_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % row_comm_size != 0) { --row_comm_size; } + cugraph::partition_2d::subcomm_factory_t + subcomm_factory(handle, row_comm_size); + + // 2. create MG graph + + cugraph::experimental::graph_t mg_graph(handle); + rmm::device_uvector d_mg_renumber_map_labels(0, handle.get_stream()); + std::tie(mg_graph, d_mg_renumber_map_labels) = + read_graph(handle, configuration, true); + + auto mg_graph_view = mg_graph.view(); + + // 3. compute max in-degree + + auto max_in_degree = mg_graph_view.compute_max_in_degree(handle); + + // 4. run MG Katz Centrality + + result_t const alpha = result_t{1.0} / static_cast(max_in_degree + 1); + result_t constexpr beta{1.0}; + result_t constexpr epsilon{1e-6}; + + rmm::device_uvector d_mg_katz_centralities( + mg_graph_view.get_number_of_local_vertices(), handle.get_stream()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + cugraph::experimental::katz_centrality(handle, + mg_graph_view, + static_cast(nullptr), + d_mg_katz_centralities.data(), + alpha, + beta, + epsilon, + std::numeric_limits::max(), + false, + true); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + // 5. compare SG & MG results + + if (configuration.check_correctness) { + // 5-1. create SG graph + + cugraph::experimental::graph_t sg_graph(handle); + std::tie(sg_graph, std::ignore) = + read_graph(handle, configuration, false); + + auto sg_graph_view = sg_graph.view(); + + // 5-2. run SG Katz Centrality + + rmm::device_uvector d_sg_katz_centralities(sg_graph_view.get_number_of_vertices(), + handle.get_stream()); + + cugraph::experimental::katz_centrality(handle, + sg_graph_view, + static_cast(nullptr), + d_sg_katz_centralities.data(), + alpha, + beta, + epsilon, + std::numeric_limits::max(), // max_iterations + false, + true); + + // 5-3. 
+ + // 5-3. compare + + std::vector h_sg_katz_centralities(sg_graph_view.get_number_of_vertices()); + raft::update_host(h_sg_katz_centralities.data(), + d_sg_katz_centralities.data(), + d_sg_katz_centralities.size(), + handle.get_stream()); + + std::vector h_mg_katz_centralities(mg_graph_view.get_number_of_local_vertices()); + raft::update_host(h_mg_katz_centralities.data(), + d_mg_katz_centralities.data(), + d_mg_katz_centralities.size(), + handle.get_stream()); + + std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size()); + raft::update_host(h_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for vertices with low Katz Centrality values (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { + auto mapped_vertex = h_mg_renumber_map_labels[i]; + ASSERT_TRUE(nearly_equal(h_mg_katz_centralities[i], h_sg_katz_centralities[mapped_vertex])) + << "MG KatzCentrality value for vertex: " << mapped_vertex << " in rank: " << comm_rank + << " has value: " << h_mg_katz_centralities[i] + << " which exceeds the error margin for comparing to SG value: " + << h_sg_katz_centralities[mapped_vertex]; + } + } + } +}; + +TEST_P(Tests_MGKatzCentrality, CheckInt32Int32FloatFloat) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_MGKatzCentrality, + ::testing::Values( + // enable correctness checks + KatzCentrality_Usecase("test/datasets/karate.mtx", false), + KatzCentrality_Usecase("test/datasets/karate.mtx", true), + KatzCentrality_Usecase("test/datasets/web-Google.mtx", false), + KatzCentrality_Usecase("test/datasets/web-Google.mtx", true), + KatzCentrality_Usecase("test/datasets/ljournal-2008.mtx", false), + KatzCentrality_Usecase("test/datasets/ljournal-2008.mtx", true), + KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", false), + KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", true), + KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, + false), + KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, + true), + // disable correctness checks for large graphs + KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + false, + false), + KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + true, + false))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/mg_sssp_test.cpp b/cpp/tests/experimental/mg_sssp_test.cpp new file mode 100644 index 00000000000..48e4dc869f4 --- /dev/null +++ b/cpp/tests/experimental/mg_sssp_test.cpp @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +typedef struct SSSP_Usecase_t { + cugraph::test::input_graph_specifier_t input_graph_specifier{}; + + size_t source{0}; + bool check_correctness{false}; + + SSSP_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true) + : source(source), check_correctness(check_correctness) + { + std::string graph_file_full_path{}; + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH; + input_graph_specifier.graph_file_full_path = graph_file_full_path; + }; + + SSSP_Usecase_t(cugraph::test::rmat_params_t rmat_params, + size_t source, + bool check_correctness = true) + : source(source), check_correctness(check_correctness) + { + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; + input_graph_specifier.rmat_params = rmat_params; + } +} SSSP_Usecase; + +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, SSSP_Usecase const& configuration, bool renumber) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size)); + std::iota(partition_ids.begin(), + partition_ids.end(), + multi_gpu ? static_cast(comm_rank) : size_t{0}); + + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, true, renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + true, + renumber, + partition_ids, + static_cast(comm_size)); +} + +class Tests_MGSSSP : public ::testing::TestWithParam { + public: + Tests_MGSSSP() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of running SSSP on multiple GPUs to that of a single-GPU run + template + void run_current_test(SSSP_Usecase const& configuration) + { + // 1. 
initialize handle + + raft::handle_t handle{}; + + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + auto row_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % row_comm_size != 0) { --row_comm_size; } + cugraph::partition_2d::subcomm_factory_t + subcomm_factory(handle, row_comm_size); + + // 2. create MG graph + + cugraph::experimental::graph_t mg_graph(handle); + rmm::device_uvector d_mg_renumber_map_labels(0, handle.get_stream()); + std::tie(mg_graph, d_mg_renumber_map_labels) = + read_graph(handle, configuration, true); + + auto mg_graph_view = mg_graph.view(); + + ASSERT_TRUE(static_cast(configuration.source) >= 0 && + static_cast(configuration.source) < + mg_graph_view.get_number_of_vertices()) + << "Invalid starting source."; + + // 3. run MG SSSP + + rmm::device_uvector d_mg_distances(mg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + rmm::device_uvector d_mg_predecessors(mg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + // FIXME: disable do_expensive_check + cugraph::experimental::sssp(handle, + mg_graph_view, + d_mg_distances.data(), + d_mg_predecessors.data(), + static_cast(configuration.source), + std::numeric_limits::max(), + true); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + // 4. compare SG & MG results + + if (configuration.check_correctness) { + // 4-1. create SG graph + + cugraph::experimental::graph_t sg_graph(handle); + std::tie(sg_graph, std::ignore) = + read_graph(handle, configuration, false); + + auto sg_graph_view = sg_graph.view(); + + std::vector vertex_partition_lasts(comm_size); + for (size_t i = 0; i < vertex_partition_lasts.size(); ++i) { + vertex_partition_lasts[i] = mg_graph_view.get_vertex_partition_last(i); + } + + rmm::device_scalar d_source(static_cast(configuration.source), + handle.get_stream()); + cugraph::experimental::unrenumber_int_vertices( + handle, + d_source.data(), + size_t{1}, + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + true); + auto unrenumbered_source = d_source.value(handle.get_stream()); + + // 4-2. run SG SSSP + + rmm::device_uvector d_sg_distances(sg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + rmm::device_uvector d_sg_predecessors(sg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + + // FIXME: disable do_expensive_check + cugraph::experimental::sssp(handle, + sg_graph_view, + d_sg_distances.data(), + d_sg_predecessors.data(), + unrenumbered_source, + std::numeric_limits::max(), + true);
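Both the unrenumbering of the source above and the per-vertex comparison below rely on the same mapping: entry i of the renumber map holds the original vertex ID that renumbering assigned to internal vertex local_vertex_first + i. A host-side sketch of the translation that unrenumber_int_vertices performs on device (a simplified, single-partition view; names are illustrative):

#include <cassert>
#include <vector>

// Gather original IDs for a batch of renumbered vertices: internal ID v maps to
// renumber_map_labels[v - local_vertex_first].
template <typename vertex_t>
void unrenumber_sketch(std::vector<vertex_t>& vertices,  // renumbered IDs, rewritten in place
                       std::vector<vertex_t> const& renumber_map_labels,
                       vertex_t local_vertex_first)
{
  for (auto& v : vertices) {
    assert(v >= local_vertex_first &&
           v < local_vertex_first + static_cast<vertex_t>(renumber_map_labels.size()));
    v = renumber_map_labels[v - local_vertex_first];
  }
}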
+ + // 4-3. compare + + std::vector h_sg_offsets(sg_graph_view.get_number_of_vertices() + 1); + std::vector h_sg_indices(sg_graph_view.get_number_of_edges()); + std::vector h_sg_weights(sg_graph_view.get_number_of_edges()); + raft::update_host(h_sg_offsets.data(), + sg_graph_view.offsets(), + sg_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_sg_indices.data(), + sg_graph_view.indices(), + sg_graph_view.get_number_of_edges(), + handle.get_stream()); + raft::update_host(h_sg_weights.data(), + sg_graph_view.weights(), + sg_graph_view.get_number_of_edges(), + handle.get_stream()); + + std::vector h_sg_distances(sg_graph_view.get_number_of_vertices()); + std::vector h_sg_predecessors(sg_graph_view.get_number_of_vertices()); + raft::update_host( + h_sg_distances.data(), d_sg_distances.data(), d_sg_distances.size(), handle.get_stream()); + raft::update_host(h_sg_predecessors.data(), + d_sg_predecessors.data(), + d_sg_predecessors.size(), + handle.get_stream()); + + std::vector h_mg_distances(mg_graph_view.get_number_of_local_vertices()); + std::vector h_mg_predecessors(mg_graph_view.get_number_of_local_vertices()); + raft::update_host( + h_mg_distances.data(), d_mg_distances.data(), d_mg_distances.size(), handle.get_stream()); + cugraph::experimental::unrenumber_int_vertices( + handle, + d_mg_predecessors.data(), + d_mg_predecessors.size(), + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + true); + raft::update_host(h_mg_predecessors.data(), + d_mg_predecessors.data(), + d_mg_predecessors.size(), + handle.get_stream()); + + std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size()); + raft::update_host(h_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto max_weight_element = std::max_element(h_sg_weights.begin(), h_sg_weights.end()); + auto epsilon = *max_weight_element * weight_t{1e-6}; + auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; + + for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { + auto mapped_vertex = h_mg_renumber_map_labels[i]; + ASSERT_TRUE(nearly_equal(h_mg_distances[i], h_sg_distances[mapped_vertex])) + << "MG SSSP distance for vertex: " << mapped_vertex << " in rank: " << comm_rank + << " has value: " << h_mg_distances[i] + << " different from the corresponding SG value: " << h_sg_distances[mapped_vertex]; + if (h_mg_predecessors[i] == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_sg_predecessors[mapped_vertex] == h_mg_predecessors[i]) + << "vertex reachability does not match with the SG result."; + } else { + auto pred_distance = h_sg_distances[h_mg_predecessors[i]]; + bool found{false}; + for (auto j = h_sg_offsets[h_mg_predecessors[i]]; + j < h_sg_offsets[h_mg_predecessors[i] + 1]; + ++j) { + if (h_sg_indices[j] == mapped_vertex) { + if (nearly_equal(pred_distance + h_sg_weights[j], h_sg_distances[mapped_vertex])) { + found = true; + break; + } + } + } + ASSERT_TRUE(found) + << "no edge from the predecessor vertex to this vertex with the matching weight."; + } + } + } + } +}; + +TEST_P(Tests_MGSSSP, CheckInt32Int32Float) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_MGSSSP, + ::testing::Values( + // enable correctness checks + SSSP_Usecase("test/datasets/karate.mtx", 0),
SSSP_Usecase("test/datasets/dblp.mtx", 0), + SSSP_Usecase("test/datasets/wiki2003.mtx", 1000), + SSSP_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), + // disable correctness checks for large graphs + SSSP_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + 0, + false))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN()
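The pagerank_test.cpp diff below keeps a host-side pagerank_reference for correctness checks. As a rough illustration of what such a reference computes, here is a minimal power-iteration sketch with damping factor alpha and an L1 convergence threshold epsilon; it ignores personalization and dangling-vertex handling, and all names are illustrative rather than the test's actual signature:

#include <algorithm>
#include <cmath>
#include <vector>

std::vector<double> pagerank_sketch(std::vector<int> const& offsets,  // CSR, size V + 1
                                    std::vector<int> const& indices,  // out-neighbors
                                    double alpha,
                                    double epsilon,
                                    int max_iterations)
{
  auto num_vertices = static_cast<int>(offsets.size()) - 1;
  std::vector<double> ranks(num_vertices, 1.0 / num_vertices);
  std::vector<double> next(num_vertices, 0.0);
  for (int iter = 0; iter < max_iterations; ++iter) {
    std::fill(next.begin(), next.end(), (1.0 - alpha) / num_vertices);  // teleport term
    for (int u = 0; u < num_vertices; ++u) {
      auto out_degree = offsets[u + 1] - offsets[u];
      if (out_degree == 0) { continue; }  // dangling mass ignored in this sketch
      auto contribution = alpha * ranks[u] / out_degree;
      for (auto j = offsets[u]; j < offsets[u + 1]; ++j) { next[indices[j]] += contribution; }
    }
    double diff_sum{0.0};
    for (int v = 0; v < num_vertices; ++v) { diff_sum += std::fabs(next[v] - ranks[v]); }
    ranks.swap(next);
    if (diff_sum < epsilon) { break; }  // converged
  }
  return ranks;
}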
diff --git a/cpp/tests/experimental/pagerank_test.cpp b/cpp/tests/experimental/pagerank_test.cpp index ff3b073cbc7..649fe11d805 100644 --- a/cpp/tests/experimental/pagerank_test.cpp +++ b/cpp/tests/experimental/pagerank_test.cpp @@ -16,9 +16,11 @@ #include #include +#include #include #include +#include #include #include @@ -35,6 +37,11 @@ #include #include +// do the perf measurements +// enabled by command line parameter '--perf' +// +static int PERF = 0; + template void pagerank_reference(edge_t const* offsets, vertex_t const* indices, @@ -128,11 +135,15 @@ typedef struct PageRank_Usecase_t { double personalization_ratio{0.0}; bool test_weighted{false}; + bool check_correctness{false}; PageRank_Usecase_t(std::string const& graph_file_path, double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { std::string graph_file_full_path{}; if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -146,14 +157,47 @@ typedef struct PageRank_Usecase_t { PageRank_Usecase_t(cugraph::test::rmat_params_t rmat_params, double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } PageRank_Usecase; +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, PageRank_Usecase const& configuration, bool renumber) +{ + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber, + std::vector{0}, + size_t{1}); +} + class Tests_PageRank : public ::testing::TestWithParam { public: Tests_PageRank() {} @@ -166,52 +210,16 @@ class Tests_PageRank : public ::testing::TestWithParam { template void run_current_test(PageRank_Usecase const& configuration) { + constexpr bool renumber = true; + raft::handle_t handle{}; cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? cugraph::test:: - read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - false) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - false); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - std::vector h_weights{}; - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - if (graph_view.is_weighted()) { - h_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0}); - raft::update_host(h_weights.data(), - graph_view.weights(), - graph_view.get_number_of_edges(), - handle.get_stream()); - } - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - std::vector h_personalization_vertices{}; std::vector h_personalization_values{}; if (configuration.personalization_ratio > 0.0) { @@ -260,21 +268,6 @@ result_t constexpr alpha{0.85}; result_t constexpr epsilon{1e-6}; - std::vector h_reference_pageranks(graph_view.get_number_of_vertices()); - - pagerank_reference(h_offsets.data(), - h_indices.data(), - h_weights.size() > 0 ?
h_weights.data() : static_cast(nullptr), - h_personalization_vertices.data(), - h_personalization_values.data(), - h_reference_pageranks.data(), - graph_view.get_number_of_vertices(), - static_cast(h_personalization_vertices.size()), - alpha, - epsilon, - std::numeric_limits::max(), - false); - rmm::device_uvector d_pageranks(graph_view.get_number_of_vertices(), handle.get_stream()); @@ -286,7 +279,7 @@ class Tests_PageRank : public ::testing::TestWithParam { d_personalization_vertices.data(), d_personalization_values.data(), static_cast(d_personalization_vertices.size()), - d_pageranks.begin(), + d_pageranks.data(), alpha, epsilon, std::numeric_limits::max(), @@ -295,26 +288,129 @@ class Tests_PageRank : public ::testing::TestWithParam { CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - std::vector h_cugraph_pageranks(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_pageranks.data(), d_pageranks.data(), d_pageranks.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - auto threshold_ratio = 1e-3; - auto threshold_magnitude = - (1.0 / static_cast(graph_view.get_number_of_vertices())) * - threshold_ratio; // skip comparison for low PageRank verties (lowly ranked vertices) - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - return std::abs(lhs - rhs) < - std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); - }; - - ASSERT_TRUE(std::equal(h_reference_pageranks.begin(), - h_reference_pageranks.end(), - h_cugraph_pageranks.begin(), - nearly_equal)) - << "PageRank values do not match with the reference values."; + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? 
unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + std::vector h_weights{}; + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + if (unrenumbered_graph_view.is_weighted()) { + h_weights.assign(unrenumbered_graph_view.get_number_of_edges(), weight_t{0.0}); + raft::update_host(h_weights.data(), + unrenumbered_graph_view.weights(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + } + + std::vector h_unrenumbered_personalization_vertices( + d_personalization_vertices.size()); + std::vector h_unrenumbered_personalization_values( + h_unrenumbered_personalization_vertices.size()); + if (renumber) { + rmm::device_uvector d_unrenumbered_personalization_vertices( + d_personalization_vertices.size(), handle.get_stream()); + rmm::device_uvector d_unrenumbered_personalization_values( + d_unrenumbered_personalization_vertices.size(), handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_vertices.data(), + d_personalization_vertices.data(), + d_personalization_vertices.size(), + handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_values.data(), + d_personalization_values.data(), + d_personalization_values.size(), + handle.get_stream()); + cugraph::experimental::unrenumber_local_int_vertices( + handle, + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.size(), + d_renumber_map_labels.data(), + vertex_t{0}, + graph_view.get_number_of_vertices()); + cugraph::test::sort_by_key(handle, + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_vertices.size()); + + raft::update_host(h_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.size(), + handle.get_stream()); + raft::update_host(h_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_values.size(), + handle.get_stream()); + } else { + raft::update_host(h_unrenumbered_personalization_vertices.data(), + d_personalization_vertices.data(), + d_personalization_vertices.size(), + handle.get_stream()); + raft::update_host(h_unrenumbered_personalization_values.data(), + d_personalization_values.data(), + d_personalization_values.size(), + handle.get_stream()); + } + + handle.get_stream_view().synchronize(); + + std::vector h_reference_pageranks(unrenumbered_graph_view.get_number_of_vertices()); + + pagerank_reference(h_offsets.data(), + h_indices.data(), + h_weights.size() > 0 ? 
h_weights.data() : static_cast(nullptr), + h_unrenumbered_personalization_vertices.data(), + h_unrenumbered_personalization_values.data(), + h_reference_pageranks.data(), + unrenumbered_graph_view.get_number_of_vertices(), + static_cast(h_personalization_vertices.size()), + alpha, + epsilon, + std::numeric_limits::max(), + false); + + std::vector h_cugraph_pageranks(graph_view.get_number_of_vertices()); + if (renumber) { + auto d_unrenumbered_pageranks = cugraph::test::sort_by_key( + handle, d_renumber_map_labels.data(), d_pageranks.data(), d_renumber_map_labels.size()); + raft::update_host(h_cugraph_pageranks.data(), + d_unrenumbered_pageranks.data(), + d_unrenumbered_pageranks.size(), + handle.get_stream()); + } else { + raft::update_host( + h_cugraph_pageranks.data(), d_pageranks.data(), d_pageranks.size(), handle.get_stream()); + } + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for vertices with low PageRank values (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + ASSERT_TRUE(std::equal(h_reference_pageranks.begin(), + h_reference_pageranks.end(), + h_cugraph_pageranks.begin(), + nearly_equal)) + << "PageRank values do not match with the reference values."; + } } }; @@ -328,6 +424,7 @@ INSTANTIATE_TEST_CASE_P( simple_test, Tests_PageRank, ::testing::Values( + // enable correctness checks PageRank_Usecase("test/datasets/karate.mtx", 0.0, false), PageRank_Usecase("test/datasets/karate.mtx", 0.5, false), PageRank_Usecase("test/datasets/karate.mtx", 0.0, true), @@ -355,6 +452,15 @@ INSTANTIATE_TEST_CASE_P( true), PageRank_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0.5, - true))); + true), + // disable correctness checks for large graphs + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, false, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, false, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, true, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, true, false))); CUGRAPH_TEST_PROGRAM_MAIN()
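The sort_by_key test helper used above puts renumbered per-vertex results back into original vertex order: sorting the renumber-map labels carries the paired values along with them. A host-side sketch of the same reordering (illustrative names; the helper itself operates on device memory):

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Reorder `values` to follow ascending `keys`; here keys[i] is the original vertex ID of
// renumbered vertex i, so the result is indexed in original-vertex-ID order.
template <typename key_t, typename value_t>
std::vector<value_t> sort_values_by_key(std::vector<key_t> const& keys,
                                        std::vector<value_t> const& values)
{
  std::vector<std::size_t> order(keys.size());
  std::iota(order.begin(), order.end(), std::size_t{0});
  std::sort(order.begin(), order.end(), [&keys](auto l, auto r) { return keys[l] < keys[r]; });
  std::vector<value_t> sorted(values.size());
  for (std::size_t i = 0; i < order.size(); ++i) { sorted[i] = values[order[i]]; }
  return sorted;
}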
diff --git a/cpp/tests/experimental/rw_low_level_test.cu b/cpp/tests/experimental/rw_low_level_test.cu index a32e258d366..8b562bc41f6 100644 --- a/cpp/tests/experimental/rw_low_level_test.cu +++ b/cpp/tests/experimental/rw_low_level_test.cu @@ -53,7 +53,8 @@ graph_t make_graph(raft::handle_t cons std::vector const& v_dst, std::vector const& v_w, vertex_t num_vertices, - edge_t num_edges) + edge_t num_edges, + bool is_weighted) { vector_test_t d_src(num_edges, handle.get_stream()); vector_test_t d_dst(num_edges, handle.get_stream()); @@ -67,7 +68,7 @@ graph_t make_graph(raft::handle_t cons d_src.data(), d_dst.data(), d_weights.data(), num_edges}; graph_t graph( - handle, edgelist, num_vertices, graph_properties_t{}, false); + handle, edgelist, num_vertices, graph_properties_t{false, false, is_weighted}, false); return graph; } @@ -119,7 +120,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphRWStart) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -199,7 +200,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphCoalesceExperiments) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -275,7 +276,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphColExtraction) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -371,7 +372,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphRndGenColIndx) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -449,7 +450,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphUpdatePathSizes) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -521,7 +522,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphScatterUpdate) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -666,7 +667,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphCoalesceDefragment) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -741,7 +742,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphRandomWalk) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); diff --git a/cpp/tests/experimental/sssp_test.cpp b/cpp/tests/experimental/sssp_test.cpp index 611abcb0d75..9364d261dec 100644 --- a/cpp/tests/experimental/sssp_test.cpp +++ b/cpp/tests/experimental/sssp_test.cpp @@ -16,9 +16,11 @@ #include #include +#include #include #include +#include #include #include @@ -28,12 +30,18 @@ #include +#include #include #include #include #include #include +// do the perf measurements +// enabled by command line parameter '--perf' +// +static int PERF = 0; +
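sssp_reference, declared just below, is the host-side Dijkstra implementation the GPU results are checked against. A minimal sketch of that algorithm over a CSR graph, using a lazy-deletion priority queue (illustrative names and signature, not the reference's exact one):

#include <functional>
#include <limits>
#include <queue>
#include <utility>
#include <vector>

void dijkstra_sketch(std::vector<int> const& offsets,      // CSR, size V + 1
                     std::vector<int> const& indices,      // out-neighbors
                     std::vector<float> const& weights,    // per-edge weights
                     int source,
                     std::vector<float>& distances,
                     std::vector<int>& predecessors)
{
  auto constexpr invalid = -1;
  auto num_vertices = static_cast<int>(offsets.size()) - 1;
  distances.assign(num_vertices, std::numeric_limits<float>::max());
  predecessors.assign(num_vertices, invalid);
  using entry = std::pair<float, int>;  // (tentative distance, vertex)
  std::priority_queue<entry, std::vector<entry>, std::greater<entry>> frontier;
  distances[source] = 0.0f;
  frontier.emplace(0.0f, source);
  while (!frontier.empty()) {
    auto [dist, u] = frontier.top();
    frontier.pop();
    if (dist > distances[u]) { continue; }  // stale queue entry, skip
    for (auto j = offsets[u]; j < offsets[u + 1]; ++j) {
      auto v        = indices[j];
      auto new_dist = dist + weights[j];
      if (new_dist < distances[v]) {  // relax edge (u, v)
        distances[v]    = new_dist;
        predecessors[v] = u;
        frontier.emplace(new_dist, v);
      }
    }
  }
}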
// Dijkstra's algorithm template void sssp_reference(edge_t const* offsets, @@ -80,9 +88,12 @@ void sssp_reference(edge_t const* offsets, typedef struct SSSP_Usecase_t { cugraph::test::input_graph_specifier_t input_graph_specifier{}; - size_t source{false}; - SSSP_Usecase_t(std::string const& graph_file_path, size_t source) : source(source) + size_t source{0}; + bool check_correctness{false}; + + SSSP_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true) + : source(source), check_correctness(check_correctness) { std::string graph_file_full_path{}; if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -94,13 +105,43 @@ typedef struct SSSP_Usecase_t { input_graph_specifier.graph_file_full_path = graph_file_full_path; }; - SSSP_Usecase_t(cugraph::test::rmat_params_t rmat_params, size_t source) : source(source) + SSSP_Usecase_t(cugraph::test::rmat_params_t rmat_params, + size_t source, + bool check_correctness = true) + : source(source), check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } SSSP_Usecase; +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, SSSP_Usecase const& configuration, bool renumber) +{ + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, true, renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + true, + renumber, + std::vector{0}, + size_t{1}); +} + class Tests_SSSP : public ::testing::TestWithParam { public: Tests_SSSP() {} @@ -113,61 +154,18 @@ class Tests_SSSP : public ::testing::TestWithParam { template void run_current_test(SSSP_Usecase const& configuration) { + constexpr bool renumber = true; + raft::handle_t handle{}; cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ?
cugraph::test:: - read_graph_from_matrix_market_file( - handle, configuration.input_graph_specifier.graph_file_full_path, true, false) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - true, - false); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - std::vector h_weights(graph_view.get_number_of_edges()); - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - raft::update_host(h_weights.data(), - graph_view.weights(), - graph_view.get_number_of_edges(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - ASSERT_TRUE(configuration.source >= 0 && - configuration.source <= graph_view.get_number_of_vertices()) - << "Starting sources should be >= 0 and" - << " less than the number of vertices in the graph."; - - std::vector h_reference_distances(graph_view.get_number_of_vertices()); - std::vector h_reference_predecessors(graph_view.get_number_of_vertices()); - - sssp_reference(h_offsets.data(), - h_indices.data(), - h_weights.data(), - h_reference_distances.data(), - h_reference_predecessors.data(), - graph_view.get_number_of_vertices(), - static_cast(configuration.source)); + ASSERT_TRUE(static_cast(configuration.source) >= 0 && + static_cast(configuration.source) < graph_view.get_number_of_vertices()); rmm::device_uvector d_distances(graph_view.get_number_of_vertices(), handle.get_stream()); @@ -178,53 +176,135 @@ class Tests_SSSP : public ::testing::TestWithParam { cugraph::experimental::sssp(handle, graph_view, - d_distances.begin(), - d_predecessors.begin(), + d_distances.data(), + d_predecessors.data(), static_cast(configuration.source), std::numeric_limits::max(), false); CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); - std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); - raft::update_host(h_cugraph_predecessors.data(), - d_predecessors.data(), - d_predecessors.size(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - auto max_weight_element = std::max_element(h_weights.begin(), h_weights.end()); - auto epsilon = *max_weight_element * weight_t{1e-6}; - auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; - - ASSERT_TRUE(std::equal(h_reference_distances.begin(), - h_reference_distances.end(), - h_cugraph_distances.begin(), - nearly_equal)) - << "distances do not match with the reference values."; - - for (auto it = 
h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { - auto i = std::distance(h_cugraph_predecessors.begin(), it); - if (*it == cugraph::invalid_vertex_id::value) { - ASSERT_TRUE(h_reference_predecessors[i] == *it) - << "vertex reachability do not match with the reference."; + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + std::vector h_weights(unrenumbered_graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + raft::update_host(h_weights.data(), + unrenumbered_graph_view.weights(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto unrenumbered_source = static_cast(configuration.source); + if (renumber) { + std::vector h_renumber_map_labels(d_renumber_map_labels.size()); + raft::update_host(h_renumber_map_labels.data(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + unrenumbered_source = h_renumber_map_labels[configuration.source]; + } + + std::vector h_reference_distances(unrenumbered_graph_view.get_number_of_vertices()); + std::vector h_reference_predecessors( + unrenumbered_graph_view.get_number_of_vertices()); + + sssp_reference(h_offsets.data(), + h_indices.data(), + h_weights.data(), + h_reference_distances.data(), + h_reference_predecessors.data(), + unrenumbered_graph_view.get_number_of_vertices(), + unrenumbered_source, + std::numeric_limits::max()); + + std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); + std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); + if (renumber) { + cugraph::experimental::unrenumber_local_int_vertices(handle, + d_predecessors.data(), + d_predecessors.size(), + d_renumber_map_labels.data(), + vertex_t{0}, + graph_view.get_number_of_vertices(), + true); + + auto d_unrenumbered_distances = cugraph::test::sort_by_key( + handle, d_renumber_map_labels.data(), d_distances.data(), d_renumber_map_labels.size()); + auto d_unrenumbered_predecessors = cugraph::test::sort_by_key(handle, + d_renumber_map_labels.data(), + d_predecessors.data(), + d_renumber_map_labels.size()); + + raft::update_host(h_cugraph_distances.data(), + d_unrenumbered_distances.data(), + d_unrenumbered_distances.size(), + handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_unrenumbered_predecessors.data(), + d_unrenumbered_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); } else { - auto pred_distance = h_reference_distances[*it]; - bool found{false}; - for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { - if (h_indices[j] == i) { - if (nearly_equal(pred_distance + h_weights[j], h_reference_distances[i])) { - found = true; - break; + raft::update_host( + h_cugraph_distances.data(), d_distances.data(), 
d_distances.size(), handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_predecessors.data(), + d_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + } + + auto max_weight_element = std::max_element(h_weights.begin(), h_weights.end()); + auto epsilon = *max_weight_element * weight_t{1e-6}; + auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; + + ASSERT_TRUE(std::equal(h_reference_distances.begin(), + h_reference_distances.end(), + h_cugraph_distances.begin(), + nearly_equal)) + << "distances do not match with the reference values."; + + for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { + auto i = std::distance(h_cugraph_predecessors.begin(), it); + if (*it == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_reference_predecessors[i] == *it) + << "vertex reachability does not match with the reference."; + } else { + auto pred_distance = h_reference_distances[*it]; + bool found{false}; + for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { + if (h_indices[j] == i) { + if (nearly_equal(pred_distance + h_weights[j], h_reference_distances[i])) { + found = true; + break; + } + } + } + ASSERT_TRUE(found) + << "no edge from the predecessor vertex to this vertex with the matching weight."; + } + } + } } }; @@ -237,9 +317,14 @@ INSTANTIATE_TEST_CASE_P( simple_test, Tests_SSSP, ::testing::Values( + // enable correctness checks SSSP_Usecase("test/datasets/karate.mtx", 0), SSSP_Usecase("test/datasets/dblp.mtx", 0), SSSP_Usecase("test/datasets/wiki2003.mtx", 1000), - SSSP_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0))); + SSSP_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), + // disable correctness checks for large graphs + SSSP_Usecase(cugraph::test::rmat_params_t{20, 16, 0.57, 0.19, 0.19, 0, false, false}, + 0, + false))); CUGRAPH_TEST_PROGRAM_MAIN()
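Every MG test in this PR, including mg_pagerank_test.cpp below, builds its 2D process grid the same way: pick the largest divisor of the GPU count that does not exceed its square root, so the row-by-column grid stays as close to square as possible. The heuristic in isolation:

#include <cmath>

// The subcomm factory splits comm_size GPUs into a row_comm_size x
// (comm_size / row_comm_size) grid; this mirrors the loop used in the tests.
int pick_row_comm_size(int comm_size)
{
  auto row_comm_size = static_cast<int>(std::sqrt(static_cast<double>(comm_size)));
  while (comm_size % row_comm_size != 0) { --row_comm_size; }
  return row_comm_size;  // e.g. 8 GPUs -> 2, 12 -> 3, 16 -> 4
}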
diff --git a/cpp/tests/pagerank/mg_pagerank_test.cpp b/cpp/tests/pagerank/mg_pagerank_test.cpp index 85ee9a4243e..f7b1e8dfbb4 100644 --- a/cpp/tests/pagerank/mg_pagerank_test.cpp +++ b/cpp/tests/pagerank/mg_pagerank_test.cpp @@ -16,13 +16,19 @@ #include #include +#include #include +#include +#include +#include #include #include #include #include +#include +#include #include @@ -33,11 +39,15 @@ typedef struct PageRank_Usecase_t { double personalization_ratio{0.0}; bool test_weighted{false}; + bool check_correctness{false}; PageRank_Usecase_t(std::string const& graph_file_path, double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { std::string graph_file_full_path{}; if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -51,14 +61,56 @@ typedef struct PageRank_Usecase_t { PageRank_Usecase_t(cugraph::test::rmat_params_t rmat_params, double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } PageRank_Usecase; +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, PageRank_Usecase const& configuration, bool renumber) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size)); + std::iota(partition_ids.begin(), + partition_ids.end(), + multi_gpu ? static_cast(comm_rank) : size_t{0}); + + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber, + partition_ids, + static_cast(comm_size)); +} + class Tests_MGPageRank : public ::testing::TestWithParam { public: Tests_MGPageRank() {} @@ -68,7 +120,7 @@ class Tests_MGPageRank : public ::testing::TestWithParam { virtual void SetUp() {} virtual void TearDown() {} - // Compare the results of running pagerank on multiple GPUs to that of a single-GPU run + // Compare the results of running PageRank on multiple GPUs to that of a single-GPU run template void run_current_test(PageRank_Usecase const& configuration) { @@ -86,168 +138,40 @@ class Tests_MGPageRank : public ::testing::TestWithParam { cugraph::partition_2d::subcomm_factory_t subcomm_factory(handle, row_comm_size); - // 2. create SG & MG graphs - - cugraph::experimental::graph_t sg_graph(handle); - rmm::device_uvector d_sg_renumber_map_labels(0, handle.get_stream()); - std::tie(sg_graph, d_sg_renumber_map_labels) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ?
cugraph::test::read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - true) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - true); + read_graph(handle, configuration, true); auto mg_graph_view = mg_graph.view(); - std::vector h_sg_renumber_map_labels(d_sg_renumber_map_labels.size()); - raft::update_host(h_sg_renumber_map_labels.data(), - d_sg_renumber_map_labels.data(), - d_sg_renumber_map_labels.size(), - handle.get_stream()); - - std::vector h_mg_renumber_map_labels(mg_graph_view.get_number_of_local_vertices()); - raft::update_host(h_mg_renumber_map_labels.data(), - d_mg_renumber_map_labels.data(), - d_mg_renumber_map_labels.size(), - handle.get_stream()); + // 3. generate personalization vertex/value pairs - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - // 2. generate personalization vertex/value pairs - - std::vector h_personalization_vertices{}; - std::vector h_personalization_values{}; + std::vector h_mg_personalization_vertices{}; + std::vector h_mg_personalization_values{}; if (configuration.personalization_ratio > 0.0) { - std::default_random_engine generator{}; + std::default_random_engine generator{ + static_cast(comm.get_rank()) /* seed */}; std::uniform_real_distribution distribution{0.0, 1.0}; - h_personalization_vertices.resize(sg_graph_view.get_number_of_vertices()); - std::iota(h_personalization_vertices.begin(), h_personalization_vertices.end(), vertex_t{0}); - h_personalization_vertices.erase( - std::remove_if(h_personalization_vertices.begin(), - h_personalization_vertices.end(), + h_mg_personalization_vertices.resize(mg_graph_view.get_number_of_local_vertices()); + std::iota(h_mg_personalization_vertices.begin(), + h_mg_personalization_vertices.end(), + mg_graph_view.get_local_vertex_first()); + h_mg_personalization_vertices.erase( + std::remove_if(h_mg_personalization_vertices.begin(), + h_mg_personalization_vertices.end(), [&generator, &distribution, configuration](auto v) { return distribution(generator) >= configuration.personalization_ratio; }), - h_personalization_vertices.end()); - h_personalization_values.resize(h_personalization_vertices.size()); - std::for_each(h_personalization_values.begin(), - h_personalization_values.end(), + h_mg_personalization_vertices.end()); + h_mg_personalization_values.resize(h_mg_personalization_vertices.size()); + std::for_each(h_mg_personalization_values.begin(), + h_mg_personalization_values.end(), [&distribution, &generator](auto& val) { val = distribution(generator); }); } - result_t constexpr alpha{0.85}; - result_t constexpr epsilon{1e-6}; - - // 3. 
run SG pagerank - - std::vector h_sg_personalization_vertices{}; - std::vector h_sg_personalization_values{}; - if (h_personalization_vertices.size() > 0) { - for (vertex_t i = 0; i < sg_graph_view.get_number_of_vertices(); ++i) { - auto it = std::lower_bound(h_personalization_vertices.begin(), - h_personalization_vertices.end(), - h_sg_renumber_map_labels[i]); - if (*it == h_sg_renumber_map_labels[i]) { - h_sg_personalization_vertices.push_back(i); - h_sg_personalization_values.push_back( - h_personalization_values[std::distance(h_personalization_vertices.begin(), it)]); - } - } - } - - rmm::device_uvector d_sg_personalization_vertices( - h_sg_personalization_vertices.size(), handle.get_stream()); - rmm::device_uvector d_sg_personalization_values(d_sg_personalization_vertices.size(), - handle.get_stream()); - if (d_sg_personalization_vertices.size() > 0) { - raft::update_device(d_sg_personalization_vertices.data(), - h_sg_personalization_vertices.data(), - h_sg_personalization_vertices.size(), - handle.get_stream()); - raft::update_device(d_sg_personalization_values.data(), - h_sg_personalization_values.data(), - h_sg_personalization_values.size(), - handle.get_stream()); - } - - rmm::device_uvector d_sg_pageranks(sg_graph_view.get_number_of_vertices(), - handle.get_stream()); - - cugraph::experimental::pagerank(handle, - sg_graph_view, - static_cast(nullptr), - d_sg_personalization_vertices.data(), - d_sg_personalization_values.data(), - static_cast(d_sg_personalization_vertices.size()), - d_sg_pageranks.begin(), - alpha, - epsilon, - std::numeric_limits::max(), // max_iterations - false, - false); - - std::vector h_sg_pageranks(sg_graph_view.get_number_of_vertices()); - raft::update_host( - h_sg_pageranks.data(), d_sg_pageranks.data(), d_sg_pageranks.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - // 4. run MG pagerank - - std::vector h_mg_personalization_vertices{}; - std::vector h_mg_personalization_values{}; - if (h_personalization_vertices.size() > 0) { - for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { - auto it = std::lower_bound(h_personalization_vertices.begin(), - h_personalization_vertices.end(), - h_mg_renumber_map_labels[i]); - if (*it == h_mg_renumber_map_labels[i]) { - h_mg_personalization_vertices.push_back(mg_graph_view.get_local_vertex_first() + i); - h_mg_personalization_values.push_back( - h_personalization_values[std::distance(h_personalization_vertices.begin(), it)]); - } - } - } - rmm::device_uvector d_mg_personalization_vertices( h_mg_personalization_vertices.size(), handle.get_stream()); rmm::device_uvector d_mg_personalization_values(d_mg_personalization_vertices.size(), @@ -263,6 +187,11 @@ class Tests_MGPageRank : public ::testing::TestWithParam { handle.get_stream()); } + // 4. 
run MG PageRank + + result_t constexpr alpha{0.85}; + result_t constexpr epsilon{1e-6}; + rmm::device_uvector d_mg_pageranks(mg_graph_view.get_number_of_local_vertices(), handle.get_stream()); @@ -274,44 +203,145 @@ class Tests_MGPageRank : public ::testing::TestWithParam { d_mg_personalization_vertices.data(), d_mg_personalization_values.data(), static_cast(d_mg_personalization_vertices.size()), - d_mg_pageranks.begin(), + d_mg_pageranks.data(), alpha, epsilon, std::numeric_limits::max(), - false, false); CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - std::vector h_mg_pageranks(mg_graph_view.get_number_of_local_vertices()); - raft::update_host( - h_mg_pageranks.data(), d_mg_pageranks.data(), d_mg_pageranks.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - // 5. compare SG & MG results - std::vector h_sg_shuffled_pageranks(sg_graph_view.get_number_of_vertices(), - result_t{0.0}); - for (size_t i = 0; i < h_sg_pageranks.size(); ++i) { - h_sg_shuffled_pageranks[h_sg_renumber_map_labels[i]] = h_sg_pageranks[i]; - } + if (configuration.check_correctness) { + // 5-1. create SG graph + + cugraph::experimental::graph_t sg_graph(handle); + std::tie(sg_graph, std::ignore) = + read_graph(handle, configuration, false); + + auto sg_graph_view = sg_graph.view(); + + // 5-2. collect personalization vertex/value pairs + + rmm::device_uvector d_sg_personalization_vertices(0, handle.get_stream()); + rmm::device_uvector d_sg_personalization_values(0, handle.get_stream()); + if (configuration.personalization_ratio > 0.0) { + rmm::device_uvector d_unrenumbered_personalization_vertices( + d_mg_personalization_vertices.size(), handle.get_stream()); + rmm::device_uvector d_unrenumbered_personalization_values( + d_unrenumbered_personalization_vertices.size(), handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_vertices.data(), + d_mg_personalization_vertices.data(), + d_mg_personalization_vertices.size(), + handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_values.data(), + d_mg_personalization_values.data(), + d_mg_personalization_values.size(), + handle.get_stream()); + + std::vector vertex_partition_lasts(comm_size); + for (size_t i = 0; i < vertex_partition_lasts.size(); ++i) { + vertex_partition_lasts[i] = mg_graph_view.get_vertex_partition_last(i); + } + cugraph::experimental::unrenumber_int_vertices( + handle, + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.size(), + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + handle.get_stream()); + + rmm::device_scalar d_local_personalization_vector_size( + d_unrenumbered_personalization_vertices.size(), handle.get_stream()); + rmm::device_uvector d_recvcounts(comm_size, handle.get_stream()); + comm.allgather( + d_local_personalization_vector_size.data(), d_recvcounts.data(), 1, handle.get_stream()); + std::vector recvcounts(d_recvcounts.size()); + raft::update_host( + recvcounts.data(), d_recvcounts.data(), d_recvcounts.size(), handle.get_stream()); + auto status = comm.sync_stream(handle.get_stream()); + ASSERT_EQ(status, raft::comms::status_t::SUCCESS); + + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); +
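// For instance, with three ranks contributing recvcounts = {3, 1, 2} personalization
// pairs, the exclusive prefix sum above yields displacements = {0, 3, 4}: rank 0's
// pairs land at offset 0, rank 1's at offset 3, rank 2's at offset 4, and the
// gathered arrays hold displacements.back() + recvcounts.back() = 6 entries in total.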
+ d_sg_personalization_vertices.resize(displacements.back() + recvcounts.back(), + handle.get_stream()); + d_sg_personalization_values.resize(d_sg_personalization_vertices.size(), + handle.get_stream()); + + comm.allgatherv(d_unrenumbered_personalization_vertices.data(), + d_sg_personalization_vertices.data(), + recvcounts.data(), + displacements.data(), + handle.get_stream()); + comm.allgatherv(d_unrenumbered_personalization_values.data(), + d_sg_personalization_values.data(), + recvcounts.data(), + displacements.data(), + handle.get_stream()); + + cugraph::test::sort_by_key(handle, + d_sg_personalization_vertices.data(), + d_sg_personalization_values.data(), + d_sg_personalization_vertices.size()); + } + + // 5-3. run SG PageRank + + rmm::device_uvector d_sg_pageranks(sg_graph_view.get_number_of_vertices(), + handle.get_stream()); + + cugraph::experimental::pagerank(handle, + sg_graph_view, + static_cast(nullptr), + d_sg_personalization_vertices.data(), + d_sg_personalization_values.data(), + static_cast(d_sg_personalization_vertices.size()), + d_sg_pageranks.data(), + alpha, + epsilon, + std::numeric_limits::max(), // max_iterations + false); + + // 5-4. compare + + std::vector h_sg_pageranks(sg_graph_view.get_number_of_vertices()); + raft::update_host( + h_sg_pageranks.data(), d_sg_pageranks.data(), d_sg_pageranks.size(), handle.get_stream()); + + std::vector h_mg_pageranks(mg_graph_view.get_number_of_local_vertices()); + raft::update_host( + h_mg_pageranks.data(), d_mg_pageranks.data(), d_mg_pageranks.size(), handle.get_stream()); + + std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size()); + raft::update_host(h_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for vertices with low PageRank values (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; - auto threshold_ratio = 1e-3; - auto threshold_magnitude = - (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * - threshold_ratio; // skip comparison for low PageRank verties (lowly ranked vertices) - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - return std::abs(lhs - rhs) < - std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); - }; - - for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { - auto mapped_vertex = h_mg_renumber_map_labels[i]; - ASSERT_TRUE(nearly_equal(h_mg_pageranks[i], h_sg_shuffled_pageranks[mapped_vertex])) - << "MG PageRank value for vertex: " << i << " in rank: " << comm_rank - << " has value: " << h_mg_pageranks[i] - << " which exceeds the error margin for comparing to SG value: " - << h_sg_shuffled_pageranks[mapped_vertex]; + for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { + auto mapped_vertex = h_mg_renumber_map_labels[i]; + ASSERT_TRUE(nearly_equal(h_mg_pageranks[i], h_sg_pageranks[mapped_vertex])) + << "MG PageRank value for vertex: " << mapped_vertex << " in rank: " << comm_rank + << " has value: " << h_mg_pageranks[i] + << " which exceeds the error margin for comparing to SG value: " + << h_sg_pageranks[mapped_vertex]; + } } } };
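The comparison rule shared by the PageRank and Katz Centrality checks combines a relative tolerance with an absolute floor: both tests scale the floor by 1/V, the magnitude of a typical normalized score, so the lowest-ranked vertices do not trigger spurious relative-error failures. In isolation (an illustrative free-function form of the lambda used in the tests):

#include <algorithm>
#include <cmath>

// Two scores compare equal when their difference is below the larger of the
// relative tolerance (scaled by the bigger score) and the absolute floor.
bool nearly_equal(double lhs, double rhs, double threshold_ratio, double threshold_magnitude)
{
  return std::abs(lhs - rhs) <
         std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
}

With threshold_ratio = 1e-3 and a million-vertex graph, for example, any two scores closer than 1e-9 compare equal regardless of their relative error.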
enable correctness checks PageRank_Usecase("test/datasets/karate.mtx", 0.0, false), PageRank_Usecase("test/datasets/karate.mtx", 0.5, false), PageRank_Usecase("test/datasets/karate.mtx", 0.0, true), @@ -352,6 +383,15 @@ INSTANTIATE_TEST_CASE_P( true), PageRank_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0.5, - true))); + true), + // disable correctness checks for large graphs + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, false, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, false, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, true, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, true, false))); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/generate_graph_from_edgelist.cu b/cpp/tests/utilities/generate_graph_from_edgelist.cu index 1b9fe6051f7..a9df392d2fb 100644 --- a/cpp/tests/utilities/generate_graph_from_edgelist.cu +++ b/cpp/tests/utilities/generate_graph_from_edgelist.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -28,7 +29,7 @@ namespace cugraph { namespace test { -namespace detail { +namespace { template , rmm::device_uvector>> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber) +generate_graph_from_edgelist_impl(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber) { CUGRAPH_EXPECTS(renumber, "renumber should be true if multi_gpu is true."); @@ -59,95 +60,88 @@ generate_graph_from_edgelist(raft::handle_t const& handle, auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); - vertex_t number_of_vertices = static_cast(vertices.size()); - - auto vertex_key_func = - cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}; - vertices.resize(thrust::distance(vertices.begin(), - thrust::remove_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertices.begin(), - vertices.end(), - [comm_rank, key_func = vertex_key_func] __device__(auto val) { - return key_func(val) != comm_rank; - })), - handle.get_stream()); - vertices.shrink_to_fit(handle.get_stream()); - - auto edge_key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - false, comm_size, row_comm_size, col_comm_size}; - size_t number_of_local_edges{}; - if (test_weighted) { - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_rows.begin(), edgelist_cols.begin(), edgelist_weights.begin())); - number_of_local_edges = thrust::distance( - edge_first, - thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edge_first, - edge_first + edgelist_rows.size(), - [comm_rank, key_func = edge_key_func] __device__(auto e) { - auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); - auto minor = store_transposed ? 
thrust::get<0>(e) : thrust::get<1>(e); - return key_func(major, minor) != comm_rank; - })); - } else { - auto edge_first = - thrust::make_zip_iterator(thrust::make_tuple(edgelist_rows.begin(), edgelist_cols.begin())); - number_of_local_edges = thrust::distance( - edge_first, - thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edge_first, - edge_first + edgelist_rows.size(), - [comm_rank, key_func = edge_key_func] __device__(auto e) { - auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); - auto minor = store_transposed ? thrust::get<0>(e) : thrust::get<1>(e); - return key_func(major, minor) != comm_rank; - })); - } - - edgelist_rows.resize(number_of_local_edges, handle.get_stream()); - edgelist_rows.shrink_to_fit(handle.get_stream()); - edgelist_cols.resize(number_of_local_edges, handle.get_stream()); - edgelist_cols.shrink_to_fit(handle.get_stream()); - if (test_weighted) { - edgelist_weights.resize(number_of_local_edges, handle.get_stream()); - edgelist_weights.shrink_to_fit(handle.get_stream()); - } + auto local_partition_id_op = + [comm_size, + key_func = cugraph::experimental::detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto pair) { + return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) / + comm_size; // global partition id to local partition id + }; + auto pair_first = + store_transposed + ? thrust::make_zip_iterator(thrust::make_tuple(edgelist_cols.begin(), edgelist_rows.begin())) + : thrust::make_zip_iterator(thrust::make_tuple(edgelist_rows.begin(), edgelist_cols.begin())); + auto edge_counts = test_weighted + ? cugraph::experimental::groupby_and_count(pair_first, + pair_first + edgelist_rows.size(), + edgelist_weights.begin(), + local_partition_id_op, + col_comm_size, + handle.get_stream()) + : cugraph::experimental::groupby_and_count(pair_first, + pair_first + edgelist_rows.size(), + local_partition_id_op, + col_comm_size, + handle.get_stream()); + + std::vector h_edge_counts(edge_counts.size()); + raft::update_host( + h_edge_counts.data(), edge_counts.data(), edge_counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + std::vector h_displacements(h_edge_counts.size(), size_t{0}); + std::partial_sum(h_edge_counts.begin(), h_edge_counts.end() - 1, h_displacements.begin() + 1); // 3. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); cugraph::experimental::partition_t partition{}; - vertex_t aggregate_number_of_vertices{}; + vertex_t number_of_vertices{}; edge_t number_of_edges{}; - // FIXME: set do_expensive_check to false once validated - std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = - cugraph::experimental::renumber_edgelist( - handle, - vertices.data(), - static_cast(vertices.size()), - store_transposed ? edgelist_cols.data() : edgelist_rows.data(), - store_transposed ? edgelist_rows.data() : edgelist_cols.data(), - edgelist_rows.size(), - false, - true); - assert(aggregate_number_of_vertices == number_of_vertices); + { + std::vector major_ptrs(h_edge_counts.size()); + std::vector minor_ptrs(major_ptrs.size()); + std::vector counts(major_ptrs.size()); + for (size_t i = 0; i < h_edge_counts.size(); ++i) { + major_ptrs[i] = + (store_transposed ? edgelist_cols.begin() : edgelist_rows.begin()) + h_displacements[i]; + minor_ptrs[i] = + (store_transposed ? 
edgelist_rows.begin() : edgelist_cols.begin()) + h_displacements[i]; + counts[i] = static_cast(h_edge_counts[i]); + } + // FIXME: set do_expensive_check to false once validated + std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) = + cugraph::experimental::renumber_edgelist( + handle, + vertices.data(), + static_cast(vertices.size()), + major_ptrs, + minor_ptrs, + counts, + true); + } // 4. create a graph + std::vector> edgelists( + h_edge_counts.size()); + for (size_t i = 0; i < h_edge_counts.size(); ++i) { + edgelists[i] = cugraph::experimental::edgelist_t{ + edgelist_rows.data() + h_displacements[i], + edgelist_cols.data() + h_displacements[i], + test_weighted ? edgelist_weights.data() + h_displacements[i] + : static_cast(nullptr), + static_cast(h_edge_counts[i])}; + } + return std::make_tuple( cugraph::experimental::graph_t( handle, - std::vector>{ - cugraph::experimental::edgelist_t{ - edgelist_rows.data(), - edgelist_cols.data(), - test_weighted ? edgelist_weights.data() : nullptr, - static_cast(edgelist_rows.size())}}, + edgelists, partition, number_of_vertices, number_of_edges, - cugraph::experimental::graph_properties_t{is_symmetric, false}, + cugraph::experimental::graph_properties_t{is_symmetric, false, test_weighted}, true, true), std::move(renumber_map_labels)); @@ -163,14 +157,14 @@ std::enable_if_t< std::tuple< cugraph::experimental::graph_t, rmm::device_uvector>> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber) +generate_graph_from_edgelist_impl(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber) { vertex_t number_of_vertices = static_cast(vertices.size()); @@ -196,13 +190,13 @@ generate_graph_from_edgelist(raft::handle_t const& handle, test_weighted ? edgelist_weights.data() : nullptr, static_cast(edgelist_rows.size())}, number_of_vertices, - cugraph::experimental::graph_properties_t{is_symmetric, false}, + cugraph::experimental::graph_properties_t{is_symmetric, false, test_weighted}, renumber ? true : false, true), std::move(renumber_map_labels)); } -} // namespace detail +} // namespace template ( - handle, - std::move(vertices), - std::move(edgelist_rows), - std::move(edgelist_cols), - std::move(edgelist_weights), - is_symmetric, - test_weighted, - renumber); + return generate_graph_from_edgelist_impl( + handle, + std::move(vertices), + std::move(edgelist_rows), + std::move(edgelist_cols), + std::move(edgelist_weights), + is_symmetric, + test_weighted, + renumber); } // explicit instantiations diff --git a/cpp/tests/utilities/matrix_market_file_utilities.cu b/cpp/tests/utilities/matrix_market_file_utilities.cu index ddbbac603ee..bf7539864be 100644 --- a/cpp/tests/utilities/matrix_market_file_utilities.cu +++ b/cpp/tests/utilities/matrix_market_file_utilities.cu @@ -13,9 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #include +#include #include +#include #include #include @@ -339,7 +342,73 @@ read_graph_from_matrix_market_file(raft::handle_t const& handle, d_vertices.begin(), d_vertices.end(), vertex_t{0}); + handle.get_stream_view().synchronize(); + + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + auto vertex_key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}; + d_vertices.resize( + thrust::distance( + d_vertices.begin(), + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_vertices.begin(), + d_vertices.end(), + [comm_rank, key_func = vertex_key_func] __device__(auto val) { + return key_func(val) != comm_rank; + })), + handle.get_stream()); + d_vertices.shrink_to_fit(handle.get_stream()); + + auto edge_key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}; + size_t number_of_local_edges{}; + if (test_weighted) { + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + d_edgelist_rows.begin(), d_edgelist_cols.begin(), d_edgelist_weights.begin())); + number_of_local_edges = thrust::distance( + edge_first, + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + d_edgelist_rows.size(), + [comm_rank, key_func = edge_key_func] __device__(auto e) { + auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); + auto minor = store_transposed ? thrust::get<0>(e) : thrust::get<1>(e); + return key_func(major, minor) != comm_rank; + })); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(d_edgelist_rows.begin(), d_edgelist_cols.begin())); + number_of_local_edges = thrust::distance( + edge_first, + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + d_edgelist_rows.size(), + [comm_rank, key_func = edge_key_func] __device__(auto e) { + auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); + auto minor = store_transposed ? thrust::get<0>(e) : thrust::get<1>(e); + return key_func(major, minor) != comm_rank; + })); + } + + d_edgelist_rows.resize(number_of_local_edges, handle.get_stream()); + d_edgelist_rows.shrink_to_fit(handle.get_stream()); + d_edgelist_cols.resize(number_of_local_edges, handle.get_stream()); + d_edgelist_cols.shrink_to_fit(handle.get_stream()); + if (test_weighted) { + d_edgelist_weights.resize(number_of_local_edges, handle.get_stream()); + d_edgelist_weights.shrink_to_fit(handle.get_stream()); + } + } + handle.get_stream_view().synchronize(); return generate_graph_from_edgelist( handle, std::move(d_vertices), diff --git a/cpp/tests/utilities/rmat_utilities.cu b/cpp/tests/utilities/rmat_utilities.cu index 16ea7a486fc..3f0bb0b4a1f 100644 --- a/cpp/tests/utilities/rmat_utilities.cu +++ b/cpp/tests/utilities/rmat_utilities.cu @@ -13,10 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #include +#include #include +#include #include +#include #include #include @@ -41,39 +45,191 @@ generate_graph_from_rmat_params(raft::handle_t const& handle, double a, double b, double c, - uint64_t seed, + uint64_t base_seed, bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber) + bool renumber, + std::vector const& partition_ids, + size_t num_partitions) { + CUGRAPH_EXPECTS(!multi_gpu || renumber, "renumber should be true if multi_gpu is true."); + CUGRAPH_EXPECTS(size_t{1} << scale <= static_cast(std::numeric_limits::max()), + "vertex_t overflow."); + CUGRAPH_EXPECTS( + (size_t{1} << scale) * edge_factor <= static_cast(std::numeric_limits::max()), + " edge_t overflow."); + + vertex_t number_of_vertices = static_cast(size_t{1} << scale); + edge_t number_of_edges = + static_cast(static_cast(number_of_vertices) * edge_factor); + + std::vector partition_edge_counts(partition_ids.size()); + std::vector partition_vertex_firsts(partition_ids.size()); + std::vector partition_vertex_lasts(partition_ids.size()); + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + partition_edge_counts[i] = number_of_edges / num_partitions + + (id < number_of_edges % num_partitions ? edge_t{1} : edge_t{0}); + + partition_vertex_firsts[i] = (number_of_vertices / num_partitions) * id; + partition_vertex_lasts[i] = (number_of_vertices / num_partitions) * (id + 1); + if (id < number_of_vertices % num_partitions) { + partition_vertex_firsts[i] += id; + partition_vertex_lasts[i] += id + 1; + } else { + partition_vertex_firsts[i] += number_of_vertices % num_partitions; + partition_vertex_lasts[i] += number_of_vertices % num_partitions; + } + } + rmm::device_uvector d_edgelist_rows(0, handle.get_stream()); rmm::device_uvector d_edgelist_cols(0, handle.get_stream()); - std::tie(d_edgelist_rows, d_edgelist_cols) = - cugraph::experimental::generate_rmat_edgelist( - handle, scale, edge_factor, a, b, c, seed, undirected ? true : false, scramble_vertex_ids); + rmm::device_uvector d_edgelist_weights(0, handle.get_stream()); + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + rmm::device_uvector d_tmp_rows(0, handle.get_stream()); + rmm::device_uvector d_tmp_cols(0, handle.get_stream()); + std::tie(i == 0 ? d_edgelist_rows : d_tmp_rows, i == 0 ? d_edgelist_cols : d_tmp_cols) = + cugraph::experimental::generate_rmat_edgelist(handle, + scale, + partition_edge_counts[i], + a, + b, + c, + base_seed + id, + undirected ? true : false, + scramble_vertex_ids); + + rmm::device_uvector d_tmp_weights(0, handle.get_stream()); + if (test_weighted) { + if (i == 0) { + d_edgelist_weights.resize(d_edgelist_rows.size(), handle.get_stream()); + } else { + d_tmp_weights.resize(d_tmp_rows.size(), handle.get_stream()); + } + + raft::random::Rng rng(base_seed + num_partitions + id); + rng.uniform(i == 0 ? d_edgelist_weights.data() : d_tmp_weights.data(), + i == 0 ? 
d_edgelist_weights.size() : d_tmp_weights.size(), + weight_t{0.0}, + weight_t{1.0}, + handle.get_stream()); + } + + if (i > 0) { + auto start_offset = d_edgelist_rows.size(); + d_edgelist_rows.resize(start_offset + d_tmp_rows.size(), handle.get_stream()); + d_edgelist_cols.resize(d_edgelist_rows.size(), handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_tmp_rows.begin(), + d_tmp_rows.end(), + d_edgelist_rows.begin() + start_offset); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_tmp_cols.begin(), + d_tmp_cols.end(), + d_edgelist_cols.begin() + start_offset); + if (test_weighted) { + d_edgelist_weights.resize(d_edgelist_rows.size(), handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_tmp_weights.begin(), + d_tmp_weights.end(), + d_edgelist_weights.begin() + start_offset); + } + } + } + if (undirected) { // FIXME: need to symmetrize CUGRAPH_FAIL("unimplemented."); } - rmm::device_uvector d_edgelist_weights(test_weighted ? d_edgelist_rows.size() : 0, - handle.get_stream()); - if (test_weighted) { - raft::random::Rng rng(seed + 1); - rng.uniform(d_edgelist_weights.data(), - d_edgelist_weights.size(), - weight_t{0.0}, - weight_t{1.0}, - handle.get_stream()); + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + rmm::device_uvector d_rx_edgelist_rows(0, handle.get_stream()); + rmm::device_uvector d_rx_edgelist_cols(0, handle.get_stream()); + rmm::device_uvector d_rx_edgelist_weights(0, handle.get_stream()); + if (test_weighted) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(store_transposed ? d_edgelist_cols.begin() : d_edgelist_rows.begin(), + store_transposed ? d_edgelist_rows.begin() : d_edgelist_cols.begin(), + d_edgelist_weights.begin())); + + std::forward_as_tuple(std::tie(store_transposed ? d_rx_edgelist_cols : d_rx_edgelist_rows, + store_transposed ? d_rx_edgelist_rows : d_rx_edgelist_cols, + d_rx_edgelist_weights), + std::ignore) = + cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + edge_first, + edge_first + d_edgelist_rows.size(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(store_transposed ? d_edgelist_cols.begin() : d_edgelist_rows.begin(), + store_transposed ? d_edgelist_rows.begin() : d_edgelist_cols.begin())); + + std::forward_as_tuple(std::tie(store_transposed ? d_rx_edgelist_cols : d_rx_edgelist_rows, + store_transposed ? 
d_rx_edgelist_rows : d_rx_edgelist_cols), + std::ignore) = + cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + edge_first, + edge_first + d_edgelist_rows.size(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } + + d_edgelist_rows = std::move(d_rx_edgelist_rows); + d_edgelist_cols = std::move(d_rx_edgelist_cols); + d_edgelist_weights = std::move(d_rx_edgelist_weights); + } + + rmm::device_uvector d_vertices(0, handle.get_stream()); + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + auto start_offset = d_vertices.size(); + d_vertices.resize(start_offset + (partition_vertex_lasts[i] - partition_vertex_firsts[i]), + handle.get_stream()); + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_vertices.begin() + start_offset, + d_vertices.end(), + partition_vertex_firsts[i]); } - rmm::device_uvector d_vertices(static_cast(size_t{1} << scale), - handle.get_stream()); - thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - d_vertices.begin(), - d_vertices.end(), - vertex_t{0}); + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector d_rx_vertices(0, handle.get_stream()); + std::tie(d_rx_vertices, std::ignore) = cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + d_vertices.begin(), + d_vertices.end(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{ + comm_size}] __device__(auto val) { return key_func(val); }, + handle.get_stream()); + d_vertices = std::move(d_rx_vertices); + } return generate_graph_from_edgelist( handle, @@ -90,59 +246,71 @@ generate_graph_from_rmat_params(raft::handle_t const& handle, template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + 
double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> @@ -157,105 +325,128 @@ generate_graph_from_rmat_params( bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber); + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - 
double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> @@ -270,105 +461,128 @@ generate_graph_from_rmat_params( bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber); + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool 
renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> @@ -383,49 +597,60 @@ generate_graph_from_rmat_params( bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber); + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); 
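
[editor's note] The new trailing parameters (partition_ids, num_partitions) in these
instantiations feed the per-partition edge/vertex split computed near the top of this file.
A minimal, hedged sketch of that arithmetic in plain C++ (hypothetical helper name, not part
of the PR): contiguous ranges of size num_vertices / num_partitions, with the remainder
spread one vertex at a time over the lowest partition ids.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <utility>

// Closed form of the partition_vertex_firsts/partition_vertex_lasts loop above.
std::pair<int64_t, int64_t> partition_vertex_range(int64_t num_vertices,
                                                   size_t num_partitions,
                                                   size_t id)
{
  auto quotient  = num_vertices / static_cast<int64_t>(num_partitions);
  auto remainder = num_vertices % static_cast<int64_t>(num_partitions);
  // the first `remainder` partitions each take one extra vertex
  auto first = quotient * static_cast<int64_t>(id) + std::min(static_cast<int64_t>(id), remainder);
  auto last  = first + quotient + (static_cast<int64_t>(id) < remainder ? int64_t{1} : int64_t{0});
  return {first, last};
}

// e.g. num_vertices = 10, num_partitions = 3 yields [0,4), [4,7), [7,10); the edge counts are
// split the same way (number_of_edges / num_partitions, plus one when id < the remainder).
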
+generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/test_utilities.hpp b/cpp/tests/utilities/test_utilities.hpp index 37e87c62247..e81a76b4163 100644 --- a/cpp/tests/utilities/test_utilities.hpp +++ b/cpp/tests/utilities/test_utilities.hpp @@ -106,6 +106,22 @@ static const std::string& get_rapids_dataset_root_dir() return rdrd; } +template +std::tuple, + rmm::device_uvector> +generate_graph_from_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber); + // returns a tuple of (rows, columns, weights, number_of_vertices, is_symmetric) template std::tuple, @@ -130,22 +146,6 @@ read_graph_from_matrix_market_file(raft::handle_t const& handle, bool test_weighted, bool renumber); -template -std::tuple, - rmm::device_uvector> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber); - template const& partition_ids, + size_t num_partitions); struct rmat_params_t { size_t scale{}; @@ -182,19 +184,5 @@ struct input_graph_specifier_t { rmat_params_t rmat_params{}; }; -template -std::enable_if_t::value, bool> is_valid_vertex(vertex_t num_vertices, - vertex_t v) -{ - return (v >= 0) && (v < num_vertices); -} - -template -std::enable_if_t::value, bool> is_valid_vertex(vertex_t num_vertices, - vertex_t v) -{ - return v < num_vertices; -} - } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/thrust_wrapper.cu b/cpp/tests/utilities/thrust_wrapper.cu new file mode 100644 index 00000000000..5d32fb8a5d1 --- /dev/null +++ b/cpp/tests/utilities/thrust_wrapper.cu @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include +#include + +namespace cugraph { +namespace test { + +template +rmm::device_uvector sort_by_key(raft::handle_t const& handle, + vertex_t const* keys, + value_t const* values, + size_t num_pairs) +{ + rmm::device_uvector sorted_keys(num_pairs, handle.get_stream_view()); + rmm::device_uvector sorted_values(num_pairs, handle.get_stream_view()); + + thrust::copy( + rmm::exec_policy(handle.get_stream_view()), keys, keys + num_pairs, sorted_keys.begin()); + thrust::copy( + rmm::exec_policy(handle.get_stream_view()), values, values + num_pairs, sorted_values.begin()); + + thrust::sort_by_key(rmm::exec_policy(handle.get_stream_view()), + sorted_keys.begin(), + sorted_keys.end(), + sorted_values.begin()); + + return sorted_values; +} + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int32_t const* keys, + float const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int32_t const* keys, + double const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int32_t const* keys, + int32_t const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int64_t const* keys, + float const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int64_t const* keys, + double const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int64_t const* keys, + int64_t const* values, + size_t num_pairs); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/utilities/thrust_wrapper.hpp b/cpp/tests/utilities/thrust_wrapper.hpp new file mode 100644 index 00000000000..579dc3c550f --- /dev/null +++ b/cpp/tests/utilities/thrust_wrapper.hpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace cugraph { +namespace test { + +template +rmm::device_uvector sort_by_key(raft::handle_t const& handle, + vertex_t const* keys, + value_t const* values, + size_t num_pairs); + +} // namespace test +} // namespace cugraph diff --git a/python/cugraph/community/egonet_wrapper.pyx b/python/cugraph/community/egonet_wrapper.pyx index ead41705628..23aa159314f 100644 --- a/python/cugraph/community/egonet_wrapper.pyx +++ b/python/cugraph/community/egonet_wrapper.pyx @@ -42,7 +42,7 @@ def egonet(input_graph, vertices, radius=1): num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) - num_partition_edges = num_edges + num_local_edges = num_edges cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -50,8 +50,10 @@ def egonet(input_graph, vertices, radius=1): if weights is not None: c_edge_weights = weights.__cuda_array_interface__['data'][0] weight_t = weights.dtype + is_weighted = True else: weight_t = np.dtype("float32") + is_weighted = False # Pointers for egonet vertices = vertices.astype('int32') @@ -72,10 +74,11 @@ def egonet(input_graph, vertices, radius=1): ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_verts, num_edges, False, + is_weighted, False, False) if(weight_t==np.dtype("float32")): diff --git a/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx b/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx index ccae26fe7e6..5fb9de788cf 100644 --- a/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx +++ b/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx @@ -52,8 +52,12 @@ def mg_katz_centrality(input_df, if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True + raise NotImplementedError # FIXME: c_edge_weights is always set to NULL else: + weights = None weight_t = np.dtype("float32") + is_weighted = False if alpha is None: alpha = 0.1 @@ -67,11 +71,13 @@ def mg_katz_centrality(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] cdef uintptr_t c_edge_weights = NULL + if weights is not None: + c_edge_weights = weights.__cuda_array_interface__['data'][0] # FIXME: data is on device, move to host (to_pandas()), convert to np array and access pointer to pass to C vertex_partition_offsets_host = vertex_partition_offsets.values_host @@ -85,9 +91,10 @@ def mg_katz_centrality(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, + is_weighted, True, True) df = cudf.DataFrame() diff --git a/python/cugraph/dask/community/louvain_wrapper.pyx b/python/cugraph/dask/community/louvain_wrapper.pyx index f58630d07aa..a3cebeac272 100644 --- a/python/cugraph/dask/community/louvain_wrapper.pyx +++ b/python/cugraph/dask/community/louvain_wrapper.pyx @@ -56,12 +56,12 @@ def louvain(input_df, src = input_df['src'] dst = input_df['dst'] - num_partition_edges = len(src) + num_local_edges = len(src) if "value" in input_df.columns: weights = input_df['value'] else: - weights 
= cudf.Series(np.full(num_partition_edges, 1.0, dtype=np.float32)) + weights = cudf.Series(np.full(num_local_edges, 1.0, dtype=np.float32)) vertex_t = src.dtype if num_global_edges > (2**31 - 1): @@ -94,9 +94,10 @@ def louvain(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, sorted_by_degree, + True, False, True) # store_transposed, multi_gpu # Create the output dataframe, column lengths must be equal to the number of diff --git a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx index 12f2342559b..c2f92f0f33b 100644 --- a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx +++ b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx @@ -51,8 +51,12 @@ def mg_pagerank(input_df, if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True + raise NotImplementedError # FIXME: c_edge_weights is always set to NULL else: + weights = None weight_t = np.dtype("float32") + is_weighted = False # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. @@ -62,11 +66,13 @@ def mg_pagerank(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] cdef uintptr_t c_edge_weights = NULL + if weights is not None: + c_edge_weights = weights.__cuda_array_interface__['data'][0] # FIXME: data is on device, move to host (to_pandas()), convert to np array and access pointer to pass to C vertex_partition_offsets_host = vertex_partition_offsets.values_host @@ -81,9 +87,10 @@ def mg_pagerank(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, + is_weighted, True, True) df = cudf.DataFrame() diff --git a/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx b/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx index 527cb2bcf0a..44630ba5fb3 100644 --- a/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx +++ b/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx @@ -58,7 +58,7 @@ def mg_bfs(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -77,9 +77,10 @@ def mg_bfs(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, + False, # BFS runs on unweighted graphs False, True) # Generate the cudf.DataFrame result diff --git a/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx b/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx index 15d956836b4..82a4ebe04d6 100644 --- a/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx +++ b/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx @@ -46,9 +46,11 @@ def mg_sssp(input_df, if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True else: weights = None weight_t = np.dtype("float32") 
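            # [editor's note] hedged illustration: an input frame without a
            # 'value' column, e.g. cudf.DataFrame({'src': [0, 1], 'dst': [1, 2]}),
            # is treated as unweighted -- weight_t falls back to float32 and
            # is_weighted=False is forwarded to the graph container below,
            # mirroring the other MG wrappers in this diff.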
+ is_weighted = False # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. @@ -58,7 +60,7 @@ def mg_sssp(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -79,9 +81,10 @@ def mg_sssp(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, + is_weighted, False, True) # Generate the cudf.DataFrame result diff --git a/python/cugraph/link_analysis/pagerank_wrapper.pyx b/python/cugraph/link_analysis/pagerank_wrapper.pyx index 81a68d42360..2c619a052ec 100644 --- a/python/cugraph/link_analysis/pagerank_wrapper.pyx +++ b/python/cugraph/link_analysis/pagerank_wrapper.pyx @@ -42,7 +42,7 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) df = cudf.DataFrame() df['vertex'] = cudf.Series(np.arange(num_verts, dtype=np.int32)) @@ -71,8 +71,10 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. if weights is not None: c_edge_weights = weights.__cuda_array_interface__['data'][0] weight_t = weights.dtype + is_weighted = True else: weight_t = np.dtype("float32") + is_weighted = False # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. @@ -96,10 +98,10 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. 
((numberTypeEnum.int32Type)), ((numberTypeEnum.int32Type)), ((numberTypeMap[weight_t])), - #num_verts, num_edges, - num_partition_edges, + num_local_edges, num_verts, num_edges, False, + is_weighted, True, False) diff --git a/python/cugraph/structure/graph_utilities.pxd b/python/cugraph/structure/graph_utilities.pxd index 10c90f44cb8..b169e42ccf8 100644 --- a/python/cugraph/structure/graph_utilities.pxd +++ b/python/cugraph/structure/graph_utilities.pxd @@ -46,10 +46,11 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": numberTypeEnum vertexType, numberTypeEnum edgeType, numberTypeEnum weightType, - size_t num_partition_edges, + size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, bool sorted_by_degree, + bool is_weighted, bool transposed, bool multi_gpu) except + @@ -106,18 +107,21 @@ cdef extern from "experimental/graph_view.hpp" namespace "cugraph::experimental" # cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": - cdef cppclass major_minor_weights_t[vertex_t, weight_t]: + cdef cppclass major_minor_weights_t[vertex_t, edge_t, weight_t]: major_minor_weights_t(const handle_t &handle) pair[unique_ptr[device_buffer], size_t] get_major_wrap() pair[unique_ptr[device_buffer], size_t] get_minor_wrap() pair[unique_ptr[device_buffer], size_t] get_weights_wrap() + unique_ptr[vector[edge_t]] get_edge_counts_wrap() ctypedef fused shuffled_vertices_t: - major_minor_weights_t[int, float] - major_minor_weights_t[int, double] - major_minor_weights_t[long, float] - major_minor_weights_t[long, double] + major_minor_weights_t[int, int, float] + major_minor_weights_t[int, int, double] + major_minor_weights_t[int, long, float] + major_minor_weights_t[int, long, double] + major_minor_weights_t[long, long, float] + major_minor_weights_t[long, long, double] # 3. return type for renumber: # @@ -151,13 +155,12 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": # cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": - cdef unique_ptr[major_minor_weights_t[vertex_t, weight_t]] call_shuffle[vertex_t, edge_t, weight_t]( + cdef unique_ptr[major_minor_weights_t[vertex_t, edge_t, weight_t]] call_shuffle[vertex_t, edge_t, weight_t]( const handle_t &handle, vertex_t *edgelist_major_vertices, vertex_t *edgelist_minor_vertices, weight_t* edgelist_weights, - edge_t num_edges, - bool is_hyper_partitioned) except + + edge_t num_edges) except + # 5. `renumber_edgelist()` wrapper # @@ -167,7 +170,6 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": const handle_t &handle, vertex_t *edgelist_major_vertices, vertex_t *edgelist_minor_vertices, - edge_t num_edges, - bool is_hyper_partitioned, + const vector[edge_t]& edge_counts, bool do_check, bool multi_gpu) except + diff --git a/python/cugraph/structure/renumber_wrapper.pyx b/python/cugraph/structure/renumber_wrapper.pyx index 682c6b32a0f..99626cdee08 100644 --- a/python/cugraph/structure/renumber_wrapper.pyx +++ b/python/cugraph/structure/renumber_wrapper.pyx @@ -22,6 +22,7 @@ from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref import numpy as np +from libcpp.memory cimport make_unique from libcpp.utility cimport move from rmm._lib.device_buffer cimport device_buffer, DeviceBuffer @@ -103,13 +104,11 @@ def renumber(input_df, # maybe use cpdef ? 
raise Exception("Incompatible vertex_t and edge_t types.") # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(major_vertices) + cdef int num_local_edges = len(major_vertices) cdef uintptr_t c_major_vertices = major_vertices.__cuda_array_interface__['data'][0] cdef uintptr_t c_minor_vertices = minor_vertices.__cuda_array_interface__['data'][0] - cdef bool is_hyper_partitioned = False # for now - cdef uintptr_t shuffled_major = NULL cdef uintptr_t shuffled_minor = NULL @@ -119,12 +118,14 @@ def renumber(input_df, # maybe use cpdef ? cdef pair[unique_ptr[device_buffer], size_t] pair_original cdef pair[unique_ptr[device_buffer], size_t] pair_partition - # tparams: vertex_t, weight_t: + # tparams: vertex_t, edge_t, weight_t: # - cdef unique_ptr[major_minor_weights_t[int, float]] ptr_shuffled_32_32 - cdef unique_ptr[major_minor_weights_t[int, double]] ptr_shuffled_32_64 - cdef unique_ptr[major_minor_weights_t[long, float]] ptr_shuffled_64_32 - cdef unique_ptr[major_minor_weights_t[long, double]] ptr_shuffled_64_64 + cdef unique_ptr[major_minor_weights_t[int, int, float]] ptr_shuffled_32_32_32 + cdef unique_ptr[major_minor_weights_t[int, int, double]] ptr_shuffled_32_32_64 + cdef unique_ptr[major_minor_weights_t[int, long, float]] ptr_shuffled_32_64_32 + cdef unique_ptr[major_minor_weights_t[int, long, double]] ptr_shuffled_32_64_64 + cdef unique_ptr[major_minor_weights_t[long, long, float]] ptr_shuffled_64_64_32 + cdef unique_ptr[major_minor_weights_t[long, long, double]] ptr_shuffled_64_64_64 # tparams: vertex_t, edge_t: # @@ -132,6 +133,11 @@ def renumber(input_df, # maybe use cpdef ? cdef unique_ptr[renum_quad_t[int, long]] ptr_renum_quad_32_64 cdef unique_ptr[renum_quad_t[long, long]] ptr_renum_quad_64_64 + # tparam: vertex_t: + # + cdef unique_ptr[vector[int]] edge_counts_32 + cdef unique_ptr[vector[long]] edge_counts_64 + # tparam: vertex_t: # cdef unique_ptr[vector[int]] uniq_partition_vector_32 @@ -143,31 +149,32 @@ def renumber(input_df, # maybe use cpdef ? if ( edge_t == np.dtype("int32")): if( weight_t == np.dtype("float32")): if(is_multi_gpu): - ptr_shuffled_32_32.reset(call_shuffle[int, int, float](deref(handle_ptr), + ptr_shuffled_32_32_32.reset(call_shuffle[int, int, float](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) - shuffled_df = renumber_helper(ptr_shuffled_32_32.get(), vertex_t, weights) + num_local_edges).release()) + shuffled_df = renumber_helper(ptr_shuffled_32_32_32.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] - num_partition_edges = len(shuffled_df) + num_local_edges = len(shuffled_df) if not transposed: major = 'src'; minor = 'dst' else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_32 = move(ptr_shuffled_32_32_32.get().get_edge_counts_wrap()) else: shuffled_df = input_df - + edge_counts_32 = make_unique[vector[int]](1, num_local_edges) + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + ptr_renum_quad_32_32.reset(call_renumber[int, int](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_32.get()), 1, mg_flag).release()) @@ -190,8 +197,7 @@ def renumber(input_df, # maybe use cpdef ? 
uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_32.get().get_num_vertices()), dtype=vertex_t) # create new cudf df # @@ -205,24 +211,25 @@ def renumber(input_df, # maybe use cpdef ? elif( weight_t == np.dtype("float64")): if(is_multi_gpu): - ptr_shuffled_32_64.reset(call_shuffle[int, int, double](deref(handle_ptr), + ptr_shuffled_32_32_64.reset(call_shuffle[int, int, double](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_local_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_32_64.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_32_32_64.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] - num_partition_edges = len(shuffled_df) + num_local_edges = len(shuffled_df) if not transposed: major = 'src'; minor = 'dst' else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_32 = move(ptr_shuffled_32_32_64.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_32 = make_unique[vector[int]](1, num_local_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] @@ -230,8 +237,7 @@ def renumber(input_df, # maybe use cpdef ? ptr_renum_quad_32_32.reset(call_renumber[int, int](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_32.get()), do_check, mg_flag).release()) @@ -254,8 +260,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_32.get().get_num_vertices()), dtype=vertex_t) # create new cudf df @@ -271,24 +276,25 @@ def renumber(input_df, # maybe use cpdef ? 
elif ( edge_t == np.dtype("int64")): if( weight_t == np.dtype("float32")): if(is_multi_gpu): - ptr_shuffled_32_32.reset(call_shuffle[int, long, float](deref(handle_ptr), + ptr_shuffled_32_64_32.reset(call_shuffle[int, long, float](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_local_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_32_32.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_32_64_32.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] - num_partition_edges = len(shuffled_df) + num_local_edges = len(shuffled_df) if not transposed: major = 'src'; minor = 'dst' else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_32_64_32.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] @@ -296,8 +302,7 @@ def renumber(input_df, # maybe use cpdef ? ptr_renum_quad_32_64.reset(call_renumber[int, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -320,8 +325,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df @@ -335,24 +339,25 @@ def renumber(input_df, # maybe use cpdef ? return renumbered_map, shuffled_df elif( weight_t == np.dtype("float64")): if(is_multi_gpu): - ptr_shuffled_32_64.reset(call_shuffle[int, long, double](deref(handle_ptr), + ptr_shuffled_32_64_64.reset(call_shuffle[int, long, double](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_local_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_32_64.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_32_64_64.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] - num_partition_edges = len(shuffled_df) + num_local_edges = len(shuffled_df) if not transposed: major = 'src'; minor = 'dst' else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_32_64_64.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] @@ -360,8 +365,7 @@ def renumber(input_df, # maybe use cpdef ? ptr_renum_quad_32_64.reset(call_renumber[int, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -384,8 +388,7 @@ def renumber(input_df, # maybe use cpdef ? 
uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df # @@ -401,24 +404,25 @@ def renumber(input_df, # maybe use cpdef ? if ( edge_t == np.dtype("int64")): if( weight_t == np.dtype("float32")): if(is_multi_gpu): - ptr_shuffled_64_32.reset(call_shuffle[long, long, float](deref(handle_ptr), + ptr_shuffled_64_64_32.reset(call_shuffle[long, long, float](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_local_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_64_32.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_64_64_32.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] - num_partition_edges = len(shuffled_df) + num_local_edges = len(shuffled_df) if not transposed: major = 'src'; minor = 'dst' else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_64_64_32.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] @@ -426,8 +430,7 @@ def renumber(input_df, # maybe use cpdef ? ptr_renum_quad_64_64.reset(call_renumber[long, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -450,8 +453,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_64.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_64.get()[0].at(0), - uniq_partition_vector_64.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_64_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df @@ -466,24 +468,25 @@ def renumber(input_df, # maybe use cpdef ? 
elif( weight_t == np.dtype("float64")): if(is_multi_gpu): - ptr_shuffled_64_64.reset(call_shuffle[long, long, double](deref(handle_ptr), + ptr_shuffled_64_64_64.reset(call_shuffle[long, long, double](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_local_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_64_64.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_64_64_64.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] - num_partition_edges = len(shuffled_df) + num_local_edges = len(shuffled_df) if not transposed: major = 'src'; minor = 'dst' else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_64_64_64.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] @@ -491,8 +494,7 @@ def renumber(input_df, # maybe use cpdef ? ptr_renum_quad_64_64.reset(call_renumber[long, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -515,8 +517,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_64.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_64.get()[0].at(0), - uniq_partition_vector_64.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_64_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df
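
Reader's sketch: every dtype branch in the wrapper above repeats the same shuffle/renumber flow, so the following plain-Python pseudocode condenses it once. call_shuffle, call_renumber, and get_edge_counts_wrap() are the wrappers already used in the patch; renumber_one_branch is a hypothetical name introduced purely for illustration, and the tuple standing in for the shuffle result glosses over the Cython pointer plumbing.

    def renumber_one_branch(handle, majors, minors, weights,
                            num_local_edges, is_multi_gpu, do_check, mg_flag):
        if is_multi_gpu:
            # Shuffle edges to their owning GPUs. After this change the shuffle
            # result also carries one edge count per local adjacency-matrix
            # partition, not a single partition-wide total.
            shuffled = call_shuffle(handle, majors, minors, weights, num_local_edges)
            edge_counts = shuffled.get_edge_counts_wrap()
        else:
            # Single GPU: one partition holding every local edge.
            shuffled = (majors, minors, weights)
            edge_counts = [num_local_edges]

        # call_renumber now receives the per-partition counts in place of the
        # removed (num_partition_edges, is_hyper_partitioned) arguments, since
        # 2D partitioning is the only layout once hypergraph partitioning is
        # dropped.
        return call_renumber(handle, shuffled, edge_counts, do_check, mg_flag)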