From 026387e4d9b2ce819fa79e3e98b03259386c04a3 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Tue, 23 Feb 2021 15:19:32 -0500
Subject: [PATCH 01/63] switch the graph partitioning scheme in graph(_view)_t

---
 cpp/include/experimental/graph_view.hpp | 60 ++++---------------
 cpp/src/experimental/graph.cu           | 42 +++++------------
 cpp/src/experimental/graph_view.cu      | 17 +++----
 3 files changed, 25 insertions(+), 94 deletions(-)

diff --git a/cpp/include/experimental/graph_view.hpp b/cpp/include/experimental/graph_view.hpp
index d2ae1150970..f940b1013bf 100644
--- a/cpp/include/experimental/graph_view.hpp
+++ b/cpp/include/experimental/graph_view.hpp
@@ -40,32 +40,11 @@ namespace experimental {
  *
  * We need to partition 1D vertex arrays (storing per vertex values) and the 2D graph adjacency
  * matrix (or transposed 2D graph adjacency matrix) of G. An 1D vertex array of size V is divided to
- * P linear partitions; each partition has the size close to V / P. We consider two different
- * strategies to partition the 2D matrix: the default strategy and the hypergraph partitioning based
- * strategy (the latter is for future extension).
- * FIXME: in the future we may use the latter for both as this leads to simpler communication
- * patterns and better control over parallelism vs memory footprint trade-off.
+ * P linear partitions; each partition has a size close to V / P.
  *
- * In the default case, one GPU will be responsible for 1 rectangular partition. The matrix will be
- * horizontally partitioned first to P_row slabs. Each slab will be further vertically partitioned
- * to P_col rectangles. Each rectangular partition will have the size close to V / P_row by V /
- * P_col.
- *
- * To be more specific, a GPU with (col_comm_rank, row_comm_rank) will be responsible for one
- * rectangular partition [a,b) by [c,d) where a = vertex_partition_offsets[row_comm_size *
- * col_comm_rank], b = vertex_partition_offsets[row_comm_size * (col_comm_rank + 1)], c =
- * vertex_partition_offsets[col_comm_size * row_comm_rank], and d =
- * vertex_partition_offsets[col_comm_size * (row_comm_rank + 1)].
- *
- * In the future, we may apply hyper-graph partitioning to divide V vertices to P groups minimizing
- * edge cuts across groups while balancing the number of vertices in each group. We will also
- * renumber vertices so the vertices in each group are mapped to consecutive integers. Then, there
- * will be more non-zeros in the diagonal partitions of the 2D graph adjacency matrix (or the
- * transposed 2D graph adjacency matrix) than the off-diagonal partitions. The default strategy does
- * not balance the number of nonzeros if hyper-graph partitioning is applied. To solve this problem,
- * the matrix is first horizontally partitioned to P slabs, then each slab will be further
- * vertically partitioned to P_row (instead of P_col in the default case) rectangles. One GPU will
- * be responsible col_comm_size rectangular partitions in this case.
+ * The 2D graph adjacency matrix is first horizontally partitioned into P slabs, then each slab is
+ * further vertically partitioned into P_row rectangles. One GPU will be responsible for
+ * col_comm_size rectangular partitions.
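As a concrete illustration of the scheme introduced above (and made precise by the index formulas that follow in this comment), here is a minimal host-only C++ sketch, separate from the patch itself, of the arithmetic the updated partition_t accessors perform; the 2 x 3 process grid and the offsets in main() are made-up toy values:

#include <cstdio>
#include <vector>

int main()
{
  // Hypothetical 2 x 3 process grid: P = col_comm_size * row_comm_size = 6,
  // so vertex_partition_offsets has P + 1 entries (toy values).
  int const row_comm_size = 3;
  int const col_comm_size = 2;
  std::vector<int> const vertex_partition_offsets{0, 10, 25, 30, 45, 55, 60};

  for (int col_comm_rank = 0; col_comm_rank < col_comm_size; ++col_comm_rank) {
    for (int row_comm_rank = 0; row_comm_rank < row_comm_size; ++row_comm_rank) {
      // the minor (vertical) range [c, d) is shared by all partitions of this GPU.
      auto c = vertex_partition_offsets[col_comm_rank * row_comm_size];
      auto d = vertex_partition_offsets[(col_comm_rank + 1) * row_comm_size];
      printf("GPU (%d,%d): minor range [%d,%d)\n", col_comm_rank, row_comm_rank, c, d);
      // each GPU holds col_comm_size rectangular partitions [a_i, b_i) by [c, d).
      for (int i = 0; i < col_comm_size; ++i) {
        auto a = vertex_partition_offsets[row_comm_size * i + row_comm_rank];
        auto b = vertex_partition_offsets[row_comm_size * i + row_comm_rank + 1];
        printf("  matrix partition %d: major range [%d,%d)\n", i, a, b);
      }
    }
  }
  return 0;
}

The printed major ranges correspond to get_matrix_partition_major_first()/_last() and the minor range to get_matrix_partition_minor_first()/_last() in the updated partition_t.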
* * To be more specific, a GPU with (col_comm_rank, row_comm_rank) will be responsible for * col_comm_size rectangular partitions [a_i,b_i) by [c,d) where a_i = @@ -83,13 +62,11 @@ template class partition_t { public: partition_t(std::vector const& vertex_partition_offsets, - bool hypergraph_partitioned, int row_comm_size, int col_comm_size, int row_comm_rank, int col_comm_rank) : vertex_partition_offsets_(vertex_partition_offsets), - hypergraph_partitioned_(hypergraph_partitioned), comm_rank_(col_comm_rank * row_comm_size + row_comm_rank), row_comm_size_(row_comm_size), col_comm_size_(col_comm_size), @@ -157,10 +134,7 @@ class partition_t { get_vertex_partition_first(vertex_partition_idx); } - size_t get_number_of_matrix_partitions() const - { - return hypergraph_partitioned_ ? col_comm_size_ : 1; - } + size_t get_number_of_matrix_partitions() const { return col_comm_size_; } // major: row of the graph adjacency matrix (if the graph adjacency matrix is stored as is) or // column of the graph adjacency matrix (if the transposed graph adjacency matrix is stored). @@ -173,16 +147,12 @@ class partition_t { vertex_t get_matrix_partition_major_first(size_t partition_idx) const { - return hypergraph_partitioned_ - ? vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_] - : vertex_partition_offsets_[col_comm_rank_ * row_comm_size_]; + return vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_]; } vertex_t get_matrix_partition_major_last(size_t partition_idx) const { - return hypergraph_partitioned_ - ? vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_ + 1] - : vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_]; + return vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_ + 1]; } vertex_t get_matrix_partition_major_value_start_offset(size_t partition_idx) const @@ -202,24 +172,16 @@ class partition_t { vertex_t get_matrix_partition_minor_first() const { - return hypergraph_partitioned_ ? vertex_partition_offsets_[col_comm_rank_ * row_comm_size_] - : vertex_partition_offsets_[row_comm_rank_ * col_comm_size_]; + return vertex_partition_offsets_[col_comm_rank_ * row_comm_size_]; } vertex_t get_matrix_partition_minor_last() const { - return hypergraph_partitioned_ - ? 
vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_] - : vertex_partition_offsets_[(row_comm_rank_ + 1) * col_comm_size_]; + return vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_]; } - // FIXME: this function may be removed if we use the same partitioning strategy whether hypergraph - // partitioning is applied or not - bool is_hypergraph_partitioned() const { return hypergraph_partitioned_; } - private: std::vector vertex_partition_offsets_{}; // size = P + 1 - bool hypergraph_partitioned_{false}; int comm_rank_{0}; int row_comm_size_{0}; @@ -445,8 +407,6 @@ class graph_view_t(col_comm_size))) || - (!(partition.is_hypergraph_partitioned()) && (edgelists.size() == 1)), + CUGRAPH_EXPECTS(edgelists.size() == static_cast(col_comm_size), "Invalid input argument: errneous edgelists.size()."); // optional expensive checks (part 1/3) @@ -315,22 +313,12 @@ graph_t aggregate_segment_offsets(0, default_stream); - if (partition.is_hypergraph_partitioned()) { - rmm::device_uvector aggregate_segment_offsets( - col_comm_size * segment_offsets.size(), default_stream); - col_comm.allgather(segment_offsets.data(), - aggregate_segment_offsets.data(), - segment_offsets.size(), - default_stream); - } else { - rmm::device_uvector aggregate_segment_offsets( - row_comm_size * segment_offsets.size(), default_stream); - row_comm.allgather(segment_offsets.data(), - aggregate_segment_offsets.data(), - segment_offsets.size(), - default_stream); - } + rmm::device_uvector aggregate_segment_offsets(col_comm_size * segment_offsets.size(), + default_stream); + col_comm.allgather(segment_offsets.data(), + aggregate_segment_offsets.data(), + segment_offsets.size(), + default_stream); vertex_partition_segment_offsets_.resize(aggregate_segment_offsets.size()); raft::update_host(vertex_partition_segment_offsets_.data(), @@ -338,18 +326,10 @@ graph_t(row_comm_size))) || - (!(partition.is_hypergraph_partitioned()) && (adj_matrix_partition_offsets.size() == 1)), - "Internal Error: erroneous adj_matrix_partition_offsets.size()."); + CUGRAPH_EXPECTS(adj_matrix_partition_offsets.size() == static_cast(row_comm_size), + "Internal Error: erroneous adj_matrix_partition_offsets.size()."); CUGRAPH_EXPECTS((sorted_by_global_degree_within_vertex_partition && (vertex_partition_segment_offsets.size() == - (partition.is_hypergraph_partitioned() ? 
col_comm_size : row_comm_size) * - (detail::num_segments_per_vertex_partition + 1))) || + col_comm_size * (detail::num_segments_per_vertex_partition + 1))) || (!sorted_by_global_degree_within_vertex_partition && (vertex_partition_segment_offsets.size() == 0)), "Internal Error: vertex_partition_segment_offsets.size() does not match " @@ -189,8 +185,7 @@ graph_view_t Date: Wed, 24 Feb 2021 00:36:24 -0500 Subject: [PATCH 02/63] switch the graph partitioning schme in patter accelerator headers and graph_utils.cuh --- .../experimental/detail/graph_utils.cuh | 67 +-- .../patterns/copy_to_adj_matrix_row_col.cuh | 434 +++++++----------- .../copy_v_transform_reduce_in_out_nbr.cuh | 120 ++--- ...ransform_reduce_key_aggregated_out_nbr.cuh | 25 +- ...orm_reduce_by_adj_matrix_row_col_key_e.cuh | 17 +- .../update_frontier_v_push_if_out_nbr.cuh | 62 +-- 6 files changed, 234 insertions(+), 491 deletions(-) diff --git a/cpp/include/experimental/detail/graph_utils.cuh b/cpp/include/experimental/detail/graph_utils.cuh index 3ac2e2163c6..380a75b5c34 100644 --- a/cpp/include/experimental/detail/graph_utils.cuh +++ b/cpp/include/experimental/detail/graph_utils.cuh @@ -55,63 +55,36 @@ rmm::device_uvector compute_major_degree( rmm::device_uvector degrees(0, handle.get_stream()); vertex_t max_num_local_degrees{0}; - for (int i = 0; i < (partition.is_hypergraph_partitioned() ? col_comm_size : row_comm_size); - ++i) { - auto vertex_partition_idx = partition.is_hypergraph_partitioned() - ? static_cast(i * row_comm_size + row_comm_rank) - : static_cast(col_comm_rank * row_comm_size + i); + for (int i = 0; i < col_comm_size; ++i) { + auto vertex_partition_idx = static_cast(i * row_comm_size + row_comm_rank); auto vertex_partition_size = partition.get_vertex_partition_size(vertex_partition_idx); max_num_local_degrees = std::max(max_num_local_degrees, vertex_partition_size); - if (i == (partition.is_hypergraph_partitioned() ? col_comm_rank : row_comm_rank)) { - degrees.resize(vertex_partition_size, handle.get_stream()); - } + if (i == col_comm_rank) { degrees.resize(vertex_partition_size, handle.get_stream()); } } local_degrees.resize(max_num_local_degrees, handle.get_stream()); - for (int i = 0; i < (partition.is_hypergraph_partitioned() ? col_comm_size : row_comm_size); - ++i) { - auto vertex_partition_idx = partition.is_hypergraph_partitioned() - ? static_cast(i * row_comm_size + row_comm_rank) - : static_cast(col_comm_rank * row_comm_size + i); + for (int i = 0; i < col_comm_size; ++i) { + auto vertex_partition_idx = static_cast(i * row_comm_size + row_comm_rank); vertex_t major_first{}; vertex_t major_last{}; std::tie(major_first, major_last) = partition.get_vertex_partition_range(vertex_partition_idx); - auto p_offsets = - partition.is_hypergraph_partitioned() - ? adj_matrix_partition_offsets[i] - : adj_matrix_partition_offsets[0] + - (major_first - partition.get_vertex_partition_first(col_comm_rank * row_comm_size)); + auto p_offsets = adj_matrix_partition_offsets[i]; thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), thrust::make_counting_iterator(vertex_t{0}), thrust::make_counting_iterator(major_last - major_first), local_degrees.data(), [p_offsets] __device__(auto i) { return p_offsets[i + 1] - p_offsets[i]; }); - if (partition.is_hypergraph_partitioned()) { - col_comm.reduce(local_degrees.data(), - i == col_comm_rank ? 
degrees.data() : static_cast(nullptr), - static_cast(major_last - major_first), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } else { - row_comm.reduce(local_degrees.data(), - i == row_comm_rank ? degrees.data() : static_cast(nullptr), - static_cast(major_last - major_first), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } + col_comm.reduce(local_degrees.data(), + i == col_comm_rank ? degrees.data() : static_cast(nullptr), + static_cast(major_last - major_first), + raft::comms::op_t::SUM, + i, + handle.get_stream()); } - raft::comms::status_t status{}; - if (partition.is_hypergraph_partitioned()) { - status = - col_comm.sync_stream(handle.get_stream()); // this is neessary as local_degrees will become - // out-of-scope once this function returns. - } else { - status = - row_comm.sync_stream(handle.get_stream()); // this is neessary as local_degrees will become - // out-of-scope once this function returns. - } + // FIXME: is this necessary? + auto status = + col_comm.sync_stream(handle.get_stream()); // this is necessary as local_degrees will become + // out-of-scope once this function returns. CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); return degrees; @@ -154,7 +127,6 @@ struct compute_gpu_id_from_vertex_t { template struct compute_gpu_id_from_edge_t { - bool hypergraph_partitioned{false}; int comm_size{0}; int row_comm_size{0}; int col_comm_size{0}; @@ -164,12 +136,7 @@ struct compute_gpu_id_from_edge_t { cuco::detail::MurmurHash3_32 hash_func{}; auto major_comm_rank = static_cast(hash_func(major) % comm_size); auto minor_comm_rank = static_cast(hash_func(minor) % comm_size); - if (hypergraph_partitioned) { - return (minor_comm_rank / col_comm_size) * row_comm_size + (major_comm_rank % row_comm_size); - } else { - return (major_comm_rank - (major_comm_rank % row_comm_size)) + - (minor_comm_rank / col_comm_size); - } + return (minor_comm_rank / col_comm_size) * row_comm_size + (major_comm_rank % row_comm_size); } }; diff --git a/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh b/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh index d4559de06af..26876957b44 100644 --- a/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh @@ -51,31 +51,27 @@ void copy_to_matrix_major(raft::handle_t const& handle, MatrixMajorValueOutputIterator matrix_major_value_output_first) { if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - std::vector rx_counts(row_comm_size, size_t{0}); - std::vector displacements(row_comm_size, size_t{0}); - for (int i = 0; i < row_comm_size; ++i) { - rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i); - displacements[i] = (i == 0) ? 
0 : displacements[i - 1] + rx_counts[i - 1]; - } - device_allgatherv(row_comm, - vertex_value_input_first, - matrix_major_value_output_first, - rx_counts, - displacements, - handle.get_stream()); + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + std::vector rx_counts(col_comm_size, size_t{0}); + std::vector displacements(col_comm_size, size_t{0}); + for (int i = 0; i < col_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(i * row_comm_size + row_comm_rank); + displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; } + device_allgatherv(col_comm, + vertex_value_input_first, + matrix_major_value_output_first, + rx_counts, + displacements, + handle.get_stream()); } else { assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols() @@ -101,80 +97,76 @@ void copy_to_matrix_major(raft::handle_t const& handle, using vertex_t = typename GraphViewType::vertex_type; if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - auto rx_counts = - host_scalar_allgather(row_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - - matrix_partition_device_t matrix_partition(graph_view, 0); - for (int i = 0; i < row_comm_size; ++i) { - rmm::device_uvector rx_vertices(row_comm_rank == i ? size_t{0} : rx_counts[i], - handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_counts[i], - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(rx_tmp_buffer); + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + auto rx_counts = + host_scalar_allgather(col_comm, + static_cast(thrust::distance(vertex_first, vertex_last)), + handle.get_stream()); + + for (int i = 0; i < col_comm_size; ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); + + rmm::device_uvector rx_vertices(col_comm_rank == i ? 
size_t{0} : rx_counts[i], + handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(rx_counts[i], + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); - if (row_comm_rank == i) { - vertex_partition_device_t vertex_partition(graph_view); - auto map_first = - thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { - return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); - }); - // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a - // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_value_input_first, - rx_value_first); - } + if (col_comm_rank == i) { + vertex_partition_device_t vertex_partition(graph_view); + auto map_first = + thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { + return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); + }); + // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a + // permutation iterator (and directly gathers to the internal buffer) + thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + map_first, + map_first + thrust::distance(vertex_first, vertex_last), + vertex_value_input_first, + rx_value_first); + } - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - device_bcast( - row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); - device_bcast( - row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + device_bcast( + col_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(col_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - if (row_comm_rank == i) { - auto map_first = - thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { - return matrix_partition.get_major_offset_from_major_nocheck(v); - }); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_major_value_output_first); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_major_offset_from_major_nocheck(v); - }); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_major_value_output_first); - } + if (col_comm_rank == i) { + auto map_first = + thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { + return matrix_partition.get_major_offset_from_major_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the 
internal buffer) + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_major_value_output_first); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), [matrix_partition] __device__(auto v) { + return matrix_partition.get_major_offset_from_major_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_major_value_output_first); } } } else { @@ -199,59 +191,27 @@ void copy_to_matrix_minor(raft::handle_t const& handle, MatrixMinorValueOutputIterator matrix_minor_value_output_first) { if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph - // partitioning - auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; - auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - // FIXME: this branch may be no longer necessary with NCCL backend - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_value_input_first, - vertex_value_input_first + graph_view.get_number_of_local_vertices(), - matrix_minor_value_output_first + - (graph_view.get_vertex_partition_first(comm_src_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size))); - } else { - device_sendrecv( - comm, - vertex_value_input_first, - static_cast(graph_view.get_number_of_local_vertices()), - comm_dst_rank, - matrix_minor_value_output_first + - (graph_view.get_vertex_partition_first(comm_src_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)), - static_cast(graph_view.get_vertex_partition_size(comm_src_rank)), - comm_src_rank, - handle.get_stream()); - } - - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - for (int i = 0; i < col_comm_size; ++i) { - auto offset = graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size); - auto count = graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + i); - device_bcast(col_comm, - matrix_minor_value_output_first + offset, - matrix_minor_value_output_first + offset, - count, - i, - handle.get_stream()); - } + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = 
handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + std::vector rx_counts(row_comm_size, size_t{0}); + std::vector displacements(row_comm_size, size_t{0}); + for (int i = 0; i < row_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i); + displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; } + device_allgatherv(row_comm, + vertex_value_input_first, + matrix_minor_value_output_first, + rx_counts, + displacements, + handle.get_stream()); } else { assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows() @@ -277,143 +237,75 @@ void copy_to_matrix_minor(raft::handle_t const& handle, using vertex_t = typename GraphViewType::vertex_type; if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - // FIXME: this P2P is unnecessary if apply the same partitioning scheme regardless of - // hypergraph partitioning is applied or not - auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; - auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - size_t tx_count = thrust::distance(vertex_first, vertex_last); - size_t rx_count{}; - // FIXME: it seems like raft::isend and raft::irecv do not properly handle the destination (or - // source) == self case. Need to double check and fix this if this is indeed the case (or RAFT - // may use ncclSend/ncclRecv instead of UCX for device data). - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - rx_count = tx_count; - } else { - std::vector count_requests(2); - comm.isend(&tx_count, 1, comm_dst_rank, 0 /* tag */, count_requests.data()); - comm.irecv(&rx_count, 1, comm_src_rank, 0 /* tag */, count_requests.data() + 1); - comm.waitall(count_requests.size(), count_requests.data()); - } - - vertex_partition_device_t vertex_partition(graph_view); - rmm::device_uvector dst_vertices(rx_count, handle.get_stream()); - auto dst_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_count, + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + auto rx_counts = + host_scalar_allgather(row_comm, + static_cast(thrust::distance(vertex_first, vertex_last)), + handle.get_stream()); + + matrix_partition_device_t matrix_partition(graph_view, 0); + for (int i = 0; i < row_comm_size; ++i) { + rmm::device_uvector rx_vertices(row_comm_rank == i ? 
size_t{0} : rx_counts[i], + handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(rx_counts[i], handle.get_stream()); - auto dst_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(dst_tmp_buffer); - if (comm_src_rank == comm_rank) { - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_first, - vertex_last, - dst_vertices.begin()); - auto map_first = - thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { - return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); - }); - thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_value_input_first, - dst_value_first); - } else { - auto src_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(tx_count, - handle.get_stream()); - auto src_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(src_tmp_buffer); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); + if (row_comm_rank == i) { + vertex_partition_device_t vertex_partition(graph_view); auto map_first = thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); }); + // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a + // permutation iterator (and directly gathers to the internal buffer) thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), map_first, map_first + thrust::distance(vertex_first, vertex_last), vertex_value_input_first, - src_value_first); - - device_sendrecv( - comm, - vertex_first, - tx_count, - comm_dst_rank, - dst_vertices.begin(), - rx_count, - comm_src_rank, - handle.get_stream()); - - device_sendrecv(comm, - src_value_first, - tx_count, - comm_dst_rank, - dst_value_first, - rx_count, - comm_src_rank, - handle.get_stream()); + rx_value_first); } - // FIXME: now we can clear tx_tmp_buffer - - auto rx_counts = host_scalar_allgather(col_comm, rx_count, handle.get_stream()); - - matrix_partition_device_t matrix_partition(graph_view, 0); - for (int i = 0; i < col_comm_size; ++i) { - rmm::device_uvector rx_vertices(col_comm_rank == i ? 
size_t{0} : rx_counts[i], - handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_counts[i], - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(rx_tmp_buffer); - - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - device_bcast(col_comm, - dst_vertices.begin(), - rx_vertices.begin(), - rx_counts[i], - i, - handle.get_stream()); - device_bcast( - col_comm, dst_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - - if (col_comm_rank == i) { - auto map_first = thrust::make_transform_iterator( - dst_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_minor_offset_from_minor_nocheck(v); - }); - - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - dst_value_first, - dst_value_first + rx_counts[i], - map_first, - matrix_minor_value_output_first); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_minor_offset_from_minor_nocheck(v); - }); + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + device_bcast( + row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_minor_value_output_first); - } + if (row_comm_rank == i) { + auto map_first = + thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { + return matrix_partition.get_minor_offset_from_minor_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_minor_value_output_first); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), [matrix_partition] __device__(auto v) { + return matrix_partition.get_minor_offset_from_minor_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_minor_value_output_first); } } } else { diff --git a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh index 3059cf95852..58a9fb73bbc 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh @@ -362,15 +362,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - auto loop_count = size_t{1}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? 
graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } - auto comm_rank = handle.comms_initialized() ? handle.get_comms().get_rank() : int{0}; + auto comm_rank = GraphViewType::is_multi_gpu ? handle.get_comms().get_rank() : int{0}; auto minor_tmp_buffer_size = (GraphViewType::is_multi_gpu && (in != GraphViewType::is_adj_matrix_transposed)) @@ -388,8 +380,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const row_comm_rank = row_comm.get_rank(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - minor_init = graph_view.is_hypergraph_partitioned() ? (row_comm_rank == 0) ? init : T{} - : (col_comm_rank == 0) ? init : T{}; + minor_init = (row_comm_rank == 0) ? init : T {} } if (GraphViewType::is_multi_gpu) { @@ -407,9 +398,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, assert(minor_tmp_buffer_size == 0); } - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); auto major_tmp_buffer_size = vertex_t{0}; if (GraphViewType::is_multi_gpu) { @@ -418,12 +408,9 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - major_tmp_buffer_size = - (in == GraphViewType::is_adj_matrix_transposed) - ? graph_view.is_hypergraph_partitioned() - ? matrix_partition.get_major_size() - : graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i) - : vertex_t{0}; + major_tmp_buffer_size = (in == GraphViewType::is_adj_matrix_transposed) + ? matrix_partition.get_major_size() + : vertex_t{0}; } auto major_tmp_buffer = allocate_dataframe_buffer(major_tmp_buffer_size, handle.get_stream()); @@ -432,12 +419,9 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto major_init = T{}; if (in == GraphViewType::is_adj_matrix_transposed) { if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - major_init = graph_view.is_hypergraph_partitioned() ? (col_comm_rank == 0) ? init : T{} - : (row_comm_rank == 0) ? init : T{}; + major_init = (col_comm_rank == 0) ? init : T{}; } else { major_init = init; } @@ -450,8 +434,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const row_comm_size = row_comm.get_size(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - comm_root_rank = graph_view.is_hypergraph_partitioned() ? 
i * row_comm_size + row_comm_rank - : col_comm_rank * row_comm_size + i; + comm_root_rank = i * row_comm_size + row_comm_rank; } if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) { @@ -505,25 +488,14 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - if (graph_view.is_hypergraph_partitioned()) { - device_reduce( - col_comm, - major_buffer_first, - vertex_value_output_first, - static_cast(graph_view.get_vertex_partition_size(i * row_comm_size + i)), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } else { - device_reduce(row_comm, - major_buffer_first, - vertex_value_output_first, - static_cast( - graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } + device_reduce( + col_comm, + major_buffer_first, + vertex_value_output_first, + static_cast(graph_view.get_vertex_partition_size(i * row_comm_size + i)), + raft::comms::op_t::SUM, + i, + handle.get_stream()); } } @@ -537,53 +509,17 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - for (int i = 0; i < col_comm_size; ++i) { - auto offset = (graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)); - auto size = static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + i)); - device_reduce(col_comm, - minor_buffer_first + offset, - minor_buffer_first + offset, - size, - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } - - // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph - // partitioning - auto comm_src_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - auto comm_dst_rank = row_comm_rank * col_comm_size + col_comm_rank; - // FIXME: this branch may no longer necessary with NCCL backend - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - auto offset = - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + col_comm_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size); - auto size = static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + col_comm_rank)); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - minor_buffer_first + offset, - minor_buffer_first + offset + size, - vertex_value_output_first); - } else { - device_sendrecv( - comm, - minor_buffer_first + - (graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + col_comm_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)), - static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + col_comm_rank)), - comm_dst_rank, - vertex_value_output_first, - static_cast(graph_view.get_vertex_partition_size(comm_rank)), - comm_src_rank, - handle.get_stream()); - } + for (int i = 0; i < row_comm_size; ++i) { + auto offset = (graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size + i) - + graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size)); + device_reduce(row_comm, + minor_buffer_first + offset, + vertex_value_output_first, + static_cast( + 
graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)), + raft::comms::op_t::SUM, + i, + handle.get_stream()); } } } diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 785f8197aff..b2a30baece4 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -277,20 +277,10 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( // 2. aggregate each vertex out-going edges based on keys and transform-reduce. - auto loop_count = size_t{1}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } - rmm::device_uvector major_vertices(0, handle.get_stream()); auto e_op_result_buffer = allocate_dataframe_buffer(0, handle.get_stream()); - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); int comm_root_rank = 0; if (GraphViewType::is_multi_gpu) { @@ -299,8 +289,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( auto const row_comm_size = row_comm.get_size(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - comm_root_rank = graph_view.is_hypergraph_partitioned() ? i * row_comm_size + row_comm_rank - : col_comm_rank * row_comm_size + i; + comm_root_rank = i * row_comm_size + row_comm_rank; } auto num_edges = thrust::transform_reduce( @@ -358,9 +347,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( tmp_key_aggregated_edge_weights.resize(tmp_major_vertices.size(), handle.get_stream()); if (GraphViewType::is_multi_gpu) { - auto& sub_comm = handle.get_subcomm(graph_view.is_hypergraph_partitioned() - ? cugraph::partition_2d::key_naming_t().col_name() - : cugraph::partition_2d::key_naming_t().row_name()); + auto& sub_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const sub_comm_size = sub_comm.get_size(); triplet_first = @@ -416,9 +403,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( tmp_key_aggregated_edge_weights.shrink_to_fit(handle.get_stream()); if (GraphViewType::is_multi_gpu) { - auto& sub_comm = handle.get_subcomm(graph_view.is_hypergraph_partitioned() - ? 
cugraph::partition_2d::key_naming_t().col_name() - : cugraph::partition_2d::key_naming_t().row_name()); + auto& sub_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const sub_comm_rank = sub_comm.get_rank(); auto const sub_comm_size = sub_comm.get_size(); diff --git a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh index 70b6dc92752..27b52f992ad 100644 --- a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -150,20 +150,10 @@ transform_reduce_by_adj_matrix_row_col_key_e( using edge_t = typename GraphViewType::edge_type; using weight_t = typename GraphViewType::weight_type; - auto loop_count = size_t{1}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } - rmm::device_uvector keys(0, handle.get_stream()); auto value_buffer = allocate_dataframe_buffer(0, handle.get_stream()); - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); int comm_root_rank = 0; if (GraphViewType::is_multi_gpu) { @@ -172,8 +162,7 @@ transform_reduce_by_adj_matrix_row_col_key_e( auto const row_comm_size = row_comm.get_size(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - comm_root_rank = graph_view.is_hypergraph_partitioned() ? i * row_comm_size + row_comm_rank - : col_comm_rank * row_comm_size + i; + comm_root_rank = i * row_comm_size + row_comm_rank; } auto num_edges = thrust::transform_reduce( diff --git a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh index 4c76322fa79..14484e75cbf 100644 --- a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh +++ b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh @@ -358,18 +358,8 @@ void update_frontier_v_push_if_out_nbr( vertex_frontier.set_buffer_idx_value(0); - auto loop_count = size_t{1}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } - - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 
0 : i); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); rmm::device_uvector frontier_rows( 0, handle.get_stream()); // relevant only if GraphViewType::is_multi_gpu is true @@ -382,22 +372,18 @@ void update_frontier_v_push_if_out_nbr( auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - auto sub_comm_rank = graph_view.is_hypergraph_partitioned() ? col_comm_rank : row_comm_rank; - frontier_size = host_scalar_bcast( - graph_view.is_hypergraph_partitioned() ? col_comm : row_comm, - (static_cast(sub_comm_rank) == i) ? thrust::distance(vertex_first, vertex_last) - : size_t{0}, - i, - handle.get_stream()); + auto sub_comm_rank = col_comm_rank; + frontier_size = host_scalar_bcast(col_comm, + (static_cast(sub_comm_rank) == i) + ? thrust::distance(vertex_first, vertex_last) + : size_t{0}, + i, + handle.get_stream()); if (static_cast(sub_comm_rank) != i) { frontier_rows.resize(frontier_size, handle.get_stream()); } - device_bcast(graph_view.is_hypergraph_partitioned() ? col_comm : row_comm, - vertex_first, - frontier_rows.begin(), - frontier_size, - i, - handle.get_stream()); + device_bcast( + col_comm, vertex_first, frontier_rows.begin(), frontier_size, i, handle.get_stream()); } else { frontier_size = thrust::distance(vertex_first, vertex_last); } @@ -515,12 +501,9 @@ void update_frontier_v_push_if_out_nbr( auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - std::vector h_vertex_lasts(graph_view.is_hypergraph_partitioned() ? row_comm_size - : col_comm_size); + std::vector h_vertex_lasts(row_comm_size); for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { - h_vertex_lasts[i] = graph_view.get_vertex_partition_last( - graph_view.is_hypergraph_partitioned() ? col_comm_rank * row_comm_size + i - : row_comm_rank * col_comm_size + i); + h_vertex_lasts[i] = graph_view.get_vertex_partition_last(col_comm_rank * row_comm_size + i); } rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); @@ -544,14 +527,11 @@ void update_frontier_v_push_if_out_nbr( std::adjacent_difference( h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); - std::vector rx_counts(graph_view.is_hypergraph_partitioned() ? row_comm_size - : col_comm_size); + std::vector rx_counts(row_comm_size); std::vector count_requests(tx_counts.size() + rx_counts.size()); size_t tx_self_i = std::numeric_limits::max(); for (size_t i = 0; i < tx_counts.size(); ++i) { - auto comm_dst_rank = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : row_comm_rank * col_comm_size + static_cast(i); + auto comm_dst_rank = col_comm_rank * row_comm_size + static_cast(i); if (comm_dst_rank == comm_rank) { tx_self_i = i; // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms @@ -561,9 +541,7 @@ void update_frontier_v_push_if_out_nbr( } } for (size_t i = 0; i < rx_counts.size(); ++i) { - auto comm_src_rank = graph_view.is_hypergraph_partitioned() - ? 
col_comm_rank * row_comm_size + static_cast(i) - : static_cast(i) * row_comm_size + comm_rank / col_comm_size; + auto comm_src_rank = col_comm_rank * row_comm_size + static_cast(i); if (comm_src_rank == comm_rank) { assert(tx_self_i != std::numeric_limits::max()); rx_counts[i] = tx_counts[tx_self_i]; @@ -603,14 +581,10 @@ void update_frontier_v_push_if_out_nbr( std::vector tx_dst_ranks(tx_counts.size()); std::vector rx_src_ranks(rx_counts.size()); for (size_t i = 0; i < tx_dst_ranks.size(); ++i) { - tx_dst_ranks[i] = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : row_comm_rank * col_comm_size + static_cast(i); + tx_dst_ranks[i] = col_comm_rank * row_comm_size + static_cast(i); } for (size_t i = 0; i < rx_src_ranks.size(); ++i) { - rx_src_ranks[i] = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : static_cast(i) * row_comm_size + comm_rank / col_comm_size; + rx_src_ranks[i] = col_comm_rank * row_comm_size + static_cast(i); } device_multicast_sendrecv( From 187a5f92a6e59efaeac0605984942bdecf39553d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 25 Feb 2021 23:51:48 -0500 Subject: [PATCH 03/63] function renaming --- cpp/include/utilities/shuffle_comm.cuh | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/include/utilities/shuffle_comm.cuh b/cpp/include/utilities/shuffle_comm.cuh index 7e04c7e1972..e25ce168c7a 100644 --- a/cpp/include/utilities/shuffle_comm.cuh +++ b/cpp/include/utilities/shuffle_comm.cuh @@ -32,11 +32,11 @@ namespace experimental { namespace detail { template -rmm::device_uvector sort_and_count(raft::comms::comms_t const &comm, - ValueIterator tx_value_first /* [INOUT */, - ValueIterator tx_value_last /* [INOUT */, - ValueToGPUIdOp value_to_gpu_id_op, - cudaStream_t stream) +rmm::device_uvector groupby_and_count(raft::comms::comms_t const &comm, + ValueIterator tx_value_first /* [INOUT */, + ValueIterator tx_value_last /* [INOUT */, + ValueToGPUIdOp value_to_gpu_id_op, + cudaStream_t stream) { auto const comm_size = comm.get_size(); @@ -73,12 +73,12 @@ rmm::device_uvector sort_and_count(raft::comms::comms_t const &comm, } template -rmm::device_uvector sort_and_count(raft::comms::comms_t const &comm, - VertexIterator tx_key_first /* [INOUT */, - VertexIterator tx_key_last /* [INOUT */, - ValueIterator tx_value_first /* [INOUT */, - KeyToGPUIdOp key_to_gpu_id_op, - cudaStream_t stream) +rmm::device_uvector groupby_and_count(raft::comms::comms_t const &comm, + VertexIterator tx_key_first /* [INOUT */, + VertexIterator tx_key_last /* [INOUT */, + ValueIterator tx_value_first /* [INOUT */, + KeyToGPUIdOp key_to_gpu_id_op, + cudaStream_t stream) { auto const comm_size = comm.get_size(); @@ -241,7 +241,7 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const &comm, auto const comm_size = comm.get_size(); auto d_tx_value_counts = - detail::sort_and_count(comm, tx_value_first, tx_value_last, value_to_gpu_id_op, stream); + detail::groupby_and_count(comm, tx_value_first, tx_value_last, value_to_gpu_id_op, stream); std::vector tx_counts{}; std::vector tx_offsets{}; @@ -282,7 +282,7 @@ auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const &comm, KeyToGPUIdOp key_to_gpu_id_op, cudaStream_t stream) { - auto d_tx_value_counts = detail::sort_and_count( + auto d_tx_value_counts = detail::groupby_and_count( comm, tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, stream); std::vector tx_counts{}; From 
cfe54ce20ff832604192ec57f2f1b6f9d6263bcd Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Thu, 25 Feb 2021 23:52:35 -0500
Subject: [PATCH 04/63] add additional utility functions to graph_view_t

---
 cpp/include/experimental/graph_view.hpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/cpp/include/experimental/graph_view.hpp b/cpp/include/experimental/graph_view.hpp
index f940b1013bf..72e00848fce 100644
--- a/cpp/include/experimental/graph_view.hpp
+++ b/cpp/include/experimental/graph_view.hpp
@@ -155,6 +155,11 @@ class partition_t {
     return vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_ + 1];
   }
 
+  vertex_t get_matrix_partition_major_size(size_t partition_idx) const
+  {
+    return get_matrix_partition_major_last(partition_idx) -
+           get_matrix_partition_major_first(partition_idx);
+  }
+
   vertex_t get_matrix_partition_major_value_start_offset(size_t partition_idx) const
   {
     return matrix_partition_major_value_start_offsets_[partition_idx];
@@ -180,6 +185,11 @@ class partition_t {
     return vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_];
   }
 
+  vertex_t get_matrix_partition_minor_size() const
+  {
+    return get_matrix_partition_minor_last() - get_matrix_partition_minor_first();
+  }
+
  private:
   std::vector<vertex_t> vertex_partition_offsets_{};  // size = P + 1

From efc58c618c332f0ded59900b6f7d77fb6e3124f5 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Fri, 26 Feb 2021 17:37:12 -0500
Subject: [PATCH 05/63] cosmetic updates

---
 ...opy_v_transform_reduce_key_aggregated_out_nbr.cuh | 12 +++++-------
 cpp/include/utilities/shuffle_comm.cuh              |  4 ++--
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
index 785f8197aff..3d77b410b00 100644
--- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
+++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
@@ -412,16 +412,14 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
       auto rx_sizes =
         host_scalar_gather(sub_comm, tmp_major_vertices.size(), i, handle.get_stream());
-      std::vector<size_t> rx_displs(
-        static_cast<size_t>(sub_comm_rank) == i ? sub_comm_size : int{0}, size_t{0});
+      std::vector<size_t> rx_displs{};
+      rmm::device_uvector<vertex_t> rx_major_vertices(0, handle.get_stream());
       if (static_cast<size_t>(sub_comm_rank) == i) {
+        rx_displs.assign(sub_comm_size, size_t{0});
         std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1);
+        rx_major_vertices.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream());
       }
-      rmm::device_uvector<vertex_t> rx_major_vertices(
-        static_cast<size_t>(sub_comm_rank) == i
-          ?
std::accumulate(rx_sizes.begin(), rx_sizes.end(), size_t{0}) - : size_t{0}, - handle.get_stream()); auto rx_tmp_e_op_result_buffer = allocate_dataframe_buffer(rx_major_vertices.size(), handle.get_stream()); diff --git a/cpp/include/utilities/shuffle_comm.cuh b/cpp/include/utilities/shuffle_comm.cuh index e25ce168c7a..b4236cfa57e 100644 --- a/cpp/include/utilities/shuffle_comm.cuh +++ b/cpp/include/utilities/shuffle_comm.cuh @@ -58,7 +58,7 @@ rmm::device_uvector groupby_and_count(raft::comms::comms_t const &comm, thrust::make_constant_iterator(size_t{1}), d_tx_dst_ranks.begin(), d_tx_value_counts.begin()); - if (thrust::distance(d_tx_value_counts.begin(), thrust::get<1>(last)) < comm_size) { + if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < comm_size) { rmm::device_uvector d_counts(comm_size, stream); thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); thrust::scatter(rmm::exec_policy(stream)->on(stream), @@ -100,7 +100,7 @@ rmm::device_uvector groupby_and_count(raft::comms::comms_t const &comm, thrust::make_constant_iterator(size_t{1}), d_tx_dst_ranks.begin(), d_tx_value_counts.begin()); - if (thrust::distance(d_tx_value_counts.begin(), thrust::get<1>(last)) < comm_size) { + if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < comm_size) { rmm::device_uvector d_counts(comm_size, stream); thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); thrust::scatter(rmm::exec_policy(stream)->on(stream), From 2e5e45c61dc5a2f02896f9a843bb0e8e9dce0f6c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 26 Feb 2021 17:38:44 -0500 Subject: [PATCH 06/63] switch the graph partitioning scheme in graph functions --- .../experimental/detail/graph_utils.cuh | 15 + cpp/include/experimental/graph_functions.hpp | 46 +- cpp/src/experimental/coarsen_graph.cu | 394 ++++++---- cpp/src/experimental/renumber_edgelist.cu | 695 +++++++++--------- 4 files changed, 636 insertions(+), 514 deletions(-) diff --git a/cpp/include/experimental/detail/graph_utils.cuh b/cpp/include/experimental/detail/graph_utils.cuh index 380a75b5c34..fbbb72a8947 100644 --- a/cpp/include/experimental/detail/graph_utils.cuh +++ b/cpp/include/experimental/detail/graph_utils.cuh @@ -140,6 +140,21 @@ struct compute_gpu_id_from_edge_t { } }; +template +struct compute_partition_id_from_edge_t { + int comm_size{0}; + int row_comm_size{0}; + int col_comm_size{0}; + + __device__ int operator()(vertex_t major, vertex_t minor) const + { + cuco::detail::MurmurHash3_32 hash_func{}; + auto major_comm_rank = static_cast(hash_func(major) % comm_size); + auto minor_comm_rank = static_cast(hash_func(minor) % comm_size); + return major_comm_rank * col_comm_size + minor_comm_rank / row_comm_size; + } +}; + } // namespace detail } // namespace experimental } // namespace cugraph diff --git a/cpp/include/experimental/graph_functions.hpp b/cpp/include/experimental/graph_functions.hpp index 7b4bb466b97..f02bafb3aea 100644 --- a/cpp/include/experimental/graph_functions.hpp +++ b/cpp/include/experimental/graph_functions.hpp @@ -40,19 +40,9 @@ namespace experimental { * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. - * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as - * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). 
Vertex
- * IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t functor to
- * every (major, minor) pair should return the local GPU ID for this function to work (edges should
- * be pre-shuffled).
- * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
- * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
- * Vertex IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t
- * functor to every (major, minor) pair should return the local GPU ID for this function to work
- * (edges should be pre-shuffled).
- * @param num_edgelist_edges Number of edges in the edgelist.
- * @param is_hypergraph_partitioned Flag indicating whether we are assuming hypergraph partitioning
- * (this flag will be removed in the future).
+ * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as
+ * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
+ * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every
+ * (major, minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix
+ * partition should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is
+ * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
+ * Vertex IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final
+ * target process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every
+ * (major, minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix
+ * partition should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition
+ * assigned to this process).
 * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
 * @return std::tuple, partition_t, vertex_t, edge_t>
 * Quadruplet of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to
@@ -63,10 +53,9 @@ template 
std::enable_if_t, partition_t, vertex_t, edge_t>>
renumber_edgelist(raft::handle_t const& handle,
-                  vertex_t* edgelist_major_vertices /* [INOUT] */,
-                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
-                  edge_t num_edgelist_edges,
-                  bool is_hypergraph_partitioned,
+                  std::vector const& edgelist_major_vertices /* [INOUT] */,
+                  std::vector const& edgelist_minor_vertices /* [INOUT] */,
+                  std::vector const& edgelist_edge_counts,
                   bool do_expensive_check = false);

/**
@@ -115,19 +104,9 @@ std::enable_if_t> renumber_edgelist(
 * the compute_gpu_id_from_vertex_t to every vertex should return the local GPU ID for this function
 * to work (vertices should be pre-shuffled).
 * @param num_local_vertices Number of local vertices.
- * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as
- * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
- * IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t functor to
- * every (major, minor) pair should return the local GPU ID for this function to work (edges should
- * be pre-shuffled).
- * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
- * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
- * Vertex IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t
- * functor to every (major, minor) pair should return the local GPU ID for this function to work
- * (edges should be pre-shuffled).
- * @param num_edgelist_edges Number of edges in the edgelist.
- * @param is_hypergraph_partitioned Flag indicating whether we are assuming hypergraph partitioning
- * (this flag will be removed in the future).
+ * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as
+ * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
+ * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every
+ * (major, minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix
+ * partition should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is
+ * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
+ * Vertex IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final
+ * target process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every
+ * (major, minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix
+ * partition should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition
+ * assigned to this process).
 * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
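 *
 * A minimal, illustrative multi-GPU call sketch (the variable names below are hypothetical, not
 * part of this API), assuming the per-partition edge arrays have already been shuffled as
 * described above:
 *
 * @code
 * std::vector<int32_t*> major_ptrs{part0_majors.data(), part1_majors.data()};
 * std::vector<int32_t*> minor_ptrs{part0_minors.data(), part1_minors.data()};
 * std::vector<int32_t> counts{part0_count, part1_count};
 * rmm::device_uvector<int32_t> labels(0, handle.get_stream());
 * partition_t<int32_t> partition(std::vector<int32_t>(comm_size + 1, 0),
 *                                row_comm_size, col_comm_size, row_comm_rank, col_comm_rank);
 * int32_t num_vertices{};
 * int32_t num_edges{};
 * std::tie(labels, partition, num_vertices, num_edges) =
 *   renumber_edgelist<int32_t, int32_t, true>(
 *     handle, local_vertices.data(), num_local_vertices, major_ptrs, minor_ptrs, counts);
 * @endcode
 *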
* @return std::tuple, partition_t, vertex_t, edge_t> * Quadruplet of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to @@ -140,10 +119,9 @@ std::enable_if_t const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& num_edgelist_edges, bool do_expensive_check = false); /** diff --git a/cpp/src/experimental/coarsen_graph.cu b/cpp/src/experimental/coarsen_graph.cu index 0cd551b0d73..0e43a3b5b07 100644 --- a/cpp/src/experimental/coarsen_graph.cu +++ b/cpp/src/experimental/coarsen_graph.cu @@ -90,61 +90,63 @@ std:: } template -void sort_and_coarsen_edgelist(rmm::device_uvector &edgelist_major_vertices /* [INOUT] */, - rmm::device_uvector &edgelist_minor_vertices /* [INOUT] */, - rmm::device_uvector &edgelist_weights /* [INOUT] */, - cudaStream_t stream) +edge_t groupby_e_and_coarsen_edgelist(vertex_t const *edgelist_major_vertices, + vertex_t const *edgelist_minor_vertices, + weight_t const *edgelist_weights, + edge_t number_of_edges, + cudaStream_t stream) { - auto pair_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices)); - size_t number_of_edges{0}; - if (edgelist_weights.size() > 0) { + if (edgelist_weights != nullptr) { thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), pair_first, pair_first + edgelist_major_vertices.size(), edgelist_weights.begin()); - rmm::device_uvector tmp_edgelist_major_vertices(edgelist_major_vertices.size(), - stream); + rmm::device_uvector tmp_edgelist_major_vertices(number_of_edges, stream); rmm::device_uvector tmp_edgelist_minor_vertices(tmp_edgelist_major_vertices.size(), stream); rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream); auto it = thrust::reduce_by_key( rmm::exec_policy(stream)->on(stream), pair_first, - pair_first + edgelist_major_vertices.size(), - edgelist_weights.begin(), + pair_first + number_of_edges, + edgelist_weights, thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), tmp_edgelist_minor_vertices.begin())), tmp_edgelist_weights.begin()); - number_of_edges = thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it)); + auto ret = + static_cast(thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it))); - edgelist_major_vertices = std::move(tmp_edgelist_major_vertices); - edgelist_minor_vertices = std::move(tmp_edgelist_minor_vertices); - edgelist_weights = std::move(tmp_edgelist_weights); + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), + tmp_edgelist_minor_vertices.begin(), + tmp_edgelist_weights.begin())); + thrust::copy(rmm::exec_policy(stream)->on(stream), + edge_first, + edge_first + ret, + thrust::make_zip_iterator(thrust::make_tuple( + edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights))); + + return ret; } else { thrust::sort(rmm::exec_policy(stream)->on(stream), pair_first, pair_first + edgelist_major_vertices.size()); - auto it = thrust::unique(rmm::exec_policy(stream)->on(stream), - pair_first, - pair_first + edgelist_major_vertices.size()); - number_of_edges = thrust::distance(pair_first, it); + return static_cast( + thrust::distance(pair_first, + thrust::unique(rmm::exec_policy(stream)->on(stream), + pair_first, + pair_first + edgelist_major_vertices.size()))); } - - 
edgelist_major_vertices.resize(number_of_edges, stream); - edgelist_minor_vertices.resize(number_of_edges, stream); - edgelist_weights.resize(number_of_edges, stream); - edgelist_major_vertices.shrink_to_fit(stream); - edgelist_minor_vertices.shrink_to_fit(stream); - edgelist_weights.shrink_to_fit(stream); } template std:: tuple, rmm::device_uvector, rmm::device_uvector> - compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist( + compressed_sparse_to_relabeled_and_grouped_and_coarsened_edgelist( edge_t const *compressed_sparse_offsets, vertex_t const *compressed_sparse_indices, weight_t const *compressed_sparse_weights, @@ -182,8 +184,20 @@ std:: p_minor_labels[thrust::get<1>(val) - minor_first]); }); - sort_and_coarsen_edgelist( - edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights, stream); + auto number_of_edges = + groupby_e_and_coarsen_edgelist(edgelist_major_vertices.data(), + edgelist_minor_vertices.data(), + edgelist_weights.data(), + static_cast(edgelist_major_vertices.size()), + stream); + edgelist_major_vertices.resize(number_of_edges, handle.get_stream()); + edgelist_major_vertices.shrink_to_fit(handle.get_stream()); + edgelist_minor_vertices.resize(number_of_edges, handle.get_stream()); + edgelist_minor_vertices.shrink_to_fit(handle.get_stream()); + if (edgelist_weights.size() > 0) { + edgelist_weights.resize(number_of_edges, handle.get_stream()); + edgelist_weights.shrink_to_fit(handle.get_stream()); + } return std::make_tuple(std::move(edgelist_major_vertices), std::move(edgelist_minor_vertices), @@ -220,47 +234,52 @@ coarsen_graph( // currently, nothing to do } - // 1. locally construct coarsened edge list + // 1. construct coarsened edge list - // FIXME: we don't need adj_matrix_major_labels if we apply the same partitioning scheme - // regardless of hypergraph partitioning is applied or not - rmm::device_uvector adj_matrix_major_labels( - store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols() - : graph_view.get_number_of_local_adj_matrix_partition_rows(), - handle.get_stream()); rmm::device_uvector adj_matrix_minor_labels( store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows() : graph_view.get_number_of_local_adj_matrix_partition_cols(), handle.get_stream()); if (store_transposed) { - copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_major_labels.data()); + // copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_major_labels.data()); copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_minor_labels.data()); } else { - copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_major_labels.data()); + // copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_major_labels.data()); copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels.data()); } - rmm::device_uvector coarsened_edgelist_major_vertices(0, handle.get_stream()); - rmm::device_uvector coarsened_edgelist_minor_vertices(0, handle.get_stream()); - rmm::device_uvector coarsened_edgelist_weights(0, handle.get_stream()); + std::vector> coarsened_edgelist_major_vertices( + graph_view.get_number_of_local_adj_matrix_partitions(), + rmm::device_uvector(0, handle.get_stream())); + std::vector> coarsened_edgelist_minor_vertices( + coarsened_edgelist_major_vertices.size(), + rmm::device_uvector(0, handle.get_stream())); + std::vector> coarsened_edgelist_weights( + graph_view.is_weighted() ? 
coarsened_edgelist_major_vertices.size() : size_t{0}, + rmm::device_uvector(0, handle.get_stream())); // FIXME: we may compare performance/memory footprint with the hash_based approach especially when // cuco::dynamic_map becomes available (so we don't need to preallocate memory assuming the worst // case). We may be able to limit the memory requirement close to the final coarsened edgelist // with the hash based approach. for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - // get edge list + // 1-1. locally construct coarsened edge list + + rmm::device_uvector major_labels( + store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols(i) + : graph_view.get_number_of_local_adj_matrix_partition_rows(i), + handle.get_stream()); + device_bcast( + col_comm, labels, major_labels.data(), major_labels.size(), i, handle.get_stream()); rmm::device_uvector edgelist_major_vertices(0, handle.get_stream()); rmm::device_uvector edgelist_minor_vertices(0, handle.get_stream()); rmm::device_uvector edgelist_weights(0, handle.get_stream()); std::tie(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights) = - compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist( + compressed_sparse_to_relabeled_and_grouped_and_coarsened_edgelist( graph_view.offsets(i), graph_view.indices(i), graph_view.weights(i), - adj_matrix_major_labels.begin() + - (store_transposed ? graph_view.get_local_adj_matrix_partition_col_value_start_offset(i) - : graph_view.get_local_adj_matrix_partition_row_value_start_offset(i)), + major_labels.data(), adj_matrix_minor_labels.begin(), store_transposed ? graph_view.get_local_adj_matrix_partition_col_first(i) : graph_view.get_local_adj_matrix_partition_row_first(i), @@ -272,84 +291,184 @@ coarsen_graph( : graph_view.get_local_adj_matrix_partition_col_last(i), handle.get_stream()); - auto cur_size = coarsened_edgelist_major_vertices.size(); - // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we can - // reserve address space to avoid expensive reallocation. - // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management - coarsened_edgelist_major_vertices.resize(cur_size + edgelist_major_vertices.size(), - handle.get_stream()); - coarsened_edgelist_minor_vertices.resize(coarsened_edgelist_major_vertices.size(), - handle.get_stream()); - coarsened_edgelist_weights.resize( - graph_view.is_weighted() ? coarsened_edgelist_major_vertices.size() : 0, handle.get_stream()); + // 1-2. 
globally shuffle
+
+    {
+      rmm::device_uvector rx_edgelist_major_vertices(0, handle.get_stream());
+      rmm::device_uvector rx_edgelist_minor_vertices(0, handle.get_stream());
+      rmm::device_uvector rx_edgelist_weights(0, handle.get_stream());
+      if (graph_view.is_weighted()) {
+        auto edge_first =
+          thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(),
+                                                       edgelist_minor_vertices.begin(),
+                                                       edgelist_weights.begin()));
+        std::forward_as_tuple(
+          std::tie(rx_edgelist_major_vertices, rx_edgelist_minor_vertices, rx_edgelist_weights),
+          std::ignore) =
+          groupby_gpuid_and_shuffle_values(
+            handle.get_comms(),
+            edge_first,
+            edge_first + edgelist_major_vertices.size(),
+            [key_func =
+               detail::compute_gpu_id_from_edge_t{
+                 comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) {
+              return key_func(thrust::get<0>(val), thrust::get<1>(val));
+            },
+            handle.get_stream());
+      } else {
+        auto edge_first = thrust::make_zip_iterator(
+          thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin()));
+        std::forward_as_tuple(std::tie(rx_edgelist_major_vertices, rx_edgelist_minor_vertices),
+                              std::ignore) =
+          groupby_gpuid_and_shuffle_values(
+            handle.get_comms(),
+            edge_first,
+            edge_first + edgelist_major_vertices.size(),
+            [key_func =
+               detail::compute_gpu_id_from_edge_t{
+                 comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) {
+              return key_func(thrust::get<0>(val), thrust::get<1>(val));
+            },
+            handle.get_stream());
+      }
+
+      edgelist_major_vertices = std::move(rx_edgelist_major_vertices);
+      edgelist_minor_vertices = std::move(rx_edgelist_minor_vertices);
+      edgelist_weights = std::move(rx_edgelist_weights);
+    }
+
+    // 1-3. append data to local adjacency matrix partitions
+
+    // FIXME: we can skip this if groupby_gpuid_and_shuffle_values is updated to return sorted edge
+    // list based on the final matrix partition (maybe add
+    // groupby_adj_matrix_partition_and_shuffle_values).
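
What this shuffle computes, at a high level: every (major, minor) edge is hashed to a target GPU
and the edge list is exchanged all-to-all so that each process ends up holding exactly the edges it
is responsible for. A host-side sketch of the bucketing pattern in plain C++ (illustrative only; it
uses std::hash as a stand-in for the MurmurHash3_32-based functor and a simplified single-endpoint
key, while the real functor maps the endpoint pair onto the 2D process grid):

  #include <cstdint>
  #include <functional>
  #include <utility>
  #include <vector>

  int main()
  {
    int const comm_size = 4;  // P GPUs
    std::vector<std::pair<int32_t, int32_t>> edges{{0, 5}, {3, 2}, {7, 1}, {3, 6}};
    std::vector<std::vector<std::pair<int32_t, int32_t>>> tx_buckets(comm_size);
    for (auto const& e : edges) {
      auto target = static_cast<int>(std::hash<int32_t>{}(e.first) % comm_size);
      tx_buckets[target].push_back(e);  // edges destined for rank `target`
    }
    // tx_buckets[r] now holds the edges to send to rank r; the device-side all-to-all is what
    // groupby_gpuid_and_shuffle_values performs on top of this grouping
    return 0;
  }
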
+    auto key_func = detail::compute_partition_id_from_edge_t{
+      comm.get_size(), row_comm.get_size(), col_comm.get_size()};
     if (graph_view.is_weighted()) {
-      auto src_edge_first =
+      auto edge_first =
         thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(),
                                                      edgelist_minor_vertices.begin(),
                                                      edgelist_weights.begin()));
-      auto dst_edge_first =
-        thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(),
-                                                     coarsened_edgelist_minor_vertices.begin(),
-                                                     coarsened_edgelist_weights.begin())) +
-        cur_size;
-      thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-                   src_edge_first,
-                   src_edge_first + edgelist_major_vertices.size(),
-                   dst_edge_first);
+      thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                   edge_first,
+                   edge_first + edgelist_major_vertices.size(),
+                   [key_func] __device__(auto lhs, auto rhs) {
+                     return key_func(thrust::get<0>(lhs), thrust::get<1>(lhs)) <
+                            key_func(thrust::get<0>(rhs), thrust::get<1>(rhs));
+                   });
     } else {
-      auto src_edge_first = thrust::make_zip_iterator(
+      auto edge_first = thrust::make_zip_iterator(
         thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin()));
-      auto dst_edge_first =
-        thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(),
-                                                     coarsened_edgelist_minor_vertices.begin())) +
-        cur_size;
-      thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-                   src_edge_first,
-                   src_edge_first + edgelist_major_vertices.size(),
-                   dst_edge_first);
+      thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                   edge_first,
+                   edge_first + edgelist_major_vertices.size(),
+                   [key_func] __device__(auto lhs, auto rhs) {
+                     return key_func(thrust::get<0>(lhs), thrust::get<1>(lhs)) <
+                            key_func(thrust::get<0>(rhs), thrust::get<1>(rhs));
+                   });
     }
-  }
-
-  sort_and_coarsen_edgelist(coarsened_edgelist_major_vertices,
-                            coarsened_edgelist_minor_vertices,
-                            coarsened_edgelist_weights,
-                            handle.get_stream());
-
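
In the partition-grouping code that follows, global matrix partition IDs are converted to local
indices. Under this partitioning scheme a process with grid coordinates (row_comm_rank,
col_comm_rank) owns the col_comm_size partitions whose global IDs are
row_comm_rank * col_comm_size + col_comm_rank + i * comm_size for i = 0, ..., col_comm_size - 1,
so the local index is recovered as global_id / comm_size. A tiny self-contained check of that
arithmetic (plain C++, an illustrative 2 x 2 process grid):

  #include <cassert>

  int main()
  {
    int const row_comm_size = 2;
    int const col_comm_size = 2;
    int const comm_size     = row_comm_size * col_comm_size;  // P = 4
    int const row_comm_rank = 1;
    int const col_comm_rank = 0;
    for (int i = 0; i < col_comm_size; ++i) {
      int const global_id = row_comm_rank * col_comm_size + col_comm_rank + i * comm_size;
      assert(global_id / comm_size == i);  // recover the local partition index
    }
    return 0;
  }

-  // 2. 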
globally shuffle edge list and re-coarsen + auto partition_id_first = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())), + [key_func] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }); + rmm::device_uvector partition_ids(graph_view.get_number_of_local_adj_matrix_partitions(), + handle.get_stream()); + rmm::device_uvector displacements(partition_ids.size() + 1, handle.get_stream()); + auto last = + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + partition_id_first, + partition_id_first + edgelist_major_vertices.size(), + thrust::make_constant_iterator(edge_t{1}), + partition_ids.begin(), + displacements.begin()); + if (thrust::distance(partition_ids.begin(), thrust::get<0>(last)) < partition_ids.size()) { + rmm::device_uvector tmps(displacements.size(), handle.get_stream()); + thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + displacements.begin(), + displacements.end(), + edge_t{0}); + thrust::scatter(displacements.begin(), + thrust::get<1>(last), + thrust::make_transform_iterator( + partition_ids.begin(), + [comm_size] __device__(auto val) { + return id / comm_size; // global partition id to local partition id + }), + tmps.begin()); + displacements = std::move(tmps); + } + thrust::excludsive_scan(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + displacements.begin(), + displacements.end(), + displacements.begin()); + + for (size_t j = 0; j < col_comm_size; ++j) { + auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( + edgelist_major_vertices.begin() + h_displacements[j], + edgelist_minor_vertices + h_displacements[j], + graph_view.is_weighted() ? edgelist_weights.begin() + h_displacements[j] + : static_cast(nullptr), + h_displacements[j + 1] - h_displacements[j], + handle.get_stream()); - { - auto edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(), - coarsened_edgelist_minor_vertices.begin(), - coarsened_edgelist_weights.begin())); - rmm::device_uvector rx_edgelist_major_vertices(0, handle.get_stream()); - rmm::device_uvector rx_edgelist_minor_vertices(0, handle.get_stream()); - rmm::device_uvector rx_edgelist_weights(0, handle.get_stream()); - std::forward_as_tuple( - std::tie(rx_edgelist_major_vertices, rx_edgelist_minor_vertices, rx_edgelist_weights), - std::ignore) = - groupby_gpuid_and_shuffle_values( - handle.get_comms(), - edge_first, - edge_first + coarsened_edgelist_major_vertices.size(), - [key_func = - detail::compute_gpu_id_from_edge_t{graph_view.is_hypergraph_partitioned(), - comm.get_size(), - row_comm.get_size(), - col_comm.get_size()}] __device__(auto val) { - return key_func(thrust::get<0>(val), thrust::get<1>(val)); - }, + auto cur_size = coarsened_edgelist_major_vertices[j].size(); + // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we + // can reserve address space to avoid expensive reallocation. + // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management + coarsened_edgelist_major_vertices[j].resize(cur_size + edgelist_major_vertices.size(), + handle.get_stream()); + coarsened_edgelist_minor_vertices[j].resize(coarsened_edgelist_major_vertices.size(), + handle.get_stream()); + coarsened_edgelist_weights[j].resize( + graph_view.is_weighted() ? 
coarsened_edgelist_major_vertices.size() : 0, handle.get_stream()); - sort_and_coarsen_edgelist(rx_edgelist_major_vertices, - rx_edgelist_minor_vertices, - rx_edgelist_weights, - handle.get_stream()); + if (graph_view.is_weighted()) { + auto src_edge_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(), + edgelist_minor_vertices.begin(), + edgelist_weights.begin())) + + displacements[j]; + auto dst_edge_first = + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(), + coarsened_edgelist_minor_vertices[j].begin(), + coarsened_edgelist_weights[j].begin())) + + cur_size; + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + src_edge_first, + src_edge_first + number_of_partition_edges, + dst_edge_first); + } else { + auto src_edge_first = thrust::make_zip_iterator(thrust::make_tuple( + edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())) + + displacements[j]; + auto dst_edge_first = thrust::make_zip_iterator( + thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(), + coarsened_edgelist_minor_vertices[j].begin())) + + cur_size; + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + src_edge_first, + src_edge_first + edgelist_major_vertices.size(), + dst_edge_first); + } + } + } - coarsened_edgelist_major_vertices = std::move(rx_edgelist_major_vertices); - coarsened_edgelist_minor_vertices = std::move(rx_edgelist_minor_vertices); - coarsened_edgelist_weights = std::move(rx_edgelist_weights); + for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { + auto number_of_partition_edges = + groupby_e_and_coarsen_edgelist(coarsened_edgelist_major_vertices[i], + coarsened_edgelist_minor_vertices[i], + coarsened_edgelist_weights[i], + coarsened_edgelist_major_vertices[i].size(), + handle.get_stream()); + coarsened_edgelist_major_vertices[j].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_major_vertices[j].shrink_to_fit(handle.get_stream()); + coarsened_edgelist_minor_vertices[j].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_minor_vertices[j].shrink_to_fit(handle.get_stream()); + if (edgelist_weights.size() > 0) { + coarsened_edgelist_weights[j].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_weights[j].shrink_to_fit(handle.get_stream()); + } } // 3. 
find unique labels for this GPU @@ -395,37 +514,42 @@ coarsen_graph( rmm::device_uvector renumber_map_labels(0, handle.get_stream()); partition_t partition(std::vector(comm_size + 1, 0), - graph_view.is_hypergraph_partitioned(), row_comm_size, col_comm_size, row_comm_rank, col_comm_rank); vertex_t number_of_vertices{}; edge_t number_of_edges{}; - std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) = - renumber_edgelist( - handle, - unique_labels.data(), - static_cast(unique_labels.size()), - coarsened_edgelist_major_vertices.data(), - coarsened_edgelist_minor_vertices.data(), - static_cast(coarsened_edgelist_major_vertices.size()), - graph_view.is_hypergraph_partitioned(), - do_expensive_check); + { + std::vector major_ptrs(coarsened_edgelist_major_vertices.size()); + std::vector minor_ptrs(major_ptrs.size()); + std::vector counts(major_ptrs.size()); + for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { + major_ptrs[i] = coarsened_edgelist_major_vertices[i].data(); + minor_ptrs[i] = coarsened_edgelist_minor_vertices[i].data(); + counts[i] = static_cast(coarsened_edgelist_major_vertices[i].size()); + } + std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) = + renumber_edgelist(handle, + unique_labels.data(), + static_cast(unique_labels.size()), + major_ptrs, + minor_ptrs, + counts, + do_expensive_check); + } // 5. build a graph std::vector> edgelists{}; - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - edgelists.resize(1); - edgelists[0].p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices.data() - : coarsened_edgelist_major_vertices.data(); - edgelists[0].p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices.data() - : coarsened_edgelist_minor_vertices.data(); - edgelists[0].p_edge_weights = coarsened_edgelist_weights.data(); - edgelists[0].number_of_edges = static_cast(coarsened_edgelist_major_vertices.size()); + edgelists.resize(graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; edgelists.size(); ++i) { + edgelists[i].p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices[i].data() + : coarsened_edgelist_major_vertices[i].data(); + edgelists[i].p_dst_vertices = store_transposed ? 
coarsened_edgelist_major_vertices[i].data() + : coarsened_edgelist_minor_vertices[i].data(); + edgelists[i].p_edge_weights = coarsened_edgelist_weights[i].data(); + edgelists[i].number_of_edges = static_cast(coarsened_edgelist_major_vertices[i].size()); } return std::make_tuple( @@ -466,7 +590,7 @@ coarsen_graph( std::tie(coarsened_edgelist_major_vertices, coarsened_edgelist_minor_vertices, coarsened_edgelist_weights) = - compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist( + compressed_sparse_to_relabeled_and_grouped_and_coarsened_edgelist( graph_view.offsets(), graph_view.indices(), graph_view.weights(), diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu index 6a5a1c732c2..0614989a816 100644 --- a/cpp/src/experimental/renumber_edgelist.cu +++ b/cpp/src/experimental/renumber_edgelist.cu @@ -50,62 +50,135 @@ rmm::device_uvector compute_renumber_map( raft::handle_t const& handle, vertex_t const* vertices, vertex_t num_local_vertices /* relevant only if vertices != nullptr */, - vertex_t const* edgelist_major_vertices, - vertex_t const* edgelist_minor_vertices, - edge_t num_edgelist_edges) + std::vector const& edgelist_major_vertices, + std::vector const& edgelist_minor_vertices, + std::vector const& edgelist_edge_counts) { // FIXME: compare this sort based approach with hash based approach in both speed and memory // footprint // 1. acquire (unique major label, count) pairs - rmm::device_uvector tmp_labels(num_edgelist_edges, handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - tmp_labels.begin()); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - tmp_labels.begin(), - tmp_labels.end()); - rmm::device_uvector major_labels(tmp_labels.size(), handle.get_stream()); - rmm::device_uvector major_counts(major_labels.size(), handle.get_stream()); - auto major_pair_it = - thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - tmp_labels.begin(), - tmp_labels.end(), - thrust::make_constant_iterator(edge_t{1}), - major_labels.begin(), - major_counts.begin()); - tmp_labels.resize(0, handle.get_stream()); - tmp_labels.shrink_to_fit(handle.get_stream()); - major_labels.resize(thrust::distance(major_labels.begin(), thrust::get<0>(major_pair_it)), + rmm::device_uvector major_labels(0, handle.get_stream()); + rmm::device_uvector major_counts(0, handle.get_stream()); + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + rmm::device_uvector sorted_major_labels(edgelist_edge_counts[i].size(), + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_major_vertices[i], + edgelist_major_vertices[i] + edgelist_edge_counts[i], + sorted_major_labels.begin()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_major_labels.begin(), + sorted_major_labels.end()); + major_labels.resize(sorted_major_labels.size(), handle.get_stream()); + major_counts.resize(major_labels.size(), handle.get_stream()); + auto major_pair_it = + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_major_labels.begin(), + sorted_major_labels.end(), + thrust::make_constant_iterator(edge_t{1}), + major_labels.begin(), + major_counts.begin()); + major_labels.resize(thrust::distance(tmp_major_labels.begin(), thrust::get<0>(major_pair_it)), + 
handle.get_stream()); + major_counts.resize(tmp_major_labels.size(), handle.get_stream()); + + if (multi_gpu) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + rmm::device_uvector rx_major_labels(0, handle.get_stream()); + rmm::device_uvector rx_major_counts(0, handle.get_stream()); + auto rx_sizes = + host_scalar_gather(col_comm, major_labels.size(), static_cast(i), handle.get_stream()); + std::vector rx_displs{}; + if (static_cast(i) == col_comm_rank) { + rx_displs.assign(col_comm_size, size_t{0}); + std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); + rx_major_labels.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); + rx_major_counts.resize(major_labels.size(), handle.get_stream()); + } + device_gatherv( + col_comm, + thrust::make_zip_iterator(thrust::make_tuple(major_labels.begin(), major_counts.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(rx_major_labels.begin(), rx_major_counts.begin())), + major_labels.size(), + rx_sizes, + rx_displs, + static_cast(i), + handle.get_stream()); + major_labels = std::move(rx_major_labels); + major_counts = std::move(rx_major_counts); + } else { + major_labels.shrink_to_fit(handle.get_stream()); + major_counts.shrink_to_fit(handle.get_stream()); + } + } + + if (multi_gpu) { + thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + major_labels.begin(), + major_labels.end(), + major_counts.begin()); + rmm::device_uvector tmp_labels(major_labels.size(), handle.get_stream()); + rmm::device_uvector tmp_counts(tmp_labels.size(), handle.get_stream()); + pair_it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + major_labels.begin(), + major_labels.end(), + major_counts.begin(), + tmp_labels.begin(), + tmp_counts.begin()); + tmp_labels.resize(thrust::distance(tmp_labels.begin(), thrust::get<0>(pair_it)), handle.get_stream()); - major_counts.resize(major_labels.size(), handle.get_stream()); - major_labels.shrink_to_fit(handle.get_stream()); - major_counts.shrink_to_fit(handle.get_stream()); + tmp_counts.resize(tmp_labels.size(), handle.get_stream()); + + major_labels = std::move(tmp_labels); + major_counts = std::move(tmp_counts); + } // 2. 
acquire unique minor labels - rmm::device_uvector minor_labels(num_edgelist_edges, handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_minor_vertices, - edgelist_minor_vertices + num_edgelist_edges, - minor_labels.begin()); + std::vector minor_displs(edgelist_minor_vertices.size(), edge_t{0}); + std::partial_sum( + edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1); + rmm::device_uvector minor_labels(minor_displs.back(), handle.get_stream()); + for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) { + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_minor_vertices[i], + edgelist_minor_vertices[i] + edgelist_edge_counts[i], + minor_labels.begin() + minor_displs[i]); + } thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), minor_labels.begin(), minor_labels.end()); - auto minor_label_it = - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - minor_labels.begin(), - minor_labels.end()); - minor_labels.resize(thrust::distance(minor_labels.begin(), minor_label_it), handle.get_stream()); - minor_labels.shrink_to_fit(handle.get_stream()); + minor_labels.resize( + thrust::distance(minor_labels.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + minor_labels.begin(), + minor_labels.end())), + handle.get_stream()); + if (multi_gpu) { + rmm::device_uvector rx_minor_labels(0, handle.get_stream()); + std::tie(rx_minor_labels, std::ignore) = groupby_gpuid_and_shuffle_values( + comm, + minor_labels.begin(), + minor_labels.end(), + [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}] __device__(auto val) { + return key_func(val); + }, + handle.get_stream()); + minor_labels = std::move(rx_minor_labels); + } else { + minor_labels.shrink_to_fit(handle.get_stream()); + } // 3. merge major and minor labels and vertex labels rmm::device_uvector merged_labels(major_labels.size() + minor_labels.size(), handle.get_stream()); - rmm::device_uvector merged_counts(merged_labels.size(), handle.get_stream()); thrust::merge_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), major_labels.begin(), @@ -142,47 +215,7 @@ rmm::device_uvector compute_renumber_map( labels.shrink_to_fit(handle.get_stream()); counts.shrink_to_fit(handle.get_stream()); - // 4. 
if multi-GPU, shuffle and reduce (label, count) pairs - - if (multi_gpu) { - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - - auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(labels.begin(), counts.begin())); - rmm::device_uvector rx_labels(0, handle.get_stream()); - rmm::device_uvector rx_counts(0, handle.get_stream()); - std::forward_as_tuple(std::tie(rx_labels, rx_counts), std::ignore) = - groupby_gpuid_and_shuffle_values( - comm, - pair_first, - pair_first + labels.size(), - [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}] __device__( - auto val) { return key_func(thrust::get<0>(val)); }, - handle.get_stream()); - - labels.resize(rx_labels.size(), handle.get_stream()); - counts.resize(labels.size(), handle.get_stream()); - thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_labels.begin(), - rx_labels.end(), - rx_counts.begin()); - pair_it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_labels.begin(), - rx_labels.end(), - rx_counts.begin(), - labels.begin(), - counts.begin()); - rx_labels.resize(0, handle.get_stream()); - rx_counts.resize(0, handle.get_stream()); - rx_labels.shrink_to_fit(handle.get_stream()); - rx_counts.shrink_to_fit(handle.get_stream()); - labels.resize(thrust::distance(labels.begin(), thrust::get<0>(pair_it)), handle.get_stream()); - counts.resize(labels.size(), handle.get_stream()); - labels.shrink_to_fit(handle.get_stream()); - labels.shrink_to_fit(handle.get_stream()); - } - - // 5. if vertices != nullptr, add isolated vertices + // 4. if vertices != nullptr, add isolated vertices rmm::device_uvector isolated_vertices(0, handle.get_stream()); if (vertices != nullptr) { @@ -232,10 +265,9 @@ void expensive_check_edgelist( raft::handle_t const& handle, vertex_t const* local_vertices, vertex_t num_local_vertices /* relevant only if local_vertices != nullptr */, - vertex_t const* edgelist_major_vertices, - vertex_t const* edgelist_minor_vertices, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned /* relevant only if multi_gpu == true */) + std::vector const& edgelist_major_vertices, + std::vector const& edgelist_minor_vertices, + edge_t num_edgelist_edges) { rmm::device_uvector sorted_local_vertices( local_vertices != nullptr ? 
num_local_vertices : vertex_t{0}, handle.get_stream()); @@ -246,6 +278,12 @@ void expensive_check_edgelist( thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), sorted_local_vertices.begin(), sorted_local_vertices.end()); + CUGRAPH_EXPECTS( + thrust::distance(sorted_local_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_local_vertices.begin(), + sorted_local_vertices.end())) == sorted_local_vertices.size(), + "Invalid input argument: local_vertices should not have duplicates."); if (multi_gpu) { auto& comm = handle.get_comms(); @@ -256,6 +294,11 @@ void expensive_check_edgelist( auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); + CUGRAPH_EXPECTS((edgelist_major_vertices.size() == edgelist_minor_vertices.size()) && + (edgelist_major_vertices.size() == col_comm_size), + "Invalid input argument: both edgelist_major_vertices.size() & " + "edgelist_minor_vertices.size() should coincide with col_comm_size."); + CUGRAPH_EXPECTS( thrust::count_if( rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), @@ -268,71 +311,79 @@ void expensive_check_edgelist( }) == 0, "Invalid input argument: local_vertices should be pre-shuffled."); - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices)); - CUGRAPH_EXPECTS( - thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edge_first, - edge_first + num_edgelist_edges, - [comm_rank, - key_func = - detail::compute_gpu_id_from_edge_t{is_hypergraph_partitioned, - comm_size, - row_comm_size, - col_comm_size}] __device__(auto edge) { - return key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != comm_rank; - }) == 0, - "Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be " - "pre-shuffled."); - - if (local_vertices != nullptr) { - rmm::device_uvector unique_edge_vertices(num_edgelist_edges * 2, - handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - unique_edge_vertices.begin()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_minor_vertices, - edgelist_minor_vertices + num_edgelist_edges, - unique_edge_vertices.begin() + num_edgelist_edges); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_edge_vertices.begin(), - unique_edge_vertices.end()); - unique_edge_vertices.resize( - thrust::distance( - unique_edge_vertices.begin(), - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_edge_vertices.begin(), - unique_edge_vertices.end())), - handle.get_stream()); - - rmm::device_uvector rx_unique_edge_vertices(0, handle.get_stream()); - std::tie(rx_unique_edge_vertices, std::ignore) = groupby_gpuid_and_shuffle_values( - handle.get_comms(), - unique_edge_vertices.begin(), - unique_edge_vertices.end(), - [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}] __device__( - auto val) { return key_func(val); }, - handle.get_stream()); - - unique_edge_vertices = std::move(rx_unique_edge_vertices); - + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i])); CUGRAPH_EXPECTS( thrust::count_if( 
rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + num_edgelist_edges, + [comm_rank, + i, + gpu_id_key_func = + detail::compute_gpu_id_from_edge_t{comm_size, row_comm_size, col_comm_size}, + partition_id_key_func = + detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto edge) { + return (gpu_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != comm_rank) || + (partition_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != + row_comm_rank * col_comm_size + col_comm_rank + i * comm_size); + }) == 0, + "Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be " + "pre-shuffled."); + + if (local_vertices != nullptr) { + rmm::device_uvector unique_edge_vertices(num_edgelist_edges * 2, + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_major_vertices, + edgelist_major_vertices + num_edgelist_edges, + unique_edge_vertices.begin()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_minor_vertices, + edgelist_minor_vertices + num_edgelist_edges, + unique_edge_vertices.begin() + num_edgelist_edges); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + unique_edge_vertices.begin(), + unique_edge_vertices.end()); + unique_edge_vertices.resize( + thrust::distance( + unique_edge_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + unique_edge_vertices.begin(), + unique_edge_vertices.end())), + handle.get_stream()); + + rmm::device_uvector rx_unique_edge_vertices(0, handle.get_stream()); + std::tie(rx_unique_edge_vertices, std::ignore) = groupby_gpuid_and_shuffle_values( + handle.get_comms(), unique_edge_vertices.begin(), unique_edge_vertices.end(), - [num_local_vertices, - sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) { - return !thrust::binary_search( - thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v); - }) == 0, - "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " - "invalid vertex ID(s)."); + [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}] __device__( + auto val) { return key_func(val); }, + handle.get_stream()); + + unique_edge_vertices = std::move(rx_unique_edge_vertices); + + CUGRAPH_EXPECTS( + thrust::count_if( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + unique_edge_vertices.begin(), + unique_edge_vertices.end(), + [num_local_vertices, + sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) { + return !thrust::binary_search( + thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v); + }) == 0, + "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " + "invalid vertex ID(s)."); + } } } else { + assert(edgelist_major_vertices.size() == 1); + assert(edgelist_minor_vertices.size() == 1); + if (local_vertices != nullptr) { CUGRAPH_EXPECTS( thrust::count_if( @@ -368,15 +419,15 @@ std::enable_if_t const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + 
handle.get_device_properties().major >= 7, + "This version of enumber_edgelist not supported on Pascal and older architectures."); #ifdef CUCO_STATIC_MAP_DEFINED auto& comm = handle.get_comms(); @@ -395,8 +446,7 @@ renumber_edgelist(raft::handle_t const& handle, num_local_vertices, edgelist_major_vertices, edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned); + edgelist_edge_counts); } // 1. compute renumber map @@ -407,141 +457,94 @@ renumber_edgelist(raft::handle_t const& handle, num_local_vertices, edgelist_major_vertices, edgelist_minor_vertices, - num_edgelist_edges); + edgelist_edge_counts); // 2. initialize partition_t object, number_of_vertices, and number_of_edges for the coarsened // graph - auto vertex_partition_counts = host_scalar_allgather( + auto vertex_counts = host_scalar_allgather( comm, static_cast(renumber_map_labels.size()), handle.get_stream()); std::vector vertex_partition_offsets(comm_size + 1, 0); - std::partial_sum(vertex_partition_counts.begin(), - vertex_partition_counts.end(), - vertex_partition_offsets.begin() + 1); + std::partial_sum( + vertex_counts.begin(), vertex_counts.end(), vertex_partition_offsets.begin() + 1); - partition_t partition(vertex_partition_offsets, - is_hypergraph_partitioned, - row_comm_size, - col_comm_size, - row_comm_rank, - col_comm_rank); + partition_t partition( + vertex_partition_offsets, row_comm_size, col_comm_size, row_comm_rank, col_comm_rank); auto number_of_vertices = vertex_partition_offsets.back(); auto number_of_edges = host_scalar_allreduce(comm, num_edgelist_edges, handle.get_stream()); // 3. renumber edges - if (is_hypergraph_partitioned) { - CUGRAPH_FAIL("unimplemented."); - } else { - double constexpr load_factor = 0.7; - - // FIXME: compare this hash based approach with a binary search based approach in both memory - // footprint and execution time - - { - vertex_t major_first{}; - vertex_t major_last{}; - std::tie(major_first, major_last) = partition.get_matrix_partition_major_range(0); - rmm::device_uvector renumber_map_major_labels(major_last - major_first, - handle.get_stream()); - std::vector recvcounts(row_comm_size); - for (int i = 0; i < row_comm_size; ++i) { - recvcounts[i] = partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i); - } - std::vector displacements(row_comm_size, 0); - std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); - device_allgatherv(row_comm, - renumber_map_labels.begin(), - renumber_map_major_labels.begin(), - recvcounts, - displacements, - handle.get_stream()); - - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // cuco::static_map currently does not take stream - - cuco::static_map renumber_map{ - static_cast(static_cast(renumber_map_major_labels.size()) / load_factor), - invalid_vertex_id::value, - invalid_vertex_id::value}; - auto pair_first = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(renumber_map_major_labels.begin(), - thrust::make_counting_iterator(major_first))), - [] __device__(auto val) { - return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); - }); - renumber_map.insert(pair_first, pair_first + renumber_map_major_labels.size()); - renumber_map.find(edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - edgelist_major_vertices); - } + double constexpr load_factor = 0.7; - { - vertex_t minor_first{}; - vertex_t minor_last{}; - std::tie(minor_first, minor_last) = partition.get_matrix_partition_minor_range(); - 
rmm::device_uvector renumber_map_minor_labels(minor_last - minor_first, - handle.get_stream()); - - // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph - // partitioning - auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; - auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - // FIXME: this branch may be no longer necessary with NCCL backend - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - renumber_map_labels.begin(), - renumber_map_labels.end(), - renumber_map_minor_labels.begin() + - (partition.get_vertex_partition_first(comm_src_rank) - - partition.get_vertex_partition_first(row_comm_rank * col_comm_size))); - } else { - device_sendrecv(comm, - renumber_map_labels.begin(), - renumber_map_labels.size(), - comm_dst_rank, - renumber_map_minor_labels.begin() + - (partition.get_vertex_partition_first(comm_src_rank) - - partition.get_vertex_partition_first(row_comm_rank * col_comm_size)), - static_cast(partition.get_vertex_partition_size(comm_src_rank)), - comm_src_rank, - handle.get_stream()); - } + // FIXME: compare this hash based approach with a binary search based approach in both memory + // footprint and execution time - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - for (int i = 0; i < col_comm_size; ++i) { - auto offset = partition.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - - partition.get_vertex_partition_first(row_comm_rank * col_comm_size); - auto count = partition.get_vertex_partition_size(row_comm_rank * col_comm_size + i); - device_bcast(col_comm, - renumber_map_minor_labels.begin() + offset, - renumber_map_minor_labels.begin() + offset, - count, - i, - handle.get_stream()); - } + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + rmm::device_uvector renumber_map_major_labels( + partition.get_matrix_partition_major_size(i), handle.get_stream()); + device_bcast(col_comm, + renumber_map_labels.data(), + renumber_map_major_labels.data(), + renumber_map_major_labels.size(), + i, + handle.get_stream()); + + CUDA_TRY(cudaStreamSynchronize( + handle.get_stream())); // cuco::static_map currently does not take stream + + cuco::static_map renumber_map{ + static_cast(static_cast(renumber_map_major_labels.size()) / load_factor), + invalid_vertex_id::value, + invalid_vertex_id::value}; + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple( + renumber_map_major_labels.begin(), + thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i)))), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + renumber_map.insert(pair_first, pair_first + renumber_map_major_labels.size()); + renumber_map.find(edgelist_major_vertices[i], + edgelist_major_vertices[i] + edgelist_edge_counts[i], + edgelist_major_vertices[i]); + } - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // cuco::static_map currently does not take stream - - cuco::static_map renumber_map{ - static_cast(static_cast(renumber_map_minor_labels.size()) / load_factor), - invalid_vertex_id::value, - invalid_vertex_id::value}; - auto pair_first = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(renumber_map_minor_labels.begin(), - thrust::make_counting_iterator(minor_first))), - [] __device__(auto val) { 
- return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); - }); - renumber_map.insert(pair_first, pair_first + renumber_map_minor_labels.size()); - renumber_map.find(edgelist_minor_vertices, - edgelist_minor_vertices + num_edgelist_edges, - edgelist_minor_vertices); + { + rmm::device_uvector renumber_map_minor_labels( + partition.get_matrix_partition_minor_size(), handle.get_stream()); + std::vector recvcounts(row_comm_size); + for (int i = 0; i < row_comm_size; ++i) { + recvcounts[i] = partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i); } + std::vector displacements(recvcounts.size(), 0); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + device_allgatherv(row_comm, + renumber_map_labels.begin(), + renumber_map_minor_labels.begin(), + recvcounts, + displacements, + handle.get_stream()); + + CUDA_TRY(cudaStreamSynchronize( + handle.get_stream())); // cuco::static_map currently does not take stream + + cuco::static_map renumber_map{ + static_cast(static_cast(renumber_map_minor_labels.size()) / load_factor), + invalid_vertex_id::value, + invalid_vertex_id::value}; + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple( + renumber_map_minor_labels.begin(), + thrust::make_counting_iterator(partition.get_matrix_partition_minor_first()))), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + renumber_map.insert(pair_first, pair_first + renumber_map_minor_labels.size()); + renumber_map.find(edgelist_minor_vertices, + edgelist_minor_vertices + num_edgelist_edges, + edgelist_minor_vertices); } return std::make_tuple( @@ -566,27 +569,29 @@ std::enable_if_t> renumber_edgelist( bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); #ifdef CUCO_STATIC_MAP_DEFINED if (do_expensive_check) { - expensive_check_edgelist(handle, - vertices, - num_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, - num_edgelist_edges, - false); + expensive_check_edgelist( + handle, + vertices, + num_vertices, + std::vector{edgelist_major_vertices}, + std::vector{edgelist_minor_vertices}, + std::vertex_t{num_edgelist_edges}, + false); } - auto renumber_map_labels = - detail::compute_renumber_map(handle, - vertices, - num_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, - num_edgelist_edges); + auto renumber_map_labels = detail::compute_renumber_map( + handle, + vertices, + num_vertices, + std::vector{edgelist_major_vertices}, + std::vector{edgelist_minor_vertices}, + std::vector{num_edgelist_edges}); double constexpr load_factor = 0.7; @@ -621,22 +626,21 @@ template std::enable_if_t, partition_t, vertex_t, edge_t>> renumber_edgelist(raft::handle_t const& handle, - vertex_t* edgelist_major_vertices /* [INOUT] */, - vertex_t* edgelist_minor_vertices /* [INOUT] */, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - 
"Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, static_cast(nullptr), vertex_t{0}, edgelist_major_vertices, edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned, + edgelist_edge_counts, do_expensive_check); } @@ -649,8 +653,9 @@ std::enable_if_t> renumber_edgelist( bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, static_cast(nullptr), vertex_t{0} /* dummy */, @@ -666,22 +671,21 @@ std::enable_if_t const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, local_vertices, num_local_vertices, edgelist_major_vertices, edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned, + edgelist_edge_counts, do_expensive_check); } @@ -696,8 +700,9 @@ std::enable_if_t> renumber_edgelist( bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, vertices, num_vertices, @@ -712,12 +717,12 @@ std::enable_if_t> renumber_edgelist( // instantiations for // template std::tuple, partition_t, int32_t, int32_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -727,14 +732,14 @@ template rmm::device_uvector renumber_edgelist bool do_expensive_check); template std::tuple, partition_t, int32_t, int32_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t const* local_vertices, - int32_t num_local_vertices, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + int32_t const* local_vertices, + int32_t num_local_vertices, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& 
edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -748,12 +753,12 @@ template rmm::device_uvector renumber_edgelist // instantiations for // template std::tuple, partition_t, int32_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -763,14 +768,14 @@ template rmm::device_uvector renumber_edgelist bool do_expensive_check); template std::tuple, partition_t, int32_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t const* local_vertices, - int32_t num_local_vertices, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + int32_t const* local_vertices, + int32_t num_local_vertices, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -784,12 +789,12 @@ template rmm::device_uvector renumber_edgelist // instantiations for // template std::tuple, partition_t, int64_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int64_t* edgelist_major_vertices /* [INOUT] */, - int64_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -799,14 +804,14 @@ template rmm::device_uvector renumber_edgelist bool do_expensive_check); template std::tuple, partition_t, int64_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int64_t const* local_vertices, - int64_t num_local_vertices, - int64_t* edgelist_major_vertices /* [INOUT] */, - int64_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + int64_t const* local_vertices, + int64_t num_local_vertices, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, From 8c309497742eb0c915ada835a6cf59fd3ef84c3d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 1 Mar 2021 10:41:26 -0500 Subject: [PATCH 07/63] compile error fixes --- cpp/include/experimental/graph_view.hpp | 8 +++ .../copy_v_transform_reduce_in_out_nbr.cuh | 21 ++---- 
cpp/src/experimental/coarsen_graph.cu | 69 +++++++++---------- 3 files changed, 47 insertions(+), 51 deletions(-) diff --git a/cpp/include/experimental/graph_view.hpp b/cpp/include/experimental/graph_view.hpp index 7f9051d1ee6..ee41be7a4cb 100644 --- a/cpp/include/experimental/graph_view.hpp +++ b/cpp/include/experimental/graph_view.hpp @@ -389,6 +389,10 @@ class graph_view_t::value); - auto comm_rank = GraphViewType::is_multi_gpu ? handle.get_comms().get_rank() : int{0}; - auto minor_tmp_buffer_size = (GraphViewType::is_multi_gpu && (in != GraphViewType::is_adj_matrix_transposed)) ? GraphViewType::is_adj_matrix_transposed @@ -378,9 +376,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, if (GraphViewType::is_multi_gpu) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto const row_comm_rank = row_comm.get_rank(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - minor_init = (row_comm_rank == 0) ? init : T {} + minor_init = (row_comm_rank == 0) ? init : T {}; } if (GraphViewType::is_multi_gpu) { @@ -401,17 +397,10 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { matrix_partition_device_t matrix_partition(graph_view, i); - auto major_tmp_buffer_size = vertex_t{0}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - - major_tmp_buffer_size = (in == GraphViewType::is_adj_matrix_transposed) - ? matrix_partition.get_major_size() - : vertex_t{0}; - } + auto major_tmp_buffer_size = + GraphViewType::is_multi_gpu && (in == GraphViewType::is_adj_matrix_transposed) + ? 
matrix_partition.get_major_size() + : vertex_t{0}; auto major_tmp_buffer = allocate_dataframe_buffer(major_tmp_buffer_size, handle.get_stream()); auto major_buffer_first = get_dataframe_buffer_begin(major_tmp_buffer); diff --git a/cpp/src/experimental/coarsen_graph.cu b/cpp/src/experimental/coarsen_graph.cu index 0e43a3b5b07..b23afebdaa7 100644 --- a/cpp/src/experimental/coarsen_graph.cu +++ b/cpp/src/experimental/coarsen_graph.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -89,7 +90,7 @@ std:: std::move(edgelist_weights)); } -template +template edge_t groupby_e_and_coarsen_edgelist(vertex_t const *edgelist_major_vertices, vertex_t const *edgelist_minor_vertices, weight_t const *edgelist_weights, @@ -102,8 +103,8 @@ edge_t groupby_e_and_coarsen_edgelist(vertex_t const *edgelist_major_vertices, if (edgelist_weights != nullptr) { thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), pair_first, - pair_first + edgelist_major_vertices.size(), - edgelist_weights.begin()); + pair_first + number_of_edges, + edgelist_weights); rmm::device_uvector tmp_edgelist_major_vertices(number_of_edges, stream); rmm::device_uvector tmp_edgelist_minor_vertices(tmp_edgelist_major_vertices.size(), @@ -134,12 +135,12 @@ edge_t groupby_e_and_coarsen_edgelist(vertex_t const *edgelist_major_vertices, } else { thrust::sort(rmm::exec_policy(stream)->on(stream), pair_first, - pair_first + edgelist_major_vertices.size()); + pair_first + number_of_edges); return static_cast( thrust::distance(pair_first, thrust::unique(rmm::exec_policy(stream)->on(stream), pair_first, - pair_first + edgelist_major_vertices.size()))); + pair_first + number_of_edges))); } } @@ -190,13 +191,13 @@ std:: edgelist_weights.data(), static_cast(edgelist_major_vertices.size()), stream); - edgelist_major_vertices.resize(number_of_edges, handle.get_stream()); - edgelist_major_vertices.shrink_to_fit(handle.get_stream()); - edgelist_minor_vertices.resize(number_of_edges, handle.get_stream()); - edgelist_minor_vertices.shrink_to_fit(handle.get_stream()); + edgelist_major_vertices.resize(number_of_edges, stream); + edgelist_major_vertices.shrink_to_fit(stream); + edgelist_minor_vertices.resize(number_of_edges, stream); + edgelist_minor_vertices.shrink_to_fit(stream); if (edgelist_weights.size() > 0) { - edgelist_weights.resize(number_of_edges, handle.get_stream()); - edgelist_weights.shrink_to_fit(handle.get_stream()); + edgelist_weights.resize(number_of_edges, stream); + edgelist_weights.shrink_to_fit(stream); } return std::make_tuple(std::move(edgelist_major_vertices), @@ -241,22 +242,17 @@ coarsen_graph( : graph_view.get_number_of_local_adj_matrix_partition_cols(), handle.get_stream()); if (store_transposed) { - // copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_major_labels.data()); copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_minor_labels.data()); } else { - // copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_major_labels.data()); copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels.data()); } - std::vector> coarsened_edgelist_major_vertices( - graph_view.get_number_of_local_adj_matrix_partitions(), - rmm::device_uvector(0, handle.get_stream())); - std::vector> coarsened_edgelist_minor_vertices( - coarsened_edgelist_major_vertices.size(), - rmm::device_uvector(0, handle.get_stream())); - std::vector> coarsened_edgelist_weights( - graph_view.is_weighted() ? 
coarsened_edgelist_major_vertices.size() : size_t{0}, - rmm::device_uvector(0, handle.get_stream())); + std::vector>> coarsened_edgelist_major_vertices( + graph_view.get_number_of_local_adj_matrix_partitions()); + std::vector>> coarsened_edgelist_minor_vertices( + coarsened_edgelist_major_vertices.size()); + std::vector>> coarsened_edgelist_weights( + graph_view.is_weighted() ? coarsened_edgelist_major_vertices.size() : size_t{0}); // FIXME: we may compare performance/memory footprint with the hash_based approach especially when // cuco::dynamic_map becomes available (so we don't need to preallocate memory assuming the worst // case). We may be able to limit the memory requirement close to the final coarsened edgelist @@ -269,7 +265,7 @@ coarsen_graph( : graph_view.get_number_of_local_adj_matrix_partition_rows(i), handle.get_stream()); device_bcast( - col_comm, labels, major_labels.data(), major_labels.size(), i, handle.get_stream()); + col_comm, labels, major_labels.data(), major_labels.size(), static_cast(i), handle.get_stream()); rmm::device_uvector edgelist_major_vertices(0, handle.get_stream()); rmm::device_uvector edgelist_minor_vertices(0, handle.get_stream()); @@ -308,7 +304,7 @@ coarsen_graph( groupby_gpuid_and_shuffle_values( handle.get_comms(), edge_first, - edge_first + coarsened_edgelist_major_vertices.size(), + edge_first + edgelist_major_vertices.size(), [key_func = detail::compute_gpu_id_from_edge_t{ comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) { @@ -323,7 +319,7 @@ coarsen_graph( groupby_gpuid_and_shuffle_values( handle.get_comms(), edge_first, - edge_first + coarsened_edgelist_major_vertices.size(), + edge_first + edgelist_major_vertices.size(), [key_func = detail::compute_gpu_id_from_edge_t{ comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) { @@ -391,16 +387,19 @@ coarsen_graph( thrust::get<1>(last), thrust::make_transform_iterator( partition_ids.begin(), - [comm_size] __device__(auto val) { + [comm_size] __device__(auto id) { return id / comm_size; // global partition id to local partition id }), tmps.begin()); displacements = std::move(tmps); } - thrust::excludsive_scan(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::exclusive_scan(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), displacements.begin(), displacements.end(), displacements.begin()); + std::vector h_displacements(displacements.size()); + raft::update_host(h_displacements.data(), displacements.data(), displacements.size(), handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); for (size_t j = 0; j < col_comm_size; ++j) { auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( @@ -459,15 +458,15 @@ coarsen_graph( groupby_e_and_coarsen_edgelist(coarsened_edgelist_major_vertices[i], coarsened_edgelist_minor_vertices[i], coarsened_edgelist_weights[i], - coarsened_edgelist_major_vertices[i].size(), + static_cast(coarsened_edgelist_major_vertices[i].size()), handle.get_stream()); - coarsened_edgelist_major_vertices[j].resize(number_of_partition_edges, handle.get_stream()); - coarsened_edgelist_major_vertices[j].shrink_to_fit(handle.get_stream()); - coarsened_edgelist_minor_vertices[j].resize(number_of_partition_edges, handle.get_stream()); - coarsened_edgelist_minor_vertices[j].shrink_to_fit(handle.get_stream()); - if (edgelist_weights.size() > 0) { - coarsened_edgelist_weights[j].resize(number_of_partition_edges, handle.get_stream()); - 
coarsened_edgelist_weights[j].shrink_to_fit(handle.get_stream()); + coarsened_edgelist_major_vertices[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_major_vertices[i].shrink_to_fit(handle.get_stream()); + coarsened_edgelist_minor_vertices[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_minor_vertices[i].shrink_to_fit(handle.get_stream()); + if (coarsened_edgelist_weights.size() > 0) { + coarsened_edgelist_weights[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_weights[i].shrink_to_fit(handle.get_stream()); } } From 3ddff4cad22e28e9d0418c184b49164b1b130964 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 6 Mar 2021 00:18:42 -0500 Subject: [PATCH 08/63] fix compile errors --- cpp/src/experimental/coarsen_graph.cu | 131 +++++---- cpp/src/experimental/renumber_edgelist.cu | 307 +++++++++++++--------- 2 files changed, 255 insertions(+), 183 deletions(-) diff --git a/cpp/src/experimental/coarsen_graph.cu b/cpp/src/experimental/coarsen_graph.cu index b23afebdaa7..3a8aa09c28a 100644 --- a/cpp/src/experimental/coarsen_graph.cu +++ b/cpp/src/experimental/coarsen_graph.cu @@ -91,9 +91,9 @@ std:: } template -edge_t groupby_e_and_coarsen_edgelist(vertex_t const *edgelist_major_vertices, - vertex_t const *edgelist_minor_vertices, - weight_t const *edgelist_weights, +edge_t groupby_e_and_coarsen_edgelist(vertex_t *edgelist_major_vertices /* [INOUT] */, + vertex_t *edgelist_minor_vertices /* [INOUT] */, + weight_t *edgelist_weights /* [INOUT] */, edge_t number_of_edges, cudaStream_t stream) { @@ -133,14 +133,11 @@ edge_t groupby_e_and_coarsen_edgelist(vertex_t const *edgelist_major_vertices, return ret; } else { - thrust::sort(rmm::exec_policy(stream)->on(stream), - pair_first, - pair_first + number_of_edges); - return static_cast( - thrust::distance(pair_first, - thrust::unique(rmm::exec_policy(stream)->on(stream), - pair_first, - pair_first + number_of_edges))); + thrust::sort(rmm::exec_policy(stream)->on(stream), pair_first, pair_first + number_of_edges); + return static_cast(thrust::distance( + pair_first, + thrust::unique( + rmm::exec_policy(stream)->on(stream), pair_first, pair_first + number_of_edges))); } } @@ -247,12 +244,20 @@ coarsen_graph( copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels.data()); } - std::vector>> coarsened_edgelist_major_vertices( - graph_view.get_number_of_local_adj_matrix_partitions()); - std::vector>> coarsened_edgelist_minor_vertices( - coarsened_edgelist_major_vertices.size()); - std::vector>> coarsened_edgelist_weights( + std::vector> coarsened_edgelist_major_vertices{}; + std::vector> coarsened_edgelist_minor_vertices{}; + std::vector> coarsened_edgelist_weights{}; + coarsened_edgelist_major_vertices.reserve(graph_view.get_number_of_local_adj_matrix_partitions()); + coarsened_edgelist_minor_vertices.reserve(coarsened_edgelist_major_vertices.size()); + coarsened_edgelist_weights.reserve( graph_view.is_weighted() ? 
coarsened_edgelist_major_vertices.size() : size_t{0});
+  for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) {
+    coarsened_edgelist_major_vertices.emplace_back(0, handle.get_stream());
+    coarsened_edgelist_minor_vertices.emplace_back(0, handle.get_stream());
+    if (graph_view.is_weighted()) {
+      coarsened_edgelist_weights.emplace_back(0, handle.get_stream());
+    }
+  }
   // FIXME: we may compare performance/memory footprint with the hash_based approach especially when
   // cuco::dynamic_map becomes available (so we don't need to preallocate memory assuming the worst
   // case). We may be able to limit the memory requirement close to the final coarsened edgelist
@@ -264,8 +269,18 @@ coarsen_graph(
         store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols(i)
                          : graph_view.get_number_of_local_adj_matrix_partition_rows(i),
       handle.get_stream());
-    device_bcast(
-      col_comm, labels, major_labels.data(), major_labels.size(), static_cast<int>(i), handle.get_stream());
+    // FIXME: this copy is unnecessary; better to fix RAFT comm's bcast to take const iterators
+    // for input
+    thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                 labels,
+                 labels + major_labels.size(),
+                 major_labels.begin());
+    device_bcast(col_comm,
+                 major_labels.data(),
+                 major_labels.data(),
+                 major_labels.size(),
+                 static_cast<int>(i),
+                 handle.get_stream());
 
     rmm::device_uvector<vertex_t> edgelist_major_vertices(0, handle.get_stream());
     rmm::device_uvector<vertex_t> edgelist_minor_vertices(0, handle.get_stream());
@@ -276,7 +291,7 @@ coarsen_graph(
       graph_view.indices(i),
       graph_view.weights(i),
       major_labels.data(),
-      adj_matrix_minor_labels.begin(),
+      adj_matrix_minor_labels.data(),
       store_transposed ? graph_view.get_local_adj_matrix_partition_col_first(i)
                        : graph_view.get_local_adj_matrix_partition_row_first(i),
       store_transposed ? graph_view.get_local_adj_matrix_partition_col_last(i)
                        : graph_view.get_local_adj_matrix_partition_row_last(i),
@@ -340,26 +355,25 @@ coarsen_graph(
     // FIXME: we can skip this if groupby_gpuid_and_shuffle_values is updated to return sorted edge
     // list based on the final matrix partition (maybe add
    // groupby_adj_matrix_partition_and_shuffle_values).
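    // [Editorial sketch, not part of this patch] The grouping performed below is the
    // pattern "sort edges by partition id, then count edges per partition". A minimal,
    // self-contained host-side analogue of that pattern follows; edge_to_group_id is a
    // hypothetical stand-in for the real detail::compute_partition_id_from_edge_t.

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

inline int edge_to_group_id(std::pair<int, int> const& e, int num_groups)
{
  return (e.first + e.second) % num_groups;  // placeholder hash, not the real partitioning
}

inline std::vector<std::size_t> groupby_and_count_host(std::vector<std::pair<int, int>>& edges,
                                                       int num_groups)
{
  // sort so that edges belonging to the same partition become contiguous
  std::sort(edges.begin(), edges.end(), [num_groups](auto const& lhs, auto const& rhs) {
    return edge_to_group_id(lhs, num_groups) < edge_to_group_id(rhs, num_groups);
  });
  std::vector<std::size_t> counts(num_groups, 0);
  for (auto const& e : edges) { ++counts[edge_to_group_id(e, num_groups)]; }
  return counts;  // an exclusive scan of counts yields per-partition displacements
}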
auto key_func = detail::compute_partition_id_from_edge_t{ comm.get_size(), row_comm.get_size(), col_comm.get_size()}; + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); if (graph_view.is_weighted()) { - auto edge_first = - thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(), - edgelist_minor_vertices.begin(), - edgelist_weights.begin())); thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edge_first, - edge_first + edgelist_major_vertices.size(), - [key_func] __device__(auto val) { - return key_func(thrust::get<0>(val), thrust::get<1>(val)); + pair_first, + pair_first + edgelist_major_vertices.size(), + edgelist_weights.begin(), + [key_func] __device__(auto lhs, auto rhs) { + return key_func(thrust::get<0>(lhs), thrust::get<1>(lhs)) < + key_func(thrust::get<0>(rhs), thrust::get<1>(rhs)); }); } else { - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); - thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edge_first, - edge_first + edgelist_major_vertices.size(), - [key_func] __device__(auto val) { - return key_func(thrust::get<0>(val), thrust::get<1>(val)); - }); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + pair_first, + pair_first + edgelist_major_vertices.size(), + [key_func] __device__(auto lhs, auto rhs) { + return key_func(thrust::get<0>(lhs), thrust::get<1>(lhs)) < + key_func(thrust::get<0>(rhs), thrust::get<1>(rhs)); + }); } auto partition_id_first = thrust::make_transform_iterator( thrust::make_zip_iterator( @@ -377,7 +391,8 @@ coarsen_graph( thrust::make_constant_iterator(edge_t{1}), partition_ids.begin(), displacements.begin()); - if (thrust::distance(partition_ids.begin(), thrust::get<0>(last)) < partition_ids.size()) { + if (static_cast(thrust::distance(partition_ids.begin(), thrust::get<0>(last))) < + partition_ids.size()) { rmm::device_uvector tmps(displacements.size(), handle.get_stream()); thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), displacements.begin(), @@ -394,19 +409,20 @@ coarsen_graph( displacements = std::move(tmps); } thrust::exclusive_scan(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - displacements.begin(), - displacements.end(), - displacements.begin()); + displacements.begin(), + displacements.end(), + displacements.begin()); std::vector h_displacements(displacements.size()); - raft::update_host(h_displacements.data(), displacements.data(), displacements.size(), handle.get_stream()); + raft::update_host( + h_displacements.data(), displacements.data(), displacements.size(), handle.get_stream()); CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - for (size_t j = 0; j < col_comm_size; ++j) { + for (int j = 0; j < col_comm_size; ++j) { auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( edgelist_major_vertices.begin() + h_displacements[j], - edgelist_minor_vertices + h_displacements[j], + edgelist_minor_vertices.begin() + h_displacements[j], graph_view.is_weighted() ? 
edgelist_weights.begin() + h_displacements[j] - : static_cast(nullptr), + : static_cast(nullptr), h_displacements[j + 1] - h_displacements[j], handle.get_stream()); @@ -414,20 +430,19 @@ coarsen_graph( // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we // can reserve address space to avoid expensive reallocation. // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management - coarsened_edgelist_major_vertices[j].resize(cur_size + edgelist_major_vertices.size(), + coarsened_edgelist_major_vertices[j].resize(cur_size + number_of_partition_edges, handle.get_stream()); coarsened_edgelist_minor_vertices[j].resize(coarsened_edgelist_major_vertices.size(), handle.get_stream()); - coarsened_edgelist_weights[j].resize( - graph_view.is_weighted() ? coarsened_edgelist_major_vertices.size() : 0, - handle.get_stream()); - if (graph_view.is_weighted()) { + coarsened_edgelist_weights[j].resize(coarsened_edgelist_major_vertices.size(), + handle.get_stream()); + auto src_edge_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin(), edgelist_weights.begin())) + - displacements[j]; + h_displacements[j]; auto dst_edge_first = thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(), coarsened_edgelist_minor_vertices[j].begin(), @@ -440,7 +455,7 @@ coarsen_graph( } else { auto src_edge_first = thrust::make_zip_iterator(thrust::make_tuple( edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())) + - displacements[j]; + h_displacements[j]; auto dst_edge_first = thrust::make_zip_iterator( thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(), coarsened_edgelist_minor_vertices[j].begin())) + @@ -454,12 +469,13 @@ coarsen_graph( } for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { - auto number_of_partition_edges = - groupby_e_and_coarsen_edgelist(coarsened_edgelist_major_vertices[i], - coarsened_edgelist_minor_vertices[i], - coarsened_edgelist_weights[i], - static_cast(coarsened_edgelist_major_vertices[i].size()), - handle.get_stream()); + auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( + coarsened_edgelist_major_vertices[i].data(), + coarsened_edgelist_minor_vertices[i].data(), + graph_view.is_weighted() ? coarsened_edgelist_weights[i].data() + : static_cast(nullptr), + static_cast(coarsened_edgelist_major_vertices[i].size()), + handle.get_stream()); coarsened_edgelist_major_vertices[i].resize(number_of_partition_edges, handle.get_stream()); coarsened_edgelist_major_vertices[i].shrink_to_fit(handle.get_stream()); coarsened_edgelist_minor_vertices[i].resize(number_of_partition_edges, handle.get_stream()); @@ -547,7 +563,8 @@ coarsen_graph( : coarsened_edgelist_major_vertices[i].data(); edgelists[i].p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices[i].data() : coarsened_edgelist_minor_vertices[i].data(); - edgelists[i].p_edge_weights = coarsened_edgelist_weights[i].data(); + edgelists[i].p_edge_weights = graph_view.is_weighted() ? 
coarsened_edgelist_weights[i].data() + : static_cast(nullptr); edgelists[i].number_of_edges = static_cast(coarsened_edgelist_major_vertices[i].size()); } diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu index c77936b607a..c619084f54f 100644 --- a/cpp/src/experimental/renumber_edgelist.cu +++ b/cpp/src/experimental/renumber_edgelist.cu @@ -62,8 +62,7 @@ rmm::device_uvector compute_renumber_map( rmm::device_uvector major_labels(0, handle.get_stream()); rmm::device_uvector major_counts(0, handle.get_stream()); for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { - rmm::device_uvector sorted_major_labels(edgelist_edge_counts[i].size(), - handle.get_stream()); + rmm::device_uvector sorted_major_labels(edgelist_edge_counts[i], handle.get_stream()); thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), edgelist_major_vertices[i], edgelist_major_vertices[i] + edgelist_edge_counts[i], @@ -71,18 +70,19 @@ rmm::device_uvector compute_renumber_map( thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), sorted_major_labels.begin(), sorted_major_labels.end()); - major_labels.resize(sorted_major_labels.size(), handle.get_stream()); - major_counts.resize(major_labels.size(), handle.get_stream()); + rmm::device_uvector tmp_major_labels(sorted_major_labels.size(), handle.get_stream()); + rmm::device_uvector tmp_major_counts(tmp_major_labels.size(), handle.get_stream()); auto major_pair_it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), sorted_major_labels.begin(), sorted_major_labels.end(), thrust::make_constant_iterator(edge_t{1}), - major_labels.begin(), - major_counts.begin()); - major_labels.resize(thrust::distance(tmp_major_labels.begin(), thrust::get<0>(major_pair_it)), - handle.get_stream()); - major_counts.resize(tmp_major_labels.size(), handle.get_stream()); + tmp_major_labels.begin(), + tmp_major_counts.begin()); + tmp_major_labels.resize( + thrust::distance(tmp_major_labels.begin(), thrust::get<0>(major_pair_it)), + handle.get_stream()); + tmp_major_counts.resize(tmp_major_labels.size(), handle.get_stream()); if (multi_gpu) { auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); @@ -91,60 +91,60 @@ rmm::device_uvector compute_renumber_map( rmm::device_uvector rx_major_labels(0, handle.get_stream()); rmm::device_uvector rx_major_counts(0, handle.get_stream()); - auto rx_sizes = - host_scalar_gather(col_comm, major_labels.size(), static_cast(i), handle.get_stream()); + auto rx_sizes = host_scalar_gather( + col_comm, tmp_major_labels.size(), static_cast(i), handle.get_stream()); std::vector rx_displs{}; - if (static_cast(i) == col_comm_rank) { + if (static_cast(i) == col_comm_rank) { rx_displs.assign(col_comm_size, size_t{0}); std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); rx_major_labels.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); rx_major_counts.resize(major_labels.size(), handle.get_stream()); } - device_gatherv( - col_comm, - thrust::make_zip_iterator(thrust::make_tuple(major_labels.begin(), major_counts.begin())), - thrust::make_zip_iterator( - thrust::make_tuple(rx_major_labels.begin(), rx_major_counts.begin())), - major_labels.size(), - rx_sizes, - rx_displs, - static_cast(i), - handle.get_stream()); - major_labels = std::move(rx_major_labels); - major_counts = std::move(rx_major_counts); + device_gatherv(col_comm, + thrust::make_zip_iterator( + 
thrust::make_tuple(tmp_major_labels.begin(), tmp_major_counts.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(rx_major_labels.begin(), rx_major_counts.begin())), + tmp_major_labels.size(), + rx_sizes, + rx_displs, + static_cast(i), + handle.get_stream()); + if (static_cast(i) == col_comm_rank) { + thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_major_labels.begin(), + rx_major_labels.end(), + rx_major_counts.begin()); + major_labels.resize(rx_major_labels.size(), handle.get_stream()); + major_counts.resize(major_labels.size(), handle.get_stream()); + auto pair_it = + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_major_labels.begin(), + rx_major_labels.end(), + rx_major_counts.begin(), + major_labels.begin(), + major_counts.begin()); + major_labels.resize(thrust::distance(major_labels.begin(), thrust::get<0>(pair_it)), + handle.get_stream()); + major_counts.resize(major_labels.size(), handle.get_stream()); + major_labels.shrink_to_fit(handle.get_stream()); + major_counts.shrink_to_fit(handle.get_stream()); + } } else { - major_labels.shrink_to_fit(handle.get_stream()); - major_counts.shrink_to_fit(handle.get_stream()); + tmp_major_labels.shrink_to_fit(handle.get_stream()); + tmp_major_counts.shrink_to_fit(handle.get_stream()); + major_labels = std::move(tmp_major_labels); + major_counts = std::move(tmp_major_counts); } } - if (multi_gpu) { - thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - major_labels.begin(), - major_labels.end(), - major_counts.begin()); - rmm::device_uvector tmp_labels(major_labels.size(), handle.get_stream()); - rmm::device_uvector tmp_counts(tmp_labels.size(), handle.get_stream()); - pair_it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - major_labels.begin(), - major_labels.end(), - major_counts.begin(), - tmp_labels.begin(), - tmp_counts.begin()); - tmp_labels.resize(thrust::distance(tmp_labels.begin(), thrust::get<0>(pair_it)), - handle.get_stream()); - tmp_counts.resize(tmp_labels.size(), handle.get_stream()); - - major_labels = std::move(tmp_labels); - major_counts = std::move(tmp_counts); - } - // 2. 
acquire unique minor labels std::vector minor_displs(edgelist_minor_vertices.size(), edge_t{0}); std::partial_sum( edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1); - rmm::device_uvector minor_labels(minor_displs.back(), handle.get_stream()); + rmm::device_uvector minor_labels(minor_displs.back() + edgelist_edge_counts.back(), + handle.get_stream()); for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) { thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), edgelist_minor_vertices[i], @@ -161,19 +161,30 @@ rmm::device_uvector compute_renumber_map( minor_labels.end())), handle.get_stream()); if (multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + rmm::device_uvector rx_minor_labels(0, handle.get_stream()); std::tie(rx_minor_labels, std::ignore) = groupby_gpuid_and_shuffle_values( - comm, + row_comm, minor_labels.begin(), minor_labels.end(), - [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}] __device__(auto val) { - return key_func(val); - }, + [key_func = detail::compute_gpu_id_from_vertex_t{row_comm_size}] __device__( + auto val) { return key_func(val); }, + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_minor_labels.begin(), + rx_minor_labels.end()); + rx_minor_labels.resize( + thrust::distance( + rx_minor_labels.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_minor_labels.begin(), + rx_minor_labels.end())), handle.get_stream()); minor_labels = std::move(rx_minor_labels); - } else { - minor_labels.shrink_to_fit(handle.get_stream()); } + minor_labels.shrink_to_fit(handle.get_stream()); // 3. merge major and minor labels and vertex labels @@ -267,7 +278,7 @@ void expensive_check_edgelist( vertex_t num_local_vertices /* relevant only if local_vertices != nullptr */, std::vector const& edgelist_major_vertices, std::vector const& edgelist_minor_vertices, - edge_t num_edgelist_edges) + std::vector const& edgelist_edge_counts) { rmm::device_uvector sorted_local_vertices( local_vertices != nullptr ? 
num_local_vertices : vertex_t{0}, handle.get_stream()); @@ -278,12 +289,12 @@ void expensive_check_edgelist( thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), sorted_local_vertices.begin(), sorted_local_vertices.end()); - CUGRAPH_EXPECTS( - thrust::distance(sorted_local_vertices.begin(), - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - sorted_local_vertices.begin(), - sorted_local_vertices.end())) == sorted_local_vertices.size(), - "Invalid input argument: local_vertices should not have duplicates."); + CUGRAPH_EXPECTS(static_cast(thrust::distance( + sorted_local_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_local_vertices.begin(), + sorted_local_vertices.end()))) == sorted_local_vertices.size(), + "Invalid input argument: local_vertices should not have duplicates."); if (multi_gpu) { auto& comm = handle.get_comms(); @@ -291,11 +302,13 @@ void expensive_check_edgelist( auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto const row_comm_size = row_comm.get_size(); + auto const row_comm_rank = row_comm.get_rank(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); + auto const col_comm_rank = col_comm.get_rank(); CUGRAPH_EXPECTS((edgelist_major_vertices.size() == edgelist_minor_vertices.size()) && - (edgelist_major_vertices.size() == col_comm_size), + (edgelist_major_vertices.size() == static_cast(col_comm_size)), "Invalid input argument: both edgelist_major_vertices.size() & " "edgelist_minor_vertices.size() should coincide with col_comm_size."); @@ -318,8 +331,12 @@ void expensive_check_edgelist( thrust::count_if( rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), edge_first, - edge_first + num_edgelist_edges, - [comm_rank, + edge_first + edgelist_edge_counts[i], + [comm_size, + comm_rank, + row_comm_rank, + col_comm_size, + col_comm_rank, i, gpu_id_key_func = detail::compute_gpu_id_from_edge_t{comm_size, row_comm_size, col_comm_size}, @@ -327,56 +344,75 @@ void expensive_check_edgelist( detail::compute_partition_id_from_edge_t{ comm_size, row_comm_size, col_comm_size}] __device__(auto edge) { return (gpu_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != comm_rank) || - (partition_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != + (partition_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != row_comm_rank * col_comm_size + col_comm_rank + i * comm_size); }) == 0, "Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be " "pre-shuffled."); if (local_vertices != nullptr) { - rmm::device_uvector unique_edge_vertices(num_edgelist_edges * 2, - handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - unique_edge_vertices.begin()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_minor_vertices, - edgelist_minor_vertices + num_edgelist_edges, - unique_edge_vertices.begin() + num_edgelist_edges); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_edge_vertices.begin(), - unique_edge_vertices.end()); - unique_edge_vertices.resize( - thrust::distance( - unique_edge_vertices.begin(), - 
thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_edge_vertices.begin(), - unique_edge_vertices.end())), - handle.get_stream()); - - rmm::device_uvector rx_unique_edge_vertices(0, handle.get_stream()); - std::tie(rx_unique_edge_vertices, std::ignore) = groupby_gpuid_and_shuffle_values( - handle.get_comms(), - unique_edge_vertices.begin(), - unique_edge_vertices.end(), - [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}] __device__( - auto val) { return key_func(val); }, - handle.get_stream()); - - unique_edge_vertices = std::move(rx_unique_edge_vertices); - + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + + rmm::device_uvector sorted_major_vertices(0, handle.get_stream()); + { + auto recvcounts = + host_scalar_allgather(col_comm, sorted_local_vertices.size(), handle.get_stream()); + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + sorted_major_vertices.resize(displacements.back() + recvcounts.back(), + handle.get_stream()); + device_allgatherv(col_comm, + sorted_local_vertices.data(), + sorted_major_vertices.data(), + recvcounts, + displacements, + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_major_vertices.begin(), + sorted_major_vertices.end()); + } + + rmm::device_uvector sorted_minor_vertices(0, handle.get_stream()); + { + auto recvcounts = + host_scalar_allgather(row_comm, sorted_local_vertices.size(), handle.get_stream()); + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + sorted_minor_vertices.resize(displacements.back() + recvcounts.back(), + handle.get_stream()); + device_allgatherv(row_comm, + sorted_local_vertices.data(), + sorted_minor_vertices.data(), + recvcounts, + displacements, + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_minor_vertices.begin(), + sorted_minor_vertices.end()); + } + + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i])); CUGRAPH_EXPECTS( thrust::count_if( rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_edge_vertices.begin(), - unique_edge_vertices.end(), - [num_local_vertices, - sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) { - return !thrust::binary_search( - thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v); + edge_first, + edge_first + edgelist_edge_counts[i], + [num_major_vertices = static_cast(sorted_major_vertices.size()), + sorted_major_vertices = sorted_major_vertices.data(), + num_minor_vertices = static_cast(sorted_minor_vertices.size()), + sorted_minor_vertices = sorted_minor_vertices.data()] __device__(auto e) { + return !thrust::binary_search(thrust::seq, + sorted_major_vertices, + sorted_major_vertices + num_major_vertices, + thrust::get<0>(e)) || + !thrust::binary_search(thrust::seq, + sorted_minor_vertices, + sorted_minor_vertices + num_minor_vertices, + thrust::get<1>(e)); }) == 0, - "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " + "Invalid input argument: edgelist_major_vertices and/or edgelist_mior_vertices have " "invalid vertex ID(s)."); } } @@ 
-388,8 +424,8 @@ void expensive_check_edgelist( CUGRAPH_EXPECTS( thrust::count_if( rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, + edgelist_major_vertices[0], + edgelist_major_vertices[0] + edgelist_edge_counts[0], [num_local_vertices, sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) { return !thrust::binary_search( @@ -397,17 +433,25 @@ void expensive_check_edgelist( }) == 0, "Invalid input argument: edgelist_major_vertices has invalid vertex ID(s)."); + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[0], edgelist_minor_vertices[0])); CUGRAPH_EXPECTS( - thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - [num_local_vertices, - sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) { - return !thrust::binary_search( - thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v); - }) == 0, - "Invalid input argument: edgelist_major_vertices has invalid vertex ID(s)."); + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + edgelist_edge_counts[0], + [num_local_vertices, + sorted_local_vertices = sorted_local_vertices.data()] __device__(auto e) { + return !thrust::binary_search(thrust::seq, + sorted_local_vertices, + sorted_local_vertices + num_local_vertices, + thrust::get<0>(e)) || + !thrust::binary_search(thrust::seq, + sorted_local_vertices, + sorted_local_vertices + num_local_vertices, + thrust::get<1>(e)); + }) == 0, + "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " + "invalid vertex ID(s)."); } } } @@ -440,12 +484,19 @@ renumber_edgelist(raft::handle_t const& handle, auto const col_comm_size = col_comm.get_size(); auto const col_comm_rank = col_comm.get_rank(); + std::vector edgelist_const_major_vertices(edgelist_major_vertices.size()); + std::vector edgelist_const_minor_vertices(edgelist_const_major_vertices.size()); + for (size_t i = 0; i < edgelist_const_major_vertices.size(); ++i) { + edgelist_const_major_vertices[i] = edgelist_major_vertices[i]; + edgelist_const_minor_vertices[i] = edgelist_minor_vertices[i]; + } + if (do_expensive_check) { expensive_check_edgelist(handle, local_vertices, num_local_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, + edgelist_const_major_vertices, + edgelist_const_minor_vertices, edgelist_edge_counts); } @@ -455,8 +506,8 @@ renumber_edgelist(raft::handle_t const& handle, detail::compute_renumber_map(handle, local_vertices, num_local_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, + edgelist_const_major_vertices, + edgelist_const_minor_vertices, edgelist_edge_counts); // 2. initialize partition_t object, number_of_vertices, and number_of_edges for the coarsened @@ -472,7 +523,10 @@ renumber_edgelist(raft::handle_t const& handle, vertex_partition_offsets, row_comm_size, col_comm_size, row_comm_rank, col_comm_rank); auto number_of_vertices = vertex_partition_offsets.back(); - auto number_of_edges = host_scalar_allreduce(comm, num_edgelist_edges, handle.get_stream()); + auto number_of_edges = host_scalar_allreduce( + comm, + std::accumulate(edgelist_edge_counts.begin(), edgelist_edge_counts.end(), edge_t{0}), + handle.get_stream()); // 3. 
renumber edges @@ -542,9 +596,11 @@ renumber_edgelist(raft::handle_t const& handle, return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); renumber_map.insert(pair_first, pair_first + renumber_map_minor_labels.size()); - renumber_map.find(edgelist_minor_vertices, - edgelist_minor_vertices + num_edgelist_edges, - edgelist_minor_vertices); + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + renumber_map.find(edgelist_minor_vertices[i], + edgelist_minor_vertices[i] + edgelist_edge_counts[i], + edgelist_minor_vertices[i]); + } } return std::make_tuple( @@ -580,8 +636,7 @@ std::enable_if_t> renumber_edgelist( num_vertices, std::vector{edgelist_major_vertices}, std::vector{edgelist_minor_vertices}, - std::vertex_t{num_edgelist_edges}, - false); + std::vector{num_edgelist_edges}); } auto renumber_map_labels = detail::compute_renumber_map( From a71b0455f30d03d228abe855ab11bfd88d537ec3 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 8 Mar 2021 11:05:32 -0500 Subject: [PATCH 09/63] refactor groupby and count based on key_to_id_op --- cpp/include/utilities/shuffle_comm.cuh | 177 +++++++++++++------------ cpp/src/experimental/coarsen_graph.cu | 81 ++++------- 2 files changed, 115 insertions(+), 143 deletions(-) diff --git a/cpp/include/utilities/shuffle_comm.cuh b/cpp/include/utilities/shuffle_comm.cuh index b033b38caf0..d55e782df72 100644 --- a/cpp/include/utilities/shuffle_comm.cuh +++ b/cpp/include/utilities/shuffle_comm.cuh @@ -22,6 +22,12 @@ #include #include +#include +#include +#include +#include +#include + #include #include #include @@ -31,89 +37,6 @@ namespace experimental { namespace detail { -template -rmm::device_uvector groupby_and_count(raft::comms::comms_t const &comm, - ValueIterator tx_value_first /* [INOUT */, - ValueIterator tx_value_last /* [INOUT */, - ValueToGPUIdOp value_to_gpu_id_op, - cudaStream_t stream) -{ - auto const comm_size = comm.get_size(); - - thrust::sort(rmm::exec_policy(stream)->on(stream), - tx_value_first, - tx_value_last, - [value_to_gpu_id_op] __device__(auto lhs, auto rhs) { - return value_to_gpu_id_op(lhs) < value_to_gpu_id_op(rhs); - }); - - auto gpu_id_first = thrust::make_transform_iterator( - tx_value_first, - [value_to_gpu_id_op] __device__(auto value) { return value_to_gpu_id_op(value); }); - rmm::device_uvector d_tx_dst_ranks(comm_size, stream); - rmm::device_uvector d_tx_value_counts(comm_size, stream); - auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), - gpu_id_first, - gpu_id_first + thrust::distance(tx_value_first, tx_value_last), - thrust::make_constant_iterator(size_t{1}), - d_tx_dst_ranks.begin(), - d_tx_value_counts.begin()); - if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < comm_size) { - rmm::device_uvector d_counts(comm_size, stream); - thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); - thrust::scatter(rmm::exec_policy(stream)->on(stream), - d_tx_value_counts.begin(), - thrust::get<1>(last), - d_tx_dst_ranks.begin(), - d_counts.begin()); - d_tx_value_counts = std::move(d_counts); - } - - return d_tx_value_counts; -} - -template -rmm::device_uvector groupby_and_count(raft::comms::comms_t const &comm, - VertexIterator tx_key_first /* [INOUT */, - VertexIterator tx_key_last /* [INOUT */, - ValueIterator tx_value_first /* [INOUT */, - KeyToGPUIdOp key_to_gpu_id_op, - cudaStream_t stream) -{ - auto const comm_size = comm.get_size(); - - thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), - tx_key_first, - 
tx_key_last, - tx_value_first, - [key_to_gpu_id_op] __device__(auto lhs, auto rhs) { - return key_to_gpu_id_op(lhs) < key_to_gpu_id_op(rhs); - }); - - auto gpu_id_first = thrust::make_transform_iterator( - tx_key_first, [key_to_gpu_id_op] __device__(auto key) { return key_to_gpu_id_op(key); }); - rmm::device_uvector d_tx_dst_ranks(comm_size, stream); - rmm::device_uvector d_tx_value_counts(comm_size, stream); - auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), - gpu_id_first, - gpu_id_first + thrust::distance(tx_key_first, tx_key_last), - thrust::make_constant_iterator(size_t{1}), - d_tx_dst_ranks.begin(), - d_tx_value_counts.begin()); - if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < comm_size) { - rmm::device_uvector d_counts(comm_size, stream); - thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); - thrust::scatter(rmm::exec_policy(stream)->on(stream), - d_tx_value_counts.begin(), - thrust::get<1>(last), - d_tx_dst_ranks.begin(), - d_counts.begin()); - d_tx_value_counts = std::move(d_counts); - } - - return d_tx_value_counts; -} - // inline to suppress a complaint about ODR violation inline std::tuple, std::vector, @@ -187,6 +110,86 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const &comm, } // namespace detail +template +rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [INOUT */, + ValueIterator tx_value_last /* [INOUT */, + ValueToGPUIdOp value_to_group_id_op, + int num_groups, + cudaStream_t stream) +{ + thrust::sort(rmm::exec_policy(stream)->on(stream), + tx_value_first, + tx_value_last, + [value_to_group_id_op] __device__(auto lhs, auto rhs) { + return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); + }); + + auto group_id_first = thrust::make_transform_iterator( + tx_value_first, + [value_to_group_id_op] __device__(auto value) { return value_to_group_id_op(value); }); + rmm::device_uvector d_tx_dst_ranks(num_groups, stream); + rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream); + auto last = + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + group_id_first, + group_id_first + thrust::distance(tx_value_first, tx_value_last), + thrust::make_constant_iterator(size_t{1}), + d_tx_dst_ranks.begin(), + d_tx_value_counts.begin()); + if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) { + rmm::device_uvector d_counts(num_groups, stream); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); + thrust::scatter(rmm::exec_policy(stream)->on(stream), + d_tx_value_counts.begin(), + thrust::get<1>(last), + d_tx_dst_ranks.begin(), + d_counts.begin()); + d_tx_value_counts = std::move(d_counts); + } + + return d_tx_value_counts; +} + +template +rmm::device_uvector groupby_and_count(VertexIterator tx_key_first /* [INOUT */, + VertexIterator tx_key_last /* [INOUT */, + ValueIterator tx_value_first /* [INOUT */, + KeyToGPUIdOp key_to_group_id_op, + int num_groups, + cudaStream_t stream) +{ + thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), + tx_key_first, + tx_key_last, + tx_value_first, + [key_to_group_id_op] __device__(auto lhs, auto rhs) { + return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); + }); + + auto group_id_first = thrust::make_transform_iterator( + tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); }); + rmm::device_uvector d_tx_dst_ranks(num_groups, stream); + rmm::device_uvector 
d_tx_value_counts(d_tx_dst_ranks.size(), stream);
+  auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream),
+                                    group_id_first,
+                                    group_id_first + thrust::distance(tx_key_first, tx_key_last),
+                                    thrust::make_constant_iterator(size_t{1}),
+                                    d_tx_dst_ranks.begin(),
+                                    d_tx_value_counts.begin());
+  if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) {
+    rmm::device_uvector<size_t> d_counts(num_groups, stream);
+    thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0});
+    thrust::scatter(rmm::exec_policy(stream)->on(stream),
+                    d_tx_value_counts.begin(),
+                    thrust::get<1>(last),
+                    d_tx_dst_ranks.begin(),
+                    d_counts.begin());
+    d_tx_value_counts = std::move(d_counts);
+  }
+
+  return d_tx_value_counts;
+}
+
 template <typename TxValueIterator>
 auto shuffle_values(raft::comms::comms_t const &comm,
                     TxValueIterator tx_value_first,
@@ -240,8 +243,8 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const &comm,
 {
   auto const comm_size = comm.get_size();
 
-  auto d_tx_value_counts =
-    detail::groupby_and_count(comm, tx_value_first, tx_value_last, value_to_gpu_id_op, stream);
+  auto d_tx_value_counts = groupby_and_count(
+    tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream);
 
   std::vector<size_t> tx_counts{};
   std::vector<size_t> tx_offsets{};
@@ -282,8 +285,8 @@ auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const &comm,
                                         KeyToGPUIdOp key_to_gpu_id_op,
                                         cudaStream_t stream)
 {
-  auto d_tx_value_counts = detail::groupby_and_count(
-    comm, tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, stream);
+  auto d_tx_value_counts = groupby_and_count(
+    tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, comm.get_size(), stream);
 
   std::vector<size_t> tx_counts{};
   std::vector<size_t> tx_offsets{};
diff --git a/cpp/src/experimental/coarsen_graph.cu b/cpp/src/experimental/coarsen_graph.cu
index 3a8aa09c28a..f2f01f7e609 100644
--- a/cpp/src/experimental/coarsen_graph.cu
+++ b/cpp/src/experimental/coarsen_graph.cu
@@ -322,7 +322,7 @@ coarsen_graph(
         edge_first + edgelist_major_vertices.size(),
         [key_func =
            detail::compute_gpu_id_from_edge_t<vertex_t>{
-            comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) {
+            comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
           return key_func(thrust::get<0>(val), thrust::get<1>(val));
         },
         handle.get_stream());
@@ -337,7 +337,7 @@ coarsen_graph(
         edge_first + edgelist_major_vertices.size(),
         [key_func =
            detail::compute_gpu_id_from_edge_t<vertex_t>{
-            comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) {
+            comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
           return key_func(thrust::get<0>(val), thrust::get<1>(val));
         },
         handle.get_stream());
@@ -353,66 +353,35 @@ coarsen_graph(
     // FIXME: we can skip this if groupby_gpuid_and_shuffle_values is updated to return sorted edge
     // list based on the final matrix partition (maybe add
     // groupby_adj_matrix_partition_and_shuffle_values).
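    // [Editorial sketch, not part of this patch] groupby_and_count returns one count per
    // local adjacency matrix partition; the exclusive scan applied below turns those
    // counts into displacements, so partition j owns the half-open range
    // [displs[j], displs[j] + counts[j]) of the grouped edge list. A host-side analogue
    // using C++17 std::exclusive_scan:

#include <cstddef>
#include <numeric>
#include <vector>

inline std::vector<std::size_t> counts_to_displacements(std::vector<std::size_t> const& counts)
{
  std::vector<std::size_t> displs(counts.size(), 0);
  std::exclusive_scan(counts.begin(), counts.end(), displs.begin(), std::size_t{0});
  return displs;
}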
-    auto key_func = detail::compute_partition_id_from_edge_t<vertex_t>{
-      comm.get_size(), row_comm.get_size(), col_comm.get_size()};
+
+    auto local_partition_id_op =
+      [comm_size,
+       key_func =
+         detail::compute_partition_id_from_edge_t<vertex_t>{
+           comm_size, row_comm_size, col_comm_size}] __device__(auto pair) {
+        return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) /
+               comm_size;  // global partition id to local partition id
+      };
     auto pair_first = thrust::make_zip_iterator(
       thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin()));
-    if (graph_view.is_weighted()) {
-      thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-                          pair_first,
-                          pair_first + edgelist_major_vertices.size(),
-                          edgelist_weights.begin(),
-                          [key_func] __device__(auto lhs, auto rhs) {
-                            return key_func(thrust::get<0>(lhs), thrust::get<1>(lhs)) <
-                                   key_func(thrust::get<0>(rhs), thrust::get<1>(rhs));
-                          });
-    } else {
-      thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-                   pair_first,
-                   pair_first + edgelist_major_vertices.size(),
-                   [key_func] __device__(auto lhs, auto rhs) {
-                     return key_func(thrust::get<0>(lhs), thrust::get<1>(lhs)) <
-                            key_func(thrust::get<0>(rhs), thrust::get<1>(rhs));
-                   });
-    }
-    auto partition_id_first = thrust::make_transform_iterator(
-      thrust::make_zip_iterator(
-        thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())),
-      [key_func] __device__(auto val) {
-        return key_func(thrust::get<0>(val), thrust::get<1>(val));
-      });
-    rmm::device_uvector<int> partition_ids(graph_view.get_number_of_local_adj_matrix_partitions(),
-                                           handle.get_stream());
-    rmm::device_uvector<edge_t> displacements(partition_ids.size() + 1, handle.get_stream());
-    auto last =
-      thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-                            partition_id_first,
-                            partition_id_first + edgelist_major_vertices.size(),
-                            thrust::make_constant_iterator(edge_t{1}),
-                            partition_ids.begin(),
-                            displacements.begin());
-    if (static_cast<size_t>(thrust::distance(partition_ids.begin(), thrust::get<0>(last))) <
-        partition_ids.size()) {
-      rmm::device_uvector<edge_t> tmps(displacements.size(), handle.get_stream());
-      thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-                   displacements.begin(),
-                   displacements.end(),
-                   edge_t{0});
-      thrust::scatter(displacements.begin(),
-                      thrust::get<1>(last),
-                      thrust::make_transform_iterator(
-                        partition_ids.begin(),
-                        [comm_size] __device__(auto id) {
-                          return id / comm_size;  // global partition id to local partition id
-                        }),
-                      tmps.begin());
-      displacements = std::move(tmps);
-    }
+    auto displacements =
+      graph_view.is_weighted()
+        ? 
groupby_and_count(pair_first, + pair_first + edgelist_major_vertices.size(), + edgelist_weights.begin(), + local_partition_id_op, + graph_view.get_number_of_local_adj_matrix_partitions(), + handle.get_stream()) + : groupby_and_count(pair_first, + pair_first + edgelist_major_vertices.size(), + local_partition_id_op, + graph_view.get_number_of_local_adj_matrix_partitions(), + handle.get_stream()); thrust::exclusive_scan(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), displacements.begin(), displacements.end(), displacements.begin()); - std::vector h_displacements(displacements.size()); + + std::vector h_displacements(displacements.size()); raft::update_host( h_displacements.data(), displacements.data(), displacements.size(), handle.get_stream()); CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); From d5d9a1724f87ae3be087926baec4618bcd07ee16 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 8 Mar 2021 11:30:09 -0500 Subject: [PATCH 10/63] fix compile error due to recent API changes --- .../utilities/generate_graph_from_edgelist.cu | 73 +++++++++++++++---- 1 file changed, 60 insertions(+), 13 deletions(-) diff --git a/cpp/tests/utilities/generate_graph_from_edgelist.cu b/cpp/tests/utilities/generate_graph_from_edgelist.cu index 1b9fe6051f7..f53db51a2c5 100644 --- a/cpp/tests/utilities/generate_graph_from_edgelist.cu +++ b/cpp/tests/utilities/generate_graph_from_edgelist.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -75,7 +76,7 @@ generate_graph_from_edgelist(raft::handle_t const& handle, vertices.shrink_to_fit(handle.get_stream()); auto edge_key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - false, comm_size, row_comm_size, col_comm_size}; + comm_size, row_comm_size, col_comm_size}; size_t number_of_local_edges{}; if (test_weighted) { auto edge_first = thrust::make_zip_iterator( @@ -114,24 +115,70 @@ generate_graph_from_edgelist(raft::handle_t const& handle, edgelist_weights.shrink_to_fit(handle.get_stream()); } + auto local_partition_id_op = + [comm_size, + key_func = cugraph::experimental::detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto pair) { + return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) / + comm_size; // global partition id to local partition id + }; + auto pair_first = + store_transposed + ? thrust::make_zip_iterator(thrust::make_tuple(edgelist_cols.begin(), edgelist_rows.begin())) + : thrust::make_zip_iterator(thrust::make_tuple(edgelist_rows.begin(), edgelist_cols.begin())); + auto displacements = + test_weighted ? cugraph::experimental::groupby_and_count(pair_first, + pair_first + edgelist_rows.size(), + edgelist_weights.begin(), + local_partition_id_op, + col_comm_size, + handle.get_stream()) + : cugraph::experimental::groupby_and_count(pair_first, + pair_first + edgelist_rows.size(), + local_partition_id_op, + col_comm_size, + handle.get_stream()); + thrust::exclusive_scan(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + displacements.begin(), + displacements.end(), + displacements.begin()); + + std::vector h_displacements(displacements.size()); + raft::update_host( + h_displacements.data(), displacements.data(), displacements.size(), handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + // 3. 
renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); cugraph::experimental::partition_t partition{}; vertex_t aggregate_number_of_vertices{}; edge_t number_of_edges{}; - // FIXME: set do_expensive_check to false once validated - std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = - cugraph::experimental::renumber_edgelist( - handle, - vertices.data(), - static_cast(vertices.size()), - store_transposed ? edgelist_cols.data() : edgelist_rows.data(), - store_transposed ? edgelist_rows.data() : edgelist_cols.data(), - edgelist_rows.size(), - false, - true); - assert(aggregate_number_of_vertices == number_of_vertices); + { + std::vector major_ptrs(edgelist_rows.size()); + std::vector minor_ptrs(major_ptrs.size()); + std::vector counts(major_ptrs.size()); + for (size_t i = 0; i < edgelist_rows.size(); ++i) { + major_ptrs[i] = + (store_transposed ? edgelist_cols.begin() : edgelist_rows.begin()) + h_displacements[i]; + minor_ptrs[i] = + (store_transposed ? edgelist_rows.begin() : edgelist_cols.begin()) + h_displacements[i]; + counts[i] = static_cast( + ((i == edgelist_rows.size() - 1) ? edgelist_rows.size() : h_displacements[i + 1]) - + h_displacements[i]); + } + // FIXME: set do_expensive_check to false once validated + std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = + cugraph::experimental::renumber_edgelist( + handle, + vertices.data(), + static_cast(vertices.size()), + major_ptrs, + minor_ptrs, + counts, + true); + assert(aggregate_number_of_vertices == number_of_vertices); + } // 4. create a graph From e0efcd58e577c43872feb32cd3b73400a5e15917 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 10 Mar 2021 22:27:11 -0500 Subject: [PATCH 11/63] fix variable naming inconsistencies --- cpp/include/experimental/graph_functions.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/experimental/graph_functions.hpp b/cpp/include/experimental/graph_functions.hpp index f02bafb3aea..e9740d67666 100644 --- a/cpp/include/experimental/graph_functions.hpp +++ b/cpp/include/experimental/graph_functions.hpp @@ -55,7 +55,7 @@ std::enable_if_t const& edgelist_major_vertices /* [INOUT] */, std::vector const& edgelist_minor_vertices /* [INOUT] */, - std::vector const& num_edgelist_edges, + std::vector const& edgelist_edge_counts, bool do_expensive_check = false); /** @@ -121,7 +121,7 @@ renumber_edgelist(raft::handle_t const& handle, vertex_t num_local_vertices, std::vector const& edgelist_major_vertices /* [INOUT] */, std::vector const& edgelist_minor_vertices /* [INOUT] */, - std::vector const& num_edgelist_edges, + std::vector const& edgelist_edge_counts, bool do_expensive_check = false); /** From 50982242d04750e8baa105d33241ef301d8728b4 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 15 Mar 2021 13:13:03 -0400 Subject: [PATCH 12/63] minor cosmetic updates --- cpp/tests/experimental/bfs_test.cpp | 6 ++++-- cpp/tests/experimental/katz_centrality_test.cpp | 6 ++++-- cpp/tests/experimental/pagerank_test.cpp | 6 ++++-- cpp/tests/experimental/sssp_test.cpp | 6 ++++-- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/cpp/tests/experimental/bfs_test.cpp b/cpp/tests/experimental/bfs_test.cpp index ad9ece99ef9..d1b4def749e 100644 --- a/cpp/tests/experimental/bfs_test.cpp +++ b/cpp/tests/experimental/bfs_test.cpp @@ -107,6 +107,8 @@ class Tests_BFS : public ::testing::TestWithParam { template void run_current_test(BFS_Usecase const& configuration) { + constexpr 
bool renumber = false; + using weight_t = float; raft::handle_t handle{}; @@ -117,7 +119,7 @@ class Tests_BFS : public ::testing::TestWithParam { cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH ? cugraph::test:: read_graph_from_matrix_market_file( - handle, configuration.input_graph_specifier.graph_file_full_path, false, false) + handle, configuration.input_graph_specifier.graph_file_full_path, false, renumber) : cugraph::test::generate_graph_from_rmat_params( handle, configuration.input_graph_specifier.rmat_params.scale, @@ -129,7 +131,7 @@ class Tests_BFS : public ::testing::TestWithParam { configuration.input_graph_specifier.rmat_params.undirected, configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, false, - false); + renumber); auto graph_view = graph.view(); std::vector h_offsets(graph_view.get_number_of_vertices() + 1); diff --git a/cpp/tests/experimental/katz_centrality_test.cpp b/cpp/tests/experimental/katz_centrality_test.cpp index 776bb60716c..ce4a9941c79 100644 --- a/cpp/tests/experimental/katz_centrality_test.cpp +++ b/cpp/tests/experimental/katz_centrality_test.cpp @@ -128,6 +128,8 @@ class Tests_KatzCentrality : public ::testing::TestWithParam void run_current_test(KatzCentrality_Usecase const& configuration) { + constexpr bool renumber = false; + raft::handle_t handle{}; cugraph::experimental::graph_t graph(handle); @@ -139,7 +141,7 @@ class Tests_KatzCentrality : public ::testing::TestWithParam( handle, configuration.input_graph_specifier.rmat_params.scale, @@ -151,7 +153,7 @@ class Tests_KatzCentrality : public ::testing::TestWithParam h_offsets(graph_view.get_number_of_vertices() + 1); diff --git a/cpp/tests/experimental/pagerank_test.cpp b/cpp/tests/experimental/pagerank_test.cpp index ff3b073cbc7..4bd1946e163 100644 --- a/cpp/tests/experimental/pagerank_test.cpp +++ b/cpp/tests/experimental/pagerank_test.cpp @@ -166,6 +166,8 @@ class Tests_PageRank : public ::testing::TestWithParam { template void run_current_test(PageRank_Usecase const& configuration) { + constexpr bool renumber = false; + raft::handle_t handle{}; cugraph::experimental::graph_t graph(handle); @@ -177,7 +179,7 @@ class Tests_PageRank : public ::testing::TestWithParam { handle, configuration.input_graph_specifier.graph_file_full_path, configuration.test_weighted, - false) + renumber) : cugraph::test::generate_graph_from_rmat_params( handle, configuration.input_graph_specifier.rmat_params.scale, @@ -189,7 +191,7 @@ class Tests_PageRank : public ::testing::TestWithParam { configuration.input_graph_specifier.rmat_params.undirected, configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, configuration.test_weighted, - false); + renumber); auto graph_view = graph.view(); std::vector h_offsets(graph_view.get_number_of_vertices() + 1); diff --git a/cpp/tests/experimental/sssp_test.cpp b/cpp/tests/experimental/sssp_test.cpp index 611abcb0d75..b8b6d2b8cde 100644 --- a/cpp/tests/experimental/sssp_test.cpp +++ b/cpp/tests/experimental/sssp_test.cpp @@ -113,6 +113,8 @@ class Tests_SSSP : public ::testing::TestWithParam { template void run_current_test(SSSP_Usecase const& configuration) { + constexpr bool renumber = false; + raft::handle_t handle{}; cugraph::experimental::graph_t graph(handle); @@ -121,7 +123,7 @@ class Tests_SSSP : public ::testing::TestWithParam { cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH ? 
cugraph::test::
 read_graph_from_matrix_market_file(
- handle, configuration.input_graph_specifier.graph_file_full_path, true, false)
+ handle, configuration.input_graph_specifier.graph_file_full_path, true, renumber)
 : cugraph::test::generate_graph_from_rmat_params(
 handle,
 configuration.input_graph_specifier.rmat_params.scale,
 configuration.input_graph_specifier.rmat_params.edge_factor,
 configuration.input_graph_specifier.rmat_params.a,
 configuration.input_graph_specifier.rmat_params.b,
 configuration.input_graph_specifier.rmat_params.c,
 configuration.input_graph_specifier.rmat_params.seed,
 configuration.input_graph_specifier.rmat_params.undirected,
 configuration.input_graph_specifier.rmat_params.scramble_vertex_ids,
 true,
- false);
+ renumber);
 auto graph_view = graph.view();

 std::vector h_offsets(graph_view.get_number_of_vertices() + 1);

From 7b36f8acc74103be9e2e1e7b9261117e26ce2d6b Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Tue, 16 Mar 2021 12:50:53 -0400
Subject: [PATCH 13/63] update python binding (C++ part) to accommodate new
 partitioning scheme

---
 cpp/include/utilities/cython.hpp | 13 +--
 cpp/include/utilities/shuffle_comm.cuh | 2 +
 cpp/src/utilities/cython.cu | 142 ++++++++++++++-----------
 3 files changed, 87 insertions(+), 70 deletions(-)

diff --git a/cpp/include/utilities/cython.hpp b/cpp/include/utilities/cython.hpp
index 98e850abbf0..4f844f3101e 100644
--- a/cpp/include/utilities/cython.hpp
+++ b/cpp/include/utilities/cython.hpp
@@ -102,7 +102,6 @@ struct graph_container_t {
 bool is_multi_gpu;
 bool sorted_by_degree;
 bool do_expensive_check;
- bool hypergraph_partitioned;
 int row_comm_size;
 int col_comm_size;
 int row_comm_rank;
@@ -146,7 +145,7 @@ struct cy_multi_edgelists_t {
 // replacement for std::tuple<,,>, since std::tuple is not
 // supported in cython
 //
-template 
+template 
 struct major_minor_weights_t {
 explicit major_minor_weights_t(raft::handle_t const& handle)
 : shuffled_major_vertices_(0, handle.get_stream()),
 shuffled_minor_vertices_(0, handle.get_stream()),
 shuffled_weights_(0, handle.get_stream())
 {
 }
+
 rmm::device_uvector& get_major(void) { return shuffled_major_vertices_; }
 rmm::device_uvector& get_minor(void) { return shuffled_minor_vertices_; }
 rmm::device_uvector& get_weights(void) { return shuffled_weights_; }
+ std::vector& get_edge_counts(void) { return edge_counts_; }
+
 std::pair, size_t> get_major_wrap(
 void) // const: triggers errors in Cython autogen-ed C++
 {
@@ -183,6 +185,7 @@ struct major_minor_weights_t {
 rmm::device_uvector shuffled_major_vertices_;
 rmm::device_uvector shuffled_minor_vertices_;
 rmm::device_uvector shuffled_weights_;
+ std::vector edge_counts_{};
 };

 // wrapper for renumber_edgelist() return
@@ -446,14 +449,13 @@ std::unique_ptr call_egonet(raft::handle_t const& handle,
 // wrapper for shuffling:
 //
 template 
-std::unique_ptr> call_shuffle(
+std::unique_ptr> call_shuffle(
 raft::handle_t const& handle,
 vertex_t*
 edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place
 vertex_t* edgelist_minor_vertices, // [IN / OUT]
 weight_t* edgelist_weights, // [IN / OUT]
- edge_t num_edgelist_edges,
- bool is_hypergraph_partitioned); // = false
+ edge_t num_edgelist_edges);

 // Wrapper for calling renumber_edeglist() inplace:
 //
@@ -463,7 +465,6 @@ std::unique_ptr> call_renumber(
 vertex_t* shuffled_edgelist_major_vertices /* [INOUT] */,
 vertex_t* shuffled_edgelist_minor_vertices /* [INOUT] */,
 edge_t num_edgelist_edges,
- bool is_hypergraph_partitioned,
 bool do_expensive_check,
 bool multi_gpu);
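For reference: with is_hypergraph_partitioned gone, call_shuffle() now reports how many shuffled edges fall into each of the col_comm_size local matrix partitions via get_edge_counts(), and callers turn those counts into per-partition offsets with an exclusive prefix sum. The snippet below is illustrative only and is not part of the patch; the counts are hypothetical, but the partial_sum idiom is the same one call_renumber() uses later in this series.

    // sketch.cpp -- illustrative only, not part of the patch
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main()
    {
      std::vector<long long> counts{5, 3, 7};  // hypothetical per-partition edge counts
      std::vector<long long> displs(counts.size(), 0);
      // exclusive prefix sum: displs[i] is where partition i's edges start
      std::partial_sum(counts.begin(), counts.end() - 1, displs.begin() + 1);
      for (std::size_t i = 0; i < counts.size(); ++i) {
        std::printf("partition %zu: offset %lld, count %lld\n", i, displs[i], counts[i]);
      }
      return 0;
    }

With these counts, local partition i owns the edge range [displs[i], displs[i] + counts[i]) of the shuffled arrays.

diff --git a/cpp/include/utilities/shuffle_comm.cuh b/cpp/include/utilities/shuffle_comm.cuh
index fe394d503d8..867b554ab39 100644
--- a/cpp/include/utilities/shuffle_comm.cuh
+++ 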
b/cpp/include/utilities/shuffle_comm.cuh @@ -302,6 +302,8 @@ auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const &comm, KeyToGPUIdOp key_to_gpu_id_op, cudaStream_t stream) { + auto const comm_size = comm.get_size(); + auto d_tx_value_counts = groupby_and_count( tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, comm.get_size(), stream); diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 5382b4856f3..3240e299ac4 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -57,7 +57,6 @@ create_graph(raft::handle_t const& handle, graph_container_t const& graph_contai (graph_container.row_comm_size * graph_container.col_comm_size) + 1); experimental::partition_t partition(partition_offsets_vector, - graph_container.hypergraph_partitioned, graph_container.row_comm_size, graph_container.col_comm_size, graph_container.row_comm_rank, @@ -124,7 +123,6 @@ void populate_graph_container(graph_container_t& graph_container, "populate_graph_container() can only be called on an empty container."); bool do_expensive_check{true}; - bool hypergraph_partitioned{false}; if (multi_gpu) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -151,7 +149,6 @@ void populate_graph_container(graph_container_t& graph_container, graph_container.weightType = weightType; graph_container.transposed = transposed; graph_container.is_multi_gpu = multi_gpu; - graph_container.hypergraph_partitioned = hypergraph_partitioned; graph_container.sorted_by_degree = sorted_by_degree; graph_container.do_expensive_check = do_expensive_check; @@ -747,23 +744,23 @@ void call_sssp(raft::handle_t const& handle, // wrapper for shuffling: // template -std::unique_ptr> call_shuffle( +std::unique_ptr> call_shuffle( raft::handle_t const& handle, vertex_t* edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place vertex_t* edgelist_minor_vertices, // [IN / OUT] weight_t* edgelist_weights, // [IN / OUT] - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned) // = false + edge_t num_edgelist_edges) { - auto& comm = handle.get_comms(); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - - std::unique_ptr> ptr_ret = - std::make_unique>(handle); + std::unique_ptr> ptr_ret = + std::make_unique>(handle); if (edgelist_weights != nullptr) { auto zip_edge = thrust::make_zip_iterator( @@ -778,10 +775,7 @@ std::unique_ptr> call_shuffle( zip_edge + num_edgelist_edges, [key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - is_hypergraph_partitioned, - comm.get_size(), - row_comm.get_size(), - col_comm.get_size()}] __device__(auto val) { + comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) { return key_func(thrust::get<0>(val), thrust::get<1>(val)); }, handle.get_stream()); @@ -797,15 +791,46 @@ std::unique_ptr> call_shuffle( zip_edge + num_edgelist_edges, [key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - is_hypergraph_partitioned, - comm.get_size(), - row_comm.get_size(), 
- col_comm.get_size()}] __device__(auto val) { + comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) { return key_func(thrust::get<0>(val), thrust::get<1>(val)); }, handle.get_stream()); } + auto local_partition_id_op = + [comm_size, + key_func = cugraph::experimental::detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto pair) { + return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) / + comm_size; // global partition id to local partition id + }; + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(ptr_ret->get_major().data(), ptr_ret->get_minor().data())); + + auto edge_counts = + (edgelist_weights != nullptr) + ? cugraph::experimental::groupby_and_count(pair_first, + pair_first + ptr_ret->get_major().size(), + ptr_ret->get_weights().data(), + local_partition_id_op, + col_comm_size, + handle.get_stream()) + : cugraph::experimental::groupby_and_count(pair_first, + pair_first + ptr_ret->get_major().size(), + local_partition_id_op, + col_comm_size, + handle.get_stream()); + + std::vector h_edge_counts(edge_counts.size()); + raft::update_host( + h_edge_counts.data(), edge_counts.data(), edge_counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + ptr_ret->get_edge_counts().resize(h_edge_counts.size()); + for (size_t i = 0; i < h_edge_counts.size(); ++i) { + ptr_ret->get_edge_counts()[i] = static_cast(h_edge_counts[i]); + } + return ptr_ret; // RVO-ed } @@ -817,8 +842,7 @@ std::unique_ptr> call_renumber( raft::handle_t const& handle, vertex_t* shuffled_edgelist_major_vertices /* [INOUT] */, vertex_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu) // bc. 
cython cannot take non-type template params { @@ -828,33 +852,32 @@ std::unique_ptr> call_renumber( std::make_unique>(handle); if (multi_gpu) { + std::vector displacements(edge_counts.size(), edge_t{0}); + std::partial_sum(edge_counts.begin(), edge_counts.end() - 1, displacements.begin() + 1); + std::vector major_ptrs(edge_counts.size()); + std::vector minor_ptrs(major_ptrs.size()); + for (size_t i = 0; i < edge_counts.size(); ++i) { + major_ptrs[i] = shuffled_edgelist_major_vertices + displacements[i]; + minor_ptrs[i] = shuffled_edgelist_minor_vertices + displacements[i]; + } + assert(aggregate_number_of_vertices == number_of_vertices); + std::tie( p_ret->get_dv(), p_ret->get_partition(), p_ret->get_num_vertices(), p_ret->get_num_edges()) = cugraph::experimental::renumber_edgelist( - handle, - shuffled_edgelist_major_vertices, - shuffled_edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned, - do_expensive_check); + handle, major_ptrs, minor_ptrs, edge_counts, do_expensive_check); } else { - auto ret_f = cugraph::experimental::renumber_edgelist( + p_ret->get_dv() = cugraph::experimental::renumber_edgelist( handle, shuffled_edgelist_major_vertices, shuffled_edgelist_minor_vertices, - num_edgelist_edges, + edge_counts[0], do_expensive_check); - auto tot_vertices = static_cast(ret_f.size()); - - p_ret->get_dv() = std::move(ret_f); - cugraph::experimental::partition_t part_sg( - std::vector{0, tot_vertices}, false, 1, 1, 0, 0); - - p_ret->get_partition() = std::move(part_sg); + p_ret->get_partition() = cugraph::experimental::partition_t{}; // dummy - p_ret->get_num_vertices() = tot_vertices; - p_ret->get_num_edges() = num_edgelist_edges; + p_ret->get_num_vertices() = static_cast(p_ret->get_dv().size()); + p_ret->get_num_edges() = edge_counts[0]; } return p_ret; // RVO-ed (copy ellision) @@ -1066,53 +1089,47 @@ template void call_sssp(raft::handle_t const& handle, int64_t* predecessors, const int64_t source_vertex); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, float* edgelist_weights, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int32_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, float* edgelist_weights, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, double* edgelist_weights, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int32_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, double* edgelist_weights, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int64_t* edgelist_major_vertices, int64_t* edgelist_minor_vertices, float* edgelist_weights, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> 
call_shuffle(
 raft::handle_t const& handle,
 int64_t* edgelist_major_vertices,
 int64_t* edgelist_minor_vertices,
 double* edgelist_weights,
- int64_t num_edgelist_edges,
- bool is_hypergraph_partitioned);
+ int64_t num_edgelist_edges);

 // TODO: add the remaining relevant EIDIr's:
 //
@@ -1120,8 +1137,7 @@ template std::unique_ptr> call_renumber(
 raft::handle_t const& handle,
 int32_t* shuffled_edgelist_major_vertices /* [INOUT] */,
 int32_t* shuffled_edgelist_minor_vertices /* [INOUT] */,
- int32_t num_edgelist_edges,
- bool is_hypergraph_partitioned,
+ std::vector const& edge_counts,
 bool do_expensive_check,
 bool multi_gpu);

@@ -1129,8 +1145,7 @@ template std::unique_ptr> call_renumber(
 raft::handle_t const& handle,
 int32_t* shuffled_edgelist_major_vertices /* [INOUT] */,
 int32_t* shuffled_edgelist_minor_vertices /* [INOUT] */,
- int64_t num_edgelist_edges,
- bool is_hypergraph_partitioned,
+ std::vector const& edge_counts,
 bool do_expensive_check,
 bool multi_gpu);

@@ -1138,8 +1153,7 @@ template std::unique_ptr> call_renumber(
 raft::handle_t const& handle,
 int64_t* shuffled_edgelist_major_vertices /* [INOUT] */,
 int64_t* shuffled_edgelist_minor_vertices /* [INOUT] */,
- int64_t num_edgelist_edges,
- bool is_hypergraph_partitioned,
+ std::vector const& edge_counts,
 bool do_expensive_check,
 bool multi_gpu);

From e5c17f37b048d054772f378f1e766aa44114dac3 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 17 Mar 2021 00:59:54 -0400
Subject: [PATCH 14/63] bug fixes

---
 cpp/src/experimental/coarsen_graph.cu | 2 +-
 cpp/src/experimental/renumber_edgelist.cu | 14 +++---
 cpp/src/utilities/cython.cu | 1 -
 .../utilities/generate_graph_from_edgelist.cu | 47 +++++++++----------
 4 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/cpp/src/experimental/coarsen_graph.cu b/cpp/src/experimental/coarsen_graph.cu
index f2f01f7e609..9fc64cf7c8b 100644
--- a/cpp/src/experimental/coarsen_graph.cu
+++ b/cpp/src/experimental/coarsen_graph.cu
@@ -269,7 +269,7 @@ coarsen_graph(
 store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols(i)
 : graph_view.get_number_of_local_adj_matrix_partition_rows(i),
 handle.get_stream());
- // FIXME: this is copy is unnecessary, beter fix RAFT comm's bcast to take const iterators for
+ // FIXME: this copy is unnecessary, better fix RAFT comm's bcast to take const iterators for
 // input
 thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
 labels,
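The renumber_edgelist.cu hunks below fix two bugs: the receive-count buffer was sized from major_labels instead of rx_major_labels, and the per-partition broadcast allocated a full copy of the renumber map even on the root rank, which already owns the data. After the fix, the root passes its own renumber_map_labels to the broadcast and allocates a zero-length buffer. A minimal sketch of that root-reuse pattern, using plain MPI as a stand-in for the RAFT comms actually used here (illustrative only, not part of the patch):

    // bcast_reuse.cpp -- illustrative only, not part of the patch
    #include <mpi.h>
    #include <vector>

    // local_labels holds label_count entries on the root; all ranks know label_count
    void bcast_labels(std::vector<int>& local_labels, int label_count, int root, MPI_Comm comm)
    {
      int rank{};
      MPI_Comm_rank(comm, &rank);
      // peers allocate a receive buffer; the root reuses its own vector and allocates nothing
      std::vector<int> rx(rank == root ? 0 : label_count);
      int* ptr = (rank == root) ? local_labels.data() : rx.data();
      MPI_Bcast(ptr, label_count, MPI_INT, root, comm);
      // ... every rank consumes ptr ...
    }

diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu
index c619084f54f..a7931d9710d 100644
--- a/cpp/src/experimental/renumber_edgelist.cu
+++ b/cpp/src/experimental/renumber_edgelist.cu
@@ -98,7 +98,7 @@ rmm::device_uvector compute_renumber_map(
 rx_displs.assign(col_comm_size, size_t{0});
 std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1);
 rx_major_labels.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream());
- rx_major_counts.resize(major_labels.size(), handle.get_stream());
+ rx_major_counts.resize(rx_major_labels.size(), handle.get_stream());
 }
 device_gatherv(col_comm,
 thrust::make_zip_iterator(
@@ -537,11 +537,12 @@ renumber_edgelist(raft::handle_t const& handle,

 for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) {
 rmm::device_uvector renumber_map_major_labels(
- partition.get_matrix_partition_major_size(i), handle.get_stream());
+ col_comm_rank == i ? 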
vertex_t{0} : partition.get_matrix_partition_major_size(i), + handle.get_stream()); device_bcast(col_comm, renumber_map_labels.data(), renumber_map_major_labels.data(), - renumber_map_major_labels.size(), + partition.get_matrix_partition_major_size(i), i, handle.get_stream()); @@ -549,17 +550,18 @@ renumber_edgelist(raft::handle_t const& handle, handle.get_stream())); // cuco::static_map currently does not take stream cuco::static_map renumber_map{ - static_cast(static_cast(renumber_map_major_labels.size()) / load_factor), + static_cast(static_cast(partition.get_matrix_partition_major_size(i)) / + load_factor), invalid_vertex_id::value, invalid_vertex_id::value}; auto pair_first = thrust::make_transform_iterator( thrust::make_zip_iterator(thrust::make_tuple( - renumber_map_major_labels.begin(), + col_comm_rank == i ? renumber_map_labels.begin() : renumber_map_major_labels.begin(), thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i)))), [] __device__(auto val) { return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); - renumber_map.insert(pair_first, pair_first + renumber_map_major_labels.size()); + renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i)); renumber_map.find(edgelist_major_vertices[i], edgelist_major_vertices[i] + edgelist_edge_counts[i], edgelist_major_vertices[i]); diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 3240e299ac4..3a68cc8ff4f 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -860,7 +860,6 @@ std::unique_ptr> call_renumber( major_ptrs[i] = shuffled_edgelist_major_vertices + displacements[i]; minor_ptrs[i] = shuffled_edgelist_minor_vertices + displacements[i]; } - assert(aggregate_number_of_vertices == number_of_vertices); std::tie( p_ret->get_dv(), p_ret->get_partition(), p_ret->get_num_vertices(), p_ret->get_num_edges()) = diff --git a/cpp/tests/utilities/generate_graph_from_edgelist.cu b/cpp/tests/utilities/generate_graph_from_edgelist.cu index f53db51a2c5..7e9214a9137 100644 --- a/cpp/tests/utilities/generate_graph_from_edgelist.cu +++ b/cpp/tests/utilities/generate_graph_from_edgelist.cu @@ -126,27 +126,23 @@ generate_graph_from_edgelist(raft::handle_t const& handle, store_transposed ? thrust::make_zip_iterator(thrust::make_tuple(edgelist_cols.begin(), edgelist_rows.begin())) : thrust::make_zip_iterator(thrust::make_tuple(edgelist_rows.begin(), edgelist_cols.begin())); - auto displacements = - test_weighted ? cugraph::experimental::groupby_and_count(pair_first, - pair_first + edgelist_rows.size(), - edgelist_weights.begin(), - local_partition_id_op, - col_comm_size, - handle.get_stream()) - : cugraph::experimental::groupby_and_count(pair_first, - pair_first + edgelist_rows.size(), - local_partition_id_op, - col_comm_size, - handle.get_stream()); - thrust::exclusive_scan(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - displacements.begin(), - displacements.end(), - displacements.begin()); - - std::vector h_displacements(displacements.size()); + auto edge_counts = test_weighted + ? 
cugraph::experimental::groupby_and_count(pair_first, + pair_first + edgelist_rows.size(), + edgelist_weights.begin(), + local_partition_id_op, + col_comm_size, + handle.get_stream()) + : cugraph::experimental::groupby_and_count(pair_first, + pair_first + edgelist_rows.size(), + local_partition_id_op, + col_comm_size, + handle.get_stream()); + + std::vector h_edge_counts(edge_counts.size()); raft::update_host( - h_displacements.data(), displacements.data(), displacements.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + h_edge_counts.data(), edge_counts.data(), edge_counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); // 3. renumber @@ -155,17 +151,18 @@ generate_graph_from_edgelist(raft::handle_t const& handle, vertex_t aggregate_number_of_vertices{}; edge_t number_of_edges{}; { - std::vector major_ptrs(edgelist_rows.size()); + std::vector h_displacements(h_edge_counts.size(), size_t{0}); + std::partial_sum(h_edge_counts.begin(), h_edge_counts.end() - 1, h_displacements.begin() + 1); + + std::vector major_ptrs(h_edge_counts.size()); std::vector minor_ptrs(major_ptrs.size()); std::vector counts(major_ptrs.size()); - for (size_t i = 0; i < edgelist_rows.size(); ++i) { + for (size_t i = 0; i < h_edge_counts.size(); ++i) { major_ptrs[i] = (store_transposed ? edgelist_cols.begin() : edgelist_rows.begin()) + h_displacements[i]; minor_ptrs[i] = (store_transposed ? edgelist_rows.begin() : edgelist_cols.begin()) + h_displacements[i]; - counts[i] = static_cast( - ((i == edgelist_rows.size() - 1) ? edgelist_rows.size() : h_displacements[i + 1]) - - h_displacements[i]); + counts[i] = static_cast(h_edge_counts[i]); } // FIXME: set do_expensive_check to false once validated std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = From 8d4f96767ddcc148eb038835f6ba3d1750361fc3 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 17 Mar 2021 11:39:20 -0400 Subject: [PATCH 15/63] bug fixes --- .../experimental/detail/graph_utils.cuh | 2 +- .../copy_v_transform_reduce_in_out_nbr.cuh | 2 +- cpp/src/experimental/graph_view.cu | 2 +- .../utilities/generate_graph_from_edgelist.cu | 24 ++++++++++++------- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/cpp/include/experimental/detail/graph_utils.cuh b/cpp/include/experimental/detail/graph_utils.cuh index f7e97508b4e..490ba5cd4b1 100644 --- a/cpp/include/experimental/detail/graph_utils.cuh +++ b/cpp/include/experimental/detail/graph_utils.cuh @@ -152,7 +152,7 @@ struct compute_gpu_id_from_edge_t { cuco::detail::MurmurHash3_32 hash_func{}; auto major_comm_rank = static_cast(hash_func(major) % comm_size); auto minor_comm_rank = static_cast(hash_func(minor) % comm_size); - return (minor_comm_rank / col_comm_size) * row_comm_size + (major_comm_rank % row_comm_size); + return (minor_comm_rank / row_comm_size) * row_comm_size + (major_comm_rank % row_comm_size); } }; diff --git a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh index 69e6fc178a0..32ca956b535 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh @@ -481,7 +481,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, col_comm, major_buffer_first, vertex_value_output_first, - static_cast(graph_view.get_vertex_partition_size(i * row_comm_size + i)), + matrix_partition.get_major_size(), raft::comms::op_t::SUM, i, 
handle.get_stream()); diff --git a/cpp/src/experimental/graph_view.cu b/cpp/src/experimental/graph_view.cu index ad07ff733bc..2fa2fe9560b 100644 --- a/cpp/src/experimental/graph_view.cu +++ b/cpp/src/experimental/graph_view.cu @@ -195,7 +195,7 @@ graph_view_t(row_comm_size), + CUGRAPH_EXPECTS(adj_matrix_partition_offsets.size() == static_cast(col_comm_size), "Internal Error: erroneous adj_matrix_partition_offsets.size()."); CUGRAPH_EXPECTS((sorted_by_global_degree_within_vertex_partition && diff --git a/cpp/tests/utilities/generate_graph_from_edgelist.cu b/cpp/tests/utilities/generate_graph_from_edgelist.cu index 7e9214a9137..0774f6a1120 100644 --- a/cpp/tests/utilities/generate_graph_from_edgelist.cu +++ b/cpp/tests/utilities/generate_graph_from_edgelist.cu @@ -144,6 +144,9 @@ generate_graph_from_edgelist(raft::handle_t const& handle, h_edge_counts.data(), edge_counts.data(), edge_counts.size(), handle.get_stream()); handle.get_stream_view().synchronize(); + std::vector h_displacements(h_edge_counts.size(), size_t{0}); + std::partial_sum(h_edge_counts.begin(), h_edge_counts.end() - 1, h_displacements.begin() + 1); + // 3. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); @@ -151,9 +154,6 @@ generate_graph_from_edgelist(raft::handle_t const& handle, vertex_t aggregate_number_of_vertices{}; edge_t number_of_edges{}; { - std::vector h_displacements(h_edge_counts.size(), size_t{0}); - std::partial_sum(h_edge_counts.begin(), h_edge_counts.end() - 1, h_displacements.begin() + 1); - std::vector major_ptrs(h_edge_counts.size()); std::vector minor_ptrs(major_ptrs.size()); std::vector counts(major_ptrs.size()); @@ -179,15 +179,21 @@ generate_graph_from_edgelist(raft::handle_t const& handle, // 4. create a graph + std::vector> edgelists( + h_edge_counts.size()); + for (size_t i = 0; i < h_edge_counts.size(); ++i) { + edgelists[i] = cugraph::experimental::edgelist_t{ + edgelist_rows.data() + h_displacements[i], + edgelist_cols.data() + h_displacements[i], + test_weighted ? edgelist_weights.data() + h_displacements[i] + : static_cast(nullptr), + static_cast(h_edge_counts[i])}; + } + return std::make_tuple( cugraph::experimental::graph_t( handle, - std::vector>{ - cugraph::experimental::edgelist_t{ - edgelist_rows.data(), - edgelist_cols.data(), - test_weighted ? 
edgelist_weights.data() : nullptr,
- static_cast(edgelist_rows.size())}},
+ edgelists,
 partition,
 number_of_vertices,
 number_of_edges,

From d23c8774e1f866d6027011fb429f069719543017 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 17 Mar 2021 15:49:01 -0400
Subject: [PATCH 16/63] update BFS, SSSP, Katz, PageRank tests to promote
 performance testing

---
 cpp/tests/experimental/bfs_test.cpp | 180 ++++++++------
 .../experimental/katz_centrality_test.cpp | 220 ++++++++++--------
 cpp/tests/experimental/pagerank_test.cpp | 203 +++++++++------
 cpp/tests/experimental/sssp_test.cpp | 204 +++++++++-------
 4 files changed, 469 insertions(+), 338 deletions(-)

diff --git a/cpp/tests/experimental/bfs_test.cpp b/cpp/tests/experimental/bfs_test.cpp
index d1b4def749e..8bcb5de7043 100644
--- a/cpp/tests/experimental/bfs_test.cpp
+++ b/cpp/tests/experimental/bfs_test.cpp
@@ -32,6 +32,11 @@
 #include 
 #include 

+// do the perf measurements
+// enabled by command line parameter '--perf'
+//
+static int PERF = 0;
+
 template 
 void bfs_reference(edge_t const* offsets,
 vertex_t const* indices,
@@ -74,9 +79,12 @@ void bfs_reference(edge_t const* offsets,

 typedef struct BFS_Usecase_t {
 cugraph::test::input_graph_specifier_t input_graph_specifier{};
+
 size_t source{false};
+ bool check_correctness{false};

- BFS_Usecase_t(std::string const& graph_file_path, size_t source) : source(source)
+ BFS_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true)
+ : source(source), check_correctness(check_correctness)
 {
 std::string graph_file_full_path{};
 if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
@@ -88,13 +96,42 @@ typedef struct BFS_Usecase_t {
 input_graph_specifier.graph_file_full_path = graph_file_full_path;
 };

- BFS_Usecase_t(cugraph::test::rmat_params_t rmat_params, size_t source) : source(source)
+ BFS_Usecase_t(cugraph::test::rmat_params_t rmat_params,
+ size_t source,
+ bool check_correctness = true)
+ : source(source), check_correctness(check_correctness)
 {
 input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS;
 input_graph_specifier.rmat_params = rmat_params;
 }
 } BFS_Usecase;
+
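Each test now gains a read_graph() helper (added just below) so the same input can be loaded twice: once with renumbering for the timed run, and once without for the reference check. When renumbering is enabled, cuGraph's output is indexed by renumbered vertex IDs; a host-side sketch of mapping such results back to the original IDs is shown here, illustrative only and not part of the patch, assuming renumber_map_labels[i] holds the original ID of renumbered vertex i (the convention renumber_edgelist() uses):

    // unrenumber.cpp -- illustrative only, not part of the patch
    #include <cstddef>
    #include <vector>

    // scatter per-vertex results from the renumbered space back to original vertex IDs
    std::vector<float> unrenumber(std::vector<float> const& renumbered_results,
                                  std::vector<int> const& renumber_map_labels)
    {
      std::vector<float> original_results(renumbered_results.size());
      for (std::size_t i = 0; i < renumbered_results.size(); ++i) {
        original_results[renumber_map_labels[i]] = renumbered_results[i];
      }
      return original_results;
    }

+template 
+cugraph::experimental::graph_t read_graph(
+ raft::handle_t const& handle, BFS_Usecase const& configuration, bool renumber)
+{
+ cugraph::experimental::graph_t graph(handle);
+ std::tie(graph, std::ignore) =
+ configuration.input_graph_specifier.tag ==
+ cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH
+ ? 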
cugraph::test::read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, false, renumber) + : cugraph::test::generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + false, + renumber); + + return graph; +} + class Tests_BFS : public ::testing::TestWithParam { public: Tests_BFS() {} @@ -113,55 +150,14 @@ class Tests_BFS : public ::testing::TestWithParam { raft::handle_t handle{}; - cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? cugraph::test:: - read_graph_from_matrix_market_file( - handle, configuration.input_graph_specifier.graph_file_full_path, false, renumber) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - false, - renumber); + auto graph = read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - ASSERT_TRUE(configuration.source >= 0 && configuration.source <= graph_view.get_number_of_vertices()) << "Starting sources should be >= 0 and" << " less than the number of vertices in the graph."; - std::vector h_reference_distances(graph_view.get_number_of_vertices()); - std::vector h_reference_predecessors(graph_view.get_number_of_vertices()); - - bfs_reference(h_offsets.data(), - h_indices.data(), - h_reference_distances.data(), - h_reference_predecessors.data(), - graph_view.get_number_of_vertices(), - static_cast(configuration.source), - std::numeric_limits::max()); - rmm::device_uvector d_distances(graph_view.get_number_of_vertices(), handle.get_stream()); rmm::device_uvector d_predecessors(graph_view.get_number_of_vertices(), @@ -180,37 +176,70 @@ class Tests_BFS : public ::testing::TestWithParam { CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); - std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); - raft::update_host(h_cugraph_predecessors.data(), - d_predecessors.data(), - d_predecessors.size(), 
- handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - ASSERT_TRUE(std::equal( - h_reference_distances.begin(), h_reference_distances.end(), h_cugraph_distances.begin())) - << "distances do not match with the reference values."; - - for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { - auto i = std::distance(h_cugraph_predecessors.begin(), it); - if (*it == cugraph::invalid_vertex_id::value) { - ASSERT_TRUE(h_reference_predecessors[i] == *it) - << "vertex reachability do not match with the reference."; - } else { - ASSERT_TRUE(h_reference_distances[*it] + 1 == h_reference_distances[i]) - << "distance to this vertex != distance to the predecessor vertex + 1."; - bool found{false}; - for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { - if (h_indices[j] == i) { - found = true; - break; + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + unrenumbered_graph = read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + handle.get_stream_view().synchronize(); + + std::vector h_reference_distances(unrenumbered_graph_view.get_number_of_vertices()); + std::vector h_reference_predecessors( + unrenumbered_graph_view.get_number_of_vertices()); + + bfs_reference(h_offsets.data(), + h_indices.data(), + h_reference_distances.data(), + h_reference_predecessors.data(), + unrenumbered_graph_view.get_number_of_vertices(), + static_cast(configuration.source), + std::numeric_limits::max()); + + std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); + std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); + + raft::update_host( + h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_predecessors.data(), + d_predecessors.size(), + handle.get_stream()); + handle.get_stream_view().synchronize(); + + ASSERT_TRUE(std::equal( + h_reference_distances.begin(), h_reference_distances.end(), h_cugraph_distances.begin())) + << "distances do not match with the reference values."; + + for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { + auto i = std::distance(h_cugraph_predecessors.begin(), it); + if (*it == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_reference_predecessors[i] == *it) + << "vertex reachability do not match with the reference."; + } else { + ASSERT_TRUE(h_reference_distances[*it] + 1 == h_reference_distances[i]) + << "distance to this vertex != distance to the predecessor vertex + 1."; + bool found{false}; + for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { + if (h_indices[j] == i) { + found = true; + break; + } } + ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; } - ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; } } } @@ -223,12 +252,15 @@ INSTANTIATE_TEST_CASE_P( simple_test, Tests_BFS, 
::testing::Values(
+ // enable correctness checks
 BFS_Usecase("test/datasets/karate.mtx", 0),
 BFS_Usecase("test/datasets/polbooks.mtx", 0),
 BFS_Usecase("test/datasets/netscience.mtx", 0),
 BFS_Usecase("test/datasets/netscience.mtx", 100),
 BFS_Usecase("test/datasets/wiki2003.mtx", 1000),
 BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000),
- BFS_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0)));
+ BFS_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0),
+ // disable correctness checks for large graphs
+ BFS_Usecase(cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0, false)));

 CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/katz_centrality_test.cpp b/cpp/tests/experimental/katz_centrality_test.cpp
index ce4a9941c79..27b377a99d3 100644
--- a/cpp/tests/experimental/katz_centrality_test.cpp
+++ b/cpp/tests/experimental/katz_centrality_test.cpp
@@ -34,6 +34,11 @@
 #include 
 #include 

+// do the perf measurements
+// enabled by command line parameter '--perf'
+//
+static int PERF = 0;
+
 template 
 void katz_centrality_reference(edge_t const* offsets,
 vertex_t const* indices,
@@ -92,9 +97,12 @@ typedef struct KatzCentrality_Usecase_t {
 cugraph::test::input_graph_specifier_t input_graph_specifier{};

 bool test_weighted{false};
+ bool check_correctness{false};

- KatzCentrality_Usecase_t(std::string const& graph_file_path, bool test_weighted)
- : test_weighted(test_weighted)
+ KatzCentrality_Usecase_t(std::string const& graph_file_path,
+ bool test_weighted,
+ bool check_correctness = true)
+ : test_weighted(test_weighted), check_correctness(check_correctness)
 {
 std::string graph_file_full_path{};
 if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
@@ -107,15 +115,44 @@ typedef struct KatzCentrality_Usecase_t {
 };

 KatzCentrality_Usecase_t(cugraph::test::rmat_params_t rmat_params,
- double personalization_ratio,
- bool test_weighted)
- : test_weighted(test_weighted)
+ bool test_weighted,
+ bool check_correctness = true)
+ : test_weighted(test_weighted), check_correctness(check_correctness)
 {
 input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS;
 input_graph_specifier.rmat_params = rmat_params;
 }
 } KatzCentrality_Usecase;
+
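Further down this hunk, the rewritten Katz test stops deriving alpha from the host CSR offsets and instead uses compute_in_degrees(). Setting alpha to 1/(max in-degree + 1) is a standard sufficient condition for the Katz series to converge, since the adjacency matrix's spectral radius is bounded above by the maximum degree. A sketch of that choice follows; it is illustrative only, and pick_alpha is a hypothetical helper, not a cuGraph API:

    // pick_alpha.cpp -- illustrative only, not part of the patch
    #include <algorithm>
    #include <vector>

    // sum_k alpha^k A^k converges when alpha < 1 / lambda_max(A), and lambda_max
    // is bounded above by the maximum in-degree, so this choice is always safe.
    // Assumes a non-empty graph (in_degrees has at least one entry).
    double pick_alpha(std::vector<long> const& in_degrees)
    {
      long max_degree = *std::max_element(in_degrees.begin(), in_degrees.end());
      return 1.0 / static_cast<double>(max_degree + 1);
    }

+template 
+cugraph::experimental::graph_t read_graph(
+ raft::handle_t const& handle, KatzCentrality_Usecase const& configuration, bool renumber)
+{
+ cugraph::experimental::graph_t graph(handle);
+ std::tie(graph, std::ignore) =
+ configuration.input_graph_specifier.tag ==
+ cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH
+ ? 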
cugraph::test::read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test::generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber); + + return graph; +} + class Tests_KatzCentrality : public ::testing::TestWithParam { public: Tests_KatzCentrality() {} @@ -132,74 +169,19 @@ class Tests_KatzCentrality : public ::testing::TestWithParam graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? cugraph::test:: - read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - renumber) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - renumber); + auto graph = read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - std::vector h_weights{}; - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - if (graph_view.is_weighted()) { - h_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0}); - raft::update_host(h_weights.data(), - graph_view.weights(), - graph_view.get_number_of_edges(), - handle.get_stream()); - } - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - std::vector h_reference_katz_centralities(graph_view.get_number_of_vertices()); - - std::vector tmps(h_offsets.size()); - std::adjacent_difference(h_offsets.begin(), h_offsets.end(), tmps.begin()); - auto max_it = std::max_element(tmps.begin(), tmps.end()); + auto degrees = graph_view.compute_in_degrees(handle); + std::vector h_degrees(degrees.size()); + raft::update_host(h_degrees.data(), degrees.data(), degrees.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + auto max_it = std::max_element(h_degrees.begin(), h_degrees.end()); result_t const alpha = result_t{1.0} / static_cast(*max_it + 1); result_t constexpr beta{1.0}; result_t constexpr epsilon{1e-6}; - katz_centrality_reference( - h_offsets.data(), - h_indices.data(), - h_weights.size() > 0 ? 
h_weights.data() : static_cast(nullptr), - static_cast(nullptr), - h_reference_katz_centralities.data(), - graph_view.get_number_of_vertices(), - alpha, - beta, - epsilon, - std::numeric_limits::max(), - false, - true); - rmm::device_uvector d_katz_centralities(graph_view.get_number_of_vertices(), handle.get_stream()); @@ -219,28 +201,74 @@ class Tests_KatzCentrality : public ::testing::TestWithParam h_cugraph_katz_centralities(graph_view.get_number_of_vertices()); - - raft::update_host(h_cugraph_katz_centralities.data(), - d_katz_centralities.data(), - d_katz_centralities.size(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - auto threshold_ratio = 1e-3; - auto threshold_magnitude = - (1.0 / static_cast(graph_view.get_number_of_vertices())) * - threshold_ratio; // skip comparison for low Katz Centrality verties (lowly ranked vertices) - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - return std::abs(lhs - rhs) < - std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); - }; - - ASSERT_TRUE(std::equal(h_reference_katz_centralities.begin(), - h_reference_katz_centralities.end(), - h_cugraph_katz_centralities.begin(), - nearly_equal)) - << "Katz centrality values do not match with the reference values."; + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + unrenumbered_graph = read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + std::vector h_weights{}; + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + if (unrenumbered_graph_view.is_weighted()) { + h_weights.assign(unrenumbered_graph_view.get_number_of_edges(), weight_t{0.0}); + raft::update_host(h_weights.data(), + unrenumbered_graph_view.weights(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + } + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + std::vector h_reference_katz_centralities( + unrenumbered_graph_view.get_number_of_vertices()); + + katz_centrality_reference( + h_offsets.data(), + h_indices.data(), + h_weights.size() > 0 ? 
h_weights.data() : static_cast(nullptr),
+ static_cast(nullptr),
+ h_reference_katz_centralities.data(),
+ unrenumbered_graph_view.get_number_of_vertices(),
+ alpha,
+ beta,
+ epsilon,
+ std::numeric_limits::max(),
+ false,
+ true);
+
+ std::vector h_cugraph_katz_centralities(graph_view.get_number_of_vertices());
+
+ raft::update_host(h_cugraph_katz_centralities.data(),
+ d_katz_centralities.data(),
+ d_katz_centralities.size(),
+ handle.get_stream());
+ handle.get_stream_view().synchronize();
+
+ auto threshold_ratio = 1e-3;
+ auto threshold_magnitude =
+ (1.0 / static_cast(graph_view.get_number_of_vertices())) *
+ threshold_ratio; // skip comparison for low Katz Centrality vertices (lowly ranked vertices)
+ auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
+ return std::abs(lhs - rhs) <
+ std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
+ };
+
+ ASSERT_TRUE(std::equal(h_reference_katz_centralities.begin(),
+ h_reference_katz_centralities.end(),
+ h_cugraph_katz_centralities.begin(),
+ nearly_equal))
+ << "Katz centrality values do not match with the reference values.";
+ }
 }
 };

@@ -254,6 +282,7 @@ INSTANTIATE_TEST_CASE_P(
 simple_test,
 Tests_KatzCentrality,
 ::testing::Values(
+ // enable correctness checks
 KatzCentrality_Usecase("test/datasets/karate.mtx", false),
 KatzCentrality_Usecase("test/datasets/karate.mtx", true),
 KatzCentrality_Usecase("test/datasets/web-Google.mtx", false),
 KatzCentrality_Usecase("test/datasets/web-Google.mtx", true),
 KatzCentrality_Usecase("test/datasets/ljournal-2008.mtx", false),
 KatzCentrality_Usecase("test/datasets/ljournal-2008.mtx", true),
 KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", false),
 KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", true),
 KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false},
- 0.0,
 false),
 KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false},
- 0.5,
- false),
- KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false},
- 0.0,
 true),
- KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false},
- 0.5,
- true)));
+ // disable correctness checks for large graphs
+ KatzCentrality_Usecase(cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false},
+ false,
+ false),
+ KatzCentrality_Usecase(cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false},
+ true,
+ false)));

 CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/pagerank_test.cpp b/cpp/tests/experimental/pagerank_test.cpp
index 4bd1946e163..cc63ae1e6c5 100644
--- a/cpp/tests/experimental/pagerank_test.cpp
+++ b/cpp/tests/experimental/pagerank_test.cpp
@@ -35,6 +35,11 @@
 #include 
 #include 

+// do the perf measurements
+// enabled by command line parameter '--perf'
+//
+static int PERF = 0;
+
 template 
 void pagerank_reference(edge_t const* offsets,
 vertex_t const* indices,
@@ -128,11 +133,15 @@ typedef struct PageRank_Usecase_t {

 double personalization_ratio{0.0};
 bool test_weighted{false};
+ bool check_correctness{false};

 PageRank_Usecase_t(std::string const& graph_file_path,
 double personalization_ratio,
- bool test_weighted)
- : personalization_ratio(personalization_ratio), test_weighted(test_weighted)
+ bool test_weighted,
+ bool check_correctness = true)
+ : personalization_ratio(personalization_ratio),
+ test_weighted(test_weighted),
+ check_correctness(check_correctness)
 {
 std::string graph_file_full_path{};
 if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
@@ -146,14 +155,46 @@ typedef struct PageRank_Usecase_t {

PageRank_Usecase_t(cugraph::test::rmat_params_t rmat_params, double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } PageRank_Usecase; +template +cugraph::experimental::graph_t read_graph( + raft::handle_t const& handle, PageRank_Usecase const& configuration, bool renumber) +{ + cugraph::experimental::graph_t graph(handle); + std::tie(graph, std::ignore) = + configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test::read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test::generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber); + + return graph; +} + class Tests_PageRank : public ::testing::TestWithParam { public: Tests_PageRank() {} @@ -170,50 +211,9 @@ class Tests_PageRank : public ::testing::TestWithParam { raft::handle_t handle{}; - cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? 
cugraph::test:: - read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - renumber) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - renumber); + auto graph = read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - std::vector h_weights{}; - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - if (graph_view.is_weighted()) { - h_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0}); - raft::update_host(h_weights.data(), - graph_view.weights(), - graph_view.get_number_of_edges(), - handle.get_stream()); - } - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - std::vector h_personalization_vertices{}; std::vector h_personalization_values{}; if (configuration.personalization_ratio > 0.0) { @@ -262,21 +262,6 @@ class Tests_PageRank : public ::testing::TestWithParam { result_t constexpr alpha{0.85}; result_t constexpr epsilon{1e-6}; - std::vector h_reference_pageranks(graph_view.get_number_of_vertices()); - - pagerank_reference(h_offsets.data(), - h_indices.data(), - h_weights.size() > 0 ? 
h_weights.data() : static_cast(nullptr), - h_personalization_vertices.data(), - h_personalization_values.data(), - h_reference_pageranks.data(), - graph_view.get_number_of_vertices(), - static_cast(h_personalization_vertices.size()), - alpha, - epsilon, - std::numeric_limits::max(), - false); - rmm::device_uvector d_pageranks(graph_view.get_number_of_vertices(), handle.get_stream()); @@ -297,26 +282,70 @@ class Tests_PageRank : public ::testing::TestWithParam { CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - std::vector h_cugraph_pageranks(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_pageranks.data(), d_pageranks.data(), d_pageranks.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - auto threshold_ratio = 1e-3; - auto threshold_magnitude = - (1.0 / static_cast(graph_view.get_number_of_vertices())) * - threshold_ratio; // skip comparison for low PageRank verties (lowly ranked vertices) - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - return std::abs(lhs - rhs) < - std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); - }; - - ASSERT_TRUE(std::equal(h_reference_pageranks.begin(), - h_reference_pageranks.end(), - h_cugraph_pageranks.begin(), - nearly_equal)) - << "PageRank values do not match with the reference values."; + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + unrenumbered_graph = read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + std::vector h_weights{}; + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + if (unrenumbered_graph_view.is_weighted()) { + h_weights.assign(unrenumbered_graph_view.get_number_of_edges(), weight_t{0.0}); + raft::update_host(h_weights.data(), + unrenumbered_graph_view.weights(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + } + handle.get_stream_view().synchronize(); + + std::vector h_reference_pageranks(unrenumbered_graph_view.get_number_of_vertices()); + + pagerank_reference(h_offsets.data(), + h_indices.data(), + h_weights.size() > 0 ? 
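// h_weights is left empty when the graph view is unweighted, so the ternary
// continuing below passes nullptr; the host reference presumably falls back
// to unit edge weights in that case (an inference from this test code, not a
// documented contract):
//
//   weight_t const* w_ptr = h_weights.size() > 0 ? h_weights.data() : nullptr;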
h_weights.data() : static_cast(nullptr),
+                         h_personalization_vertices.data(),
+                         h_personalization_values.data(),
+                         h_reference_pageranks.data(),
+                         unrenumbered_graph_view.get_number_of_vertices(),
+                         static_cast(h_personalization_vertices.size()),
+                         alpha,
+                         epsilon,
+                         std::numeric_limits::max(),
+                         false);
+
+      std::vector h_cugraph_pageranks(graph_view.get_number_of_vertices());
+
+      raft::update_host(
+        h_cugraph_pageranks.data(), d_pageranks.data(), d_pageranks.size(), handle.get_stream());
+      handle.get_stream_view().synchronize();
+
+      auto threshold_ratio = 1e-3;
+      auto threshold_magnitude =
+        (1.0 / static_cast(graph_view.get_number_of_vertices())) *
+        threshold_ratio;  // skip comparison for lowly ranked (low PageRank) vertices
+      auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
+        return std::abs(lhs - rhs) <
+               std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
+      };
+
+      ASSERT_TRUE(std::equal(h_reference_pageranks.begin(),
+                             h_reference_pageranks.end(),
+                             h_cugraph_pageranks.begin(),
+                             nearly_equal))
+        << "PageRank values do not match with the reference values.";
+    }
   }
 };
 
@@ -330,6 +359,7 @@ INSTANTIATE_TEST_CASE_P(
   simple_test,
   Tests_PageRank,
   ::testing::Values(
+    // enable correctness checks
     PageRank_Usecase("test/datasets/karate.mtx", 0.0, false),
     PageRank_Usecase("test/datasets/karate.mtx", 0.5, false),
     PageRank_Usecase("test/datasets/karate.mtx", 0.0, true),
@@ -357,6 +387,15 @@ INSTANTIATE_TEST_CASE_P(
                      true),
     PageRank_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false},
                      0.5,
-                     true)));
+                     true),
+    // disable correctness checks for large graphs
+    PageRank_Usecase(
+      cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, false, false),
+    PageRank_Usecase(
+      cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, false, false),
+    PageRank_Usecase(
+      cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, true, false),
+    PageRank_Usecase(
+      cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, true, false)));
 
 CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/sssp_test.cpp b/cpp/tests/experimental/sssp_test.cpp
index b8b6d2b8cde..b2224badb2b 100644
--- a/cpp/tests/experimental/sssp_test.cpp
+++ b/cpp/tests/experimental/sssp_test.cpp
@@ -34,6 +34,11 @@
 #include 
 #include 
 
+// do the perf measurements
+// enabled by the command line parameter '--perf'
+//
+static int PERF = 0;
+
 // Dijkstra's algorithm
 template 
 void sssp_reference(edge_t const* offsets,
@@ -80,9 +85,12 @@ void sssp_reference(edge_t const* offsets,
 
 typedef struct SSSP_Usecase_t {
   cugraph::test::input_graph_specifier_t input_graph_specifier{};
 
+  size_t source{false};
+  bool check_correctness{false};
 
-  SSSP_Usecase_t(std::string const& graph_file_path, size_t source) : source(source)
+  SSSP_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true)
+    : source(source), check_correctness(check_correctness)
   {
     std::string graph_file_full_path{};
     if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
@@ -94,13 +102,42 @@ typedef struct SSSP_Usecase_t {
     input_graph_specifier.graph_file_full_path = graph_file_full_path;
   };
 
-  SSSP_Usecase_t(cugraph::test::rmat_params_t rmat_params, size_t source) : source(source)
+  SSSP_Usecase_t(cugraph::test::rmat_params_t rmat_params,
+                 size_t source,
+                 bool check_correctness = true)
+    : source(source), check_correctness(check_correctness)
   {
     input_graph_specifier.tag =
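// The new PERF flag above gates wall-clock reporting; a minimal sketch of the
// intended pattern (std::chrono used here purely for illustration; the actual
// timing code is not part of this hunk):
//
//   auto t0 = std::chrono::steady_clock::now();
//   cugraph::experimental::sssp(handle, graph_view, /* ... */);
//   CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
//   if (PERF) {
//     std::chrono::duration<double> dt = std::chrono::steady_clock::now() - t0;
//     std::cout << "SSSP took " << dt.count() << " s\n";
//   }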
cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } SSSP_Usecase; +template +cugraph::experimental::graph_t read_graph( + raft::handle_t const& handle, SSSP_Usecase const& configuration, bool renumber) +{ + cugraph::experimental::graph_t graph(handle); + std::tie(graph, std::ignore) = + configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test::read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, true, renumber) + : cugraph::test::generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + true, + renumber); + + return graph; +} + class Tests_SSSP : public ::testing::TestWithParam { public: Tests_SSSP() {} @@ -117,60 +154,14 @@ class Tests_SSSP : public ::testing::TestWithParam { raft::handle_t handle{}; - cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? cugraph::test:: - read_graph_from_matrix_market_file( - handle, configuration.input_graph_specifier.graph_file_full_path, true, renumber) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - true, - renumber); + auto graph = read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - std::vector h_weights(graph_view.get_number_of_edges()); - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - raft::update_host(h_weights.data(), - graph_view.weights(), - graph_view.get_number_of_edges(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - ASSERT_TRUE(configuration.source >= 0 && configuration.source <= graph_view.get_number_of_vertices()) << "Starting sources should be >= 0 and" << " less than the number of vertices in the graph."; - std::vector h_reference_distances(graph_view.get_number_of_vertices()); - std::vector h_reference_predecessors(graph_view.get_number_of_vertices()); - - sssp_reference(h_offsets.data(), - h_indices.data(), - h_weights.data(), - h_reference_distances.data(), - h_reference_predecessors.data(), - graph_view.get_number_of_vertices(), - static_cast(configuration.source)); - rmm::device_uvector 
d_distances(graph_view.get_number_of_vertices(), handle.get_stream()); rmm::device_uvector d_predecessors(graph_view.get_number_of_vertices(), @@ -188,45 +179,83 @@ class Tests_SSSP : public ::testing::TestWithParam { CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); - std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); - raft::update_host(h_cugraph_predecessors.data(), - d_predecessors.data(), - d_predecessors.size(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - auto max_weight_element = std::max_element(h_weights.begin(), h_weights.end()); - auto epsilon = *max_weight_element * weight_t{1e-6}; - auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; - - ASSERT_TRUE(std::equal(h_reference_distances.begin(), - h_reference_distances.end(), - h_cugraph_distances.begin(), - nearly_equal)) - << "distances do not match with the reference values."; - - for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { - auto i = std::distance(h_cugraph_predecessors.begin(), it); - if (*it == cugraph::invalid_vertex_id::value) { - ASSERT_TRUE(h_reference_predecessors[i] == *it) - << "vertex reachability do not match with the reference."; - } else { - auto pred_distance = h_reference_distances[*it]; - bool found{false}; - for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { - if (h_indices[j] == i) { - if (nearly_equal(pred_distance + h_weights[j], h_reference_distances[i])) { - found = true; - break; + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + unrenumbered_graph = read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? 
unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + std::vector h_weights(unrenumbered_graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + raft::update_host(h_weights.data(), + unrenumbered_graph_view.weights(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + handle.get_stream_view().synchronize(); + + std::vector h_reference_distances(unrenumbered_graph_view.get_number_of_vertices()); + std::vector h_reference_predecessors( + unrenumbered_graph_view.get_number_of_vertices()); + + sssp_reference(h_offsets.data(), + h_indices.data(), + h_weights.data(), + h_reference_distances.data(), + h_reference_predecessors.data(), + unrenumbered_graph_view.get_number_of_vertices(), + static_cast(configuration.source)); + + std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); + std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); + + raft::update_host( + h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_predecessors.data(), + d_predecessors.size(), + handle.get_stream()); + handle.get_stream_view().synchronize(); + + auto max_weight_element = std::max_element(h_weights.begin(), h_weights.end()); + auto epsilon = *max_weight_element * weight_t{1e-6}; + auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; + + ASSERT_TRUE(std::equal(h_reference_distances.begin(), + h_reference_distances.end(), + h_cugraph_distances.begin(), + nearly_equal)) + << "distances do not match with the reference values."; + + for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { + auto i = std::distance(h_cugraph_predecessors.begin(), it); + if (*it == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_reference_predecessors[i] == *it) + << "vertex reachability do not match with the reference."; + } else { + auto pred_distance = h_reference_distances[*it]; + bool found{false}; + for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { + if (h_indices[j] == i) { + if (nearly_equal(pred_distance + h_weights[j], h_reference_distances[i])) { + found = true; + break; + } } } + ASSERT_TRUE(found) + << "no edge from the predecessor vertex to this vertex with the matching weight."; } - ASSERT_TRUE(found) - << "no edge from the predecessor vertex to this vertex with the matching weight."; } } } @@ -239,9 +268,12 @@ INSTANTIATE_TEST_CASE_P( simple_test, Tests_SSSP, ::testing::Values( + // enable correctness checks SSSP_Usecase("test/datasets/karate.mtx", 0), SSSP_Usecase("test/datasets/dblp.mtx", 0), SSSP_Usecase("test/datasets/wiki2003.mtx", 1000), - SSSP_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0))); + SSSP_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), + // disable correctness checks for large graphs + SSSP_Usecase(cugraph::test::rmat_params_t{25, 16, 0.57, 0.19, 0.19, 0, false, false}, 0, false))); CUGRAPH_TEST_PROGRAM_MAIN() From e3add17ef450e5e13a33ce1fac52bedb9a00775f Mon Sep 17 00:00:00 2001 
From: Seunghwa Kang 
Date: Wed, 17 Mar 2021 15:55:16 -0400
Subject: [PATCH 17/63] bug fix: pass the edge count, not the edge factor, to
 generate_rmat_edgelist

generate_rmat_edgelist takes the number of edges to generate, so passing
edge_factor requested only edge_factor edges; e.g., scale = 10 with
edge_factor = 16 should produce (1 << 10) * 16 = 16384 edges.

---
 cpp/tests/utilities/rmat_utilities.cu | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/utilities/rmat_utilities.cu b/cpp/tests/utilities/rmat_utilities.cu
index 16ea7a486fc..9ab05cfbbc2 100644
--- a/cpp/tests/utilities/rmat_utilities.cu
+++ b/cpp/tests/utilities/rmat_utilities.cu
@@ -50,8 +50,15 @@ generate_graph_from_rmat_params(raft::handle_t const& handle,
   rmm::device_uvector d_edgelist_rows(0, handle.get_stream());
   rmm::device_uvector d_edgelist_cols(0, handle.get_stream());
   std::tie(d_edgelist_rows, d_edgelist_cols) =
-    cugraph::experimental::generate_rmat_edgelist(
-      handle, scale, edge_factor, a, b, c, seed, undirected ? true : false, scramble_vertex_ids);
+    cugraph::experimental::generate_rmat_edgelist(handle,
+                                                  scale,
+                                                  (size_t{1} << scale) * edge_factor,
+                                                  a,
+                                                  b,
+                                                  c,
+                                                  seed,
+                                                  undirected,
+                                                  scramble_vertex_ids);
   if (undirected) {
     // FIXME: need to symmetrize
     CUGRAPH_FAIL("unimplemented.");

From 550ef1eb582081b4ec1464efef02f42c3eb06a6e Mon Sep 17 00:00:00 2001
From: Seunghwa Kang 
Date: Thu, 18 Mar 2021 16:56:31 -0400
Subject: [PATCH 18/63] add test renumber utilities

---
 cpp/tests/utilities/renumber_utilities.cu  | 136 +++++++++++++++++++++
 cpp/tests/utilities/renumber_utilities.hpp |  39 ++++++
 2 files changed, 175 insertions(+)
 create mode 100644 cpp/tests/utilities/renumber_utilities.cu
 create mode 100644 cpp/tests/utilities/renumber_utilities.hpp

diff --git a/cpp/tests/utilities/renumber_utilities.cu b/cpp/tests/utilities/renumber_utilities.cu
new file mode 100644
index 00000000000..ef89c84f65a
--- /dev/null
+++ b/cpp/tests/utilities/renumber_utilities.cu
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include + +#include +#include + +#include +#include + +namespace cugraph { +namespace test { + +template +std::tuple, rmm::device_uvector> unrenumber_kv_pairs( + raft::handle_t const& handle, + vertex_t const* keys /* 0 <= keys[] < renumber_map_size */, + value_t const* values, + size_t num_pairs, + vertex_t const* renumber_map_labels, + size_t renumber_map_size) +{ + rmm::device_uvector unrenumbered_keys(num_pairs, handle.get_stream_view()); + rmm::device_uvector values_for_unrenumbered_keys(num_pairs, handle.get_stream_view()); + + auto unrenumbered_key_first = thrust::make_transform_iterator( + keys, [renumber_map_labels] __device__(auto v) { return renumber_map_labels[v]; }); + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + unrenumbered_key_first, + unrenumbered_key_first + num_pairs, + unrenumbered_keys.begin()); + thrust::copy(rmm::exec_policy(handle.get_stream_view()), + values, + values + num_pairs, + values_for_unrenumbered_keys.begin()); + + thrust::sort_by_key(rmm::exec_policy(handle.get_stream_view()), + unrenumbered_keys.begin(), + unrenumbered_keys.end(), + values_for_unrenumbered_keys.begin()); + + return std::make_tuple(std::move(unrenumbered_keys), std::move(values_for_unrenumbered_keys)); +} + +template +rmm::device_uvector sort_values_by_key(raft::handle_t const& handle, + vertex_t const* keys, + value_t const* values, + size_t num_pairs) +{ + rmm::device_uvector sorted_keys(num_pairs, handle.get_stream_view()); + rmm::device_uvector sorted_values(num_pairs, handle.get_stream_view()); + + thrust::copy( + rmm::exec_policy(handle.get_stream_view()), keys, keys + num_pairs, sorted_keys.begin()); + thrust::copy( + rmm::exec_policy(handle.get_stream_view()), values, values + num_pairs, sorted_values.begin()); + + thrust::sort_by_key(rmm::exec_policy(handle.get_stream_view()), + sorted_keys.begin(), + sorted_keys.end(), + sorted_values.begin()); + + return sorted_values; +} + +template std::tuple, rmm::device_uvector> +unrenumber_kv_pairs(raft::handle_t const& handle, + int32_t const* keys, + float const* values, + size_t num_pairs, + int32_t const* renumber_map_labels, + size_t renumber_map_size); + +template std::tuple, rmm::device_uvector> +unrenumber_kv_pairs(raft::handle_t const& handle, + int32_t const* keys, + double const* values, + size_t num_pairs, + int32_t const* renumber_map_labels, + size_t renumber_map_size); + +template std::tuple, rmm::device_uvector> +unrenumber_kv_pairs(raft::handle_t const& handle, + int64_t const* keys, + float const* values, + size_t num_pairs, + int64_t const* renumber_map_labels, + size_t renumber_map_size); + +template std::tuple, rmm::device_uvector> +unrenumber_kv_pairs(raft::handle_t const& handle, + int64_t const* keys, + double const* values, + size_t num_pairs, + int64_t const* renumber_map_labels, + size_t renumber_map_size); + +template rmm::device_uvector sort_values_by_key(raft::handle_t const& handle, + int32_t const* keys, + float const* values, + size_t num_pairs); + +template rmm::device_uvector sort_values_by_key( + raft::handle_t const& handle, int32_t const* keys, double const* values, size_t num_pairs); + +template rmm::device_uvector sort_values_by_key( + raft::handle_t const& handle, int32_t const* keys, int32_t const* values, size_t num_pairs); + +template rmm::device_uvector sort_values_by_key(raft::handle_t const& handle, + int64_t const* keys, + float const* values, + size_t num_pairs); + +template rmm::device_uvector sort_values_by_key( + raft::handle_t const& handle, int64_t const* keys, 
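// A hand-worked example of what these utilities are for (values invented for
// illustration): renumber_map_labels[i] holds the original id of internal
// vertex i, so sorting (key = label, value = result) pairs by key reorders
// per-vertex results into original-id order for comparison against an
// unrenumbered reference.
//
//   renumber_map_labels = {2, 0, 1}      // internal 0 -> original 2, ...
//   values              = {v0, v1, v2}   // indexed by internal vertex id
//   sort_values_by_key  -> {v1, v2, v0}  // indexed by original vertex id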
double const* values, size_t num_pairs); + +template rmm::device_uvector sort_values_by_key( + raft::handle_t const& handle, int64_t const* keys, int64_t const* values, size_t num_pairs); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/utilities/renumber_utilities.hpp b/cpp/tests/utilities/renumber_utilities.hpp new file mode 100644 index 00000000000..3e3d651c3e4 --- /dev/null +++ b/cpp/tests/utilities/renumber_utilities.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace cugraph { +namespace test { + +template +std::tuple, rmm::device_uvector> unrenumber_kv_pairs( + raft::handle_t const& handle, + vertex_t const* keys, + value_t const* values, + size_t num_pairs, + vertex_t const* renumber_map_labels, + size_t renumber_map_size); + +template +rmm::device_uvector sort_values_by_key(raft::handle_t const& handle, + vertex_t const* keys, + value_t const* values, + size_t num_pairs); + +} // namespace test +} // namespace cugraph From 14026e0bc597f75331bf6b318605b8b871aecc86 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 18 Mar 2021 16:59:42 -0400 Subject: [PATCH 19/63] update SG C++ BFS/SSSP/PageRank/Katz --- cpp/tests/experimental/bfs_test.cpp | 131 +++++++++++++----- .../experimental/katz_centrality_test.cpp | 83 ++++++----- cpp/tests/experimental/pagerank_test.cpp | 112 ++++++++++----- cpp/tests/experimental/sssp_test.cpp | 131 +++++++++++++----- 4 files changed, 318 insertions(+), 139 deletions(-) diff --git a/cpp/tests/experimental/bfs_test.cpp b/cpp/tests/experimental/bfs_test.cpp index 8bcb5de7043..6df768aba82 100644 --- a/cpp/tests/experimental/bfs_test.cpp +++ b/cpp/tests/experimental/bfs_test.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -28,6 +29,7 @@ #include +#include #include #include #include @@ -107,29 +109,28 @@ typedef struct BFS_Usecase_t { } BFS_Usecase; template -cugraph::experimental::graph_t read_graph( - raft::handle_t const& handle, BFS_Usecase const& configuration, bool renumber) +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, BFS_Usecase const& configuration, bool renumber) { - cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? 
cugraph::test::read_graph_from_matrix_market_file( - handle, configuration.input_graph_specifier.graph_file_full_path, false, renumber) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - false, - renumber); - - return graph; + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, false, renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + false, + renumber); } class Tests_BFS : public ::testing::TestWithParam { @@ -144,17 +145,35 @@ class Tests_BFS : public ::testing::TestWithParam { template void run_current_test(BFS_Usecase const& configuration) { - constexpr bool renumber = false; + constexpr bool renumber = true; using weight_t = float; raft::handle_t handle{}; - auto graph = read_graph(handle, configuration, renumber); + cugraph::experimental::graph_t graph(handle); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - ASSERT_TRUE(configuration.source >= 0 && - configuration.source <= graph_view.get_number_of_vertices()) + auto source = static_cast(configuration.source); + if (renumber) { + std::vector h_renumber_map_labels(d_renumber_map_labels.size()); + raft::update_host(h_renumber_map_labels.data(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + source = static_cast(thrust::distance( + h_renumber_map_labels.begin(), + std::find( + h_renumber_map_labels.begin(), h_renumber_map_labels.end(), configuration.source))); + } + + ASSERT_TRUE(source >= 0 && source < graph_view.get_number_of_vertices()) << "Starting sources should be >= 0 and" << " less than the number of vertices in the graph."; @@ -169,7 +188,7 @@ class Tests_BFS : public ::testing::TestWithParam { graph_view, d_distances.begin(), d_predecessors.begin(), - static_cast(configuration.source), + source, false, std::numeric_limits::max(), false); @@ -180,7 +199,8 @@ class Tests_BFS : public ::testing::TestWithParam { cugraph::experimental::graph_t unrenumbered_graph( handle); if (renumber) { - unrenumbered_graph = read_graph(handle, configuration, false); + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); } auto unrenumbered_graph_view = renumber ? 
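// Note on the source lookup above: the renumber map goes internal id ->
// original id, so the user-supplied source vertex is translated the other way
// by searching for its position in the map; condensed restatement (same names
// as above, host std::distance in place of thrust::distance):
//
//   auto it = std::find(
//     h_renumber_map_labels.begin(), h_renumber_map_labels.end(), configuration.source);
//   source  = static_cast<vertex_t>(std::distance(h_renumber_map_labels.begin(), it));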
unrenumbered_graph.view() : graph_view; @@ -194,6 +214,7 @@ class Tests_BFS : public ::testing::TestWithParam { unrenumbered_graph_view.indices(), unrenumbered_graph_view.get_number_of_edges(), handle.get_stream()); + handle.get_stream_view().synchronize(); std::vector h_reference_distances(unrenumbered_graph_view.get_number_of_vertices()); @@ -210,14 +231,48 @@ class Tests_BFS : public ::testing::TestWithParam { std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); - raft::update_host(h_cugraph_predecessors.data(), - d_predecessors.data(), - d_predecessors.size(), - handle.get_stream()); - handle.get_stream_view().synchronize(); + if (renumber) { + auto d_unrenumbered_distances = cugraph::test::sort_values_by_key( + handle, d_renumber_map_labels.data(), d_distances.data(), d_renumber_map_labels.size()); + auto d_unrenumbered_predecessors = + cugraph::test::sort_values_by_key(handle, + d_renumber_map_labels.data(), + d_predecessors.data(), + d_renumber_map_labels.size()); + raft::update_host(h_cugraph_distances.data(), + d_unrenumbered_distances.data(), + d_unrenumbered_distances.size(), + handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_unrenumbered_predecessors.data(), + d_unrenumbered_predecessors.size(), + handle.get_stream()); + + std::vector h_renumber_map_labels(d_renumber_map_labels.size()); + raft::update_host(h_renumber_map_labels.data(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + std::transform( + h_cugraph_predecessors.begin(), + h_cugraph_predecessors.end(), + h_cugraph_predecessors.begin(), + [&h_renumber_map_labels](auto v) { + return v == cugraph::invalid_vertex_id::value ? 
v : h_renumber_map_labels[v]; + }); + } else { + raft::update_host( + h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_predecessors.data(), + d_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + } ASSERT_TRUE(std::equal( h_reference_distances.begin(), h_reference_distances.end(), h_cugraph_distances.begin())) @@ -261,6 +316,8 @@ INSTANTIATE_TEST_CASE_P( BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000), BFS_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), // disable correctness checks for large graphs - BFS_Usecase(cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0, false))); + BFS_Usecase(cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, + 0, + false))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/katz_centrality_test.cpp b/cpp/tests/experimental/katz_centrality_test.cpp index 27b377a99d3..c8dc3ec5fd5 100644 --- a/cpp/tests/experimental/katz_centrality_test.cpp +++ b/cpp/tests/experimental/katz_centrality_test.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -125,32 +126,31 @@ typedef struct KatzCentrality_Usecase_t { } KatzCentrality_Usecase; template -cugraph::experimental::graph_t read_graph( - raft::handle_t const& handle, KatzCentrality_Usecase const& configuration, bool renumber) +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, KatzCentrality_Usecase const& configuration, bool renumber) { - cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? cugraph::test::read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - renumber) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - renumber); - - return graph; + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? 
cugraph::test:: + read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber); } class Tests_KatzCentrality : public ::testing::TestWithParam { @@ -165,11 +165,14 @@ class Tests_KatzCentrality : public ::testing::TestWithParam void run_current_test(KatzCentrality_Usecase const& configuration) { - constexpr bool renumber = false; + constexpr bool renumber = true; raft::handle_t handle{}; - auto graph = read_graph(handle, configuration, renumber); + cugraph::experimental::graph_t graph(handle); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); auto graph_view = graph.view(); auto degrees = graph_view.compute_in_degrees(handle); @@ -205,7 +208,8 @@ class Tests_KatzCentrality : public ::testing::TestWithParam unrenumbered_graph( handle); if (renumber) { - unrenumbered_graph = read_graph(handle, configuration, false); + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); } auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; @@ -227,7 +231,8 @@ class Tests_KatzCentrality : public ::testing::TestWithParam h_reference_katz_centralities( unrenumbered_graph_view.get_number_of_vertices()); @@ -247,11 +252,23 @@ class Tests_KatzCentrality : public ::testing::TestWithParam h_cugraph_katz_centralities(graph_view.get_number_of_vertices()); + if (renumber) { + auto d_unrenumbered_katz_centralities = + cugraph::test::sort_values_by_key(handle, + d_renumber_map_labels.data(), + d_katz_centralities.data(), + d_renumber_map_labels.size()); + raft::update_host(h_cugraph_katz_centralities.data(), + d_unrenumbered_katz_centralities.data(), + d_unrenumbered_katz_centralities.size(), + handle.get_stream()); + } else { + raft::update_host(h_cugraph_katz_centralities.data(), + d_katz_centralities.data(), + d_katz_centralities.size(), + handle.get_stream()); + } - raft::update_host(h_cugraph_katz_centralities.data(), - d_katz_centralities.data(), - d_katz_centralities.size(), - handle.get_stream()); handle.get_stream_view().synchronize(); auto threshold_ratio = 1e-3; diff --git a/cpp/tests/experimental/pagerank_test.cpp b/cpp/tests/experimental/pagerank_test.cpp index cc63ae1e6c5..49feaffd69f 100644 --- a/cpp/tests/experimental/pagerank_test.cpp +++ b/cpp/tests/experimental/pagerank_test.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -167,32 +168,31 @@ typedef struct PageRank_Usecase_t { } PageRank_Usecase; template -cugraph::experimental::graph_t read_graph( - raft::handle_t const& handle, PageRank_Usecase const& configuration, bool renumber) +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, PageRank_Usecase const& configuration, bool renumber) { - cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - 
configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? cugraph::test::read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - renumber) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - renumber); - - return graph; + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber); } class Tests_PageRank : public ::testing::TestWithParam { @@ -207,11 +207,14 @@ class Tests_PageRank : public ::testing::TestWithParam { template void run_current_test(PageRank_Usecase const& configuration) { - constexpr bool renumber = false; + constexpr bool renumber = true; raft::handle_t handle{}; - auto graph = read_graph(handle, configuration, renumber); + cugraph::experimental::graph_t graph(handle); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); auto graph_view = graph.view(); std::vector h_personalization_vertices{}; @@ -286,7 +289,8 @@ class Tests_PageRank : public ::testing::TestWithParam { cugraph::experimental::graph_t unrenumbered_graph( handle); if (renumber) { - unrenumbered_graph = read_graph(handle, configuration, false); + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); } auto unrenumbered_graph_view = renumber ? 
unrenumbered_graph.view() : graph_view; @@ -308,6 +312,41 @@ class Tests_PageRank : public ::testing::TestWithParam { unrenumbered_graph_view.get_number_of_edges(), handle.get_stream()); } + + std::vector h_unrenumbered_personalization_vertices( + d_personalization_vertices.size()); + std::vector h_unrenumbered_personalization_values(d_personalization_values.size()); + if (renumber) { + rmm::device_uvector d_unrenumbered_personalization_vertices(0, + handle.get_stream()); + rmm::device_uvector d_unrenumbered_personalization_values(0, handle.get_stream()); + std::tie(d_unrenumbered_personalization_vertices, d_unrenumbered_personalization_values) = + cugraph::test::unrenumber_kv_pairs(handle, + d_personalization_vertices.data(), + d_personalization_values.data(), + d_personalization_vertices.size(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size()); + + raft::update_host(h_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.size(), + handle.get_stream()); + raft::update_host(h_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_values.size(), + handle.get_stream()); + } else { + raft::update_host(h_unrenumbered_personalization_vertices.data(), + d_personalization_vertices.data(), + d_personalization_vertices.size(), + handle.get_stream()); + raft::update_host(h_unrenumbered_personalization_values.data(), + d_personalization_values.data(), + d_personalization_values.size(), + handle.get_stream()); + } + handle.get_stream_view().synchronize(); std::vector h_reference_pageranks(unrenumbered_graph_view.get_number_of_vertices()); @@ -315,8 +354,8 @@ class Tests_PageRank : public ::testing::TestWithParam { pagerank_reference(h_offsets.data(), h_indices.data(), h_weights.size() > 0 ? 
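// Unlike the dense pagerank vector (reordered later via sort_values_by_key),
// the personalization input is a sparse (vertex, value) map, so the
// unrenumbering above rewrites the keys themselves; a hand-worked example
// (invented values):
//
//   renumber_map_labels = {2, 0, 1}              // internal -> original
//   pairs (renumbered)  = {(0, 0.7), (2, 0.3)}
//   unrenumber_kv_pairs -> {(1, 0.3), (2, 0.7)}  // original-id keys, sorted by key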
h_weights.data() : static_cast(nullptr), - h_personalization_vertices.data(), - h_personalization_values.data(), + h_unrenumbered_personalization_vertices.data(), + h_unrenumbered_personalization_values.data(), h_reference_pageranks.data(), unrenumbered_graph_view.get_number_of_vertices(), static_cast(h_personalization_vertices.size()), @@ -326,9 +365,18 @@ class Tests_PageRank : public ::testing::TestWithParam { false); std::vector h_cugraph_pageranks(graph_view.get_number_of_vertices()); + if (renumber) { + auto d_unrenumbered_pageranks = cugraph::test::sort_values_by_key( + handle, d_renumber_map_labels.data(), d_pageranks.data(), d_renumber_map_labels.size()); + raft::update_host(h_cugraph_pageranks.data(), + d_unrenumbered_pageranks.data(), + d_unrenumbered_pageranks.size(), + handle.get_stream()); + } else { + raft::update_host( + h_cugraph_pageranks.data(), d_pageranks.data(), d_pageranks.size(), handle.get_stream()); + } - raft::update_host( - h_cugraph_pageranks.data(), d_pageranks.data(), d_pageranks.size(), handle.get_stream()); handle.get_stream_view().synchronize(); auto threshold_ratio = 1e-3; diff --git a/cpp/tests/experimental/sssp_test.cpp b/cpp/tests/experimental/sssp_test.cpp index b2224badb2b..7ab4321aefb 100644 --- a/cpp/tests/experimental/sssp_test.cpp +++ b/cpp/tests/experimental/sssp_test.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -28,6 +29,7 @@ #include +#include #include #include #include @@ -113,29 +115,28 @@ typedef struct SSSP_Usecase_t { } SSSP_Usecase; template -cugraph::experimental::graph_t read_graph( - raft::handle_t const& handle, SSSP_Usecase const& configuration, bool renumber) +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, SSSP_Usecase const& configuration, bool renumber) { - cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? cugraph::test::read_graph_from_matrix_market_file( - handle, configuration.input_graph_specifier.graph_file_full_path, true, renumber) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - true, - renumber); - - return graph; + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? 
cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, true, renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + true, + renumber); } class Tests_SSSP : public ::testing::TestWithParam { @@ -150,15 +151,33 @@ class Tests_SSSP : public ::testing::TestWithParam { template void run_current_test(SSSP_Usecase const& configuration) { - constexpr bool renumber = false; + constexpr bool renumber = true; raft::handle_t handle{}; - auto graph = read_graph(handle, configuration, renumber); + cugraph::experimental::graph_t graph(handle); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - ASSERT_TRUE(configuration.source >= 0 && - configuration.source <= graph_view.get_number_of_vertices()) + auto source = static_cast(configuration.source); + if (renumber) { + std::vector h_renumber_map_labels(d_renumber_map_labels.size()); + raft::update_host(h_renumber_map_labels.data(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + source = static_cast(thrust::distance( + h_renumber_map_labels.begin(), + std::find( + h_renumber_map_labels.begin(), h_renumber_map_labels.end(), configuration.source))); + } + + ASSERT_TRUE(source >= 0 && source < graph_view.get_number_of_vertices()) << "Starting sources should be >= 0 and" << " less than the number of vertices in the graph."; @@ -173,7 +192,7 @@ class Tests_SSSP : public ::testing::TestWithParam { graph_view, d_distances.begin(), d_predecessors.begin(), - static_cast(configuration.source), + source, std::numeric_limits::max(), false); @@ -183,7 +202,8 @@ class Tests_SSSP : public ::testing::TestWithParam { cugraph::experimental::graph_t unrenumbered_graph( handle); if (renumber) { - unrenumbered_graph = read_graph(handle, configuration, false); + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); } auto unrenumbered_graph_view = renumber ? 
unrenumbered_graph.view() : graph_view; @@ -202,6 +222,7 @@ class Tests_SSSP : public ::testing::TestWithParam { unrenumbered_graph_view.weights(), unrenumbered_graph_view.get_number_of_edges(), handle.get_stream()); + handle.get_stream_view().synchronize(); std::vector h_reference_distances(unrenumbered_graph_view.get_number_of_vertices()); @@ -218,14 +239,48 @@ class Tests_SSSP : public ::testing::TestWithParam { std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); - raft::update_host(h_cugraph_predecessors.data(), - d_predecessors.data(), - d_predecessors.size(), - handle.get_stream()); - handle.get_stream_view().synchronize(); + if (renumber) { + auto d_unrenumbered_distances = cugraph::test::sort_values_by_key( + handle, d_renumber_map_labels.data(), d_distances.data(), d_renumber_map_labels.size()); + auto d_unrenumbered_predecessors = + cugraph::test::sort_values_by_key(handle, + d_renumber_map_labels.data(), + d_predecessors.data(), + d_renumber_map_labels.size()); + raft::update_host(h_cugraph_distances.data(), + d_unrenumbered_distances.data(), + d_unrenumbered_distances.size(), + handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_unrenumbered_predecessors.data(), + d_unrenumbered_predecessors.size(), + handle.get_stream()); + + std::vector h_renumber_map_labels(d_renumber_map_labels.size()); + raft::update_host(h_renumber_map_labels.data(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + std::transform( + h_cugraph_predecessors.begin(), + h_cugraph_predecessors.end(), + h_cugraph_predecessors.begin(), + [&h_renumber_map_labels](auto v) { + return v == cugraph::invalid_vertex_id::value ? 
v : h_renumber_map_labels[v]; + }); + } else { + raft::update_host( + h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_predecessors.data(), + d_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + } auto max_weight_element = std::max_element(h_weights.begin(), h_weights.end()); auto epsilon = *max_weight_element * weight_t{1e-6}; @@ -274,6 +329,8 @@ INSTANTIATE_TEST_CASE_P( SSSP_Usecase("test/datasets/wiki2003.mtx", 1000), SSSP_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), // disable correctness checks for large graphs - SSSP_Usecase(cugraph::test::rmat_params_t{25, 16, 0.57, 0.19, 0.19, 0, false, false}, 0, false))); + SSSP_Usecase(cugraph::test::rmat_params_t{25, 16, 0.57, 0.19, 0.19, 0, false, false}, + 0, + false))); CUGRAPH_TEST_PROGRAM_MAIN() From af411875f955e5ce8d06d348d385953bc7dbb2f9 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 18 Mar 2021 17:20:40 -0400 Subject: [PATCH 20/63] CMake update for adding test renumber utilities --- cpp/tests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 1db2f9df42e..268b8bd9bde 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -23,6 +23,7 @@ add_library(cugraphtestutil STATIC "${CMAKE_CURRENT_SOURCE_DIR}/utilities/generate_graph_from_edgelist.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/matrix_market_file_utilities.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/rmat_utilities.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/utilities/renumber_utilities.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/misc_utilities.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c") From 0eb0b88d677092790b84db2a937e945af3e9d9ac Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 18 Mar 2021 22:48:10 -0400 Subject: [PATCH 21/63] update MG C++ PageRank tests to promote performance testing --- cpp/tests/experimental/pagerank_test.cpp | 3 +- cpp/tests/pagerank/mg_pagerank_test.cpp | 362 +++++++++++---------- cpp/tests/utilities/renumber_utilities.cu | 23 +- cpp/tests/utilities/renumber_utilities.hpp | 5 +- 4 files changed, 204 insertions(+), 189 deletions(-) diff --git a/cpp/tests/experimental/pagerank_test.cpp b/cpp/tests/experimental/pagerank_test.cpp index 49feaffd69f..4e846395a96 100644 --- a/cpp/tests/experimental/pagerank_test.cpp +++ b/cpp/tests/experimental/pagerank_test.cpp @@ -326,7 +326,8 @@ class Tests_PageRank : public ::testing::TestWithParam { d_personalization_values.data(), d_personalization_vertices.size(), d_renumber_map_labels.data(), - d_renumber_map_labels.size()); + vertex_t{0}, + static_cast(d_renumber_map_labels.size())); raft::update_host(h_unrenumbered_personalization_vertices.data(), d_unrenumbered_personalization_vertices.data(), diff --git a/cpp/tests/pagerank/mg_pagerank_test.cpp b/cpp/tests/pagerank/mg_pagerank_test.cpp index 85ee9a4243e..8e801ea8085 100644 --- a/cpp/tests/pagerank/mg_pagerank_test.cpp +++ b/cpp/tests/pagerank/mg_pagerank_test.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -23,6 +24,8 @@ #include #include #include +#include +#include #include @@ -33,11 +36,15 @@ typedef struct PageRank_Usecase_t { double personalization_ratio{0.0}; bool test_weighted{false}; + bool check_correctness{false}; PageRank_Usecase_t(std::string const& graph_file_path, double personalization_ratio, - bool test_weighted) - : 
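// The unrenumber_kv_pairs signature change above replaces the plain map size
// with a local vertex range [local_vertex_first, local_vertex_last): in
// multi-GPU runs each rank holds renumber-map labels only for its own
// contiguous vertex range, so keys are presumably offset by
// local_vertex_first before the label lookup. Single-GPU callers pass the
// full range, as in the pagerank_test.cpp hunk above:
//
//   cugraph::test::unrenumber_kv_pairs(handle,
//                                      d_personalization_vertices.data(),
//                                      d_personalization_values.data(),
//                                      d_personalization_vertices.size(),
//                                      d_renumber_map_labels.data(),
//                                      vertex_t{0},
//                                      static_cast<vertex_t>(d_renumber_map_labels.size()));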
personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { std::string graph_file_full_path{}; if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -51,14 +58,45 @@ typedef struct PageRank_Usecase_t { PageRank_Usecase_t(cugraph::test::rmat_params_t rmat_params, double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } PageRank_Usecase; +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, PageRank_Usecase const& configuration, bool renumber) +{ + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber); +} + class Tests_MGPageRank : public ::testing::TestWithParam { public: Tests_MGPageRank() {} @@ -86,168 +124,40 @@ class Tests_MGPageRank : public ::testing::TestWithParam { cugraph::partition_2d::subcomm_factory_t subcomm_factory(handle, row_comm_size); - // 2. create SG & MG graphs - - cugraph::experimental::graph_t sg_graph(handle); - rmm::device_uvector d_sg_renumber_map_labels(0, handle.get_stream()); - std::tie(sg_graph, d_sg_renumber_map_labels) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? cugraph::test:: - read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - true) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - true); - - auto sg_graph_view = sg_graph.view(); + // 2. 
create MG graph cugraph::experimental::graph_t mg_graph(handle); rmm::device_uvector d_mg_renumber_map_labels(0, handle.get_stream()); std::tie(mg_graph, d_mg_renumber_map_labels) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? cugraph::test::read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - true) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - true); + read_graph(handle, configuration, true); auto mg_graph_view = mg_graph.view(); - std::vector h_sg_renumber_map_labels(d_sg_renumber_map_labels.size()); - raft::update_host(h_sg_renumber_map_labels.data(), - d_sg_renumber_map_labels.data(), - d_sg_renumber_map_labels.size(), - handle.get_stream()); - - std::vector h_mg_renumber_map_labels(mg_graph_view.get_number_of_local_vertices()); - raft::update_host(h_mg_renumber_map_labels.data(), - d_mg_renumber_map_labels.data(), - d_mg_renumber_map_labels.size(), - handle.get_stream()); - - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - // 2. generate personalization vertex/value pairs + // 3. generate personalization vertex/value pairs - std::vector h_personalization_vertices{}; - std::vector h_personalization_values{}; + std::vector h_mg_personalization_vertices{}; + std::vector h_mg_personalization_values{}; if (configuration.personalization_ratio > 0.0) { - std::default_random_engine generator{}; + std::default_random_engine generator{ + static_cast(comm.get_rank()) /* seed */}; std::uniform_real_distribution distribution{0.0, 1.0}; - h_personalization_vertices.resize(sg_graph_view.get_number_of_vertices()); - std::iota(h_personalization_vertices.begin(), h_personalization_vertices.end(), vertex_t{0}); - h_personalization_vertices.erase( - std::remove_if(h_personalization_vertices.begin(), - h_personalization_vertices.end(), + h_mg_personalization_vertices.resize(mg_graph_view.get_number_of_local_vertices()); + std::iota(h_mg_personalization_vertices.begin(), + h_mg_personalization_vertices.end(), + mg_graph_view.get_local_vertex_first()); + h_mg_personalization_vertices.erase( + std::remove_if(h_mg_personalization_vertices.begin(), + h_mg_personalization_vertices.end(), [&generator, &distribution, configuration](auto v) { return distribution(generator) >= configuration.personalization_ratio; }), - h_personalization_vertices.end()); - h_personalization_values.resize(h_personalization_vertices.size()); - std::for_each(h_personalization_values.begin(), - h_personalization_values.end(), + h_mg_personalization_vertices.end()); + h_mg_personalization_values.resize(h_mg_personalization_vertices.size()); + std::for_each(h_mg_personalization_values.begin(), + h_mg_personalization_values.end(), [&distribution, &generator](auto& val) { val = distribution(generator); }); } - result_t constexpr alpha{0.85}; - result_t constexpr epsilon{1e-6}; - - // 3. 
run SG pagerank - - std::vector h_sg_personalization_vertices{}; - std::vector h_sg_personalization_values{}; - if (h_personalization_vertices.size() > 0) { - for (vertex_t i = 0; i < sg_graph_view.get_number_of_vertices(); ++i) { - auto it = std::lower_bound(h_personalization_vertices.begin(), - h_personalization_vertices.end(), - h_sg_renumber_map_labels[i]); - if (*it == h_sg_renumber_map_labels[i]) { - h_sg_personalization_vertices.push_back(i); - h_sg_personalization_values.push_back( - h_personalization_values[std::distance(h_personalization_vertices.begin(), it)]); - } - } - } - - rmm::device_uvector d_sg_personalization_vertices( - h_sg_personalization_vertices.size(), handle.get_stream()); - rmm::device_uvector d_sg_personalization_values(d_sg_personalization_vertices.size(), - handle.get_stream()); - if (d_sg_personalization_vertices.size() > 0) { - raft::update_device(d_sg_personalization_vertices.data(), - h_sg_personalization_vertices.data(), - h_sg_personalization_vertices.size(), - handle.get_stream()); - raft::update_device(d_sg_personalization_values.data(), - h_sg_personalization_values.data(), - h_sg_personalization_values.size(), - handle.get_stream()); - } - - rmm::device_uvector d_sg_pageranks(sg_graph_view.get_number_of_vertices(), - handle.get_stream()); - - cugraph::experimental::pagerank(handle, - sg_graph_view, - static_cast(nullptr), - d_sg_personalization_vertices.data(), - d_sg_personalization_values.data(), - static_cast(d_sg_personalization_vertices.size()), - d_sg_pageranks.begin(), - alpha, - epsilon, - std::numeric_limits::max(), // max_iterations - false, - false); - - std::vector h_sg_pageranks(sg_graph_view.get_number_of_vertices()); - raft::update_host( - h_sg_pageranks.data(), d_sg_pageranks.data(), d_sg_pageranks.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - // 4. run MG pagerank - - std::vector h_mg_personalization_vertices{}; - std::vector h_mg_personalization_values{}; - if (h_personalization_vertices.size() > 0) { - for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { - auto it = std::lower_bound(h_personalization_vertices.begin(), - h_personalization_vertices.end(), - h_mg_renumber_map_labels[i]); - if (*it == h_mg_renumber_map_labels[i]) { - h_mg_personalization_vertices.push_back(mg_graph_view.get_local_vertex_first() + i); - h_mg_personalization_values.push_back( - h_personalization_values[std::distance(h_personalization_vertices.begin(), it)]); - } - } - } - rmm::device_uvector d_mg_personalization_vertices( h_mg_personalization_vertices.size(), handle.get_stream()); rmm::device_uvector d_mg_personalization_values(d_mg_personalization_vertices.size(), @@ -263,6 +173,11 @@ class Tests_MGPageRank : public ::testing::TestWithParam { handle.get_stream()); } + // 4. run MG pagerank + + result_t constexpr alpha{0.85}; + result_t constexpr epsilon{1e-6}; + rmm::device_uvector d_mg_pageranks(mg_graph_view.get_number_of_local_vertices(), handle.get_stream()); @@ -278,40 +193,121 @@ class Tests_MGPageRank : public ::testing::TestWithParam { alpha, epsilon, std::numeric_limits::max(), - false, false); CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - std::vector h_mg_pageranks(mg_graph_view.get_number_of_local_vertices()); - raft::update_host( - h_mg_pageranks.data(), d_mg_pageranks.data(), d_mg_pageranks.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - // 5. 
compare SG & MG results

- std::vector h_sg_shuffled_pageranks(sg_graph_view.get_number_of_vertices(),
- result_t{0.0});
- for (size_t i = 0; i < h_sg_pageranks.size(); ++i) {
- h_sg_shuffled_pageranks[h_sg_renumber_map_labels[i]] = h_sg_pageranks[i];
- }
+ if (configuration.check_correctness) {
+ // 5-1. create SG graph
+
+ cugraph::experimental::graph_t sg_graph(handle);
+ std::tie(sg_graph, std::ignore) =
+ read_graph(handle, configuration, false);
+
+ auto sg_graph_view = sg_graph.view();
+
+ // 5-2. collect personalization vertex/value pairs
+
+ rmm::device_uvector d_sg_personalization_vertices(0, handle.get_stream());
+ rmm::device_uvector d_sg_personalization_values(0, handle.get_stream());
+ if (configuration.personalization_ratio > 0.0) {
+ rmm::device_uvector d_unrenumbered_personalization_vertices(0,
+ handle.get_stream());
+ rmm::device_uvector d_unrenumbered_personalization_values(0, handle.get_stream());
+ std::tie(d_unrenumbered_personalization_vertices, d_unrenumbered_personalization_values) =
+ cugraph::test::unrenumber_kv_pairs(handle,
+ d_mg_personalization_vertices.data(),
+ d_mg_personalization_values.data(),
+ d_mg_personalization_vertices.size(),
+ d_mg_renumber_map_labels.data(),
+ mg_graph_view.get_local_vertex_first(),
+ mg_graph_view.get_local_vertex_last());
+
+ rmm::device_scalar d_local_personalization_vector_size(
+ d_unrenumbered_personalization_vertices.size(), handle.get_stream());
+ rmm::device_uvector d_recvcounts(comm_size, handle.get_stream());
+ comm.allgather(
+ d_local_personalization_vector_size.data(), d_recvcounts.data(), 1, handle.get_stream());
+ std::vector recvcounts(d_recvcounts.size());
+ raft::update_host(
+ recvcounts.data(), d_recvcounts.data(), d_recvcounts.size(), handle.get_stream());
+ auto status = comm.sync_stream(handle.get_stream());
+ ASSERT_EQ(status, raft::comms::status_t::SUCCESS);
+
+ std::vector displacements(recvcounts.size(), size_t{0});
+ std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1);
+
+ d_sg_personalization_vertices.resize(displacements.back() + recvcounts.back(),
+ handle.get_stream());
+ d_sg_personalization_values.resize(d_sg_personalization_vertices.size(),
+ handle.get_stream());
+
+ comm.allgatherv(d_unrenumbered_personalization_vertices.data(),
+ d_sg_personalization_vertices.data(),
+ recvcounts.data(),
+ displacements.data(),
+ handle.get_stream());
+ comm.allgatherv(d_unrenumbered_personalization_values.data(),
+ d_sg_personalization_values.data(),
+ recvcounts.data(),
+ displacements.data(),
+ handle.get_stream());
+ }
+
+ // 5-3. run SG pagerank
+
+ rmm::device_uvector d_sg_pageranks(sg_graph_view.get_number_of_vertices(),
+ handle.get_stream());
+
+ cugraph::experimental::pagerank(handle,
+ sg_graph_view,
+ static_cast(nullptr),
+ d_sg_personalization_vertices.data(),
+ d_sg_personalization_values.data(),
+ static_cast(d_sg_personalization_vertices.size()),
+ d_sg_pageranks.begin(),
+ alpha,
+ epsilon,
+ std::numeric_limits::max(), // max_iterations
+ false);
+
+ // 5-4. 
compare
+
+ std::vector h_sg_pageranks(sg_graph_view.get_number_of_vertices());
+ raft::update_host(
+ h_sg_pageranks.data(), d_sg_pageranks.data(), d_sg_pageranks.size(), handle.get_stream());
+
+ std::vector h_mg_pageranks(mg_graph_view.get_number_of_local_vertices());
+ raft::update_host(
+ h_mg_pageranks.data(), d_mg_pageranks.data(), d_mg_pageranks.size(), handle.get_stream());
+
+ std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size());
+ raft::update_host(h_mg_renumber_map_labels.data(),
+ d_mg_renumber_map_labels.data(),
+ d_mg_renumber_map_labels.size(),
+ handle.get_stream());
+
+ handle.get_stream_view().synchronize();
+
+ auto threshold_ratio = 1e-3;
+ auto threshold_magnitude =
+ (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) *
+ threshold_ratio; // skip comparison for low PageRank vertices (lowly ranked vertices)
+ auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
+ return std::abs(lhs - rhs) <
+ std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
+ };

- auto threshold_ratio = 1e-3;
- auto threshold_magnitude =
- (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) *
- threshold_ratio; // skip comparison for low PageRank vertices (lowly ranked vertices)
- auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
- return std::abs(lhs - rhs) <
- std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
- };
-
- for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) {
- auto mapped_vertex = h_mg_renumber_map_labels[i];
- ASSERT_TRUE(nearly_equal(h_mg_pageranks[i], h_sg_shuffled_pageranks[mapped_vertex]))
- << "MG PageRank value for vertex: " << i << " in rank: " << comm_rank
- << " has value: " << h_mg_pageranks[i]
- << " which exceeds the error margin for comparing to SG value: "
- << h_sg_shuffled_pageranks[mapped_vertex];
+ for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) {
+ auto mapped_vertex = h_mg_renumber_map_labels[i];
+ ASSERT_TRUE(nearly_equal(h_mg_pageranks[i], h_sg_pageranks[mapped_vertex]))
+ << "MG PageRank value for vertex: " << mapped_vertex << " in rank: " << comm_rank
+ << " has value: " << h_mg_pageranks[i]
+ << " which exceeds the error margin for comparing to SG value: "
+ << h_sg_pageranks[mapped_vertex];
+ }
}
}
};
@@ -325,6 +321,7 @@ INSTANTIATE_TEST_CASE_P(
simple_test,
Tests_MGPageRank,
::testing::Values(
+ // enable correctness checks
PageRank_Usecase("test/datasets/karate.mtx", 0.0, false),
PageRank_Usecase("test/datasets/karate.mtx", 0.5, false),
PageRank_Usecase("test/datasets/karate.mtx", 0.0, true),
@@ -352,6 +349,15 @@ INSTANTIATE_TEST_CASE_P(
true),
PageRank_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false},
0.5,
- true)));
+ true),
+ // disable correctness checks for large graphs
+ PageRank_Usecase(
+ cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, false, false),
+ PageRank_Usecase(
+ cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, false, false),
+ PageRank_Usecase(
+ cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, true, false),
+ PageRank_Usecase(
+ cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, true, false)));

CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/utilities/renumber_utilities.cu b/cpp/tests/utilities/renumber_utilities.cu
index ef89c84f65a..306faaee1aa 100644
--- a/cpp/tests/utilities/renumber_utilities.cu
+++ 
b/cpp/tests/utilities/renumber_utilities.cu @@ -28,17 +28,20 @@ namespace test { template std::tuple, rmm::device_uvector> unrenumber_kv_pairs( raft::handle_t const& handle, - vertex_t const* keys /* 0 <= keys[] < renumber_map_size */, + vertex_t const* keys, value_t const* values, size_t num_pairs, vertex_t const* renumber_map_labels, - size_t renumber_map_size) + vertex_t map_key_first, + vertex_t map_key_last) { rmm::device_uvector unrenumbered_keys(num_pairs, handle.get_stream_view()); rmm::device_uvector values_for_unrenumbered_keys(num_pairs, handle.get_stream_view()); - auto unrenumbered_key_first = thrust::make_transform_iterator( - keys, [renumber_map_labels] __device__(auto v) { return renumber_map_labels[v]; }); + auto unrenumbered_key_first = + thrust::make_transform_iterator(keys, [renumber_map_labels, map_key_first] __device__(auto v) { + return renumber_map_labels[v - map_key_first]; + }); thrust::copy(rmm::exec_policy(handle.get_stream_view()), unrenumbered_key_first, unrenumbered_key_first + num_pairs, @@ -84,7 +87,8 @@ unrenumber_kv_pairs(raft::handle_t const& handle, float const* values, size_t num_pairs, int32_t const* renumber_map_labels, - size_t renumber_map_size); + int32_t map_key_first, + int32_t map_key_last); template std::tuple, rmm::device_uvector> unrenumber_kv_pairs(raft::handle_t const& handle, @@ -92,7 +96,8 @@ unrenumber_kv_pairs(raft::handle_t const& handle, double const* values, size_t num_pairs, int32_t const* renumber_map_labels, - size_t renumber_map_size); + int32_t map_key_first, + int32_t map_key_last); template std::tuple, rmm::device_uvector> unrenumber_kv_pairs(raft::handle_t const& handle, @@ -100,7 +105,8 @@ unrenumber_kv_pairs(raft::handle_t const& handle, float const* values, size_t num_pairs, int64_t const* renumber_map_labels, - size_t renumber_map_size); + int64_t map_key_first, + int64_t map_key_last); template std::tuple, rmm::device_uvector> unrenumber_kv_pairs(raft::handle_t const& handle, @@ -108,7 +114,8 @@ unrenumber_kv_pairs(raft::handle_t const& handle, double const* values, size_t num_pairs, int64_t const* renumber_map_labels, - size_t renumber_map_size); + int64_t map_key_first, + int64_t map_key_last); template rmm::device_uvector sort_values_by_key(raft::handle_t const& handle, int32_t const* keys, diff --git a/cpp/tests/utilities/renumber_utilities.hpp b/cpp/tests/utilities/renumber_utilities.hpp index 3e3d651c3e4..01bf865e58c 100644 --- a/cpp/tests/utilities/renumber_utilities.hpp +++ b/cpp/tests/utilities/renumber_utilities.hpp @@ -23,11 +23,12 @@ namespace test { template std::tuple, rmm::device_uvector> unrenumber_kv_pairs( raft::handle_t const& handle, - vertex_t const* keys, + vertex_t const* keys /* map_key_first <= keys[] < map_key_last */, value_t const* values, size_t num_pairs, vertex_t const* renumber_map_labels, - size_t renumber_map_size); + vertex_t map_key_first, + vertex_t map_key_last); template rmm::device_uvector sort_values_by_key(raft::handle_t const& handle, From aa5ed5d454e28441d948f762ca5f4e6ff08a78c3 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 19 Mar 2021 00:57:03 -0400 Subject: [PATCH 22/63] cut memory footprint in renumber_edgelist --- cpp/src/experimental/renumber_edgelist.cu | 37 ++++++++++++++--------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu index a7931d9710d..29579829bff 100644 --- a/cpp/src/experimental/renumber_edgelist.cu +++ b/cpp/src/experimental/renumber_edgelist.cu 
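The hunk below cuts peak device memory in compute_renumber_map in two ways: the sorted copy of the edge-list major labels is confined to an inner scope so it is freed before the communication step that follows, and the reduce_by_key output buffers are allocated at their exact final size, obtained by first counting segment boundaries in the sorted labels. The following is a minimal, self-contained sketch of that count-then-allocate pattern, not part of the patch; the helper name count_label_occurrences and the bare cudaStream_t parameter are assumptions for illustration only, and it uses the same rmm::exec_policy(stream)->on(stream) idiom and __device__ lambdas (nvcc extended lambdas) as the rest of this patch.

// Illustrative sketch only: size reduce_by_key outputs exactly by first
// counting distinct keys in the sorted input, instead of allocating
// edge-list-sized buffers and shrinking them afterwards.
#include <rmm/device_uvector.hpp>
#include <rmm/thrust_rmm_allocator.h>
#include <thrust/count.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <cuda_runtime.h>
#include <utility>

template <typename vertex_t, typename edge_t>
std::pair<rmm::device_uvector<vertex_t>, rmm::device_uvector<edge_t>> count_label_occurrences(
  rmm::device_uvector<vertex_t>&& labels, cudaStream_t stream)
{
  // sort in place; 'labels' is consumed, so the sorted copy is freed as soon
  // as this function returns -- the same effect the hunk below gets from the
  // extra { } scope around sorted_major_labels
  thrust::sort(rmm::exec_policy(stream)->on(stream), labels.begin(), labels.end());

  // first pass: count segment starts to learn the exact number of unique keys
  auto num_uniques =
    thrust::count_if(rmm::exec_policy(stream)->on(stream),
                     thrust::make_counting_iterator(size_t{0}),
                     thrust::make_counting_iterator(labels.size()),
                     [ptr = labels.data()] __device__(auto i) {
                       return (i == 0) || (ptr[i - 1] != ptr[i]);
                     });

  // second pass: outputs are allocated at their final size up front
  rmm::device_uvector<vertex_t> unique_labels(static_cast<size_t>(num_uniques), stream);
  rmm::device_uvector<edge_t> counts(unique_labels.size(), stream);
  thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream),
                        labels.begin(),
                        labels.end(),
                        thrust::make_constant_iterator(edge_t{1}),
                        unique_labels.begin(),
                        counts.begin());
  return std::make_pair(std::move(unique_labels), std::move(counts));
}

The extra counting pass is cheap relative to the sort, and it removes the need to ever hold label/count buffers sized to the full local edge list.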
@@ -62,27 +62,34 @@ rmm::device_uvector compute_renumber_map( rmm::device_uvector major_labels(0, handle.get_stream()); rmm::device_uvector major_counts(0, handle.get_stream()); for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { - rmm::device_uvector sorted_major_labels(edgelist_edge_counts[i], handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices[i], - edgelist_major_vertices[i] + edgelist_edge_counts[i], - sorted_major_labels.begin()); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - sorted_major_labels.begin(), - sorted_major_labels.end()); - rmm::device_uvector tmp_major_labels(sorted_major_labels.size(), handle.get_stream()); - rmm::device_uvector tmp_major_counts(tmp_major_labels.size(), handle.get_stream()); - auto major_pair_it = + rmm::device_uvector tmp_major_labels(0, handle.get_stream()); + rmm::device_uvector tmp_major_counts(0, handle.get_stream()); + { + rmm::device_uvector sorted_major_labels(edgelist_edge_counts[i], + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_major_vertices[i], + edgelist_major_vertices[i] + edgelist_edge_counts[i], + sorted_major_labels.begin()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_major_labels.begin(), + sorted_major_labels.end()); + auto num_unique_labels = + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(sorted_major_labels.size()), + [labels = sorted_major_labels.data()] __device__(auto i) { + return (i == 0) || (labels[i - 1] != labels[i]); + }); + tmp_major_labels.resize(num_unique_labels, handle.get_stream()); + tmp_major_counts.resize(tmp_major_labels.size(), handle.get_stream()); thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), sorted_major_labels.begin(), sorted_major_labels.end(), thrust::make_constant_iterator(edge_t{1}), tmp_major_labels.begin(), tmp_major_counts.begin()); - tmp_major_labels.resize( - thrust::distance(tmp_major_labels.begin(), thrust::get<0>(major_pair_it)), - handle.get_stream()); - tmp_major_counts.resize(tmp_major_labels.size(), handle.get_stream()); + } if (multi_gpu) { auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); From 8e365fed11d5525bff7691e60852cf297751ed04 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 19 Mar 2021 23:43:36 -0400 Subject: [PATCH 23/63] update SG/MG C++ testing infrastructures --- .../experimental/generate_rmat_edgelist.cu | 4 +- cpp/src/experimental/renumber_edgelist.cu | 6 +- cpp/tests/CMakeLists.txt | 2 +- cpp/tests/experimental/bfs_test.cpp | 4 +- .../experimental/katz_centrality_test.cpp | 4 +- cpp/tests/experimental/pagerank_test.cpp | 4 +- cpp/tests/experimental/sssp_test.cpp | 4 +- cpp/tests/pagerank/mg_pagerank_test.cpp | 13 +- .../generate_graph_from_edgelist.cu | 117 +-- .../detail/generate_graph_from_edgelist.hpp | 49 ++ .../utilities/matrix_market_file_utilities.cu | 89 +- cpp/tests/utilities/rmat_utilities.cu | 757 +++++++++++------- cpp/tests/utilities/test_utilities.hpp | 20 +- 13 files changed, 680 insertions(+), 393 deletions(-) rename cpp/tests/utilities/{ => detail}/generate_graph_from_edgelist.cu (81%) create mode 100644 cpp/tests/utilities/detail/generate_graph_from_edgelist.hpp diff --git a/cpp/src/experimental/generate_rmat_edgelist.cu 
b/cpp/src/experimental/generate_rmat_edgelist.cu index 0a6d666432f..bf9bcea149b 100644 --- a/cpp/src/experimental/generate_rmat_edgelist.cu +++ b/cpp/src/experimental/generate_rmat_edgelist.cu @@ -44,13 +44,13 @@ std::tuple, rmm::device_uvector> generat bool clip_and_flip, bool scramble_vertex_ids) { - CUGRAPH_EXPECTS(size_t{1} << scale <= std::numeric_limits::max(), + CUGRAPH_EXPECTS((size_t{1} << scale) <= static_cast(std::numeric_limits::max()), "Invalid input argument: scale too large for vertex_t."); CUGRAPH_EXPECTS((a >= 0.0) && (b >= 0.0) && (c >= 0.0) && (a + b + c <= 1.0), "Invalid input argument: a, b, c should be non-negative and a + b + c should not " "be larger than 1.0."); - raft::random::Rng rng(seed + 10); + raft::random::Rng rng(seed); // to limit memory footprint (1024 is a tuning parameter) auto max_edges_to_generate_per_iteration = static_cast(handle.get_device_properties().multiProcessorCount) * 1024; diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu index 29579829bff..a68cb8a7d9c 100644 --- a/cpp/src/experimental/renumber_edgelist.cu +++ b/cpp/src/experimental/renumber_edgelist.cu @@ -544,7 +544,8 @@ renumber_edgelist(raft::handle_t const& handle, for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { rmm::device_uvector renumber_map_major_labels( - col_comm_rank == i ? vertex_t{0} : partition.get_matrix_partition_major_size(i), + col_comm_rank == static_cast(i) ? vertex_t{0} + : partition.get_matrix_partition_major_size(i), handle.get_stream()); device_bcast(col_comm, renumber_map_labels.data(), @@ -563,7 +564,8 @@ renumber_edgelist(raft::handle_t const& handle, invalid_vertex_id::value}; auto pair_first = thrust::make_transform_iterator( thrust::make_zip_iterator(thrust::make_tuple( - col_comm_rank == i ? renumber_map_labels.begin() : renumber_map_major_labels.begin(), + col_comm_rank == static_cast(i) ? 
renumber_map_labels.begin() + : renumber_map_major_labels.begin(), thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i)))), [] __device__(auto val) { return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index baea6cdea2a..8d4d2fa10d8 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -20,11 +20,11 @@ # - common test utils ----------------------------------------------------------------------------- add_library(cugraphtestutil STATIC - "${CMAKE_CURRENT_SOURCE_DIR}/utilities/generate_graph_from_edgelist.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/matrix_market_file_utilities.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/rmat_utilities.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/renumber_utilities.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/misc_utilities.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/utilities/detail/generate_graph_from_edgelist.cu" "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c") set_property(TARGET cugraphtestutil PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/tests/experimental/bfs_test.cpp b/cpp/tests/experimental/bfs_test.cpp index 6df768aba82..a43b8ee9ea8 100644 --- a/cpp/tests/experimental/bfs_test.cpp +++ b/cpp/tests/experimental/bfs_test.cpp @@ -130,7 +130,9 @@ read_graph(raft::handle_t const& handle, BFS_Usecase const& configuration, bool configuration.input_graph_specifier.rmat_params.undirected, configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, false, - renumber); + renumber, + std::vector{0}, + size_t{1}); } class Tests_BFS : public ::testing::TestWithParam { diff --git a/cpp/tests/experimental/katz_centrality_test.cpp b/cpp/tests/experimental/katz_centrality_test.cpp index c8dc3ec5fd5..60f6ee20084 100644 --- a/cpp/tests/experimental/katz_centrality_test.cpp +++ b/cpp/tests/experimental/katz_centrality_test.cpp @@ -150,7 +150,9 @@ read_graph(raft::handle_t const& handle, KatzCentrality_Usecase const& configura configuration.input_graph_specifier.rmat_params.undirected, configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, configuration.test_weighted, - renumber); + renumber, + std::vector{0}, + size_t{1}); } class Tests_KatzCentrality : public ::testing::TestWithParam { diff --git a/cpp/tests/experimental/pagerank_test.cpp b/cpp/tests/experimental/pagerank_test.cpp index 4e846395a96..e87394ed033 100644 --- a/cpp/tests/experimental/pagerank_test.cpp +++ b/cpp/tests/experimental/pagerank_test.cpp @@ -192,7 +192,9 @@ read_graph(raft::handle_t const& handle, PageRank_Usecase const& configuration, configuration.input_graph_specifier.rmat_params.undirected, configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, configuration.test_weighted, - renumber); + renumber, + std::vector{0}, + size_t{1}); } class Tests_PageRank : public ::testing::TestWithParam { diff --git a/cpp/tests/experimental/sssp_test.cpp b/cpp/tests/experimental/sssp_test.cpp index 7ab4321aefb..a986a238911 100644 --- a/cpp/tests/experimental/sssp_test.cpp +++ b/cpp/tests/experimental/sssp_test.cpp @@ -136,7 +136,9 @@ read_graph(raft::handle_t const& handle, SSSP_Usecase const& configuration, bool configuration.input_graph_specifier.rmat_params.undirected, configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, true, - renumber); + renumber, + std::vector{0}, + size_t{1}); } class Tests_SSSP : public ::testing::TestWithParam { diff --git a/cpp/tests/pagerank/mg_pagerank_test.cpp 
b/cpp/tests/pagerank/mg_pagerank_test.cpp index 8e801ea8085..8e6d1957c8b 100644 --- a/cpp/tests/pagerank/mg_pagerank_test.cpp +++ b/cpp/tests/pagerank/mg_pagerank_test.cpp @@ -74,6 +74,15 @@ std::tuple> read_graph(raft::handle_t const& handle, PageRank_Usecase const& configuration, bool renumber) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size)); + std::iota(partition_ids.begin(), + partition_ids.end(), + multi_gpu ? static_cast(comm_rank) : size_t{0}); + return configuration.input_graph_specifier.tag == cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH ? cugraph::test:: @@ -94,7 +103,9 @@ read_graph(raft::handle_t const& handle, PageRank_Usecase const& configuration, configuration.input_graph_specifier.rmat_params.undirected, configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, configuration.test_weighted, - renumber); + renumber, + partition_ids, + static_cast(comm_size)); } class Tests_MGPageRank : public ::testing::TestWithParam { diff --git a/cpp/tests/utilities/generate_graph_from_edgelist.cu b/cpp/tests/utilities/detail/generate_graph_from_edgelist.cu similarity index 81% rename from cpp/tests/utilities/generate_graph_from_edgelist.cu rename to cpp/tests/utilities/detail/generate_graph_from_edgelist.cu index 0774f6a1120..be93b98b833 100644 --- a/cpp/tests/utilities/generate_graph_from_edgelist.cu +++ b/cpp/tests/utilities/detail/generate_graph_from_edgelist.cu @@ -28,9 +28,10 @@ namespace cugraph { namespace test { - namespace detail { +namespace { + template , rmm::device_uvector>> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber) +generate_graph_from_edgelist_impl(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber) { CUGRAPH_EXPECTS(renumber, "renumber should be true if multi_gpu is true."); @@ -60,61 +61,6 @@ generate_graph_from_edgelist(raft::handle_t const& handle, auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); - vertex_t number_of_vertices = static_cast(vertices.size()); - - auto vertex_key_func = - cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}; - vertices.resize(thrust::distance(vertices.begin(), - thrust::remove_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertices.begin(), - vertices.end(), - [comm_rank, key_func = vertex_key_func] __device__(auto val) { - return key_func(val) != comm_rank; - })), - handle.get_stream()); - vertices.shrink_to_fit(handle.get_stream()); - - auto edge_key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - comm_size, row_comm_size, col_comm_size}; - size_t number_of_local_edges{}; - if (test_weighted) { - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_rows.begin(), edgelist_cols.begin(), edgelist_weights.begin())); - number_of_local_edges = thrust::distance( - edge_first, - thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edge_first, - edge_first 
+ edgelist_rows.size(), - [comm_rank, key_func = edge_key_func] __device__(auto e) { - auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); - auto minor = store_transposed ? thrust::get<0>(e) : thrust::get<1>(e); - return key_func(major, minor) != comm_rank; - })); - } else { - auto edge_first = - thrust::make_zip_iterator(thrust::make_tuple(edgelist_rows.begin(), edgelist_cols.begin())); - number_of_local_edges = thrust::distance( - edge_first, - thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edge_first, - edge_first + edgelist_rows.size(), - [comm_rank, key_func = edge_key_func] __device__(auto e) { - auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); - auto minor = store_transposed ? thrust::get<0>(e) : thrust::get<1>(e); - return key_func(major, minor) != comm_rank; - })); - } - - edgelist_rows.resize(number_of_local_edges, handle.get_stream()); - edgelist_rows.shrink_to_fit(handle.get_stream()); - edgelist_cols.resize(number_of_local_edges, handle.get_stream()); - edgelist_cols.shrink_to_fit(handle.get_stream()); - if (test_weighted) { - edgelist_weights.resize(number_of_local_edges, handle.get_stream()); - edgelist_weights.shrink_to_fit(handle.get_stream()); - } - auto local_partition_id_op = [comm_size, key_func = cugraph::experimental::detail::compute_partition_id_from_edge_t{ @@ -151,7 +97,7 @@ generate_graph_from_edgelist(raft::handle_t const& handle, rmm::device_uvector renumber_map_labels(0, handle.get_stream()); cugraph::experimental::partition_t partition{}; - vertex_t aggregate_number_of_vertices{}; + vertex_t number_of_vertices{}; edge_t number_of_edges{}; { std::vector major_ptrs(h_edge_counts.size()); @@ -165,7 +111,7 @@ generate_graph_from_edgelist(raft::handle_t const& handle, counts[i] = static_cast(h_edge_counts[i]); } // FIXME: set do_expensive_check to false once validated - std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = + std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) = cugraph::experimental::renumber_edgelist( handle, vertices.data(), @@ -174,7 +120,6 @@ generate_graph_from_edgelist(raft::handle_t const& handle, minor_ptrs, counts, true); - assert(aggregate_number_of_vertices == number_of_vertices); } // 4. 
create a graph @@ -213,14 +158,14 @@ std::enable_if_t< std::tuple< cugraph::experimental::graph_t, rmm::device_uvector>> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber) +generate_graph_from_edgelist_impl(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber) { vertex_t number_of_vertices = static_cast(vertices.size()); @@ -252,7 +197,7 @@ generate_graph_from_edgelist(raft::handle_t const& handle, std::move(renumber_map_labels)); } -} // namespace detail +} // namespace template ( - handle, - std::move(vertices), - std::move(edgelist_rows), - std::move(edgelist_cols), - std::move(edgelist_weights), - is_symmetric, - test_weighted, - renumber); + return generate_graph_from_edgelist_impl( + handle, + std::move(vertices), + std::move(edgelist_rows), + std::move(edgelist_cols), + std::move(edgelist_weights), + is_symmetric, + test_weighted, + renumber); } // explicit instantiations @@ -572,5 +516,6 @@ generate_graph_from_edgelist( bool test_weighted, bool renumber); +} // namespace detail } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/detail/generate_graph_from_edgelist.hpp b/cpp/tests/utilities/detail/generate_graph_from_edgelist.hpp new file mode 100644 index 00000000000..b0ece55be7e --- /dev/null +++ b/cpp/tests/utilities/detail/generate_graph_from_edgelist.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include + +#include +#include + +namespace cugraph { +namespace test { +namespace detail { + +template +std::tuple, + rmm::device_uvector> +generate_graph_from_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber); + +} // namespace detail +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/utilities/matrix_market_file_utilities.cu b/cpp/tests/utilities/matrix_market_file_utilities.cu index ddbbac603ee..cbab0e988fe 100644 --- a/cpp/tests/utilities/matrix_market_file_utilities.cu +++ b/cpp/tests/utilities/matrix_market_file_utilities.cu @@ -13,9 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + +#include #include +#include #include +#include #include #include @@ -339,16 +343,83 @@ read_graph_from_matrix_market_file(raft::handle_t const& handle, d_vertices.begin(), d_vertices.end(), vertex_t{0}); + handle.get_stream_view().synchronize(); + + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + auto vertex_key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}; + d_vertices.resize( + thrust::distance( + d_vertices.begin(), + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_vertices.begin(), + d_vertices.end(), + [comm_rank, key_func = vertex_key_func] __device__(auto val) { + return key_func(val) != comm_rank; + })), + handle.get_stream()); + d_vertices.shrink_to_fit(handle.get_stream()); + + auto edge_key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}; + size_t number_of_local_edges{}; + if (test_weighted) { + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + d_edgelist_rows.begin(), d_edgelist_cols.begin(), d_edgelist_weights.begin())); + number_of_local_edges = thrust::distance( + edge_first, + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + d_edgelist_rows.size(), + [comm_rank, key_func = edge_key_func] __device__(auto e) { + auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); + auto minor = store_transposed ? thrust::get<0>(e) : thrust::get<1>(e); + return key_func(major, minor) != comm_rank; + })); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(d_edgelist_rows.begin(), d_edgelist_cols.begin())); + number_of_local_edges = thrust::distance( + edge_first, + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + d_edgelist_rows.size(), + [comm_rank, key_func = edge_key_func] __device__(auto e) { + auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); + auto minor = store_transposed ? 
thrust::get<0>(e) : thrust::get<1>(e); + return key_func(major, minor) != comm_rank; + })); + } + + d_edgelist_rows.resize(number_of_local_edges, handle.get_stream()); + d_edgelist_rows.shrink_to_fit(handle.get_stream()); + d_edgelist_cols.resize(number_of_local_edges, handle.get_stream()); + d_edgelist_cols.shrink_to_fit(handle.get_stream()); + if (test_weighted) { + d_edgelist_weights.resize(number_of_local_edges, handle.get_stream()); + d_edgelist_weights.shrink_to_fit(handle.get_stream()); + } + } - return generate_graph_from_edgelist( - handle, - std::move(d_vertices), - std::move(d_edgelist_rows), - std::move(d_edgelist_cols), - std::move(d_edgelist_weights), - is_symmetric, - test_weighted, - renumber); + handle.get_stream_view().synchronize(); + return detail:: + generate_graph_from_edgelist( + handle, + std::move(d_vertices), + std::move(d_edgelist_rows), + std::move(d_edgelist_cols), + std::move(d_edgelist_weights), + is_symmetric, + test_weighted, + renumber); } // explicit instantiations diff --git a/cpp/tests/utilities/rmat_utilities.cu b/cpp/tests/utilities/rmat_utilities.cu index 9ab05cfbbc2..c9918dcd979 100644 --- a/cpp/tests/utilities/rmat_utilities.cu +++ b/cpp/tests/utilities/rmat_utilities.cu @@ -13,10 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +#include #include +#include #include +#include #include +#include #include #include @@ -41,115 +46,268 @@ generate_graph_from_rmat_params(raft::handle_t const& handle, double a, double b, double c, - uint64_t seed, + uint64_t base_seed, bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber) + bool renumber, + std::vector const& partition_ids, + size_t num_partitions) { + CUGRAPH_EXPECTS(!multi_gpu || renumber, "renumber should be true if multi_gpu is true."); + + vertex_t number_of_vertices = static_cast(size_t{1} << scale); + edge_t number_of_edges = + static_cast(static_cast(number_of_vertices) * edge_factor); + + std::vector partition_edge_counts(partition_ids.size()); + std::vector partition_vertex_firsts(partition_ids.size()); + std::vector partition_vertex_lasts(partition_ids.size()); + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + partition_edge_counts[i] = number_of_edges / num_partitions + + (id < number_of_edges % num_partitions ? edge_t{1} : edge_t{0}); + + partition_vertex_firsts[i] = (number_of_vertices / num_partitions) * id; + partition_vertex_lasts[i] = (number_of_vertices / num_partitions) * (id + 1); + if (id < number_of_vertices % num_partitions) { + partition_vertex_firsts[i] += id; + partition_vertex_lasts[i] += id + 1; + } else { + partition_vertex_firsts[i] += number_of_vertices % num_partitions; + partition_vertex_lasts[i] += number_of_vertices % num_partitions; + } + } + rmm::device_uvector d_edgelist_rows(0, handle.get_stream()); rmm::device_uvector d_edgelist_cols(0, handle.get_stream()); - std::tie(d_edgelist_rows, d_edgelist_cols) = - cugraph::experimental::generate_rmat_edgelist(handle, - scale, - (size_t{1} << scale) * edge_factor, - a, - b, - c, - seed, - undirected ? true : false, - scramble_vertex_ids); + rmm::device_uvector d_edgelist_weights(0, handle.get_stream()); + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + rmm::device_uvector d_tmp_rows(0, handle.get_stream()); + rmm::device_uvector d_tmp_cols(0, handle.get_stream()); + std::tie(i == 0 ? d_edgelist_rows : d_tmp_rows, i == 0 ? 
d_edgelist_cols : d_tmp_cols) = + cugraph::experimental::generate_rmat_edgelist(handle, + scale, + partition_edge_counts[i], + a, + b, + c, + base_seed + id, + undirected ? true : false, + scramble_vertex_ids); + + rmm::device_uvector d_tmp_weights(0, handle.get_stream()); + if (test_weighted) { + if (i == 0) { + d_edgelist_weights.resize(d_edgelist_rows.size(), handle.get_stream()); + } else { + d_tmp_weights.resize(d_tmp_rows.size(), handle.get_stream()); + } + + raft::random::Rng rng(base_seed + num_partitions + id); + rng.uniform(i == 0 ? d_edgelist_weights.data() : d_tmp_weights.data(), + i == 0 ? d_edgelist_weights.size() : d_tmp_weights.size(), + weight_t{0.0}, + weight_t{1.0}, + handle.get_stream()); + } + + if (i > 0) { + auto start_offset = d_edgelist_rows.size(); + d_edgelist_rows.resize(start_offset + d_tmp_rows.size(), handle.get_stream()); + d_edgelist_cols.resize(d_edgelist_rows.size(), handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_tmp_rows.begin(), + d_tmp_rows.end(), + d_edgelist_rows.begin() + start_offset); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_tmp_cols.begin(), + d_tmp_cols.end(), + d_edgelist_cols.begin() + start_offset); + if (test_weighted) { + d_edgelist_weights.resize(d_edgelist_rows.size(), handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_tmp_weights.begin(), + d_tmp_weights.end(), + d_edgelist_weights.begin() + start_offset); + } + } + } + if (undirected) { // FIXME: need to symmetrize CUGRAPH_FAIL("unimplemented."); } - rmm::device_uvector d_edgelist_weights(test_weighted ? d_edgelist_rows.size() : 0, - handle.get_stream()); - if (test_weighted) { - raft::random::Rng rng(seed + 1); - rng.uniform(d_edgelist_weights.data(), - d_edgelist_weights.size(), - weight_t{0.0}, - weight_t{1.0}, - handle.get_stream()); + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + rmm::device_uvector d_rx_edgelist_rows(0, handle.get_stream()); + rmm::device_uvector d_rx_edgelist_cols(0, handle.get_stream()); + rmm::device_uvector d_rx_edgelist_weights(0, handle.get_stream()); + if (test_weighted) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(store_transposed ? d_edgelist_cols.begin() : d_edgelist_rows.begin(), + store_transposed ? d_edgelist_rows.begin() : d_edgelist_cols.begin(), + d_edgelist_weights.begin())); + + std::forward_as_tuple(std::tie(store_transposed ? d_rx_edgelist_cols : d_rx_edgelist_rows, + store_transposed ? d_rx_edgelist_rows : d_rx_edgelist_cols, + d_rx_edgelist_weights), + std::ignore) = + cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + edge_first, + edge_first + d_edgelist_rows.size(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(store_transposed ? d_edgelist_cols.begin() : d_edgelist_rows.begin(), + store_transposed ? 
d_edgelist_rows.begin() : d_edgelist_cols.begin())); + + std::forward_as_tuple(std::tie(store_transposed ? d_rx_edgelist_cols : d_rx_edgelist_rows, + store_transposed ? d_rx_edgelist_rows : d_rx_edgelist_cols), + std::ignore) = + cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + edge_first, + edge_first + d_edgelist_rows.size(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } + + d_edgelist_rows = std::move(d_rx_edgelist_rows); + d_edgelist_cols = std::move(d_rx_edgelist_cols); + d_edgelist_weights = std::move(d_rx_edgelist_weights); + } + + rmm::device_uvector d_vertices(0, handle.get_stream()); + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + auto start_offset = d_vertices.size(); + d_vertices.resize(start_offset + (partition_vertex_lasts[i] - partition_vertex_firsts[i]), + handle.get_stream()); + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_vertices.begin() + start_offset, + d_vertices.end(), + partition_vertex_firsts[i]); } - rmm::device_uvector d_vertices(static_cast(size_t{1} << scale), - handle.get_stream()); - thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - d_vertices.begin(), - d_vertices.end(), - vertex_t{0}); - - return generate_graph_from_edgelist( - handle, - std::move(d_vertices), - std::move(d_edgelist_rows), - std::move(d_edgelist_cols), - std::move(d_edgelist_weights), - false, - test_weighted, - renumber); + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector d_rx_vertices(0, handle.get_stream()); + std::tie(d_rx_vertices, std::ignore) = cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + d_vertices.begin(), + d_vertices.end(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{ + comm_size}] __device__(auto val) { return key_func(val); }, + handle.get_stream()); + d_vertices = std::move(d_rx_vertices); + } + + return detail:: + generate_graph_from_edgelist( + handle, + std::move(d_vertices), + std::move(d_edgelist_rows), + std::move(d_edgelist_cols), + std::move(d_edgelist_weights), + false, + test_weighted, + renumber); } // explicit instantiations template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + 
bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> @@ -164,105 +322,128 @@ generate_graph_from_rmat_params( bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber); + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool 
test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> @@ -277,105 +458,128 @@ generate_graph_from_rmat_params( bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber); + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template 
std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> @@ -390,49 +594,60 @@ generate_graph_from_rmat_params( bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber); + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + 
size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/test_utilities.hpp b/cpp/tests/utilities/test_utilities.hpp index 37e87c62247..de7e442ea69 100644 --- a/cpp/tests/utilities/test_utilities.hpp +++ b/cpp/tests/utilities/test_utilities.hpp @@ -130,22 +130,6 @@ read_graph_from_matrix_market_file(raft::handle_t const& handle, bool test_weighted, bool renumber); -template -std::tuple, - rmm::device_uvector> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber); - template const& partition_ids, + size_t num_partitions); struct rmat_params_t { size_t scale{}; From f329e65a6d47d3abadfd3c4c93262289e4f38f86 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 20 Mar 2021 02:05:25 -0400 Subject: [PATCH 24/63] add overflow check --- cpp/tests/utilities/rmat_utilities.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/tests/utilities/rmat_utilities.cu b/cpp/tests/utilities/rmat_utilities.cu index c9918dcd979..be300e74760 100644 --- a/cpp/tests/utilities/rmat_utilities.cu +++ b/cpp/tests/utilities/rmat_utilities.cu @@ -55,6 +55,9 @@ generate_graph_from_rmat_params(raft::handle_t const& handle, size_t num_partitions) { CUGRAPH_EXPECTS(!multi_gpu || renumber, "renumber should be true if multi_gpu is true."); + CUGRAPH_EXPECTS(size_t{1} << scale <= std::numeric_limits::max(), "vertex_t overflow."); + CUGRAPH_EXPECTS((size_t{1} << scale) * edge_factor <= std::numeric_limits::max(), + " edge_t overflow."); vertex_t number_of_vertices = static_cast(size_t{1} << scale); edge_t number_of_edges = From 70000e1b57ffb891f57358fe7338a13f6036b9e2 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 20 Mar 2021 23:36:12 -0400 Subject: [PATCH 25/63] reduce R-mat graph size to avoid malloc failure when running tests in MIG --- cpp/tests/experimental/bfs_test.cpp | 2 +- cpp/tests/experimental/katz_centrality_test.cpp | 4 ++-- cpp/tests/experimental/pagerank_test.cpp | 8 ++++---- 
cpp/tests/experimental/sssp_test.cpp | 2 +- cpp/tests/pagerank/mg_pagerank_test.cpp | 10 +++++----- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/tests/experimental/bfs_test.cpp b/cpp/tests/experimental/bfs_test.cpp index a43b8ee9ea8..d06e634f695 100644 --- a/cpp/tests/experimental/bfs_test.cpp +++ b/cpp/tests/experimental/bfs_test.cpp @@ -318,7 +318,7 @@ INSTANTIATE_TEST_CASE_P( BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000), BFS_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), // disable correctness checks for large graphs - BFS_Usecase(cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, + BFS_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0, false))); diff --git a/cpp/tests/experimental/katz_centrality_test.cpp b/cpp/tests/experimental/katz_centrality_test.cpp index 60f6ee20084..37a77e22e46 100644 --- a/cpp/tests/experimental/katz_centrality_test.cpp +++ b/cpp/tests/experimental/katz_centrality_test.cpp @@ -315,10 +315,10 @@ INSTANTIATE_TEST_CASE_P( KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, true), // disable correctness checks for large graphs - KatzCentrality_Usecase(cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, + KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, false, false), - KatzCentrality_Usecase(cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, + KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, true, false))); diff --git a/cpp/tests/experimental/pagerank_test.cpp b/cpp/tests/experimental/pagerank_test.cpp index e87394ed033..b3148166efe 100644 --- a/cpp/tests/experimental/pagerank_test.cpp +++ b/cpp/tests/experimental/pagerank_test.cpp @@ -441,12 +441,12 @@ INSTANTIATE_TEST_CASE_P( true), // disable correctness checks for large graphs PageRank_Usecase( - cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, false, false), + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, false, false), PageRank_Usecase( - cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, false, false), + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, false, false), PageRank_Usecase( - cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, true, false), + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, true, false), PageRank_Usecase( - cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, true, false))); + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, true, false))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/sssp_test.cpp b/cpp/tests/experimental/sssp_test.cpp index a986a238911..13a58450eee 100644 --- a/cpp/tests/experimental/sssp_test.cpp +++ b/cpp/tests/experimental/sssp_test.cpp @@ -331,7 +331,7 @@ INSTANTIATE_TEST_CASE_P( SSSP_Usecase("test/datasets/wiki2003.mtx", 1000), SSSP_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), // disable correctness checks for large graphs - SSSP_Usecase(cugraph::test::rmat_params_t{25, 16, 0.57, 0.19, 0.19, 0, false, false}, + SSSP_Usecase(cugraph::test::rmat_params_t{20, 16, 0.57, 0.19, 0.19, 0, false, false}, 0, false))); diff --git a/cpp/tests/pagerank/mg_pagerank_test.cpp 
b/cpp/tests/pagerank/mg_pagerank_test.cpp index 8e6d1957c8b..0989dc01299 100644 --- a/cpp/tests/pagerank/mg_pagerank_test.cpp +++ b/cpp/tests/pagerank/mg_pagerank_test.cpp @@ -117,7 +117,7 @@ class Tests_MGPageRank : public ::testing::TestWithParam { virtual void SetUp() {} virtual void TearDown() {} - // Compare the results of running pagerank on multiple GPUs to that of a single-GPU run + // Compare the results of running PageRank on multiple GPUs to that of a single-GPU run template void run_current_test(PageRank_Usecase const& configuration) { @@ -363,12 +363,12 @@ INSTANTIATE_TEST_CASE_P( true), // disable correctness checks for large graphs PageRank_Usecase( - cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, false, false), + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, false, false), PageRank_Usecase( - cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, false, false), + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, false, false), PageRank_Usecase( - cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, true, false), + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, true, false), PageRank_Usecase( - cugraph::test::rmat_params_t{25, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, true, false))); + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, true, false))); CUGRAPH_MG_TEST_PROGRAM_MAIN() From bc2eda6554dd543cdbcb86172ec246e338780b77 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 21 Mar 2021 14:46:14 -0400 Subject: [PATCH 26/63] cosmetic updates --- cpp/tests/experimental/katz_centrality_test.cpp | 3 +-- cpp/tests/pagerank/mg_pagerank_test.cpp | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/tests/experimental/katz_centrality_test.cpp b/cpp/tests/experimental/katz_centrality_test.cpp index 37a77e22e46..3a929c8c294 100644 --- a/cpp/tests/experimental/katz_centrality_test.cpp +++ b/cpp/tests/experimental/katz_centrality_test.cpp @@ -201,8 +201,7 @@ class Tests_KatzCentrality : public ::testing::TestWithParam::max(), false, - true, - false); + true); CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement diff --git a/cpp/tests/pagerank/mg_pagerank_test.cpp b/cpp/tests/pagerank/mg_pagerank_test.cpp index 0989dc01299..407e255a5bb 100644 --- a/cpp/tests/pagerank/mg_pagerank_test.cpp +++ b/cpp/tests/pagerank/mg_pagerank_test.cpp @@ -184,7 +184,7 @@ class Tests_MGPageRank : public ::testing::TestWithParam { handle.get_stream()); } - // 4. run MG pagerank + // 4. run MG PageRank result_t constexpr alpha{0.85}; result_t constexpr epsilon{1e-6}; @@ -267,7 +267,7 @@ class Tests_MGPageRank : public ::testing::TestWithParam { handle.get_stream()); } - // 5-3. run SG pagerank + // 5-3. 
run SG PageRank rmm::device_uvector d_sg_pageranks(sg_graph_view.get_number_of_vertices(), handle.get_stream()); From 52d96bfef9bb855846b4de0fd2ce8034ad72063c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 21 Mar 2021 16:27:19 -0400 Subject: [PATCH 27/63] add compute_max_(in_out)_(degree|weight_sum) --- cpp/include/experimental/graph_view.hpp | 12 ++ cpp/include/utilities/device_comm.cuh | 100 ++++++++++++++ cpp/src/experimental/graph_view.cu | 169 ++++++++++++++++++++++++ 3 files changed, 281 insertions(+) diff --git a/cpp/include/experimental/graph_view.hpp b/cpp/include/experimental/graph_view.hpp index aa8cad5f9b3..73e05e646a7 100644 --- a/cpp/include/experimental/graph_view.hpp +++ b/cpp/include/experimental/graph_view.hpp @@ -482,6 +482,12 @@ class graph_view_t compute_in_weight_sums(raft::handle_t const& handle) const; rmm::device_uvector compute_out_weight_sums(raft::handle_t const& handle) const; + edge_t compute_max_in_degree(raft::handle_t const& handle) const; + edge_t compute_max_out_degree(raft::handle_t const& handle) const; + + weight_t compute_max_in_weight_sum(raft::handle_t const& handle) const; + weight_t compute_max_out_weight_sum(raft::handle_t const& handle) const; + private: std::vector adj_matrix_partition_offsets_{}; std::vector adj_matrix_partition_indices_{}; @@ -630,6 +636,12 @@ class graph_view_t compute_in_weight_sums(raft::handle_t const& handle) const; rmm::device_uvector compute_out_weight_sums(raft::handle_t const& handle) const; + edge_t compute_max_in_degree(raft::handle_t const& handle) const; + edge_t compute_max_out_degree(raft::handle_t const& handle) const; + + weight_t compute_max_in_weight_sum(raft::handle_t const& handle) const; + weight_t compute_max_out_weight_sum(raft::handle_t const& handle) const; + private: edge_t const* offsets_{nullptr}; vertex_t const* indices_{nullptr}; diff --git a/cpp/include/utilities/device_comm.cuh b/cpp/include/utilities/device_comm.cuh index 7b9956902cc..c0e34d0a61b 100644 --- a/cpp/include/utilities/device_comm.cuh +++ b/cpp/include/utilities/device_comm.cuh @@ -413,6 +413,66 @@ struct device_bcast_tuple_iterator_element_impl +std::enable_if_t::value, void> +device_allreduce_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + // no-op +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allreduce_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + static_assert(std::is_same::value_type, + typename std::iterator_traits::value_type>::value); + comm.allreduce(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, op, stream); +} + +template +struct device_allreduce_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) const + { + device_allreduce_impl(comm, + thrust::get(input_first.get_iterator_tuple()), + thrust::get(output_first.get_iterator_tuple()), + count, + op, + stream); + device_allreduce_tuple_iterator_element_impl( + comm, input_first, output_first, count, op, stream); + } +}; + +template +struct device_allreduce_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + 
raft::comms::op_t op, + cudaStream_t stream) const + { + } +}; + template std::enable_if_t::value, void> device_reduce_impl(raft::comms::comms_t const& comm, @@ -854,6 +914,46 @@ device_bcast(raft::comms::comms_t const& comm, comm, input_first, output_first, count, root, stream); } +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allreduce(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + detail::device_allreduce_impl(comm, input_first, output_first, count, op, stream); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_allreduce(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + detail::device_allreduce_tuple_iterator_element_impl( + comm, input_first, output_first, count, op, stream); +} + template std::enable_if_t< std::is_arithmetic::value_type>::value, diff --git a/cpp/src/experimental/graph_view.cu b/cpp/src/experimental/graph_view.cu index 2fa2fe9560b..74c3e2f7de4 100644 --- a/cpp/src/experimental/graph_view.cu +++ b/cpp/src/experimental/graph_view.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -520,6 +521,174 @@ rmm::device_uvector graph_view_t< } } +template +edge_t +graph_view_t>:: + compute_max_in_degree(raft::handle_t const& handle) const +{ + auto in_degrees = compute_in_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_degrees.begin(), + in_degrees.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(); +} + +template +edge_t graph_view_t>::compute_max_in_degree(raft::handle_t const& + handle) const +{ + auto in_degrees = compute_in_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_degrees.begin(), + in_degrees.end()); + edge_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + +template +edge_t +graph_view_t>:: + compute_max_out_degree(raft::handle_t const& handle) const +{ + auto out_degrees = compute_out_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_degrees.begin(), + out_degrees.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(); +} + +template +edge_t graph_view_t>::compute_max_out_degree(raft::handle_t const& + handle) const +{ + auto out_degrees = compute_out_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_degrees.begin(), + out_degrees.end()); + edge_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + +template +weight_t +graph_view_t>:: + compute_max_in_weight_sum(raft::handle_t const& handle) const +{ + auto in_weight_sums = 
compute_in_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_weight_sums.begin(), + in_weight_sums.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(); +} + +template +weight_t graph_view_t>::compute_max_in_weight_sum(raft::handle_t const& + handle) const +{ + auto in_weight_sums = compute_in_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_weight_sums.begin(), + in_weight_sums.end()); + weight_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + +template +weight_t +graph_view_t>:: + compute_max_out_weight_sum(raft::handle_t const& handle) const +{ + auto out_weight_sums = compute_out_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_weight_sums.begin(), + out_weight_sums.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(); +} + +template +weight_t graph_view_t< + vertex_t, + edge_t, + weight_t, + store_transposed, + multi_gpu, + std::enable_if_t>::compute_max_out_weight_sum(raft::handle_t const& handle) const +{ + auto out_weight_sums = compute_out_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_weight_sums.begin(), + out_weight_sums.end()); + weight_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + // explicit instantiation template class graph_view_t; From eafecd9bc32a48fc87199b35ff2dd8710a8f2051 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 23 Mar 2021 11:34:03 -0400 Subject: [PATCH 28/63] move is_valid_vertex from tests/utilities/test_utilities.hpp to include/experimental/graph.hpp --- cpp/include/experimental/graph.hpp | 14 +++++++++++ cpp/tests/community/egonet_test.cu | 7 +++--- cpp/tests/experimental/coarsen_graph_test.cpp | 5 ++-- cpp/tests/experimental/generate_rmat_test.cpp | 23 +++++++++++-------- cpp/tests/utilities/test_utilities.hpp | 14 ----------- 5 files changed, 34 insertions(+), 29 deletions(-) diff --git a/cpp/include/experimental/graph.hpp b/cpp/include/experimental/graph.hpp index 6a10256e6f4..0e88ba49bc0 100644 --- a/cpp/include/experimental/graph.hpp +++ b/cpp/include/experimental/graph.hpp @@ -188,6 +188,20 @@ template struct invalid_edge_id : invalid_idx { }; +template +std::enable_if_t::value, bool> is_valid_vertex(vertex_t num_vertices, + vertex_t v) +{ + return (v >= 0) && (v < num_vertices); +} + +template +std::enable_if_t::value, bool> is_valid_vertex(vertex_t num_vertices, + vertex_t v) +{ + return v < num_vertices; +} + } // namespace experimental } // namespace cugraph diff --git a/cpp/tests/community/egonet_test.cu b/cpp/tests/community/egonet_test.cu index a9224b42bc1..eab5cb31d8b 100644 --- a/cpp/tests/community/egonet_test.cu +++ b/cpp/tests/community/egonet_test.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -129,8 +128,10 @@ class Tests_InducedEgo : public ::testing::TestWithParam { ASSERT_TRUE(h_cugraph_ego_edge_offsets[i] <= h_cugraph_ego_edge_offsets[i + 1]); auto n_vertices = graph_view.get_number_of_vertices(); 
for (size_t i = 0; i < d_ego_edgelist_src.size(); i++) { - ASSERT_TRUE(cugraph::test::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_src[i])); - ASSERT_TRUE(cugraph::test::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_dst[i])); + ASSERT_TRUE( + cugraph::experimental::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_src[i])); + ASSERT_TRUE( + cugraph::experimental::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_dst[i])); } /* diff --git a/cpp/tests/experimental/coarsen_graph_test.cpp b/cpp/tests/experimental/coarsen_graph_test.cpp index 789619f2cd9..0fc0634bbbc 100644 --- a/cpp/tests/experimental/coarsen_graph_test.cpp +++ b/cpp/tests/experimental/coarsen_graph_test.cpp @@ -54,13 +54,14 @@ void check_coarsened_graph_results(edge_t* org_offsets, ASSERT_TRUE(std::count_if(org_indices, org_indices + org_offsets[num_org_vertices], [num_org_vertices](auto nbr) { - return !cugraph::test::is_valid_vertex(num_org_vertices, nbr); + return !cugraph::experimental::is_valid_vertex(num_org_vertices, nbr); }) == 0); ASSERT_TRUE(std::is_sorted(coarse_offsets, coarse_offsets + num_coarse_vertices)); ASSERT_TRUE(std::count_if(coarse_indices, coarse_indices + coarse_offsets[num_coarse_vertices], [num_coarse_vertices](auto nbr) { - return !cugraph::test::is_valid_vertex(num_coarse_vertices, nbr); + return !cugraph::experimental::is_valid_vertex(num_coarse_vertices, + nbr); }) == 0); ASSERT_TRUE(num_coarse_vertices <= num_org_vertices); diff --git a/cpp/tests/experimental/generate_rmat_test.cpp b/cpp/tests/experimental/generate_rmat_test.cpp index 249a1a3c6c8..7825724249c 100644 --- a/cpp/tests/experimental/generate_rmat_test.cpp +++ b/cpp/tests/experimental/generate_rmat_test.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -200,17 +201,19 @@ class Tests_GenerateRmat : public ::testing::TestWithParam (h_cugraph_srcs.size() == (size_t{1} << configuration.scale) * configuration.edge_factor) && (h_cugraph_dsts.size() == (size_t{1} << configuration.scale) * configuration.edge_factor)) << "Returned an invalid number of R-mat graph edges."; - ASSERT_TRUE( - std::count_if(h_cugraph_srcs.begin(), - h_cugraph_srcs.end(), - [num_vertices = static_cast(size_t{1} << configuration.scale)]( - auto v) { return !cugraph::test::is_valid_vertex(num_vertices, v); }) == 0) + ASSERT_TRUE(std::count_if(h_cugraph_srcs.begin(), + h_cugraph_srcs.end(), + [num_vertices = static_cast( + size_t{1} << configuration.scale)](auto v) { + return !cugraph::experimental::is_valid_vertex(num_vertices, v); + }) == 0) << "Returned R-mat graph edges have invalid source vertex IDs."; - ASSERT_TRUE( - std::count_if(h_cugraph_dsts.begin(), - h_cugraph_dsts.end(), - [num_vertices = static_cast(size_t{1} << configuration.scale)]( - auto v) { return !cugraph::test::is_valid_vertex(num_vertices, v); }) == 0) + ASSERT_TRUE(std::count_if(h_cugraph_dsts.begin(), + h_cugraph_dsts.end(), + [num_vertices = static_cast( + size_t{1} << configuration.scale)](auto v) { + return !cugraph::experimental::is_valid_vertex(num_vertices, v); + }) == 0) << "Returned R-mat graph edges have invalid destination vertex IDs."; if (!scramble) { diff --git a/cpp/tests/utilities/test_utilities.hpp b/cpp/tests/utilities/test_utilities.hpp index de7e442ea69..3937c1a75ff 100644 --- a/cpp/tests/utilities/test_utilities.hpp +++ b/cpp/tests/utilities/test_utilities.hpp @@ -168,19 +168,5 @@ struct input_graph_specifier_t { rmat_params_t rmat_params{}; }; -template -std::enable_if_t::value, bool> is_valid_vertex(vertex_t num_vertices, - vertex_t v) -{ 
- return (v >= 0) && (v < num_vertices); -} - -template -std::enable_if_t::value, bool> is_valid_vertex(vertex_t num_vertices, - vertex_t v) -{ - return v < num_vertices; -} - } // namespace test } // namespace cugraph From d5dbc2842a7e23120d59244bce406371a3094b08 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 23 Mar 2021 13:29:08 -0400 Subject: [PATCH 29/63] add renumber/unrenumber vertex utility functions --- cpp/CMakeLists.txt | 1 + cpp/include/experimental/graph_functions.hpp | 134 ++++++- cpp/include/utilities/collect_comm.cuh | 3 +- cpp/src/experimental/relabel.cu | 1 + cpp/src/experimental/renumber_utils.cu | 360 +++++++++++++++++++ 5 files changed, 491 insertions(+), 8 deletions(-) create mode 100644 cpp/src/experimental/renumber_utils.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 26a8f98e265..16897b8173f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -426,6 +426,7 @@ add_library(cugraph SHARED src/experimental/graph_view.cu src/experimental/coarsen_graph.cu src/experimental/renumber_edgelist.cu + src/experimental/renumber_utils.cu src/experimental/relabel.cu src/experimental/induced_subgraph.cu src/experimental/bfs.cu diff --git a/cpp/include/experimental/graph_functions.hpp b/cpp/include/experimental/graph_functions.hpp index e9740d67666..1c464683b7c 100644 --- a/cpp/include/experimental/graph_functions.hpp +++ b/cpp/include/experimental/graph_functions.hpp @@ -17,13 +17,13 @@ #include #include -#include #include #include #include #include +#include namespace cugraph { namespace experimental { @@ -40,9 +40,24 @@ namespace experimental { * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. - * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major, minor) pair should return the GPU ID of this process and applying the compute_partition_id_from_edge_t fuctor to every (major, minor) pair for a local matrix partition should return the partition ID of the corresponding matrix partition. - * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major, minor) pair should return the GPU ID of this process and applying the compute_partition_id_from_edge_t fuctor to every (major, minor) pair for a local matrix partition should return the partition ID of the corresponding matrix partition. - * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition assigned to this process). 
+ * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as
+ * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
+ * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix partition
+ * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored
+ * as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs
+ * are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix partition
+ * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition
+ * assigned to this process).
  * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
  * @return std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>
  * Quadruplet of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to
@@ -104,9 +119,24 @@ std::enable_if_t> renumber_edgelist(
  * the compute_gpu_id_from_vertex_t to every vertex should return the local GPU ID for this function
  * to work (vertices should be pre-shuffled).
  * @param num_local_vertices Number of local vertices.
- * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major, minor) pair should return the GPU ID of this process and applying the compute_partition_id_from_edge_t fuctor to every (major, minor) pair for a local matrix partition should return the partition ID of the corresponding matrix partition.
- * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major, minor) pair should return the GPU ID of this process and applying the compute_partition_id_from_edge_t fuctor to every (major, minor) pair for a local matrix partition should return the partition ID of the corresponding matrix partition.
- * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition assigned to this process).
+ * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as
+ * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
+ * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix partition
+ * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored
+ * as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs
+ * are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix partition
+ * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition
+ * assigned to this process).
  * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
  * @return std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>
  * Quadruplet of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to
@@ -159,6 +189,96 @@ std::enable_if_t> renumber_edgelist(
   edge_t num_edgelist_edges,
   bool do_expensive_check = false);
 
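A minimal usage sketch of the single-GPU renumber_edgelist overload above (illustrative
only, not part of the patch; the device arrays are hypothetical and the argument list is
an assumption inferred from the declarations in this header):

    size_t constexpr num_edges = 8;
    raft::handle_t handle{};
    rmm::device_uvector<int32_t> srcs(num_edges, handle.get_stream());  // external source IDs
    rmm::device_uvector<int32_t> dsts(num_edges, handle.get_stream());  // external destination IDs
    // ... fill srcs/dsts with external vertex IDs ...
    auto renumber_map_labels = cugraph::experimental::renumber_edgelist<int32_t, int32_t, false>(
      handle, nullptr, int32_t{0}, srcs.data(), dsts.data(), static_cast<int32_t>(num_edges));
    // srcs/dsts now hold internal IDs; renumber_map_labels[i] is the external ID of internal vertex i
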
+/**
+ * @brief Renumber external vertices to internal vertices based on the provided @p
+ * renumber_map_labels.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the vertices to be renumbered. The input external vertices are
+ * renumbered to internal vertices in-place.
+ * @param num_vertices Number of vertices to be renumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename vertex_t, bool multi_gpu>
+void renumber_ext_vertices(raft::handle_t const& handle,
+                           vertex_t* vertices /* [INOUT] */,
+                           size_t num_vertices,
+                           vertex_t const* renumber_map_labels,
+                           vertex_t local_int_vertex_first,
+                           vertex_t local_int_vertex_last,
+                           bool do_expensive_check = false);
+
+/**
+ * @brief Unrenumber local internal vertices to external vertices based on the provided @p
+ * renumber_map_labels.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the local internal vertices to be unrenumbered. Each input element
+ * should be in [@p local_int_vertex_first, @p local_int_vertex_last). The input internal vertices
+ * are renumbered to external vertices in-place.
+ * @param num_vertices Number of vertices to be unrenumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename vertex_t>
+void unrenumber_local_int_vertices(
+  raft::handle_t const& handle,
+  vertex_t* vertices /* [INOUT] */,
+  size_t num_vertices,
+  vertex_t const* renumber_map_labels /* size = local_int_vertex_last - local_int_vertex_first */,
+  vertex_t local_int_vertex_first,
+  vertex_t local_int_vertex_last,
+  bool do_expensive_check = false);
+
+/**
+ * @brief Unrenumber (possibly non-local) internal vertices to external vertices based on the
+ * provided @p renumber_map_labels.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the internal vertices to be unrenumbered. The input internal vertices
+ * are renumbered to external vertices in-place.
+ * @param num_vertices Number of vertices to be unrenumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param vertex_partition_lasts Last local internal vertices (exclusive, assigned to each process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename vertex_t, bool multi_gpu>
+void unrenumber_int_vertices(raft::handle_t const& handle,
+                             vertex_t* vertices /* [INOUT] */,
+                             size_t num_vertices,
+                             vertex_t const* renumber_map_labels,
+                             vertex_t local_int_vertex_first,
+                             vertex_t local_int_vertex_last,
+                             std::vector<vertex_t>& vertex_partition_lasts,
+                             bool do_expensive_check = false);
+
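A sketch of the intended round trip through the helpers declared above (illustrative only,
not part of the patch; single-GPU case, assuming renumber_map_labels came from a prior
renumber_edgelist() call and num_vertices is the local vertex count):

    std::vector<int32_t> h_ext{5, 7, 5};  // hypothetical external vertex IDs
    rmm::device_uvector<int32_t> v(h_ext.size(), handle.get_stream());
    raft::update_device(v.data(), h_ext.data(), h_ext.size(), handle.get_stream());
    cugraph::experimental::renumber_ext_vertices<int32_t, false>(
      handle, v.data(), v.size(), renumber_map_labels.data(), int32_t{0}, num_vertices);
    // v now holds internal IDs in [0, num_vertices)
    cugraph::experimental::unrenumber_local_int_vertices(
      handle, v.data(), v.size(), renumber_map_labels.data(), int32_t{0}, num_vertices);
    // v holds the original external IDs again
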
 /**
  * @brief Compute the coarsened graph.
  *
diff --git a/cpp/include/utilities/collect_comm.cuh b/cpp/include/utilities/collect_comm.cuh
index 5ca58ebeb17..e2d881c84ff 100644
--- a/cpp/include/utilities/collect_comm.cuh
+++ b/cpp/include/utilities/collect_comm.cuh
@@ -58,7 +58,8 @@ collect_values_for_keys(raft::comms::comms_t const &comm,
   double constexpr load_factor = 0.7;
 
   // FIXME: we may compare the performance & memory footprint of this hash based approach vs binary
-  // search based approach
+  // search based approach (especially when thrust::distance(collect_key_first, collect_key_last) <<
+  // thrust::distance(map_key_first, map_key_last))
 
   // 1. build a cuco::static_map object for the map k, v pairs.
 
diff --git a/cpp/src/experimental/relabel.cu b/cpp/src/experimental/relabel.cu
index 62bd6951f71..31133ae64fa 100644
--- a/cpp/src/experimental/relabel.cu
+++ b/cpp/src/experimental/relabel.cu
@@ -42,6 +42,7 @@
 namespace cugraph {
 namespace experimental {
 
+// FIXME: think about requiring old_new_label_pairs to be pre-shuffled
 template <typename vertex_t, bool multi_gpu>
 void relabel(raft::handle_t const& handle,
              std::tuple<vertex_t const*, vertex_t const*> old_new_label_pairs,
diff --git a/cpp/src/experimental/renumber_utils.cu b/cpp/src/experimental/renumber_utils.cu
new file mode 100644
index 00000000000..8915065f17e
--- /dev/null
+++ b/cpp/src/experimental/renumber_utils.cu
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { + +template +void renumber_ext_vertices(raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + bool do_expensive_check) +{ + double constexpr load_factor = 0.7; + + // FIXME: remove this check once we drop Pascal support + CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, + "renumber_vertices() not supported on Pascal and older architectures."); + +#ifdef CUCO_STATIC_MAP_DEFINED + if (do_expensive_check) { + rmm::device_uvector labels(local_int_vertex_last - local_int_vertex_first, + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + renumber_map_labels, + renumber_map_labels + labels.size(), + labels.begin()); + thrust::sort( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), labels.begin(), labels.end()); + CUGRAPH_EXPECTS(thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + labels.begin(), + labels.end()) == labels.end(), + "Invalid input arguments: renumber_map_labels have duplicate elements."); + } + + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + // FIXME: renumbered_vertices and thrust::copy are unnecessary if there exists an in-place + // version of collect_values_for_keys (when the key type is identical to the value type) + auto renumbered_vertices = + collect_values_for_keys(comm, + renumber_map_labels, + renumber_map_labels + local_int_vertex_last - local_int_vertex_first, + thrust::make_counting_iterator(local_int_vertex_first), + vertices, + vertices + num_vertices, + detail::compute_gpu_id_from_vertex_t{comm_size}, + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + renumbered_vertices.begin(), + renumbered_vertices.end(), + vertices); + } else { + cuco::static_map renumber_map{ + static_cast(static_cast(local_int_vertex_last - local_int_vertex_first) / + load_factor), + invalid_vertex_id::value, + invalid_vertex_id::value}; + + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(renumber_map_labels, thrust::make_counting_iterator(vertex_t{0}))), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + renumber_map.insert(pair_first, pair_first + (local_int_vertex_last - local_int_vertex_first)); + renumber_map.find(vertices, vertices + num_vertices, vertices); + } + + if (do_expensive_check) { + CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + [] __device__(auto v) { + return v == invalid_vertex_id::value; + }) == 0, + "Invalid input arguments: vertices have elements that are missing in " + "(aggregate) renumber_map_labels."); + } +#endif +} + +template +void unrenumber_local_int_vertices( + raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels /* size = local_int_vertex_last - local_int_vertex_first */, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + bool do_expensive_check) +{ + // FIXME: remove this check once we drop Pascal support + 
CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, + "unrenumber_local_vertices() not supported on Pascal and older architectures."); + +#ifdef CUCO_STATIC_MAP_DEFINED + if (do_expensive_check) { + CUGRAPH_EXPECTS( + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + [local_int_vertex_first, local_int_vertex_last] __device__(auto v) { + return v < local_int_vertex_first || v >= local_int_vertex_last; + }) == 0, + "Invalid input arguments: there are non-local vertices in [vertices, vertices " + "+ num_vertices)."); + } + + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + vertices, + [renumber_map_labels, local_int_vertex_first] __device__(auto v) { + return renumber_map_labels[v - local_int_vertex_first]; + }); +#endif +} + +template +void unrenumber_int_vertices(raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check) +{ + double constexpr load_factor = 0.7; + + // FIXME: remove this check once we drop Pascal support + CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, + "unrenumber_vertices() not supported on Pascal and older architectures."); + +#ifdef CUCO_STATIC_MAP_DEFINED + if (do_expensive_check) { + CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + [num_vertices = vertex_partition_lasts.back()] __device__( + auto v) { return !is_valid_vertex(num_vertices, v); }) == 0, + "Invalid input arguments: there are non-local vertices in [vertices, vertices " + "+ num_vertices)."); + } + + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector sorted_unique_int_vertices(num_vertices, handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + sorted_unique_int_vertices.begin()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end()); + sorted_unique_int_vertices.resize( + thrust::distance( + sorted_unique_int_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end())), + handle.get_stream()); + + rmm::device_uvector d_vertex_partition_lasts(vertex_partition_lasts.size(), + handle.get_stream()); + raft::update_device(d_vertex_partition_lasts.data(), + vertex_partition_lasts.data(), + vertex_partition_lasts.size(), + handle.get_stream()); + rmm::device_uvector d_tx_int_vertex_offsets(d_vertex_partition_lasts.size(), + handle.get_stream()); + thrust::lower_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end(), + d_vertex_partition_lasts.begin(), + d_vertex_partition_lasts.end(), + d_tx_int_vertex_offsets.begin()); + std::vector h_tx_int_vertex_counts(d_tx_int_vertex_offsets.size()); + raft::update_host(h_tx_int_vertex_counts.data(), + d_tx_int_vertex_offsets.data(), + d_tx_int_vertex_offsets.size(), + handle.get_stream()); + handle.get_stream_view().synchronize(); + std::adjacent_difference( + 
h_tx_int_vertex_counts.begin(), h_tx_int_vertex_counts.end(), h_tx_int_vertex_counts.begin()); + + rmm::device_uvector rx_int_vertices(0, handle.get_stream()); + std::vector rx_int_vertex_counts{}; + std::tie(rx_int_vertices, rx_int_vertex_counts) = shuffle_values( + comm, sorted_unique_int_vertices.begin(), h_tx_int_vertex_counts, handle.get_stream()); + + auto tx_ext_vertices = std::move(rx_int_vertices); + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + tx_ext_vertices.begin(), + tx_ext_vertices.end(), + tx_ext_vertices.begin(), + [renumber_map_labels, local_int_vertex_first] __device__(auto v) { + return renumber_map_labels[v - local_int_vertex_first]; + }); + + rmm::device_uvector rx_ext_vertices_for_sorted_unique_int_vertices( + 0, handle.get_stream()); + std::tie(rx_ext_vertices_for_sorted_unique_int_vertices, std::ignore) = + shuffle_values(comm, tx_ext_vertices.begin(), rx_int_vertex_counts, handle.get_stream()); + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + cuco::static_map unrenumber_map( + static_cast( + static_cast(static_cast(sorted_unique_int_vertices.size()) / load_factor)), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(sorted_unique_int_vertices.begin(), + rx_ext_vertices_for_sorted_unique_int_vertices.begin())), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + unrenumber_map.insert(pair_first, pair_first + sorted_unique_int_vertices.size()); + unrenumber_map.find(vertices, vertices + num_vertices, vertices); + } else { + unrenumber_local_int_vertices(handle, + vertices, + num_vertices, + renumber_map_labels, + local_int_vertex_first, + local_int_vertex_last, + do_expensive_check); + } +#endif +} + +// explicit instantiation + +template void renumber_ext_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_local_int_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_local_int_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t 
const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +} // namespace experimental +} // namespace cugraph From 78105ba61eacc6e6884af1aeb1c573e7170f301a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 23 Mar 2021 13:43:21 -0400 Subject: [PATCH 30/63] mark is_valid_vertex as __host__ __device__ function --- cpp/include/experimental/graph.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/experimental/graph.hpp b/cpp/include/experimental/graph.hpp index 0e88ba49bc0..a380200ea1f 100644 --- a/cpp/include/experimental/graph.hpp +++ b/cpp/include/experimental/graph.hpp @@ -189,15 +189,15 @@ struct invalid_edge_id : invalid_idx { }; template -std::enable_if_t::value, bool> is_valid_vertex(vertex_t num_vertices, - vertex_t v) +__host__ __device__ std::enable_if_t::value, bool> is_valid_vertex( + vertex_t num_vertices, vertex_t v) { return (v >= 0) && (v < num_vertices); } template -std::enable_if_t::value, bool> is_valid_vertex(vertex_t num_vertices, - vertex_t v) +__host__ __device__ std::enable_if_t::value, bool> is_valid_vertex( + vertex_t num_vertices, vertex_t v) { return v < num_vertices; } From a1e76383aa5fd7d8d5cb8dd7971ff05b63d9eb9a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 23 Mar 2021 16:11:25 -0400 Subject: [PATCH 31/63] refactor test utilities related to (un)renumber and update renumber utilities to handle invalid vertex id --- cpp/include/experimental/graph_functions.hpp | 6 + cpp/include/utilities/collect_comm.cuh | 114 +++++++++++++- cpp/src/experimental/renumber_utils.cu | 130 +++++++++++----- cpp/tests/utilities/renumber_utilities.cu | 143 ------------------ cpp/tests/utilities/thrust_wrapper.cu | 81 ++++++++++ ...umber_utilities.hpp => thrust_wrapper.hpp} | 18 +-- 6 files changed, 295 insertions(+), 197 deletions(-) delete mode 100644 cpp/tests/utilities/renumber_utilities.cu create mode 100644 cpp/tests/utilities/thrust_wrapper.cu rename cpp/tests/utilities/{renumber_utilities.hpp => thrust_wrapper.hpp} (54%) diff --git a/cpp/include/experimental/graph_functions.hpp b/cpp/include/experimental/graph_functions.hpp index 1c464683b7c..100742adccd 100644 --- a/cpp/include/experimental/graph_functions.hpp +++ b/cpp/include/experimental/graph_functions.hpp @@ -193,6 +193,8 @@ std::enable_if_t> renumber_edgelist( * @brief Renumber external vertices to internal vertices based on the provoided @p * renumber_map_labels. * + * Note cugraph::experimental::invalid_id::value remains unchanged. + * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. 
 * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
 * or multi-GPU (true).
@@ -222,6 +224,8 @@ void renumber_ext_vertices(raft::handle_t const& handle,
  * @brief Unrenumber local internal vertices to external vertices based on the provided @p
  * renumber_map_labels.
  *
+ * Note cugraph::experimental::invalid_id::value remains unchanged.
+ *
  * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
@@ -251,6 +255,8 @@ void unrenumber_local_int_vertices(
  * @brief Unrenumber (possibly non-local) internal vertices to external vertices based on the
  * provided @p renumber_map_labels.
  *
+ * Note cugraph::experimental::invalid_id::value remains unchanged.
+ *
  * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
  * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
  * or multi-GPU (true).
diff --git a/cpp/include/utilities/collect_comm.cuh b/cpp/include/utilities/collect_comm.cuh
index e2d881c84ff..ab4a070cb79 100644
--- a/cpp/include/utilities/collect_comm.cuh
+++ b/cpp/include/utilities/collect_comm.cuh
@@ -83,9 +83,6 @@ collect_values_for_keys(raft::comms::comms_t const &comm,
                  stream);
   thrust::copy(
     rmm::exec_policy(stream)->on(stream), collect_key_first, collect_key_last, unique_keys.begin());
-  // FIXME: sort and unique are unnecessary if the keys in [collect_key_first, collect_key_last) are
-  // already unique, if this cost becomes a performance bottlenec, we may add
-  // collect_values_for_unique_keys in the future
   thrust::sort(rmm::exec_policy(stream)->on(stream), unique_keys.begin(), unique_keys.end());
   unique_keys.resize(
     thrust::distance(
@@ -150,5 +147,116 @@ collect_values_for_keys(raft::comms::comms_t const &comm,
   return value_buffer;
 }
 
+// for key = [map_key_first, map_key_last), key_to_gpu_id_op(key) should coincide with
+// comm.get_rank()
+template <typename VertexIterator0, typename VertexIterator1, typename ValueIterator, typename KeyToGPUIdOp>
+decltype(allocate_dataframe_buffer<typename std::iterator_traits<ValueIterator>::value_type>(
+  0, cudaStream_t{nullptr}))
+collect_values_for_unique_keys(raft::comms::comms_t const &comm,
+                               VertexIterator0 map_key_first,
+                               VertexIterator0 map_key_last,
+                               ValueIterator map_value_first,
+                               VertexIterator1 collect_unique_key_first,
+                               VertexIterator1 collect_unique_key_last,
+                               KeyToGPUIdOp key_to_gpu_id_op,
+                               cudaStream_t stream)
+{
+  using vertex_t = typename std::iterator_traits<VertexIterator0>::value_type;
+  static_assert(
+    std::is_same<typename std::iterator_traits<VertexIterator1>::value_type, vertex_t>::value);
+  using value_t = typename std::iterator_traits<ValueIterator>::value_type;
+
+  double constexpr load_factor = 0.7;
+
+  // FIXME: we may compare the performance & memory footprint of this hash based approach vs binary
+  // search based approach (especially when thrust::distance(collect_unique_key_first,
+  // collect_unique_key_last) << thrust::distance(map_key_first, map_key_last))
+
+  // 1. build a cuco::static_map object for the map k, v pairs.
+
+  auto kv_map_ptr = std::make_unique<cuco::static_map<vertex_t, value_t>>(
+    static_cast<size_t>(static_cast<double>(thrust::distance(map_key_first, map_key_last)) /
+                        load_factor),
+    invalid_vertex_id<vertex_t>::value,
+    invalid_vertex_id<value_t>::value);
+  {
+    auto pair_first = thrust::make_transform_iterator(
+      thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)),
+      [] __device__(auto val) {
+        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+      });
+    kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last));
+  }
+
+  // 2. collect values for the unique keys in [collect_unique_key_first, collect_unique_key_last)
+
+  rmm::device_uvector<vertex_t> unique_keys(
+    thrust::distance(collect_unique_key_first, collect_unique_key_last), stream);
+  thrust::copy(rmm::exec_policy(stream)->on(stream),
+               collect_unique_key_first,
+               collect_unique_key_last,
+               unique_keys.begin());
+
+  rmm::device_uvector<value_t> values_for_unique_keys(0, stream);
+  {
+    rmm::device_uvector<vertex_t> rx_unique_keys(0, stream);
+    std::vector<size_t> rx_value_counts{};
+    std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values(
+      comm,
+      unique_keys.begin(),
+      unique_keys.end(),
+      [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); },
+      stream);
+
+    rmm::device_uvector<value_t> values_for_rx_unique_keys(rx_unique_keys.size(), stream);
+
+    CUDA_TRY(cudaStreamSynchronize(stream));  // cuco::static_map currently does not take stream
+
+    kv_map_ptr->find(
+      rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin());
+
+    rmm::device_uvector<value_t> rx_values_for_unique_keys(0, stream);
+    std::tie(rx_values_for_unique_keys, std::ignore) =
+      shuffle_values(comm, values_for_rx_unique_keys.begin(), rx_value_counts, stream);
+
+    values_for_unique_keys = std::move(rx_values_for_unique_keys);
+  }
+
+  // 3. re-build a cuco::static_map object for the k, v pairs in unique_keys,
+  // values_for_unique_keys.
+
+  CUDA_TRY(cudaStreamSynchronize(stream));  // cuco::static_map currently does not take stream
+
+  kv_map_ptr.reset();
+
+  kv_map_ptr = std::make_unique<cuco::static_map<vertex_t, value_t>>(
+    static_cast<size_t>(static_cast<double>(unique_keys.size()) / load_factor),
+    invalid_vertex_id<vertex_t>::value,
+    invalid_vertex_id<value_t>::value);
+  {
+    auto pair_first = thrust::make_transform_iterator(
+      thrust::make_zip_iterator(
+        thrust::make_tuple(unique_keys.begin(), values_for_unique_keys.begin())),
+      [] __device__(auto val) {
+        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+      });
+
+    kv_map_ptr->insert(pair_first, pair_first + unique_keys.size());
+  }
+
+  // 4. find values for [collect_unique_key_first, collect_unique_key_last)
+
+  auto value_buffer = allocate_dataframe_buffer<value_t>(
+    thrust::distance(collect_unique_key_first, collect_unique_key_last), stream);
+  kv_map_ptr->find(collect_unique_key_first,
+                   collect_unique_key_last,
+                   get_dataframe_buffer_begin<value_t>(value_buffer));
+
+  return value_buffer;
+}
+
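A usage sketch of collect_values_for_unique_keys() (illustrative only, not part of the
patch; map_keys, map_values, and query_keys are hypothetical device arrays, with each rank
owning the keys for which key_to_gpu_id_op(key) == comm.get_rank() and query_keys already
deduplicated):

    auto values = cugraph::experimental::collect_values_for_unique_keys(
      comm,
      map_keys.begin(), map_keys.end(),      // this rank's portion of the distributed map
      map_values.begin(),                    // values aligned with map_keys
      query_keys.begin(), query_keys.end(),  // unique keys to look up (possibly non-local)
      cugraph::experimental::detail::compute_gpu_id_from_vertex_t<int32_t>{comm.get_size()},
      handle.get_stream());
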
 }  // namespace experimental
 }  // namespace cugraph
diff --git a/cpp/src/experimental/renumber_utils.cu b/cpp/src/experimental/renumber_utils.cu
index 8915065f17e..606fd3d32dc 100644
--- a/cpp/src/experimental/renumber_utils.cu
+++ b/cpp/src/experimental/renumber_utils.cu
@@ -66,31 +66,67 @@ void renumber_ext_vertices(raft::handle_t const& handle,
                     "Invalid input arguments: renumber_map_labels have duplicate elements.");
   }
 
+  auto renumber_map_ptr = std::make_unique<cuco::static_map<vertex_t, vertex_t>>(
+    size_t{0}, invalid_vertex_id<vertex_t>::value, invalid_vertex_id<vertex_t>::value);
   if (multi_gpu) {
     auto& comm           = handle.get_comms();
     auto const comm_size = comm.get_size();
 
-    // FIXME: renumbered_vertices and thrust::copy are unnecessary if there exists an in-place
-    // version of collect_values_for_keys (when the key type is identical to the value type)
-    auto renumbered_vertices =
-      collect_values_for_keys(comm,
-                              renumber_map_labels,
-                              renumber_map_labels + local_int_vertex_last - local_int_vertex_first,
-                              thrust::make_counting_iterator(local_int_vertex_first),
-                              vertices,
-                              vertices + num_vertices,
-                              detail::compute_gpu_id_from_vertex_t<vertex_t>{comm_size},
-                              handle.get_stream());
-    thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-                 renumbered_vertices.begin(),
-                 renumbered_vertices.end(),
-                 vertices);
+    rmm::device_uvector<vertex_t> sorted_unique_ext_vertices(num_vertices, handle.get_stream());
+    sorted_unique_ext_vertices.resize(
+      thrust::distance(
+        sorted_unique_ext_vertices.begin(),
+        thrust::copy_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                        vertices,
+                        vertices + num_vertices,
+                        sorted_unique_ext_vertices.begin(),
+                        [] __device__(auto v) { return v != invalid_vertex_id<vertex_t>::value; })),
+      handle.get_stream());
+    thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                 sorted_unique_ext_vertices.begin(),
+                 sorted_unique_ext_vertices.end());
+    sorted_unique_ext_vertices.resize(
+      thrust::distance(
+        sorted_unique_ext_vertices.begin(),
+        thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                       sorted_unique_ext_vertices.begin(),
+                       sorted_unique_ext_vertices.end())),
+      handle.get_stream());
+
+    auto int_vertices_for_sorted_unique_ext_vertices = collect_values_for_unique_keys(
+      comm,
+      renumber_map_labels,
+      renumber_map_labels + (local_int_vertex_last - local_int_vertex_first),
+      thrust::make_counting_iterator(local_int_vertex_first),
+      sorted_unique_ext_vertices.begin(),
+      sorted_unique_ext_vertices.end(),
+      detail::compute_gpu_id_from_vertex_t<vertex_t>{comm_size},
+      handle.get_stream());
+
+    handle.get_stream_view().synchronize();  // cuco::static_map currently does not take stream
+
+    renumber_map_ptr.reset();
+
+    renumber_map_ptr = std::make_unique<cuco::static_map<vertex_t, vertex_t>>(
+      static_cast<size_t>(static_cast<double>(sorted_unique_ext_vertices.size()) / load_factor),
+      invalid_vertex_id<vertex_t>::value,
+      invalid_vertex_id<vertex_t>::value);
+
+    auto kv_pair_first = thrust::make_transform_iterator(
+      thrust::make_zip_iterator(thrust::make_tuple(
+        sorted_unique_ext_vertices.begin(), int_vertices_for_sorted_unique_ext_vertices.begin())),
+      [] __device__(auto val) {
+        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+      });
+    renumber_map_ptr->insert(kv_pair_first,
kv_pair_first + sorted_unique_ext_vertices.size()); } else { - cuco::static_map renumber_map{ + renumber_map_ptr.reset(); + + renumber_map_ptr = std::make_unique>( static_cast(static_cast(local_int_vertex_last - local_int_vertex_first) / load_factor), invalid_vertex_id::value, - invalid_vertex_id::value}; + invalid_vertex_id::value); auto pair_first = thrust::make_transform_iterator( thrust::make_zip_iterator( @@ -98,20 +134,29 @@ void renumber_ext_vertices(raft::handle_t const& handle, [] __device__(auto val) { return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); - renumber_map.insert(pair_first, pair_first + (local_int_vertex_last - local_int_vertex_first)); - renumber_map.find(vertices, vertices + num_vertices, vertices); + renumber_map_ptr->insert(pair_first, + pair_first + (local_int_vertex_last - local_int_vertex_first)); } if (do_expensive_check) { + rmm::device_uvector contains(num_vertices, handle.get_stream()); + renumber_map_ptr->contains(vertices, vertices + num_vertices, contains.begin()); + auto vc_pair_first = thrust::make_zip_iterator(thrust::make_tuple(vertices, contains.begin())); CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertices, - vertices + num_vertices, - [] __device__(auto v) { - return v == invalid_vertex_id::value; + vc_pair_first, + vc_pair_first + num_vertices, + [] __device__(auto pair) { + auto v = thrust::get<0>(pair); + auto c = thrust::get<1>(pair); + return v == invalid_vertex_id::value + ? (c == true) + : (c == false); }) == 0, "Invalid input arguments: vertices have elements that are missing in " "(aggregate) renumber_map_labels."); } + + renumber_map_ptr->find(vertices, vertices + num_vertices, vertices); #endif } @@ -136,7 +181,8 @@ void unrenumber_local_int_vertices( vertices, vertices + num_vertices, [local_int_vertex_first, local_int_vertex_last] __device__(auto v) { - return v < local_int_vertex_first || v >= local_int_vertex_last; + return v != invalid_vertex_id::value && + (v < local_int_vertex_first || v >= local_int_vertex_last); }) == 0, "Invalid input arguments: there are non-local vertices in [vertices, vertices " "+ num_vertices)."); @@ -147,7 +193,9 @@ void unrenumber_local_int_vertices( vertices + num_vertices, vertices, [renumber_map_labels, local_int_vertex_first] __device__(auto v) { - return renumber_map_labels[v - local_int_vertex_first]; + return v == invalid_vertex_id::value + ? 
v + : renumber_map_labels[v - local_int_vertex_first]; }); #endif } @@ -170,13 +218,16 @@ void unrenumber_int_vertices(raft::handle_t const& handle, #ifdef CUCO_STATIC_MAP_DEFINED if (do_expensive_check) { - CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertices, - vertices + num_vertices, - [num_vertices = vertex_partition_lasts.back()] __device__( - auto v) { return !is_valid_vertex(num_vertices, v); }) == 0, - "Invalid input arguments: there are non-local vertices in [vertices, vertices " - "+ num_vertices)."); + CUGRAPH_EXPECTS( + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + [num_vertices = vertex_partition_lasts.back()] __device__(auto v) { + return v != invalid_vertex_id::value && + !is_valid_vertex(num_vertices, v); + }) == 0, + "Invalid input arguments: there are out-of-range vertices in [vertices, vertices " + "+ num_vertices)."); } if (multi_gpu) { @@ -184,10 +235,15 @@ void unrenumber_int_vertices(raft::handle_t const& handle, auto const comm_size = comm.get_size(); rmm::device_uvector sorted_unique_int_vertices(num_vertices, handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertices, - vertices + num_vertices, - sorted_unique_int_vertices.begin()); + sorted_unique_int_vertices.resize( + thrust::distance( + sorted_unique_int_vertices.begin(), + thrust::copy_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + sorted_unique_int_vertices.begin(), + [] __device__(auto v) { return v != invalid_vertex_id::value; })), + handle.get_stream()); thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), sorted_unique_int_vertices.begin(), sorted_unique_int_vertices.end()); @@ -200,7 +256,7 @@ void unrenumber_int_vertices(raft::handle_t const& handle, handle.get_stream()); rmm::device_uvector d_vertex_partition_lasts(vertex_partition_lasts.size(), - handle.get_stream()); + handle.get_stream()); raft::update_device(d_vertex_partition_lasts.data(), vertex_partition_lasts.data(), vertex_partition_lasts.size(), diff --git a/cpp/tests/utilities/renumber_utilities.cu b/cpp/tests/utilities/renumber_utilities.cu deleted file mode 100644 index 306faaee1aa..00000000000 --- a/cpp/tests/utilities/renumber_utilities.cu +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include - -#include -#include - -namespace cugraph { -namespace test { - -template -std::tuple, rmm::device_uvector> unrenumber_kv_pairs( - raft::handle_t const& handle, - vertex_t const* keys, - value_t const* values, - size_t num_pairs, - vertex_t const* renumber_map_labels, - vertex_t map_key_first, - vertex_t map_key_last) -{ - rmm::device_uvector unrenumbered_keys(num_pairs, handle.get_stream_view()); - rmm::device_uvector values_for_unrenumbered_keys(num_pairs, handle.get_stream_view()); - - auto unrenumbered_key_first = - thrust::make_transform_iterator(keys, [renumber_map_labels, map_key_first] __device__(auto v) { - return renumber_map_labels[v - map_key_first]; - }); - thrust::copy(rmm::exec_policy(handle.get_stream_view()), - unrenumbered_key_first, - unrenumbered_key_first + num_pairs, - unrenumbered_keys.begin()); - thrust::copy(rmm::exec_policy(handle.get_stream_view()), - values, - values + num_pairs, - values_for_unrenumbered_keys.begin()); - - thrust::sort_by_key(rmm::exec_policy(handle.get_stream_view()), - unrenumbered_keys.begin(), - unrenumbered_keys.end(), - values_for_unrenumbered_keys.begin()); - - return std::make_tuple(std::move(unrenumbered_keys), std::move(values_for_unrenumbered_keys)); -} - -template -rmm::device_uvector sort_values_by_key(raft::handle_t const& handle, - vertex_t const* keys, - value_t const* values, - size_t num_pairs) -{ - rmm::device_uvector sorted_keys(num_pairs, handle.get_stream_view()); - rmm::device_uvector sorted_values(num_pairs, handle.get_stream_view()); - - thrust::copy( - rmm::exec_policy(handle.get_stream_view()), keys, keys + num_pairs, sorted_keys.begin()); - thrust::copy( - rmm::exec_policy(handle.get_stream_view()), values, values + num_pairs, sorted_values.begin()); - - thrust::sort_by_key(rmm::exec_policy(handle.get_stream_view()), - sorted_keys.begin(), - sorted_keys.end(), - sorted_values.begin()); - - return sorted_values; -} - -template std::tuple, rmm::device_uvector> -unrenumber_kv_pairs(raft::handle_t const& handle, - int32_t const* keys, - float const* values, - size_t num_pairs, - int32_t const* renumber_map_labels, - int32_t map_key_first, - int32_t map_key_last); - -template std::tuple, rmm::device_uvector> -unrenumber_kv_pairs(raft::handle_t const& handle, - int32_t const* keys, - double const* values, - size_t num_pairs, - int32_t const* renumber_map_labels, - int32_t map_key_first, - int32_t map_key_last); - -template std::tuple, rmm::device_uvector> -unrenumber_kv_pairs(raft::handle_t const& handle, - int64_t const* keys, - float const* values, - size_t num_pairs, - int64_t const* renumber_map_labels, - int64_t map_key_first, - int64_t map_key_last); - -template std::tuple, rmm::device_uvector> -unrenumber_kv_pairs(raft::handle_t const& handle, - int64_t const* keys, - double const* values, - size_t num_pairs, - int64_t const* renumber_map_labels, - int64_t map_key_first, - int64_t map_key_last); - -template rmm::device_uvector sort_values_by_key(raft::handle_t const& handle, - int32_t const* keys, - float const* values, - size_t num_pairs); - -template rmm::device_uvector sort_values_by_key( - raft::handle_t const& handle, int32_t const* keys, double const* values, size_t num_pairs); - -template rmm::device_uvector sort_values_by_key( - raft::handle_t const& handle, int32_t const* keys, int32_t const* values, size_t num_pairs); - -template rmm::device_uvector sort_values_by_key(raft::handle_t const& handle, - int64_t const* keys, - float const* values, - size_t num_pairs); - 
-template rmm::device_uvector sort_values_by_key( - raft::handle_t const& handle, int64_t const* keys, double const* values, size_t num_pairs); - -template rmm::device_uvector sort_values_by_key( - raft::handle_t const& handle, int64_t const* keys, int64_t const* values, size_t num_pairs); - -} // namespace test -} // namespace cugraph diff --git a/cpp/tests/utilities/thrust_wrapper.cu b/cpp/tests/utilities/thrust_wrapper.cu new file mode 100644 index 00000000000..5d32fb8a5d1 --- /dev/null +++ b/cpp/tests/utilities/thrust_wrapper.cu @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include + +namespace cugraph { +namespace test { + +template +rmm::device_uvector sort_by_key(raft::handle_t const& handle, + vertex_t const* keys, + value_t const* values, + size_t num_pairs) +{ + rmm::device_uvector sorted_keys(num_pairs, handle.get_stream_view()); + rmm::device_uvector sorted_values(num_pairs, handle.get_stream_view()); + + thrust::copy( + rmm::exec_policy(handle.get_stream_view()), keys, keys + num_pairs, sorted_keys.begin()); + thrust::copy( + rmm::exec_policy(handle.get_stream_view()), values, values + num_pairs, sorted_values.begin()); + + thrust::sort_by_key(rmm::exec_policy(handle.get_stream_view()), + sorted_keys.begin(), + sorted_keys.end(), + sorted_values.begin()); + + return sorted_values; +} + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int32_t const* keys, + float const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int32_t const* keys, + double const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int32_t const* keys, + int32_t const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int64_t const* keys, + float const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int64_t const* keys, + double const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int64_t const* keys, + int64_t const* values, + size_t num_pairs); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/utilities/renumber_utilities.hpp b/cpp/tests/utilities/thrust_wrapper.hpp similarity index 54% rename from cpp/tests/utilities/renumber_utilities.hpp rename to cpp/tests/utilities/thrust_wrapper.hpp index 01bf865e58c..579dc3c550f 100644 --- a/cpp/tests/utilities/renumber_utilities.hpp +++ b/cpp/tests/utilities/thrust_wrapper.hpp @@ -21,20 +21,10 @@ namespace cugraph { namespace test { template -std::tuple, rmm::device_uvector> unrenumber_kv_pairs( - raft::handle_t const& handle, - vertex_t const* keys /* map_key_first <= keys[] < map_key_last */, - value_t const* values, - size_t num_pairs, - vertex_t const* renumber_map_labels, - vertex_t map_key_first, - vertex_t 
map_key_last); - -template -rmm::device_uvector sort_values_by_key(raft::handle_t const& handle, - vertex_t const* keys, - value_t const* values, - size_t num_pairs); +rmm::device_uvector sort_by_key(raft::handle_t const& handle, + vertex_t const* keys, + value_t const* values, + size_t num_pairs); } // namespace test } // namespace cugraph From 0a2fbe6e092c10dcb4b22b9b498a4e53c93cea8a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 24 Mar 2021 15:03:48 -0400 Subject: [PATCH 32/63] several cosmetic and cleanup updates --- cpp/tests/experimental/bfs_test.cpp | 51 ++++++++----------- .../experimental/katz_centrality_test.cpp | 13 ++--- cpp/tests/experimental/pagerank_test.cpp | 44 ++++++++++------ cpp/tests/experimental/sssp_test.cpp | 43 +++++++--------- cpp/tests/pagerank/mg_pagerank_test.cpp | 51 ++++++++++++++----- cpp/tests/utilities/rmat_utilities.cu | 8 +-- 6 files changed, 118 insertions(+), 92 deletions(-) diff --git a/cpp/tests/experimental/bfs_test.cpp b/cpp/tests/experimental/bfs_test.cpp index d06e634f695..22889ac60a8 100644 --- a/cpp/tests/experimental/bfs_test.cpp +++ b/cpp/tests/experimental/bfs_test.cpp @@ -15,11 +15,12 @@ */ #include -#include #include +#include #include #include +#include #include #include @@ -82,7 +83,7 @@ void bfs_reference(edge_t const* offsets, typedef struct BFS_Usecase_t { cugraph::test::input_graph_specifier_t input_graph_specifier{}; - size_t source{false}; + size_t source{0}; bool check_correctness{false}; BFS_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true) @@ -174,10 +175,8 @@ class Tests_BFS : public ::testing::TestWithParam { std::find( h_renumber_map_labels.begin(), h_renumber_map_labels.end(), configuration.source))); } - ASSERT_TRUE(source >= 0 && source < graph_view.get_number_of_vertices()) - << "Starting sources should be >= 0 and" - << " less than the number of vertices in the graph."; + << "Invalid starting source."; rmm::device_uvector d_distances(graph_view.get_number_of_vertices(), handle.get_stream()); @@ -188,12 +187,11 @@ class Tests_BFS : public ::testing::TestWithParam { cugraph::experimental::bfs(handle, graph_view, - d_distances.begin(), - d_predecessors.begin(), + d_distances.data(), + d_predecessors.data(), source, false, - std::numeric_limits::max(), - false); + std::numeric_limits::max()); CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement @@ -234,13 +232,20 @@ class Tests_BFS : public ::testing::TestWithParam { std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); if (renumber) { - auto d_unrenumbered_distances = cugraph::test::sort_values_by_key( + cugraph::experimental::unrenumber_local_int_vertices(handle, + d_predecessors.data(), + d_predecessors.size(), + d_renumber_map_labels.data(), + vertex_t{0}, + graph_view.get_number_of_vertices(), + true); + + auto d_unrenumbered_distances = cugraph::test::sort_by_key( handle, d_renumber_map_labels.data(), d_distances.data(), d_renumber_map_labels.size()); - auto d_unrenumbered_predecessors = - cugraph::test::sort_values_by_key(handle, - d_renumber_map_labels.data(), - d_predecessors.data(), - d_renumber_map_labels.size()); + auto d_unrenumbered_predecessors = cugraph::test::sort_by_key(handle, + d_renumber_map_labels.data(), + d_predecessors.data(), + d_renumber_map_labels.size()); raft::update_host(h_cugraph_distances.data(), d_unrenumbered_distances.data(), d_unrenumbered_distances.size(), @@ -250,21 +255,7 @@ 
class Tests_BFS : public ::testing::TestWithParam { d_unrenumbered_predecessors.size(), handle.get_stream()); - std::vector h_renumber_map_labels(d_renumber_map_labels.size()); - raft::update_host(h_renumber_map_labels.data(), - d_renumber_map_labels.data(), - d_renumber_map_labels.size(), - handle.get_stream()); - handle.get_stream_view().synchronize(); - - std::transform( - h_cugraph_predecessors.begin(), - h_cugraph_predecessors.end(), - h_cugraph_predecessors.begin(), - [&h_renumber_map_labels](auto v) { - return v == cugraph::invalid_vertex_id::value ? v : h_renumber_map_labels[v]; - }); } else { raft::update_host( h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); @@ -284,7 +275,7 @@ class Tests_BFS : public ::testing::TestWithParam { auto i = std::distance(h_cugraph_predecessors.begin(), it); if (*it == cugraph::invalid_vertex_id::value) { ASSERT_TRUE(h_reference_predecessors[i] == *it) - << "vertex reachability do not match with the reference."; + << "vertex reachability does not match with the reference."; } else { ASSERT_TRUE(h_reference_distances[*it] + 1 == h_reference_distances[i]) << "distance to this vertex != distance to the predecessor vertex + 1."; diff --git a/cpp/tests/experimental/katz_centrality_test.cpp b/cpp/tests/experimental/katz_centrality_test.cpp index 3a929c8c294..71011f3d018 100644 --- a/cpp/tests/experimental/katz_centrality_test.cpp +++ b/cpp/tests/experimental/katz_centrality_test.cpp @@ -15,11 +15,12 @@ */ #include -#include #include +#include #include #include +#include #include #include @@ -195,7 +196,7 @@ class Tests_KatzCentrality : public ::testing::TestWithParam(nullptr), - d_katz_centralities.begin(), + d_katz_centralities.data(), alpha, beta, epsilon, @@ -255,10 +256,10 @@ class Tests_KatzCentrality : public ::testing::TestWithParam h_cugraph_katz_centralities(graph_view.get_number_of_vertices()); if (renumber) { auto d_unrenumbered_katz_centralities = - cugraph::test::sort_values_by_key(handle, - d_renumber_map_labels.data(), - d_katz_centralities.data(), - d_renumber_map_labels.size()); + cugraph::test::sort_by_key(handle, + d_renumber_map_labels.data(), + d_katz_centralities.data(), + d_renumber_map_labels.size()); raft::update_host(h_cugraph_katz_centralities.data(), d_unrenumbered_katz_centralities.data(), d_unrenumbered_katz_centralities.size(), diff --git a/cpp/tests/experimental/pagerank_test.cpp b/cpp/tests/experimental/pagerank_test.cpp index b3148166efe..649fe11d805 100644 --- a/cpp/tests/experimental/pagerank_test.cpp +++ b/cpp/tests/experimental/pagerank_test.cpp @@ -15,11 +15,12 @@ */ #include -#include #include +#include #include #include +#include #include #include @@ -278,7 +279,7 @@ class Tests_PageRank : public ::testing::TestWithParam { d_personalization_vertices.data(), d_personalization_values.data(), static_cast(d_personalization_vertices.size()), - d_pageranks.begin(), + d_pageranks.data(), alpha, epsilon, std::numeric_limits::max(), @@ -317,19 +318,32 @@ class Tests_PageRank : public ::testing::TestWithParam { std::vector h_unrenumbered_personalization_vertices( d_personalization_vertices.size()); - std::vector h_unrenumbered_personalization_values(d_personalization_values.size()); + std::vector h_unrenumbered_personalization_values( + h_unrenumbered_personalization_vertices.size()); if (renumber) { - rmm::device_uvector d_unrenumbered_personalization_vertices(0, - handle.get_stream()); - rmm::device_uvector d_unrenumbered_personalization_values(0, handle.get_stream()); - 
std::tie(d_unrenumbered_personalization_vertices, d_unrenumbered_personalization_values) = - cugraph::test::unrenumber_kv_pairs(handle, - d_personalization_vertices.data(), - d_personalization_values.data(), - d_personalization_vertices.size(), - d_renumber_map_labels.data(), - vertex_t{0}, - static_cast(d_renumber_map_labels.size())); + rmm::device_uvector d_unrenumbered_personalization_vertices( + d_personalization_vertices.size(), handle.get_stream()); + rmm::device_uvector d_unrenumbered_personalization_values( + d_unrenumbered_personalization_vertices.size(), handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_vertices.data(), + d_personalization_vertices.data(), + d_personalization_vertices.size(), + handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_values.data(), + d_personalization_values.data(), + d_personalization_values.size(), + handle.get_stream()); + cugraph::experimental::unrenumber_local_int_vertices( + handle, + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.size(), + d_renumber_map_labels.data(), + vertex_t{0}, + graph_view.get_number_of_vertices()); + cugraph::test::sort_by_key(handle, + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_vertices.size()); raft::update_host(h_unrenumbered_personalization_vertices.data(), d_unrenumbered_personalization_vertices.data(), @@ -369,7 +383,7 @@ class Tests_PageRank : public ::testing::TestWithParam { std::vector h_cugraph_pageranks(graph_view.get_number_of_vertices()); if (renumber) { - auto d_unrenumbered_pageranks = cugraph::test::sort_values_by_key( + auto d_unrenumbered_pageranks = cugraph::test::sort_by_key( handle, d_renumber_map_labels.data(), d_pageranks.data(), d_renumber_map_labels.size()); raft::update_host(h_cugraph_pageranks.data(), d_unrenumbered_pageranks.data(), diff --git a/cpp/tests/experimental/sssp_test.cpp b/cpp/tests/experimental/sssp_test.cpp index 13a58450eee..31e6c194e80 100644 --- a/cpp/tests/experimental/sssp_test.cpp +++ b/cpp/tests/experimental/sssp_test.cpp @@ -15,11 +15,12 @@ */ #include -#include #include +#include #include #include +#include #include #include @@ -88,7 +89,7 @@ void sssp_reference(edge_t const* offsets, typedef struct SSSP_Usecase_t { cugraph::test::input_graph_specifier_t input_graph_specifier{}; - size_t source{false}; + size_t source{0}; bool check_correctness{false}; SSSP_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true) @@ -192,8 +193,8 @@ class Tests_SSSP : public ::testing::TestWithParam { cugraph::experimental::sssp(handle, graph_view, - d_distances.begin(), - d_predecessors.begin(), + d_distances.data(), + d_predecessors.data(), source, std::numeric_limits::max(), false); @@ -242,13 +243,21 @@ class Tests_SSSP : public ::testing::TestWithParam { std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); if (renumber) { - auto d_unrenumbered_distances = cugraph::test::sort_values_by_key( + cugraph::experimental::unrenumber_local_int_vertices(handle, + d_predecessors.data(), + d_predecessors.size(), + d_renumber_map_labels.data(), + vertex_t{0}, + graph_view.get_number_of_vertices(), + true); + + auto d_unrenumbered_distances = cugraph::test::sort_by_key( handle, d_renumber_map_labels.data(), d_distances.data(), d_renumber_map_labels.size()); - auto d_unrenumbered_predecessors = - 
cugraph::test::sort_values_by_key(handle, - d_renumber_map_labels.data(), - d_predecessors.data(), - d_renumber_map_labels.size()); + auto d_unrenumbered_predecessors = cugraph::test::sort_by_key(handle, + d_renumber_map_labels.data(), + d_predecessors.data(), + d_renumber_map_labels.size()); + raft::update_host(h_cugraph_distances.data(), d_unrenumbered_distances.data(), d_unrenumbered_distances.size(), @@ -258,21 +267,7 @@ class Tests_SSSP : public ::testing::TestWithParam { d_unrenumbered_predecessors.size(), handle.get_stream()); - std::vector h_renumber_map_labels(d_renumber_map_labels.size()); - raft::update_host(h_renumber_map_labels.data(), - d_renumber_map_labels.data(), - d_renumber_map_labels.size(), - handle.get_stream()); - handle.get_stream_view().synchronize(); - - std::transform( - h_cugraph_predecessors.begin(), - h_cugraph_predecessors.end(), - h_cugraph_predecessors.begin(), - [&h_renumber_map_labels](auto v) { - return v == cugraph::invalid_vertex_id::value ? v : h_renumber_map_labels[v]; - }); } else { raft::update_host( h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); diff --git a/cpp/tests/pagerank/mg_pagerank_test.cpp b/cpp/tests/pagerank/mg_pagerank_test.cpp index 407e255a5bb..f7b1e8dfbb4 100644 --- a/cpp/tests/pagerank/mg_pagerank_test.cpp +++ b/cpp/tests/pagerank/mg_pagerank_test.cpp @@ -15,10 +15,13 @@ */ #include -#include #include +#include #include +#include +#include +#include #include #include @@ -200,7 +203,7 @@ class Tests_MGPageRank : public ::testing::TestWithParam { d_mg_personalization_vertices.data(), d_mg_personalization_values.data(), static_cast(d_mg_personalization_vertices.size()), - d_mg_pageranks.begin(), + d_mg_pageranks.data(), alpha, epsilon, std::numeric_limits::max(), @@ -224,17 +227,32 @@ class Tests_MGPageRank : public ::testing::TestWithParam { rmm::device_uvector d_sg_personalization_vertices(0, handle.get_stream()); rmm::device_uvector d_sg_personalization_values(0, handle.get_stream()); if (configuration.personalization_ratio > 0.0) { - rmm::device_uvector d_unrenumbered_personalization_vertices(0, - handle.get_stream()); - rmm::device_uvector d_unrenumbered_personalization_values(0, handle.get_stream()); - std::tie(d_unrenumbered_personalization_vertices, d_unrenumbered_personalization_values) = - cugraph::test::unrenumber_kv_pairs(handle, - d_mg_personalization_vertices.data(), - d_mg_personalization_values.data(), - d_mg_personalization_vertices.size(), - d_mg_renumber_map_labels.data(), - mg_graph_view.get_local_vertex_first(), - mg_graph_view.get_local_vertex_last()); + rmm::device_uvector d_unrenumbered_personalization_vertices( + d_mg_personalization_vertices.size(), handle.get_stream()); + rmm::device_uvector d_unrenumbered_personalization_values( + d_unrenumbered_personalization_vertices.size(), handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_vertices.data(), + d_mg_personalization_vertices.data(), + d_mg_personalization_vertices.size(), + handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_values.data(), + d_mg_personalization_values.data(), + d_mg_personalization_values.size(), + handle.get_stream()); + + std::vector vertex_partition_lasts(comm_size); + for (size_t i = 0; i < vertex_partition_lasts.size(); ++i) { + vertex_partition_lasts[i] = mg_graph_view.get_vertex_partition_last(i); + } + cugraph::experimental::unrenumber_int_vertices( + handle, + d_unrenumbered_personalization_vertices.data(), + 
d_unrenumbered_personalization_vertices.size(), + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + handle.get_stream()); rmm::device_scalar d_local_personalization_vector_size( d_unrenumbered_personalization_vertices.size(), handle.get_stream()); @@ -265,6 +283,11 @@ class Tests_MGPageRank : public ::testing::TestWithParam { recvcounts.data(), displacements.data(), handle.get_stream()); + + cugraph::test::sort_by_key(handle, + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_vertices.size()); } // 5-3. run SG PageRank @@ -278,7 +301,7 @@ class Tests_MGPageRank : public ::testing::TestWithParam { d_sg_personalization_vertices.data(), d_sg_personalization_values.data(), static_cast(d_sg_personalization_vertices.size()), - d_sg_pageranks.begin(), + d_sg_pageranks.data(), alpha, epsilon, std::numeric_limits::max(), // max_iterations diff --git a/cpp/tests/utilities/rmat_utilities.cu b/cpp/tests/utilities/rmat_utilities.cu index be300e74760..f2707ee5f73 100644 --- a/cpp/tests/utilities/rmat_utilities.cu +++ b/cpp/tests/utilities/rmat_utilities.cu @@ -55,9 +55,11 @@ generate_graph_from_rmat_params(raft::handle_t const& handle, size_t num_partitions) { CUGRAPH_EXPECTS(!multi_gpu || renumber, "renumber should be true if multi_gpu is true."); - CUGRAPH_EXPECTS(size_t{1} << scale <= std::numeric_limits::max(), "vertex_t overflow."); - CUGRAPH_EXPECTS((size_t{1} << scale) * edge_factor <= std::numeric_limits::max(), - " edge_t overflow."); + CUGRAPH_EXPECTS(size_t{1} << scale <= static_cast(std::numeric_limits::max()), + "vertex_t overflow."); + CUGRAPH_EXPECTS( + (size_t{1} << scale) * edge_factor <= static_cast(std::numeric_limits::max()), + " edge_t overflow."); vertex_t number_of_vertices = static_cast(size_t{1} << scale); edge_t number_of_edges = From 0ab187ce465e051e3e3c3f93e787ba3e17199d84 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 24 Mar 2021 15:19:35 -0400 Subject: [PATCH 33/63] temporary workaround for a cuco bug --- ...ransform_reduce_key_aggregated_out_nbr.cuh | 7 ++++-- cpp/include/utilities/collect_comm.cuh | 22 ++++++++++++++----- cpp/src/experimental/relabel.cu | 15 ++++++++++--- cpp/src/experimental/renumber_edgelist.cu | 18 +++++++++++---- cpp/src/experimental/renumber_utils.cu | 20 ++++++++++++----- 5 files changed, 62 insertions(+), 20 deletions(-) diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 964e1741d40..de79b0b59af 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -206,8 +206,11 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( // 1. build a cuco::static_map object for the k, v pairs. 
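  // For orientation, the capacity arithmetic below (illustration only; 0.7 is an
  // assumed load_factor for this example, not necessarily the value used here):
  //   1000 keys -> std::max(static_cast<size_t>(1000 / 0.7), size_t{1}) == 1428 slots
  //      0 keys -> std::max(static_cast<size_t>(0 / 0.7),    size_t{1}) ==    1 slot
  // i.e. the clamp guarantees a non-zero capacity even for an empty key range, working
  // around the cuco::static_map issue referenced in the FIXME below.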
auto kv_map_ptr = std::make_unique>( - static_cast(static_cast(thrust::distance(map_key_first, map_key_last)) / - load_factor), + // FIXME: std::max(..., size_t{1}) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 + std::max(static_cast( + static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), + size_t{1}), invalid_vertex_id::value, invalid_vertex_id::value); auto pair_first = thrust::make_transform_iterator( diff --git a/cpp/include/utilities/collect_comm.cuh b/cpp/include/utilities/collect_comm.cuh index ab4a070cb79..c69276cca67 100644 --- a/cpp/include/utilities/collect_comm.cuh +++ b/cpp/include/utilities/collect_comm.cuh @@ -64,8 +64,11 @@ collect_values_for_keys(raft::comms::comms_t const &comm, // 1. build a cuco::static_map object for the map k, v pairs. auto kv_map_ptr = std::make_unique>( - static_cast(static_cast(thrust::distance(map_key_first, map_key_last)) / - load_factor), + // FIXME: std::max(..., size_t{1}) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 + std::max(static_cast( + static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), + size_t{1}), invalid_vertex_id::value, invalid_vertex_id::value); { @@ -123,7 +126,9 @@ collect_values_for_keys(raft::comms::comms_t const &comm, kv_map_ptr.reset(); kv_map_ptr = std::make_unique>( - static_cast(static_cast(unique_keys.size()) / load_factor), + // FIXME: std::max(..., size_t{1}) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 + std::max(static_cast(static_cast(unique_keys.size()) / load_factor), size_t{1}), invalid_vertex_id::value, invalid_vertex_id::value); { @@ -178,8 +183,11 @@ collect_values_for_unique_keys(raft::comms::comms_t const &comm, // 1. build a cuco::static_map object for the map k, v pairs. 
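  // Here, too, [map_key_first, map_key_last) may be empty on a rank that holds no
  // local keys, in which case the unclamped capacity would be 0; hence the same
  // std::max(..., size_t{1}) clamp as above (see the FIXME that follows).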
auto kv_map_ptr = std::make_unique>( - static_cast(static_cast(thrust::distance(map_key_first, map_key_last)) / - load_factor), + // FIXME: std::max(..., size_t{1}) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 + std::max(static_cast( + static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), + size_t{1}), invalid_vertex_id::value, invalid_vertex_id::value); { @@ -233,7 +241,9 @@ collect_values_for_unique_keys(raft::comms::comms_t const &comm, kv_map_ptr.reset(); kv_map_ptr = std::make_unique>( - static_cast(static_cast(unique_keys.size()) / load_factor), + // FIXME: std::max(..., size_t{1}) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 + std::max(static_cast(static_cast(unique_keys.size()) / load_factor), size_t{1}), invalid_vertex_id::value, invalid_vertex_id::value); { diff --git a/cpp/src/experimental/relabel.cu b/cpp/src/experimental/relabel.cu index 31133ae64fa..362d09c61c3 100644 --- a/cpp/src/experimental/relabel.cu +++ b/cpp/src/experimental/relabel.cu @@ -121,7 +121,11 @@ void relabel(raft::handle_t const& handle, handle.get_stream())); // cuco::static_map currently does not take stream cuco::static_map relabel_map{ - static_cast(static_cast(rx_label_pair_old_labels.size()) / load_factor), + // FIXME: std::max(..., size_t{1}) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 + std::max( + static_cast(static_cast(rx_label_pair_old_labels.size()) / load_factor), + size_t{1}), invalid_vertex_id::value, invalid_vertex_id::value}; @@ -165,7 +169,10 @@ void relabel(raft::handle_t const& handle, } cuco::static_map relabel_map( - static_cast(static_cast(unique_old_labels.size()) / load_factor), + // FIXME: std::max(..., size_t{1}) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 + std::max(static_cast(static_cast(unique_old_labels.size()) / load_factor), + size_t{1}), invalid_vertex_id::value, invalid_vertex_id::value); @@ -180,7 +187,9 @@ void relabel(raft::handle_t const& handle, relabel_map.find(labels, labels + num_labels, labels); } else { cuco::static_map relabel_map( - static_cast(static_cast(num_label_pairs) / load_factor), + // FIXME: std::max(..., size_t{1}) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 + std::max(static_cast(static_cast(num_label_pairs) / load_factor), size_t{1}), invalid_vertex_id::value, invalid_vertex_id::value); diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu index a68cb8a7d9c..9960df92b4f 100644 --- a/cpp/src/experimental/renumber_edgelist.cu +++ b/cpp/src/experimental/renumber_edgelist.cu @@ -558,8 +558,11 @@ renumber_edgelist(raft::handle_t const& handle, handle.get_stream())); // cuco::static_map currently does not take stream cuco::static_map renumber_map{ - static_cast(static_cast(partition.get_matrix_partition_major_size(i)) / - load_factor), + // FIXME: std::max(..., size_t{1}) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 + std::max(static_cast( + static_cast(partition.get_matrix_partition_major_size(i)) / load_factor), + size_t{1}), invalid_vertex_id::value, invalid_vertex_id::value}; auto pair_first = thrust::make_transform_iterator( @@ -596,7 +599,11 @@ renumber_edgelist(raft::handle_t const& handle, handle.get_stream())); // cuco::static_map currently does not take stream cuco::static_map renumber_map{ - 
static_cast(static_cast(renumber_map_minor_labels.size()) / load_factor), + // FIXME: std::max(..., size_t{1}) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 + std::max( + static_cast(static_cast(renumber_map_minor_labels.size()) / load_factor), + size_t{1}), invalid_vertex_id::value, invalid_vertex_id::value}; auto pair_first = thrust::make_transform_iterator( @@ -664,7 +671,10 @@ std::enable_if_t> renumber_edgelist( // footprint and execution time cuco::static_map renumber_map{ - static_cast(static_cast(renumber_map_labels.size()) / load_factor), + // FIXME: std::max(..., size_t{1}) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 + std::max(static_cast(static_cast(renumber_map_labels.size()) / load_factor), + size_t{1}), invalid_vertex_id::value, invalid_vertex_id::value}; auto pair_first = thrust::make_transform_iterator( diff --git a/cpp/src/experimental/renumber_utils.cu b/cpp/src/experimental/renumber_utils.cu index 606fd3d32dc..93362a9c9a1 100644 --- a/cpp/src/experimental/renumber_utils.cu +++ b/cpp/src/experimental/renumber_utils.cu @@ -108,7 +108,11 @@ void renumber_ext_vertices(raft::handle_t const& handle, renumber_map_ptr.reset(); renumber_map_ptr = std::make_unique>( - static_cast(static_cast(sorted_unique_ext_vertices.size()) / load_factor), + // FIXME: std::max(..., size_t{1}) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 + std::max( + static_cast(static_cast(sorted_unique_ext_vertices.size()) / load_factor), + size_t{1}), invalid_vertex_id::value, invalid_vertex_id::value); @@ -123,8 +127,11 @@ void renumber_ext_vertices(raft::handle_t const& handle, renumber_map_ptr.reset(); renumber_map_ptr = std::make_unique>( - static_cast(static_cast(local_int_vertex_last - local_int_vertex_first) / - load_factor), + // FIXME: std::max(..., size_t{1}) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 + std::max(static_cast( + static_cast(local_int_vertex_last - local_int_vertex_first) / load_factor), + size_t{1}), invalid_vertex_id::value, invalid_vertex_id::value); @@ -300,8 +307,11 @@ void unrenumber_int_vertices(raft::handle_t const& handle, handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream cuco::static_map unrenumber_map( - static_cast( - static_cast(static_cast(sorted_unique_int_vertices.size()) / load_factor)), + // FIXME: std::max(..., size_t{1}) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 + std::max( + static_cast(static_cast(sorted_unique_int_vertices.size()) / load_factor), + size_t{1}), invalid_vertex_id::value, invalid_vertex_id::value); From f88de2b2956f33ada80e569c40d56f6e89231f51 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 24 Mar 2021 23:56:17 -0400 Subject: [PATCH 34/63] minor tweaks --- cpp/src/experimental/renumber_utils.cu | 4 +-- cpp/tests/experimental/bfs_test.cpp | 35 +++++++++++------------ cpp/tests/experimental/sssp_test.cpp | 39 ++++++++++++-------------- 3 files changed, 37 insertions(+), 41 deletions(-) diff --git a/cpp/src/experimental/renumber_utils.cu b/cpp/src/experimental/renumber_utils.cu index 93362a9c9a1..ac34e71c03d 100644 --- a/cpp/src/experimental/renumber_utils.cu +++ b/cpp/src/experimental/renumber_utils.cu @@ -229,9 +229,9 @@ void unrenumber_int_vertices(raft::handle_t const& handle, thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), vertices, vertices + 
num_vertices, - [num_vertices = vertex_partition_lasts.back()] __device__(auto v) { + [int_vertex_last = vertex_partition_lasts.back()] __device__(auto v) { return v != invalid_vertex_id::value && - !is_valid_vertex(num_vertices, v); + !is_valid_vertex(int_vertex_last, v); }) == 0, "Invalid input arguments: there are out-of-range vertices in [vertices, vertices " "+ num_vertices)."); diff --git a/cpp/tests/experimental/bfs_test.cpp b/cpp/tests/experimental/bfs_test.cpp index 22889ac60a8..8fce9488d8a 100644 --- a/cpp/tests/experimental/bfs_test.cpp +++ b/cpp/tests/experimental/bfs_test.cpp @@ -160,22 +160,8 @@ class Tests_BFS : public ::testing::TestWithParam { read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - auto source = static_cast(configuration.source); - if (renumber) { - std::vector h_renumber_map_labels(d_renumber_map_labels.size()); - raft::update_host(h_renumber_map_labels.data(), - d_renumber_map_labels.data(), - d_renumber_map_labels.size(), - handle.get_stream()); - - handle.get_stream_view().synchronize(); - - source = static_cast(thrust::distance( - h_renumber_map_labels.begin(), - std::find( - h_renumber_map_labels.begin(), h_renumber_map_labels.end(), configuration.source))); - } - ASSERT_TRUE(source >= 0 && source < graph_view.get_number_of_vertices()) + ASSERT_TRUE(static_cast(configuration.source) >= 0 && + static_cast(configuration.source) < graph_view.get_number_of_vertices()) << "Invalid starting source."; rmm::device_uvector d_distances(graph_view.get_number_of_vertices(), @@ -189,7 +175,7 @@ class Tests_BFS : public ::testing::TestWithParam { graph_view, d_distances.data(), d_predecessors.data(), - source, + static_cast(configuration.source), false, std::numeric_limits::max()); @@ -217,6 +203,19 @@ class Tests_BFS : public ::testing::TestWithParam { handle.get_stream_view().synchronize(); + auto unrenumbered_source = static_cast(configuration.source); + if (renumber) { + std::vector h_renumber_map_labels(d_renumber_map_labels.size()); + raft::update_host(h_renumber_map_labels.data(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + unrenumbered_source = h_renumber_map_labels[configuration.source]; + } + std::vector h_reference_distances(unrenumbered_graph_view.get_number_of_vertices()); std::vector h_reference_predecessors( unrenumbered_graph_view.get_number_of_vertices()); @@ -226,7 +225,7 @@ class Tests_BFS : public ::testing::TestWithParam { h_reference_distances.data(), h_reference_predecessors.data(), unrenumbered_graph_view.get_number_of_vertices(), - static_cast(configuration.source), + unrenumbered_source, std::numeric_limits::max()); std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); diff --git a/cpp/tests/experimental/sssp_test.cpp b/cpp/tests/experimental/sssp_test.cpp index 31e6c194e80..9364d261dec 100644 --- a/cpp/tests/experimental/sssp_test.cpp +++ b/cpp/tests/experimental/sssp_test.cpp @@ -164,25 +164,8 @@ class Tests_SSSP : public ::testing::TestWithParam { read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - auto source = static_cast(configuration.source); - if (renumber) { - std::vector h_renumber_map_labels(d_renumber_map_labels.size()); - raft::update_host(h_renumber_map_labels.data(), - d_renumber_map_labels.data(), - d_renumber_map_labels.size(), - handle.get_stream()); - - handle.get_stream_view().synchronize(); - - source = static_cast(thrust::distance( - 
h_renumber_map_labels.begin(), - std::find( - h_renumber_map_labels.begin(), h_renumber_map_labels.end(), configuration.source))); - } - - ASSERT_TRUE(source >= 0 && source < graph_view.get_number_of_vertices()) - << "Starting sources should be >= 0 and" - << " less than the number of vertices in the graph."; + ASSERT_TRUE(static_cast(configuration.source) >= 0 && + static_cast(configuration.source) < graph_view.get_number_of_vertices()); rmm::device_uvector d_distances(graph_view.get_number_of_vertices(), handle.get_stream()); @@ -195,7 +178,7 @@ class Tests_SSSP : public ::testing::TestWithParam { graph_view, d_distances.data(), d_predecessors.data(), - source, + static_cast(configuration.source), std::numeric_limits::max(), false); @@ -228,6 +211,19 @@ class Tests_SSSP : public ::testing::TestWithParam { handle.get_stream_view().synchronize(); + auto unrenumbered_source = static_cast(configuration.source); + if (renumber) { + std::vector h_renumber_map_labels(d_renumber_map_labels.size()); + raft::update_host(h_renumber_map_labels.data(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + unrenumbered_source = h_renumber_map_labels[configuration.source]; + } + std::vector h_reference_distances(unrenumbered_graph_view.get_number_of_vertices()); std::vector h_reference_predecessors( unrenumbered_graph_view.get_number_of_vertices()); @@ -238,7 +234,8 @@ class Tests_SSSP : public ::testing::TestWithParam { h_reference_distances.data(), h_reference_predecessors.data(), unrenumbered_graph_view.get_number_of_vertices(), - static_cast(configuration.source)); + unrenumbered_source, + std::numeric_limits::max()); std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); From ddb735c1f52f2501278057abaf0e0b09ce1a9a84 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 25 Mar 2021 13:38:01 -0400 Subject: [PATCH 35/63] update update_frontier_v_push_if_out_nbr to use dataframe_buffer.cuh --- .../update_frontier_v_push_if_out_nbr.cuh | 473 +++++++----------- cpp/include/utilities/dataframe_buffer.cuh | 29 +- cpp/src/experimental/bfs.cu | 6 +- cpp/src/experimental/sssp.cu | 4 +- 4 files changed, 207 insertions(+), 305 deletions(-) diff --git a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh index c4426b3598f..aab951209d3 100644 --- a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh +++ b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh @@ -25,12 +25,14 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -115,159 +117,161 @@ __global__ void for_all_frontier_row_for_all_nbr_low_degree( static_assert(sizeof(unsigned long long int) == sizeof(size_t)); auto buffer_idx = atomicAdd(reinterpret_cast(buffer_idx_ptr), static_cast(1)); - *(buffer_key_output_first + buffer_idx) = col; - *(buffer_payload_output_first + buffer_idx) = - remove_first_thrust_tuple_element()(e_op_result); + *(buffer_key_output_first + buffer_idx) = col; + *(buffer_payload_output_first + buffer_idx) = thrust::get<1>(e_op_result); } - } - idx += gridDim.x * blockDim.x; + idx += gridDim.x * blockDim.x; + } } -} -template -size_t reduce_buffer_elements(raft::handle_t const& handle, - BufferKeyOutputIterator buffer_key_output_first, - BufferPayloadOutputIterator buffer_payload_output_first, - size_t 
num_buffer_elements, - ReduceOp reduce_op) -{ - thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - buffer_key_output_first, - buffer_key_output_first + num_buffer_elements, - buffer_payload_output_first); - - if (std::is_same>::value) { - // FIXME: if ReducOp is any, we may have a cheaper alternative than sort & uique (i.e. discard - // non-first elements) - auto it = thrust::unique_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - buffer_key_output_first, - buffer_key_output_first + num_buffer_elements, - buffer_payload_output_first); - return static_cast(thrust::distance(buffer_key_output_first, thrust::get<0>(it))); - } else { - using key_t = typename std::iterator_traits::value_type; - using payload_t = typename std::iterator_traits::value_type; - // FIXME: better avoid temporary buffer or at least limit the maximum buffer size (if we adopt - // CUDA cooperative group https://devblogs.nvidia.com/cooperative-groups and global sync(), we - // can use aggregate shared memory as a temporary buffer, or we can limit the buffer size, and - // split one thrust::reduce_by_key call to multiple thrust::reduce_by_key calls if the - // temporary buffer size exceeds the maximum buffer size (may be definied as percentage of the - // system HBM size or a function of the maximum number of threads in the system)) - // FIXME: actually, we can find how many unique keys are here by now. - // FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding the - // vertex unless reduce_op is a pure function. - rmm::device_uvector keys(num_buffer_elements, handle.get_stream()); - auto value_buffer = - allocate_dataframe_buffer(num_buffer_elements, handle.get_stream()); - auto it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - buffer_key_output_first, - buffer_key_output_first + num_buffer_elements, - buffer_payload_output_first, - keys.begin(), - get_dataframe_buffer_begin(value_buffer), - thrust::equal_to(), - reduce_op); - auto num_reduced_buffer_elements = - static_cast(thrust::distance(keys.begin(), thrust::get<0>(it))); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - keys.begin(), - keys.begin() + num_reduced_buffer_elements, - buffer_key_output_first); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - get_dataframe_buffer_begin(value_buffer), - get_dataframe_buffer_begin(value_buffer) + num_reduced_buffer_elements, - buffer_payload_output_first); - return num_reduced_buffer_elements; + template + size_t reduce_buffer_elements(raft::handle_t const& handle, + BufferKeyOutputIterator buffer_key_output_first, + BufferPayloadOutputIterator buffer_payload_output_first, + size_t num_buffer_elements, + ReduceOp reduce_op) + { + thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + buffer_key_output_first, + buffer_key_output_first + num_buffer_elements, + buffer_payload_output_first); + + if (std::is_same>::value) { + // FIXME: if ReducOp is any, we may have a cheaper alternative than sort & uique (i.e. 
discard + // non-first elements) + auto it = + thrust::unique_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + buffer_key_output_first, + buffer_key_output_first + num_buffer_elements, + buffer_payload_output_first); + return static_cast(thrust::distance(buffer_key_output_first, thrust::get<0>(it))); + } else { + using key_t = typename std::iterator_traits::value_type; + using payload_t = typename std::iterator_traits::value_type; + // FIXME: better avoid temporary buffer or at least limit the maximum buffer size (if we adopt + // CUDA cooperative group https://devblogs.nvidia.com/cooperative-groups and global sync(), we + // can use aggregate shared memory as a temporary buffer, or we can limit the buffer size, and + // split one thrust::reduce_by_key call to multiple thrust::reduce_by_key calls if the + // temporary buffer size exceeds the maximum buffer size (may be definied as percentage of the + // system HBM size or a function of the maximum number of threads in the system)) + // FIXME: actually, we can find how many unique keys are here by now. + // FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding + // the vertex unless reduce_op is a pure function. + rmm::device_uvector keys(num_buffer_elements, handle.get_stream()); + auto value_buffer = + allocate_dataframe_buffer(num_buffer_elements, handle.get_stream()); + auto it = + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + buffer_key_output_first, + buffer_key_output_first + num_buffer_elements, + buffer_payload_output_first, + keys.begin(), + get_dataframe_buffer_begin(value_buffer), + thrust::equal_to(), + reduce_op); + auto num_reduced_buffer_elements = + static_cast(thrust::distance(keys.begin(), thrust::get<0>(it))); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + keys.begin(), + keys.begin() + num_reduced_buffer_elements, + buffer_key_output_first); + thrust::copy( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + get_dataframe_buffer_begin(value_buffer), + get_dataframe_buffer_begin(value_buffer) + num_reduced_buffer_elements, + buffer_payload_output_first); + return num_reduced_buffer_elements; + } } -} -template -__global__ void update_frontier_and_vertex_output_values( - vertex_partition_device_t vertex_partition, - BufferKeyInputIterator buffer_key_input_first, - BufferPayloadInputIterator buffer_payload_input_first, - size_t num_buffer_elements, - VertexValueInputIterator vertex_value_input_first, - VertexValueOutputIterator vertex_value_output_first, - vertex_t** bucket_ptrs, - size_t* bucket_sizes_ptr, - size_t invalid_bucket_idx, - vertex_t invalid_vertex, - VertexOp v_op) -{ - static_assert(std::is_same::value_type, - vertex_t>::value); - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - size_t idx = tid; - size_t block_idx = blockIdx.x; - // FIXME: it might be more performant to process more than one element per thread - auto num_blocks = (num_buffer_elements + blockDim.x - 1) / blockDim.x; - - using BlockScan = - cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - - __shared__ size_t bucket_block_start_offsets[num_buckets]; - - size_t bucket_block_local_offsets[num_buckets]; - size_t bucket_block_aggregate_sizes[num_buckets]; - - while (block_idx < num_blocks) { - for (size_t i = 0; i < num_buckets; ++i) { bucket_block_local_offsets[i] = 0; } - - size_t selected_bucket_idx{invalid_bucket_idx}; - vertex_t key{invalid_vertex}; - - if 
(idx < num_buffer_elements) { - key = *(buffer_key_input_first + idx); - auto key_offset = vertex_partition.get_local_vertex_offset_from_vertex_nocheck(key); - auto v_val = *(vertex_value_input_first + key_offset); - auto payload = *(buffer_payload_input_first + idx); - auto v_op_result = v_op(v_val, payload); - selected_bucket_idx = thrust::get<0>(v_op_result); - if (selected_bucket_idx != invalid_bucket_idx) { - *(vertex_value_output_first + key_offset) = - remove_first_thrust_tuple_element()(v_op_result); - bucket_block_local_offsets[selected_bucket_idx] = 1; + template + __global__ void update_frontier_and_vertex_output_values( + vertex_partition_device_t vertex_partition, + BufferKeyInputIterator buffer_key_input_first, + BufferPayloadInputIterator buffer_payload_input_first, + size_t num_buffer_elements, + VertexValueInputIterator vertex_value_input_first, + VertexValueOutputIterator vertex_value_output_first, + vertex_t * *bucket_ptrs, + size_t * bucket_sizes_ptr, + size_t invalid_bucket_idx, + vertex_t invalid_vertex, + VertexOp v_op) + { + static_assert(std::is_same::value_type, + vertex_t>::value); + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + size_t idx = tid; + size_t block_idx = blockIdx.x; + // FIXME: it might be more performant to process more than one element per thread + auto num_blocks = (num_buffer_elements + blockDim.x - 1) / blockDim.x; + + using BlockScan = + cub::BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + __shared__ size_t bucket_block_start_offsets[num_buckets]; + + size_t bucket_block_local_offsets[num_buckets]; + size_t bucket_block_aggregate_sizes[num_buckets]; + + while (block_idx < num_blocks) { + for (size_t i = 0; i < num_buckets; ++i) { bucket_block_local_offsets[i] = 0; } + + size_t selected_bucket_idx{invalid_bucket_idx}; + vertex_t key{invalid_vertex}; + + if (idx < num_buffer_elements) { + key = *(buffer_key_input_first + idx); + auto key_offset = vertex_partition.get_local_vertex_offset_from_vertex_nocheck(key); + auto v_val = *(vertex_value_input_first + key_offset); + auto payload = *(buffer_payload_input_first + idx); + auto v_op_result = v_op(v_val, payload); + selected_bucket_idx = thrust::get<0>(v_op_result); + if (selected_bucket_idx != invalid_bucket_idx) { + *(vertex_value_output_first + key_offset) = thrust::get<1>(v_op_result); + bucket_block_local_offsets[selected_bucket_idx] = 1; + } } - } - for (size_t i = 0; i < num_buckets; ++i) { - BlockScan(temp_storage) - .ExclusiveSum(bucket_block_local_offsets[i], - bucket_block_local_offsets[i], - bucket_block_aggregate_sizes[i]); - } - - if (threadIdx.x == 0) { for (size_t i = 0; i < num_buckets; ++i) { - static_assert(sizeof(unsigned long long int) == sizeof(size_t)); - bucket_block_start_offsets[i] = - atomicAdd(reinterpret_cast(bucket_sizes_ptr + i), - static_cast(bucket_block_aggregate_sizes[i])); + BlockScan(temp_storage) + .ExclusiveSum(bucket_block_local_offsets[i], + bucket_block_local_offsets[i], + bucket_block_aggregate_sizes[i]); } - } - __syncthreads(); + if (threadIdx.x == 0) { + for (size_t i = 0; i < num_buckets; ++i) { + static_assert(sizeof(unsigned long long int) == sizeof(size_t)); + bucket_block_start_offsets[i] = + atomicAdd(reinterpret_cast(bucket_sizes_ptr + i), + static_cast(bucket_block_aggregate_sizes[i])); + } + } - // FIXME: better use shared memory buffer to aggreaget global memory writes - if (selected_bucket_idx != invalid_bucket_idx) { - bucket_ptrs[selected_bucket_idx][bucket_block_start_offsets[selected_bucket_idx] + - 
bucket_block_local_offsets[selected_bucket_idx]] = key; - } + __syncthreads(); + + // FIXME: better use shared memory buffer to aggreaget global memory writes + if (selected_bucket_idx != invalid_bucket_idx) { + bucket_ptrs[selected_bucket_idx][bucket_block_start_offsets[selected_bucket_idx] + + bucket_block_local_offsets[selected_bucket_idx]] = key; + } - idx += gridDim.x * blockDim.x; - block_idx += gridDim.x; + idx += gridDim.x * blockDim.x; + block_idx += gridDim.x; + } } -} } // namespace detail @@ -349,13 +353,16 @@ void update_frontier_v_push_if_out_nbr( static_assert(!GraphViewType::is_adj_matrix_transposed, "GraphViewType should support the push model."); - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using payload_t = typename ReduceOp::type; // 1. fill the buffer - vertex_frontier.set_buffer_idx_value(0); - + rmm::device_uvector keys(size_t{0}, handle.get_stream()); + auto payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + rmm::device_scalar buffer_idx(size_t{0}, handle.get_stream()); for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { matrix_partition_device_t matrix_partition(graph_view, i); @@ -425,10 +432,8 @@ void update_frontier_v_push_if_out_nbr( // locking. // FIXME: if i != 0, this will require costly reallocation if we don't use the new CUDA feature // to reserve address space. - vertex_frontier.resize_buffer(vertex_frontier.get_buffer_idx_value() + max_pushes); - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first); - auto buffer_payload_first = std::get<1>(buffer_first); + keys.resize(buffer_idx.value() + max_pushes, handle.get_stream()); + resize_dataframe_buffer(payload_buffer, keys.size(), handle.get_stream()); auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed ? vertex_t{0} @@ -453,9 +458,9 @@ void update_frontier_v_push_if_out_nbr( frontier_rows.end(), adj_matrix_row_value_input_first + row_value_input_offset, adj_matrix_col_value_input_first, - buffer_key_first, - buffer_payload_first, - vertex_frontier.get_buffer_idx_ptr(), + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + buffer_idx.data(), e_op); } else { detail::for_all_frontier_row_for_all_nbr_low_degree<<(payload_buffer), + buffer_idx.data(), e_op); } } @@ -477,18 +482,12 @@ void update_frontier_v_push_if_out_nbr( // 2. 
reduce the buffer - auto num_buffer_offset = edge_t{0}; - - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - - auto num_buffer_elements = detail::reduce_buffer_elements(handle, - buffer_key_first, - buffer_payload_first, - vertex_frontier.get_buffer_idx_value(), - reduce_op); - + auto num_buffer_elements = + detail::reduce_buffer_elements(handle, + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + buffer_idx.value(), + reduce_op); if (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); @@ -510,8 +509,8 @@ void update_frontier_v_push_if_out_nbr( rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), handle.get_stream()); thrust::lower_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - buffer_key_first, - buffer_key_first + num_buffer_elements, + keys.begin(), + keys.begin() + num_buffer_elements, d_vertex_lasts.begin(), d_vertex_lasts.end(), d_tx_buffer_last_boundaries.begin()); @@ -520,113 +519,35 @@ void update_frontier_v_push_if_out_nbr( d_tx_buffer_last_boundaries.data(), d_tx_buffer_last_boundaries.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + handle.get_stream_view().synchronize(); std::vector tx_counts(h_tx_buffer_last_boundaries.size()); std::adjacent_difference( h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); - std::vector rx_counts(row_comm_size); - std::vector count_requests(tx_counts.size() + rx_counts.size()); - size_t tx_self_i = std::numeric_limits::max(); - for (size_t i = 0; i < tx_counts.size(); ++i) { - auto comm_dst_rank = col_comm_rank * row_comm_size + static_cast(i); - if (comm_dst_rank == comm_rank) { - tx_self_i = i; - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms - count_requests[i] = std::numeric_limits::max(); - } else { - comm.isend(&tx_counts[i], 1, comm_dst_rank, 0 /* tag */, count_requests.data() + i); - } - } - for (size_t i = 0; i < rx_counts.size(); ++i) { - auto comm_src_rank = col_comm_rank * row_comm_size + static_cast(i); - if (comm_src_rank == comm_rank) { - assert(tx_self_i != std::numeric_limits::max()); - rx_counts[i] = tx_counts[tx_self_i]; - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms - count_requests[tx_counts.size() + i] = std::numeric_limits::max(); - } else { - comm.irecv(&rx_counts[i], - 1, - comm_src_rank, - 0 /* tag */, - count_requests.data() + tx_counts.size() + i); - } - } - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms, if - // raft::comms::wait immediately returns on seeing request_null, this remove is unnecessary - count_requests.erase(std::remove(count_requests.begin(), - count_requests.end(), - std::numeric_limits::max()), - count_requests.end()); - comm.waitall(count_requests.size(), count_requests.data()); - - std::vector tx_offsets(tx_counts.size() + 1, edge_t{0}); - std::partial_sum(tx_counts.begin(), tx_counts.end(), tx_offsets.begin() + 1); - std::vector rx_offsets(rx_counts.size() + 1, edge_t{0}); - std::partial_sum(rx_counts.begin(), rx_counts.end(), rx_offsets.begin() + 1); - - // FIXME: this will require costly reallocation if we don't use the new CUDA feature to reserve - // address space. 
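
// Aside: the send-count computation above exploits the fact that the buffer keys are sorted
// after the reduce step, so the keys headed to each rank form one contiguous range;
// thrust::lower_bound on the per-rank vertex range boundaries yields the end of each range, and
// adjacent differences of those boundaries are the send counts. A minimal host-side sketch of
// the same idea (std::lower_bound standing in for thrust::lower_bound; the key and boundary
// values are made up for illustration):

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> sorted_keys{0, 1, 4, 4, 7, 9};  // sorted (reduced) buffer keys
  std::vector<int> vertex_lasts{3, 6, 10};         // one-past-the-last vertex of each rank
  std::vector<std::size_t> tx_buffer_last_boundaries(vertex_lasts.size());
  for (std::size_t i = 0; i < vertex_lasts.size(); ++i) {
    tx_buffer_last_boundaries[i] =
      std::lower_bound(sorted_keys.begin(), sorted_keys.end(), vertex_lasts[i]) -
      sorted_keys.begin();
  }
  // boundaries are {2, 4, 6}; adjacent differences give per-rank send counts {2, 2, 2}
  std::vector<std::size_t> tx_counts(tx_buffer_last_boundaries.size());
  std::adjacent_difference(
    tx_buffer_last_boundaries.begin(), tx_buffer_last_boundaries.end(), tx_counts.begin());
  return 0;
}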
- // FIXME: std::max(actual size, 1) as ncclRecv currently hangs if recvuff is nullptr even if - // count is 0 - vertex_frontier.resize_buffer(std::max(num_buffer_elements + rx_offsets.back(), size_t(1))); - - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - - std::vector tx_dst_ranks(tx_counts.size()); - std::vector rx_src_ranks(rx_counts.size()); - for (size_t i = 0; i < tx_dst_ranks.size(); ++i) { - tx_dst_ranks[i] = col_comm_rank * row_comm_size + static_cast(i); - } - for (size_t i = 0; i < rx_src_ranks.size(); ++i) { - rx_src_ranks[i] = col_comm_rank * row_comm_size + static_cast(i); - } - - device_multicast_sendrecv( - comm, - buffer_key_first, - tx_counts, - tx_offsets, - tx_dst_ranks, - buffer_key_first + num_buffer_elements, - rx_counts, - rx_offsets, - rx_src_ranks, - handle.get_stream()); - device_multicast_sendrecv( - comm, - buffer_payload_first, - tx_counts, - tx_offsets, - tx_dst_ranks, - buffer_payload_first + num_buffer_elements, - rx_counts, - rx_offsets, - rx_src_ranks, - handle.get_stream()); - - // FIXME: this does not exploit the fact that each segment is sorted. Lost performance - // optimization opportunities. - // FIXME: we can use [vertex_frontier.buffer_begin(), vertex_frontier.buffer_begin() + - // num_buffer_elements) as temporary buffer inside reduce_buffer_elements(). - num_buffer_offset = num_buffer_elements; - num_buffer_elements = detail::reduce_buffer_elements(handle, - buffer_key_first + num_buffer_elements, - buffer_payload_first + num_buffer_elements, - rx_offsets.back(), - reduce_op); + rmm::device_uvector rx_keys(size_t{0}, handle.get_stream()); + std::tie(rx_keys, std::ignore) = + shuffle_values(row_comm, keys.begin(), tx_counts, handle.get_stream()); + keys = std::move(rx_keys); + + auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, std::ignore) = + shuffle_values(row_comm, + get_dataframe_buffer_begin(payload_buffer), + tx_counts, + handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + + num_buffer_elements = + detail::reduce_buffer_elements(handle, + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + keys.size(), + reduce_op); } // 3. update vertex properties if (num_buffer_elements > 0) { - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - raft::grid_1d_thread_t update_grid(num_buffer_elements, detail::update_frontier_v_push_if_out_nbr_update_block_size, handle.get_device_properties().maxGridSize[0]); @@ -640,8 +561,8 @@ void update_frontier_v_push_if_out_nbr( detail::update_frontier_and_vertex_output_values <<>>( vertex_partition, - buffer_key_first, - buffer_payload_first, + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), num_buffer_elements, vertex_value_input_first, vertex_value_output_first, @@ -664,21 +585,5 @@ void update_frontier_v_push_if_out_nbr( } } -/* - -FIXME: - -iterating over lower triangular (or upper triangular) : triangle counting -LRB might be necessary if the cost of processing an edge (i, j) is a function of degree(i) and -degree(j) : triangle counting -push-pull switching support (e.g. 
DOBFS), in this case, we need both -CSR & CSC (trade-off execution time vs memory requirement, unless graph is symmetric) -if graph is symmetric, there will be additional optimization opportunities (e.g. in-degree == -out-degree) For BFS, sending a bit vector (for the entire set of dest vertices per partitoin may -work better we can use thrust::set_intersection for triangle counting think about adding thrust -wrappers for reduction functions. Can I pass nullptr for dummy -instead of thrust::make_counting_iterator(0)? -*/ - +} // namespace detail } // namespace experimental -} // namespace cugraph diff --git a/cpp/include/utilities/dataframe_buffer.cuh b/cpp/include/utilities/dataframe_buffer.cuh index 06352b8e217..e59b12f2a80 100644 --- a/cpp/include/utilities/dataframe_buffer.cuh +++ b/cpp/include/utilities/dataframe_buffer.cuh @@ -47,21 +47,19 @@ auto allocate_dataframe_buffer_tuple_impl(std::index_sequence, } template -void resize_dataframe_buffer_tuple_element_impl(BufferType& buffer, - size_t new_buffer_size, - cudaStream_t stream) -{ - std::get(buffer).resize(new_buffer_size, stream); - resize_dataframe_buffer_tuple_element_impl( - buffer, new_buffer_size, stream); -} +struct resize_dataframe_buffer_tuple_iterator_element_impl { + void run(BufferType& buffer, size_t new_buffer_size, cudaStream_t stream) + { + std::get(buffer).resize(new_buffer_size, stream); + resize_dataframe_buffer_tuple_iterator_element_impl().run( + buffer, new_buffer_size, stream); + } +}; template -void resize_dataframe_buffer_tuple_impl(BufferType& buffer, - size_t new_buffer_size, - cudaStream_t stream) -{ -} +struct resize_dataframe_buffer_tuple_iterator_element_impl { + void run(BufferType& buffer, size_t new_buffer_size, cudaStream_t stream) {} +}; template auto get_dataframe_buffer_begin_tuple_element_impl(BufferType& buffer) @@ -108,8 +106,9 @@ template ::value; - detail::resize_dataframe_buffer_tuple_impl( - buffer, new_buffer_size, stream); + detail:: + resize_dataframe_buffer_tuple_iterator_element_impl() + .run(buffer, new_buffer_size, stream); } template >(), + reduce_op::any(), distances, thrust::make_zip_iterator(thrust::make_tuple(distances, predecessor_first)), vertex_frontier, @@ -145,7 +143,7 @@ void bfs(raft::handle_t const &handle, auto idx = (v_val == invalid_distance) ? static_cast(Bucket::cur) : VertexFrontier, vertex_t>::kInvalidBucketIdx; - return thrust::make_tuple(idx, depth + 1, thrust::get<0>(pushed_val)); + return thrust::make_tuple(idx, thrust::make_tuple(depth + 1, pushed_val)); }); auto new_vertex_frontier_aggregate_size = diff --git a/cpp/src/experimental/sssp.cu b/cpp/src/experimental/sssp.cu index 4996b3734cb..e62ca32ba7a 100644 --- a/cpp/src/experimental/sssp.cu +++ b/cpp/src/experimental/sssp.cu @@ -188,7 +188,7 @@ void sssp(raft::handle_t const &handle, threshold = old_distance < threshold ? old_distance : threshold; } if (new_distance >= threshold) { push = false; } - return thrust::make_tuple(push, new_distance, src); + return thrust::make_tuple(push, thrust::make_tuple(new_distance, src)); }, reduce_op::min>(), distances, @@ -200,7 +200,7 @@ void sssp(raft::handle_t const &handle, ? (new_dist < near_far_threshold ? 
static_cast(Bucket::new_near) : static_cast(Bucket::far)) : VertexFrontier, vertex_t>::kInvalidBucketIdx; - return thrust::make_tuple(idx, thrust::get<0>(pushed_val), thrust::get<1>(pushed_val)); + return thrust::make_tuple(idx, pushed_val); }); vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).clear(); From f7b7471c2345c2d238ca172b070dd6c5abdd13f7 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 25 Mar 2021 22:03:26 -0400 Subject: [PATCH 36/63] update buffer use in update_frontier_v_push_if_out_nbr --- .../update_frontier_v_push_if_out_nbr.cuh | 279 +++++++++--------- cpp/include/patterns/vertex_frontier.cuh | 109 +------ cpp/src/experimental/bfs.cu | 5 +- cpp/src/experimental/graph_view.cu | 8 +- cpp/src/experimental/sssp.cu | 7 +- 5 files changed, 148 insertions(+), 260 deletions(-) diff --git a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh index aab951209d3..4d557b97a30 100644 --- a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh +++ b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh @@ -120,158 +120,153 @@ __global__ void for_all_frontier_row_for_all_nbr_low_degree( *(buffer_key_output_first + buffer_idx) = col; *(buffer_payload_output_first + buffer_idx) = thrust::get<1>(e_op_result); } - - idx += gridDim.x * blockDim.x; } + idx += gridDim.x * blockDim.x; } +} - template - size_t reduce_buffer_elements(raft::handle_t const& handle, - BufferKeyOutputIterator buffer_key_output_first, - BufferPayloadOutputIterator buffer_payload_output_first, - size_t num_buffer_elements, - ReduceOp reduce_op) - { - thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - buffer_key_output_first, - buffer_key_output_first + num_buffer_elements, - buffer_payload_output_first); - - if (std::is_same>::value) { - // FIXME: if ReducOp is any, we may have a cheaper alternative than sort & uique (i.e. discard - // non-first elements) - auto it = - thrust::unique_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - buffer_key_output_first, - buffer_key_output_first + num_buffer_elements, - buffer_payload_output_first); - return static_cast(thrust::distance(buffer_key_output_first, thrust::get<0>(it))); - } else { - using key_t = typename std::iterator_traits::value_type; - using payload_t = typename std::iterator_traits::value_type; - // FIXME: better avoid temporary buffer or at least limit the maximum buffer size (if we adopt - // CUDA cooperative group https://devblogs.nvidia.com/cooperative-groups and global sync(), we - // can use aggregate shared memory as a temporary buffer, or we can limit the buffer size, and - // split one thrust::reduce_by_key call to multiple thrust::reduce_by_key calls if the - // temporary buffer size exceeds the maximum buffer size (may be definied as percentage of the - // system HBM size or a function of the maximum number of threads in the system)) - // FIXME: actually, we can find how many unique keys are here by now. - // FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding - // the vertex unless reduce_op is a pure function. 
-      rmm::device_uvector<key_t> keys(num_buffer_elements, handle.get_stream());
-      auto value_buffer =
-        allocate_dataframe_buffer<payload_t>(num_buffer_elements, handle.get_stream());
-      auto it =
-        thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-                              buffer_key_output_first,
-                              buffer_key_output_first + num_buffer_elements,
-                              buffer_payload_output_first,
-                              keys.begin(),
-                              get_dataframe_buffer_begin<payload_t>(value_buffer),
-                              thrust::equal_to<key_t>(),
-                              reduce_op);
-      auto num_reduced_buffer_elements =
-        static_cast<size_t>(thrust::distance(keys.begin(), thrust::get<0>(it)));
-      thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-                   keys.begin(),
-                   keys.begin() + num_reduced_buffer_elements,
-                   buffer_key_output_first);
-      thrust::copy(
-        rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-        get_dataframe_buffer_begin<payload_t>(value_buffer),
-        get_dataframe_buffer_begin<payload_t>(value_buffer) + num_reduced_buffer_elements,
-        buffer_payload_output_first);
-      return num_reduced_buffer_elements;
-    }
-  }
+template <typename BufferKeyOutputIterator, typename BufferPayloadOutputIterator, typename ReduceOp>
+size_t reduce_buffer_elements(raft::handle_t const& handle,
+                              BufferKeyOutputIterator buffer_key_output_first,
+                              BufferPayloadOutputIterator buffer_payload_output_first,
+                              size_t num_buffer_elements,
+                              ReduceOp reduce_op)
+{
+  thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                      buffer_key_output_first,
+                      buffer_key_output_first + num_buffer_elements,
+                      buffer_payload_output_first);
+
+  if (std::is_same<ReduceOp, reduce_op::any<typename ReduceOp::type>>::value) {
+    // FIXME: if ReduceOp is any, we may have a cheaper alternative than sort & unique (i.e.
+    // discard non-first elements)
+    auto it = thrust::unique_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                                    buffer_key_output_first,
+                                    buffer_key_output_first + num_buffer_elements,
+                                    buffer_payload_output_first);
+    return static_cast<size_t>(thrust::distance(buffer_key_output_first, thrust::get<0>(it)));
+  } else {
+    using key_t     = typename std::iterator_traits<BufferKeyOutputIterator>::value_type;
+    using payload_t = typename std::iterator_traits<BufferPayloadOutputIterator>::value_type;
+    // FIXME: better avoid temporary buffer or at least limit the maximum buffer size (if we adopt
+    // CUDA cooperative group https://devblogs.nvidia.com/cooperative-groups and global sync(), we
+    // can use aggregate shared memory as a temporary buffer, or we can limit the buffer size, and
+    // split one thrust::reduce_by_key call to multiple thrust::reduce_by_key calls if the
+    // temporary buffer size exceeds the maximum buffer size (may be defined as a percentage of the
+    // system HBM size or a function of the maximum number of threads in the system))
+    // FIXME: actually, we can find how many unique keys are here by now.
+    // FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding
+    // the vertex unless reduce_op is a pure function.
+ rmm::device_uvector keys(num_buffer_elements, handle.get_stream()); + auto value_buffer = + allocate_dataframe_buffer(num_buffer_elements, handle.get_stream()); + auto it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + buffer_key_output_first, + buffer_key_output_first + num_buffer_elements, + buffer_payload_output_first, + keys.begin(), + get_dataframe_buffer_begin(value_buffer), + thrust::equal_to(), + reduce_op); + auto num_reduced_buffer_elements = + static_cast(thrust::distance(keys.begin(), thrust::get<0>(it))); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + keys.begin(), + keys.begin() + num_reduced_buffer_elements, + buffer_key_output_first); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + get_dataframe_buffer_begin(value_buffer), + get_dataframe_buffer_begin(value_buffer) + num_reduced_buffer_elements, + buffer_payload_output_first); + return num_reduced_buffer_elements; } +} - template - __global__ void update_frontier_and_vertex_output_values( - vertex_partition_device_t vertex_partition, - BufferKeyInputIterator buffer_key_input_first, - BufferPayloadInputIterator buffer_payload_input_first, - size_t num_buffer_elements, - VertexValueInputIterator vertex_value_input_first, - VertexValueOutputIterator vertex_value_output_first, - vertex_t * *bucket_ptrs, - size_t * bucket_sizes_ptr, - size_t invalid_bucket_idx, - vertex_t invalid_vertex, - VertexOp v_op) - { - static_assert(std::is_same::value_type, - vertex_t>::value); - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - size_t idx = tid; - size_t block_idx = blockIdx.x; - // FIXME: it might be more performant to process more than one element per thread - auto num_blocks = (num_buffer_elements + blockDim.x - 1) / blockDim.x; - - using BlockScan = - cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - - __shared__ size_t bucket_block_start_offsets[num_buckets]; - - size_t bucket_block_local_offsets[num_buckets]; - size_t bucket_block_aggregate_sizes[num_buckets]; - - while (block_idx < num_blocks) { - for (size_t i = 0; i < num_buckets; ++i) { bucket_block_local_offsets[i] = 0; } - - size_t selected_bucket_idx{invalid_bucket_idx}; - vertex_t key{invalid_vertex}; - - if (idx < num_buffer_elements) { - key = *(buffer_key_input_first + idx); - auto key_offset = vertex_partition.get_local_vertex_offset_from_vertex_nocheck(key); - auto v_val = *(vertex_value_input_first + key_offset); - auto payload = *(buffer_payload_input_first + idx); - auto v_op_result = v_op(v_val, payload); - selected_bucket_idx = thrust::get<0>(v_op_result); - if (selected_bucket_idx != invalid_bucket_idx) { - *(vertex_value_output_first + key_offset) = thrust::get<1>(v_op_result); - bucket_block_local_offsets[selected_bucket_idx] = 1; - } +template +__global__ void update_frontier_and_vertex_output_values( + vertex_partition_device_t vertex_partition, + BufferKeyInputIterator buffer_key_input_first, + BufferPayloadInputIterator buffer_payload_input_first, + size_t num_buffer_elements, + VertexValueInputIterator vertex_value_input_first, + VertexValueOutputIterator vertex_value_output_first, + vertex_t** bucket_ptrs, + size_t* bucket_sizes_ptr, + size_t invalid_bucket_idx, + vertex_t invalid_vertex, + VertexOp v_op) +{ + static_assert(std::is_same::value_type, + vertex_t>::value); + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + size_t idx = tid; + size_t block_idx = blockIdx.x; + // FIXME: it might be more 
performant to process more than one element per thread
+  auto num_blocks = (num_buffer_elements + blockDim.x - 1) / blockDim.x;
+
+  using BlockScan =
+    cub::BlockScan<size_t, update_frontier_v_push_if_out_nbr_update_block_size>;
+  __shared__ typename BlockScan::TempStorage temp_storage;
+
+  __shared__ size_t bucket_block_start_offsets[num_buckets];
+
+  size_t bucket_block_local_offsets[num_buckets];
+  size_t bucket_block_aggregate_sizes[num_buckets];
+
+  while (block_idx < num_blocks) {
+    for (size_t i = 0; i < num_buckets; ++i) { bucket_block_local_offsets[i] = 0; }
+
+    size_t selected_bucket_idx{invalid_bucket_idx};
+    vertex_t key{invalid_vertex};
+
+    if (idx < num_buffer_elements) {
+      key                 = *(buffer_key_input_first + idx);
+      auto key_offset     = vertex_partition.get_local_vertex_offset_from_vertex_nocheck(key);
+      auto v_val          = *(vertex_value_input_first + key_offset);
+      auto payload        = *(buffer_payload_input_first + idx);
+      auto v_op_result    = v_op(v_val, payload);
+      selected_bucket_idx = thrust::get<0>(v_op_result);
+      if (selected_bucket_idx != invalid_bucket_idx) {
+        *(vertex_value_output_first + key_offset)       = thrust::get<1>(v_op_result);
+        bucket_block_local_offsets[selected_bucket_idx] = 1;
+      }
+    }
+
+    for (size_t i = 0; i < num_buckets; ++i) {
+      BlockScan(temp_storage)
+        .ExclusiveSum(bucket_block_local_offsets[i],
+                      bucket_block_local_offsets[i],
+                      bucket_block_aggregate_sizes[i]);
+    }
+
+    if (threadIdx.x == 0) {
+      for (size_t i = 0; i < num_buckets; ++i) {
+        static_assert(sizeof(unsigned long long int) == sizeof(size_t));
+        bucket_block_start_offsets[i] =
+          atomicAdd(reinterpret_cast<unsigned long long int*>(bucket_sizes_ptr + i),
+                    static_cast<unsigned long long int>(bucket_block_aggregate_sizes[i]));
+      }
+    }
+
+    __syncthreads();
+
+    // FIXME: better use shared memory buffer to aggregate global memory writes
+    if (selected_bucket_idx != invalid_bucket_idx) {
+      bucket_ptrs[selected_bucket_idx][bucket_block_start_offsets[selected_bucket_idx] +
+                                       bucket_block_local_offsets[selected_bucket_idx]] = key;
+    }
+
+    idx += gridDim.x * blockDim.x;
+    block_idx += gridDim.x;
+  }
+}
 
 }  // namespace detail
 
@@ -432,7 +427,7 @@ void update_frontier_v_push_if_out_nbr(
   // locking.
   // FIXME: if i != 0, this will require costly reallocation if we don't use the new CUDA feature
   // to reserve address space.
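
// Aside: the update_frontier_and_vertex_output_values kernel above uses a common two-level
// compaction idiom: each thread contributes a 0/1 flag per bucket, cub::BlockScan turns the
// flags into per-thread offsets plus a block total, one thread reserves a contiguous range of
// the global output with a single atomicAdd, and every selected thread then writes to
// block start + local offset. A single-output sketch of the idiom (the block size and all names
// are illustrative, not taken from the patch):

#include <cub/cub.cuh>

template <int block_size>
__global__ void compact_even_values(int const* input,
                                    int num_elements,
                                    int* output,
                                    unsigned long long* output_size)
{
  using BlockScan = cub::BlockScan<int, block_size>;
  __shared__ typename BlockScan::TempStorage temp_storage;
  __shared__ unsigned long long block_start;

  auto idx     = threadIdx.x + blockIdx.x * blockDim.x;
  // flag is 0 or 1; && short-circuits so out-of-range threads never read input
  int selected = (static_cast<int>(idx) < num_elements) && (input[idx] % 2 == 0) ? 1 : 0;
  int local_offset{};
  int block_aggregate{};
  BlockScan(temp_storage).ExclusiveSum(selected, local_offset, block_aggregate);
  if (threadIdx.x == 0) {  // one atomic per block instead of one per selected element
    block_start = atomicAdd(output_size, static_cast<unsigned long long>(block_aggregate));
  }
  __syncthreads();
  if (selected) { output[block_start + local_offset] = input[idx]; }
}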
- keys.resize(buffer_idx.value() + max_pushes, handle.get_stream()); + keys.resize(buffer_idx.value(handle.get_stream()) + max_pushes, handle.get_stream()); resize_dataframe_buffer(payload_buffer, keys.size(), handle.get_stream()); auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed @@ -486,7 +481,7 @@ void update_frontier_v_push_if_out_nbr( detail::reduce_buffer_elements(handle, keys.begin(), get_dataframe_buffer_begin(payload_buffer), - buffer_idx.value(), + buffer_idx.value(handle.get_stream()), reduce_op); if (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); @@ -585,5 +580,5 @@ void update_frontier_v_push_if_out_nbr( } } -} // namespace detail } // namespace experimental +} // namespace cugraph diff --git a/cpp/include/patterns/vertex_frontier.cuh b/cpp/include/patterns/vertex_frontier.cuh index c11142d3cf7..375ec097850 100644 --- a/cpp/include/patterns/vertex_frontier.cuh +++ b/cpp/include/patterns/vertex_frontier.cuh @@ -48,26 +48,6 @@ inline size_t round_up(size_t number_to_round, size_t modulus) return ((number_to_round + (modulus - 1)) / modulus) * modulus; } -template -auto make_buffer_zip_iterator_impl(std::vector& buffer_ptrs, - size_t offset, - std::index_sequence) -{ - auto key_ptr = reinterpret_cast(buffer_ptrs[0]) + offset; - auto payload_it = thrust::make_zip_iterator( - thrust::make_tuple(reinterpret_cast::type*>( - buffer_ptrs[1 + Is])...)); - return std::make_tuple(key_ptr, payload_it); -} - -template -auto make_buffer_zip_iterator(std::vector& buffer_ptrs, size_t offset) -{ - size_t constexpr tuple_size = thrust::tuple_size::value; - return make_buffer_zip_iterator_impl( - buffer_ptrs, offset, std::make_index_sequence()); -} - template __global__ void move_and_invalidate_if(RowIterator row_first, RowIterator row_last, @@ -199,10 +179,7 @@ class Bucket { size_t size_{0}; }; -template +template class VertexFrontier { public: static size_t constexpr kNumBuckets = num_buckets; @@ -211,9 +188,7 @@ class VertexFrontier { VertexFrontier(raft::handle_t const& handle, std::vector bucket_capacities) : handle_ptr_(&handle), tmp_bucket_ptrs_(num_buckets, handle.get_stream()), - tmp_bucket_sizes_(num_buckets, handle.get_stream()), - buffer_ptrs_(kReduceInputTupleSize + 1 /* to store destination column number */, nullptr), - buffer_idx_(0, handle_ptr_->get_stream()) + tmp_bucket_sizes_(num_buckets, handle.get_stream()) { CUGRAPH_EXPECTS(bucket_capacities.size() == num_buckets, "invalid input argument bucket_capacities (size mismatch)"); @@ -228,7 +203,6 @@ class VertexFrontier { for (size_t i = 0; i < num_buckets; ++i) { buckets_.emplace_back(handle, bucket_capacities[i]); } - buffer_.set_stream(handle_ptr_->get_stream()); } Bucket& get_bucket(size_t bucket_idx) { return buckets_[bucket_idx]; } @@ -311,90 +285,11 @@ class VertexFrontier { return std::make_tuple(tmp_bucket_ptrs_.data(), tmp_bucket_sizes_.data()); } - void resize_buffer(size_t size) - { - // FIXME: rmm::device_buffer resize incurs copy if memory is reallocated, which is unnecessary - // in this case. - buffer_.resize(compute_aggregate_buffer_size_in_bytes(size), handle_ptr_->get_stream()); - if (size > buffer_capacity_) { - buffer_capacity_ = size; - update_buffer_ptrs(); - } - buffer_size_ = size; - } - - void clear_buffer() { resize_buffer(0); } - - void shrink_to_fit_buffer() - { - if (buffer_size_ != buffer_capacity_) { - // FIXME: rmm::device_buffer shrink_to_fit incurs copy if memory is reallocated, which is - // unnecessary in this case. 
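
// Aside: with VertexFrontier's hand-packed aggregate buffer being deleted here, the pattern used
// in this patch is a plain rmm::device_uvector for the keys plus an rmm::device_scalar counter
// that device code advances with atomicAdd and host code reads back with value(stream). A
// minimal sketch of that counter pattern (the kernel and sizes are illustrative):

#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

__global__ void append_thread_ids(int* out, unsigned long long* counter)
{
  // atomicAdd hands every thread a unique slot in the output buffer
  auto slot = atomicAdd(counter, static_cast<unsigned long long>(1));
  out[slot] = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
}

unsigned long long counter_example(cudaStream_t stream)
{
  rmm::device_uvector<int> out(1024, stream);
  rmm::device_scalar<unsigned long long> counter(0, stream);
  append_thread_ids<<<4, 256, 0, stream>>>(out.data(), counter.data());
  return counter.value(stream);  // synchronizes `stream`; returns 1024 here
}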
- buffer_.shrink_to_fit(handle_ptr_->get_stream()); - update_buffer_ptrs(); - buffer_capacity_ = buffer_size_; - } - } - - auto buffer_begin() - { - return detail::make_buffer_zip_iterator(buffer_ptrs_, 0); - } - - auto buffer_end() - { - return detail::make_buffer_zip_iterator(buffer_ptrs_, - buffer_size_); - } - - auto get_buffer_idx_ptr() { return buffer_idx_.data(); } - - size_t get_buffer_idx_value() { return buffer_idx_.value(handle_ptr_->get_stream()); } - - void set_buffer_idx_value(size_t value) - { - buffer_idx_.set_value(value, handle_ptr_->get_stream()); - } - private: - static size_t constexpr kReduceInputTupleSize = thrust::tuple_size::value; - static size_t constexpr kBufferAlignment = 128; - raft::handle_t const* handle_ptr_{nullptr}; std::vector> buckets_{}; rmm::device_uvector tmp_bucket_ptrs_; rmm::device_uvector tmp_bucket_sizes_; - - std::array tuple_element_sizes_ = - compute_thrust_tuple_element_sizes()(); - std::vector buffer_ptrs_{}; - rmm::device_buffer buffer_{}; - size_t buffer_size_{0}; - size_t buffer_capacity_{0}; - rmm::device_scalar buffer_idx_{}; - - // FIXME: better pick between this apporach or the approach used in allocate_comm_buffer - size_t compute_aggregate_buffer_size_in_bytes(size_t size) - { - size_t aggregate_buffer_size_in_bytes = - detail::round_up(sizeof(vertex_t) * size, kBufferAlignment); - for (size_t i = 0; i < kReduceInputTupleSize; ++i) { - aggregate_buffer_size_in_bytes += - detail::round_up(tuple_element_sizes_[i] * size, kBufferAlignment); - } - return aggregate_buffer_size_in_bytes; - } - - void update_buffer_ptrs() - { - uintptr_t ptr = reinterpret_cast(buffer_.data()); - buffer_ptrs_[0] = reinterpret_cast(ptr); - ptr += detail::round_up(sizeof(vertex_t) * buffer_capacity_, kBufferAlignment); - for (size_t i = 0; i < kReduceInputTupleSize; ++i) { - buffer_ptrs_[1 + i] = reinterpret_cast(ptr); - ptr += detail::round_up(tuple_element_sizes_[i] * buffer_capacity_, kBufferAlignment); - } - } }; } // namespace experimental diff --git a/cpp/src/experimental/bfs.cu b/cpp/src/experimental/bfs.cu index 70689fba9bb..2b03fa57a5d 100644 --- a/cpp/src/experimental/bfs.cu +++ b/cpp/src/experimental/bfs.cu @@ -93,8 +93,7 @@ void bfs(raft::handle_t const &handle, enum class Bucket { cur, num_buckets }; std::vector bucket_sizes(static_cast(Bucket::num_buckets), push_graph_view.get_number_of_local_vertices()); - VertexFrontier, - vertex_t, + VertexFrontier(Bucket::num_buckets)> vertex_frontier(handle, bucket_sizes); @@ -142,7 +141,7 @@ void bfs(raft::handle_t const &handle, [depth] __device__(auto v_val, auto pushed_val) { auto idx = (v_val == invalid_distance) ? 
static_cast(Bucket::cur) - : VertexFrontier, vertex_t>::kInvalidBucketIdx; + : VertexFrontier::kInvalidBucketIdx; return thrust::make_tuple(idx, thrust::make_tuple(depth + 1, pushed_val)); }); diff --git a/cpp/src/experimental/graph_view.cu b/cpp/src/experimental/graph_view.cu index 74c3e2f7de4..c6f39a44333 100644 --- a/cpp/src/experimental/graph_view.cu +++ b/cpp/src/experimental/graph_view.cu @@ -537,7 +537,7 @@ graph_view_t ret(handle.get_stream()); device_allreduce( handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); - return ret.value(); + return ret.value(handle.get_stream()); } template ret(handle.get_stream()); device_allreduce( handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); - return ret.value(); + return ret.value(handle.get_stream()); } template ret(handle.get_stream()); device_allreduce( handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); - return ret.value(); + return ret.value(handle.get_stream()); } template ret(handle.get_stream()); device_allreduce( handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); - return ret.value(); + return ret.value(handle.get_stream()); } template bucket_sizes(static_cast(Bucket::num_buckets), push_graph_view.get_number_of_local_vertices()); - VertexFrontier, - vertex_t, + VertexFrontier(Bucket::num_buckets)> vertex_frontier(handle, bucket_sizes); @@ -199,7 +198,7 @@ void sssp(raft::handle_t const &handle, auto idx = new_dist < v_val ? (new_dist < near_far_threshold ? static_cast(Bucket::new_near) : static_cast(Bucket::far)) - : VertexFrontier, vertex_t>::kInvalidBucketIdx; + : VertexFrontier::kInvalidBucketIdx; return thrust::make_tuple(idx, pushed_val); }); @@ -222,7 +221,7 @@ void sssp(raft::handle_t const &handle, auto dist = *(distances + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v)); if (dist < old_near_far_threshold) { - return VertexFrontier, vertex_t>::kInvalidBucketIdx; + return VertexFrontier::kInvalidBucketIdx; } else if (dist < near_far_threshold) { return static_cast(Bucket::cur_near); } else { From e536b13523540a9d981261272b5a5ece25382c3f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 26 Mar 2021 11:18:15 -0400 Subject: [PATCH 37/63] workaround for cuco static_map bugs --- ...ransform_reduce_key_aggregated_out_nbr.cuh | 7 +-- cpp/include/utilities/collect_comm.cuh | 30 +++++++----- cpp/include/utilities/thrust_tuple_utils.cuh | 17 ------- cpp/src/experimental/relabel.cu | 22 +++++---- cpp/src/experimental/renumber_edgelist.cu | 21 +++++---- cpp/src/experimental/renumber_utils.cu | 47 +++++++++++++++---- 6 files changed, 85 insertions(+), 59 deletions(-) diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index de79b0b59af..9c38429ed15 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -206,11 +206,12 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( // 1. build a cuco::static_map object for the k, v pairs. auto kv_map_ptr = std::make_unique>( - // FIXME: std::max(..., size_t{1}) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 std::max(static_cast( static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), - size_t{1}), + static_cast(thrust::distance(map_key_first, map_key_last)) + 1), invalid_vertex_id::value, invalid_vertex_id::value); auto pair_first = thrust::make_transform_iterator( diff --git a/cpp/include/utilities/collect_comm.cuh b/cpp/include/utilities/collect_comm.cuh index c69276cca67..fb47f97e248 100644 --- a/cpp/include/utilities/collect_comm.cuh +++ b/cpp/include/utilities/collect_comm.cuh @@ -64,11 +64,12 @@ collect_values_for_keys(raft::comms::comms_t const &comm, // 1. build a cuco::static_map object for the map k, v pairs. auto kv_map_ptr = std::make_unique>( - // FIXME: std::max(..., size_t{1}) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 std::max(static_cast( static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), - size_t{1}), + static_cast(thrust::distance(map_key_first, map_key_last)) + 1), invalid_vertex_id::value, invalid_vertex_id::value); { @@ -126,9 +127,11 @@ collect_values_for_keys(raft::comms::comms_t const &comm, kv_map_ptr.reset(); kv_map_ptr = std::make_unique>( - // FIXME: std::max(..., size_t{1}) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 - std::max(static_cast(static_cast(unique_keys.size()) / load_factor), size_t{1}), + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(unique_keys.size()) / load_factor), + unique_keys.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value); { @@ -183,11 +186,12 @@ collect_values_for_unique_keys(raft::comms::comms_t const &comm, // 1. build a cuco::static_map object for the map k, v pairs. auto kv_map_ptr = std::make_unique>( - // FIXME: std::max(..., size_t{1}) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 std::max(static_cast( static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), - size_t{1}), + static_cast(thrust::distance(map_key_first, map_key_last)) + 1), invalid_vertex_id::value, invalid_vertex_id::value); { @@ -241,9 +245,11 @@ collect_values_for_unique_keys(raft::comms::comms_t const &comm, kv_map_ptr.reset(); kv_map_ptr = std::make_unique>( - // FIXME: std::max(..., size_t{1}) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 - std::max(static_cast(static_cast(unique_keys.size()) / load_factor), size_t{1}), + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(unique_keys.size()) / load_factor), + unique_keys.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value); { diff --git a/cpp/include/utilities/thrust_tuple_utils.cuh b/cpp/include/utilities/thrust_tuple_utils.cuh index 01843a583eb..d5ce6ff1a29 100644 --- a/cpp/include/utilities/thrust_tuple_utils.cuh +++ b/cpp/include/utilities/thrust_tuple_utils.cuh @@ -61,13 +61,6 @@ struct compute_thrust_tuple_element_sizes_impl { void compute(std::array::value>& arr) const {} }; -template -__device__ constexpr auto remove_first_thrust_tuple_element_impl(TupleType const& tuple, - std::index_sequence) -{ - return thrust::make_tuple(thrust::get<1 + Is>(tuple)...); -} - template struct plus_thrust_tuple_impl { __host__ __device__ constexpr void compute(TupleType& lhs, TupleType const& rhs) const @@ -200,16 +193,6 @@ struct compute_thrust_tuple_element_sizes { } }; -template -struct remove_first_thrust_tuple_element { - __device__ constexpr auto operator()(TupleType const& tuple) const - { - size_t constexpr tuple_size = thrust::tuple_size::value; - return detail::remove_first_thrust_tuple_element_impl( - tuple, std::make_index_sequence()); - } -}; - template struct plus_thrust_tuple { __host__ __device__ constexpr TupleType operator()(TupleType const& lhs, diff --git a/cpp/src/experimental/relabel.cu b/cpp/src/experimental/relabel.cu index 362d09c61c3..3c81e182732 100644 --- a/cpp/src/experimental/relabel.cu +++ b/cpp/src/experimental/relabel.cu @@ -121,11 +121,12 @@ void relabel(raft::handle_t const& handle, handle.get_stream())); // cuco::static_map currently does not take stream cuco::static_map relabel_map{ - // FIXME: std::max(..., size_t{1}) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 std::max( static_cast(static_cast(rx_label_pair_old_labels.size()) / load_factor), - size_t{1}), + rx_label_pair_old_labels.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value}; @@ -169,10 +170,11 @@ void relabel(raft::handle_t const& handle, } cuco::static_map relabel_map( - // FIXME: std::max(..., size_t{1}) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 std::max(static_cast(static_cast(unique_old_labels.size()) / load_factor), - size_t{1}), + unique_old_labels.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value); @@ -187,9 +189,11 @@ void relabel(raft::handle_t const& handle, relabel_map.find(labels, labels + num_labels, labels); } else { cuco::static_map relabel_map( - // FIXME: std::max(..., size_t{1}) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 - std::max(static_cast(static_cast(num_label_pairs) / load_factor), size_t{1}), + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(num_label_pairs) / load_factor), + static_cast(num_label_pairs) + 1), invalid_vertex_id::value, invalid_vertex_id::value); diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu index 9960df92b4f..42d560ae9ad 100644 --- a/cpp/src/experimental/renumber_edgelist.cu +++ b/cpp/src/experimental/renumber_edgelist.cu @@ -558,11 +558,12 @@ renumber_edgelist(raft::handle_t const& handle, handle.get_stream())); // cuco::static_map currently does not take stream cuco::static_map renumber_map{ - // FIXME: std::max(..., size_t{1}) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 std::max(static_cast( static_cast(partition.get_matrix_partition_major_size(i)) / load_factor), - size_t{1}), + partition.get_matrix_partition_major_size(i) + 1), invalid_vertex_id::value, invalid_vertex_id::value}; auto pair_first = thrust::make_transform_iterator( @@ -599,11 +600,12 @@ renumber_edgelist(raft::handle_t const& handle, handle.get_stream())); // cuco::static_map currently does not take stream cuco::static_map renumber_map{ - // FIXME: std::max(..., size_t{1}) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 std::max( static_cast(static_cast(renumber_map_minor_labels.size()) / load_factor), - size_t{1}), + renumber_map_minor_labels.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value}; auto pair_first = thrust::make_transform_iterator( @@ -671,10 +673,11 @@ std::enable_if_t> renumber_edgelist( // footprint and execution time cuco::static_map renumber_map{ - // FIXME: std::max(..., size_t{1}) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 std::max(static_cast(static_cast(renumber_map_labels.size()) / load_factor), - size_t{1}), + renumber_map_labels.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value}; auto pair_first = thrust::make_transform_iterator( diff --git a/cpp/src/experimental/renumber_utils.cu b/cpp/src/experimental/renumber_utils.cu index ac34e71c03d..6fabec464c3 100644 --- a/cpp/src/experimental/renumber_utils.cu +++ b/cpp/src/experimental/renumber_utils.cu @@ -108,11 +108,12 @@ void renumber_ext_vertices(raft::handle_t const& handle, renumber_map_ptr.reset(); renumber_map_ptr = std::make_unique>( - // FIXME: std::max(..., size_t{1}) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 std::max( static_cast(static_cast(sorted_unique_ext_vertices.size()) / load_factor), - size_t{1}), + sorted_unique_ext_vertices.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value); @@ -127,11 +128,12 @@ void renumber_ext_vertices(raft::handle_t const& handle, renumber_map_ptr.reset(); renumber_map_ptr = std::make_unique>( - // FIXME: std::max(..., size_t{1}) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 std::max(static_cast( static_cast(local_int_vertex_last - local_int_vertex_first) / load_factor), - size_t{1}), + static_cast(local_int_vertex_last - local_int_vertex_first) + 1), invalid_vertex_id::value, invalid_vertex_id::value); @@ -163,8 +165,21 @@ void renumber_ext_vertices(raft::handle_t const& handle, "(aggregate) renumber_map_labels."); } + // FIXME: a temporary workaround for https://github.com/NVIDIA/cuCollections/issues/74 +#if 1 + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + vertices, + [view = renumber_map_ptr->get_device_view()] __device__(auto v) { + return v != invalid_vertex_id::value + ? view.find(v)->second.load(cuda::std::memory_order_relaxed) + : invalid_vertex_id::value; + }); +#else renumber_map_ptr->find(vertices, vertices + num_vertices, vertices); #endif +#endif } template @@ -307,11 +322,12 @@ void unrenumber_int_vertices(raft::handle_t const& handle, handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream cuco::static_map unrenumber_map( - // FIXME: std::max(..., size_t{1}) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 std::max( static_cast(static_cast(sorted_unique_int_vertices.size()) / load_factor), - size_t{1}), + sorted_unique_int_vertices.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value); @@ -323,7 +339,20 @@ void unrenumber_int_vertices(raft::handle_t const& handle, return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); unrenumber_map.insert(pair_first, pair_first + sorted_unique_int_vertices.size()); + // FIXME: a temporary workaround for https://github.com/NVIDIA/cuCollections/issues/74 +#if 1 + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + vertices, + [view = unrenumber_map.get_device_view()] __device__(auto v) { + return v != invalid_vertex_id::value + ? 
view.find(v)->second.load(cuda::std::memory_order_relaxed) + : invalid_vertex_id::value; + }); +#else unrenumber_map.find(vertices, vertices + num_vertices, vertices); +#endif } else { unrenumber_local_int_vertices(handle, vertices, From 177e9d8befb9166bdb734f9f9e42ff6dafb0ca30 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 26 Mar 2021 11:27:42 -0400 Subject: [PATCH 38/63] add MG Katz, BFS, and SSSP tests --- cpp/tests/CMakeLists.txt | 29 +- cpp/tests/experimental/mg_bfs_test.cpp | 303 +++++++++++++++++ .../experimental/mg_katz_centrality_test.cpp | 268 +++++++++++++++ cpp/tests/experimental/mg_sssp_test.cpp | 314 ++++++++++++++++++ 4 files changed, 913 insertions(+), 1 deletion(-) create mode 100644 cpp/tests/experimental/mg_bfs_test.cpp create mode 100644 cpp/tests/experimental/mg_katz_centrality_test.cpp create mode 100644 cpp/tests/experimental/mg_sssp_test.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8d4d2fa10d8..3e2cf8e207f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -22,7 +22,7 @@ add_library(cugraphtestutil STATIC "${CMAKE_CURRENT_SOURCE_DIR}/utilities/matrix_market_file_utilities.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/rmat_utilities.cu" - "${CMAKE_CURRENT_SOURCE_DIR}/utilities/renumber_utilities.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/utilities/thrust_wrapper.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/misc_utilities.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/detail/generate_graph_from_edgelist.cu" "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c") @@ -423,6 +423,33 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureTest(MG_PAGERANK_TEST "${MG_PAGERANK_TEST_SRCS}") target_link_libraries(MG_PAGERANK_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + ########################################################################################### + # - MG KATZ CENTRALITY tests -------------------------------------------------------------- + + set(MG_KATZ_CENTRALITY_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/mg_katz_centrality_test.cpp") + + ConfigureTest(MG_KATZ_CENTRALITY_TEST "${MG_KATZ_CENTRALITY_TEST_SRCS}") + target_link_libraries(MG_KATZ_CENTRALITY_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + + ########################################################################################### + # - MG BFS tests -------------------------------------------------------------------------- + + set(MG_BFS_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/mg_bfs_test.cpp") + + ConfigureTest(MG_BFS_TEST "${MG_BFS_TEST_SRCS}") + target_link_libraries(MG_BFS_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + + ########################################################################################### + # - MG SSSP tests ------------------------------------------------------------------------- + + set(MG_SSSP_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/mg_sssp_test.cpp") + + ConfigureTest(MG_SSSP_TEST "${MG_SSSP_TEST_SRCS}") + target_link_libraries(MG_SSSP_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + else(MPI_CXX_FOUND) message(FATAL_ERROR "OpenMPI NOT found, cannot build MG tests.") endif(MPI_CXX_FOUND) diff --git a/cpp/tests/experimental/mg_bfs_test.cpp b/cpp/tests/experimental/mg_bfs_test.cpp new file mode 100644 index 00000000000..76ccb5d9de3 --- /dev/null +++ b/cpp/tests/experimental/mg_bfs_test.cpp @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +typedef struct BFS_Usecase_t { + cugraph::test::input_graph_specifier_t input_graph_specifier{}; + + size_t source{0}; + bool check_correctness{false}; + + BFS_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true) + : source(source), check_correctness(check_correctness) + { + std::string graph_file_full_path{}; + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH; + input_graph_specifier.graph_file_full_path = graph_file_full_path; + }; + + BFS_Usecase_t(cugraph::test::rmat_params_t rmat_params, + size_t source, + bool check_correctness = true) + : source(source), check_correctness(check_correctness) + { + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; + input_graph_specifier.rmat_params = rmat_params; + } +} BFS_Usecase; + +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, BFS_Usecase const& configuration, bool renumber) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size)); + std::iota(partition_ids.begin(), + partition_ids.end(), + multi_gpu ? static_cast(comm_rank) : size_t{0}); + + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, false, renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + false, + renumber, + partition_ids, + static_cast(comm_size)); +} + +class Tests_MGBFS : public ::testing::TestWithParam { + public: + Tests_MGBFS() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of running BFS on multiple GPUs to that of a single-GPU run + template + void run_current_test(BFS_Usecase const& configuration) + { + using weight_t = float; + + // 1. 
initialize handle
+
+    raft::handle_t handle{};
+
+    raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
+    auto& comm           = handle.get_comms();
+    auto const comm_size = comm.get_size();
+    auto const comm_rank = comm.get_rank();
+
+    auto row_comm_size = static_cast<int>(sqrt(static_cast<double>(comm_size)));
+    while (comm_size % row_comm_size != 0) { --row_comm_size; }
+    cugraph::partition_2d::subcomm_factory_t<cugraph::partition_2d::key_naming_t, vertex_t>
+      subcomm_factory(handle, row_comm_size);
+
+    // 2. create MG graph
+
+    cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, false, true> mg_graph(handle);
+    rmm::device_uvector<vertex_t> d_mg_renumber_map_labels(0, handle.get_stream());
+    std::tie(mg_graph, d_mg_renumber_map_labels) =
+      read_graph<vertex_t, edge_t, weight_t, false, true>(handle, configuration, true);
+
+    auto mg_graph_view = mg_graph.view();
+
+    ASSERT_TRUE(static_cast<vertex_t>(configuration.source) >= 0 &&
+                static_cast<vertex_t>(configuration.source) <
+                  mg_graph_view.get_number_of_vertices())
+      << "Invalid starting source.";
+
+    // 3. run MG BFS
+
+    rmm::device_uvector<vertex_t> d_mg_distances(mg_graph_view.get_number_of_local_vertices(),
+                                                 handle.get_stream());
+    rmm::device_uvector<vertex_t> d_mg_predecessors(mg_graph_view.get_number_of_local_vertices(),
+                                                    handle.get_stream());
+
+    CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+
+    cugraph::experimental::bfs(handle,
+                               mg_graph_view,
+                               d_mg_distances.data(),
+                               d_mg_predecessors.data(),
+                               static_cast<vertex_t>(configuration.source),
+                               false,
+                               std::numeric_limits<vertex_t>::max(),
+                               true);
+
+    CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+
+    // 5. compare SG & MG results
+
+    if (configuration.check_correctness) {
+      // 5-1. create SG graph
+
+      cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, false, false> sg_graph(handle);
+      std::tie(sg_graph, std::ignore) =
+        read_graph<vertex_t, edge_t, weight_t, false, false>(handle, configuration, false);
+
+      auto sg_graph_view = sg_graph.view();
+
+      std::vector<vertex_t> vertex_partition_lasts(comm_size);
+      for (size_t i = 0; i < vertex_partition_lasts.size(); ++i) {
+        vertex_partition_lasts[i] = mg_graph_view.get_vertex_partition_last(i);
+      }
+
+      rmm::device_scalar<vertex_t> d_source(static_cast<vertex_t>(configuration.source),
+                                            handle.get_stream());
+      cugraph::experimental::unrenumber_int_vertices<vertex_t, true>(
+        handle,
+        d_source.data(),
+        size_t{1},
+        d_mg_renumber_map_labels.data(),
+        mg_graph_view.get_local_vertex_first(),
+        mg_graph_view.get_local_vertex_last(),
+        vertex_partition_lasts,
+        true);
+      auto unrenumbered_source = d_source.value(handle.get_stream());
+
+      // 5-2. run SG BFS
+
+      rmm::device_uvector<vertex_t> d_sg_distances(sg_graph_view.get_number_of_local_vertices(),
+                                                   handle.get_stream());
+      rmm::device_uvector<vertex_t> d_sg_predecessors(
+        sg_graph_view.get_number_of_local_vertices(), handle.get_stream());
+
+      cugraph::experimental::bfs(handle,
+                                 sg_graph_view,
+                                 d_sg_distances.data(),
+                                 d_sg_predecessors.data(),
+                                 unrenumbered_source,
+                                 false,
+                                 std::numeric_limits<vertex_t>::max(),
+                                 true);
+
+      // 5-3.
compare + + std::vector h_sg_offsets(sg_graph_view.get_number_of_vertices() + 1); + std::vector h_sg_indices(sg_graph_view.get_number_of_edges()); + raft::update_host(h_sg_offsets.data(), + sg_graph_view.offsets(), + sg_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_sg_indices.data(), + sg_graph_view.indices(), + sg_graph_view.get_number_of_edges(), + handle.get_stream()); + + std::vector h_sg_distances(sg_graph_view.get_number_of_vertices()); + std::vector h_sg_predecessors(sg_graph_view.get_number_of_vertices()); + raft::update_host( + h_sg_distances.data(), d_sg_distances.data(), d_sg_distances.size(), handle.get_stream()); + raft::update_host(h_sg_predecessors.data(), + d_sg_predecessors.data(), + d_sg_predecessors.size(), + handle.get_stream()); + + std::vector h_mg_distances(mg_graph_view.get_number_of_local_vertices()); + std::vector h_mg_predecessors(mg_graph_view.get_number_of_local_vertices()); + raft::update_host( + h_mg_distances.data(), d_mg_distances.data(), d_mg_distances.size(), handle.get_stream()); + cugraph::experimental::unrenumber_int_vertices( + handle, + d_mg_predecessors.data(), + d_mg_predecessors.size(), + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + true); + raft::update_host(h_mg_predecessors.data(), + d_mg_predecessors.data(), + d_mg_predecessors.size(), + handle.get_stream()); + + std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size()); + raft::update_host(h_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { + auto mapped_vertex = h_mg_renumber_map_labels[i]; + ASSERT_TRUE(h_mg_distances[i] == h_sg_distances[mapped_vertex]) + << "MG BFS distance for vertex: " << mapped_vertex << " in rank: " << comm_rank + << " has value: " << h_mg_distances[i] + << " different from the corresponding SG value: " << h_sg_distances[mapped_vertex]; + if (h_mg_predecessors[i] == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_sg_predecessors[mapped_vertex] == h_mg_predecessors[i]) + << "vertex reachability does not match with the SG result."; + } else { + ASSERT_TRUE(h_sg_distances[h_mg_predecessors[i]] + 1 == h_sg_distances[mapped_vertex]) + << "distances to this vertex != distances to the predecessor vertex + 1."; + bool found{false}; + for (auto j = h_sg_offsets[h_mg_predecessors[i]]; + j < h_sg_offsets[h_mg_predecessors[i] + 1]; + ++j) { + if (h_sg_indices[j] == mapped_vertex) { + found = true; + break; + } + } + ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; + } + } + } + } +}; + +TEST_P(Tests_MGBFS, CheckInt32Int32) { run_current_test(GetParam()); } + +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_MGBFS, + ::testing::Values( + // enable correctness checks + BFS_Usecase("test/datasets/karate.mtx", 0), + BFS_Usecase("test/datasets/web-Google.mtx", 0), + BFS_Usecase("test/datasets/ljournal-2008.mtx", 0), + BFS_Usecase("test/datasets/webbase-1M.mtx", 0), + BFS_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), + // disable correctness checks for large graphs + BFS_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + 0, + false))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/mg_katz_centrality_test.cpp 
b/cpp/tests/experimental/mg_katz_centrality_test.cpp new file mode 100644 index 00000000000..e3033af3771 --- /dev/null +++ b/cpp/tests/experimental/mg_katz_centrality_test.cpp @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +typedef struct KatzCentrality_Usecase_t { + cugraph::test::input_graph_specifier_t input_graph_specifier{}; + + bool test_weighted{false}; + bool check_correctness{false}; + + KatzCentrality_Usecase_t(std::string const& graph_file_path, + bool test_weighted, + bool check_correctness = true) + : test_weighted(test_weighted), check_correctness(check_correctness) + { + std::string graph_file_full_path{}; + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH; + input_graph_specifier.graph_file_full_path = graph_file_full_path; + }; + + KatzCentrality_Usecase_t(cugraph::test::rmat_params_t rmat_params, + bool test_weighted, + bool check_correctness = true) + : test_weighted(test_weighted), check_correctness(check_correctness) + { + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; + input_graph_specifier.rmat_params = rmat_params; + } +} KatzCentrality_Usecase; + +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, KatzCentrality_Usecase const& configuration, bool renumber) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size)); + std::iota(partition_ids.begin(), + partition_ids.end(), + multi_gpu ? static_cast(comm_rank) : size_t{0}); + + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? 
cugraph::test:: + read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber, + partition_ids, + static_cast(comm_size)); +} + +class Tests_MGKatzCentrality : public ::testing::TestWithParam { + public: + Tests_MGKatzCentrality() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of running Katz Centrality on multiple GPUs to that of a single-GPU run + template + void run_current_test(KatzCentrality_Usecase const& configuration) + { + // 1. initialize handle + + raft::handle_t handle{}; + + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + auto row_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % row_comm_size != 0) { --row_comm_size; } + cugraph::partition_2d::subcomm_factory_t + subcomm_factory(handle, row_comm_size); + + // 2. create MG graph + + cugraph::experimental::graph_t mg_graph(handle); + rmm::device_uvector d_mg_renumber_map_labels(0, handle.get_stream()); + std::tie(mg_graph, d_mg_renumber_map_labels) = + read_graph(handle, configuration, true); + + auto mg_graph_view = mg_graph.view(); + + // 3. compute max in-degree + + auto max_in_degree = mg_graph_view.compute_max_in_degree(handle); + + // 4. run MG Katz Centrality + + result_t const alpha = result_t{1.0} / static_cast(max_in_degree + 1); + result_t constexpr beta{1.0}; + result_t constexpr epsilon{1e-6}; + + rmm::device_uvector d_mg_katz_centralities( + mg_graph_view.get_number_of_local_vertices(), handle.get_stream()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + cugraph::experimental::katz_centrality(handle, + mg_graph_view, + static_cast(nullptr), + d_mg_katz_centralities.data(), + alpha, + beta, + epsilon, + std::numeric_limits::max(), + false, + true); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + // 5. compare SG & MG results + + if (configuration.check_correctness) { + // 5-1. create SG graph + + cugraph::experimental::graph_t sg_graph(handle); + std::tie(sg_graph, std::ignore) = + read_graph(handle, configuration, false); + + auto sg_graph_view = sg_graph.view(); + + // 5-3. run SG Katz Centrality + + rmm::device_uvector d_sg_katz_centralities(sg_graph_view.get_number_of_vertices(), + handle.get_stream()); + + cugraph::experimental::katz_centrality(handle, + sg_graph_view, + static_cast(nullptr), + d_sg_katz_centralities.data(), + alpha, + beta, + epsilon, + std::numeric_limits::max(), // max_iterations + false, + true); + + // 5-4. compare + + std::vector h_sg_katz_centralities(sg_graph_view.get_number_of_vertices()); + raft::update_host(h_sg_katz_centralities.data(), + d_sg_katz_centralities.data(), + d_sg_katz_centralities.size(), + handle.get_stream()); + + std::vector h_mg_katz_centralities(mg_graph_view.get_number_of_local_vertices()); + raft::update_host(h_mg_katz_centralities.data(), + d_mg_katz_centralities.data(), + d_mg_katz_centralities.size(), + handle.get_stream()); + + std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size()); + raft::update_host(h_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for low KatzCentrality vertices (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { + auto mapped_vertex = h_mg_renumber_map_labels[i]; + ASSERT_TRUE(nearly_equal(h_mg_katz_centralities[i], h_sg_katz_centralities[mapped_vertex])) + << "MG KatzCentrality value for vertex: " << mapped_vertex << " in rank: " << comm_rank + << " has value: " << h_mg_katz_centralities[i] + << " which exceeds the error margin for comparing to SG value: " + << h_sg_katz_centralities[mapped_vertex]; + } + } + } +}; + +TEST_P(Tests_MGKatzCentrality, CheckInt32Int32FloatFloat) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_MGKatzCentrality, + ::testing::Values( + // enable correctness checks + KatzCentrality_Usecase("test/datasets/karate.mtx", false), + KatzCentrality_Usecase("test/datasets/karate.mtx", true), + KatzCentrality_Usecase("test/datasets/web-Google.mtx", false), + KatzCentrality_Usecase("test/datasets/web-Google.mtx", true), + KatzCentrality_Usecase("test/datasets/ljournal-2008.mtx", false), + KatzCentrality_Usecase("test/datasets/ljournal-2008.mtx", true), + KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", false), + KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", true), + KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, + false), + KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, + true), + // disable correctness checks for large graphs + KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + false, + false), + KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + true, + false))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN()
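The Katz check above deliberately combines a relative and an absolute tolerance: MG and SG values must agree to within threshold_ratio of the larger value, with an absolute floor threshold_magnitude scaled to 1/V so that the near-zero scores of lowly ranked vertices are effectively skipped rather than failing on round-off noise. A host-side sketch of the same predicate (standalone C++; the function name is illustrative, not cuGraph API):

#include <algorithm>
#include <cmath>
#include <cstddef>

// True if lhs and rhs differ by less than threshold_ratio of the larger value,
// or by less than an absolute floor derived from 1 / num_vertices.
bool nearly_equal_katz(double lhs, double rhs, std::size_t num_vertices)
{
  double const threshold_ratio     = 1e-3;
  double const threshold_magnitude =
    (1.0 / static_cast<double>(num_vertices)) * threshold_ratio;
  return std::abs(lhs - rhs) <
         std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
}

diff --git a/cpp/tests/experimental/mg_sssp_test.cpp b/cpp/tests/experimental/mg_sssp_test.cpp new file mode 100644 index 00000000000..48e4dc869f4 --- /dev/null +++ b/cpp/tests/experimental/mg_sssp_test.cpp @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 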
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +typedef struct SSSP_Usecase_t { + cugraph::test::input_graph_specifier_t input_graph_specifier{}; + + size_t source{0}; + bool check_correctness{false}; + + SSSP_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true) + : source(source), check_correctness(check_correctness) + { + std::string graph_file_full_path{}; + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH; + input_graph_specifier.graph_file_full_path = graph_file_full_path; + }; + + SSSP_Usecase_t(cugraph::test::rmat_params_t rmat_params, + size_t source, + bool check_correctness = true) + : source(source), check_correctness(check_correctness) + { + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; + input_graph_specifier.rmat_params = rmat_params; + } +} SSSP_Usecase; + +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, SSSP_Usecase const& configuration, bool renumber) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size)); + std::iota(partition_ids.begin(), + partition_ids.end(), + multi_gpu ? static_cast(comm_rank) : size_t{0}); + + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, true, renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + true, + renumber, + partition_ids, + static_cast(comm_size)); +} + +class Tests_MGSSSP : public ::testing::TestWithParam { + public: + Tests_MGSSSP() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of running SSSP on multiple GPUs to that of a single-GPU run + template + void run_current_test(SSSP_Usecase const& configuration) + { + // 1. 
initialize handle + + raft::handle_t handle{}; + + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + auto row_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % row_comm_size != 0) { --row_comm_size; } + cugraph::partition_2d::subcomm_factory_t + subcomm_factory(handle, row_comm_size); + + // 2. create MG graph + + cugraph::experimental::graph_t mg_graph(handle); + rmm::device_uvector d_mg_renumber_map_labels(0, handle.get_stream()); + std::tie(mg_graph, d_mg_renumber_map_labels) = + read_graph(handle, configuration, true); + + auto mg_graph_view = mg_graph.view(); + + ASSERT_TRUE(static_cast(configuration.source) >= 0 && + static_cast(configuration.source) < + mg_graph_view.get_number_of_vertices()) + << "Invalid starting source."; + + // 3. run MG SSSP + + rmm::device_uvector d_mg_distances(mg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + rmm::device_uvector d_mg_predecessors(mg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + // FIXME: disable do_expensive_check + cugraph::experimental::sssp(handle, + mg_graph_view, + d_mg_distances.data(), + d_mg_predecessors.data(), + static_cast(configuration.source), + std::numeric_limits::max(), + true); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + // 5. compare SG & MG results + + if (configuration.check_correctness) { + // 5-1. create SG graph + + cugraph::experimental::graph_t sg_graph(handle); + std::tie(sg_graph, std::ignore) = + read_graph(handle, configuration, false); + + auto sg_graph_view = sg_graph.view(); + + std::vector vertex_partition_lasts(comm_size); + for (size_t i = 0; i < vertex_partition_lasts.size(); ++i) { + vertex_partition_lasts[i] = mg_graph_view.get_vertex_partition_last(i); + } + + rmm::device_scalar d_source(static_cast(configuration.source), + handle.get_stream()); + cugraph::experimental::unrenumber_int_vertices( + handle, + d_source.data(), + size_t{1}, + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + true); + auto unrenumbered_source = d_source.value(handle.get_stream()); + + // 5-2. run SG SSSP + + rmm::device_uvector d_sg_distances(sg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + rmm::device_uvector d_sg_predecessors(sg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + + // FIXME: disable do_expensive_check + cugraph::experimental::sssp(handle, + sg_graph_view, + d_sg_distances.data(), + d_sg_predecessors.data(), + unrenumbered_source, + std::numeric_limits::max(), + true); + + // 5-3. 
compare + + std::vector h_sg_offsets(sg_graph_view.get_number_of_vertices() + 1); + std::vector h_sg_indices(sg_graph_view.get_number_of_edges()); + std::vector h_sg_weights(sg_graph_view.get_number_of_edges()); + raft::update_host(h_sg_offsets.data(), + sg_graph_view.offsets(), + sg_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_sg_indices.data(), + sg_graph_view.indices(), + sg_graph_view.get_number_of_edges(), + handle.get_stream()); + raft::update_host(h_sg_weights.data(), + sg_graph_view.weights(), + sg_graph_view.get_number_of_edges(), + handle.get_stream()); + + std::vector h_sg_distances(sg_graph_view.get_number_of_vertices()); + std::vector h_sg_predecessors(sg_graph_view.get_number_of_vertices()); + raft::update_host( + h_sg_distances.data(), d_sg_distances.data(), d_sg_distances.size(), handle.get_stream()); + raft::update_host(h_sg_predecessors.data(), + d_sg_predecessors.data(), + d_sg_predecessors.size(), + handle.get_stream()); + + std::vector h_mg_distances(mg_graph_view.get_number_of_local_vertices()); + std::vector h_mg_predecessors(mg_graph_view.get_number_of_local_vertices()); + raft::update_host( + h_mg_distances.data(), d_mg_distances.data(), d_mg_distances.size(), handle.get_stream()); + cugraph::experimental::unrenumber_int_vertices( + handle, + d_mg_predecessors.data(), + d_mg_predecessors.size(), + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + true); + raft::update_host(h_mg_predecessors.data(), + d_mg_predecessors.data(), + d_mg_predecessors.size(), + handle.get_stream()); + + std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size()); + raft::update_host(h_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto max_weight_element = std::max_element(h_sg_weights.begin(), h_sg_weights.end()); + auto epsilon = *max_weight_element * weight_t{1e-6}; + auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; + + for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { + auto mapped_vertex = h_mg_renumber_map_labels[i]; + ASSERT_TRUE(nearly_equal(h_mg_distances[i], h_sg_distances[mapped_vertex])) + << "MG SSSP distance for vertex: " << mapped_vertex << " in rank: " << comm_rank + << " has value: " << h_mg_distances[i] + << " different from the corresponding SG value: " << h_sg_distances[mapped_vertex]; + if (h_mg_predecessors[i] == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_sg_predecessors[mapped_vertex] == h_mg_predecessors[i]) + << "vertex reachability does not match with the SG result."; + } else { + auto pred_distance = h_sg_distances[h_mg_predecessors[i]]; + bool found{false}; + for (auto j = h_sg_offsets[h_mg_predecessors[i]]; + j < h_sg_offsets[h_mg_predecessors[i] + 1]; + ++j) { + if (h_sg_indices[j] == mapped_vertex) { + if (nearly_equal(pred_distance + h_sg_weights[j], h_sg_distances[mapped_vertex])) { + found = true; + break; + } + } + } + ASSERT_TRUE(found) + << "no edge from the predecessor vertex to this vertex with the matching weight."; + } + } + } + } +}; + +TEST_P(Tests_MGSSSP, CheckInt32Int32Float) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_MGSSSP, + ::testing::Values( + // enable correctness checks + SSSP_Usecase("test/datasets/karate.mtx", 0), + 
SSSP_Usecase("test/datasets/dblp.mtx", 0), + SSSP_Usecase("test/datasets/wiki2003.mtx", 1000), + SSSP_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), + // disable correctness checks for large graphs + SSSP_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + 0, + false))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() From 3df52a279cb3a95179c0f66381e1d053d4d1a76f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 26 Mar 2021 17:28:59 -0400 Subject: [PATCH 39/63] bug fix --- .../patterns/copy_to_adj_matrix_row_col.cuh | 22 ++++++++++--------- cpp/src/experimental/renumber_edgelist.cu | 2 +- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh b/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh index 26876957b44..ca20b9a1285 100644 --- a/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh @@ -150,11 +150,12 @@ void copy_to_matrix_major(raft::handle_t const& handle, }); // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and // directly scatters from the internal buffer) - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_major_value_output_first); + thrust::scatter( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_major_value_output_first + matrix_partition.get_major_value_start_offset()); } else { auto map_first = thrust::make_transform_iterator( rx_vertices.begin(), [matrix_partition] __device__(auto v) { @@ -162,11 +163,12 @@ void copy_to_matrix_major(raft::handle_t const& handle, }); // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and // directly scatters from the internal buffer) - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_major_value_output_first); + thrust::scatter( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_major_value_output_first + matrix_partition.get_major_value_start_offset()); } } } else { diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu index 42d560ae9ad..4a2b0180e33 100644 --- a/cpp/src/experimental/renumber_edgelist.cu +++ b/cpp/src/experimental/renumber_edgelist.cu @@ -563,7 +563,7 @@ renumber_edgelist(raft::handle_t const& handle, // https://github.com/NVIDIA/cuCollections/issues/73 std::max(static_cast( static_cast(partition.get_matrix_partition_major_size(i)) / load_factor), - partition.get_matrix_partition_major_size(i) + 1), + static_cast(partition.get_matrix_partition_major_size(i)) + 1), invalid_vertex_id::value, invalid_vertex_id::value}; auto pair_first = thrust::make_transform_iterator( From 456a56492cbabc53b172ba6db8b5056cac6d5119 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 30 Mar 2021 14:48:18 -0400 Subject: [PATCH 40/63] move generate_graph_from_edgelist test utility function out from the detail space --- cpp/tests/CMakeLists.txt | 2 +- .../detail/generate_graph_from_edgelist.hpp | 49 ------------------- .../generate_graph_from_edgelist.cu | 2 - .../utilities/matrix_market_file_utilities.cu | 20 ++++---- cpp/tests/utilities/rmat_utilities.cu | 20 ++++---- 
cpp/tests/utilities/test_utilities.hpp | 16 ++++++ 6 files changed, 35 insertions(+), 74 deletions(-) delete mode 100644 cpp/tests/utilities/detail/generate_graph_from_edgelist.hpp rename cpp/tests/utilities/{detail => }/generate_graph_from_edgelist.cu (99%) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 9236a7ac1d9..4f67c802009 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -22,9 +22,9 @@ add_library(cugraphtestutil STATIC "${CMAKE_CURRENT_SOURCE_DIR}/utilities/matrix_market_file_utilities.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/rmat_utilities.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/utilities/generate_graph_from_edgelist.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/thrust_wrapper.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/misc_utilities.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/utilities/detail/generate_graph_from_edgelist.cu" "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c") set_property(TARGET cugraphtestutil PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/tests/utilities/detail/generate_graph_from_edgelist.hpp b/cpp/tests/utilities/detail/generate_graph_from_edgelist.hpp deleted file mode 100644 index b0ece55be7e..00000000000 --- a/cpp/tests/utilities/detail/generate_graph_from_edgelist.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include -#include - -#include -#include - -namespace cugraph { -namespace test { -namespace detail { - -template -std::tuple, - rmm::device_uvector> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber); - -} // namespace detail -} // namespace test -} // namespace cugraph diff --git a/cpp/tests/utilities/detail/generate_graph_from_edgelist.cu b/cpp/tests/utilities/generate_graph_from_edgelist.cu similarity index 99% rename from cpp/tests/utilities/detail/generate_graph_from_edgelist.cu rename to cpp/tests/utilities/generate_graph_from_edgelist.cu index be93b98b833..c36dd9bde8d 100644 --- a/cpp/tests/utilities/detail/generate_graph_from_edgelist.cu +++ b/cpp/tests/utilities/generate_graph_from_edgelist.cu @@ -28,7 +28,6 @@ namespace cugraph { namespace test { -namespace detail { namespace { @@ -516,6 +515,5 @@ generate_graph_from_edgelist( bool test_weighted, bool renumber); -} // namespace detail } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/matrix_market_file_utilities.cu b/cpp/tests/utilities/matrix_market_file_utilities.cu index cbab0e988fe..bf7539864be 100644 --- a/cpp/tests/utilities/matrix_market_file_utilities.cu +++ b/cpp/tests/utilities/matrix_market_file_utilities.cu @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include @@ -410,16 +409,15 @@ read_graph_from_matrix_market_file(raft::handle_t const& handle, } handle.get_stream_view().synchronize(); - return detail:: - generate_graph_from_edgelist( - handle, - std::move(d_vertices), - std::move(d_edgelist_rows), - std::move(d_edgelist_cols), - std::move(d_edgelist_weights), - is_symmetric, - test_weighted, - renumber); + return generate_graph_from_edgelist( + handle, + std::move(d_vertices), + std::move(d_edgelist_rows), + std::move(d_edgelist_cols), + std::move(d_edgelist_weights), + is_symmetric, + test_weighted, + renumber); } // explicit instantiations diff --git a/cpp/tests/utilities/rmat_utilities.cu b/cpp/tests/utilities/rmat_utilities.cu index f2707ee5f73..3f0bb0b4a1f 100644 --- a/cpp/tests/utilities/rmat_utilities.cu +++ b/cpp/tests/utilities/rmat_utilities.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -232,16 +231,15 @@ generate_graph_from_rmat_params(raft::handle_t const& handle, d_vertices = std::move(d_rx_vertices); } - return detail:: - generate_graph_from_edgelist( - handle, - std::move(d_vertices), - std::move(d_edgelist_rows), - std::move(d_edgelist_cols), - std::move(d_edgelist_weights), - false, - test_weighted, - renumber); + return generate_graph_from_edgelist( + handle, + std::move(d_vertices), + std::move(d_edgelist_rows), + std::move(d_edgelist_cols), + std::move(d_edgelist_weights), + false, + test_weighted, + renumber); } // explicit instantiations diff --git a/cpp/tests/utilities/test_utilities.hpp b/cpp/tests/utilities/test_utilities.hpp index 3937c1a75ff..e81a76b4163 100644 --- a/cpp/tests/utilities/test_utilities.hpp +++ b/cpp/tests/utilities/test_utilities.hpp @@ -106,6 +106,22 @@ static const std::string& get_rapids_dataset_root_dir() return rdrd; } +template +std::tuple, + rmm::device_uvector> +generate_graph_from_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber); + // returns a tuple of (rows, columns, weights, number_of_vertices, is_symmetric) template std::tuple, From f8cb8b54750544122a5b520b5effca2f41b8fdda Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 30 Mar 2021 14:48:52 -0400 Subject: [PATCH 41/63] fix compile error --- .../patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 5baf951cc4a..cd5200ae998 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -385,7 +385,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( if (static_cast(sub_comm_rank) == i) { rx_displs.assign(sub_comm_size, size_t{0}); std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); - rmm::device_uvector rx_major_vertices.resize(rx_displs.back() + rx_sizes.back(), + rx_major_vertices.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); } auto rx_tmp_e_op_result_buffer = From dc2c0a10892ac2a4e9e40bcd9546c797d3b2b14d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 30 Mar 2021 17:31:32 -0400 Subject: [PATCH 42/63] bug fix --- cpp/src/experimental/coarsen_graph.cu | 19 +++++++++---------- 1 
file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/src/experimental/coarsen_graph.cu b/cpp/src/experimental/coarsen_graph.cu index 9fc64cf7c8b..0929ed8abd0 100644 --- a/cpp/src/experimental/coarsen_graph.cu +++ b/cpp/src/experimental/coarsen_graph.cu @@ -363,7 +363,7 @@ coarsen_graph( }; auto pair_first = thrust::make_zip_iterator( thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); - auto displacements = + auto counts = graph_view.is_weighted() ? groupby_and_count(pair_first, pair_first + edgelist_major_vertices.size(), @@ -376,15 +376,14 @@ coarsen_graph( local_partition_id_op, graph_view.get_number_of_local_adj_matrix_partitions(), handle.get_stream()); - thrust::exclusive_scan(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - displacements.begin(), - displacements.end(), - displacements.begin()); - std::vector h_displacements(displacements.size()); + std::vector h_counts(counts.size()); raft::update_host( - h_displacements.data(), displacements.data(), displacements.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + h_counts.data(), counts.data(), counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + std::vector h_displacements(h_counts.size(), size_t{0}); + std::partial_sum(h_counts.begin(), h_counts.end() - 1, h_displacements.begin() + 1); for (int j = 0; j < col_comm_size; ++j) { auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( @@ -392,7 +391,7 @@ coarsen_graph( edgelist_minor_vertices.begin() + h_displacements[j], graph_view.is_weighted() ? edgelist_weights.begin() + h_displacements[j] : static_cast(nullptr), - h_displacements[j + 1] - h_displacements[j], + h_counts[j], handle.get_stream()); auto cur_size = coarsened_edgelist_major_vertices[j].size(); @@ -527,7 +526,7 @@ coarsen_graph( std::vector> edgelists{}; edgelists.resize(graph_view.get_number_of_local_adj_matrix_partitions()); - for (size_t i = 0; edgelists.size(); ++i) { + for (size_t i = 0; i < edgelists.size(); ++i) { edgelists[i].p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices[i].data() : coarsened_edgelist_major_vertices[i].data(); edgelists[i].p_dst_vertices = store_transposed ? 
coarsened_edgelist_major_vertices[i].data() From b0f7f207997dc8d704f3bf6a4029ba4be05d0739 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 30 Mar 2021 17:32:00 -0400 Subject: [PATCH 43/63] cosmetic update --- cpp/tests/community/mg_louvain_test.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp index f6596a6b59a..2e8dcc44708 100644 --- a/cpp/tests/community/mg_louvain_test.cpp +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -31,10 +31,10 @@ #include -void compare(float modularity, float sg_modularity) { ASSERT_FLOAT_EQ(modularity, sg_modularity); } -void compare(double modularity, double sg_modularity) +void compare(float mg_modularity, float sg_modularity) { ASSERT_FLOAT_EQ(mg_modularity, sg_modularity); } +void compare(double mg_modularity, double sg_modularity) { - ASSERT_DOUBLE_EQ(modularity, sg_modularity); + ASSERT_DOUBLE_EQ(mg_modularity, sg_modularity); } //////////////////////////////////////////////////////////////////////////////// From 5903deca13d07dbc01104bd3457bc80aa3fcb6f0 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 31 Mar 2021 01:58:53 -0400 Subject: [PATCH 44/63] bug fix --- cpp/src/experimental/coarsen_graph.cu | 32 +++++++++++++-------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/cpp/src/experimental/coarsen_graph.cu b/cpp/src/experimental/coarsen_graph.cu index 0929ed8abd0..103dcabc341 100644 --- a/cpp/src/experimental/coarsen_graph.cu +++ b/cpp/src/experimental/coarsen_graph.cu @@ -363,23 +363,21 @@ coarsen_graph( }; auto pair_first = thrust::make_zip_iterator( thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); - auto counts = - graph_view.is_weighted() - ? groupby_and_count(pair_first, - pair_first + edgelist_major_vertices.size(), - edgelist_weights.begin(), - local_partition_id_op, - graph_view.get_number_of_local_adj_matrix_partitions(), - handle.get_stream()) - : groupby_and_count(pair_first, - pair_first + edgelist_major_vertices.size(), - local_partition_id_op, - graph_view.get_number_of_local_adj_matrix_partitions(), - handle.get_stream()); + auto counts = graph_view.is_weighted() + ? 
groupby_and_count(pair_first, + pair_first + edgelist_major_vertices.size(), + edgelist_weights.begin(), + local_partition_id_op, + graph_view.get_number_of_local_adj_matrix_partitions(), + handle.get_stream()) + : groupby_and_count(pair_first, + pair_first + edgelist_major_vertices.size(), + local_partition_id_op, + graph_view.get_number_of_local_adj_matrix_partitions(), + handle.get_stream()); std::vector h_counts(counts.size()); - raft::update_host( - h_counts.data(), counts.data(), counts.size(), handle.get_stream()); + raft::update_host(h_counts.data(), counts.data(), counts.size(), handle.get_stream()); handle.get_stream_view().synchronize(); std::vector h_displacements(h_counts.size(), size_t{0}); @@ -400,10 +398,10 @@ coarsen_graph( // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management coarsened_edgelist_major_vertices[j].resize(cur_size + number_of_partition_edges, handle.get_stream()); - coarsened_edgelist_minor_vertices[j].resize(coarsened_edgelist_major_vertices.size(), + coarsened_edgelist_minor_vertices[j].resize(coarsened_edgelist_major_vertices[j].size(), handle.get_stream()); if (graph_view.is_weighted()) { - coarsened_edgelist_weights[j].resize(coarsened_edgelist_major_vertices.size(), + coarsened_edgelist_weights[j].resize(coarsened_edgelist_major_vertices[j].size(), handle.get_stream()); auto src_edge_first = From 1a8994d139392d29b43a889ec129ebf951b90192 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 1 Apr 2021 10:24:46 -0400 Subject: [PATCH 45/63] remove default value for the stream input parameter in add_level --- cpp/include/dendrogram.hpp | 2 +- cpp/src/community/ecg.cu | 2 +- cpp/src/community/leiden.cuh | 2 +- cpp/src/community/louvain.cuh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/dendrogram.hpp b/cpp/include/dendrogram.hpp index bb9ba470a52..aa0802e80b3 100644 --- a/cpp/include/dendrogram.hpp +++ b/cpp/include/dendrogram.hpp @@ -27,7 +27,7 @@ class Dendrogram { public: void add_level(vertex_t first_index, vertex_t num_verts, - cudaStream_t stream = 0, + cudaStream_t stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { level_ptr_.push_back(std::make_unique>(num_verts, stream, mr)); diff --git a/cpp/src/community/ecg.cu b/cpp/src/community/ecg.cu index 45f7d723191..a176dfbd1c8 100644 --- a/cpp/src/community/ecg.cu +++ b/cpp/src/community/ecg.cu @@ -117,7 +117,7 @@ class EcgLouvain : public cugraph::Louvain { void initialize_dendrogram_level(vertex_t num_vertices) override { - this->dendrogram_->add_level(0, num_vertices); + this->dendrogram_->add_level(0, num_vertices, this->stream_); get_permutation_vector( num_vertices, seed_, this->dendrogram_->current_level_begin(), this->stream_); diff --git a/cpp/src/community/leiden.cuh b/cpp/src/community/leiden.cuh index aae2d3712b5..4ffb7c20eb2 100644 --- a/cpp/src/community/leiden.cuh +++ b/cpp/src/community/leiden.cuh @@ -132,7 +132,7 @@ class Leiden : public Louvain { // // Initialize every cluster to reference each vertex to itself // - this->dendrogram_->add_level(0, current_graph.number_of_vertices); + this->dendrogram_->add_level(0, current_graph.number_of_vertices, this->stream_); thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_), this->dendrogram_->current_level_begin(), diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 0862bbc62a9..e3569d4c850 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -210,7 +210,7 @@ class Louvain 
{ virtual void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(0, num_vertices); + dendrogram_->add_level(0, num_vertices, stream_); thrust::sequence(rmm::exec_policy(stream_)->on(stream_), dendrogram_->current_level_begin(), From ab1fb12fb06f578e580abb428bde57986c63a429 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 1 Apr 2021 10:30:48 -0400 Subject: [PATCH 46/63] explicitly take is_weighted in graph construction rather than using p_weights != nullptr, as p_weights can be nullptr if there is no edge in a graph partition even for weighted graphs --- cpp/include/experimental/graph_view.hpp | 21 +++++----- cpp/src/experimental/coarsen_graph.cu | 22 ++++++---- cpp/src/experimental/graph.cu | 40 +++++++++++-------- cpp/tests/experimental/graph_test.cpp | 2 +- cpp/tests/experimental/rw_low_level_test.cu | 21 +++++----- .../utilities/generate_graph_from_edgelist.cu | 4 +- 6 files changed, 64 insertions(+), 46 deletions(-) diff --git a/cpp/include/experimental/graph_view.hpp b/cpp/include/experimental/graph_view.hpp index 73e05e646a7..d946638a5cd 100644 --- a/cpp/include/experimental/graph_view.hpp +++ b/cpp/include/experimental/graph_view.hpp @@ -159,7 +159,8 @@ class partition_t { vertex_t get_matrix_partition_major_size(size_t partition_idx) const { - return get_matrix_partition_major_last(partition_idx) - get_matrix_partition_major_first(partition_idx); + return get_matrix_partition_major_last(partition_idx) - + get_matrix_partition_major_first(partition_idx); } vertex_t get_matrix_partition_major_value_start_offset(size_t partition_idx) const @@ -208,6 +209,7 @@ class partition_t { struct graph_properties_t { bool is_symmetric{false}; bool is_multigraph{false}; + bool is_weighted{false}; }; namespace detail { @@ -249,6 +251,7 @@ class graph_base_t { bool is_symmetric() const { return properties_.is_symmetric; } bool is_multigraph() const { return properties_.is_multigraph; } + bool is_weighted() const { return properties_.is_weighted; } protected: raft::handle_t const* get_handle_ptr() const { return handle_ptr_; }; @@ -306,8 +309,6 @@ class graph_view_t 0; } - // FIXME: this should be removed once MNMG Louvain is updated to use graph primitives partition_t get_partition() const { return partition_; } @@ -393,8 +394,10 @@ class graph_view_tget_number_of_vertices(); } constexpr vertex_t get_local_vertex_first() const { return vertex_t{0}; } diff --git a/cpp/src/experimental/coarsen_graph.cu b/cpp/src/experimental/coarsen_graph.cu index 103dcabc341..1eccbd23584 100644 --- a/cpp/src/experimental/coarsen_graph.cu +++ b/cpp/src/experimental/coarsen_graph.cu @@ -50,6 +50,7 @@ std:: weight_t const *compressed_sparse_weights, vertex_t major_first, vertex_t major_last, + bool is_weighted, cudaStream_t stream) { edge_t number_of_edges{0}; @@ -58,8 +59,7 @@ std:: CUDA_TRY(cudaStreamSynchronize(stream)); rmm::device_uvector edgelist_major_vertices(number_of_edges, stream); rmm::device_uvector edgelist_minor_vertices(number_of_edges, stream); - rmm::device_uvector edgelist_weights( - compressed_sparse_weights != nullptr ? number_of_edges : 0, stream); + rmm::device_uvector edgelist_weights(is_weighted ? 
number_of_edges : 0, stream); // FIXME: this is highly inefficient for very high-degree vertices, for better performance, we can // fill high-degree vertices using one CUDA block per vertex, mid-degree vertices using one CUDA @@ -78,7 +78,7 @@ std:: compressed_sparse_indices, compressed_sparse_indices + number_of_edges, edgelist_minor_vertices.begin()); - if (compressed_sparse_weights != nullptr) { + if (is_weighted) { thrust::copy(rmm::exec_policy(stream)->on(stream), compressed_sparse_weights, compressed_sparse_weights + number_of_edges, @@ -95,12 +95,13 @@ edge_t groupby_e_and_coarsen_edgelist(vertex_t *edgelist_major_vertices /* [INOU vertex_t *edgelist_minor_vertices /* [INOUT] */, weight_t *edgelist_weights /* [INOUT] */, edge_t number_of_edges, + bool is_weighted, cudaStream_t stream) { auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices)); - if (edgelist_weights != nullptr) { + if (is_weighted) { thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), pair_first, pair_first + number_of_edges, @@ -154,6 +155,7 @@ std:: vertex_t major_last, vertex_t minor_first, vertex_t minor_last, + bool is_weighted, cudaStream_t stream) { // FIXME: it might be possible to directly create relabled & coarsened edgelist from the @@ -168,6 +170,7 @@ std:: compressed_sparse_weights, major_first, major_last, + is_weighted, stream); auto pair_first = thrust::make_zip_iterator( @@ -187,12 +190,13 @@ std:: edgelist_minor_vertices.data(), edgelist_weights.data(), static_cast(edgelist_major_vertices.size()), + is_weighted, stream); edgelist_major_vertices.resize(number_of_edges, stream); edgelist_major_vertices.shrink_to_fit(stream); edgelist_minor_vertices.resize(number_of_edges, stream); edgelist_minor_vertices.shrink_to_fit(stream); - if (edgelist_weights.size() > 0) { + if (is_weighted) { edgelist_weights.resize(number_of_edges, stream); edgelist_weights.shrink_to_fit(stream); } @@ -300,6 +304,7 @@ coarsen_graph( : graph_view.get_local_adj_matrix_partition_col_first(i), store_transposed ? graph_view.get_local_adj_matrix_partition_row_last(i) : graph_view.get_local_adj_matrix_partition_col_last(i), + graph_view.is_weighted(), handle.get_stream()); // 1-2. globaly shuffle @@ -390,6 +395,7 @@ coarsen_graph( graph_view.is_weighted() ? edgelist_weights.begin() + h_displacements[j] : static_cast(nullptr), h_counts[j], + graph_view.is_weighted(), handle.get_stream()); auto cur_size = coarsened_edgelist_major_vertices[j].size(); @@ -441,6 +447,7 @@ coarsen_graph( graph_view.is_weighted() ? 
coarsened_edgelist_weights[i].data() : static_cast(nullptr), static_cast(coarsened_edgelist_major_vertices[i].size()), + graph_view.is_weighted(), handle.get_stream()); coarsened_edgelist_major_vertices[i].resize(number_of_partition_edges, handle.get_stream()); coarsened_edgelist_major_vertices[i].shrink_to_fit(handle.get_stream()); @@ -541,7 +548,7 @@ coarsen_graph( partition, number_of_vertices, number_of_edges, - graph_properties_t{graph_view.is_symmetric(), false}, + graph_properties_t{graph_view.is_symmetric(), false, graph_view.is_weighted()}, true), std::move(renumber_map_labels)); } @@ -582,6 +589,7 @@ coarsen_graph( graph_view.get_number_of_vertices(), vertex_t{0}, graph_view.get_number_of_vertices(), + graph_view.is_weighted(), handle.get_stream()); rmm::device_uvector unique_labels(graph_view.get_number_of_vertices(), @@ -622,7 +630,7 @@ coarsen_graph( handle, edgelist, static_cast(renumber_map_labels.size()), - graph_properties_t{graph_view.is_symmetric(), false}, + graph_properties_t{graph_view.is_symmetric(), false, graph_view.is_weighted()}, true), std::move(renumber_map_labels)); } diff --git a/cpp/src/experimental/graph.cu b/cpp/src/experimental/graph.cu index 2b5f7889ee7..47c41cb3426 100644 --- a/cpp/src/experimental/graph.cu +++ b/cpp/src/experimental/graph.cu @@ -67,12 +67,12 @@ std:: vertex_t major_last, vertex_t minor_first, vertex_t minor_last, + bool is_weighted, cudaStream_t stream) { rmm::device_uvector offsets((major_last - major_first) + 1, stream); rmm::device_uvector indices(edgelist.number_of_edges, stream); - rmm::device_uvector weights( - edgelist.p_edge_weights != nullptr ? edgelist.number_of_edges : 0, stream); + rmm::device_uvector weights(is_weighted ? edgelist.number_of_edges : 0, stream); thrust::fill(rmm::exec_policy(stream)->on(stream), offsets.begin(), offsets.end(), edge_t{0}); thrust::fill(rmm::exec_policy(stream)->on(stream), indices.begin(), indices.end(), vertex_t{0}); @@ -89,8 +89,7 @@ std:: auto p_offsets = offsets.data(); auto p_indices = indices.data(); - auto p_weights = - edgelist.p_edge_weights != nullptr ? weights.data() : static_cast(nullptr); + auto p_weights = is_weighted ? weights.data() : static_cast(nullptr); thrust::for_each(rmm::exec_policy(stream)->on(stream), store_transposed ? 
edgelist.p_dst_vertices : edgelist.p_src_vertices, @@ -103,7 +102,7 @@ std:: thrust::exclusive_scan( rmm::exec_policy(stream)->on(stream), offsets.begin(), offsets.end(), offsets.begin()); - if (edgelist.p_edge_weights != nullptr) { + if (is_weighted) { auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( edgelist.p_src_vertices, edgelist.p_dst_vertices, edgelist.p_edge_weights)); thrust::for_each(rmm::exec_policy(stream)->on(stream), @@ -191,20 +190,20 @@ graph_t 0, "Invalid input argument: edgelists.size() should be non-zero."); - bool is_weighted = edgelists[0].p_edge_weights != nullptr; - CUGRAPH_EXPECTS( std::any_of(edgelists.begin() + 1, edgelists.end(), - [is_weighted](auto edgelist) { - return (edgelist.p_src_vertices == nullptr) || - (edgelist.p_dst_vertices == nullptr) || - (is_weighted && (edgelist.p_edge_weights == nullptr)) || + [is_weighted = properties.is_weighted](auto edgelist) { + return ((edgelist.number_of_edges > 0) && (edgelist.p_src_vertices == nullptr)) || + ((edgelist.number_of_edges > 0) && (edgelist.p_dst_vertices == nullptr)) || + (is_weighted && (edgelist.number_of_edges > 0) && + (edgelist.p_edge_weights == nullptr)) || + (!is_weighted && (edgelist.p_edge_weights != nullptr)); }) == false, "Invalid input argument: edgelists[].p_src_vertices and edgelists[].p_dst_vertices should not " - "be nullptr and edgelists[].p_edge_weights should be nullptr (if edgelists[0].p_edge_weights " - "is nullptr) or should not be nullptr (otherwise)."); + "be nullptr if edgelists[].number_of_edges > 0 and edgelists[].p_edge_weights should be " + "nullptr if unweighted or should not be nullptr if weighted and edgelists[].number_of_edges > " + "0."); CUGRAPH_EXPECTS(edgelists.size() == static_cast(col_comm_size), "Invalid input argument: erroneous edgelists.size()."); @@ -249,7 +248,7 @@ graph_tget_handle_ptr()->get_stream()); adj_matrix_partition_offsets_.push_back(std::move(offsets)); adj_matrix_partition_indices_.push_back(std::move(indices)); - if (is_weighted) { adj_matrix_partition_weights_.push_back(std::move(weights)); } + if (properties.is_weighted) { adj_matrix_partition_weights_.push_back(std::move(weights)); } } // update degree-based segment offsets (to be used for graph analytics kernel optimization) @@ -373,9 +373,14 @@ graph_tget_handle_ptr()->get_stream(); CUGRAPH_EXPECTS( - (edgelist.p_src_vertices != nullptr) && (edgelist.p_dst_vertices != nullptr), + ((edgelist.number_of_edges == 0) || (edgelist.p_src_vertices != nullptr)) && + ((edgelist.number_of_edges == 0) || (edgelist.p_dst_vertices != nullptr)) && + ((properties.is_weighted && + ((edgelist.number_of_edges == 0) || (edgelist.p_edge_weights != nullptr))) || + (!properties.is_weighted && (edgelist.p_edge_weights == nullptr))), "Invalid input argument: edgelist.p_src_vertices and edgelist.p_dst_vertices should " - "not be nullptr." + "not be nullptr if edgelist.number_of_edges > 0 and edgelist.p_edge_weights should be nullptr " + "if unweighted or should not be nullptr if weighted and edgelist.number_of_edges > 0."); // optional expensive checks (part 1/2) @@ -407,6 +412,7 @@ graph_tget_number_of_vertices(), vertex_t{0}, this->get_number_of_vertices(), + properties.is_weighted, this->get_handle_ptr()->get_stream()); // update degree-based segment offsets (to be used for graph analytics kernel optimization) 
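Taken together, the rewritten CUGRAPH_EXPECTS conditions encode a single predicate: vertex pointers may be null only for partitions with zero edges, and the weight pointer must be non-null exactly when the graph is weighted and the partition is non-empty (unweighted edge lists must still pass nullptr). A compact sketch of that predicate (standalone C++; the function name is illustrative, not cuGraph API):

#include <cstddef>

// Valid iff src/dst pointers are present whenever edges exist, and the weight
// pointer is present exactly when the graph is weighted and edges exist;
// unweighted edge lists must pass nullptr for weights.
bool edgelist_pointers_valid(
  std::size_t number_of_edges, bool is_weighted, void const* srcs, void const* dsts, void const* wgts)
{
  bool const vertices_ok = (number_of_edges == 0) || ((srcs != nullptr) && (dsts != nullptr));
  bool const weights_ok =
    is_weighted ? ((number_of_edges == 0) || (wgts != nullptr)) : (wgts == nullptr);
  return vertices_ok && weights_ok;
}

diff --git a/cpp/tests/experimental/graph_test.cpp b/cpp/tests/experimental/graph_test.cpp index 949f6d2e08e..6ce32e0c836 100644 --- a/cpp/tests/experimental/graph_test.cpp +++ 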
b/cpp/tests/experimental/graph_test.cpp @@ -139,7 +139,7 @@ class Tests_Graph : public ::testing::TestWithParam { handle, edgelist, number_of_vertices, - cugraph::experimental::graph_properties_t{is_symmetric, false}, + cugraph::experimental::graph_properties_t{is_symmetric, false, configuration.test_weighted}, false, true); diff --git a/cpp/tests/experimental/rw_low_level_test.cu b/cpp/tests/experimental/rw_low_level_test.cu index a32e258d366..8b562bc41f6 100644 --- a/cpp/tests/experimental/rw_low_level_test.cu +++ b/cpp/tests/experimental/rw_low_level_test.cu @@ -53,7 +53,8 @@ graph_t make_graph(raft::handle_t cons std::vector const& v_dst, std::vector const& v_w, vertex_t num_vertices, - edge_t num_edges) + edge_t num_edges, + bool is_weighted) { vector_test_t d_src(num_edges, handle.get_stream()); vector_test_t d_dst(num_edges, handle.get_stream()); @@ -67,7 +68,7 @@ graph_t make_graph(raft::handle_t cons d_src.data(), d_dst.data(), d_weights.data(), num_edges}; graph_t graph( - handle, edgelist, num_vertices, graph_properties_t{}, false); + handle, edgelist, num_vertices, graph_properties_t{false, false, is_weighted}, false); return graph; } @@ -119,7 +120,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphRWStart) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -199,7 +200,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphCoalesceExperiments) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -275,7 +276,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphColExtraction) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -371,7 +372,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphRndGenColIndx) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -449,7 +450,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphUpdatePathSizes) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -521,7 +522,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphScatterUpdate) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -666,7 +667,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphCoalesceDefragment) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 
4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -741,7 +742,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphRandomWalk) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); diff --git a/cpp/tests/utilities/generate_graph_from_edgelist.cu b/cpp/tests/utilities/generate_graph_from_edgelist.cu index c36dd9bde8d..a9df392d2fb 100644 --- a/cpp/tests/utilities/generate_graph_from_edgelist.cu +++ b/cpp/tests/utilities/generate_graph_from_edgelist.cu @@ -141,7 +141,7 @@ generate_graph_from_edgelist_impl(raft::handle_t const& handle, partition, number_of_vertices, number_of_edges, - cugraph::experimental::graph_properties_t{is_symmetric, false}, + cugraph::experimental::graph_properties_t{is_symmetric, false, test_weighted}, true, true), std::move(renumber_map_labels)); @@ -190,7 +190,7 @@ generate_graph_from_edgelist_impl(raft::handle_t const& handle, test_weighted ? edgelist_weights.data() : nullptr, static_cast(edgelist_rows.size())}, number_of_vertices, - cugraph::experimental::graph_properties_t{is_symmetric, false}, + cugraph::experimental::graph_properties_t{is_symmetric, false, test_weighted}, renumber ? true : false, true), std::move(renumber_map_labels)); From eaa6dfe12d97dd34baea3f21a723c39ffe80e7f3 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 1 Apr 2021 10:31:14 -0400 Subject: [PATCH 47/63] minor fixes --- cpp/include/matrix_partition_device.cuh | 2 +- cpp/src/experimental/renumber_utils.cu | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/include/matrix_partition_device.cuh b/cpp/include/matrix_partition_device.cuh index b41119e7be6..30d6540bcfe 100644 --- a/cpp/include/matrix_partition_device.cuh +++ b/cpp/include/matrix_partition_device.cuh @@ -192,7 +192,7 @@ class matrix_partition_device_tinsert(kv_pair_first, kv_pair_first + sorted_unique_ext_vertices.size()); } else { + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + renumber_map_ptr.reset(); renumber_map_ptr = std::make_unique>( From fc7cf27191d29ce3f77d1f01e4afcb3047614c5e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 1 Apr 2021 10:33:26 -0400 Subject: [PATCH 48/63] bug fixes for Louvain related graph primitives --- ...ransform_reduce_key_aggregated_out_nbr.cuh | 223 +++++++++++------- ...orm_reduce_by_adj_matrix_row_col_key_e.cuh | 14 +- 2 files changed, 153 insertions(+), 84 deletions(-) diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index cd5200ae998..53a866fab39 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -170,8 +171,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( */ template ::value_type, + static_assert(std::is_same::value_type, typename GraphViewType::vertex_type>::value); + static_assert(std::is_same::value_type, + typename std::iterator_traits::value_type>::value); 
static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); using vertex_t = typename GraphViewType::vertex_type; @@ -206,20 +209,90 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( // 1. build a cuco::static_map object for the k, v pairs. auto kv_map_ptr = std::make_unique>( - // FIXME: std::max(..., ...) as a temporary workaround for - // https://github.com/NVIDIA/cuCollections/issues/72 and - // https://github.com/NVIDIA/cuCollections/issues/73 - std::max(static_cast( - static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), - static_cast(thrust::distance(map_key_first, map_key_last)) + 1), - invalid_vertex_id::value, - invalid_vertex_id::value); - auto pair_first = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)), - [] __device__(auto val) { - return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); - }); - kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + size_t{0}, invalid_vertex_id::value, invalid_vertex_id::value); + if (GraphViewType::is_multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + + auto map_counts = + host_scalar_allgather(row_comm, + static_cast(thrust::distance(map_key_first, map_key_last)), + handle.get_stream()); + std::vector map_displacements(row_comm_size, size_t{0}); + std::partial_sum(map_counts.begin(), map_counts.end() - 1, map_displacements.begin() + 1); + rmm::device_uvector map_keys(map_displacements.back() + map_counts.back(), + handle.get_stream()); + auto map_value_buffer = + allocate_dataframe_buffer(map_keys.size(), handle.get_stream()); + for (int i = 0; i < row_comm_size; ++i) { + device_bcast(row_comm, + map_key_first, + map_keys.begin() + map_displacements[i], + map_counts[i], + i, + handle.get_stream()); + device_bcast(row_comm, + map_value_first, + get_dataframe_buffer_begin(map_value_buffer) + map_displacements[i], + map_counts[i], + i, + handle.get_stream()); + } + // FIXME: these copies are unnecessary, better fix RAFT comm's bcast to take separate input & + // output pointers + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + map_key_first, + map_key_last, + map_keys.begin() + map_displacements[row_comm_rank]); + thrust::copy( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + map_value_first, + map_value_first + thrust::distance(map_key_first, map_key_last), + get_dataframe_buffer_begin(map_value_buffer) + map_displacements[row_comm_rank]); + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + kv_map_ptr.reset(); + + kv_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(map_keys.size()) / load_factor), + static_cast(thrust::distance(map_key_first, map_key_last)) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple( + map_keys.begin(), get_dataframe_buffer_begin(map_value_buffer))), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + kv_map_ptr->insert(pair_first, pair_first + map_keys.size()); + } else { + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + kv_map_ptr.reset(); + + kv_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast( + static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), + static_cast(thrust::distance(map_key_first, map_key_last)) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + } // 2. aggregate each vertex out-going edges based on keys and transform-reduce. @@ -228,35 +301,15 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { matrix_partition_device_t matrix_partition(graph_view, i); - int comm_root_rank = 0; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - comm_root_rank = i * row_comm_size + row_comm_rank; - } - - auto num_edges = thrust::transform_reduce( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - thrust::make_counting_iterator(graph_view.get_vertex_partition_first(comm_root_rank)), - thrust::make_counting_iterator(graph_view.get_vertex_partition_last(comm_root_rank)), - [matrix_partition] __device__(auto row) { - auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); - return matrix_partition.get_local_degree(row_offset); - }, - edge_t{0}, - thrust::plus()); - - rmm::device_uvector tmp_major_vertices(num_edges, handle.get_stream()); + rmm::device_uvector tmp_major_vertices(matrix_partition.get_number_of_edges(), + handle.get_stream()); rmm::device_uvector tmp_minor_keys(tmp_major_vertices.size(), handle.get_stream()); rmm::device_uvector tmp_key_aggregated_edge_weights(tmp_major_vertices.size(), handle.get_stream()); - if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) { + if (matrix_partition.get_major_size() > 0) { raft::grid_1d_thread_t update_grid( - graph_view.get_vertex_partition_size(comm_root_rank), + matrix_partition.get_major_size(), detail::copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size, 
handle.get_device_properties().maxGridSize[0]); @@ -270,8 +323,8 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( 0, handle.get_stream()>>>( matrix_partition, - graph_view.get_vertex_partition_first(comm_root_rank), - graph_view.get_vertex_partition_last(comm_root_rank), + matrix_partition.get_major_first(), + matrix_partition.get_major_last(), adj_matrix_col_key_first, tmp_major_vertices.data(), tmp_minor_keys.data(), @@ -293,8 +346,14 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( tmp_key_aggregated_edge_weights.resize(tmp_major_vertices.size(), handle.get_stream()); if (GraphViewType::is_multi_gpu) { - auto& sub_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const sub_comm_size = sub_comm.get_size(); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); triplet_first = thrust::make_zip_iterator(thrust::make_tuple(tmp_major_vertices.begin(), @@ -306,11 +365,13 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( std::forward_as_tuple( std::tie(rx_major_vertices, rx_minor_keys, rx_key_aggregated_edge_weights), std::ignore) = groupby_gpuid_and_shuffle_values( - sub_comm, + col_comm, triplet_first, triplet_first + tmp_major_vertices.size(), - [key_func = detail::compute_gpu_id_from_vertex_t{sub_comm_size}] __device__( - auto val) { return key_func(thrust::get<1>(val)); }, + [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}, + row_comm_size] __device__(auto val) { + return key_func(thrust::get<1>(val)) / row_comm_size; + }, handle.get_stream()); auto pair_first = thrust::make_zip_iterator( @@ -346,52 +407,52 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( triplet_first = thrust::make_zip_iterator(thrust::make_tuple( tmp_major_vertices.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_weights.begin())); - thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - triplet_first, - triplet_first + tmp_major_vertices.size(), - tmp_e_op_result_buffer_first, - [adj_matrix_row_value_input_first, - key_aggregated_e_op, - matrix_partition, - kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { - auto major = thrust::get<0>(val); - auto key = thrust::get<1>(val); - auto w = thrust::get<2>(val); - return key_aggregated_e_op( - major, - key, - w, - *(adj_matrix_row_value_input_first + - matrix_partition.get_major_offset_from_major_nocheck(major)), - kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); - }); + thrust::transform( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + triplet_first, + triplet_first + tmp_major_vertices.size(), + tmp_e_op_result_buffer_first, + [adj_matrix_row_value_input_first = + adj_matrix_row_value_input_first + matrix_partition.get_major_value_start_offset(), + key_aggregated_e_op, + matrix_partition, + kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { + auto major = thrust::get<0>(val); + auto key = thrust::get<1>(val); + auto w = thrust::get<2>(val); + return key_aggregated_e_op(major, + key, + w, + *(adj_matrix_row_value_input_first + + matrix_partition.get_major_offset_from_major_nocheck(major)), + kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); + }); tmp_minor_keys.resize(0, handle.get_stream()); 
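The shuffle above now runs over the column communicator: the functor hashes each minor key to its owning GPU in the global communicator, then divides by row_comm_size to obtain the target col_comm rank, relying on global ranks being laid out as col_comm_rank * row_comm_size + row_comm_rank. A small host-side illustration of that index arithmetic (the grid shape and rank values are made up):

    #include <cassert>

    int main()
    {
      // Assume an 8-GPU job arranged as a 2 x 4 grid: col_comm_size = 2,
      // row_comm_size = 4, global rank = col_comm_rank * row_comm_size + row_comm_rank.
      int const row_comm_size = 4;
      int const owner_gpu     = 6;  // e.g. the hash of some minor key modulo 8
      int const col_comm_rank = owner_gpu / row_comm_size;
      assert(col_comm_rank == 1);   // the key is routed to the second col_comm rank
      return 0;
    }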
tmp_key_aggregated_edge_weights.resize(0, handle.get_stream()); tmp_minor_keys.shrink_to_fit(handle.get_stream()); tmp_key_aggregated_edge_weights.shrink_to_fit(handle.get_stream()); if (GraphViewType::is_multi_gpu) { - auto& sub_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const sub_comm_rank = sub_comm.get_rank(); - auto const sub_comm_size = sub_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); // FIXME: additional optimization is possible if reduce_op is a pure function (and reduce_op // can be mapped to ncclRedOp_t). auto rx_sizes = - host_scalar_gather(sub_comm, tmp_major_vertices.size(), i, handle.get_stream()); + host_scalar_gather(col_comm, tmp_major_vertices.size(), i, handle.get_stream()); std::vector rx_displs{}; rmm::device_uvector rx_major_vertices(0, handle.get_stream()); - if (static_cast(sub_comm_rank) == i) { - rx_displs.assign(sub_comm_size, size_t{0}); + if (static_cast(col_comm_rank) == i) { + rx_displs.assign(col_comm_size, size_t{0}); std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); - rx_major_vertices.resize(rx_displs.back() + rx_sizes.back(), - handle.get_stream()); + rx_major_vertices.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); } auto rx_tmp_e_op_result_buffer = allocate_dataframe_buffer(rx_major_vertices.size(), handle.get_stream()); - device_gatherv(sub_comm, + device_gatherv(col_comm, tmp_major_vertices.data(), rx_major_vertices.data(), tmp_major_vertices.size(), @@ -399,7 +460,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( rx_displs, i, handle.get_stream()); - device_gatherv(sub_comm, + device_gatherv(col_comm, tmp_e_op_result_buffer_first, get_dataframe_buffer_begin(rx_tmp_e_op_result_buffer), tmp_major_vertices.size(), @@ -408,7 +469,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( i, handle.get_stream()); - if (static_cast(sub_comm_rank) == i) { + if (static_cast(col_comm_rank) == i) { major_vertices = std::move(rx_major_vertices); e_op_result_buffer = std::move(rx_tmp_e_op_result_buffer); } diff --git a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh index 8d9a928b903..34721c75e31 100644 --- a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -213,6 +213,13 @@ transform_reduce_by_adj_matrix_row_col_key_e( detail::transform_reduce_by_key_e_for_all_block_size, handle.get_device_properties().maxGridSize[0]); + auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? vertex_t{0} + : matrix_partition.get_major_value_start_offset(); + auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_value_start_offset() + : vertex_t{0}; + // FIXME: This is highly inefficient for graphs with high-degree vertices. If we renumber // vertices to insure that rows within a partition are sorted by their out-degree in // decreasing order, we will apply this kernel only to low out-degree vertices. 
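The offsets introduced above decide which input iterator gets shifted by the partition's value start offset; the major axis is rows for an as-is adjacency matrix and columns for a transposed one, and the hunk that follows applies them to the input iterators. The selection logic in isolation (a sketch with made-up values; major_value_start_offset stands in for matrix_partition.get_major_value_start_offset()):

    #include <cassert>
    #include <cstdint>

    int main()
    {
      using vertex_t = int32_t;
      vertex_t const major_value_start_offset = 100;  // made-up partition offset
      bool const is_adj_matrix_transposed     = false;

      // Only the major-side iterator is advanced by the partition offset.
      auto row_value_input_offset =
        is_adj_matrix_transposed ? vertex_t{0} : major_value_start_offset;
      auto col_value_input_offset =
        is_adj_matrix_transposed ? major_value_start_offset : vertex_t{0};

      assert(row_value_input_offset == 100 && col_value_input_offset == 0);
      return 0;
    }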
@@ -221,9 +228,10 @@ transform_reduce_by_adj_matrix_row_col_key_e( matrix_partition, graph_view.get_vertex_partition_first(comm_root_rank), graph_view.get_vertex_partition_last(comm_root_rank), - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_row_col_key_first, + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + adj_matrix_row_col_key_first + + (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); From f825d2ce768011cda253a35837b2479bb8fd4163 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 1 Apr 2021 10:34:30 -0400 Subject: [PATCH 49/63] explicitly take is_weighted on graph construction --- cpp/tests/community/mg_louvain_helper.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu index a7f95e6d718..661065ca65b 100644 --- a/cpp/tests/community/mg_louvain_helper.cu +++ b/cpp/tests/community/mg_louvain_helper.cu @@ -323,7 +323,8 @@ coarsen_graph( handle, edgelist, new_number_of_vertices, - cugraph::experimental::graph_properties_t{graph_view.is_symmetric(), false}, + cugraph::experimental::graph_properties_t{ + graph_view.is_symmetric(), false, graph_view.is_weighted()}, true); } From 1c819a3f93bc5650ee234020493894e4f42944f8 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 1 Apr 2021 11:40:26 -0400 Subject: [PATCH 50/63] louvain bug fixes --- cpp/src/experimental/louvain.cuh | 34 ++++++++++++------------- cpp/tests/community/mg_louvain_test.cpp | 17 ++++++++----- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 3136515faa6..24914fb028b 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -151,7 +151,8 @@ class Louvain { protected: void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(current_graph_view_.get_local_vertex_first(), num_vertices); + dendrogram_->add_level( + current_graph_view_.get_local_vertex_first(), num_vertices, handle_.get_stream()); thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), dendrogram_->current_level_begin(), @@ -369,8 +370,6 @@ class Louvain { current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); rmm::device_uvector src_cluster_weights_v(next_cluster_v.size(), handle_.get_stream()); - rmm::device_uvector dst_cluster_weights_v(next_cluster_v.size(), - handle_.get_stream()); compute_cluster_sum_and_subtract(old_cluster_sum_v, cluster_subtract_v); @@ -396,19 +395,9 @@ class Louvain { vertex_to_gpu_id_op, handle_.get_stream()); - dst_cluster_weights_v = cugraph::experimental::collect_values_for_keys( - handle_.get_comms(), - cluster_keys_v_.begin(), - cluster_keys_v_.end(), - cluster_weights_v_.data(), - d_dst_cluster_cache_, - d_dst_cluster_cache_ + dst_cluster_cache_v_.size(), - vertex_to_gpu_id_op, - handle_.get_stream()); - - map_key_first = d_dst_cluster_cache_; - map_key_last = d_dst_cluster_cache_ + dst_cluster_cache_v_.size(); - map_value_first = dst_cluster_weights_v.begin(); + map_key_first = cluster_keys_v_.begin(); + map_key_last = cluster_keys_v_.end(); + map_value_first = cluster_weights_v_.begin(); } else { thrust::sort_by_key(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), cluster_keys_v_.begin(), @@ -432,12 +421,21 @@ class
Louvain { map_value_first = src_cluster_weights_v.begin(); } + rmm::device_uvector src_old_cluster_sum_v( + current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), handle_.get_stream()); + rmm::device_uvector src_cluster_subtract_v( + current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), handle_.get_stream()); + copy_to_adj_matrix_row( + handle_, current_graph_view_, old_cluster_sum_v.begin(), src_old_cluster_sum_v.begin()); + copy_to_adj_matrix_row( + handle_, current_graph_view_, cluster_subtract_v.begin(), src_cluster_subtract_v.begin()); + copy_v_transform_reduce_key_aggregated_out_nbr( handle_, current_graph_view_, - thrust::make_zip_iterator(thrust::make_tuple(old_cluster_sum_v.begin(), + thrust::make_zip_iterator(thrust::make_tuple(src_old_cluster_sum_v.begin(), d_src_vertex_weights_cache_, - cluster_subtract_v.begin(), + src_cluster_subtract_v.begin(), d_src_cluster_cache_, src_cluster_weights_v.begin())), diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp index 2e8dcc44708..8a1a3010a6f 100644 --- a/cpp/tests/community/mg_louvain_test.cpp +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -31,7 +31,10 @@ #include -void compare(float mg_modularity, float sg_modularity) { ASSERT_FLOAT_EQ(mg_modularity, sg_modularity); } +void compare(float mg_modularity, float sg_modularity) +{ + ASSERT_FLOAT_EQ(mg_modularity, sg_modularity); +} void compare(double mg_modularity, double sg_modularity) { ASSERT_DOUBLE_EQ(mg_modularity, sg_modularity); @@ -90,13 +93,13 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam cugraph::Dendrogram const& dendrogram, weight_t resolution, int rank, - weight_t modularity) + weight_t mg_modularity) { auto sg_graph = std::make_unique>( handle); rmm::device_uvector d_clustering_v(0, handle.get_stream()); - weight_t sg_modularity; + weight_t sg_modularity{-1.0}; if (rank == 0) { // Create initial SG graph, renumbered according to the MNMG renumber map @@ -160,7 +163,7 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam } }); - if (rank == 0) compare(modularity, sg_modularity); + if (rank == 0) compare(mg_modularity, sg_modularity); } // Compare the results of running louvain on multiple GPUs to that of a @@ -197,9 +200,9 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam auto mg_graph_view = mg_graph.view(); std::unique_ptr> dendrogram; - weight_t modularity; + weight_t mg_modularity; - std::tie(dendrogram, modularity) = + std::tie(dendrogram, mg_modularity) = cugraph::louvain(handle, mg_graph_view, param.max_level, param.resolution); SCOPED_TRACE("compare modularity input: " + param.graph_file_full_path); @@ -213,7 +216,7 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam *dendrogram, param.resolution, comm_rank, - modularity); + mg_modularity); } }; From b4d0793234b242006156cf2e353ef0a922310830 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 1 Apr 2021 13:45:53 -0400 Subject: [PATCH 51/63] throw an exception if unweighted graph is passed to SSSP --- cpp/src/experimental/sssp.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/experimental/sssp.cu b/cpp/src/experimental/sssp.cu index 33b25b1181f..373444cb0a2 100644 --- a/cpp/src/experimental/sssp.cu +++ b/cpp/src/experimental/sssp.cu @@ -70,6 +70,9 @@ void sssp(raft::handle_t const &handle, CUGRAPH_EXPECTS(push_graph_view.is_valid_vertex(source_vertex), "Invalid input argument: source vertex out-of-range."); + 
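The check added just below follows cuGraph's usual validation pattern: CUGRAPH_EXPECTS throws when its predicate is false, so passing an unweighted graph to SSSP surfaces as a catchable error rather than undefined behavior. A hedged stand-in for that contract (the real macro lives in cuGraph's error-handling utilities and throws a cuGraph-specific exception; this sketch is an assumption):

    #include <stdexcept>
    #include <string>

    // Hypothetical stand-in mirroring the CUGRAPH_EXPECTS contract.
    #define SKETCH_EXPECTS(cond, msg)                                                       \
      do {                                                                                  \
        if (!(cond)) throw std::invalid_argument(std::string("cuGraph failure: ") + (msg)); \
      } while (0)

    int main()
    {
      bool const is_weighted = false;  // pretend the input graph has no edge weights
      try {
        SKETCH_EXPECTS(is_weighted, "an unweighted graph is passed to SSSP");
      } catch (std::invalid_argument const&) {
        return 0;  // a caller could fall back to BFS here
      }
      return 1;
    }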
CUGRAPH_EXPECTS(push_graph_view.is_weighted(), + "Invalid input argument: an unweighted graph is passed to SSSP, BFS is more " + "efficient for unweighted graphs."); if (do_expensive_check) { auto num_negative_edge_weights = @@ -126,9 +129,7 @@ void sssp(raft::handle_t const &handle, // FIXME: need to double check the bucket sizes are sufficient std::vector bucket_sizes(static_cast(Bucket::num_buckets), push_graph_view.get_number_of_local_vertices()); - VertexFrontier(Bucket::num_buckets)> + VertexFrontier(Bucket::num_buckets)> vertex_frontier(handle, bucket_sizes); // 5. SSSP iteration From be8fbbbce8f4ceb775465d0e9dfd33a194765ee0 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 1 Apr 2021 13:52:52 -0400 Subject: [PATCH 52/63] update populate_graph_container to take explicit is_weighted parameter --- cpp/include/utilities/cython.hpp | 4 ++++ cpp/src/utilities/cython.cu | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/include/utilities/cython.hpp b/cpp/include/utilities/cython.hpp index 5c7206373bd..11a0fd39249 100644 --- a/cpp/include/utilities/cython.hpp +++ b/cpp/include/utilities/cython.hpp @@ -356,6 +356,9 @@ struct renum_quad_t { // The number of vertices and edges respectively in the graph represented by // the above arrays. // +// bool is_weighted +// true if the resulting graph object should store edge weights +// // bool transposed // true if the resulting graph object should store a transposed adjacency // matrix @@ -376,6 +379,7 @@ void populate_graph_container(graph_container_t& graph_container, size_t num_global_vertices, size_t num_global_edges, bool sorted_by_degree, + bool is_weighted, bool transposed, bool multi_gpu); diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index d913d02459e..cfded78cf76 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -116,6 +116,7 @@ void populate_graph_container(graph_container_t& graph_container, size_t num_global_vertices, size_t num_global_edges, bool sorted_by_degree, + bool is_weighted, bool transposed, bool multi_gpu) { @@ -152,7 +153,8 @@ void populate_graph_container(graph_container_t& graph_container, graph_container.sorted_by_degree = sorted_by_degree; graph_container.do_expensive_check = do_expensive_check; - experimental::graph_properties_t graph_props{.is_symmetric = false, .is_multigraph = false}; + experimental::graph_properties_t graph_props{ + .is_symmetric = false, .is_multigraph = false, .is_weighted = is_weighted}; graph_container.graph_props = graph_props; graph_container.graph_type = graphTypeEnum::graph_t; @@ -174,7 +176,7 @@ void populate_graph_container_legacy(graph_container_t& graph_container, int* local_offsets) { CUGRAPH_EXPECTS(graph_container.graph_type == graphTypeEnum::null, - "populate_graph_container() can only be called on an empty container."); + "populate_graph_container_legacy() can only be called on an empty container."); // FIXME: This is soon-to-be legacy code left in place until the new graph_t // class is supported everywhere else. 
Remove everything down to the comment From 541467d4177776a34512245c84bddcb0a6b72358 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 1 Apr 2021 14:31:04 -0400 Subject: [PATCH 53/63] update populate_graph_container to take extra is_weighted input parameter --- python/cugraph/community/egonet_wrapper.pyx | 3 +++ python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx | 4 ++++ python/cugraph/dask/community/louvain_wrapper.pyx | 1 + python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx | 4 ++++ python/cugraph/dask/traversal/mg_bfs_wrapper.pyx | 1 + python/cugraph/dask/traversal/mg_sssp_wrapper.pyx | 3 +++ python/cugraph/link_analysis/pagerank_wrapper.pyx | 3 +++ python/cugraph/structure/graph_utilities.pxd | 1 + 8 files changed, 20 insertions(+) diff --git a/python/cugraph/community/egonet_wrapper.pyx b/python/cugraph/community/egonet_wrapper.pyx index ead41705628..798b89a230f 100644 --- a/python/cugraph/community/egonet_wrapper.pyx +++ b/python/cugraph/community/egonet_wrapper.pyx @@ -50,8 +50,10 @@ def egonet(input_graph, vertices, radius=1): if weights is not None: c_edge_weights = weights.__cuda_array_interface__['data'][0] weight_t = weights.dtype + is_weighted = True else: weight_t = np.dtype("float32") + is_weighted = False # Pointers for egonet vertices = vertices.astype('int32') @@ -76,6 +78,7 @@ def egonet(input_graph, vertices, radius=1): num_verts, num_edges, False, + is_weighted, False, False) if(weight_t==np.dtype("float32")): diff --git a/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx b/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx index ccae26fe7e6..b3c0b7a3823 100644 --- a/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx +++ b/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx @@ -52,8 +52,11 @@ def mg_katz_centrality(input_df, if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True + raise NotImplementedError # FIXME: c_edge_weights is always set to NULL else: weight_t = np.dtype("float32") + is_weighted = True if alpha is None: alpha = 0.1 @@ -88,6 +91,7 @@ def mg_katz_centrality(input_df, num_partition_edges, num_global_verts, num_global_edges, True, + is_weighted, True, True) df = cudf.DataFrame() diff --git a/python/cugraph/dask/community/louvain_wrapper.pyx b/python/cugraph/dask/community/louvain_wrapper.pyx index f58630d07aa..ba651aac3ff 100644 --- a/python/cugraph/dask/community/louvain_wrapper.pyx +++ b/python/cugraph/dask/community/louvain_wrapper.pyx @@ -97,6 +97,7 @@ def louvain(input_df, num_partition_edges, num_global_verts, num_global_edges, sorted_by_degree, + True, False, True) # store_transposed, multi_gpu # Create the output dataframe, column lengths must be equal to the number of diff --git a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx index 12f2342559b..7ad1c5979e1 100644 --- a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx +++ b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx @@ -51,8 +51,11 @@ def mg_pagerank(input_df, if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True + raise NotImplementedError # FIXME: c_edge_weights is always set to NULL else: weight_t = np.dtype("float32") + is_weighted = False # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. 
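These wrapper changes derive is_weighted from the presence of edge weights (typically a 'value' column in the edgelist) and thread the flag down to C++, where it becomes the third field of graph_properties_t. A standalone sketch of the struct shape these call sites imply (the real definition lives in cuGraph's experimental graph headers; this layout is an assumption based on the brace-initialization sites in the patches):

    // Assumed three-flag layout; every graph_properties_t{...} site in these
    // patches grew a third argument to populate is_weighted explicitly.
    struct graph_properties_t {
      bool is_symmetric;
      bool is_multigraph;
      bool is_weighted;
    };

    int main()
    {
      bool const test_weighted = true;  // e.g. the edgelist has a weight column
      auto props =
        graph_properties_t{false /*is_symmetric*/, false /*is_multigraph*/, test_weighted};
      return props.is_weighted == test_weighted ? 0 : 1;
    }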
@@ -84,6 +87,7 @@ def mg_pagerank(input_df, num_partition_edges, num_global_verts, num_global_edges, True, + is_weighted, True, True) df = cudf.DataFrame() diff --git a/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx b/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx index 527cb2bcf0a..d63e82bbd14 100644 --- a/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx +++ b/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx @@ -80,6 +80,7 @@ def mg_bfs(input_df, num_partition_edges, num_global_verts, num_global_edges, True, + False, # BFS runs on unweighted graphs False, True) # Generate the cudf.DataFrame result diff --git a/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx b/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx index 15d956836b4..fbcd51d5022 100644 --- a/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx +++ b/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx @@ -46,9 +46,11 @@ def mg_sssp(input_df, if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True else: weights = None weight_t = np.dtype("float32") + is_weighted = False # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. @@ -82,6 +84,7 @@ def mg_sssp(input_df, num_partition_edges, num_global_verts, num_global_edges, True, + is_weighted, False, True) # Generate the cudf.DataFrame result diff --git a/python/cugraph/link_analysis/pagerank_wrapper.pyx b/python/cugraph/link_analysis/pagerank_wrapper.pyx index 81a68d42360..cde25ca0b97 100644 --- a/python/cugraph/link_analysis/pagerank_wrapper.pyx +++ b/python/cugraph/link_analysis/pagerank_wrapper.pyx @@ -71,8 +71,10 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. if weights is not None: c_edge_weights = weights.__cuda_array_interface__['data'][0] weight_t = weights.dtype + is_weighted = True else: weight_t = np.dtype("float32") + is_weighted = False # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. @@ -100,6 +102,7 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. 
num_partition_edges, num_verts, num_edges, False, + is_weighted, True, False) diff --git a/python/cugraph/structure/graph_utilities.pxd b/python/cugraph/structure/graph_utilities.pxd index 10c90f44cb8..081c20cb756 100644 --- a/python/cugraph/structure/graph_utilities.pxd +++ b/python/cugraph/structure/graph_utilities.pxd @@ -50,6 +50,7 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": size_t num_global_vertices, size_t num_global_edges, bool sorted_by_degree, + bool is_weighted, bool transposed, bool multi_gpu) except + From 6c247f5d9840522540ccc166d1fa238bdc2e54e2 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 1 Apr 2021 16:39:45 -0400 Subject: [PATCH 54/63] update python binding for the C++ updates --- cpp/include/utilities/cython.hpp | 7 +- python/cugraph/structure/graph_utilities.pxd | 21 ++--- python/cugraph/structure/renumber_wrapper.pyx | 81 +++++++++---------- 3 files changed, 55 insertions(+), 54 deletions(-) diff --git a/cpp/include/utilities/cython.hpp b/cpp/include/utilities/cython.hpp index 11a0fd39249..547f74a8ff5 100644 --- a/cpp/include/utilities/cython.hpp +++ b/cpp/include/utilities/cython.hpp @@ -182,6 +182,11 @@ struct major_minor_weights_t { sizeof(weight_t)); } + std::unique_ptr> get_edge_counts_wrap(void) // const + { + return std::make_unique>(edge_counts_.begin(), edge_counts_.end()); + } + private: rmm::device_uvector shuffled_major_vertices_; rmm::device_uvector shuffled_minor_vertices_; @@ -492,7 +497,7 @@ std::unique_ptr> call_renumber( raft::handle_t const& handle, vertex_t* shuffled_edgelist_major_vertices /* [INOUT] */, vertex_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - edge_t num_edgelist_edges, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu); diff --git a/python/cugraph/structure/graph_utilities.pxd b/python/cugraph/structure/graph_utilities.pxd index 081c20cb756..6897a50211b 100644 --- a/python/cugraph/structure/graph_utilities.pxd +++ b/python/cugraph/structure/graph_utilities.pxd @@ -107,18 +107,21 @@ cdef extern from "experimental/graph_view.hpp" namespace "cugraph::experimental" # cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": - cdef cppclass major_minor_weights_t[vertex_t, weight_t]: + cdef cppclass major_minor_weights_t[vertex_t, edge_t, weight_t]: major_minor_weights_t(const handle_t &handle) pair[unique_ptr[device_buffer], size_t] get_major_wrap() pair[unique_ptr[device_buffer], size_t] get_minor_wrap() pair[unique_ptr[device_buffer], size_t] get_weights_wrap() + unique_ptr[vector[edge_t]] get_edge_counts_wrap() ctypedef fused shuffled_vertices_t: - major_minor_weights_t[int, float] - major_minor_weights_t[int, double] - major_minor_weights_t[long, float] - major_minor_weights_t[long, double] + major_minor_weights_t[int, int, float] + major_minor_weights_t[int, int, double] + major_minor_weights_t[int, long, float] + major_minor_weights_t[int, long, double] + major_minor_weights_t[long, long, float] + major_minor_weights_t[long, long, double] # 3. 
return type for renumber: # @@ -152,13 +155,12 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": # cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": - cdef unique_ptr[major_minor_weights_t[vertex_t, weight_t]] call_shuffle[vertex_t, edge_t, weight_t]( + cdef unique_ptr[major_minor_weights_t[vertex_t, edge_t, weight_t]] call_shuffle[vertex_t, edge_t, weight_t]( const handle_t &handle, vertex_t *edgelist_major_vertices, vertex_t *edgelist_minor_vertices, weight_t* edgelist_weights, - edge_t num_edges, - bool is_hyper_partitioned) except + + edge_t num_edges) except + # 5. `renumber_edgelist()` wrapper # @@ -168,7 +170,6 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": const handle_t &handle, vertex_t *edgelist_major_vertices, vertex_t *edgelist_minor_vertices, - edge_t num_edges, - bool is_hyper_partitioned, + const vector[edge_t]& edge_counts, bool do_check, bool multi_gpu) except + diff --git a/python/cugraph/structure/renumber_wrapper.pyx b/python/cugraph/structure/renumber_wrapper.pyx index 682c6b32a0f..043f9d20a6f 100644 --- a/python/cugraph/structure/renumber_wrapper.pyx +++ b/python/cugraph/structure/renumber_wrapper.pyx @@ -108,8 +108,6 @@ def renumber(input_df, # maybe use cpdef ? cdef uintptr_t c_major_vertices = major_vertices.__cuda_array_interface__['data'][0] cdef uintptr_t c_minor_vertices = minor_vertices.__cuda_array_interface__['data'][0] - cdef bool is_hyper_partitioned = False # for now - cdef uintptr_t shuffled_major = NULL cdef uintptr_t shuffled_minor = NULL @@ -119,12 +117,14 @@ def renumber(input_df, # maybe use cpdef ? cdef pair[unique_ptr[device_buffer], size_t] pair_original cdef pair[unique_ptr[device_buffer], size_t] pair_partition - # tparams: vertex_t, weight_t: + # tparams: vertex_t, edge_t, weight_t: # - cdef unique_ptr[major_minor_weights_t[int, float]] ptr_shuffled_32_32 - cdef unique_ptr[major_minor_weights_t[int, double]] ptr_shuffled_32_64 - cdef unique_ptr[major_minor_weights_t[long, float]] ptr_shuffled_64_32 - cdef unique_ptr[major_minor_weights_t[long, double]] ptr_shuffled_64_64 + cdef unique_ptr[major_minor_weights_t[int, int, float]] ptr_shuffled_32_32_32 + cdef unique_ptr[major_minor_weights_t[int, int, double]] ptr_shuffled_32_32_64 + cdef unique_ptr[major_minor_weights_t[int, long, float]] ptr_shuffled_32_64_32 + cdef unique_ptr[major_minor_weights_t[int, long, double]] ptr_shuffled_32_64_64 + cdef unique_ptr[major_minor_weights_t[long, long, float]] ptr_shuffled_64_64_32 + cdef unique_ptr[major_minor_weights_t[long, long, double]] ptr_shuffled_64_64_64 # tparams: vertex_t, edge_t: # @@ -143,13 +143,12 @@ def renumber(input_df, # maybe use cpdef ? if ( edge_t == np.dtype("int32")): if( weight_t == np.dtype("float32")): if(is_multi_gpu): - ptr_shuffled_32_32.reset(call_shuffle[int, int, float](deref(handle_ptr), + ptr_shuffled_32_32_32.reset(call_shuffle[int, int, float](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) - shuffled_df = renumber_helper(ptr_shuffled_32_32.get(), vertex_t, weights) + num_partition_edges).release()) + shuffled_df = renumber_helper(ptr_shuffled_32_32_32.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] num_partition_edges = len(shuffled_df) @@ -163,11 +162,12 @@ def renumber(input_df, # maybe use cpdef ? 
shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + edge_counts_32 = move(ptr_shuffled_32_32_32.get().get_edge_counts_wrap()) + ptr_renum_quad_32_32.reset(call_renumber[int, int](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_32.get()), 1, mg_flag).release()) @@ -205,14 +205,13 @@ def renumber(input_df, # maybe use cpdef ? elif( weight_t == np.dtype("float64")): if(is_multi_gpu): - ptr_shuffled_32_64.reset(call_shuffle[int, int, double](deref(handle_ptr), + ptr_shuffled_32_32_64.reset(call_shuffle[int, int, double](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_partition_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_32_64.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_32_32_64.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] num_partition_edges = len(shuffled_df) @@ -226,12 +225,12 @@ def renumber(input_df, # maybe use cpdef ? shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + edge_counts_32 = move(ptr_shuffled_32_32_64.get().get_edge_counts_wrap()) ptr_renum_quad_32_32.reset(call_renumber[int, int](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_32.get()), do_check, mg_flag).release()) @@ -271,14 +270,13 @@ def renumber(input_df, # maybe use cpdef ? elif ( edge_t == np.dtype("int64")): if( weight_t == np.dtype("float32")): if(is_multi_gpu): - ptr_shuffled_32_32.reset(call_shuffle[int, long, float](deref(handle_ptr), + ptr_shuffled_32_64_32.reset(call_shuffle[int, long, float](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_partition_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_32_32.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_32_64_32.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] num_partition_edges = len(shuffled_df) @@ -292,12 +290,12 @@ def renumber(input_df, # maybe use cpdef ? shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + edge_counts_64 = move(ptr_shuffled_32_64_32.get().get_edge_counts_wrap()) ptr_renum_quad_32_64.reset(call_renumber[int, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -335,14 +333,13 @@ def renumber(input_df, # maybe use cpdef ? 
return renumbered_map, shuffled_df elif( weight_t == np.dtype("float64")): if(is_multi_gpu): - ptr_shuffled_32_64.reset(call_shuffle[int, long, double](deref(handle_ptr), + ptr_shuffled_32_64_64.reset(call_shuffle[int, long, double](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_partition_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_32_64.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_32_64_64.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] num_partition_edges = len(shuffled_df) @@ -356,12 +353,12 @@ def renumber(input_df, # maybe use cpdef ? shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + edge_counts_64 = move(ptr_shuffled_32_64_64.get().get_edge_counts_wrap()) ptr_renum_quad_32_64.reset(call_renumber[int, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -401,14 +398,13 @@ def renumber(input_df, # maybe use cpdef ? if ( edge_t == np.dtype("int64")): if( weight_t == np.dtype("float32")): if(is_multi_gpu): - ptr_shuffled_64_32.reset(call_shuffle[long, long, float](deref(handle_ptr), + ptr_shuffled_64_64_32.reset(call_shuffle[long, long, float](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_partition_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_64_32.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_64_64_32.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] num_partition_edges = len(shuffled_df) @@ -422,12 +418,12 @@ def renumber(input_df, # maybe use cpdef ? shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + edge_counts_64 = move(ptr_shuffled_64_64_32.get().get_edge_counts_wrap()) ptr_renum_quad_64_64.reset(call_renumber[long, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -466,14 +462,13 @@ def renumber(input_df, # maybe use cpdef ? elif( weight_t == np.dtype("float64")): if(is_multi_gpu): - ptr_shuffled_64_64.reset(call_shuffle[long, long, double](deref(handle_ptr), + ptr_shuffled_64_64_64.reset(call_shuffle[long, long, double](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_partition_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_64_64.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_64_64_64.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] num_partition_edges = len(shuffled_df) @@ -487,12 +482,12 @@ def renumber(input_df, # maybe use cpdef ? 
shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + edge_counts_64 = move(ptr_shuffled_64_64_64.get().get_edge_counts_wrap()) ptr_renum_quad_64_64.reset(call_renumber[long, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) From 6d65486f684229064bdf76ff341c99f3f7ccd122 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 1 Apr 2021 16:50:15 -0400 Subject: [PATCH 55/63] clang-format --- .../copy_v_transform_reduce_in_out_nbr.cuh | 17 ++++++++--------- cpp/include/utilities/shuffle_comm.cuh | 4 ++-- cpp/src/experimental/bfs.cu | 9 +++------ 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh index 32ca956b535..e6a73a874ae 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh @@ -376,7 +376,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, if (GraphViewType::is_multi_gpu) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto const row_comm_rank = row_comm.get_rank(); - minor_init = (row_comm_rank == 0) ? init : T {}; + minor_init = (row_comm_rank == 0) ? init : T{}; } if (GraphViewType::is_multi_gpu) { @@ -477,14 +477,13 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - device_reduce( - col_comm, - major_buffer_first, - vertex_value_output_first, - matrix_partition.get_major_size(), - raft::comms::op_t::SUM, - i, - handle.get_stream()); + device_reduce(col_comm, + major_buffer_first, + vertex_value_output_first, + matrix_partition.get_major_size(), + raft::comms::op_t::SUM, + i, + handle.get_stream()); } } diff --git a/cpp/include/utilities/shuffle_comm.cuh b/cpp/include/utilities/shuffle_comm.cuh index 867b554ab39..b318009d9bf 100644 --- a/cpp/include/utilities/shuffle_comm.cuh +++ b/cpp/include/utilities/shuffle_comm.cuh @@ -252,8 +252,8 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const &comm, { auto const comm_size = comm.get_size(); - auto d_tx_value_counts = groupby_and_count( - tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream); + auto d_tx_value_counts = + groupby_and_count(tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream); std::vector tx_counts{}; std::vector tx_offsets{}; diff --git a/cpp/src/experimental/bfs.cu b/cpp/src/experimental/bfs.cu index 2b03fa57a5d..9145e3737b6 100644 --- a/cpp/src/experimental/bfs.cu +++ b/cpp/src/experimental/bfs.cu @@ -93,9 +93,7 @@ void bfs(raft::handle_t const &handle, enum class Bucket { cur, num_buckets }; std::vector bucket_sizes(static_cast(Bucket::num_buckets), push_graph_view.get_number_of_local_vertices()); - VertexFrontier(Bucket::num_buckets)> + VertexFrontier(Bucket::num_buckets)> vertex_frontier(handle, bucket_sizes); if (push_graph_view.is_local_vertex_nocheck(source_vertex)) { @@ -139,9 +137,8 @@ void bfs(raft::handle_t const &handle, thrust::make_zip_iterator(thrust::make_tuple(distances, predecessor_first)), vertex_frontier, [depth] __device__(auto v_val, auto pushed_val) { - auto idx = (v_val == invalid_distance) - ? 
static_cast(Bucket::cur) - : VertexFrontier::kInvalidBucketIdx; + auto idx = (v_val == invalid_distance) ? static_cast(Bucket::cur) + : VertexFrontier::kInvalidBucketIdx; return thrust::make_tuple(idx, thrust::make_tuple(depth + 1, pushed_val)); }); From a7b6c8e6d39efbe0b35555c7196131988187ffc6 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 1 Apr 2021 21:25:52 -0400 Subject: [PATCH 56/63] python binding bug fix --- cpp/include/utilities/cython.hpp | 2 +- python/cugraph/structure/renumber_wrapper.pyx | 44 +++++++++++-------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/cpp/include/utilities/cython.hpp b/cpp/include/utilities/cython.hpp index 547f74a8ff5..5d838958bb6 100644 --- a/cpp/include/utilities/cython.hpp +++ b/cpp/include/utilities/cython.hpp @@ -184,7 +184,7 @@ struct major_minor_weights_t { std::unique_ptr> get_edge_counts_wrap(void) // const { - return std::make_unique>(edge_counts_.begin(), edge_counts_.end()); + return std::make_unique>(edge_counts_); } private: diff --git a/python/cugraph/structure/renumber_wrapper.pyx b/python/cugraph/structure/renumber_wrapper.pyx index 043f9d20a6f..33f8e978efb 100644 --- a/python/cugraph/structure/renumber_wrapper.pyx +++ b/python/cugraph/structure/renumber_wrapper.pyx @@ -22,6 +22,7 @@ from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref import numpy as np +from libcpp.memory cimport make_unique from libcpp.utility cimport move from rmm._lib.device_buffer cimport device_buffer, DeviceBuffer @@ -132,6 +133,11 @@ def renumber(input_df, # maybe use cpdef ? cdef unique_ptr[renum_quad_t[int, long]] ptr_renum_quad_32_64 cdef unique_ptr[renum_quad_t[long, long]] ptr_renum_quad_64_64 + # tparam: vertex_t: + # + cdef unique_ptr[vector[int]] edge_counts_32 + cdef unique_ptr[vector[long]] edge_counts_64 + # tparam: vertex_t: # cdef unique_ptr[vector[int]] uniq_partition_vector_32 @@ -157,12 +163,13 @@ def renumber(input_df, # maybe use cpdef ? else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_32 = move(ptr_shuffled_32_32_32.get().get_edge_counts_wrap()) else: shuffled_df = input_df - + edge_counts_32 = make_unique[vector[int]](1, num_partition_edges) + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] - edge_counts_32 = move(ptr_shuffled_32_32_32.get().get_edge_counts_wrap()) ptr_renum_quad_32_32.reset(call_renumber[int, int](deref(handle_ptr), shuffled_major, @@ -190,8 +197,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_32.get().get_num_vertices()), dtype=vertex_t) # create new cudf df # @@ -220,12 +226,13 @@ def renumber(input_df, # maybe use cpdef ? 
else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_32 = move(ptr_shuffled_32_32_64.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_32 = make_unique[vector[int]](1, num_partition_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] - edge_counts_32 = move(ptr_shuffled_32_32_64.get().get_edge_counts_wrap()) ptr_renum_quad_32_32.reset(call_renumber[int, int](deref(handle_ptr), shuffled_major, @@ -253,8 +260,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_32.get().get_num_vertices()), dtype=vertex_t) # create new cudf df @@ -285,12 +291,13 @@ def renumber(input_df, # maybe use cpdef ? else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_32_64_32.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_partition_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] - edge_counts_64 = move(ptr_shuffled_32_64_32.get().get_edge_counts_wrap()) ptr_renum_quad_32_64.reset(call_renumber[int, long](deref(handle_ptr), shuffled_major, @@ -318,8 +325,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df @@ -348,12 +354,13 @@ def renumber(input_df, # maybe use cpdef ? else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_32_64_64.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_partition_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] - edge_counts_64 = move(ptr_shuffled_32_64_64.get().get_edge_counts_wrap()) ptr_renum_quad_32_64.reset(call_renumber[int, long](deref(handle_ptr), shuffled_major, @@ -381,8 +388,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df # @@ -413,12 +419,13 @@ def renumber(input_df, # maybe use cpdef ? 
else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_64_64_32.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_partition_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] - edge_counts_64 = move(ptr_shuffled_64_64_32.get().get_edge_counts_wrap()) ptr_renum_quad_64_64.reset(call_renumber[long, long](deref(handle_ptr), shuffled_major, @@ -446,8 +453,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_64.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_64.get()[0].at(0), - uniq_partition_vector_64.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_64_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df @@ -477,12 +483,13 @@ def renumber(input_df, # maybe use cpdef ? else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_64_64_64.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_partition_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] - edge_counts_64 = move(ptr_shuffled_64_64_64.get().get_edge_counts_wrap()) ptr_renum_quad_64_64.reset(call_renumber[long, long](deref(handle_ptr), shuffled_major, @@ -510,8 +517,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_64.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_64.get()[0].at(0), - uniq_partition_vector_64.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_64_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df From 496ff0348f6e385bc5d33afdea58e7d3cab0f3bc Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 1 Apr 2021 23:19:30 -0400 Subject: [PATCH 57/63] rename num_partition_edges to num_local_edges as with the new partitioning scheme, there can be more than one partition per GPU --- cpp/include/utilities/cython.hpp | 4 +- cpp/src/utilities/cython.cu | 8 ++-- python/cugraph/community/egonet_wrapper.pyx | 4 +- .../centrality/mg_katz_centrality_wrapper.pyx | 4 +- .../dask/community/louvain_wrapper.pyx | 6 +-- .../link_analysis/mg_pagerank_wrapper.pyx | 4 +- .../cugraph/dask/traversal/mg_bfs_wrapper.pyx | 4 +- .../dask/traversal/mg_sssp_wrapper.pyx | 4 +- .../link_analysis/pagerank_wrapper.pyx | 5 +-- python/cugraph/structure/graph_utilities.pxd | 2 +- python/cugraph/structure/renumber_wrapper.pyx | 38 +++++++++---------- 11 files changed, 41 insertions(+), 42 deletions(-) diff --git a/cpp/include/utilities/cython.hpp b/cpp/include/utilities/cython.hpp index 5d838958bb6..d8c476760f0 100644 --- a/cpp/include/utilities/cython.hpp +++ b/cpp/include/utilities/cython.hpp @@ -93,7 +93,7 @@ struct graph_container_t { void* weights; void* vertex_partition_offsets; - size_t num_partition_edges; + size_t num_local_edges; size_t num_global_vertices; size_t num_global_edges; numberTypeEnum vertexType; @@ -380,7 +380,7 @@ void populate_graph_container(graph_container_t& graph_container, numberTypeEnum vertexType, numberTypeEnum edgeType, numberTypeEnum weightType, - size_t 
num_partition_edges, + size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, bool sorted_by_degree, diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index cfded78cf76..8278c38f8d4 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -49,7 +49,7 @@ create_graph(raft::handle_t const& handle, graph_container_t const& graph_contai {{reinterpret_cast(graph_container.src_vertices), reinterpret_cast(graph_container.dst_vertices), reinterpret_cast(graph_container.weights), - static_cast(graph_container.num_partition_edges)}}); + static_cast(graph_container.num_local_edges)}}); std::vector partition_offsets_vector( reinterpret_cast(graph_container.vertex_partition_offsets), @@ -88,7 +88,7 @@ create_graph(raft::handle_t const& handle, graph_container_t const& graph_contai reinterpret_cast(graph_container.src_vertices), reinterpret_cast(graph_container.dst_vertices), reinterpret_cast(graph_container.weights), - static_cast(graph_container.num_partition_edges)}; + static_cast(graph_container.num_local_edges)}; return std::make_unique>( handle, edgelist, @@ -112,7 +112,7 @@ void populate_graph_container(graph_container_t& graph_container, numberTypeEnum vertexType, numberTypeEnum edgeType, numberTypeEnum weightType, - size_t num_partition_edges, + size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, bool sorted_by_degree, @@ -142,7 +142,7 @@ void populate_graph_container(graph_container_t& graph_container, graph_container.src_vertices = src_vertices; graph_container.dst_vertices = dst_vertices; graph_container.weights = weights; - graph_container.num_partition_edges = num_partition_edges; + graph_container.num_local_edges = num_local_edges; graph_container.num_global_vertices = num_global_vertices; graph_container.num_global_edges = num_global_edges; graph_container.vertexType = vertexType; diff --git a/python/cugraph/community/egonet_wrapper.pyx b/python/cugraph/community/egonet_wrapper.pyx index 798b89a230f..23aa159314f 100644 --- a/python/cugraph/community/egonet_wrapper.pyx +++ b/python/cugraph/community/egonet_wrapper.pyx @@ -42,7 +42,7 @@ def egonet(input_graph, vertices, radius=1): num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) - num_partition_edges = num_edges + num_local_edges = num_edges cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -74,7 +74,7 @@ def egonet(input_graph, vertices, radius=1): ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_verts, num_edges, False, diff --git a/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx b/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx index b3c0b7a3823..8ef942810a2 100644 --- a/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx +++ b/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx @@ -70,7 +70,7 @@ def mg_katz_centrality(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -88,7 +88,7 @@ def mg_katz_centrality(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - 
num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, is_weighted, diff --git a/python/cugraph/dask/community/louvain_wrapper.pyx b/python/cugraph/dask/community/louvain_wrapper.pyx index ba651aac3ff..a3cebeac272 100644 --- a/python/cugraph/dask/community/louvain_wrapper.pyx +++ b/python/cugraph/dask/community/louvain_wrapper.pyx @@ -56,12 +56,12 @@ def louvain(input_df, src = input_df['src'] dst = input_df['dst'] - num_partition_edges = len(src) + num_local_edges = len(src) if "value" in input_df.columns: weights = input_df['value'] else: - weights = cudf.Series(np.full(num_partition_edges, 1.0, dtype=np.float32)) + weights = cudf.Series(np.full(num_local_edges, 1.0, dtype=np.float32)) vertex_t = src.dtype if num_global_edges > (2**31 - 1): @@ -94,7 +94,7 @@ def louvain(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, sorted_by_degree, True, diff --git a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx index 7ad1c5979e1..7839a40d763 100644 --- a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx +++ b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx @@ -65,7 +65,7 @@ def mg_pagerank(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -84,7 +84,7 @@ def mg_pagerank(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, is_weighted, diff --git a/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx b/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx index d63e82bbd14..44630ba5fb3 100644 --- a/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx +++ b/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx @@ -58,7 +58,7 @@ def mg_bfs(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -77,7 +77,7 @@ def mg_bfs(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, False, # BFS runs on unweighted graphs diff --git a/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx b/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx index fbcd51d5022..82a4ebe04d6 100644 --- a/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx +++ b/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx @@ -60,7 +60,7 @@ def mg_sssp(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -81,7 +81,7 @@ def mg_sssp(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, 
                             num_global_edges,
                             True, is_weighted,

diff --git a/python/cugraph/link_analysis/pagerank_wrapper.pyx b/python/cugraph/link_analysis/pagerank_wrapper.pyx
index cde25ca0b97..2c619a052ec 100644
--- a/python/cugraph/link_analysis/pagerank_wrapper.pyx
+++ b/python/cugraph/link_analysis/pagerank_wrapper.pyx
@@ -42,7 +42,7 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1.
     num_verts = input_graph.number_of_vertices()
     num_edges = input_graph.number_of_edges(directed_edges=True)
     # FIXME: needs to be edge_t type not int
-    cdef int num_partition_edges = len(src)
+    cdef int num_local_edges = len(src)
 
     df = cudf.DataFrame()
     df['vertex'] = cudf.Series(np.arange(num_verts, dtype=np.int32))
@@ -98,8 +98,7 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1.
                              ((numberTypeEnum.int32Type)),
                              ((numberTypeEnum.int32Type)),
                              ((numberTypeMap[weight_t])),
-                             #num_verts, num_edges,
-                             num_partition_edges,
+                             num_local_edges,
                              num_verts, num_edges,
                              False, is_weighted,

diff --git a/python/cugraph/structure/graph_utilities.pxd b/python/cugraph/structure/graph_utilities.pxd
index 6897a50211b..b169e42ccf8 100644
--- a/python/cugraph/structure/graph_utilities.pxd
+++ b/python/cugraph/structure/graph_utilities.pxd
@@ -46,7 +46,7 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython":
         numberTypeEnum vertexType,
         numberTypeEnum edgeType,
         numberTypeEnum weightType,
-        size_t num_partition_edges,
+        size_t num_local_edges,
         size_t num_global_vertices,
         size_t num_global_edges,
         bool sorted_by_degree,

diff --git a/python/cugraph/structure/renumber_wrapper.pyx b/python/cugraph/structure/renumber_wrapper.pyx
index 33f8e978efb..99626cdee08 100644
--- a/python/cugraph/structure/renumber_wrapper.pyx
+++ b/python/cugraph/structure/renumber_wrapper.pyx
@@ -104,7 +104,7 @@ def renumber(input_df, # maybe use cpdef ?
         raise Exception("Incompatible vertex_t and edge_t types.")
 
     # FIXME: needs to be edge_t type not int
-    cdef int num_partition_edges = len(major_vertices)
+    cdef int num_local_edges = len(major_vertices)
 
     cdef uintptr_t c_major_vertices = major_vertices.__cuda_array_interface__['data'][0]
     cdef uintptr_t c_minor_vertices = minor_vertices.__cuda_array_interface__['data'][0]
@@ -153,11 +153,11 @@ def renumber(input_df, # maybe use cpdef ?
                                                 c_major_vertices,
                                                 c_minor_vertices,
                                                 c_edge_weights,
-                                                num_partition_edges).release())
+                                                num_local_edges).release())
             shuffled_df = renumber_helper(ptr_shuffled_32_32_32.get(), vertex_t, weights)
             major_vertices = shuffled_df['major_vertices']
             minor_vertices = shuffled_df['minor_vertices']
-            num_partition_edges = len(shuffled_df)
+            num_local_edges = len(shuffled_df)
             if not transposed:
                 major = 'src'; minor = 'dst'
             else:
@@ -166,7 +166,7 @@ def renumber(input_df, # maybe use cpdef ?
             edge_counts_32 = move(ptr_shuffled_32_32_32.get().get_edge_counts_wrap())
         else:
             shuffled_df = input_df
-            edge_counts_32 = make_unique[vector[int]](1, num_partition_edges)
+            edge_counts_32 = make_unique[vector[int]](1, num_local_edges)
 
         shuffled_major = major_vertices.__cuda_array_interface__['data'][0]
         shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0]
@@ -215,12 +215,12 @@ def renumber(input_df, # maybe use cpdef ?
                                                c_major_vertices,
                                                c_minor_vertices,
                                                c_edge_weights,
-                                                num_partition_edges).release())
+                                                num_local_edges).release())
            shuffled_df = renumber_helper(ptr_shuffled_32_32_64.get(), vertex_t, weights)
            major_vertices = shuffled_df['major_vertices']
            minor_vertices = shuffled_df['minor_vertices']
-            num_partition_edges = len(shuffled_df)
+            num_local_edges = len(shuffled_df)
            if not transposed:
                major = 'src'; minor = 'dst'
            else:
@@ -229,7 +229,7 @@ def renumber(input_df, # maybe use cpdef ?
            edge_counts_32 = move(ptr_shuffled_32_32_64.get().get_edge_counts_wrap())
        else:
            shuffled_df = input_df
-            edge_counts_32 = make_unique[vector[int]](1, num_partition_edges)
+            edge_counts_32 = make_unique[vector[int]](1, num_local_edges)
 
        shuffled_major = major_vertices.__cuda_array_interface__['data'][0]
        shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0]
@@ -280,12 +280,12 @@ def renumber(input_df, # maybe use cpdef ?
                                                c_major_vertices,
                                                c_minor_vertices,
                                                c_edge_weights,
-                                                num_partition_edges).release())
+                                                num_local_edges).release())
            shuffled_df = renumber_helper(ptr_shuffled_32_64_32.get(), vertex_t, weights)
            major_vertices = shuffled_df['major_vertices']
            minor_vertices = shuffled_df['minor_vertices']
-            num_partition_edges = len(shuffled_df)
+            num_local_edges = len(shuffled_df)
            if not transposed:
                major = 'src'; minor = 'dst'
            else:
@@ -294,7 +294,7 @@ def renumber(input_df, # maybe use cpdef ?
            edge_counts_64 = move(ptr_shuffled_32_64_32.get().get_edge_counts_wrap())
        else:
            shuffled_df = input_df
-            edge_counts_64 = make_unique[vector[long]](1, num_partition_edges)
+            edge_counts_64 = make_unique[vector[long]](1, num_local_edges)
 
        shuffled_major = major_vertices.__cuda_array_interface__['data'][0]
        shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0]
@@ -343,12 +343,12 @@ def renumber(input_df, # maybe use cpdef ?
                                                c_major_vertices,
                                                c_minor_vertices,
                                                c_edge_weights,
-                                                num_partition_edges).release())
+                                                num_local_edges).release())
            shuffled_df = renumber_helper(ptr_shuffled_32_64_64.get(), vertex_t, weights)
            major_vertices = shuffled_df['major_vertices']
            minor_vertices = shuffled_df['minor_vertices']
-            num_partition_edges = len(shuffled_df)
+            num_local_edges = len(shuffled_df)
            if not transposed:
                major = 'src'; minor = 'dst'
            else:
@@ -357,7 +357,7 @@ def renumber(input_df, # maybe use cpdef ?
            edge_counts_64 = move(ptr_shuffled_32_64_64.get().get_edge_counts_wrap())
        else:
            shuffled_df = input_df
-            edge_counts_64 = make_unique[vector[long]](1, num_partition_edges)
+            edge_counts_64 = make_unique[vector[long]](1, num_local_edges)
 
        shuffled_major = major_vertices.__cuda_array_interface__['data'][0]
        shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0]
@@ -408,12 +408,12 @@ def renumber(input_df, # maybe use cpdef ?
                                                c_major_vertices,
                                                c_minor_vertices,
                                                c_edge_weights,
-                                                num_partition_edges).release())
+                                                num_local_edges).release())
            shuffled_df = renumber_helper(ptr_shuffled_64_64_32.get(), vertex_t, weights)
            major_vertices = shuffled_df['major_vertices']
            minor_vertices = shuffled_df['minor_vertices']
-            num_partition_edges = len(shuffled_df)
+            num_local_edges = len(shuffled_df)
            if not transposed:
                major = 'src'; minor = 'dst'
            else:
@@ -422,7 +422,7 @@ def renumber(input_df, # maybe use cpdef ?
            edge_counts_64 = move(ptr_shuffled_64_64_32.get().get_edge_counts_wrap())
        else:
            shuffled_df = input_df
-            edge_counts_64 = make_unique[vector[long]](1, num_partition_edges)
+            edge_counts_64 = make_unique[vector[long]](1, num_local_edges)
 
        shuffled_major = major_vertices.__cuda_array_interface__['data'][0]
        shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0]
@@ -472,12 +472,12 @@ def renumber(input_df, # maybe use cpdef ?
                                                c_major_vertices,
                                                c_minor_vertices,
                                                c_edge_weights,
-                                                num_partition_edges).release())
+                                                num_local_edges).release())
            shuffled_df = renumber_helper(ptr_shuffled_64_64_64.get(), vertex_t, weights)
            major_vertices = shuffled_df['major_vertices']
            minor_vertices = shuffled_df['minor_vertices']
-            num_partition_edges = len(shuffled_df)
+            num_local_edges = len(shuffled_df)
            if not transposed:
                major = 'src'; minor = 'dst'
            else:
@@ -486,7 +486,7 @@ def renumber(input_df, # maybe use cpdef ?
            edge_counts_64 = move(ptr_shuffled_64_64_64.get().get_edge_counts_wrap())
        else:
            shuffled_df = input_df
-            edge_counts_64 = make_unique[vector[long]](1, num_partition_edges)
+            edge_counts_64 = make_unique[vector[long]](1, num_local_edges)
 
        shuffled_major = major_vertices.__cuda_array_interface__['data'][0]
        shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0]

From 156aa3d45e35ebd39ff45c0d8a3ebacec6a95050 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Fri, 2 Apr 2021 10:51:04 -0400
Subject: [PATCH 58/63] python binding bug fix in handling weights

---
 .../cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx | 5 ++++-
 python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx  | 3 +++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx b/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx
index 8ef942810a2..5fb9de788cf 100644
--- a/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx
+++ b/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx
@@ -55,8 +55,9 @@ def mg_katz_centrality(input_df,
        is_weighted = True
        raise NotImplementedError # FIXME: c_edge_weights is always set to NULL
    else:
+        weights = None
        weight_t = np.dtype("float32")
-        is_weighted = True
+        is_weighted = False
 
    if alpha is None:
        alpha = 0.1
@@ -75,6 +76,8 @@ def mg_katz_centrality(input_df,
    cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0]
    cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0]
    cdef uintptr_t c_edge_weights = NULL
+    if weights is not None:
+        c_edge_weights = weights.__cuda_array_interface__['data'][0]
 
    # FIXME: data is on device, move to host (to_pandas()), convert to np array and access pointer to pass to C
    vertex_partition_offsets_host = vertex_partition_offsets.values_host

diff --git a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx
index 7839a40d763..c2f92f0f33b 100644
--- a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx
+++ b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx
@@ -54,6 +54,7 @@ def mg_pagerank(input_df,
        is_weighted = True
        raise NotImplementedError # FIXME: c_edge_weights is always set to NULL
    else:
+        weights = None
        weight_t = np.dtype("float32")
        is_weighted = False
 
@@ -70,6 +71,8 @@ def mg_pagerank(input_df,
    cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0]
    cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0]
    cdef uintptr_t c_edge_weights = NULL
+    if weights is not None:
+        c_edge_weights = weights.__cuda_array_interface__['data'][0]
 
    # FIXME: data is on device, move to host (to_pandas()), convert to np array and access pointer to pass to C
    vertex_partition_offsets_host = vertex_partition_offsets.values_host
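The fix above works because NULL is a legitimate weight pointer for an unweighted graph: the wrappers now populate c_edge_weights only when a weight column actually exists, and set is_weighted consistently. A minimal C++ sketch of the receiving-side convention (illustrative only; optional_weights is a hypothetical helper, not part of this patch):

    // Hypothetical helper: interpret a possibly-null device pointer as optional
    // weights. Callers must branch on is_weighted instead of dereferencing
    // unconditionally, as create_graph does in the following commit.
    template <typename weight_t>
    weight_t const* optional_weights(void const* weights, bool is_weighted)
    {
      return is_weighted ? static_cast<weight_t const*>(weights) : nullptr;
    }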

From 4d77b8416958e649a411a7279be1be19f6e0c2b0 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Fri, 2 Apr 2021 14:42:47 -0400
Subject: [PATCH 59/63] bug fix

---
 cpp/src/utilities/cython.cu | 112 ++++++++++++++++++++++++++++++++----
 1 file changed, 101 insertions(+), 11 deletions(-)

diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu
index 8278c38f8d4..4a2b98ea815 100644
--- a/cpp/src/utilities/cython.cu
+++ b/cpp/src/utilities/cython.cu
@@ -20,22 +20,101 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
 #include
+#include
+#include
+
 #include
+#include
+#include
 #include
+#include
 #include
+#include
+#include
+
+#include
+#include
 
 namespace cugraph {
 namespace cython {
 namespace detail {
 
-// FIXME: Add description of this function
+// workaround for CUDA extended lambda restrictions
+template
+struct compute_local_partition_id_t {
+  vertex_t const* lasts{nullptr};
+  size_t num_local_partitions{0};
+
+  __device__ size_t operator()(vertex_t v)
+  {
+    for (size_t i = 0; i < num_local_partitions; ++i) {
+      if (v < lasts[i]) { return i; }
+    }
+    return num_local_partitions;
+  }
+};
+
+// FIXME: this is unnecessary if edge_counts_ in the major_minor_weights_t object returned by
+// call_shuffle() is passed back; this should be fixed. This code assumes that the entire set of
+// edges for each partition is stored consecutively.
+template
+std::vector compute_edge_counts(raft::handle_t const& handle,
+                                graph_container_t const& graph_container)
+{
+  auto num_local_partitions = static_cast(graph_container.col_comm_size);
+
+  std::vector partition_offsets_vector(
+    reinterpret_cast(graph_container.vertex_partition_offsets),
+    reinterpret_cast(graph_container.vertex_partition_offsets) +
+      (graph_container.row_comm_size * graph_container.col_comm_size) + 1);
+
+  std::vector h_lasts(num_local_partitions);
+  for (size_t i = 0; i < h_lasts.size(); ++i) {
+    h_lasts[i] = partition_offsets_vector[graph_container.row_comm_size * (i + 1)];
+  }
+  rmm::device_uvector d_lasts(h_lasts.size(), handle.get_stream());
+  raft::update_device(d_lasts.data(), h_lasts.data(), h_lasts.size(), handle.get_stream());
+  auto major_vertices = transposed
+                          ? reinterpret_cast(graph_container.dst_vertices)
+                          : reinterpret_cast(graph_container.src_vertices);
+  auto key_first = thrust::make_transform_iterator(
+    major_vertices, compute_local_partition_id_t{d_lasts.data(), num_local_partitions});
+  rmm::device_uvector d_local_partition_ids(num_local_partitions, handle.get_stream());
+  rmm::device_uvector d_edge_counts(d_local_partition_ids.size(), handle.get_stream());
+  auto it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                                  key_first,
+                                  key_first + graph_container.num_local_edges,
+                                  thrust::make_constant_iterator(edge_t{1}),
+                                  d_local_partition_ids.begin(),
+                                  d_edge_counts.begin());
+  if (static_cast(thrust::distance(d_local_partition_ids.begin(), thrust::get<0>(it))) <
+      num_local_partitions) {
+    rmm::device_uvector d_counts(num_local_partitions, handle.get_stream());
+    thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                 d_counts.begin(),
+                 d_counts.end(),
+                 edge_t{0});
+    thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                    d_edge_counts.begin(),
+                    thrust::get<1>(it),
+                    d_local_partition_ids.begin(),
+                    d_counts.begin());
+    d_edge_counts = std::move(d_counts);
+  }
+  std::vector h_edge_counts(num_local_partitions, 0);
+  raft::update_host(
+    h_edge_counts.data(), d_edge_counts.data(), d_edge_counts.size(), handle.get_stream());
+  handle.get_stream_view().synchronize();
+
+  return h_edge_counts;
+}
+
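// [Illustrative sketch, not part of the original patch.] The fill/scatter fixup in
// compute_edge_counts above exists because thrust::reduce_by_key compacts its output:
// it emits one (partition id, count) pair only for partitions that actually own edges,
// so empty partitions are missing and must be restored in a zero-filled array. A
// host-side analogue of that densify step, assuming ids are in [0, num_partitions):
//
//   std::vector<size_t> densify(std::vector<std::pair<size_t, size_t>> const& pairs,
//                               size_t num_partitions)
//   {
//     std::vector<size_t> counts(num_partitions, 0);               // thrust::fill
//     for (auto const& p : pairs) { counts[p.first] = p.second; }  // thrust::scatter
//     return counts;
//   }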
 template
 std::enable_if_t>
 create_graph(raft::handle_t const& handle, graph_container_t const& graph_container)
 {
-  std::vector> edgelist(
-    {{reinterpret_cast(graph_container.src_vertices),
-      reinterpret_cast(graph_container.dst_vertices),
-      reinterpret_cast(graph_container.weights),
-      static_cast(graph_container.num_local_edges)}});
+  auto num_local_partitions = static_cast(graph_container.col_comm_size);
 
   std::vector partition_offsets_vector(
     reinterpret_cast(graph_container.vertex_partition_offsets),
     reinterpret_cast(graph_container.vertex_partition_offsets) +
       (graph_container.row_comm_size * graph_container.col_comm_size) + 1);
 
+  auto edge_counts = compute_edge_counts(handle, graph_container);
+
+  std::vector displacements(edge_counts.size(), 0);
+  std::partial_sum(edge_counts.begin(), edge_counts.end() - 1, displacements.begin() + 1);
+
+  std::vector> edgelists(num_local_partitions);
+  for (size_t i = 0; i < edgelists.size(); ++i) {
+    edgelists[i] = cugraph::experimental::edgelist_t{
+      reinterpret_cast(graph_container.src_vertices) + displacements[i],
+      reinterpret_cast(graph_container.dst_vertices) + displacements[i],
+      graph_container.graph_props.is_weighted
+        ? reinterpret_cast(graph_container.weights) + displacements[i]
+        : static_cast(nullptr),
+      edge_counts[i]};
+  }
+
   experimental::partition_t partition(partition_offsets_vector,
                                       graph_container.row_comm_size,
                                       graph_container.col_comm_size,
@@ -64,14 +156,12 @@ create_graph(raft::handle_t const& handle, graph_container_t const& graph_contai
 
   return std::make_unique>(
     handle,
-    edgelist,
+    edgelists,
     partition,
     static_cast(graph_container.num_global_vertices),
     static_cast(graph_container.num_global_edges),
     graph_container.graph_props,
-    // FIXME: This currently fails if sorted_by_degree is true...
-    // graph_container.sorted_by_degree,
-    false,
+    true,
     graph_container.do_expensive_check);
 }
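A note on the edge list splitting above: because compute_edge_counts assumes the flat edge array is ordered by local partition, an exclusive prefix sum over the per-partition counts is enough to recover each partition's starting offset. A self-contained sketch of that decomposition (illustrative; the span type and function name are not from the patch, and at least one partition is assumed, as in create_graph where the count equals col_comm_size):

    #include <cstddef>
    #include <numeric>
    #include <vector>

    struct span { std::size_t offset; std::size_t count; };

    // Carve a partition-ordered flat array into per-partition subranges.
    std::vector<span> split_by_counts(std::vector<std::size_t> const& edge_counts)
    {
      std::vector<std::size_t> displacements(edge_counts.size(), 0);
      std::partial_sum(edge_counts.begin(), edge_counts.end() - 1, displacements.begin() + 1);

      std::vector<span> spans(edge_counts.size());
      for (std::size_t i = 0; i < spans.size(); ++i) {
        spans[i] = {displacements[i], edge_counts[i]};
      }
      return spans;
    }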

From fdb309a02c4a92efd7c66b1f78143a84c2cf468c Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Sat, 3 Apr 2021 22:58:09 -0400
Subject: [PATCH 60/63] bug fix for a corner case in expensive check for renumber_edgelist

---
 cpp/src/experimental/renumber_edgelist.cu | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu
index 4a2b0180e33..efac52d464b 100644
--- a/cpp/src/experimental/renumber_edgelist.cu
+++ b/cpp/src/experimental/renumber_edgelist.cu
@@ -357,7 +357,12 @@ void expensive_check_edgelist(
     "Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be "
     "pre-shuffled.");
 
-  if (local_vertices != nullptr) {
+  auto aggregate_vertexlist_size = host_scalar_allreduce(
+    comm,
+    local_vertices != nullptr ? num_local_vertices : vertex_t{0},
+    handle.get_stream());  // local_vertices != nullptr is insufficient in multi-GPU as only a
+                           // subset of GPUs may have a non-zero number of vertices
+  if (aggregate_vertexlist_size > 0) {
     auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
     auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
 
@@ -428,18 +433,6 @@ void expensive_check_edgelist(
     assert(edgelist_minor_vertices.size() == 1);
 
     if (local_vertices != nullptr) {
-      CUGRAPH_EXPECTS(
-        thrust::count_if(
-          rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-          edgelist_major_vertices[0],
-          edgelist_major_vertices[0] + edgelist_edge_counts[0],
-          [num_local_vertices,
-           sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) {
-            return !thrust::binary_search(
-              thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v);
-          }) == 0,
-        "Invalid input argument: edgelist_major_vertices has invalid vertex ID(s).");
-
       auto edge_first = thrust::make_zip_iterator(
         thrust::make_tuple(edgelist_major_vertices[0], edgelist_minor_vertices[0]));
       CUGRAPH_EXPECTS(
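The corner case fixed above is a classic multi-GPU pitfall: the guarded block performs collective communication, so every rank must take the same branch. Checking local_vertices != nullptr is a rank-local condition, and a rank holding zero vertices would skip the collectives its peers enter, hanging the job. A minimal sketch of the pattern (illustrative; allreduce_sum stands in for cugraph's host_scalar_allreduce, which sums a host scalar across all ranks):

    #include <cstdint>

    std::int64_t allreduce_sum(std::int64_t local);  // assumed collective, provided elsewhere

    void validate(std::int64_t const* local_vertices, std::int64_t num_local_vertices)
    {
      // Decide on an allreduced value so all ranks agree on whether to run the checks.
      auto aggregate = allreduce_sum(local_vertices != nullptr ? num_local_vertices : 0);
      if (aggregate > 0) {
        // collective validation; safe because every rank reaches this branch together
      }
    }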

From d40320ce748b2c73618d8a0a4052d9522492c622 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Sat, 3 Apr 2021 23:37:54 -0400
Subject: [PATCH 61/63] workaround for cuco static_map kernel launch with 0 grid size

---
 ...ransform_reduce_key_aggregated_out_nbr.cuh | 10 +++-
 cpp/include/utilities/collect_comm.cuh        | 54 ++++++++++++-----
 cpp/src/experimental/relabel.cu               | 40 ++++++++----
 cpp/src/experimental/renumber_edgelist.cu     | 52 +++++++++++-----
 cpp/src/experimental/renumber_utils.cu        | 34 +++++++---
 5 files changed, 145 insertions(+), 45 deletions(-)

diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
index 53a866fab39..22dc2041793 100644
--- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
+++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
@@ -270,7 +270,9 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
       [] __device__(auto val) {
         return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
       });
-    kv_map_ptr->insert(pair_first, pair_first + map_keys.size());
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (map_keys.size()) { kv_map_ptr->insert(pair_first, pair_first + map_keys.size()); }
   } else {
     handle.get_stream_view().synchronize();  // cuco::static_map currently does not take stream
 
@@ -291,7 +293,11 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
       [] __device__(auto val) {
         return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
       });
-    kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last));
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (thrust::distance(map_key_first, map_key_last) > 0) {
+      kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last));
+    }
   }
 
   // 2. aggregate each vertex out-going edges based on keys and transform-reduce.
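The same guard recurs in every hunk of this commit, so it is worth stating once: a CUDA kernel launched with a zero-dimension grid fails with cudaErrorInvalidConfiguration, and cuco::static_map derives its grid size from the input length, so empty inputs must skip the call entirely. In isolation (illustrative sketch; insert_if_nonempty is not part of the patch):

    #include <cstddef>

    // Wraps a cuco::static_map::insert-style call; a no-op for empty ranges by design.
    template <typename Map, typename PairIterator>
    void insert_if_nonempty(Map& map, PairIterator first, std::size_t n)
    {
      if (n > 0) { map.insert(first, first + n); }
    }

Centralizing the guard in a helper like this would also keep the FIXME in a single place until cuco handles empty inputs itself.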

diff --git a/cpp/include/utilities/collect_comm.cuh b/cpp/include/utilities/collect_comm.cuh
index 1e5f426d10f..481717d7c38 100644
--- a/cpp/include/utilities/collect_comm.cuh
+++ b/cpp/include/utilities/collect_comm.cuh
@@ -78,7 +78,11 @@ collect_values_for_keys(raft::comms::comms_t const &comm,
      [] __device__(auto val) {
        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
      });
-    kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last));
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (thrust::distance(map_key_first, map_key_last) > 0) {
+      kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last));
+    }
  }
 
  // 2. collect values for the unique keys in [collect_key_first, collect_key_last)
@@ -109,8 +113,12 @@ collect_values_for_keys(raft::comms::comms_t const &comm,
 
    CUDA_TRY(cudaStreamSynchronize(stream));  // cuco::static_map currently does not take stream
 
-    kv_map_ptr->find(
-      rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin());
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (rx_unique_keys.size() > 0) {
+      kv_map_ptr->find(
+        rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin());
+    }
 
    rmm::device_uvector rx_values_for_unique_keys(0, stream);
    std::tie(rx_values_for_unique_keys, std::ignore) =
@@ -142,15 +150,21 @@ collect_values_for_keys(raft::comms::comms_t const &comm,
        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
      });
 
-    kv_map_ptr->insert(pair_first, pair_first + unique_keys.size());
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (unique_keys.size() > 0) { kv_map_ptr->insert(pair_first, pair_first + unique_keys.size()); }
  }
 
  // 4. find values for [collect_key_first, collect_key_last)
 
  auto value_buffer = allocate_dataframe_buffer(
    thrust::distance(collect_key_first, collect_key_last), stream);
-  kv_map_ptr->find(
-    collect_key_first, collect_key_last, get_dataframe_buffer_begin(value_buffer));
+  // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+  // size is 0; this leads to cudaErrorInvalidConfiguration.
+  if (thrust::distance(collect_key_first, collect_key_last) > 0) {
+    kv_map_ptr->find(
+      collect_key_first, collect_key_last, get_dataframe_buffer_begin(value_buffer));
+  }
 
  return value_buffer;
 }
@@ -200,7 +214,11 @@ collect_values_for_unique_keys(raft::comms::comms_t const &comm,
      [] __device__(auto val) {
        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
      });
-    kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last));
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (thrust::distance(map_key_first, map_key_last)) {
+      kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last));
+    }
  }
 
  // 2. collect values for the unique keys in [collect_unique_key_first, collect_unique_key_last)
@@ -227,8 +245,12 @@ collect_values_for_unique_keys(raft::comms::comms_t const &comm,
 
    CUDA_TRY(cudaStreamSynchronize(stream));  // cuco::static_map currently does not take stream
 
-    kv_map_ptr->find(
-      rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin());
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (rx_unique_keys.size() > 0) {
+      kv_map_ptr->find(
+        rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin());
+    }
 
    rmm::device_uvector rx_values_for_unique_keys(0, stream);
    std::tie(rx_values_for_unique_keys, std::ignore) =
@@ -260,16 +282,22 @@ collect_values_for_unique_keys(raft::comms::comms_t const &comm,
        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
      });
 
-    kv_map_ptr->insert(pair_first, pair_first + unique_keys.size());
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (unique_keys.size() > 0) { kv_map_ptr->insert(pair_first, pair_first + unique_keys.size()); }
  }
 
  // 4. find values for [collect_unique_key_first, collect_unique_key_last)
 
  auto value_buffer = allocate_dataframe_buffer(
    thrust::distance(collect_unique_key_first, collect_unique_key_last), stream);
-  kv_map_ptr->find(collect_unique_key_first,
-                   collect_unique_key_last,
-                   get_dataframe_buffer_begin(value_buffer));
+  // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+  // size is 0; this leads to cudaErrorInvalidConfiguration.
+  if (thrust::distance(collect_unique_key_first, collect_unique_key_last)) {
+    kv_map_ptr->find(collect_unique_key_first,
+                     collect_unique_key_last,
+                     get_dataframe_buffer_begin(value_buffer));
+  }
 
  return value_buffer;
 }

diff --git a/cpp/src/experimental/relabel.cu b/cpp/src/experimental/relabel.cu
index 3c81e182732..8d8fb0322a8 100644
--- a/cpp/src/experimental/relabel.cu
+++ b/cpp/src/experimental/relabel.cu
@@ -136,7 +136,11 @@ void relabel(raft::handle_t const& handle,
        [] __device__(auto val) {
          return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
        });
-      relabel_map.insert(pair_first, pair_first + rx_label_pair_old_labels.size());
+      // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the
+      // grid size is 0; this leads to cudaErrorInvalidConfiguration.
+      if (rx_label_pair_old_labels.size() > 0) {
+        relabel_map.insert(pair_first, pair_first + rx_label_pair_old_labels.size());
+      }
 
      rx_label_pair_old_labels.resize(0, handle.get_stream());
      rx_label_pair_new_labels.resize(0, handle.get_stream());
@@ -158,17 +162,23 @@ void relabel(raft::handle_t const& handle,
      CUDA_TRY(cudaStreamSynchronize(
        handle.get_stream()));  // cuco::static_map currently does not take stream
 
-      relabel_map.find(
-        rx_unique_old_labels.begin(),
-        rx_unique_old_labels.end(),
-        rx_unique_old_labels
-          .begin());  // now rx_unique_old_labels holds new labels for the corresponding old labels
+      // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the
+      // grid size is 0; this leads to cudaErrorInvalidConfiguration.
+      if (rx_unique_old_labels.size() > 0) {
+        relabel_map.find(
+          rx_unique_old_labels.begin(),
+          rx_unique_old_labels.end(),
+          rx_unique_old_labels.begin());  // now rx_unique_old_labels holds new labels for the
+                                          // corresponding old labels
+      }
 
      std::tie(new_labels_for_unique_old_labels, std::ignore) = shuffle_values(
        handle.get_comms(), rx_unique_old_labels.begin(), rx_value_counts, handle.get_stream());
    }
  }
 
+  handle.get_stream_view().synchronize();  // cuco::static_map currently does not take stream
+
  cuco::static_map relabel_map(
    // FIXME: std::max(..., ...) as a temporary workaround for
    // https://github.com/NVIDIA/cuCollections/issues/72 and
@@ -185,8 +195,14 @@ void relabel(raft::handle_t const& handle,
        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
      });
 
-    relabel_map.insert(pair_first, pair_first + unique_old_labels.size());
-    relabel_map.find(labels, labels + num_labels, labels);
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (unique_old_labels.size() > 0) {
+      relabel_map.insert(pair_first, pair_first + unique_old_labels.size());
+    }
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (num_labels > 0) { relabel_map.find(labels, labels + num_labels, labels); }
  } else {
    cuco::static_map relabel_map(
      // FIXME: std::max(..., ...) as a temporary workaround for
@@ -204,8 +220,12 @@ void relabel(raft::handle_t const& handle,
        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
      });
 
-    relabel_map.insert(pair_first, pair_first + num_label_pairs);
-    relabel_map.find(labels, labels + num_labels, labels);
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (num_label_pairs > 0) { relabel_map.insert(pair_first, pair_first + num_label_pairs); }
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (num_labels > 0) { relabel_map.find(labels, labels + num_labels, labels); }
  }
 
  if (do_expensive_check) {
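The std::max(..., ...) FIXME visible in the relabel hunks above concerns how the static_map capacity is computed: the map must be sized above the number of elements divided by its load factor, and a zero capacity is itself invalid. A sketch of that computation (illustrative; the 0.7 load factor and the floor of 1 are assumptions drawn from the linked cuCollections issues, not values taken from this patch):

    #include <algorithm>
    #include <cstddef>

    std::size_t map_capacity(std::size_t num_elements, double load_factor = 0.7)
    {
      // Oversize relative to the load factor, and never return zero.
      return std::max(
        static_cast<std::size_t>(static_cast<double>(num_elements) / load_factor),
        std::size_t{1});
    }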

diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu
index efac52d464b..127bd507271 100644
--- a/cpp/src/experimental/renumber_edgelist.cu
+++ b/cpp/src/experimental/renumber_edgelist.cu
@@ -567,10 +567,18 @@ renumber_edgelist(raft::handle_t const& handle,
      [] __device__(auto val) {
        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
      });
-    renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i));
-    renumber_map.find(edgelist_major_vertices[i],
-                      edgelist_major_vertices[i] + edgelist_edge_counts[i],
-                      edgelist_major_vertices[i]);
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (partition.get_matrix_partition_major_size(i) > 0) {
+      renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i));
+    }
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (edgelist_edge_counts[i]) {
+      renumber_map.find(edgelist_major_vertices[i],
+                        edgelist_major_vertices[i] + edgelist_edge_counts[i],
+                        edgelist_major_vertices[i]);
+    }
  }
 
  {
@@ -608,11 +616,19 @@ renumber_edgelist(raft::handle_t const& handle,
      [] __device__(auto val) {
        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
      });
-    renumber_map.insert(pair_first, pair_first + renumber_map_minor_labels.size());
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (renumber_map_minor_labels.size()) {
+      renumber_map.insert(pair_first, pair_first + renumber_map_minor_labels.size());
+    }
    for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) {
-      renumber_map.find(edgelist_minor_vertices[i],
-                        edgelist_minor_vertices[i] + edgelist_edge_counts[i],
-                        edgelist_minor_vertices[i]);
+      // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the
+      // grid size is 0; this leads to cudaErrorInvalidConfiguration.
+      if (edgelist_edge_counts[i]) {
+        renumber_map.find(edgelist_minor_vertices[i],
+                          edgelist_minor_vertices[i] + edgelist_edge_counts[i],
+                          edgelist_minor_vertices[i]);
+      }
    }
  }
 
@@ -679,11 +695,21 @@ std::enable_if_t> renumber_edgelist(
      [] __device__(auto val) {
        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
      });
-  renumber_map.insert(pair_first, pair_first + renumber_map_labels.size());
-  renumber_map.find(
-    edgelist_major_vertices, edgelist_major_vertices + num_edgelist_edges, edgelist_major_vertices);
-  renumber_map.find(
-    edgelist_minor_vertices, edgelist_minor_vertices + num_edgelist_edges, edgelist_minor_vertices);
+  // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+  // size is 0; this leads to cudaErrorInvalidConfiguration.
+  if (renumber_map_labels.size()) {
+    renumber_map.insert(pair_first, pair_first + renumber_map_labels.size());
+  }
+  // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+  // size is 0; this leads to cudaErrorInvalidConfiguration.
+  if (num_edgelist_edges > 0) {
+    renumber_map.find(edgelist_major_vertices,
+                      edgelist_major_vertices + num_edgelist_edges,
+                      edgelist_major_vertices);
+    renumber_map.find(edgelist_minor_vertices,
+                      edgelist_minor_vertices + num_edgelist_edges,
+                      edgelist_minor_vertices);
+  }
 
  return renumber_map_labels;
 #else

diff --git a/cpp/src/experimental/renumber_utils.cu b/cpp/src/experimental/renumber_utils.cu
index b2f13a3254d..8f59683d9d6 100644
--- a/cpp/src/experimental/renumber_utils.cu
+++ b/cpp/src/experimental/renumber_utils.cu
@@ -123,7 +123,11 @@ void renumber_ext_vertices(raft::handle_t const& handle,
      [] __device__(auto val) {
        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
      });
-    renumber_map_ptr->insert(kv_pair_first, kv_pair_first + sorted_unique_ext_vertices.size());
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (sorted_unique_ext_vertices.size()) {
+      renumber_map_ptr->insert(kv_pair_first, kv_pair_first + sorted_unique_ext_vertices.size());
+    }
  } else {
    handle.get_stream_view().synchronize();  // cuco::static_map currently does not take stream
 
@@ -145,13 +149,21 @@ void renumber_ext_vertices(raft::handle_t const& handle,
      [] __device__(auto val) {
        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
      });
-    renumber_map_ptr->insert(pair_first,
-                             pair_first + (local_int_vertex_last - local_int_vertex_first));
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if ((local_int_vertex_last - local_int_vertex_first) > 0) {
+      renumber_map_ptr->insert(pair_first,
+                               pair_first + (local_int_vertex_last - local_int_vertex_first));
+    }
  }
 
  if (do_expensive_check) {
    rmm::device_uvector contains(num_vertices, handle.get_stream());
-    renumber_map_ptr->contains(vertices, vertices + num_vertices, contains.begin());
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (num_vertices > 0) {
+      renumber_map_ptr->contains(vertices, vertices + num_vertices, contains.begin());
+    }
    auto vc_pair_first = thrust::make_zip_iterator(thrust::make_tuple(vertices, contains.begin()));
    CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
                                     vc_pair_first,
@@ -179,7 +191,9 @@ void renumber_ext_vertices(raft::handle_t const& handle,
                        : invalid_vertex_id::value;
                    });
 #else
-  renumber_map_ptr->find(vertices, vertices + num_vertices, vertices);
+  // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+  // size is 0; this leads to cudaErrorInvalidConfiguration.
+  if (num_vertices > 0) { renumber_map_ptr->find(vertices, vertices + num_vertices, vertices); }
 #endif
 #endif
 }
@@ -340,7 +354,11 @@ void unrenumber_int_vertices(raft::handle_t const& handle,
      [] __device__(auto val) {
        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
      });
-    unrenumber_map.insert(pair_first, pair_first + sorted_unique_int_vertices.size());
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (sorted_unique_int_vertices.size()) {
+      unrenumber_map.insert(pair_first, pair_first + sorted_unique_int_vertices.size());
+    }
 
    // FIXME: a temporary workaround for https://github.com/NVIDIA/cuCollections/issues/74
 #if 1
    thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
                        : invalid_vertex_id::value;
                    });
 #else
-    unrenumber_map.find(vertices, vertices + num_vertices, vertices);
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (num_vertices > 0) { unrenumber_map.find(vertices, vertices + num_vertices, vertices); }
 #endif
  } else {
    unrenumber_local_int_vertices(handle,

From 6f2a8d601e76f8fd47aa1fd18b4f421acb9cb44e Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Mon, 5 Apr 2021 17:13:27 -0400
Subject: [PATCH 62/63] remove unnecessary code

---
 cpp/include/experimental/graph_view.hpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/cpp/include/experimental/graph_view.hpp b/cpp/include/experimental/graph_view.hpp
index d946638a5cd..47c93b42ca9 100644
--- a/cpp/include/experimental/graph_view.hpp
+++ b/cpp/include/experimental/graph_view.hpp
@@ -309,9 +309,6 @@ class graph_view_t
   get_partition() const { return partition_; }
 
-  vertex_t get_number_of_local_vertices() const
-  {
-    return partition_.get_local_vertex_last() - partition_.get_local_vertex_first();

From 462331e58848c8f083dc42a63dddf10f1984e2cb Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Mon, 5 Apr 2021 17:14:04 -0400
Subject: [PATCH 63/63] remove unnecessary synchronization

---
 cpp/include/experimental/detail/graph_utils.cuh | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/cpp/include/experimental/detail/graph_utils.cuh b/cpp/include/experimental/detail/graph_utils.cuh
index 490ba5cd4b1..d79788e59ce 100644
--- a/cpp/include/experimental/detail/graph_utils.cuh
+++ b/cpp/include/experimental/detail/graph_utils.cuh
@@ -82,12 +82,6 @@ rmm::device_uvector compute_major_degrees(
                      handle.get_stream());
   }
 
-  // FIXME: is this necessary?
-  auto status =
-    col_comm.sync_stream(handle.get_stream());  // this is necessary as local_degrees will become
-                                                // out-of-scope once this function returns.
-  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
-
   return degrees;
 }
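The synchronization removed in the final patch was guarding against local_degrees going out of scope before the enqueued reduction finished. The removal is likely safe because rmm::device_uvector frees its storage in stream order: the deallocation is enqueued on the same stream, after any work that reads the buffer. An illustrative pattern of that reasoning (not taken from the patch; launch_kernel is a hypothetical helper):

    #include <cstddef>
    #include <rmm/device_uvector.hpp>

    void launch_kernel(int* data, cudaStream_t stream);  // assumed to enqueue work on `stream`

    void f(std::size_t n, cudaStream_t stream)
    {
      rmm::device_uvector<int> tmp(n, stream);
      launch_kernel(tmp.data(), stream);
    }  // destructor enqueues the free on `stream` after the kernel; no extra sync needed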