From 092cf49f66fdf4f7676452ae94f9a5ccc5ea3c3b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 26 Jan 2022 15:33:20 -0800 Subject: [PATCH 01/60] enable multi-stream execution and overlapping communication with computation (currently with the temporary mechanism to support stream priorities, eventually, rmm should be updated to support this) --- .../copy_v_transform_reduce_in_out_nbr.cuh | 197 +++++++++++++----- cpp/tests/link_analysis/mg_pagerank_test.cpp | 2 +- 2 files changed, 141 insertions(+), 58 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh index 6251c269697..a98013ac996 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh @@ -475,6 +475,17 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, T init, VertexValueOutputIterator vertex_value_output_first) { +// FIXME: for temporary testing +#if 1 + cudaStream_t high_stream0{}; + cudaStream_t high_stream1{}; + cudaStream_t mid_stream{}; + cudaStream_t low_stream{}; + CUDA_TRY(cudaStreamCreateWithPriority(&high_stream0, cudaStreamNonBlocking, -5)); + CUDA_TRY(cudaStreamCreateWithPriority(&high_stream1, cudaStreamNonBlocking, -5)); + CUDA_TRY(cudaStreamCreateWithPriority(&mid_stream, cudaStreamNonBlocking, -3)); + CUDA_TRY(cudaStreamCreateWithPriority(&low_stream, cudaStreamNonBlocking, 0)); +#endif constexpr auto update_major = (in == GraphViewType::is_adj_matrix_transposed); using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; @@ -559,34 +570,87 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, output_buffer = vertex_value_output_first; } auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + std::optional> stream_pool_indices{std::nullopt}; if (segment_offsets) { - // FIXME: we may further improve performance by 1) concurrently running kernels on different - // segments; 2) individually tuning block sizes for different segments; and 3) adding one more - // segment for very high degree vertices and running segmented reduction static_assert(detail::num_sparse_segments_per_vertex_partition == 3); - if ((*segment_offsets)[1] > 0) { - raft::grid_1d_block_t update_grid((*segment_offsets)[1], - detail::copy_v_transform_reduce_nbr_for_all_block_size, - handle.get_device_properties().maxGridSize[0]); - detail::for_all_major_for_all_nbr_high_degree - <<>>( + + auto num_segments = detail::num_sparse_segments_per_vertex_partition + + (matrix_partition.get_dcs_nzd_vertex_count() ? size_t{1} : size_t{0}); + if (GraphViewType::is_multi_gpu && handle.get_stream_pool_size() >= num_segments) { + stream_pool_indices = std::vector(num_segments); + std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); + handle.sync_stream(); + } + + // FIXME: we may further improve performance by 1) individually tuning block sizes for + // different segments; and 2) adding one more segment for very high degree vertices and + // running segmented reduction + if (matrix_partition.get_dcs_nzd_vertex_count()) { + auto exec_stream = stream_pool_indices + ? 
rmm::cuda_stream_view{high_stream0} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((*stream_pool_indices)[0]) */ + : handle.get_stream(); + if constexpr (update_major) { // this is necessary as we don't visit every vertex in the + // hypersparse segment in + // for_all_major_for_all_nbr_hypersparse + thrust::fill(rmm::exec_policy(exec_stream), + output_buffer + (*segment_offsets)[3], + output_buffer + (*segment_offsets)[4], + major_init); + } + if (*(matrix_partition.get_dcs_nzd_vertex_count()) > 0) { + raft::grid_1d_thread_t update_grid(*(matrix_partition.get_dcs_nzd_vertex_count()), + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[3]; } + detail::for_all_major_for_all_nbr_hypersparse + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[3], + matrix_partition_row_value_input, + matrix_partition_col_value_input, + segment_output_buffer, + e_op, + major_init); + } + } + if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + auto exec_stream = stream_pool_indices ? rmm::cuda_stream_view{high_stream1} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() + ? 1 : 0]) */ + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[2]; } + detail::for_all_major_for_all_nbr_low_degree + <<>>( matrix_partition, - matrix_partition.get_major_first(), - matrix_partition.get_major_first() + (*segment_offsets)[1], + matrix_partition.get_major_first() + (*segment_offsets)[2], + matrix_partition.get_major_first() + (*segment_offsets)[3], matrix_partition_row_value_input, matrix_partition_col_value_input, - output_buffer, + segment_output_buffer, e_op, major_init); } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + auto exec_stream = stream_pool_indices ? rmm::cuda_stream_view{mid_stream} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() + ? 
2 : 1]) */ + : handle.get_stream(); raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); auto segment_output_buffer = output_buffer; if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[1]; } detail::for_all_major_for_all_nbr_mid_degree - <<>>( + <<>>( matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[1], matrix_partition.get_major_first() + (*segment_offsets)[2], @@ -596,49 +660,26 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, e_op, major_init); } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], - detail::copy_v_transform_reduce_nbr_for_all_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[2]; } - detail::for_all_major_for_all_nbr_low_degree - <<>>( + if ((*segment_offsets)[1] > 0) { + auto exec_stream = stream_pool_indices ? rmm::cuda_stream_view{low_stream} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() + ? 3 : 2]) */ + : handle.get_stream(); + raft::grid_1d_block_t update_grid((*segment_offsets)[1], + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_high_degree + <<>>( matrix_partition, - matrix_partition.get_major_first() + (*segment_offsets)[2], - matrix_partition.get_major_first() + (*segment_offsets)[3], + matrix_partition.get_major_first(), + matrix_partition.get_major_first() + (*segment_offsets)[1], matrix_partition_row_value_input, matrix_partition_col_value_input, - segment_output_buffer, + output_buffer, e_op, major_init); } - if (matrix_partition.get_dcs_nzd_vertex_count()) { - if constexpr (update_major) { // this is necessary as we don't visit every vertex in the - // hypersparse segment in - // for_all_major_for_all_nbr_hypersparse - thrust::fill(handle.get_thrust_policy(), - output_buffer + (*segment_offsets)[3], - output_buffer + (*segment_offsets)[4], - major_init); - } - if (*(matrix_partition.get_dcs_nzd_vertex_count()) > 0) { - raft::grid_1d_thread_t update_grid(*(matrix_partition.get_dcs_nzd_vertex_count()), - detail::copy_v_transform_reduce_nbr_for_all_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[3]; } - detail::for_all_major_for_all_nbr_hypersparse - <<>>( - matrix_partition, - matrix_partition.get_major_first() + (*segment_offsets)[3], - matrix_partition_row_value_input, - matrix_partition_col_value_input, - segment_output_buffer, - e_op, - major_init); - } - } } else { if (matrix_partition.get_major_size() > 0) { raft::grid_1d_thread_t update_grid(matrix_partition.get_major_size(), @@ -666,13 +707,48 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - device_reduce(col_comm, - major_buffer_first, - vertex_value_output_first, - matrix_partition.get_major_size(), - raft::comms::op_t::SUM, - i, - handle.get_stream()); + if (segment_offsets && stream_pool_indices) { + if ((*segment_offsets).back() - (*segment_offsets)[3] > 
0) { + device_reduce(col_comm, + major_buffer_first + (*segment_offsets)[3], + vertex_value_output_first + (*segment_offsets)[3], + (*segment_offsets).back() - (*segment_offsets)[3], + raft::comms::op_t::SUM, + i, + high_stream0/* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[0]) */); + } + if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + device_reduce( + col_comm, major_buffer_first + (*segment_offsets)[2], vertex_value_output_first + (*segment_offsets)[2], (*segment_offsets)[3] - (*segment_offsets)[2], raft::comms::op_t::SUM, i, high_stream1 /* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() ? 1 : 0]) */); + } + if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + device_reduce( + col_comm, major_buffer_first + (*segment_offsets)[1], vertex_value_output_first + (*segment_offsets)[1], (*segment_offsets)[2] - (*segment_offsets)[1], raft::comms::op_t::SUM, i, mid_stream /* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() ? 2 : 1]) */); + } + if ((*segment_offsets)[1] > 0) { + device_reduce( + col_comm, major_buffer_first, vertex_value_output_first, (*segment_offsets)[1], raft::comms::op_t::SUM, i, low_stream /* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() ? 3 : 2]) */); + } + } else { + device_reduce(col_comm, + major_buffer_first, + vertex_value_output_first, + matrix_partition.get_major_size(), + raft::comms::op_t::SUM, + i, + handle.get_stream()); + } + } + + if (stream_pool_indices) { +#if 1 // FIXME: for temporary testing + CUDA_TRY(cudaStreamSynchronize(high_stream0)); + CUDA_TRY(cudaStreamSynchronize(high_stream1)); + CUDA_TRY(cudaStreamSynchronize(mid_stream)); + CUDA_TRY(cudaStreamSynchronize(low_stream)); +#else + handle.sync_stream_pool(*stream_pool_indices); +#endif } } @@ -738,6 +814,13 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } } } +// FIXME: for temporary testing +#if 1 + CUDA_TRY(cudaStreamDestroy(low_stream)); + CUDA_TRY(cudaStreamDestroy(mid_stream)); + CUDA_TRY(cudaStreamDestroy(high_stream1)); + CUDA_TRY(cudaStreamDestroy(high_stream0)); +#endif } } // namespace detail diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp index adcd0c94a8f..df264f2e0e1 100644 --- a/cpp/tests/link_analysis/mg_pagerank_test.cpp +++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp @@ -61,7 +61,7 @@ class Tests_MGPageRank { // 1. 
initialize handle

-    raft::handle_t handle{};
+    raft::handle_t handle(rmm::cuda_stream_per_thread, std::make_shared<rmm::cuda_stream_pool>());
     HighResClock hr_clock{};

     raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);

From 6102cd1b5f57f831cb6d47ea30c21e3b1e744cb0 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Thu, 27 Jan 2022 10:16:37 -0800
Subject: [PATCH 02/60] update group_by_and_count to not use reduce_by_key
 (which is expensive and also seems to have an issue with 2^31 or more
 elements)

---
 .../cugraph/utilities/shuffle_comm.cuh        | 64 +++++++++----------
 1 file changed, 31 insertions(+), 33 deletions(-)

diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh
index 5fd78dc00ee..3840de019fc 100644
--- a/cpp/include/cugraph/utilities/shuffle_comm.cuh
+++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh
@@ -22,6 +22,7 @@
 #include 

 #include 
+#include 
 #include 
 #include 
 #include 
@@ -36,6 +37,21 @@ namespace cugraph {

 namespace detail {

+template <typename GroupIdIterator>
+struct compute_group_id_count_pair_t {
+  GroupIdIterator group_id_first{};
+  GroupIdIterator group_id_last{};
+
+  __device__ thrust::tuple<int, size_t> operator()(size_t i) const
+  {
+    auto lower_it =
+      thrust::lower_bound(thrust::seq, group_id_first, group_id_last, static_cast<int>(i));
+    auto upper_it = thrust::upper_bound(thrust::seq, lower_it, group_id_last, static_cast<int>(i));
+    return thrust::make_tuple(static_cast<int>(i),
+                              static_cast<size_t>(thrust::distance(lower_it, upper_it)));
+  }
+};
+
 // inline to suppress a complaint about ODR violation
 inline std::tuple,
                   std::vector,
@@ -128,23 +144,14 @@ rmm::device_uvector<size_t> groupby_and_count(ValueIterator tx_value_first /* [I
     [value_to_group_id_op] __device__(auto value) { return value_to_group_id_op(value); });
   rmm::device_uvector<int> d_tx_dst_ranks(num_groups, stream_view);
   rmm::device_uvector<size_t> d_tx_value_counts(d_tx_dst_ranks.size(), stream_view);
-  auto last =
-    thrust::reduce_by_key(rmm::exec_policy(stream_view),
-                          group_id_first,
-                          group_id_first + thrust::distance(tx_value_first, tx_value_last),
-                          thrust::make_constant_iterator(size_t{1}),
-                          d_tx_dst_ranks.begin(),
-                          d_tx_value_counts.begin());
-  if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) {
-    rmm::device_uvector<size_t> d_counts(num_groups, stream_view);
-    thrust::fill(rmm::exec_policy(stream_view), d_counts.begin(), d_counts.end(), size_t{0});
-    thrust::scatter(rmm::exec_policy(stream_view),
-                    d_tx_value_counts.begin(),
-                    thrust::get<1>(last),
-                    d_tx_dst_ranks.begin(),
-                    d_counts.begin());
-    d_tx_value_counts = std::move(d_counts);
-  }
+  auto rank_count_pair_first = thrust::make_zip_iterator(
+    thrust::make_tuple(d_tx_dst_ranks.begin(), d_tx_value_counts.begin()));
+  thrust::tabulate(
+    rmm::exec_policy(stream_view),
+    rank_count_pair_first,
+    rank_count_pair_first + num_groups,
+    detail::compute_group_id_count_pair_t<decltype(group_id_first)>{
+      group_id_first, group_id_first + thrust::distance(tx_value_first, tx_value_last)});
   return d_tx_value_counts;
 }

@@ -169,22 +176,13 @@ rmm::device_uvector<size_t> groupby_and_count(VertexIterator tx_key_first /* [IN
     tx_key_first,
     [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); });
   rmm::device_uvector<int> d_tx_dst_ranks(num_groups, stream_view);
   rmm::device_uvector<size_t> d_tx_value_counts(d_tx_dst_ranks.size(), stream_view);
-  auto last = thrust::reduce_by_key(rmm::exec_policy(stream_view),
-                                    group_id_first,
-                                    group_id_first + thrust::distance(tx_key_first, tx_key_last),
-                                    thrust::make_constant_iterator(size_t{1}),
-                                    d_tx_dst_ranks.begin(),
-                                    d_tx_value_counts.begin());
-  if
(thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) { - rmm::device_uvector d_counts(num_groups, stream_view); - thrust::fill(rmm::exec_policy(stream_view), d_counts.begin(), d_counts.end(), size_t{0}); - thrust::scatter(rmm::exec_policy(stream_view), - d_tx_value_counts.begin(), - thrust::get<1>(last), - d_tx_dst_ranks.begin(), - d_counts.begin()); - d_tx_value_counts = std::move(d_counts); - } + auto rank_count_pair_first = thrust::make_zip_iterator( + thrust::make_tuple(d_tx_dst_ranks.begin(), d_tx_value_counts.begin())); + thrust::tabulate(rmm::exec_policy(stream_view), + rank_count_pair_first, + rank_count_pair_first + num_groups, + detail::compute_group_id_count_pair_t{ + group_id_first, group_id_first + thrust::distance(tx_key_first, tx_key_last)}); return d_tx_value_counts; } From 5146b198b4617396f8b11664df8736e4f6e374d7 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 27 Jan 2022 11:51:52 -0800 Subject: [PATCH 03/60] add time measurements (should be undone) --- cpp/src/structure/graph_impl.cuh | 34 +++++++++++++++ cpp/src/structure/renumber_edgelist_impl.cuh | 34 +++++++++++++++ cpp/tests/link_analysis/mg_pagerank_test.cpp | 22 ++++++++++ cpp/tests/utilities/test_graphs.hpp | 44 ++++++++++++++++++++ 4 files changed, 134 insertions(+) diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index e969bb4a6a3..6aeb41a915b 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -486,6 +486,13 @@ graph_tget_handle_ptr()->get_comms(); @@ -618,6 +625,10 @@ graph_t elapsed_total = time5 - time0; + std::chrono::duration elapsed0 = time1 - time0; + std::chrono::duration elapsed1 = time2 - time1; + std::chrono::duration elapsed2 = time3 - time2; + std::chrono::duration elapsed3 = time4 - time3; + std::chrono::duration elapsed4 = time5 - time4; + std::cout << "Graph constructor took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << ") ms." << std::endl; +#endif } template >> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check) { +#if 1 // FIXME: delete + handle.sync_stream(); + if constexpr (multi_gpu) { + handle.get_comms().barrier(); + } + auto time0 = std::chrono::steady_clock::now(); +#endif auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); auto const comm_rank = comm.get_rank(); @@ -613,6 +620,10 @@ renumber_edgelist( // 1. compute renumber map +#if 1 // FIXME: delete + handle.sync_stream(); + auto time1 = std::chrono::steady_clock::now(); +#endif auto [renumber_map_labels, vertex_partition_segment_offsets, num_unique_edge_majors, @@ -626,6 +637,10 @@ renumber_edgelist( // 2. 
initialize partition_t object, number_of_vertices, and number_of_edges for the coarsened // graph +#if 1 // FIXME: delete + handle.sync_stream(); + auto time2 = std::chrono::steady_clock::now(); +#endif auto vertex_counts = host_scalar_allgather( comm, static_cast(renumber_map_labels.size()), handle.get_stream()); std::vector vertex_partition_offsets(comm_size + 1, 0); @@ -649,6 +664,10 @@ renumber_edgelist( // FIXME: compare this hash based approach with a binary search based approach in both memory // footprint and execution time +#if 1 // FIXME: delete + handle.sync_stream(); + auto time3 = std::chrono::steady_clock::now(); +#endif { vertex_t max_matrix_partition_major_size{0}; for (size_t i = 0; i < edgelist_majors.size(); ++i) { @@ -696,6 +715,10 @@ renumber_edgelist( } } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time4 = std::chrono::steady_clock::now(); +#endif if ((partition.get_matrix_partition_minor_size() >= number_of_edges / comm_size) && edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part @@ -791,6 +814,17 @@ renumber_edgelist( handle.get_stream()); } } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time5 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_total = time5 - time0; + std::chrono::duration elapsed0 = time1 - time0; + std::chrono::duration elapsed1 = time2 - time1; + std::chrono::duration elapsed2 = time3 - time2; + std::chrono::duration elapsed3 = time4 - time3; + std::chrono::duration elapsed4 = time5 - time4; + std::cout << "Renumber took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << ") ms." << std::endl; +#endif return std::make_tuple( std::move(renumber_map_labels), diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp index adcd0c94a8f..4a6dd08dabd 100644 --- a/cpp/tests/link_analysis/mg_pagerank_test.cpp +++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp @@ -63,6 +63,9 @@ class Tests_MGPageRank raft::handle_t handle{}; HighResClock hr_clock{}; +#if 1 // FIXME: delete + auto time0 = std::chrono::steady_clock::now(); +#endif raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); auto& comm = handle.get_comms(); @@ -75,6 +78,25 @@ class Tests_MGPageRank } cugraph::partition_2d::subcomm_factory_t subcomm_factory(handle, row_comm_size); +#if 1 // FIXME: delete + { + rmm::device_uvector tx_ints(comm_size, handle.get_stream()); + rmm::device_uvector rx_ints(comm_size, handle.get_stream()); + std::vector tx_sizes(comm_size, size_t{1}); + std::vector tx_offsets(comm_size); + std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0}); + std::vector tx_ranks(comm_size); + std::iota(tx_ranks.begin(), tx_ranks.end(), int32_t{0}); + auto rx_sizes = tx_sizes; + auto rx_offsets = tx_offsets; + auto rx_ranks = tx_ranks; + handle.get_comms().device_multicast_sendrecv(tx_ints.data(), tx_sizes, tx_offsets, tx_ranks, rx_ints.data(), rx_sizes, rx_offsets, rx_ranks, handle.get_stream()); + handle.sync_stream(); + } + auto time1 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed = time1 - time0; + std::cout << "Handle initialization and 1st all-to-all took " << elapsed.count() * 1e3 << " ms." << std::endl; +#endif // 2. 
create MG graph diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 9fa4cee9f7a..8818d9633bf 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -147,6 +147,13 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { static_cast(std::numeric_limits::max()), "Invalid template parameter: (scale_, edge_factor_) too large for edge_t"); +#if 1 // FIXME: delete + handle.sync_stream(); + if constexpr (multi_gpu) { + handle.get_comms().barrier(); + } + auto time0 = std::chrono::steady_clock::now(); +#endif std::vector partition_ids(1); size_t num_partitions; @@ -191,6 +198,10 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { } } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time1 = std::chrono::steady_clock::now(); +#endif rmm::device_uvector src_v(0, handle.get_stream()); rmm::device_uvector dst_v(0, handle.get_stream()); auto weights_v = test_weighted @@ -247,13 +258,25 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { } } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time2 = std::chrono::steady_clock::now(); +#endif translate(handle, src_v, dst_v); +#if 1 // FIXME: delete + handle.sync_stream(); + auto time3 = std::chrono::steady_clock::now(); +#endif if (undirected_) std::tie(src_v, dst_v, weights_v) = cugraph::symmetrize_edgelist_from_triangular( handle, std::move(src_v), std::move(dst_v), std::move(weights_v)); +#if 1 // FIXME: delete + handle.sync_stream(); + auto time4 = std::chrono::steady_clock::now(); +#endif if (multi_gpu) { std::tie(store_transposed ? dst_v : src_v, store_transposed ? src_v : dst_v, weights_v) = cugraph::detail::shuffle_edgelist_by_gpu_id( @@ -263,6 +286,10 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { std::move(weights_v)); } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time5 = std::chrono::steady_clock::now(); +#endif rmm::device_uvector vertices_v(0, handle.get_stream()); for (size_t i = 0; i < partition_ids.size(); ++i) { auto id = partition_ids[i]; @@ -276,10 +303,27 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { partition_vertex_firsts[i]); } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time6 = std::chrono::steady_clock::now(); +#endif if constexpr (multi_gpu) { vertices_v = cugraph::detail::shuffle_vertices_by_gpu_id(handle, std::move(vertices_v)); } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time7 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_total = time7 - time0; + std::chrono::duration elapsed0 = time1 - time0; + std::chrono::duration elapsed1 = time2 - time1; + std::chrono::duration elapsed2 = time3 - time2; + std::chrono::duration elapsed3 = time4 - time3; + std::chrono::duration elapsed4 = time5 - time4; + std::chrono::duration elapsed5 = time6 - time5; + std::chrono::duration elapsed6 = time7 - time6; + std::cout << "Edge generation took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << "," << elapsed5.count() * 1e3 << "," << elapsed6.count() * 1e3 << ") ms." 
<< std::endl; +#endif return std::make_tuple( std::move(src_v), std::move(dst_v), From 965f0cddc0b2379b9480285d066824b8fbff89de Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 31 Jan 2022 11:32:52 -0800 Subject: [PATCH 04/60] cosmetic updates --- cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh index 508294c9e89..f93ee1a6ae5 100644 --- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh @@ -170,7 +170,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)); - if (col_comm_rank == i) { + if (i == col_comm_rank) { auto vertex_partition = vertex_partition_device_view_t( graph_view.get_vertex_partition_view()); @@ -365,7 +365,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, matrix_partition_device_view_t( graph_view.get_matrix_partition_view(size_t{0})); for (int i = 0; i < row_comm_size; ++i) { - if (row_comm_rank == i) { + if (i == row_comm_rank) { auto vertex_partition = vertex_partition_device_view_t( graph_view.get_vertex_partition_view()); From 7c02fcf0ee8661621b6e45d9e260058082ee31c9 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 31 Jan 2022 17:52:04 -0800 Subject: [PATCH 05/60] improve weak scaling behavior of renumber --- cpp/src/structure/renumber_edgelist_impl.cuh | 582 +++++++++++-------- 1 file changed, 348 insertions(+), 234 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 94ab7f3a495..2d501b6f1cc 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -53,240 +54,328 @@ compute_renumber_map(raft::handle_t const& handle, std::vector const& edgelist_minors, std::vector const& edgelist_edge_counts) { - // FIXME: compare this sort based approach with hash based approach in both speed and memory - // footprint +#if 1 // FIXME: delete + handle.sync_stream(); + if constexpr (multi_gpu) { + rmm::device_uvector dummy(1, handle.get_stream()); + handle.get_comms().allreduce( + dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); + } + auto time0 = std::chrono::steady_clock::now(); +#endif + rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); + vertex_t num_local_unique_edge_majors{0}; + vertex_t num_local_unique_edge_minors{0}; - // 1. acquire (unique major label, count) pairs + edge_t num_local_edges = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); - rmm::device_uvector major_labels(0, handle.get_stream()); - rmm::device_uvector major_counts(0, handle.get_stream()); - vertex_t num_local_unique_edge_majors{0}; - for (size_t i = 0; i < edgelist_majors.size(); ++i) { - rmm::device_uvector tmp_major_labels(0, handle.get_stream()); - rmm::device_uvector tmp_major_counts(0, handle.get_stream()); - { - rmm::device_uvector sorted_major_labels(edgelist_edge_counts[i], - handle.get_stream()); + // 1. 
if local_vertices.has_value() is false, keep unique vertices from edge majors as well (to + // construct local_vertices) + + rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); + if (!local_vertices) { + sorted_unique_majors.resize(num_local_edges, handle.get_stream()); + size_t major_offset{0}; + for (size_t i = 0; i < edgelist_majors.size(); ++i) { thrust::copy(handle.get_thrust_policy(), edgelist_majors[i], edgelist_majors[i] + edgelist_edge_counts[i], - sorted_major_labels.begin()); - // FIXME: better refactor this sort-count_if-reduce_by_key routine for reuse - thrust::sort( - handle.get_thrust_policy(), sorted_major_labels.begin(), sorted_major_labels.end()); - auto num_unique_labels = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(sorted_major_labels.size()), - [labels = sorted_major_labels.data()] __device__(auto i) { - return (i == 0) || (labels[i - 1] != labels[i]); - }); - tmp_major_labels.resize(num_unique_labels, handle.get_stream()); - tmp_major_counts.resize(tmp_major_labels.size(), handle.get_stream()); - thrust::reduce_by_key(handle.get_thrust_policy(), - sorted_major_labels.begin(), - sorted_major_labels.end(), - thrust::make_constant_iterator(edge_t{1}), - tmp_major_labels.begin(), - tmp_major_counts.begin()); + sorted_unique_majors.begin() + major_offset); + thrust::sort(handle.get_thrust_policy(), + sorted_unique_majors.begin() + major_offset, + sorted_unique_majors.begin() + major_offset + edgelist_edge_counts[i]); + major_offset += static_cast(thrust::distance( + sorted_unique_majors.begin() + major_offset, + thrust::unique(handle.get_thrust_policy(), + sorted_unique_majors.begin() + major_offset, + sorted_unique_majors.begin() + major_offset + edgelist_edge_counts[i]))); } - num_local_unique_edge_majors += static_cast(tmp_major_labels.size()); - - if (multi_gpu) { - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - rmm::device_uvector rx_major_labels(0, handle.get_stream()); - rmm::device_uvector rx_major_counts(0, handle.get_stream()); - auto rx_sizes = host_scalar_gather( - col_comm, tmp_major_labels.size(), static_cast(i), handle.get_stream()); - std::vector rx_displs{}; - if (static_cast(i) == col_comm_rank) { - rx_displs.assign(col_comm_size, size_t{0}); - std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); - rx_major_labels.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); - rx_major_counts.resize(rx_major_labels.size(), handle.get_stream()); - } - device_gatherv(col_comm, - thrust::make_zip_iterator( - thrust::make_tuple(tmp_major_labels.begin(), tmp_major_counts.begin())), - thrust::make_zip_iterator( - thrust::make_tuple(rx_major_labels.begin(), rx_major_counts.begin())), - tmp_major_labels.size(), - rx_sizes, - rx_displs, - static_cast(i), - handle.get_stream()); - if (static_cast(i) == col_comm_rank) { - major_labels = std::move(rx_major_labels); - major_counts = std::move(rx_major_counts); - } - } else { - assert(i == 0); - major_labels = std::move(tmp_major_labels); - major_counts = std::move(tmp_major_counts); + sorted_unique_majors.resize(major_offset, handle.get_stream()); + + if (edgelist_majors.size() > 1) { + thrust::sort( + handle.get_thrust_policy(), sorted_unique_majors.begin(), sorted_unique_majors.end()); + 
sorted_unique_majors.resize(thrust::distance(sorted_unique_majors.begin(), + thrust::unique(handle.get_thrust_policy(), + sorted_unique_majors.begin(), + sorted_unique_majors.end())), + handle.get_stream()); } - } - if (multi_gpu) { - // FIXME: better refactor this sort-count_if-reduce_by_key routine for reuse - thrust::sort_by_key( - handle.get_thrust_policy(), major_labels.begin(), major_labels.end(), major_counts.begin()); - auto num_unique_labels = thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(major_labels.size()), - [labels = major_labels.data()] __device__(auto i) { - return (i == 0) || (labels[i - 1] != labels[i]); - }); - rmm::device_uvector tmp_major_labels(num_unique_labels, handle.get_stream()); - rmm::device_uvector tmp_major_counts(tmp_major_labels.size(), handle.get_stream()); - thrust::reduce_by_key(handle.get_thrust_policy(), - major_labels.begin(), - major_labels.end(), - major_counts.begin(), - tmp_major_labels.begin(), - tmp_major_counts.begin()); - major_labels = std::move(tmp_major_labels); - major_counts = std::move(tmp_major_counts); + sorted_unique_majors.shrink_to_fit(handle.get_stream()); } - // 2. acquire unique minor labels + // 2. count unique edge minors. + // if local_vertices.has_value() is false, keep unique vertices from edge minors as well (to + // construct local_vertices) - std::vector minor_displs(edgelist_minors.size(), edge_t{0}); - std::partial_sum( - edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1); - rmm::device_uvector minor_labels(minor_displs.back() + edgelist_edge_counts.back(), - handle.get_stream()); - vertex_t minor_offset{0}; +#if 1 // FIXME: delete + handle.sync_stream(); + auto time1 = std::chrono::steady_clock::now(); +#endif + rmm::device_uvector sorted_unique_minors(num_local_edges, handle.get_stream()); + size_t minor_offset{0}; for (size_t i = 0; i < edgelist_minors.size(); ++i) { thrust::copy(handle.get_thrust_policy(), edgelist_minors[i], edgelist_minors[i] + edgelist_edge_counts[i], - minor_labels.begin() + minor_offset); + sorted_unique_minors.begin() + minor_offset); thrust::sort(handle.get_thrust_policy(), - minor_labels.begin() + minor_offset, - minor_labels.begin() + minor_offset + edgelist_edge_counts[i]); - minor_offset += thrust::distance( - minor_labels.begin() + minor_offset, + sorted_unique_minors.begin() + minor_offset, + sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]); + minor_offset += static_cast(thrust::distance( + sorted_unique_minors.begin() + minor_offset, thrust::unique(handle.get_thrust_policy(), - minor_labels.begin() + minor_offset, - minor_labels.begin() + minor_offset + edgelist_edge_counts[i])); + sorted_unique_minors.begin() + minor_offset, + sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]))); } - minor_labels.resize(minor_offset, handle.get_stream()); - thrust::sort(handle.get_thrust_policy(), minor_labels.begin(), minor_labels.end()); - minor_labels.resize( - thrust::distance( - minor_labels.begin(), - thrust::unique(handle.get_thrust_policy(), minor_labels.begin(), minor_labels.end())), - handle.get_stream()); - auto num_local_unique_edge_minors = static_cast(minor_labels.size()); - if (multi_gpu) { + sorted_unique_minors.resize(minor_offset, handle.get_stream()); + if (edgelist_minors.size() > 1) { + thrust::sort( + handle.get_thrust_policy(), sorted_unique_minors.begin(), sorted_unique_minors.end()); + 
sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + sorted_unique_minors.begin(), + sorted_unique_minors.end())), + handle.get_stream()); + } + + num_local_unique_edge_minors = static_cast(sorted_unique_minors.size()); + + if (local_vertices) { sorted_unique_minors.resize(0, handle.get_stream()); } + sorted_unique_minors.shrink_to_fit(handle.get_stream()); + + // 3. update sorted_local_vertices. + // if local_vertices.has_value() is false, reconstruct local_vertices first + +#if 1 // FIXME: delete + handle.sync_stream(); + auto time2 = std::chrono::steady_clock::now(); +#endif + if (!local_vertices) { + sorted_local_vertices.resize(sorted_unique_majors.size() + sorted_unique_minors.size(), + handle.get_stream()); + + thrust::merge(handle.get_thrust_policy(), + sorted_unique_majors.begin(), + sorted_unique_majors.end(), + sorted_unique_minors.begin(), + sorted_unique_minors.end(), + sorted_local_vertices.begin()); + + sorted_unique_majors.resize(0, handle.get_stream()); + sorted_unique_majors.shrink_to_fit(handle.get_stream()); + sorted_unique_minors.resize(0, handle.get_stream()); + sorted_unique_minors.shrink_to_fit(handle.get_stream()); + + sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + sorted_local_vertices.begin(), + sorted_local_vertices.end())), + handle.get_stream()); + sorted_local_vertices.shrink_to_fit(handle.get_stream()); + + if constexpr (multi_gpu) { + sorted_local_vertices = + cugraph::detail::shuffle_vertices_by_gpu_id(handle, std::move(sorted_local_vertices)); + } + } else { + sorted_local_vertices = std::move(*local_vertices); + thrust::sort( + handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); + } + + // 4. 
compute global degrees for the sorted local vertices, and count unique edge majors on the + // way + +#if 1 // FIXME: delete + handle.sync_stream(); + auto time3 = std::chrono::steady_clock::now(); +#endif + rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); + std::optional> stream_pool_indices{std::nullopt}; // FIXME: move this inside the if statement + if constexpr (multi_gpu) { auto& comm = handle.get_comms(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + assert(edgelist_majors.size() == col_comm_size); - if (row_comm_size > 1) { - rmm::device_uvector rx_minor_labels(0, handle.get_stream()); - std::tie(rx_minor_labels, std::ignore) = groupby_gpuid_and_shuffle_values( - row_comm, - minor_labels.begin(), - minor_labels.end(), - [key_func = detail::compute_gpu_id_from_vertex_t{row_comm_size}] __device__( - auto val) { return key_func(val); }, + auto edge_partition_major_sizes = + host_scalar_allgather(col_comm, sorted_local_vertices.size(), handle.get_stream()); + + if ((col_comm_size >= 2) && (handle.get_stream_pool_size() >= 2)) { + auto vertex_edge_counts = host_scalar_allreduce( + comm, + thrust::make_tuple(static_cast(sorted_local_vertices.size()), num_local_edges), + raft::comms::op_t::SUM, handle.get_stream()); - thrust::sort(handle.get_thrust_policy(), rx_minor_labels.begin(), rx_minor_labels.end()); - rx_minor_labels.resize(thrust::distance(rx_minor_labels.begin(), - thrust::unique(handle.get_thrust_policy(), - rx_minor_labels.begin(), - rx_minor_labels.end())), - handle.get_stream()); - minor_labels = std::move(rx_minor_labels); + // memory footprint vs parallelism trade-off + // peak memory requirement per loop is + // min( + // (E / (comm_size * col_comm_size)) * sizeof(vertex_t) * 2, + // (E / (comm_size * col_comm_size)) * sizeof(vertex_t) + + // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)), + // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)) * 2 + // ) + // and limit temporary memory requirement to (E / comm_size) * sizeof(vertex_t) * 2 + auto avg_vertex_degree = thrust::get<0>(vertex_edge_counts) > 0 + ? static_cast(thrust::get<1>(vertex_edge_counts)) / + static_cast(thrust::get<0>(vertex_edge_counts)) + : double{0.0}; + auto num_streams = + std::min(static_cast(avg_vertex_degree * + (static_cast(sizeof(vertex_t)) / + static_cast(sizeof(vertex_t) + sizeof(edge_t)))), + static_cast( + std::min(static_cast(col_comm_size), handle.get_stream_pool_size()))); + if (num_streams >= 2) { + stream_pool_indices = std::vector(num_streams); + std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); + handle.sync_stream(); + } } - } - minor_labels.shrink_to_fit(handle.get_stream()); - - // 3. 
merge major and minor labels and vertex labels - - rmm::device_uvector merged_labels(major_labels.size() + minor_labels.size(), - handle.get_stream()); - rmm::device_uvector merged_counts(merged_labels.size(), handle.get_stream()); - thrust::merge_by_key(handle.get_thrust_policy(), - major_labels.begin(), - major_labels.end(), - minor_labels.begin(), - minor_labels.end(), - major_counts.begin(), - thrust::make_constant_iterator(edge_t{0}), - merged_labels.begin(), - merged_counts.begin()); - - major_labels.resize(0, handle.get_stream()); - major_counts.resize(0, handle.get_stream()); - minor_labels.resize(0, handle.get_stream()); - major_labels.shrink_to_fit(handle.get_stream()); - major_counts.shrink_to_fit(handle.get_stream()); - minor_labels.shrink_to_fit(handle.get_stream()); - - rmm::device_uvector labels(merged_labels.size(), handle.get_stream()); - rmm::device_uvector counts(labels.size(), handle.get_stream()); - auto pair_it = thrust::reduce_by_key(handle.get_thrust_policy(), - merged_labels.begin(), - merged_labels.end(), - merged_counts.begin(), - labels.begin(), - counts.begin()); - merged_labels.resize(0, handle.get_stream()); - merged_counts.resize(0, handle.get_stream()); - merged_labels.shrink_to_fit(handle.get_stream()); - merged_counts.shrink_to_fit(handle.get_stream()); - labels.resize(thrust::distance(labels.begin(), thrust::get<0>(pair_it)), handle.get_stream()); - counts.resize(labels.size(), handle.get_stream()); - labels.shrink_to_fit(handle.get_stream()); - counts.shrink_to_fit(handle.get_stream()); - - auto num_non_isolated_vertices = static_cast(labels.size()); - - // 4. if local_vertices.has_value() == true, append isolated vertices + stream_pool_indices = std::nullopt; // FIXME: delete - if (local_vertices) { - rmm::device_uvector isolated_vertices(0, handle.get_stream()); - - auto num_isolated_vertices = thrust::count_if( - handle.get_thrust_policy(), - (*local_vertices).begin(), - (*local_vertices).end(), - [label_first = labels.begin(), label_last = labels.end()] __device__(auto v) { - return !thrust::binary_search(thrust::seq, label_first, label_last, v); - }); - isolated_vertices.resize(num_isolated_vertices, handle.get_stream()); - thrust::copy_if(handle.get_thrust_policy(), - (*local_vertices).begin(), - (*local_vertices).end(), - isolated_vertices.begin(), - [label_first = labels.begin(), label_last = labels.end()] __device__(auto v) { - return !thrust::binary_search(thrust::seq, label_first, label_last, v); - }); - (*local_vertices).resize(0, handle.get_stream()); - (*local_vertices).shrink_to_fit(handle.get_stream()); - - if (isolated_vertices.size() > 0) { - labels.resize(labels.size() + isolated_vertices.size(), handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - isolated_vertices.begin(), - isolated_vertices.end(), - labels.end() - isolated_vertices.size()); + for (int i = 0; i < col_comm_size; ++i) { + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool( + (*stream_pool_indices)[i % (*stream_pool_indices).size()]) + : handle.get_stream(); + + rmm::device_uvector tmp_majors(edgelist_edge_counts[i], loop_stream); + thrust::copy(rmm::exec_policy(loop_stream), + edgelist_majors[i], + edgelist_majors[i] + edgelist_edge_counts[i], + tmp_majors.begin()); + thrust::sort(rmm::exec_policy(loop_stream), tmp_majors.begin(), tmp_majors.end()); + auto num_unique_majors = + thrust::count_if(rmm::exec_policy(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(tmp_majors.size()), + [majors = tmp_majors.data()] __device__(auto idx) { + return (idx == 0) || (majors[idx - 1] != majors[idx]); + }); + rmm::device_uvector tmp_keys(num_unique_majors, loop_stream); + rmm::device_uvector tmp_values(num_unique_majors, loop_stream); + thrust::reduce_by_key(rmm::exec_policy(loop_stream), + tmp_majors.begin(), + tmp_majors.end(), + thrust::make_constant_iterator(edge_t{1}), + tmp_keys.begin(), + tmp_values.begin()); + + num_local_unique_edge_majors += num_unique_majors; + + tmp_majors.resize(0, loop_stream); + tmp_majors.shrink_to_fit(loop_stream); + + rmm::device_uvector sorted_majors(edge_partition_major_sizes[i], loop_stream); + device_bcast(col_comm, + sorted_local_vertices.data(), + sorted_majors.data(), + edge_partition_major_sizes[i], + static_cast(i), + loop_stream); + + rmm::device_uvector sorted_major_degrees(sorted_majors.size(), loop_stream); + thrust::fill(rmm::exec_policy(loop_stream), + sorted_major_degrees.begin(), + sorted_major_degrees.end(), + edge_t{0}); + + auto kv_pair_first = + thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); + thrust::for_each( + rmm::exec_policy(loop_stream), + kv_pair_first, + kv_pair_first + tmp_keys.size(), + [sorted_major_first = sorted_majors.begin(), + sorted_major_last = sorted_majors.end(), + degrees = sorted_major_degrees.begin()] __device__(auto pair) { + auto it = thrust::lower_bound( + thrust::seq, sorted_major_first, sorted_major_last, thrust::get<0>(pair)); + *(degrees + thrust::distance(sorted_major_first, it)) = thrust::get<1>(pair); + }); + + device_reduce(col_comm, + sorted_major_degrees.begin(), + sorted_major_degrees.begin(), + edge_partition_major_sizes[i], + raft::comms::op_t::SUM, + i, + loop_stream); + if (i == col_comm_rank) { sorted_local_vertex_degrees = std::move(sorted_major_degrees); } } + + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + } else { + assert(edgelist_majors.size() == 1); + + rmm::device_uvector tmp_majors(edgelist_edge_counts[0], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_majors[0], + edgelist_majors[0] + edgelist_edge_counts[0], + tmp_majors.begin()); + thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); + auto num_unique_majors = + thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(tmp_majors.size()), + [majors = tmp_majors.data()] __device__(auto idx) { + return (idx == 0) || (majors[idx - 1] != majors[idx]); + }); + rmm::device_uvector tmp_keys(num_unique_majors, handle.get_stream()); + rmm::device_uvector tmp_values(num_unique_majors, handle.get_stream()); + thrust::reduce_by_key(handle.get_thrust_policy(), + tmp_majors.begin(), + tmp_majors.end(), + thrust::make_constant_iterator(edge_t{1}), + tmp_keys.begin(), + tmp_values.begin()); + + num_local_unique_edge_majors += num_unique_majors; + + 
tmp_majors.resize(0, handle.get_stream()); + tmp_majors.shrink_to_fit(handle.get_stream()); + + sorted_local_vertex_degrees.resize(sorted_local_vertices.size(), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + sorted_local_vertex_degrees.begin(), + sorted_local_vertex_degrees.end(), + edge_t{0}); + + auto kv_pair_first = + thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); + thrust::for_each(handle.get_thrust_policy(), + kv_pair_first, + kv_pair_first + tmp_keys.size(), + [sorted_major_first = sorted_local_vertices.begin(), + sorted_major_last = sorted_local_vertices.end(), + degrees = sorted_local_vertex_degrees.begin()] __device__(auto pair) { + auto it = thrust::lower_bound( + thrust::seq, sorted_major_first, sorted_major_last, thrust::get<0>(pair)); + *(degrees + thrust::distance(sorted_major_first, it)) = thrust::get<1>(pair); + }); } - // 5. sort non-isolated vertices by degree + // 4. sort local vertices by degree (descending) +#if 1 // FIXME: delete + handle.sync_stream(); + auto time4 = std::chrono::steady_clock::now(); +#endif thrust::sort_by_key(handle.get_thrust_policy(), - counts.begin(), - counts.begin() + num_non_isolated_vertices, - labels.begin(), + sorted_local_vertex_degrees.begin(), + sorted_local_vertex_degrees.end(), + sorted_local_vertices.begin(), thrust::greater()); - // 6. compute segment_offsets + // 5. compute segment_offsets +#if 1 // FIXME: delete + handle.sync_stream(); + auto time5 = std::chrono::steady_clock::now(); +#endif static_assert(detail::num_sparse_segments_per_vertex_partition == 3); static_assert((detail::low_degree_threshold <= detail::mid_degree_threshold) && @@ -320,14 +409,14 @@ compute_renumber_map(raft::handle_t const& handle, handle.get_stream()); auto zero_vertex = vertex_t{0}; - auto vertex_count = static_cast(labels.size()); + auto vertex_count = static_cast(sorted_local_vertices.size()); d_segment_offsets.set_element_async(0, zero_vertex, handle.get_stream()); d_segment_offsets.set_element_async( num_segments_per_vertex_partition, vertex_count, handle.get_stream()); thrust::upper_bound(handle.get_thrust_policy(), - counts.begin(), - counts.end(), + sorted_local_vertex_degrees.begin(), + sorted_local_vertex_degrees.end(), d_thresholds.begin(), d_thresholds.end(), d_segment_offsets.begin() + 1, @@ -340,7 +429,25 @@ compute_renumber_map(raft::handle_t const& handle, handle.get_stream()); handle.sync_stream(); - return std::make_tuple(std::move(labels), +#if 1 // FIXME: delete + handle.sync_stream(); + auto time6 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_total = time6 - time0; + std::chrono::duration elapsed0 = time1 - time0; + std::chrono::duration elapsed1 = time2 - time1; + std::chrono::duration elapsed2 = time3 - time2; + std::chrono::duration elapsed3 = time4 - time3; + std::chrono::duration elapsed4 = time5 - time4; + std::chrono::duration elapsed5 = time6 - time5; + std::cout << "Compute renumber map (num_streams:" + << (stream_pool_indices ? (*stream_pool_indices).size() : size_t{0}) << ") took " + << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," + << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," + << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << "," + << elapsed5.count() * 1e3 << ") ms." 
<< std::endl; +#endif + + return std::make_tuple(std::move(sorted_local_vertices), h_segment_offsets, num_local_unique_edge_majors, num_local_unique_edge_minors); @@ -436,17 +543,19 @@ void expensive_check_edgelist( rmm::device_uvector sorted_majors(0, handle.get_stream()); { - auto recvcounts = - host_scalar_allgather(col_comm, (*sorted_local_vertices).size(), handle.get_stream()); - std::vector displacements(recvcounts.size(), size_t{0}); - std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); - sorted_majors.resize(displacements.back() + recvcounts.back(), handle.get_stream()); - device_allgatherv(col_comm, - (*sorted_local_vertices).data(), - sorted_majors.data(), - recvcounts, - displacements, - handle.get_stream()); + auto major_size = + host_scalar_bcast(col_comm, + static_cast(i) == col_comm_rank ? (*sorted_local_vertices).size() + : size_t{0} /* dummy */, + i, + handle.get_stream()); + sorted_majors.resize(major_size, handle.get_stream()); + device_bcast(col_comm, + (*sorted_local_vertices).begin(), + sorted_majors.begin(), + major_size, + i, + handle.get_stream()); thrust::sort(handle.get_thrust_policy(), sorted_majors.begin(), sorted_majors.end()); } @@ -556,10 +665,12 @@ renumber_edgelist( std::optional>> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check) { -#if 1 // FIXME: delete +#if 1 // FIXME: delete handle.sync_stream(); if constexpr (multi_gpu) { - handle.get_comms().barrier(); + rmm::device_uvector dummy(1, handle.get_stream()); + handle.get_comms().allreduce( + dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); } auto time0 = std::chrono::steady_clock::now(); #endif @@ -620,7 +731,7 @@ renumber_edgelist( // 1. compute renumber map -#if 1 // FIXME: delete +#if 1 // FIXME: delete handle.sync_stream(); auto time1 = std::chrono::steady_clock::now(); #endif @@ -637,7 +748,7 @@ renumber_edgelist( // 2. initialize partition_t object, number_of_vertices, and number_of_edges for the coarsened // graph -#if 1 // FIXME: delete +#if 1 // FIXME: delete handle.sync_stream(); auto time2 = std::chrono::steady_clock::now(); #endif @@ -664,7 +775,7 @@ renumber_edgelist( // FIXME: compare this hash based approach with a binary search based approach in both memory // footprint and execution time -#if 1 // FIXME: delete +#if 1 // FIXME: delete handle.sync_stream(); auto time3 = std::chrono::steady_clock::now(); #endif @@ -715,7 +826,7 @@ renumber_edgelist( } } -#if 1 // FIXME: delete +#if 1 // FIXME: delete handle.sync_stream(); auto time4 = std::chrono::steady_clock::now(); #endif @@ -814,16 +925,19 @@ renumber_edgelist( handle.get_stream()); } } -#if 1 // FIXME: delete +#if 1 // FIXME: delete handle.sync_stream(); - auto time5 = std::chrono::steady_clock::now(); + auto time5 = std::chrono::steady_clock::now(); std::chrono::duration elapsed_total = time5 - time0; - std::chrono::duration elapsed0 = time1 - time0; - std::chrono::duration elapsed1 = time2 - time1; - std::chrono::duration elapsed2 = time3 - time2; - std::chrono::duration elapsed3 = time4 - time3; - std::chrono::duration elapsed4 = time5 - time4; - std::cout << "Renumber took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << ") ms." 
<< std::endl;
+  std::cout << "Renumber took " << elapsed_total.count() * 1e3 << " ms, breakdown=("
+            << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << ","
+            << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << ","
+            << elapsed4.count() * 1e3 << ") ms." << std::endl;
 #endif

   return std::make_tuple(

From 0744160e841bf2566c9725edec7635ea3ddf8d3d Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Tue, 1 Feb 2022 01:09:26 -0800
Subject: [PATCH 06/60] move is_first_in_run_t to graph_utils.cuh

---
 cpp/include/cugraph/detail/graph_utils.cuh             |  8 ++++++++
 .../copy_v_transform_reduce_key_aggregated_out_nbr.cuh | 10 ----------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/cpp/include/cugraph/detail/graph_utils.cuh b/cpp/include/cugraph/detail/graph_utils.cuh
index 2d9ee2b2547..ca918c53a62 100644
--- a/cpp/include/cugraph/detail/graph_utils.cuh
+++ b/cpp/include/cugraph/detail/graph_utils.cuh
@@ -77,5 +77,13 @@ struct compute_partition_id_from_edge_t {
   }
 };

+template <typename vertex_t>
+struct is_first_in_run_t {
+  vertex_t const* vertices{nullptr};
+  __device__ bool operator()(size_t i) const {
+    return (i == 0) || (vertices[i - 1] != vertices[i]);
+  }
+};
+
 } // namespace detail
 } // namespace cugraph
diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
index 4cf6ce5b4cb..1dee131a000 100644
--- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
+++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
@@ -99,16 +99,6 @@ struct call_key_aggregated_e_op_t {
   }
 };

-// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
-template <typename vertex_t>
-struct is_first_in_run_t {
-  vertex_t const* major_vertices{nullptr};
-  __device__ bool operator()(size_t i) const
-  {
-    return ((i == 0) || (major_vertices[i] != major_vertices[i - 1])) ?
true : false; - } -}; - // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used template struct is_valid_vertex_t { From 077008bc3770440bd90bfa54a169152afd749ef1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 1 Feb 2022 01:11:00 -0800 Subject: [PATCH 07/60] avoid using device lambdas --- cpp/src/structure/renumber_edgelist_impl.cuh | 134 ++++++++++--------- 1 file changed, 69 insertions(+), 65 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 2d501b6f1cc..3f7891a7c4e 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -45,6 +45,38 @@ namespace cugraph { namespace detail { +template +struct check_edge_src_and_dst_t { + vertex_t const* sorted_majors{nullptr}; + vertex_t num_majors{0}; + vertex_t const* sorted_minors{nullptr}; + vertex_t num_minors{0}; + + __device__ bool operator()(thrust::tuple e) const + { + return !thrust::binary_search( + thrust::seq, sorted_majors, sorted_majors + num_majors, thrust::get<0>(e)) || + !thrust::binary_search( + thrust::seq, sorted_minors, sorted_minors + num_minors, thrust::get<1>(e)); + } +}; + +template +struct search_and_set_degree_t { + vertex_t const* sorted_vertices{nullptr}; + vertex_t num_vertices{0}; + edge_t* degrees{nullptr}; + + __device__ void operator()(thrust::tuple vertex_degree_pair) const + { + auto it = thrust::lower_bound(thrust::seq, + sorted_vertices, + sorted_vertices + num_vertices, + thrust::get<0>(vertex_degree_pair)); + *(degrees + thrust::distance(sorted_vertices, it)) = thrust::get<1>(vertex_degree_pair); + } +}; + // returns renumber map, segment_offsets, and # unique edge majors & minors template std::tuple, std::vector, vertex_t, vertex_t> @@ -192,7 +224,8 @@ compute_renumber_map(raft::handle_t const& handle, auto time3 = std::chrono::steady_clock::now(); #endif rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); - std::optional> stream_pool_indices{std::nullopt}; // FIXME: move this inside the if statement + std::optional> stream_pool_indices{ + std::nullopt}; // FIXME: move this inside the if statement if constexpr (multi_gpu) { auto& comm = handle.get_comms(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); @@ -249,13 +282,10 @@ compute_renumber_map(raft::handle_t const& handle, edgelist_majors[i] + edgelist_edge_counts[i], tmp_majors.begin()); thrust::sort(rmm::exec_policy(loop_stream), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = - thrust::count_if(rmm::exec_policy(loop_stream), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - [majors = tmp_majors.data()] __device__(auto idx) { - return (idx == 0) || (majors[idx - 1] != majors[idx]); - }); + auto num_unique_majors = thrust::count_if(rmm::exec_policy(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(tmp_majors.size()), + is_first_in_run_t{tmp_majors.data()}); rmm::device_uvector tmp_keys(num_unique_majors, loop_stream); rmm::device_uvector tmp_values(num_unique_majors, loop_stream); thrust::reduce_by_key(rmm::exec_policy(loop_stream), @@ -286,17 +316,12 @@ compute_renumber_map(raft::handle_t const& handle, auto kv_pair_first = thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each( - rmm::exec_policy(loop_stream), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - 
[sorted_major_first = sorted_majors.begin(), - sorted_major_last = sorted_majors.end(), - degrees = sorted_major_degrees.begin()] __device__(auto pair) { - auto it = thrust::lower_bound( - thrust::seq, sorted_major_first, sorted_major_last, thrust::get<0>(pair)); - *(degrees + thrust::distance(sorted_major_first, it)) = thrust::get<1>(pair); - }); + thrust::for_each(rmm::exec_policy(loop_stream), + kv_pair_first, + kv_pair_first + tmp_keys.size(), + search_and_set_degree_t{sorted_majors.data(), + static_cast(sorted_majors.size()), + sorted_major_degrees.data()}); device_reduce(col_comm, sorted_major_degrees.begin(), @@ -318,13 +343,10 @@ compute_renumber_map(raft::handle_t const& handle, edgelist_majors[0] + edgelist_edge_counts[0], tmp_majors.begin()); thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - [majors = tmp_majors.data()] __device__(auto idx) { - return (idx == 0) || (majors[idx - 1] != majors[idx]); - }); + auto num_unique_majors = thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(tmp_majors.size()), + is_first_in_run_t{tmp_majors.data()}); rmm::device_uvector tmp_keys(num_unique_majors, handle.get_stream()); rmm::device_uvector tmp_values(num_unique_majors, handle.get_stream()); thrust::reduce_by_key(handle.get_thrust_policy(), @@ -350,13 +372,9 @@ compute_renumber_map(raft::handle_t const& handle, thrust::for_each(handle.get_thrust_policy(), kv_pair_first, kv_pair_first + tmp_keys.size(), - [sorted_major_first = sorted_local_vertices.begin(), - sorted_major_last = sorted_local_vertices.end(), - degrees = sorted_local_vertex_degrees.begin()] __device__(auto pair) { - auto it = thrust::lower_bound( - thrust::seq, sorted_major_first, sorted_major_last, thrust::get<0>(pair)); - *(degrees + thrust::distance(sorted_major_first, it)) = thrust::get<1>(pair); - }); + search_and_set_degree_t{sorted_local_vertices.data(), + static_cast(sorted_local_vertices.size()), + sorted_major_degrees.data()}); } // 4. 
sort local vertices by degree (descending) @@ -577,22 +595,16 @@ void expensive_check_edgelist( auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[i], edgelist_minors[i])); - CUGRAPH_EXPECTS( - thrust::count_if( - handle.get_thrust_policy(), - edge_first, - edge_first + edgelist_edge_counts[i], - [num_majors = static_cast(sorted_majors.size()), - sorted_majors = sorted_majors.data(), - num_minors = static_cast(sorted_minors.size()), - sorted_minors = sorted_minors.data()] __device__(auto e) { - return !thrust::binary_search( - thrust::seq, sorted_majors, sorted_majors + num_majors, thrust::get<0>(e)) || - !thrust::binary_search( - thrust::seq, sorted_minors, sorted_minors + num_minors, thrust::get<1>(e)); - }) == 0, - "Invalid input argument: edgelist_majors and/or edgelist_minors have " - "invalid vertex ID(s)."); + CUGRAPH_EXPECTS(thrust::count_if(handle.get_thrust_policy(), + edge_first, + edge_first + edgelist_edge_counts[i], + check_edge_src_and_dst_t{ + sorted_majors.data(), + static_cast(sorted_majors.size()), + sorted_minors.data(), + static_cast(sorted_minors.size())}) == 0, + "Invalid input argument: edgelist_majors and/or edgelist_minors have " + "invalid vertex ID(s)."); } if (edgelist_intra_partition_segment_offsets) { @@ -623,22 +635,14 @@ void expensive_check_edgelist( auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[0], edgelist_minors[0])); CUGRAPH_EXPECTS( - thrust::count_if( - handle.get_thrust_policy(), - edge_first, - edge_first + edgelist_edge_counts[0], - [sorted_local_vertices = (*sorted_local_vertices).data(), - num_sorted_local_vertices = - static_cast((*sorted_local_vertices).size())] __device__(auto e) { - return !thrust::binary_search(thrust::seq, - sorted_local_vertices, - sorted_local_vertices + num_sorted_local_vertices, - thrust::get<0>(e)) || - !thrust::binary_search(thrust::seq, - sorted_local_vertices, - sorted_local_vertices + num_sorted_local_vertices, - thrust::get<1>(e)); - }) == 0, + thrust::count_if(handle.get_thrust_policy(), + edge_first, + edge_first + edgelist_edge_counts[0], + check_edge_src_and_dst_t{ + (*sorted_local_vertices).data(), + static_cast((*sorted_local_vertices).size()), + (*sorted_local_vertices).data(), + static_cast((*sorted_local_vertices).size())}) == 0, "Invalid input argument: edgelist_majors and/or edgelist_minors have " "invalid vertex ID(s)."); } From 6a0dfa1850cf3aa912f39b1a4ca8659232ac8fdc Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 1 Feb 2022 12:54:50 -0800 Subject: [PATCH 08/60] fix compile errors --- cpp/src/structure/renumber_edgelist_impl.cuh | 26 +++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 3f7891a7c4e..e186571d38d 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -316,12 +316,13 @@ compute_renumber_map(raft::handle_t const& handle, auto kv_pair_first = thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each(rmm::exec_policy(loop_stream), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_set_degree_t{sorted_majors.data(), - static_cast(sorted_majors.size()), - sorted_major_degrees.data()}); + thrust::for_each( + rmm::exec_policy(loop_stream), + kv_pair_first, + kv_pair_first + tmp_keys.size(), + search_and_set_degree_t{sorted_majors.data(), + static_cast(sorted_majors.size()), + 
sorted_major_degrees.data()}); device_reduce(col_comm, sorted_major_degrees.begin(), @@ -369,12 +370,13 @@ compute_renumber_map(raft::handle_t const& handle, auto kv_pair_first = thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each(handle.get_thrust_policy(), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_set_degree_t{sorted_local_vertices.data(), - static_cast(sorted_local_vertices.size()), - sorted_major_degrees.data()}); + thrust::for_each( + handle.get_thrust_policy(), + kv_pair_first, + kv_pair_first + tmp_keys.size(), + search_and_set_degree_t{sorted_local_vertices.data(), + static_cast(sorted_local_vertices.size()), + sorted_local_vertex_degrees.data()}); } // 4. sort local vertices by degree (descending) From 41645aa63b2a0999b0e639a034f422a7dd41b4d0 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 1 Feb 2022 16:49:02 -0800 Subject: [PATCH 09/60] code cleanup --- cpp/src/structure/renumber_edgelist_impl.cuh | 146 ++++++++++--------- 1 file changed, 74 insertions(+), 72 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index e186571d38d..f1358ddfddb 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -101,8 +101,8 @@ compute_renumber_map(raft::handle_t const& handle, edge_t num_local_edges = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); - // 1. if local_vertices.has_value() is false, keep unique vertices from edge majors as well (to - // construct local_vertices) + // 1. if local_vertices.has_value() is false, find unique vertices from edge majors (to construct + // local_vertices) unique edge majors will be counted in step 4. rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); if (!local_vertices) { @@ -183,7 +183,11 @@ compute_renumber_map(raft::handle_t const& handle, handle.sync_stream(); auto time2 = std::chrono::steady_clock::now(); #endif - if (!local_vertices) { + if (local_vertices) { + sorted_local_vertices = std::move(*local_vertices); + thrust::sort( + handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); + } else { sorted_local_vertices.resize(sorted_unique_majors.size() + sorted_unique_minors.size(), handle.get_stream()); @@ -209,11 +213,15 @@ compute_renumber_map(raft::handle_t const& handle, if constexpr (multi_gpu) { sorted_local_vertices = cugraph::detail::shuffle_vertices_by_gpu_id(handle, std::move(sorted_local_vertices)); + thrust::sort( + handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); + sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + sorted_local_vertices.begin(), + sorted_local_vertices.end())), + handle.get_stream()); + sorted_local_vertices.shrink_to_fit(handle.get_stream()); } - } else { - sorted_local_vertices = std::move(*local_vertices); - thrust::sort( - handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); } // 4. compute global degrees for the sorted local vertices, and count unique edge majors on the @@ -272,8 +280,7 @@ compute_renumber_map(raft::handle_t const& handle, for (int i = 0; i < col_comm_size; ++i) { auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool( - (*stream_pool_indices)[i % (*stream_pool_indices).size()]) + ? 
handle.get_stream_from_stream_pool(i % (*stream_pool_indices).size()) : handle.get_stream(); rmm::device_uvector tmp_majors(edgelist_edge_counts[i], loop_stream); @@ -305,7 +312,7 @@ compute_renumber_map(raft::handle_t const& handle, sorted_local_vertices.data(), sorted_majors.data(), edge_partition_major_sizes[i], - static_cast(i), + i, loop_stream); rmm::device_uvector sorted_major_degrees(sorted_majors.size(), loop_stream); @@ -517,20 +524,6 @@ void expensive_check_edgelist( "Invalid input argument: both edgelist_majors.size() & " "edgelist_minors.size() should coincide with col_comm_size."); - if (sorted_local_vertices) { - CUGRAPH_EXPECTS( - thrust::count_if( - handle.get_thrust_policy(), - (*sorted_local_vertices).begin(), - (*sorted_local_vertices).end(), - [comm_rank, - key_func = - detail::compute_gpu_id_from_vertex_t{comm_size}] __device__(auto val) { - return key_func(val) != comm_rank; - }) == 0, - "Invalid input argument: local_vertices should be pre-shuffled."); - } - for (size_t i = 0; i < edgelist_majors.size(); ++i) { auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[i], edgelist_minors[i])); @@ -557,46 +550,75 @@ void expensive_check_edgelist( "Invalid input argument: edgelist_majors & edgelist_minors should be " "pre-shuffled."); - if (sorted_local_vertices) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + if (edgelist_intra_partition_segment_offsets) { + for (int j = 0; j < row_comm_size; ++j) { + CUGRAPH_EXPECTS( + thrust::count_if( + handle.get_thrust_policy(), + edgelist_minors[i] + (*edgelist_intra_partition_segment_offsets)[i][j], + edgelist_minors[i] + (*edgelist_intra_partition_segment_offsets)[i][j + 1], + [row_comm_size, + col_comm_rank, + j, + gpu_id_key_func = + detail::compute_gpu_id_from_vertex_t{comm_size}] __device__(auto minor) { + return gpu_id_key_func(minor) != col_comm_rank * row_comm_size + j; + }) == 0, + "Invalid input argument: if edgelist_intra_partition_segment_offsets.has_value() is " + "true, edgelist_majors & edgelist_minors should be properly grouped " + "within each local partition."); + } + } + } + + if (sorted_local_vertices) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + + CUGRAPH_EXPECTS( + thrust::count_if( + handle.get_thrust_policy(), + (*sorted_local_vertices).begin(), + (*sorted_local_vertices).end(), + [comm_rank, + key_func = + detail::compute_gpu_id_from_vertex_t{comm_size}] __device__(auto val) { + return key_func(val) != comm_rank; + }) == 0, + "Invalid input argument: local_vertices should be pre-shuffled."); + auto major_sizes = + host_scalar_allgather(col_comm, (*sorted_local_vertices).size(), handle.get_stream()); + + rmm::device_uvector sorted_minors(0, handle.get_stream()); + auto recvcounts = + host_scalar_allgather(row_comm, (*sorted_local_vertices).size(), handle.get_stream()); + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + sorted_minors.resize(displacements.back() + recvcounts.back(), handle.get_stream()); + device_allgatherv(row_comm, + (*sorted_local_vertices).data(), + sorted_minors.data(), + recvcounts, + displacements, + handle.get_stream()); + thrust::sort(handle.get_thrust_policy(), sorted_minors.begin(), 
sorted_minors.end()); + + for (size_t i = 0; i < edgelist_majors.size(); ++i) { rmm::device_uvector sorted_majors(0, handle.get_stream()); { - auto major_size = - host_scalar_bcast(col_comm, - static_cast(i) == col_comm_rank ? (*sorted_local_vertices).size() - : size_t{0} /* dummy */, - i, - handle.get_stream()); - sorted_majors.resize(major_size, handle.get_stream()); + sorted_majors.resize(major_sizes[i], handle.get_stream()); device_bcast(col_comm, (*sorted_local_vertices).begin(), sorted_majors.begin(), - major_size, + major_sizes[i], i, handle.get_stream()); - thrust::sort(handle.get_thrust_policy(), sorted_majors.begin(), sorted_majors.end()); - } - - rmm::device_uvector sorted_minors(0, handle.get_stream()); - { - auto recvcounts = - host_scalar_allgather(row_comm, (*sorted_local_vertices).size(), handle.get_stream()); - std::vector displacements(recvcounts.size(), size_t{0}); - std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); - sorted_minors.resize(displacements.back() + recvcounts.back(), handle.get_stream()); - device_allgatherv(row_comm, - (*sorted_local_vertices).data(), - sorted_minors.data(), - recvcounts, - displacements, - handle.get_stream()); - thrust::sort(handle.get_thrust_policy(), sorted_minors.begin(), sorted_minors.end()); } auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[i], edgelist_minors[i])); + CUGRAPH_EXPECTS(thrust::count_if(handle.get_thrust_policy(), edge_first, edge_first + edgelist_edge_counts[i], @@ -608,26 +630,6 @@ void expensive_check_edgelist( "Invalid input argument: edgelist_majors and/or edgelist_minors have " "invalid vertex ID(s)."); } - - if (edgelist_intra_partition_segment_offsets) { - for (int j = 0; j < row_comm_size; ++j) { - CUGRAPH_EXPECTS( - thrust::count_if( - handle.get_thrust_policy(), - edgelist_minors[i] + (*edgelist_intra_partition_segment_offsets)[i][j], - edgelist_minors[i] + (*edgelist_intra_partition_segment_offsets)[i][j + 1], - [row_comm_size, - col_comm_rank, - j, - gpu_id_key_func = - detail::compute_gpu_id_from_vertex_t{comm_size}] __device__(auto minor) { - return gpu_id_key_func(minor) != col_comm_rank * row_comm_size + j; - }) == 0, - "Invalid input argument: if edgelist_intra_partition_segment_offsets.has_value() is " - "true, edgelist_majors & edgelist_minors should be properly grouped " - "within each local partition."); - } - } } } else { assert(edgelist_majors.size() == 1); From 6b4b6828c5b8ca6fd6100d15d35689ca36a31e18 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 1 Feb 2022 17:13:56 -0800 Subject: [PATCH 10/60] update copy_v_transform_reduce_in_out_nbr to process multiple edge partitions in parallel --- .../copy_v_transform_reduce_in_out_nbr.cuh | 199 ++++++++++++------ 1 file changed, 139 insertions(+), 60 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh index a98013ac996..fbd339fdf32 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh @@ -35,6 +35,7 @@ #include #include +#include #include #include @@ -475,21 +476,12 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, T init, VertexValueOutputIterator vertex_value_output_first) { -// FIXME: for temporary testing -#if 1 - cudaStream_t high_stream0{}; - cudaStream_t high_stream1{}; - cudaStream_t mid_stream{}; - cudaStream_t low_stream{}; - 
CUDA_TRY(cudaStreamCreateWithPriority(&high_stream0, cudaStreamNonBlocking, -5)); - CUDA_TRY(cudaStreamCreateWithPriority(&high_stream1, cudaStreamNonBlocking, -5)); - CUDA_TRY(cudaStreamCreateWithPriority(&mid_stream, cudaStreamNonBlocking, -3)); - CUDA_TRY(cudaStreamCreateWithPriority(&low_stream, cudaStreamNonBlocking, 0)); -#endif constexpr auto update_major = (in == GraphViewType::is_adj_matrix_transposed); - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using weight_t = typename GraphViewType::weight_type; + [[maybe_unused]] constexpr auto max_segments = + detail::num_sparse_segments_per_vertex_partition + size_t{1}; + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); @@ -524,15 +516,81 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } } + std::optional> stream_pool_indices{std::nullopt}; + size_t num_concurrent_loops{1}; // FIXME: this can go inside the loop after temporary testing +#if 1 // FIXME: for temporary testing + std::vector high_streams0{}; + std::vector high_streams1{}; + std::vector mid_streams{}; + std::vector low_streams{}; +#endif + if constexpr (GraphViewType::is_multi_gpu) { + if (handle.get_stream_pool_size() >= max_segments) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + // memory footprint vs parallelism trade-off + // peak memory requirement per loop is + // update_major ? V / comm_size * sizeof(T) : 0 + // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) + + num_concurrent_loops = + std::min(static_cast(col_comm_size), handle.get_stream_pool_size() / max_segments); + if constexpr (update_major) { + size_t value_size{0}; + if constexpr (is_thrust_tuple_of_arithmetic::value) { + auto elem_sizes = compute_thrust_tuple_element_sizes{}(); + value_size = std::reduce(elem_sizes.begin(), elem_sizes.end()); + } else { + value_size = sizeof(T); + } + + auto avg_vertex_degree = graph_view.get_number_of_vertices() > 0 + ? (static_cast(graph_view.get_number_of_edges()) / + static_cast(graph_view.get_number_of_vertices())) + : double{0.0}; + + num_concurrent_loops = + std::min(static_cast(avg_vertex_degree * (static_cast(sizeof(vertex_t)) / + static_cast(value_size))), + num_concurrent_loops); + } + +#if 1 // FIXME: for temporary testing + high_streams0.resize(num_concurrent_loops); + high_streams1.resize(num_concurrent_loops); + mid_streams.resize(num_concurrent_loops); + low_streams.resize(num_concurrent_loops); + for (size_t i = 0; i < num_concurrent_loops; ++i) { + CUDA_TRY(cudaStreamCreateWithPriority(&high_streams0[i], cudaStreamNonBlocking, -2)); + CUDA_TRY(cudaStreamCreateWithPriority(&high_streams1[i], cudaStreamNonBlocking, -2)); + CUDA_TRY(cudaStreamCreateWithPriority(&mid_streams[i], cudaStreamNonBlocking, -1)); + CUDA_TRY(cudaStreamCreateWithPriority(&low_streams[i], cudaStreamNonBlocking, 0)); + } +#endif + + stream_pool_indices = std::vector(num_concurrent_loops * max_segments); + std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); + handle.sync_stream(); + } + } + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + auto loop_stream = stream_pool_indices + ? 
rmm::cuda_stream_view{high_streams0[i % num_concurrent_loops]} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((i * + max_segments) % + (*stream_pool_indices).size()) */ + : handle.get_stream(); + auto matrix_partition = matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)); auto major_tmp_buffer_size = GraphViewType::is_multi_gpu && update_major ? matrix_partition.get_major_size() : vertex_t{0}; - auto major_tmp_buffer = - allocate_dataframe_buffer(major_tmp_buffer_size, handle.get_stream()); + auto major_tmp_buffer = allocate_dataframe_buffer(major_tmp_buffer_size, loop_stream); auto major_buffer_first = get_dataframe_buffer_begin(major_tmp_buffer); auto major_init = T{}; @@ -569,28 +627,28 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } else { output_buffer = vertex_value_output_first; } + + if (stream_pool_indices) { + CUDA_TRY(cudaStreamSynchronize( + high_streams0[i % num_concurrent_loops])); /* FIXME for temporary testing, + handle.sync_stream_pool(std::vector{(i * max_segments) % + (*stream_pool_indices).size()}); */ + } + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); - std::optional> stream_pool_indices{std::nullopt}; if (segment_offsets) { static_assert(detail::num_sparse_segments_per_vertex_partition == 3); - auto num_segments = detail::num_sparse_segments_per_vertex_partition + - (matrix_partition.get_dcs_nzd_vertex_count() ? size_t{1} : size_t{0}); - if (GraphViewType::is_multi_gpu && handle.get_stream_pool_size() >= num_segments) { - stream_pool_indices = std::vector(num_segments); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); - } - // FIXME: we may further improve performance by 1) individually tuning block sizes for // different segments; and 2) adding one more segment for very high degree vertices and // running segmented reduction if (matrix_partition.get_dcs_nzd_vertex_count()) { auto exec_stream = stream_pool_indices - ? rmm::cuda_stream_view{high_stream0} + ? rmm::cuda_stream_view{high_streams0[i % num_concurrent_loops]} /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((*stream_pool_indices)[0]) */ - : handle.get_stream(); + handle.get_stream_from_stream_pool((i * max_segments) % + (*stream_pool_indices).size()) */ + : loop_stream; if constexpr (update_major) { // this is necessary as we don't visit every vertex in the // hypersparse segment in // for_all_major_for_all_nbr_hypersparse @@ -617,11 +675,12 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } } if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - auto exec_stream = stream_pool_indices ? rmm::cuda_stream_view{high_stream1} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() - ? 1 : 0]) */ - : handle.get_stream(); + auto exec_stream = stream_pool_indices + ? 
rmm::cuda_stream_view{high_streams1[i % num_concurrent_loops]} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((i * max_segments + 1) % + (*stream_pool_indices).size()) */ + : loop_stream; raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -639,11 +698,12 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, major_init); } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { - auto exec_stream = stream_pool_indices ? rmm::cuda_stream_view{mid_stream} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() - ? 2 : 1]) */ - : handle.get_stream(); + auto exec_stream = stream_pool_indices + ? rmm::cuda_stream_view{mid_streams[i % num_concurrent_loops]} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((i * max_segments + 2) % + (*stream_pool_indices).size()) */ + : loop_stream; raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -661,11 +721,12 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, major_init); } if ((*segment_offsets)[1] > 0) { - auto exec_stream = stream_pool_indices ? rmm::cuda_stream_view{low_stream} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() - ? 3 : 2]) */ - : handle.get_stream(); + auto exec_stream = stream_pool_indices + ? rmm::cuda_stream_view{low_streams[i % num_concurrent_loops]} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((i * max_segments + 3) % + (*stream_pool_indices).size()) */ + : loop_stream; raft::grid_1d_block_t update_grid((*segment_offsets)[1], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -686,7 +747,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); detail::for_all_major_for_all_nbr_low_degree - <<>>( + <<>>( matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_last(), @@ -715,19 +776,19 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, (*segment_offsets).back() - (*segment_offsets)[3], raft::comms::op_t::SUM, i, - high_stream0/* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[0]) */); + high_streams0[i % num_concurrent_loops]/* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) */); } if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { device_reduce( - col_comm, major_buffer_first + (*segment_offsets)[2], vertex_value_output_first + (*segment_offsets)[2], (*segment_offsets)[3] - (*segment_offsets)[2], raft::comms::op_t::SUM, i, high_stream1 /* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() ? 
1 : 0]) */); + col_comm, major_buffer_first + (*segment_offsets)[2], vertex_value_output_first + (*segment_offsets)[2], (*segment_offsets)[3] - (*segment_offsets)[2], raft::comms::op_t::SUM, i, high_streams1[i % num_concurrent_loops] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()) */); } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { device_reduce( - col_comm, major_buffer_first + (*segment_offsets)[1], vertex_value_output_first + (*segment_offsets)[1], (*segment_offsets)[2] - (*segment_offsets)[1], raft::comms::op_t::SUM, i, mid_stream /* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() ? 2 : 1]) */); + col_comm, major_buffer_first + (*segment_offsets)[1], vertex_value_output_first + (*segment_offsets)[1], (*segment_offsets)[2] - (*segment_offsets)[1], raft::comms::op_t::SUM, i, mid_streams[i % num_concurrent_loops] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()) */); } if ((*segment_offsets)[1] > 0) { device_reduce( - col_comm, major_buffer_first, vertex_value_output_first, (*segment_offsets)[1], raft::comms::op_t::SUM, i, low_stream /* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() ? 3 : 2]) */); + col_comm, major_buffer_first, vertex_value_output_first, (*segment_offsets)[1], raft::comms::op_t::SUM, i, low_streams[i % num_concurrent_loops] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()) */); } } else { device_reduce(col_comm, @@ -736,20 +797,28 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, matrix_partition.get_major_size(), raft::comms::op_t::SUM, i, - handle.get_stream()); + loop_stream); } } + } - if (stream_pool_indices) { + if (stream_pool_indices) { #if 1 // FIXME: for temporary testing - CUDA_TRY(cudaStreamSynchronize(high_stream0)); - CUDA_TRY(cudaStreamSynchronize(high_stream1)); - CUDA_TRY(cudaStreamSynchronize(mid_stream)); - CUDA_TRY(cudaStreamSynchronize(low_stream)); + for (size_t i = 0; i < high_streams0.size(); ++i) { + CUDA_TRY(cudaStreamSynchronize(high_streams0[i])); + } + for (size_t i = 0; i < high_streams1.size(); ++i) { + CUDA_TRY(cudaStreamSynchronize(high_streams1[i])); + } + for (size_t i = 0; i < mid_streams.size(); ++i) { + CUDA_TRY(cudaStreamSynchronize(mid_streams[i])); + } + for (size_t i = 0; i < low_streams.size(); ++i) { + CUDA_TRY(cudaStreamSynchronize(low_streams[i])); + } #else - handle.sync_stream_pool(*stream_pool_indices); + handle.sync_stream_pool(*stream_pool_indices); #endif - } } if constexpr (GraphViewType::is_multi_gpu && !update_major) { @@ -816,10 +885,20 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } // FIXME: for temporary testing #if 1 - CUDA_TRY(cudaStreamDestroy(low_stream)); - CUDA_TRY(cudaStreamDestroy(mid_stream)); - CUDA_TRY(cudaStreamDestroy(high_stream1)); - CUDA_TRY(cudaStreamDestroy(high_stream0)); + if (stream_pool_indices) { + for (size_t i = 0; i < low_streams.size(); ++i) { + CUDA_TRY(cudaStreamDestroy(low_streams[i])); + } + for (size_t i = 0; i < mid_streams.size(); ++i) { + CUDA_TRY(cudaStreamDestroy(mid_streams[i])); + } + for (size_t i = 0; i < high_streams1.size(); ++i) { + CUDA_TRY(cudaStreamDestroy(high_streams1[i])); + } + for (size_t i = 0; i < high_streams0.size(); ++i) { + 
CUDA_TRY(cudaStreamDestroy(high_streams0[i]));
+    }
+  }
 #endif
 }

From a0b009e9f97d09511e9dd42dd5e047f476f65ca0 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 2 Feb 2022 11:19:14 -0800
Subject: [PATCH 11/60] fix overflow bug with 2^31 or more vertices

---
 cpp/src/generators/generate_rmat_edgelist.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/generators/generate_rmat_edgelist.cu b/cpp/src/generators/generate_rmat_edgelist.cu
index 8ee99d61747..07b01853fdd 100644
--- a/cpp/src/generators/generate_rmat_edgelist.cu
+++ b/cpp/src/generators/generate_rmat_edgelist.cu
@@ -94,8 +94,8 @@ std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> generat
           }
         }
       }
-      src += src_bit_set ? static_cast<vertex_t>(1 << bit) : 0;
-      dst += dst_bit_set ? static_cast<vertex_t>(1 << bit) : 0;
+      src += src_bit_set ? static_cast<vertex_t>(vertex_t{1} << bit) : 0;
+      dst += dst_bit_set ? static_cast<vertex_t>(vertex_t{1} << bit) : 0;
     }
     return thrust::make_tuple(src, dst);
   });

From 9a70472cc33f53389fe526571281fdb02187bdd3 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Thu, 3 Feb 2022 13:22:49 -0800
Subject: [PATCH 12/60] delete temporary performance measurement code

---
 cpp/src/structure/graph_impl.cuh | 34 --------------------------------
 1 file changed, 34 deletions(-)

diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh
index a4f6d55836f..ef64e60ac2f 100644
--- a/cpp/src/structure/graph_impl.cuh
+++ b/cpp/src/structure/graph_impl.cuh
@@ -486,13 +486,6 @@ graph_tget_handle_ptr()->get_comms();
@@ -625,10 +618,6 @@ graph_t
-  std::chrono::duration<double> elapsed_total = time5 - time0;
-  std::chrono::duration<double> elapsed0 = time1 - time0;
-  std::chrono::duration<double> elapsed1 = time2 - time1;
-  std::chrono::duration<double> elapsed2 = time3 - time2;
-  std::chrono::duration<double> elapsed3 = time4 - time3;
-  std::chrono::duration<double> elapsed4 = time5 - time4;
-  std::cout << "Graph constructor took " << elapsed_total.count() * 1e3 << " ms, breakdown=("
-            << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << ","
-            << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << ","
-            << elapsed4.count() * 1e3 << ") ms."
<< std::endl; -#endif } template Date: Thu, 3 Feb 2022 13:26:34 -0800 Subject: [PATCH 13/60] delete additional temporary performance measurement code --- cpp/src/structure/renumber_edgelist_impl.cuh | 87 -------------------- 1 file changed, 87 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index f1358ddfddb..ac4c5f7fe1f 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -86,15 +86,6 @@ compute_renumber_map(raft::handle_t const& handle, std::vector const& edgelist_minors, std::vector const& edgelist_edge_counts) { -#if 1 // FIXME: delete - handle.sync_stream(); - if constexpr (multi_gpu) { - rmm::device_uvector dummy(1, handle.get_stream()); - handle.get_comms().allreduce( - dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); - } - auto time0 = std::chrono::steady_clock::now(); -#endif rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); vertex_t num_local_unique_edge_majors{0}; vertex_t num_local_unique_edge_minors{0}; @@ -140,10 +131,6 @@ compute_renumber_map(raft::handle_t const& handle, // if local_vertices.has_value() is false, keep unique vertices from edge minors as well (to // construct local_vertices) -#if 1 // FIXME: delete - handle.sync_stream(); - auto time1 = std::chrono::steady_clock::now(); -#endif rmm::device_uvector sorted_unique_minors(num_local_edges, handle.get_stream()); size_t minor_offset{0}; for (size_t i = 0; i < edgelist_minors.size(); ++i) { @@ -179,10 +166,6 @@ compute_renumber_map(raft::handle_t const& handle, // 3. update sorted_local_vertices. // if local_vertices.has_value() is false, reconstruct local_vertices first -#if 1 // FIXME: delete - handle.sync_stream(); - auto time2 = std::chrono::steady_clock::now(); -#endif if (local_vertices) { sorted_local_vertices = std::move(*local_vertices); thrust::sort( @@ -227,10 +210,6 @@ compute_renumber_map(raft::handle_t const& handle, // 4. compute global degrees for the sorted local vertices, and count unique edge majors on the // way -#if 1 // FIXME: delete - handle.sync_stream(); - auto time3 = std::chrono::steady_clock::now(); -#endif rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); std::optional> stream_pool_indices{ std::nullopt}; // FIXME: move this inside the if statement @@ -276,7 +255,6 @@ compute_renumber_map(raft::handle_t const& handle, handle.sync_stream(); } } - stream_pool_indices = std::nullopt; // FIXME: delete for (int i = 0; i < col_comm_size; ++i) { auto loop_stream = stream_pool_indices @@ -387,10 +365,6 @@ compute_renumber_map(raft::handle_t const& handle, } // 4. sort local vertices by degree (descending) -#if 1 // FIXME: delete - handle.sync_stream(); - auto time4 = std::chrono::steady_clock::now(); -#endif thrust::sort_by_key(handle.get_thrust_policy(), sorted_local_vertex_degrees.begin(), @@ -399,10 +373,6 @@ compute_renumber_map(raft::handle_t const& handle, thrust::greater()); // 5. 
compute segment_offsets -#if 1 // FIXME: delete - handle.sync_stream(); - auto time5 = std::chrono::steady_clock::now(); -#endif static_assert(detail::num_sparse_segments_per_vertex_partition == 3); static_assert((detail::low_degree_threshold <= detail::mid_degree_threshold) && @@ -456,24 +426,6 @@ compute_renumber_map(raft::handle_t const& handle, handle.get_stream()); handle.sync_stream(); -#if 1 // FIXME: delete - handle.sync_stream(); - auto time6 = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_total = time6 - time0; - std::chrono::duration elapsed0 = time1 - time0; - std::chrono::duration elapsed1 = time2 - time1; - std::chrono::duration elapsed2 = time3 - time2; - std::chrono::duration elapsed3 = time4 - time3; - std::chrono::duration elapsed4 = time5 - time4; - std::chrono::duration elapsed5 = time6 - time5; - std::cout << "Compute renumber map (num_streams:" - << (stream_pool_indices ? (*stream_pool_indices).size() : size_t{0}) << ") took " - << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," - << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," - << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << "," - << elapsed5.count() * 1e3 << ") ms." << std::endl; -#endif - return std::make_tuple(std::move(sorted_local_vertices), h_segment_offsets, num_local_unique_edge_majors, @@ -673,15 +625,6 @@ renumber_edgelist( std::optional>> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check) { -#if 1 // FIXME: delete - handle.sync_stream(); - if constexpr (multi_gpu) { - rmm::device_uvector dummy(1, handle.get_stream()); - handle.get_comms().allreduce( - dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); - } - auto time0 = std::chrono::steady_clock::now(); -#endif auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); auto const comm_rank = comm.get_rank(); @@ -739,10 +682,6 @@ renumber_edgelist( // 1. compute renumber map -#if 1 // FIXME: delete - handle.sync_stream(); - auto time1 = std::chrono::steady_clock::now(); -#endif auto [renumber_map_labels, vertex_partition_segment_offsets, num_unique_edge_majors, @@ -756,10 +695,6 @@ renumber_edgelist( // 2. 
initialize partition_t object, number_of_vertices, and number_of_edges for the coarsened // graph -#if 1 // FIXME: delete - handle.sync_stream(); - auto time2 = std::chrono::steady_clock::now(); -#endif auto vertex_counts = host_scalar_allgather( comm, static_cast(renumber_map_labels.size()), handle.get_stream()); std::vector vertex_partition_offsets(comm_size + 1, 0); @@ -783,10 +718,6 @@ renumber_edgelist( // FIXME: compare this hash based approach with a binary search based approach in both memory // footprint and execution time -#if 1 // FIXME: delete - handle.sync_stream(); - auto time3 = std::chrono::steady_clock::now(); -#endif { vertex_t max_matrix_partition_major_size{0}; for (size_t i = 0; i < edgelist_majors.size(); ++i) { @@ -834,10 +765,6 @@ renumber_edgelist( } } -#if 1 // FIXME: delete - handle.sync_stream(); - auto time4 = std::chrono::steady_clock::now(); -#endif if ((partition.get_matrix_partition_minor_size() >= number_of_edges / comm_size) && edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part @@ -933,20 +860,6 @@ renumber_edgelist( handle.get_stream()); } } -#if 1 // FIXME: delete - handle.sync_stream(); - auto time5 = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_total = time5 - time0; - std::chrono::duration elapsed0 = time1 - time0; - std::chrono::duration elapsed1 = time2 - time1; - std::chrono::duration elapsed2 = time3 - time2; - std::chrono::duration elapsed3 = time4 - time3; - std::chrono::duration elapsed4 = time5 - time4; - std::cout << "Renumber took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" - << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," - << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," - << elapsed4.count() * 1e3 << ") ms." << std::endl; -#endif return std::make_tuple( std::move(renumber_map_labels), From 3a605b51d815ae6ad4936df7e36c99aa94bc87ba Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 3 Feb 2022 13:36:45 -0800 Subject: [PATCH 14/60] remove temporary performance measurement code --- cpp/src/structure/renumber_edgelist_impl.cuh | 2 +- cpp/tests/link_analysis/mg_pagerank_test.cpp | 22 ---------- cpp/tests/utilities/test_graphs.hpp | 44 -------------------- 3 files changed, 1 insertion(+), 67 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index ac4c5f7fe1f..959d11b783f 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -93,7 +93,7 @@ compute_renumber_map(raft::handle_t const& handle, edge_t num_local_edges = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); // 1. if local_vertices.has_value() is false, find unique vertices from edge majors (to construct - // local_vertices) unique edge majors will be counted in step 4. + // local_vertices), unique edge majors will be counted in step 4. 
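+  // (illustrative example: for a sorted major span {1, 1, 3, 5, 5}, counting the indices i in
+  // [0, 5) for which is_first_in_run_t returns true yields 3, the unique-major count that step 4
+  // computes with thrust::count_if)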
rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); if (!local_vertices) { diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp index 4a6dd08dabd..adcd0c94a8f 100644 --- a/cpp/tests/link_analysis/mg_pagerank_test.cpp +++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp @@ -63,9 +63,6 @@ class Tests_MGPageRank raft::handle_t handle{}; HighResClock hr_clock{}; -#if 1 // FIXME: delete - auto time0 = std::chrono::steady_clock::now(); -#endif raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); auto& comm = handle.get_comms(); @@ -78,25 +75,6 @@ class Tests_MGPageRank } cugraph::partition_2d::subcomm_factory_t subcomm_factory(handle, row_comm_size); -#if 1 // FIXME: delete - { - rmm::device_uvector tx_ints(comm_size, handle.get_stream()); - rmm::device_uvector rx_ints(comm_size, handle.get_stream()); - std::vector tx_sizes(comm_size, size_t{1}); - std::vector tx_offsets(comm_size); - std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0}); - std::vector tx_ranks(comm_size); - std::iota(tx_ranks.begin(), tx_ranks.end(), int32_t{0}); - auto rx_sizes = tx_sizes; - auto rx_offsets = tx_offsets; - auto rx_ranks = tx_ranks; - handle.get_comms().device_multicast_sendrecv(tx_ints.data(), tx_sizes, tx_offsets, tx_ranks, rx_ints.data(), rx_sizes, rx_offsets, rx_ranks, handle.get_stream()); - handle.sync_stream(); - } - auto time1 = std::chrono::steady_clock::now(); - std::chrono::duration elapsed = time1 - time0; - std::cout << "Handle initialization and 1st all-to-all took " << elapsed.count() * 1e3 << " ms." << std::endl; -#endif // 2. create MG graph diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 8818d9633bf..9fa4cee9f7a 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -147,13 +147,6 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { static_cast(std::numeric_limits::max()), "Invalid template parameter: (scale_, edge_factor_) too large for edge_t"); -#if 1 // FIXME: delete - handle.sync_stream(); - if constexpr (multi_gpu) { - handle.get_comms().barrier(); - } - auto time0 = std::chrono::steady_clock::now(); -#endif std::vector partition_ids(1); size_t num_partitions; @@ -198,10 +191,6 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { } } -#if 1 // FIXME: delete - handle.sync_stream(); - auto time1 = std::chrono::steady_clock::now(); -#endif rmm::device_uvector src_v(0, handle.get_stream()); rmm::device_uvector dst_v(0, handle.get_stream()); auto weights_v = test_weighted @@ -258,25 +247,13 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { } } -#if 1 // FIXME: delete - handle.sync_stream(); - auto time2 = std::chrono::steady_clock::now(); -#endif translate(handle, src_v, dst_v); -#if 1 // FIXME: delete - handle.sync_stream(); - auto time3 = std::chrono::steady_clock::now(); -#endif if (undirected_) std::tie(src_v, dst_v, weights_v) = cugraph::symmetrize_edgelist_from_triangular( handle, std::move(src_v), std::move(dst_v), std::move(weights_v)); -#if 1 // FIXME: delete - handle.sync_stream(); - auto time4 = std::chrono::steady_clock::now(); -#endif if (multi_gpu) { std::tie(store_transposed ? dst_v : src_v, store_transposed ? 
src_v : dst_v, weights_v) = cugraph::detail::shuffle_edgelist_by_gpu_id( @@ -286,10 +263,6 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { std::move(weights_v)); } -#if 1 // FIXME: delete - handle.sync_stream(); - auto time5 = std::chrono::steady_clock::now(); -#endif rmm::device_uvector vertices_v(0, handle.get_stream()); for (size_t i = 0; i < partition_ids.size(); ++i) { auto id = partition_ids[i]; @@ -303,27 +276,10 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { partition_vertex_firsts[i]); } -#if 1 // FIXME: delete - handle.sync_stream(); - auto time6 = std::chrono::steady_clock::now(); -#endif if constexpr (multi_gpu) { vertices_v = cugraph::detail::shuffle_vertices_by_gpu_id(handle, std::move(vertices_v)); } -#if 1 // FIXME: delete - handle.sync_stream(); - auto time7 = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_total = time7 - time0; - std::chrono::duration elapsed0 = time1 - time0; - std::chrono::duration elapsed1 = time2 - time1; - std::chrono::duration elapsed2 = time3 - time2; - std::chrono::duration elapsed3 = time4 - time3; - std::chrono::duration elapsed4 = time5 - time4; - std::chrono::duration elapsed5 = time6 - time5; - std::chrono::duration elapsed6 = time7 - time6; - std::cout << "Edge generation took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << "," << elapsed5.count() * 1e3 << "," << elapsed6.count() * 1e3 << ") ms." << std::endl; -#endif return std::make_tuple( std::move(src_v), std::move(dst_v), From 4852ce4536e7fef9e1364eaaf11a8c59ac01bc74 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 3 Feb 2022 14:06:21 -0800 Subject: [PATCH 15/60] clang-format & copyright year --- cpp/include/cugraph/detail/graph_utils.cuh | 5 +++-- cpp/include/cugraph/utilities/shuffle_comm.cuh | 2 +- cpp/src/generators/generate_rmat_edgelist.cu | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/include/cugraph/detail/graph_utils.cuh b/cpp/include/cugraph/detail/graph_utils.cuh index ca918c53a62..254744d11d9 100644 --- a/cpp/include/cugraph/detail/graph_utils.cuh +++ b/cpp/include/cugraph/detail/graph_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -80,7 +80,8 @@ struct compute_partition_id_from_edge_t { template struct is_first_in_run_t { vertex_t const* vertices{nullptr}; - __device__ bool operator()(size_t i) const { + __device__ bool operator()(size_t i) const + { return (i == 0) || (vertices[i - 1] != vertices[i]); } }; diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 3840de019fc..b1b60f49fde 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
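For reference, a minimal usage sketch of detail::is_first_in_run_t as reformatted above (an
illustration only, not part of these patches; it assumes a sorted rmm::device_uvector<vertex_t>
named sorted_vertices and a raft::handle_t named handle, and mirrors the thrust::count_if call
sites in renumber_edgelist_impl.cuh):

  // count distinct values in a sorted device array by flagging the first element of each run
  auto num_uniques =
    thrust::count_if(handle.get_thrust_policy(),
                     thrust::make_counting_iterator(size_t{0}),
                     thrust::make_counting_iterator(sorted_vertices.size()),
                     cugraph::detail::is_first_in_run_t<vertex_t>{sorted_vertices.data()});

Flagging run starts turns the unique count into a single count_if pass over indices, avoiding a
materialized thrust::unique output and the device-lambda workaround issue noted in PATCH 07.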
diff --git a/cpp/src/generators/generate_rmat_edgelist.cu b/cpp/src/generators/generate_rmat_edgelist.cu index 07b01853fdd..8aa33d744f7 100644 --- a/cpp/src/generators/generate_rmat_edgelist.cu +++ b/cpp/src/generators/generate_rmat_edgelist.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 96e9693d9dc7777377ba54473e1be60e6141d693 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 3 Feb 2022 17:46:23 -0800 Subject: [PATCH 16/60] add temporary performance measurement code --- .../cugraph/utilities/shuffle_comm.cuh | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 3840de019fc..04fcbf749b7 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -244,11 +244,23 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, ValueToGPUIdOp value_to_gpu_id_op, rmm::cuda_stream_view stream_view) { +#if 1 // FIXME: delete + rmm::device_uvector dummy(1, stream_view); + stream_view.synchronize(); + comm.allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, stream_view); + auto time0 = std::chrono::steady_clock::now(); +#endif auto const comm_size = comm.get_size(); auto d_tx_value_counts = groupby_and_count( tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream_view); +#if 1 // FIXME: delete + stream_view.synchronize(); + auto time1 = std::chrono::steady_clock::now(); + comm.allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, stream_view); + auto time2 = std::chrono::steady_clock::now(); +#endif std::vector tx_counts{}; std::vector tx_offsets{}; std::vector tx_dst_ranks{}; @@ -262,6 +274,12 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, allocate_dataframe_buffer::value_type>( rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); +#if 1 // FIXME: delete + stream_view.synchronize(); + auto time3 = std::chrono::steady_clock::now(); + comm.allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, stream_view); + auto time4 = std::chrono::steady_clock::now(); +#endif // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). 
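  // (each rank sends tx_counts[i] values starting at tx_offsets[i] to rank tx_dst_ranks[i] and
  // receives rx_counts[j] values at rx_offsets[j] from rank rx_src_ranks[j])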
device_multicast_sendrecv(comm, @@ -275,6 +293,10 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, rx_src_ranks, stream_view); +#if 1 // FIXME: delete + stream_view.synchronize(); + auto time5 = std::chrono::steady_clock::now(); +#endif if (rx_counts.size() < static_cast(comm_size)) { std::vector tmp_rx_counts(comm_size, size_t{0}); for (size_t i = 0; i < rx_src_ranks.size(); ++i) { @@ -282,6 +304,18 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, } rx_counts = std::move(tmp_rx_counts); } +#if 1 // FIXME: delete + stream_view.synchronize(); + auto time6 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_total = time6 - time0; + std::chrono::duration elapsed0 = time1 - time0; + std::chrono::duration elapsed1 = time2 - time1; + std::chrono::duration elapsed2 = time3 - time2; + std::chrono::duration elapsed3 = time4 - time3; + std::chrono::duration elapsed4 = time5 - time4; + std::chrono::duration elapsed5 = time6 - time5; + std::cout << "Shuffle values (" << thrust::distance(tx_value_first, tx_value_last) << ") took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << "," << elapsed5.count() * 1e3 << ") ms." << std::endl; +#endif return std::make_tuple(std::move(rx_value_buffer), rx_counts); } From e8769d34b1d8113dae19a348aef97722914051bd Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 3 Feb 2022 23:35:14 -0800 Subject: [PATCH 17/60] add temporary performance measurement code to PageRank implementation --- cpp/src/link_analysis/pagerank_impl.cuh | 35 +++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index b6023d21bf2..a4df330c617 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -192,6 +192,14 @@ void pagerank( row_properties_t adj_matrix_row_pageranks(handle, pull_graph_view); size_t iter{0}; while (true) { +#if 1 // FIXME: delete + handle.sync_stream(); + if constexpr (GraphViewType::is_multi_gpu) { + rmm::device_uvector dummy(1, handle.get_stream()); + handle.get_comms().allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); + } + auto time0 = std::chrono::steady_clock::now(); +#endif thrust::copy(handle.get_thrust_policy(), pageranks, pageranks + pull_graph_view.get_number_of_local_vertices(), @@ -223,8 +231,16 @@ void pagerank( return pagerank / divisor; }); +#if 1 // FIXME: delete + handle.sync_stream(); + auto time1 = std::chrono::steady_clock::now(); +#endif copy_to_adj_matrix_row(handle, pull_graph_view, pageranks, adj_matrix_row_pageranks); +#if 1 // FIXME: delete + handle.sync_stream(); + auto time2 = std::chrono::steady_clock::now(); +#endif auto unvarying_part = aggregate_personalization_vector_size == 0 ? 
(dangling_sum * alpha + static_cast(1.0 - alpha)) / static_cast(num_vertices) @@ -241,6 +257,10 @@ void pagerank( unvarying_part, pageranks); +#if 1 // FIXME: delete + handle.sync_stream(); + auto time3 = std::chrono::steady_clock::now(); +#endif if (aggregate_personalization_vector_size > 0) { auto vertex_partition = vertex_partition_device_view_t( pull_graph_view.get_vertex_partition_view()); @@ -260,6 +280,10 @@ void pagerank( }); } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time4 = std::chrono::steady_clock::now(); +#endif auto diff_sum = transform_reduce_v( handle, pull_graph_view, @@ -267,6 +291,17 @@ void pagerank( [] __device__(auto val) { return std::abs(thrust::get<0>(val) - thrust::get<1>(val)); }, result_t{0.0}); +#if 1 // FIXME: delete + handle.sync_stream(); + auto time5 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_total = time5 - time0; + std::chrono::duration elapsed0 = time1 - time0; + std::chrono::duration elapsed1 = time2 - time1; + std::chrono::duration elapsed2 = time3 - time2; + std::chrono::duration elapsed3 = time4 - time3; + std::chrono::duration elapsed4 = time5 - time4; + std::cout << "PageRank iter " << iter << " took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << ") ms." << std::endl; +#endif iter++; if (diff_sum < epsilon) { From dc6acefb5a6386283a7dcb8d061dca1a4bbd33e5 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 4 Feb 2022 00:27:28 -0800 Subject: [PATCH 18/60] add more performance measurements to MG PageRank test --- cpp/tests/link_analysis/mg_pagerank_test.cpp | 87 ++++++++++++++++++-- 1 file changed, 79 insertions(+), 8 deletions(-) diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp index df264f2e0e1..5b60a130728 100644 --- a/cpp/tests/link_analysis/mg_pagerank_test.cpp +++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp @@ -61,26 +61,82 @@ class Tests_MGPageRank { // 1. 
initialize handle

-    raft::handle_t handle(rmm::cuda_stream_per_thread, std::make_shared<rmm::cuda_stream_pool>());
+    auto constexpr pool_size = 64;  // FIXME: tuning parameter
+    raft::handle_t handle(rmm::cuda_stream_per_thread, std::make_shared<rmm::cuda_stream_pool>(pool_size));
     HighResClock hr_clock{};

+#if 1  // FIXME: delete
+    auto time0 = std::chrono::steady_clock::now();
+#endif
     raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
     auto& comm = handle.get_comms();
     auto const comm_size = comm.get_size();
     auto const comm_rank = comm.get_rank();

-    auto row_comm_size = static_cast<int>(sqrt(static_cast<double>(comm_size)));
-    while (comm_size % row_comm_size != 0) {
-      --row_comm_size;
+    int row_comm_size{};
+    int num_gpus_per_node{};
+    RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node));
+    if (comm_size > num_gpus_per_node) {  // multi-node, inter-node communication bandwidth
+                                          // (Infiniband) is more likely to be a bottleneck than
+                                          // intra-node (NVLink) communication bandwidth
+      CUGRAPH_EXPECTS((comm_size % num_gpus_per_node) == 0,
+                      "Invalid MPI configuration: in multi-node execution, # MPI processes should "
+                      "be a multiple of the number of GPUs per node.");
+      auto num_nodes = comm_size / num_gpus_per_node;
+      row_comm_size = static_cast<int>(sqrt(static_cast<double>(num_nodes)));
+      while (num_nodes % row_comm_size != 0) {
+        --row_comm_size;
+      }
+      row_comm_size *= num_gpus_per_node;
+    } else {
+      row_comm_size = static_cast<int>(sqrt(static_cast<double>(comm_size)));
+      while (comm_size % row_comm_size != 0) {
+        --row_comm_size;
+      }
     }
+
     cugraph::partition_2d::subcomm_factory_t<cugraph::partition_2d::key_naming_t, vertex_t> subcomm_factory(handle, row_comm_size);

+#if 1  // FIXME: delete
+    {
+      rmm::device_uvector<int32_t> tx_ints(comm_size, handle.get_stream());
+      rmm::device_uvector<int32_t> rx_ints(comm_size, handle.get_stream());
+      std::vector<size_t> tx_sizes(comm_size, size_t{1});
+      std::vector<size_t> tx_offsets(comm_size);
+      std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0});
+      std::vector<int32_t> tx_ranks(comm_size);
+      std::iota(tx_ranks.begin(), tx_ranks.end(), int32_t{0});
+      auto rx_sizes = tx_sizes;
+      auto rx_offsets = tx_offsets;
+      auto rx_ranks = tx_ranks;
+      handle.get_comms().device_multicast_sendrecv(tx_ints.data(),
+                                                   tx_sizes,
+                                                   tx_offsets,
+                                                   tx_ranks,
+                                                   rx_ints.data(),
+                                                   rx_sizes,
+                                                   rx_offsets,
+                                                   rx_ranks,
+                                                   handle.get_stream());
+      handle.sync_stream();
+    }
+    auto time1 = std::chrono::steady_clock::now();
+    std::chrono::duration<double> elapsed = time1 - time0;
+    std::cout << "Handle initialization and 1st all-to-all (comm_size=" << comm_size
+              << ", row_comm_size=" << row_comm_size << ") took " << elapsed.count() * 1e3 << " ms."
+              << std::endl;
+#endif

    // 2.
create MG graph if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle.get_comms().barrier(); +#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally + // use NCCL All-Reduce instead of MPI barrier + rmm::device_uvector dummy(1, handle.get_stream()); + handle.get_comms().allreduce( + dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); +#endif hr_clock.start(); } @@ -90,7 +146,12 @@ class Tests_MGPageRank if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle.get_comms().barrier(); +#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally + // use NCCL All-Reduce instead of MPI barrier + rmm::device_uvector dummy(1, handle.get_stream()); + handle.get_comms().allreduce( + dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); +#endif double elapsed_time{0.0}; hr_clock.stop(&elapsed_time); std::cout << "MG construct_graph took " << elapsed_time * 1e-6 << " s.\n"; @@ -155,7 +216,12 @@ class Tests_MGPageRank if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle.get_comms().barrier(); +#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally + // use NCCL All-Reduce instead of MPI barrier + rmm::device_uvector dummy(1, handle.get_stream()); + handle.get_comms().allreduce( + dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); +#endif hr_clock.start(); } @@ -180,7 +246,12 @@ class Tests_MGPageRank if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle.get_comms().barrier(); +#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally + // use NCCL All-Reduce instead of MPI barrier + rmm::device_uvector dummy(1, handle.get_stream()); + handle.get_comms().allreduce( + dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); +#endif double elapsed_time{0.0}; hr_clock.stop(&elapsed_time); std::cout << "MG PageRank took " << elapsed_time * 1e-6 << " s.\n"; From 8136b77130fe2cb57bdd1f6824ea622025a47e4e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 4 Feb 2022 00:40:22 -0800 Subject: [PATCH 19/60] add more experimental code (should be cleaned-up before merge) --- .../copy_v_transform_reduce_in_out_nbr.cuh | 189 +++++++++--------- 1 file changed, 98 insertions(+), 91 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh index fbd339fdf32..63f1aae6c8a 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh @@ -517,15 +517,16 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } std::optional> stream_pool_indices{std::nullopt}; - size_t num_concurrent_loops{1}; // FIXME: this can go inside the loop after temporary testing -#if 1 // FIXME: for temporary testing - std::vector high_streams0{}; - std::vector high_streams1{}; - std::vector mid_streams{}; - std::vector low_streams{}; +#if 1 // FIXME: for temporary testing + std::vector pool_streams{}; #endif if constexpr (GraphViewType::is_multi_gpu) { - if (handle.get_stream_pool_size() >= max_segments) { + if 
((graph_view.get_local_adj_matrix_partition_segment_offsets(0)) && + (handle.get_stream_pool_size() >= max_segments)) { + for (size_t i = 1; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + assert(graph_view.get_local_adj_matrix_partition_segment_offsets(i)); + } + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); @@ -534,8 +535,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, // update_major ? V / comm_size * sizeof(T) : 0 // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) - num_concurrent_loops = - std::min(static_cast(col_comm_size), handle.get_stream_pool_size() / max_segments); + size_t num_streams = std::min(static_cast(col_comm_size) * max_segments, + (handle.get_stream_pool_size() / max_segments) * max_segments); if constexpr (update_major) { size_t value_size{0}; if constexpr (is_thrust_tuple_of_arithmetic::value) { @@ -550,60 +551,87 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, static_cast(graph_view.get_number_of_vertices())) : double{0.0}; - num_concurrent_loops = + num_streams = std::min(static_cast(avg_vertex_degree * (static_cast(sizeof(vertex_t)) / - static_cast(value_size))), - num_concurrent_loops); + static_cast(value_size))) * + max_segments, + num_streams); } + if (num_streams >= max_segments) { #if 1 // FIXME: for temporary testing - high_streams0.resize(num_concurrent_loops); - high_streams1.resize(num_concurrent_loops); - mid_streams.resize(num_concurrent_loops); - low_streams.resize(num_concurrent_loops); - for (size_t i = 0; i < num_concurrent_loops; ++i) { - CUDA_TRY(cudaStreamCreateWithPriority(&high_streams0[i], cudaStreamNonBlocking, -2)); - CUDA_TRY(cudaStreamCreateWithPriority(&high_streams1[i], cudaStreamNonBlocking, -2)); - CUDA_TRY(cudaStreamCreateWithPriority(&mid_streams[i], cudaStreamNonBlocking, -1)); - CUDA_TRY(cudaStreamCreateWithPriority(&low_streams[i], cudaStreamNonBlocking, 0)); - } + pool_streams.resize(num_streams); + for (size_t i = 0; i < pool_streams.size() / max_segments; ++i) { + static_assert(max_segments == 4); + CUDA_TRY(cudaStreamCreateWithPriority( + &pool_streams[i * max_segments], cudaStreamNonBlocking, -2)); + CUDA_TRY(cudaStreamCreateWithPriority( + &pool_streams[i * max_segments + 1], cudaStreamNonBlocking, -2)); + CUDA_TRY(cudaStreamCreateWithPriority( + &pool_streams[i * max_segments + 2], cudaStreamNonBlocking, -1)); + CUDA_TRY(cudaStreamCreateWithPriority( + &pool_streams[i * max_segments + 3], cudaStreamNonBlocking, 0)); + } #endif - stream_pool_indices = std::vector(num_concurrent_loops * max_segments); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); + stream_pool_indices = std::vector(num_streams); + std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); + handle.sync_stream(); + } } } - for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - auto loop_stream = stream_pool_indices - ? 
rmm::cuda_stream_view{high_streams0[i % num_concurrent_loops]} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((i * - max_segments) % - (*stream_pool_indices).size()) */ - : handle.get_stream(); + std::vector(0, rmm::cuda_stream_view{}))> + major_tmp_buffers{}; + if constexpr (GraphViewType::is_multi_gpu && update_major) { + std::vector major_tmp_buffer_sizes( + graph_view.get_number_of_local_adj_matrix_partitions(), size_t{0}); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + major_tmp_buffer_sizes[i] = GraphViewType::is_adj_matrix_transposed + ? graph_view.get_number_of_local_adj_matrix_partition_cols(i) + : graph_view.get_number_of_local_adj_matrix_partition_rows(i); + } + if (stream_pool_indices) { + auto num_concurrent_loops = (*stream_pool_indices).size() / max_segments; + major_tmp_buffers.reserve(num_concurrent_loops); + for (size_t i = 0; i < num_concurrent_loops; ++i) { + size_t max_size{0}; + for (size_t j = i; j < graph_view.get_number_of_local_adj_matrix_partitions(); + j += num_concurrent_loops) { + max_size = std::max(major_tmp_buffer_sizes[j], max_size); + } + major_tmp_buffers.push_back(allocate_dataframe_buffer(max_size, handle.get_stream())); + } + } else { + major_tmp_buffers.reserve(1); + major_tmp_buffers.push_back(allocate_dataframe_buffer( + *std::max_element(major_tmp_buffer_sizes.begin(), major_tmp_buffer_sizes.end()), + handle.get_stream())); + } + } else { // dummy + major_tmp_buffers.reserve(1); + major_tmp_buffers.push_back(allocate_dataframe_buffer(size_t{0}, handle.get_stream())); + } + if (stream_pool_indices) { handle.sync_stream(); } + + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { auto matrix_partition = matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)); - auto major_tmp_buffer_size = - GraphViewType::is_multi_gpu && update_major ? matrix_partition.get_major_size() : vertex_t{0}; - auto major_tmp_buffer = allocate_dataframe_buffer(major_tmp_buffer_size, loop_stream); - auto major_buffer_first = get_dataframe_buffer_begin(major_tmp_buffer); - auto major_init = T{}; if constexpr (update_major) { if constexpr (GraphViewType::is_multi_gpu) { auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - major_init = (col_comm_rank == 0) ? init : T{}; + major_init = (static_cast(i) == col_comm_rank) ? init : T{}; } else { major_init = init; } } + // FIXME: need to double check whether this leads to actual copy auto matrix_partition_row_value_input = adj_matrix_row_value_input; auto matrix_partition_col_value_input = adj_matrix_col_value_input; if constexpr (GraphViewType::is_adj_matrix_transposed) { @@ -612,6 +640,9 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i); } + auto major_buffer_first = + get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); + std::conditional_t{(i * max_segments) % - (*stream_pool_indices).size()}); */ - } - auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); if (segment_offsets) { static_assert(detail::num_sparse_segments_per_vertex_partition == 3); @@ -644,11 +668,12 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, // running segmented reduction if (matrix_partition.get_dcs_nzd_vertex_count()) { auto exec_stream = stream_pool_indices - ? 
rmm::cuda_stream_view{high_streams0[i % num_concurrent_loops]} + ? rmm::cuda_stream_view{pool_streams[(i * max_segments) % + (*stream_pool_indices).size()]} /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) */ - : loop_stream; + : handle.get_stream(); if constexpr (update_major) { // this is necessary as we don't visit every vertex in the // hypersparse segment in // for_all_major_for_all_nbr_hypersparse @@ -676,11 +701,11 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { auto exec_stream = stream_pool_indices - ? rmm::cuda_stream_view{high_streams1[i % num_concurrent_loops]} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size()) */ - : loop_stream; + ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 1) % + (*stream_pool_indices).size()]} + /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * + max_segments + 1) % (*stream_pool_indices).size()) */ + : handle.get_stream(); raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -699,11 +724,11 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { auto exec_stream = stream_pool_indices - ? rmm::cuda_stream_view{mid_streams[i % num_concurrent_loops]} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size()) */ - : loop_stream; + ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 2) % + (*stream_pool_indices).size()]} + /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * + max_segments + 2) % (*stream_pool_indices).size()) */ + : handle.get_stream(); raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -722,11 +747,11 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } if ((*segment_offsets)[1] > 0) { auto exec_stream = stream_pool_indices - ? rmm::cuda_stream_view{low_streams[i % num_concurrent_loops]} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size()) */ - : loop_stream; + ? 
rmm::cuda_stream_view{pool_streams[(i * max_segments + 3) % + (*stream_pool_indices).size()]} + /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * + max_segments + 3) % (*stream_pool_indices).size()) */ + : handle.get_stream(); raft::grid_1d_block_t update_grid((*segment_offsets)[1], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -747,7 +772,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); detail::for_all_major_for_all_nbr_low_degree - <<>>( + <<>>( matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_last(), @@ -776,19 +801,19 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, (*segment_offsets).back() - (*segment_offsets)[3], raft::comms::op_t::SUM, i, - high_streams0[i % num_concurrent_loops]/* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) */); + pool_streams[(i * max_segments) % (*stream_pool_indices).size()]/* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) */); } if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { device_reduce( - col_comm, major_buffer_first + (*segment_offsets)[2], vertex_value_output_first + (*segment_offsets)[2], (*segment_offsets)[3] - (*segment_offsets)[2], raft::comms::op_t::SUM, i, high_streams1[i % num_concurrent_loops] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()) */); + col_comm, major_buffer_first + (*segment_offsets)[2], vertex_value_output_first + (*segment_offsets)[2], (*segment_offsets)[3] - (*segment_offsets)[2], raft::comms::op_t::SUM, i, pool_streams[(i * max_segments + 1) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()) */); } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { device_reduce( - col_comm, major_buffer_first + (*segment_offsets)[1], vertex_value_output_first + (*segment_offsets)[1], (*segment_offsets)[2] - (*segment_offsets)[1], raft::comms::op_t::SUM, i, mid_streams[i % num_concurrent_loops] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()) */); + col_comm, major_buffer_first + (*segment_offsets)[1], vertex_value_output_first + (*segment_offsets)[1], (*segment_offsets)[2] - (*segment_offsets)[1], raft::comms::op_t::SUM, i, pool_streams[(i * max_segments + 2) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()) */); } if ((*segment_offsets)[1] > 0) { device_reduce( - col_comm, major_buffer_first, vertex_value_output_first, (*segment_offsets)[1], raft::comms::op_t::SUM, i, low_streams[i % num_concurrent_loops] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()) */); + col_comm, major_buffer_first, vertex_value_output_first, (*segment_offsets)[1], raft::comms::op_t::SUM, i, pool_streams[(i * max_segments + 3) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()) */); } } else { device_reduce(col_comm, @@ -797,24 +822,15 @@ 
void copy_v_transform_reduce_nbr(raft::handle_t const& handle, matrix_partition.get_major_size(), raft::comms::op_t::SUM, i, - loop_stream); + handle.get_stream()); } } } if (stream_pool_indices) { #if 1 // FIXME: for temporary testing - for (size_t i = 0; i < high_streams0.size(); ++i) { - CUDA_TRY(cudaStreamSynchronize(high_streams0[i])); - } - for (size_t i = 0; i < high_streams1.size(); ++i) { - CUDA_TRY(cudaStreamSynchronize(high_streams1[i])); - } - for (size_t i = 0; i < mid_streams.size(); ++i) { - CUDA_TRY(cudaStreamSynchronize(mid_streams[i])); - } - for (size_t i = 0; i < low_streams.size(); ++i) { - CUDA_TRY(cudaStreamSynchronize(low_streams[i])); + for (size_t i = 0; i < pool_streams.size(); ++i) { + CUDA_TRY(cudaStreamSynchronize(pool_streams[i])); } #else handle.sync_stream_pool(*stream_pool_indices); @@ -886,17 +902,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, // FIXME: for temporary testing #if 1 if (stream_pool_indices) { - for (size_t i = 0; i < low_streams.size(); ++i) { - CUDA_TRY(cudaStreamDestroy(low_streams[i])); - } - for (size_t i = 0; i < mid_streams.size(); ++i) { - CUDA_TRY(cudaStreamDestroy(mid_streams[i])); - } - for (size_t i = 0; i < high_streams1.size(); ++i) { - CUDA_TRY(cudaStreamDestroy(high_streams1[i])); - } - for (size_t i = 0; i < high_streams0.size(); ++i) { - CUDA_TRY(cudaStreamDestroy(high_streams0[i])); + for (size_t i = 0; i < pool_streams.size(); ++i) { + CUDA_TRY(cudaStreamDestroy(pool_streams[i])); } } #endif From f28ccfa186e90bd7390f945fe9e5c3c9f602268f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 9 Feb 2022 16:28:52 -0800 Subject: [PATCH 20/60] remove some temporary code --- .../cugraph/utilities/shuffle_comm.cuh | 34 ------------------- cpp/tests/link_analysis/mg_pagerank_test.cpp | 28 +++------------ 2 files changed, 4 insertions(+), 58 deletions(-) diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index cd5828f9554..f10f9db95e1 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -246,23 +246,11 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, ValueToGPUIdOp value_to_gpu_id_op, rmm::cuda_stream_view stream_view) { -#if 1 // FIXME: delete - rmm::device_uvector dummy(1, stream_view); - stream_view.synchronize(); - comm.allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, stream_view); - auto time0 = std::chrono::steady_clock::now(); -#endif auto const comm_size = comm.get_size(); auto d_tx_value_counts = groupby_and_count( tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream_view); -#if 1 // FIXME: delete - stream_view.synchronize(); - auto time1 = std::chrono::steady_clock::now(); - comm.allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, stream_view); - auto time2 = std::chrono::steady_clock::now(); -#endif std::vector tx_counts{}; std::vector tx_offsets{}; std::vector tx_dst_ranks{}; @@ -276,12 +264,6 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, allocate_dataframe_buffer::value_type>( rx_offsets.size() > 0 ? 
rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); -#if 1 // FIXME: delete - stream_view.synchronize(); - auto time3 = std::chrono::steady_clock::now(); - comm.allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, stream_view); - auto time4 = std::chrono::steady_clock::now(); -#endif // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). device_multicast_sendrecv(comm, @@ -295,10 +277,6 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, rx_src_ranks, stream_view); -#if 1 // FIXME: delete - stream_view.synchronize(); - auto time5 = std::chrono::steady_clock::now(); -#endif if (rx_counts.size() < static_cast(comm_size)) { std::vector tmp_rx_counts(comm_size, size_t{0}); for (size_t i = 0; i < rx_src_ranks.size(); ++i) { @@ -306,18 +284,6 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, } rx_counts = std::move(tmp_rx_counts); } -#if 1 // FIXME: delete - stream_view.synchronize(); - auto time6 = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_total = time6 - time0; - std::chrono::duration elapsed0 = time1 - time0; - std::chrono::duration elapsed1 = time2 - time1; - std::chrono::duration elapsed2 = time3 - time2; - std::chrono::duration elapsed3 = time4 - time3; - std::chrono::duration elapsed4 = time5 - time4; - std::chrono::duration elapsed5 = time6 - time5; - std::cout << "Shuffle values (" << thrust::distance(tx_value_first, tx_value_last) << ") took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << "," << elapsed5.count() * 1e3 << ") ms." 
<< std::endl; -#endif return std::make_tuple(std::move(rx_value_buffer), rx_counts); } diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp index 5b60a130728..c2a9bf74e2e 100644 --- a/cpp/tests/link_analysis/mg_pagerank_test.cpp +++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp @@ -131,12 +131,7 @@ class Tests_MGPageRank if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement -#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally - // use NCCL All-Reduce instead of MPI barrier - rmm::device_uvector dummy(1, handle.get_stream()); - handle.get_comms().allreduce( - dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); -#endif + handle.get_comms().barrier(); hr_clock.start(); } @@ -146,12 +141,7 @@ class Tests_MGPageRank if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement -#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally - // use NCCL All-Reduce instead of MPI barrier - rmm::device_uvector dummy(1, handle.get_stream()); - handle.get_comms().allreduce( - dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); -#endif + handle.get_comms().barrier(); double elapsed_time{0.0}; hr_clock.stop(&elapsed_time); std::cout << "MG construct_graph took " << elapsed_time * 1e-6 << " s.\n"; @@ -216,12 +206,7 @@ class Tests_MGPageRank if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement -#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally - // use NCCL All-Reduce instead of MPI barrier - rmm::device_uvector dummy(1, handle.get_stream()); - handle.get_comms().allreduce( - dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); -#endif + handle.get_comms().barrier(); hr_clock.start(); } @@ -246,12 +231,7 @@ class Tests_MGPageRank if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement -#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally - // use NCCL All-Reduce instead of MPI barrier - rmm::device_uvector dummy(1, handle.get_stream()); - handle.get_comms().allreduce( - dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); -#endif + handle.get_comms().barrier(); double elapsed_time{0.0}; hr_clock.stop(&elapsed_time); std::cout << "MG PageRank took " << elapsed_time * 1e-6 << " s.\n"; From 1839a61d4ee37f98aeedf67fa4589f934ac5f54b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 9 Feb 2022 16:30:31 -0800 Subject: [PATCH 21/60] undo some temporary fix --- cpp/src/link_analysis/pagerank_impl.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index a4df330c617..e346a6892b9 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -195,8 +195,7 @@ void pagerank( #if 1 // FIXME: delete handle.sync_stream(); if constexpr (GraphViewType::is_multi_gpu) { - rmm::device_uvector dummy(1, handle.get_stream()); - handle.get_comms().allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); + handle.get_comms().barrier(); } auto time0 = std::chrono::steady_clock::now(); #endif From 
7cb1f03b2d832ee3290f00158cd3c124e2d8187b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 10 Feb 2022 00:06:42 -0800 Subject: [PATCH 22/60] remove host_barrier (no longer used) --- cpp/CMakeLists.txt | 1 - .../cugraph/utilities/host_barrier.hpp | 26 ----- cpp/src/utilities/host_barrier.cpp | 104 ------------------ 3 files changed, 131 deletions(-) delete mode 100644 cpp/include/cugraph/utilities/host_barrier.hpp delete mode 100644 cpp/src/utilities/host_barrier.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ec3141343b4..0ec6b249df0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -228,7 +228,6 @@ add_library(cugraph SHARED src/structure/create_graph_from_edgelist_mg.cu src/structure/symmetrize_edgelist_sg.cu src/structure/symmetrize_edgelist_mg.cu - src/utilities/host_barrier.cpp src/visitors/graph_envelope.cpp src/visitors/visitors_factory.cpp src/visitors/bfs_visitor.cpp diff --git a/cpp/include/cugraph/utilities/host_barrier.hpp b/cpp/include/cugraph/utilities/host_barrier.hpp deleted file mode 100644 index 6825814eb93..00000000000 --- a/cpp/include/cugraph/utilities/host_barrier.hpp +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace cugraph { - -// FIXME: a temporary hack till UCC is integrated into RAFT (so we can use UCC barrier for DASK and -// MPI barrier for MPI) -void host_barrier(raft::comms::comms_t const& comm, rmm::cuda_stream_view stream_view); - -} // namespace cugraph diff --git a/cpp/src/utilities/host_barrier.cpp b/cpp/src/utilities/host_barrier.cpp deleted file mode 100644 index 2887350ad4d..00000000000 --- a/cpp/src/utilities/host_barrier.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include - -#include - -namespace cugraph { - -// FIXME: a temporary hack till UCC is integrated into RAFT (so we can use UCC barrier for DASK and -// MPI barrier for MPI) -void host_barrier(raft::comms::comms_t const& comm, rmm::cuda_stream_view stream_view) -{ - stream_view.synchronize(); - - auto const comm_size = comm.get_size(); - auto const comm_rank = comm.get_rank(); - - // k-tree barrier - - int constexpr k = 2; - static_assert(k >= 2); - std::vector requests(k - 1); - std::vector dummies(k - 1); - - // up - - int mod = 1; - while (mod < comm_size) { - if (comm_rank % mod == 0) { - auto level_rank = comm_rank / mod; - if (level_rank % k == 0) { - auto num_irecvs = 0; - ; - for (int i = 1; i < k; ++i) { - auto src_rank = (level_rank + i) * mod; - if (src_rank < comm_size) { - comm.irecv(dummies.data() + (i - 1), - sizeof(std::byte), - src_rank, - int{0} /* tag */, - requests.data() + (i - 1)); - ++num_irecvs; - } - } - comm.waitall(num_irecvs, requests.data()); - } else { - comm.isend(dummies.data(), - sizeof(std::byte), - (level_rank - (level_rank % k)) * mod, - int{0} /* tag */, - requests.data()); - comm.waitall(1, requests.data()); - } - } - mod *= k; - } - - // down - - mod /= k; - while (mod >= 1) { - if (comm_rank % mod == 0) { - auto level_rank = comm_rank / mod; - if (level_rank % k == 0) { - auto num_isends = 0; - for (int i = 1; i < k; ++i) { - auto dst_rank = (level_rank + i) * mod; - if (dst_rank < comm_size) { - comm.isend(dummies.data() + (i - 1), - sizeof(std::byte), - dst_rank, - int{0} /* tag */, - requests.data() + (i - 1)); - ++num_isends; - } - } - comm.waitall(num_isends, requests.data()); - } else { - comm.irecv(dummies.data(), - sizeof(std::byte), - (level_rank - (level_rank % k)) * mod, - int{0} /* tag */, - requests.data()); - comm.waitall(1, requests.data()); - } - } - mod /= k; - } -} - -} // namespace cugraph From eefe72984aaa423f7668b0c5c714016145e29053 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 12 Feb 2022 12:41:52 -0800 Subject: [PATCH 23/60] reduce temporary memory requirement in R-mat edge list generation --- cpp/tests/utilities/test_graphs.hpp | 178 +++++++++++++++++----------- 1 file changed, 112 insertions(+), 66 deletions(-) diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 9fa4cee9f7a..0934beb466a 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -146,23 +146,33 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { CUGRAPH_EXPECTS(((size_t{1} << scale_) * edge_factor_) <= static_cast(std::numeric_limits::max()), "Invalid template parameter: (scale_, edge_factor_) too large for edge_t"); + // generate in multi-partitions to limit peak memory usage (thrust::sort & + // shuffle_edgelist_by_gpu_id requires a temporary buffer with the size of the original data) + // With the current implementation, the temporary memory requirement is roughly 50% of the + // original data with num_partitions_per_gpu = 2. If we use cuMemAddressReserve + // (https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management), we + // can reduce the temporary memory requirement to (1 / num_partitions) * (original data size) + size_t constexpr num_partitions_per_gpu = 2; - std::vector partition_ids(1); - size_t num_partitions; + // 1. 
calculate # partitions, # edges to generate in each partition, and partition vertex ranges + + std::vector partition_ids{}; + size_t num_partitions{}; if (multi_gpu_usecase_) { auto& comm = handle.get_comms(); - num_partitions = comm.get_size(); + num_partitions = comm.get_size() * num_partitions_per_gpu; auto const comm_rank = comm.get_rank(); - partition_ids.resize(multi_gpu ? size_t{1} : static_cast(num_partitions)); + partition_ids.resize(multi_gpu ? num_partitions_per_gpu : num_partitions); std::iota(partition_ids.begin(), partition_ids.end(), - multi_gpu ? static_cast(comm_rank) : size_t{0}); + multi_gpu ? static_cast(comm_rank) * num_partitions_per_gpu : size_t{0}); } else { - num_partitions = 1; - partition_ids[0] = size_t{0}; + num_partitions = num_partitions_per_gpu; + partition_ids.resize(num_partitions); + std::iota(partition_ids.begin(), partition_ids.end(), size_t{0}); } vertex_t number_of_vertices = static_cast(size_t{1} << scale_); @@ -191,17 +201,20 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { } } - rmm::device_uvector src_v(0, handle.get_stream()); - rmm::device_uvector dst_v(0, handle.get_stream()); - auto weights_v = test_weighted - ? std::make_optional>(0, handle.get_stream()) - : std::nullopt; + // 2. generate edges + + std::vector> src_partitions{}; + std::vector> dst_partitions{}; + auto weight_partitions = test_weighted + ? std::make_optional>>() + : std::nullopt; + src_partitions.reserve(partition_ids.size()); + dst_partitions.reserve(partition_ids.size()); + if (weight_partitions) { (*weight_partitions).reserve(partition_ids.size()); } for (size_t i = 0; i < partition_ids.size(); ++i) { auto id = partition_ids[i]; - rmm::device_uvector tmp_src_v(0, handle.get_stream()); - rmm::device_uvector tmp_dst_v(0, handle.get_stream()); - std::tie(i == 0 ? src_v : tmp_src_v, i == 0 ? dst_v : tmp_dst_v) = + auto [tmp_src_v, tmp_dst_v] = cugraph::generate_rmat_edgelist(handle, scale_, partition_edge_counts[i], @@ -212,79 +225,112 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { undirected_ ? true : false); std::optional> tmp_weights_v{std::nullopt}; - if (weights_v) { - if (i == 0) { - weights_v->resize(src_v.size(), handle.get_stream()); - } else { - tmp_weights_v = std::make_optional>(tmp_src_v.size(), - handle.get_stream()); - } + if (weight_partitions) { + tmp_weights_v = + std::make_optional>(tmp_src_v.size(), handle.get_stream()); cugraph::detail::uniform_random_fill(handle.get_stream(), - i == 0 ? weights_v->data() : tmp_weights_v->data(), - i == 0 ? 
weights_v->size() : tmp_weights_v->size(), + tmp_weights_v->data(), + tmp_weights_v->size(), weight_t{0.0}, weight_t{1.0}, seed_ + num_partitions + id); } - if (i > 0) { - auto start_offset = src_v.size(); - src_v.resize(start_offset + tmp_src_v.size(), handle.get_stream()); - dst_v.resize(start_offset + tmp_dst_v.size(), handle.get_stream()); - raft::copy( - src_v.begin() + start_offset, tmp_src_v.begin(), tmp_src_v.size(), handle.get_stream()); - raft::copy( - dst_v.begin() + start_offset, tmp_dst_v.begin(), tmp_dst_v.size(), handle.get_stream()); - - if (weights_v) { - weights_v->resize(start_offset + tmp_weights_v->size(), handle.get_stream()); - raft::copy(weights_v->begin() + start_offset, - tmp_weights_v->begin(), - tmp_weights_v->size(), - handle.get_stream()); - } + translate(handle, tmp_src_v, tmp_dst_v); + + if (undirected_) { + std::tie(tmp_src_v, tmp_dst_v, tmp_weights_v) = + cugraph::symmetrize_edgelist_from_triangular( + handle, std::move(tmp_src_v), std::move(tmp_dst_v), std::move(tmp_weights_v)); } - } - translate(handle, src_v, dst_v); + if (multi_gpu) { + std::tie(store_transposed ? tmp_dst_v : tmp_src_v, + store_transposed ? tmp_src_v : tmp_dst_v, + tmp_weights_v) = + cugraph::detail::shuffle_edgelist_by_gpu_id( + handle, + store_transposed ? std::move(tmp_dst_v) : std::move(tmp_src_v), + store_transposed ? std::move(tmp_src_v) : std::move(tmp_dst_v), + std::move(tmp_weights_v)); + } - if (undirected_) - std::tie(src_v, dst_v, weights_v) = - cugraph::symmetrize_edgelist_from_triangular( - handle, std::move(src_v), std::move(dst_v), std::move(weights_v)); + src_partitions.push_back(std::move(tmp_src_v)); + dst_partitions.push_back(std::move(tmp_dst_v)); + if (weight_partitions) { (*weight_partitions).push_back(std::move(*tmp_weights_v)); } + } - if (multi_gpu) { - std::tie(store_transposed ? dst_v : src_v, store_transposed ? src_v : dst_v, weights_v) = - cugraph::detail::shuffle_edgelist_by_gpu_id( - handle, - store_transposed ? std::move(dst_v) : std::move(src_v), - store_transposed ? 
std::move(src_v) : std::move(dst_v), - std::move(weights_v)); + size_t tot_edge_counts{0}; + for (size_t i = 0; i < src_partitions.size(); ++i) { + tot_edge_counts += src_partitions[i].size(); } - rmm::device_uvector vertices_v(0, handle.get_stream()); - for (size_t i = 0; i < partition_ids.size(); ++i) { - auto id = partition_ids[i]; + rmm::device_uvector src_v(tot_edge_counts, handle.get_stream()); + size_t src_offset{0}; + for (size_t i = 0; i < src_partitions.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + src_partitions[i].begin(), + src_partitions[i].end(), + src_v.begin() + src_offset); + src_offset += src_partitions[i].size(); + } + src_partitions.clear(); + src_partitions.shrink_to_fit(); + + rmm::device_uvector dst_v(tot_edge_counts, handle.get_stream()); + size_t dst_offset{0}; + for (size_t i = 0; i < dst_partitions.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + dst_partitions[i].begin(), + dst_partitions[i].end(), + dst_v.begin() + dst_offset); + dst_offset += dst_partitions[i].size(); + } + dst_partitions.clear(); + dst_partitions.shrink_to_fit(); + + std::optional> weight_v{std::nullopt}; + if (weight_partitions) { + weight_v = rmm::device_uvector(tot_edge_counts, handle.get_stream()); + size_t weight_offset{0}; + for (size_t i = 0; i < (*weight_partitions).size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + (*weight_partitions)[i].begin(), + (*weight_partitions)[i].end(), + (*weight_v).begin() + weight_offset); + weight_offset += (*weight_partitions)[i].size(); + } + (*weight_partitions).clear(); + (*weight_partitions).shrink_to_fit(); + } + + // 3. generate vertices - auto start_offset = vertices_v.size(); - vertices_v.resize(start_offset + (partition_vertex_lasts[i] - partition_vertex_firsts[i]), - handle.get_stream()); - cugraph::detail::sequence_fill(handle.get_stream(), - vertices_v.begin() + start_offset, - vertices_v.size() - start_offset, - partition_vertex_firsts[i]); + size_t tot_vertex_counts{0}; + for (size_t i = 0; i < partition_vertex_firsts.size(); ++i) { + tot_vertex_counts += partition_vertex_lasts[i] - partition_vertex_firsts[i]; + } + rmm::device_uvector vertex_v(tot_vertex_counts, handle.get_stream()); + size_t v_offset{0}; + for (size_t i = 0; i < partition_vertex_firsts.size(); ++i) { + cugraph::detail::sequence_fill( + handle.get_stream(), + vertex_v.begin() + v_offset, + partition_vertex_lasts[i] - partition_vertex_firsts[i], + partition_vertex_firsts[i]); + v_offset += partition_vertex_lasts[i] - partition_vertex_firsts[i]; } if constexpr (multi_gpu) { - vertices_v = cugraph::detail::shuffle_vertices_by_gpu_id(handle, std::move(vertices_v)); + vertex_v = cugraph::detail::shuffle_vertices_by_gpu_id(handle, std::move(vertex_v)); } return std::make_tuple( std::move(src_v), std::move(dst_v), - std::move(weights_v), - std::move(vertices_v), + std::move(weight_v), + std::move(vertex_v), static_cast(detail::TranslateGraph_Usecase::base_vertex_id_) + number_of_vertices, undirected_); } From 167b5abd046117ffbef90d4c9a5b7ca8cee87142 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 12 Feb 2022 12:43:23 -0800 Subject: [PATCH 24/60] input parameter renaming for clarity --- cpp/include/cugraph/detail/shuffle_wrappers.hpp | 6 +++--- cpp/src/detail/shuffle_wrappers.cu | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/cugraph/detail/shuffle_wrappers.hpp b/cpp/include/cugraph/detail/shuffle_wrappers.hpp index e205110d4f4..db02ab94a5d 100644 --- a/cpp/include/cugraph/detail/shuffle_wrappers.hpp 
+++ b/cpp/include/cugraph/detail/shuffle_wrappers.hpp @@ -76,8 +76,8 @@ rmm::device_uvector shuffle_vertices_by_gpu_id( * @param[in/out] d_edgelist_minors Vertex IDs for columns (if the graph adjacency matrix is stored * as is) or rows (if the graph adjacency matrix is stored transposed) * @param[in/out] d_edgelist_weights Optional edge weights - * @param[in] groupby_and_count_local_partition If set to true, groupby and count edges based on - * (local partition ID, GPU ID) pairs (where GPU IDs are computed by applying the + * @param[in] groupby_and_count_local_partition_by_minor If set to true, groupby and count edges + * based on (local partition ID, GPU ID) pairs (where GPU IDs are computed by applying the * compute_gpu_id_from_vertex_t function to the minor vertex ID). If set to false, groupby and count * edges by just local partition ID. * @@ -91,7 +91,7 @@ rmm::device_uvector groupby_and_count_edgelist_by_local_partition_id( rmm::device_uvector& d_edgelist_majors, rmm::device_uvector& d_edgelist_minors, std::optional>& d_edgelist_weights, - bool groupby_and_count_local_partition = false); + bool groupby_and_count_local_partition_by_minor = false); } // namespace detail } // namespace cugraph diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu index a9fa67c769f..4f25dcf30b7 100644 --- a/cpp/src/detail/shuffle_wrappers.cu +++ b/cpp/src/detail/shuffle_wrappers.cu @@ -147,7 +147,7 @@ rmm::device_uvector groupby_and_count_edgelist_by_local_partition_id( rmm::device_uvector& d_edgelist_majors, rmm::device_uvector& d_edgelist_minors, std::optional>& d_edgelist_weights, - bool groupby_and_count_local_partition) + bool groupby_and_count_local_partition_by_minor) { auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); @@ -162,7 +162,7 @@ rmm::device_uvector groupby_and_count_edgelist_by_local_partition_id( auto pair_first = thrust::make_zip_iterator( thrust::make_tuple(d_edgelist_majors.begin(), d_edgelist_minors.begin())); - if (groupby_and_count_local_partition) { + if (groupby_and_count_local_partition_by_minor) { auto local_partition_id_gpu_id_pair_op = [comm_size, row_comm_size, From f06f32b796b601cf6b37b0686e49c13000551719 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 12 Feb 2022 23:24:44 -0800 Subject: [PATCH 25/60] add a heuristic to cut peak memory footprint --- .../cugraph/utilities/shuffle_comm.cuh | 228 ++++++++++++++++-- cpp/src/detail/shuffle_wrappers.cu | 13 + cpp/src/utilities/cython.cu | 2 + 3 files changed, 227 insertions(+), 16 deletions(-) diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index f10f9db95e1..00936a8c373 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -125,6 +125,157 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, return std::make_tuple(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks); } +template +struct value_group_id_less_t { + ValueToGroupIdOp value_to_group_id_op{}; + int pivot{}; + __device__ bool operator()(value_type v) const { return value_to_group_id_op(v) < pivot; } +}; + +template +struct kv_pair_group_id_less_t { + KeyToGroupIdOp key_to_group_id_op{}; + int pivot{}; + __device__ bool operator()(thrust::tuple t) const + { + return key_to_group_id_op(thrust::get<0>(t)) < pivot; + } +}; + +template +struct value_group_id_greater_equal_t { + ValueToGroupIdOp value_to_group_id_op{}; + int pivot{}; + __device__ bool 
operator()(value_type v) const { return value_to_group_id_op(v) >= pivot; } +}; + +template +struct kv_pair_group_id_greater_equal_t { + KeyToGroupIdOp key_to_group_id_op{}; + int pivot{}; + __device__ bool operator()(thrust::tuple t) const + { + return key_to_group_id_op(thrust::get<0>(t)) >= pivot; + } +}; + +// use roughly half temporary buffer than thrust::partition (if first & second partition sizes are +// comparable) +template +ValueIterator mem_frugal_partition( + ValueIterator value_first, + ValueIterator value_last, + ValueToGroupIdOp value_to_group_id_op, + int pivot, // group Id less than pivot goes to the first partition + rmm::cuda_stream_view stream_view) +{ + auto num_elements = static_cast(thrust::distance(value_first, value_last)); + auto first_size = static_cast(thrust::count_if( + rmm::exec_policy(stream_view), + value_first, + value_last, + value_group_id_less_t::value_type, + ValueToGroupIdOp>{value_to_group_id_op, pivot})); + auto second_size = num_elements - first_size; + + auto tmp_buffer = + allocate_dataframe_buffer::value_type>( + second_size, stream_view); + + // to limit memory footprint (16 * 1024 * 1024 is a tuning parameter) + // thrust::copy_if (1.15.0) also uses temporary buffer + auto constexpr max_elements_per_iteration = size_t{16} * 1024 * 1024; + auto num_chunks = (num_elements + max_elements_per_iteration - 1) / max_elements_per_iteration; + auto output_chunk_first = get_dataframe_buffer_begin(tmp_buffer); + for (size_t i = 0; i < num_chunks; ++i) { + output_chunk_first = thrust::copy_if( + rmm::exec_policy(stream_view), + value_first + max_elements_per_iteration * i, + value_first + std::min(max_elements_per_iteration * (i + 1), num_elements), + output_chunk_first, + value_group_id_greater_equal_t::value_type, + ValueToGroupIdOp>{value_to_group_id_op, pivot}); + } + + thrust::remove_if( + rmm::exec_policy(stream_view), + value_first, + value_last, + value_group_id_greater_equal_t::value_type, + ValueToGroupIdOp>{value_to_group_id_op, pivot}); + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_cbegin(tmp_buffer), + get_dataframe_buffer_cend(tmp_buffer), + value_first + first_size); + + return value_first + first_size; +} + +// use roughly half temporary buffer than thrust::partition (if first & second partition sizes are +// comparable) +template +std::tuple mem_frugal_partition( + KeyIterator key_first, + KeyIterator key_last, + ValueIterator value_first, + KeyToGroupIdOp key_to_group_id_op, + int pivot, // group Id less than pivot goes to the first partition + rmm::cuda_stream_view stream_view) +{ + auto num_elements = static_cast(thrust::distance(key_first, key_last)); + auto first_size = static_cast(thrust::count_if( + rmm::exec_policy(stream_view), + key_first, + key_last, + kv_pair_group_id_less_t::value_type, + typename thrust::iterator_traits::value_type, + KeyToGroupIdOp>{key_to_group_id_op, pivot})); + auto second_size = num_elements - first_size; + + auto tmp_key_buffer = + allocate_dataframe_buffer::value_type>( + second_size, stream_view); + auto tmp_value_buffer = + allocate_dataframe_buffer::value_type>( + second_size, stream_view); + + // to limit memory footprint (16 * 1024 * 1024 is a tuning parameter) + // thrust::copy_if (1.15.0) also uses temporary buffer + auto max_elements_per_iteration = size_t{16} * 1024 * 1024; + auto num_chunks = (num_elements + max_elements_per_iteration - 1) / max_elements_per_iteration; + auto kv_pair_first = thrust::make_zip_iterator(thrust::make_tuple(key_first, value_first)); + auto 
output_chunk_first = thrust::make_zip_iterator(thrust::make_tuple( + get_dataframe_buffer_begin(tmp_key_buffer), get_dataframe_buffer_begin(tmp_value_buffer))); + for (size_t i = 0; i < num_chunks; ++i) { + output_chunk_first = thrust::copy_if( + rmm::exec_policy(stream_view), + kv_pair_first + max_elements_per_iteration * i, + kv_pair_first + std::min(max_elements_per_iteration * (i + 1), num_elements), + output_chunk_first, + kv_pair_group_id_greater_equal_t::value_type, + typename thrust::iterator_traits::value_type, + KeyToGroupIdOp>{key_to_group_id_op, pivot}); + } + + thrust::remove_if( + rmm::exec_policy(stream_view), + kv_pair_first, + kv_pair_first + num_elements, + kv_pair_group_id_greater_equal_t::value_type, + typename thrust::iterator_traits::value_type, + KeyToGroupIdOp>{key_to_group_id_op, pivot}); + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_cbegin(tmp_key_buffer), + get_dataframe_buffer_cend(tmp_key_buffer), + key_first + first_size); + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_cbegin(tmp_value_buffer), + get_dataframe_buffer_cend(tmp_value_buffer), + value_first + first_size); + + return std::make_tuple(key_first + first_size, value_first + first_size); +} + } // namespace detail template @@ -132,14 +283,33 @@ rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [I ValueIterator tx_value_last /* [INOUT */, ValueToGPUIdOp value_to_group_id_op, int num_groups, + bool mem_frugal, rmm::cuda_stream_view stream_view) { - thrust::sort(rmm::exec_policy(stream_view), - tx_value_first, - tx_value_last, - [value_to_group_id_op] __device__(auto lhs, auto rhs) { - return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); - }); + if (mem_frugal) { + auto pivot = num_groups / 2; + auto second_first = detail::mem_frugal_partition( + tx_value_first, tx_value_last, value_to_group_id_op, pivot, stream_view); + thrust::sort(rmm::exec_policy(stream_view), + tx_value_first, + second_first, + [value_to_group_id_op] __device__(auto lhs, auto rhs) { + return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); + }); + thrust::sort(rmm::exec_policy(stream_view), + second_first, + tx_value_last, + [value_to_group_id_op] __device__(auto lhs, auto rhs) { + return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); + }); + } else { + thrust::sort(rmm::exec_policy(stream_view), + tx_value_first, + tx_value_last, + [value_to_group_id_op] __device__(auto lhs, auto rhs) { + return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); + }); + } auto group_id_first = thrust::make_transform_iterator( tx_value_first, @@ -164,15 +334,36 @@ rmm::device_uvector groupby_and_count(VertexIterator tx_key_first /* [IN ValueIterator tx_value_first /* [INOUT */, KeyToGPUIdOp key_to_group_id_op, int num_groups, + bool mem_frugal, rmm::cuda_stream_view stream_view) { - thrust::sort_by_key(rmm::exec_policy(stream_view), - tx_key_first, - tx_key_last, - tx_value_first, - [key_to_group_id_op] __device__(auto lhs, auto rhs) { - return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); - }); + if (mem_frugal) { + auto pivot = num_groups / 2; + auto second_first = detail::mem_frugal_partition( + tx_key_first, tx_key_last, tx_value_first, key_to_group_id_op, pivot, stream_view); + thrust::sort_by_key(rmm::exec_policy(stream_view), + tx_key_first, + std::get<0>(second_first), + tx_value_first, + [key_to_group_id_op] __device__(auto lhs, auto rhs) { + return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); + }); + 
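// NOTE: sorting the two pivot-partitioned halves separately (rather than
+    // sorting the full input range at once) keeps the sort's temporary buffer
+    // proportional to the larger half instead of the whole input; this is
+    // where the roughly-50% peak-memory saving of the mem_frugal path comes from.
+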
thrust::sort_by_key(rmm::exec_policy(stream_view), + std::get<0>(second_first), + tx_key_last, + std::get<1>(second_first), + [key_to_group_id_op] __device__(auto lhs, auto rhs) { + return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); + }); + } else { + thrust::sort_by_key(rmm::exec_policy(stream_view), + tx_key_first, + tx_key_last, + tx_value_first, + [key_to_group_id_op] __device__(auto lhs, auto rhs) { + return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); + }); + } auto group_id_first = thrust::make_transform_iterator( tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); }); @@ -249,7 +440,7 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, auto const comm_size = comm.get_size(); auto d_tx_value_counts = groupby_and_count( - tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream_view); + tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), false, stream_view); std::vector tx_counts{}; std::vector tx_offsets{}; @@ -298,8 +489,13 @@ auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, { auto const comm_size = comm.get_size(); - auto d_tx_value_counts = groupby_and_count( - tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, comm.get_size(), stream_view); + auto d_tx_value_counts = groupby_and_count(tx_key_first, + tx_key_last, + tx_value_first, + key_to_gpu_id_op, + comm.get_size(), + false, + stream_view); std::vector tx_counts{}; std::vector tx_offsets{}; diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu index 4f25dcf30b7..7a52fd07822 100644 --- a/cpp/src/detail/shuffle_wrappers.cu +++ b/cpp/src/detail/shuffle_wrappers.cu @@ -159,6 +159,15 @@ rmm::device_uvector groupby_and_count_edgelist_by_local_partition_id( auto const col_comm_size = col_comm.get_size(); auto const col_comm_rank = col_comm.get_rank(); + auto total_global_mem = handle.get_device_properties().totalGlobalMem; + auto element_size = sizeof(vertex_t) * 2 + (d_edgelist_weights ? 
sizeof(weight_t) : size_t{0}); + auto mem_frugal = + d_edgelist_majors.size() * element_size >= + total_global_mem / + 4; // if the data size exceeds 1/4 of the device memory (1/4 is a tuning parameter), + // groupby_and_count requires temporary buffer comparable to the input data size, if + // mem_frugal is set to true, temporary buffer size can be reduced up to 50% + auto pair_first = thrust::make_zip_iterator( thrust::make_tuple(d_edgelist_majors.begin(), d_edgelist_minors.begin())); @@ -183,11 +192,13 @@ rmm::device_uvector groupby_and_count_edgelist_by_local_partition_id( d_edgelist_weights->begin(), local_partition_id_gpu_id_pair_op, comm_size, + mem_frugal, handle.get_stream()) : cugraph::groupby_and_count(pair_first, pair_first + d_edgelist_majors.size(), local_partition_id_gpu_id_pair_op, comm_size, + mem_frugal, handle.get_stream()); } else { auto local_partition_id_op = @@ -203,11 +214,13 @@ rmm::device_uvector groupby_and_count_edgelist_by_local_partition_id( d_edgelist_weights->begin(), local_partition_id_op, col_comm_size, + mem_frugal, handle.get_stream()) : cugraph::groupby_and_count(pair_first, pair_first + d_edgelist_majors.size(), local_partition_id_op, col_comm_size, + mem_frugal, handle.get_stream()); } } diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 35a6be4edc3..afbabb64431 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -1248,11 +1248,13 @@ std::unique_ptr> call_shuffle( ptr_ret->get_weights().data(), local_partition_id_op, col_comm_size, + false, handle.get_stream()) : cugraph::groupby_and_count(pair_first, pair_first + ptr_ret->get_major().size(), local_partition_id_op, col_comm_size, + false, handle.get_stream()); std::vector h_edge_counts(edge_counts.size()); From 6dceeabdefe95cb98319fe7c95004402748bbe4c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 12 Feb 2022 23:41:44 -0800 Subject: [PATCH 26/60] improve inconsistencies in naming --- ...ransform_reduce_key_aggregated_out_nbr.cuh | 2 +- ...orm_reduce_by_adj_matrix_row_col_key_e.cuh | 2 +- .../cugraph/utilities/collect_comm.cuh | 4 +- cpp/include/cugraph/utilities/cython.hpp | 2 +- .../cugraph/utilities/shuffle_comm.cuh | 38 +++++++++---------- cpp/src/community/louvain.cuh | 2 +- .../weakly_connected_components_impl.cuh | 30 +++++++-------- cpp/src/detail/shuffle_wrappers.cu | 6 +-- cpp/src/structure/coarsen_graph_impl.cuh | 2 +- cpp/src/structure/relabel_impl.cuh | 4 +- cpp/src/utilities/cython.cu | 6 +-- 11 files changed, 49 insertions(+), 49 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 1dee131a000..1ff109c7766 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -482,7 +482,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( rmm::device_uvector rx_key_aggregated_edge_weights(0, handle.get_stream()); std::forward_as_tuple( std::tie(rx_major_vertices, rx_minor_keys, rx_key_aggregated_edge_weights), std::ignore) = - groupby_gpuid_and_shuffle_values( + groupby_gpu_id_and_shuffle_values( col_comm, triplet_first, triplet_first + tmp_major_vertices.size(), diff --git a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh index 7f4cad5eded..968d99b7d25 100644 --- 
a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -505,7 +505,7 @@ transform_reduce_by_adj_matrix_row_col_key_e( rmm::device_uvector rx_unique_keys(0, handle.get_stream()); auto rx_value_for_unique_key_buffer = allocate_dataframe_buffer(0, handle.get_stream()); std::tie(rx_unique_keys, rx_value_for_unique_key_buffer, std::ignore) = - groupby_gpuid_and_shuffle_kv_pairs( + groupby_gpu_id_and_shuffle_kv_pairs( comm, tmp_keys.begin(), tmp_keys.end(), diff --git a/cpp/include/cugraph/utilities/collect_comm.cuh b/cpp/include/cugraph/utilities/collect_comm.cuh index 8b89d941885..5b414f1f1eb 100644 --- a/cpp/include/cugraph/utilities/collect_comm.cuh +++ b/cpp/include/cugraph/utilities/collect_comm.cuh @@ -103,7 +103,7 @@ collect_values_for_keys(raft::comms::comms_t const& comm, { rmm::device_uvector rx_unique_keys(0, stream_view); std::vector rx_value_counts{}; - std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values( + std::tie(rx_unique_keys, rx_value_counts) = groupby_gpu_id_and_shuffle_values( comm, unique_keys.begin(), unique_keys.end(), @@ -228,7 +228,7 @@ collect_values_for_unique_keys(raft::comms::comms_t const& comm, { rmm::device_uvector rx_unique_keys(0, stream_view); std::vector rx_value_counts{}; - std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values( + std::tie(rx_unique_keys, rx_value_counts) = groupby_gpu_id_and_shuffle_values( comm, unique_keys.begin(), unique_keys.end(), diff --git a/cpp/include/cugraph/utilities/cython.hpp b/cpp/include/cugraph/utilities/cython.hpp index 100a9d7db5e..260393009bd 100644 --- a/cpp/include/cugraph/utilities/cython.hpp +++ b/cpp/include/cugraph/utilities/cython.hpp @@ -588,7 +588,7 @@ template std::unique_ptr> call_shuffle( raft::handle_t const& handle, vertex_t* - edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place + edgelist_major_vertices, // [IN / OUT]: groupby_gpu_id_and_shuffle_values() sorts in-place vertex_t* edgelist_minor_vertices, // [IN / OUT] weight_t* edgelist_weights, // [IN / OUT] edge_t num_edgelist_edges); diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 00936a8c373..d951930a2b3 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -278,10 +278,10 @@ std::tuple mem_frugal_partition( } // namespace detail -template +template rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [INOUT */, ValueIterator tx_value_last /* [INOUT */, - ValueToGPUIdOp value_to_group_id_op, + ValueToGroupIdOp value_to_group_id_op, int num_groups, bool mem_frugal, rmm::cuda_stream_view stream_view) @@ -328,11 +328,11 @@ rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [I return d_tx_value_counts; } -template +template rmm::device_uvector groupby_and_count(VertexIterator tx_key_first /* [INOUT */, VertexIterator tx_key_last /* [INOUT */, ValueIterator tx_value_first /* [INOUT */, - KeyToGPUIdOp key_to_group_id_op, + KeyToGroupIdOp key_to_group_id_op, int num_groups, bool mem_frugal, rmm::cuda_stream_view stream_view) @@ -402,7 +402,7 @@ auto shuffle_values(raft::comms::comms_t const& comm, detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); auto rx_value_buffer = - allocate_dataframe_buffer::value_type>( + allocate_dataframe_buffer::value_type>( rx_offsets.size() > 0 ? 
rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released @@ -431,11 +431,11 @@ auto shuffle_values(raft::comms::comms_t const& comm, } template -auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, - ValueIterator tx_value_first /* [INOUT */, - ValueIterator tx_value_last /* [INOUT */, - ValueToGPUIdOp value_to_gpu_id_op, - rmm::cuda_stream_view stream_view) +auto groupby_gpu_id_and_shuffle_values(raft::comms::comms_t const& comm, + ValueIterator tx_value_first /* [INOUT */, + ValueIterator tx_value_last /* [INOUT */, + ValueToGPUIdOp value_to_gpu_id_op, + rmm::cuda_stream_view stream_view) { auto const comm_size = comm.get_size(); @@ -452,7 +452,7 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); auto rx_value_buffer = - allocate_dataframe_buffer::value_type>( + allocate_dataframe_buffer::value_type>( rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released @@ -480,12 +480,12 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, } template -auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, - VertexIterator tx_key_first /* [INOUT */, - VertexIterator tx_key_last /* [INOUT */, - ValueIterator tx_value_first /* [INOUT */, - KeyToGPUIdOp key_to_gpu_id_op, - rmm::cuda_stream_view stream_view) +auto groupby_gpu_id_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, + VertexIterator tx_key_first /* [INOUT */, + VertexIterator tx_key_last /* [INOUT */, + ValueIterator tx_value_first /* [INOUT */, + KeyToGPUIdOp key_to_gpu_id_op, + rmm::cuda_stream_view stream_view) { auto const comm_size = comm.get_size(); @@ -506,10 +506,10 @@ auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); - rmm::device_uvector::value_type> rx_keys( + rmm::device_uvector::value_type> rx_keys( rx_offsets.size() > 0 ? 
rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); auto rx_value_buffer = - allocate_dataframe_buffer::value_type>( + allocate_dataframe_buffer::value_type>( rx_keys.size(), stream_view); // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 025c520abf5..094f3bc6546 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -308,7 +308,7 @@ class Louvain { thrust::make_tuple(cluster_keys_v_.begin(), cluster_weights_v_.begin())); std::forward_as_tuple(std::tie(rx_keys_v, rx_weights_v), std::ignore) = - groupby_gpuid_and_shuffle_values( + groupby_gpu_id_and_shuffle_values( handle_.get_comms(), pair_first, pair_first + current_graph_view_.get_number_of_local_vertices(), diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 21e9571fbb2..757fc9e3d23 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -371,17 +371,17 @@ void weakly_connected_components_impl(raft::handle_t const& handle, // with fewer than one root per GPU if (std::reduce(first_candidate_degrees.begin(), first_candidate_degrees.end()) > degree_sum_threshold * comm_size) { - std::vector> degree_gpuid_pairs(comm_size); + std::vector> degree_gpu_id_pairs(comm_size); for (int i = 0; i < comm_size; ++i) { - degree_gpuid_pairs[i] = std::make_tuple(first_candidate_degrees[i], i); + degree_gpu_id_pairs[i] = std::make_tuple(first_candidate_degrees[i], i); } - std::sort(degree_gpuid_pairs.begin(), degree_gpuid_pairs.end(), [](auto lhs, auto rhs) { + std::sort(degree_gpu_id_pairs.begin(), degree_gpu_id_pairs.end(), [](auto lhs, auto rhs) { return std::get<0>(lhs) > std::get<0>(rhs); }); edge_t sum{0}; - for (size_t i = 0; i < degree_gpuid_pairs.size(); ++i) { - sum += std::get<0>(degree_gpuid_pairs[i]); - init_max_new_root_counts[std::get<1>(degree_gpuid_pairs[i])] = 1; + for (size_t i = 0; i < degree_gpu_id_pairs.size(); ++i) { + sum += std::get<0>(degree_gpu_id_pairs[i]); + init_max_new_root_counts[std::get<1>(degree_gpu_id_pairs[i])] = 1; if (sum > degree_sum_threshold * comm_size) { break; } } } @@ -390,18 +390,18 @@ void weakly_connected_components_impl(raft::handle_t const& handle, else if (level_graph_view.get_number_of_vertices() <= static_cast(handle.get_comms().get_size() * ceil(1.0 / max_new_roots_ratio))) { - std::vector gpuids{}; - gpuids.reserve( + std::vector gpu_ids{}; + gpu_ids.reserve( std::reduce(new_root_candidate_counts.begin(), new_root_candidate_counts.end())); for (size_t i = 0; i < new_root_candidate_counts.size(); ++i) { - gpuids.insert(gpuids.end(), new_root_candidate_counts[i], static_cast(i)); + gpu_ids.insert(gpu_ids.end(), new_root_candidate_counts[i], static_cast(i)); } std::random_device rd{}; - std::shuffle(gpuids.begin(), gpuids.end(), std::mt19937(rd())); - gpuids.resize( - std::max(static_cast(gpuids.size() * max_new_roots_ratio), vertex_t{1})); - for (size_t i = 0; i < gpuids.size(); ++i) { - ++init_max_new_root_counts[gpuids[i]]; + std::shuffle(gpu_ids.begin(), gpu_ids.end(), std::mt19937(rd())); + gpu_ids.resize( + std::max(static_cast(gpu_ids.size() * max_new_roots_ratio), vertex_t{1})); + for (size_t i = 0; i < gpu_ids.size(); ++i) { + ++init_max_new_root_counts[gpu_ids[i]]; } } else { std::fill(init_max_new_root_counts.begin(), @@ -678,7 +678,7 @@ void weakly_connected_components_impl(raft::handle_t 
const& handle, auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); - std::tie(edge_buffer, std::ignore) = cugraph::groupby_gpuid_and_shuffle_values( + std::tie(edge_buffer, std::ignore) = cugraph::groupby_gpu_id_and_shuffle_values( comm, get_dataframe_buffer_begin(edge_buffer), get_dataframe_buffer_end(edge_buffer), diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu index 7a52fd07822..fd5bffbb950 100644 --- a/cpp/src/detail/shuffle_wrappers.cu +++ b/cpp/src/detail/shuffle_wrappers.cu @@ -51,7 +51,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, std::forward_as_tuple( std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors, d_rx_edgelist_weights), std::ignore) = - cugraph::groupby_gpuid_and_shuffle_values( + cugraph::groupby_gpu_id_and_shuffle_values( comm, // handle.get_comms(), edge_first, edge_first + d_edgelist_majors.size(), @@ -67,7 +67,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, std::forward_as_tuple(std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors), std::ignore) = - cugraph::groupby_gpuid_and_shuffle_values( + cugraph::groupby_gpu_id_and_shuffle_values( comm, // handle.get_comms(), edge_first, edge_first + d_edgelist_majors.size(), @@ -124,7 +124,7 @@ rmm::device_uvector shuffle_vertices_by_gpu_id(raft::handle_t const& h auto const comm_size = comm.get_size(); rmm::device_uvector d_rx_vertices(0, handle.get_stream()); - std::tie(d_rx_vertices, std::ignore) = cugraph::groupby_gpuid_and_shuffle_values( + std::tie(d_rx_vertices, std::ignore) = cugraph::groupby_gpu_id_and_shuffle_values( comm, // handle.get_comms(), d_vertices.begin(), d_vertices.end(), diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index b0f6c7eca05..aa2a87ccc72 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -267,7 +267,7 @@ coarsen_graph( // 1-3. append data to local adjacency matrix partitions - // FIXME: we can skip this if groupby_gpuid_and_shuffle_values is updated to return sorted edge + // FIXME: we can skip this if groupby_gpu_id_and_shuffle_values is updated to return sorted edge // list based on the final matrix partition (maybe add // groupby_adj_matrix_partition_and_shuffle_values). 
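For context, the primitives renamed above all share one pattern: group each local element by the rank that should own it (using a caller-supplied value-to-gpu-id functor), count the per-rank block sizes, and exchange the blocks with an all-to-all. A minimal host-side sketch of the grouping step, with assumed names (the hash functor and std containers below are illustrative stand-ins; the real code sorts on device and feeds the counts to shuffle_values):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

// hypothetical stand-in for cugraph::detail::compute_gpu_id_from_vertex_t:
// map a vertex id to its owning rank via hash modulo comm_size
struct vertex_to_gpu_id_t {
  int comm_size{};
  int operator()(int64_t v) const
  {
    return static_cast<int>(std::hash<int64_t>{}(v) % static_cast<size_t>(comm_size));
  }
};

// group values by destination rank (each rank's block becomes contiguous) and
// return the per-rank counts; these counts drive the subsequent all-to-all
std::vector<size_t> groupby_gpu_id_sketch(std::vector<int64_t>& values, int comm_size)
{
  vertex_to_gpu_id_t to_gpu_id{comm_size};
  std::stable_sort(values.begin(), values.end(), [to_gpu_id](int64_t lhs, int64_t rhs) {
    return to_gpu_id(lhs) < to_gpu_id(rhs);
  });
  std::vector<size_t> tx_counts(comm_size, 0);
  for (auto v : values) {
    ++tx_counts[to_gpu_id(v)];
  }
  return tx_counts;
}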
diff --git a/cpp/src/structure/relabel_impl.cuh b/cpp/src/structure/relabel_impl.cuh index d709152f71c..4ace52e351a 100644 --- a/cpp/src/structure/relabel_impl.cuh +++ b/cpp/src/structure/relabel_impl.cuh @@ -95,7 +95,7 @@ void relabel(raft::handle_t const& handle, thrust::make_tuple(label_pair_old_labels.begin(), label_pair_new_labels.begin())); std::forward_as_tuple(std::tie(rx_label_pair_old_labels, rx_label_pair_new_labels), std::ignore) = - groupby_gpuid_and_shuffle_values( + groupby_gpu_id_and_shuffle_values( handle.get_comms(), pair_first, pair_first + num_label_pairs, @@ -136,7 +136,7 @@ void relabel(raft::handle_t const& handle, { rmm::device_uvector rx_unique_old_labels(0, handle.get_stream()); std::vector rx_value_counts{}; - std::tie(rx_unique_old_labels, rx_value_counts) = groupby_gpuid_and_shuffle_values( + std::tie(rx_unique_old_labels, rx_value_counts) = groupby_gpu_id_and_shuffle_values( handle.get_comms(), unique_old_labels.begin(), unique_old_labels.end(), diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index afbabb64431..59241a3e913 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -1182,7 +1182,7 @@ template std::unique_ptr> call_shuffle( raft::handle_t const& handle, vertex_t* - edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place + edgelist_major_vertices, // [IN / OUT]: groupby_gpu_id_and_shuffle_values() sorts in-place vertex_t* edgelist_minor_vertices, // [IN / OUT] weight_t* edgelist_weights, // [IN / OUT] edge_t num_edgelist_edges) @@ -1204,7 +1204,7 @@ std::unique_ptr> call_shuffle( std::forward_as_tuple( std::tie(ptr_ret->get_major(), ptr_ret->get_minor(), ptr_ret->get_weights()), std::ignore) = - cugraph::groupby_gpuid_and_shuffle_values( + cugraph::groupby_gpu_id_and_shuffle_values( comm, // handle.get_comms(), zip_edge, zip_edge + num_edgelist_edges, @@ -1220,7 +1220,7 @@ std::unique_ptr> call_shuffle( std::forward_as_tuple(std::tie(ptr_ret->get_major(), ptr_ret->get_minor()), std::ignore) = - cugraph::groupby_gpuid_and_shuffle_values( + cugraph::groupby_gpu_id_and_shuffle_values( comm, // handle.get_comms(), zip_edge, zip_edge + num_edgelist_edges, From febd72d6e2e7fa9d561238b3ddc5fc3b0e10abae Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 14 Feb 2022 18:02:22 -0800 Subject: [PATCH 27/60] split groupby_and_shuffle_edgelist to groupby_and_count and shuffle_edgelist to pass mem_frugal=true to limit the maximum allocation chunk size to avoid malloc failure due to fragmentation with the pool allocator --- cpp/src/detail/shuffle_wrappers.cu | 85 +++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu index fd5bffbb950..b22c839e346 100644 --- a/cpp/src/detail/shuffle_wrappers.cu +++ b/cpp/src/detail/shuffle_wrappers.cu @@ -41,6 +41,22 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); + auto total_global_mem = handle.get_device_properties().totalGlobalMem; + auto element_size = sizeof(vertex_t) * 2 + (d_edgelist_weights ? 
sizeof(weight_t) : size_t{0});
+  auto mem_frugal =
+    d_edgelist_majors.size() * element_size >=
+    total_global_mem /
+      5;  // if the data size exceeds 1/5 of the device memory (1/5 is a tuning parameter),
+          // groupby_and_count requires a temporary buffer comparable to the input data size; if
+          // mem_frugal is set to true, the temporary buffer size can be reduced by up to 50%
+
+  // invoke groupby_and_count and shuffle_values instead of directly calling
+  // groupby_gpu_id_and_shuffle_values to pass mem_frugal (there is no benefit in reducing peak
+  // memory as we need to allocate a receive buffer anyway), but this reduces the maximum memory
+  // allocation size by half (thrust::sort used inside the groupby_and_count allocates the entire
+  // temporary buffer in a single chunk, and the pool allocator often cannot handle a large single
+  // allocation (due to fragmentation) even when the remaining free memory in aggregate is
+  // significantly larger than the requested size).
   rmm::device_uvector<vertex_t> d_rx_edgelist_majors(0, handle.get_stream());
   rmm::device_uvector<vertex_t> d_rx_edgelist_minors(0, handle.get_stream());
   std::optional<rmm::device_uvector<weight_t>> d_rx_edgelist_weights{std::nullopt};
@@ -48,35 +64,54 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle,
     auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(
       d_edgelist_majors.begin(), d_edgelist_minors.begin(), (*d_edgelist_weights).begin()));
+    auto d_tx_value_counts = cugraph::groupby_and_count(
+      edge_first,
+      edge_first + d_edgelist_majors.size(),
+      [key_func =
+         cugraph::detail::compute_gpu_id_from_edge_t<vertex_t>{
+           comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
+        return key_func(thrust::get<0>(val), thrust::get<1>(val));
+      },
+      comm_size,
+      mem_frugal,
+      handle.get_stream());
+
+    std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+    raft::update_host(h_tx_value_counts.data(),
+                      d_tx_value_counts.data(),
+                      d_tx_value_counts.size(),
+                      handle.get_stream());
+    handle.sync_stream();
+
     std::forward_as_tuple(
-      std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors, d_rx_edgelist_weights),
-      std::ignore) =
-      cugraph::groupby_gpu_id_and_shuffle_values(
-        comm,  // handle.get_comms(),
-        edge_first,
-        edge_first + d_edgelist_majors.size(),
-        [key_func =
-           cugraph::detail::compute_gpu_id_from_edge_t<vertex_t>{
-             comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
-          return key_func(thrust::get<0>(val), thrust::get<1>(val));
-        },
-        handle.get_stream());
+      std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors, d_rx_edgelist_weights), std::ignore) =
+      shuffle_values(comm, edge_first, h_tx_value_counts, handle.get_stream());
   } else {
     auto edge_first = thrust::make_zip_iterator(
       thrust::make_tuple(d_edgelist_majors.begin(), d_edgelist_minors.begin()));
-    std::forward_as_tuple(std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors),
-                          std::ignore) =
-      cugraph::groupby_gpu_id_and_shuffle_values(
-        comm,  // handle.get_comms(),
-        edge_first,
-        edge_first + d_edgelist_majors.size(),
-        [key_func =
-           cugraph::detail::compute_gpu_id_from_edge_t<vertex_t>{
-             comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
-          return key_func(thrust::get<0>(val), thrust::get<1>(val));
-        },
-        handle.get_stream());
+    auto d_tx_value_counts = cugraph::groupby_and_count(
+      edge_first,
+      edge_first + d_edgelist_majors.size(),
+      [key_func =
+         cugraph::detail::compute_gpu_id_from_edge_t<vertex_t>{
+           comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
+        return key_func(thrust::get<0>(val), thrust::get<1>(val));
+      },
+      comm_size,
+      mem_frugal,
+      handle.get_stream());
+
+    std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+    raft::update_host(h_tx_value_counts.data(),
+                      d_tx_value_counts.data(),
+                      d_tx_value_counts.size(),
+                      handle.get_stream());
+    handle.sync_stream();
+
+    std::forward_as_tuple(
+      std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors), std::ignore) =
+      shuffle_values(comm, edge_first, h_tx_value_counts, handle.get_stream());
   }

   return std::make_tuple(std::move(d_rx_edgelist_majors),
@@ -164,7 +199,7 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
   auto mem_frugal =
     d_edgelist_majors.size() * element_size >=
     total_global_mem /
-      4;  // if the data size exceeds 1/4 of the device memory (1/4 is a tuning parameter),
+      5;  // if the data size exceeds 1/5 of the device memory (1/5 is a tuning parameter),
           // groupby_and_count requires temporary buffer comparable to the input data size, if
           // mem_frugal is set to true, temporary buffer size can be reduced up to 50%

From 48fa2559725add366b7dd87a6221071a53f88bdf Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 16 Feb 2022 09:45:03 -0800
Subject: [PATCH 28/60] use temporary host buffer to concatenate edge list in edge generation

---
 cpp/tests/utilities/test_graphs.hpp | 98 +++++++++++++++++------------
 1 file changed, 59 insertions(+), 39 deletions(-)

diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp
index 0934beb466a..dc0f13fc9f0 100644
--- a/cpp/tests/utilities/test_graphs.hpp
+++ b/cpp/tests/utilities/test_graphs.hpp
@@ -29,6 +29,53 @@ namespace test {
 namespace detail {

+template <typename T>
+std::optional<rmm::device_uvector<T>> try_allocate(raft::handle_t const& handle, size_t size)
+{
+  try {
+    return std::make_optional<rmm::device_uvector<T>>(size, handle.get_stream());
+  } catch (std::exception const& e) {
+    return std::nullopt;
+  }
+}
+
+// use host memory as temporary buffer if memory allocation on device fails
+template <typename T>
+rmm::device_uvector<T> concatenate(raft::handle_t const& handle,
+                                   std::vector<rmm::device_uvector<T>>&& inputs)
+{
+  size_t tot_count{0};
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    tot_count += inputs[i].size();
+  }
+
+  auto output = try_allocate<T>(handle, tot_count);
+  if (output) {
+    size_t offset{0};
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      raft::copy(
+        (*output).data() + offset, inputs[i].data(), inputs[i].size(), handle.get_stream());
+      offset += inputs[i].size();
+    }
+    inputs.clear();
+    inputs.shrink_to_fit();
+  } else {
+    std::vector<T> h_buffer(tot_count);
+    size_t offset{0};
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      raft::update_host(
+        h_buffer.data() + offset, inputs[i].data(), inputs[i].size(), handle.get_stream());
+      offset += inputs[i].size();
+    }
+    inputs.clear();
+    inputs.shrink_to_fit();
+    output = rmm::device_uvector<T>(tot_count, handle.get_stream());
+    raft::update_device((*output).data(), h_buffer.data(), h_buffer.size(), handle.get_stream());
+  }
+
+  return std::move(*output);
+}
+
 class TranslateGraph_Usecase {
  public:
   TranslateGraph_Usecase() = delete;
@@ -266,43 +313,17 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase {
       tot_edge_counts += src_partitions[i].size();
     }

-    rmm::device_uvector<vertex_t> src_v(tot_edge_counts, handle.get_stream());
-    size_t src_offset{0};
-    for (size_t i = 0; i < src_partitions.size(); ++i) {
-      thrust::copy(handle.get_thrust_policy(),
-                   src_partitions[i].begin(),
-                   src_partitions[i].end(),
-                   src_v.begin() + src_offset);
-      src_offset += src_partitions[i].size();
-    }
-    src_partitions.clear();
-    src_partitions.shrink_to_fit();
-
-    rmm::device_uvector<vertex_t> dst_v(tot_edge_counts, handle.get_stream());
-    size_t dst_offset{0};
-    for (size_t i = 0; i < dst_partitions.size(); ++i) {
-      thrust::copy(handle.get_thrust_policy(),
-                   dst_partitions[i].begin(),
-                   dst_partitions[i].end(),
-                   dst_v.begin() + dst_offset);
-      dst_offset += dst_partitions[i].size();
-    }
-    dst_partitions.clear();
-    dst_partitions.shrink_to_fit();
+    // detail::concatenate uses a host buffer to store input vectors if the initial device memory
+    // allocation for the return vector fails. This does not improve peak memory usage and is not
+    // helpful with rmm_mode = cuda. However, if rmm_mode = pool, memory allocation can fail
+    // even when the aggregate free memory size far exceeds the requested size. This heuristic is
+    // helpful in this case.
+    auto src_v = detail::concatenate(handle, std::move(src_partitions));
+    auto dst_v = detail::concatenate(handle, std::move(dst_partitions));

     std::optional<rmm::device_uvector<weight_t>> weight_v{std::nullopt};
     if (weight_partitions) {
-      weight_v = rmm::device_uvector<weight_t>(tot_edge_counts, handle.get_stream());
-      size_t weight_offset{0};
-      for (size_t i = 0; i < (*weight_partitions).size(); ++i) {
-        thrust::copy(handle.get_thrust_policy(),
-                     (*weight_partitions)[i].begin(),
-                     (*weight_partitions)[i].end(),
-                     (*weight_v).begin() + weight_offset);
-        weight_offset += (*weight_partitions)[i].size();
-      }
-      (*weight_partitions).clear();
-      (*weight_partitions).shrink_to_fit();
+      weight_v = detail::concatenate(handle, std::move(*weight_partitions));
     }

     // 3. generate vertices
@@ -314,11 +335,10 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase {
     rmm::device_uvector<vertex_t> vertex_v(tot_vertex_counts, handle.get_stream());
     size_t v_offset{0};
     for (size_t i = 0; i < partition_vertex_firsts.size(); ++i) {
-      cugraph::detail::sequence_fill(
-        handle.get_stream(),
-        vertex_v.begin() + v_offset,
-        partition_vertex_lasts[i] - partition_vertex_firsts[i],
-        partition_vertex_firsts[i]);
+      cugraph::detail::sequence_fill(handle.get_stream(),
+                                     vertex_v.begin() + v_offset,
+                                     partition_vertex_lasts[i] - partition_vertex_firsts[i],
+                                     partition_vertex_firsts[i]);
       v_offset += partition_vertex_lasts[i] - partition_vertex_firsts[i];
     }

From 234a11971d70a007a607cad8b92350f15f8b6fea Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 16 Feb 2022 09:51:40 -0800
Subject: [PATCH 29/60] update groupby to take mem_frugal_threshold instead of bool mem_frugal

---
 .../cugraph/utilities/shuffle_comm.cuh | 231 +++++++++++++-----
 cpp/src/detail/shuffle_wrappers.cu     |  59 ++---
 2 files changed, 200 insertions(+), 90 deletions(-)

diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh
index d951930a2b3..309a30c78e2 100644
--- a/cpp/include/cugraph/utilities/shuffle_comm.cuh
+++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh
@@ -159,14 +159,16 @@ struct kv_pair_group_id_greater_equal_t {
   }
 };

-// use roughly half temporary buffer than thrust::partition (if first & second partition sizes are
-// comparable)
+// Uses roughly half the temporary buffer of thrust::partition (if the first & second partition
+// sizes are comparable). It also makes multiple smaller allocations instead of one single
+// allocation of the same aggregate size (as thrust::sort does) when the input iterators are zip
+// iterators (this is more favorable to the pool allocator).
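// An aside, not part of this patch: a self-contained host-side sketch of the
// recursive pivot-split groupby that this commit introduces below
// (mem_frugal_groupby). The signature and std::partition are illustrative
// stand-ins; the real code operates on device iterators and falls back to a
// full sort once a range holds fewer elements than mem_frugal_threshold.
#include <algorithm>
#include <cstddef>
#include <iterator>

template <typename Iterator, typename ToGroupIdOp>
void mem_frugal_groupby_sketch(Iterator first,
                               Iterator last,
                               ToGroupIdOp to_group_id,
                               int group_first,
                               int group_last,
                               size_t mem_frugal_threshold)
{
  if (group_last - group_first <= 1) { return; }  // single group: nothing to order
  if (static_cast<size_t>(std::distance(first, last)) < mem_frugal_threshold) {
    // small enough: order the whole range by group id in one step
    std::sort(first, last, [to_group_id](auto const& lhs, auto const& rhs) {
      return to_group_id(lhs) < to_group_id(rhs);
    });
    return;
  }
  // split around the middle group id; each recursive step then needs scratch
  // space proportional to only its own half, halving the largest allocation
  auto pivot  = (group_first + group_last) / 2;
  auto middle = std::partition(first, last, [to_group_id, pivot](auto const& v) {
    return to_group_id(v) < pivot;
  });
  mem_frugal_groupby_sketch(first, middle, to_group_id, group_first, pivot, mem_frugal_threshold);
  mem_frugal_groupby_sketch(middle, last, to_group_id, pivot, group_last, mem_frugal_threshold);
}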
template ValueIterator mem_frugal_partition( ValueIterator value_first, ValueIterator value_last, ValueToGroupIdOp value_to_group_id_op, - int pivot, // group Id less than pivot goes to the first partition + int pivot, // group id less than pivot goes to the first partition rmm::cuda_stream_view stream_view) { auto num_elements = static_cast(thrust::distance(value_first, value_last)); @@ -211,8 +213,10 @@ ValueIterator mem_frugal_partition( return value_first + first_size; } -// use roughly half temporary buffer than thrust::partition (if first & second partition sizes are -// comparable) +// Use roughly half temporary buffer than thrust::partition (if first & second partition sizes are +// comparable). This also uses multiple smaller allocations than one single allocation (thrust::sort +// does this) of the same aggregate size if the input iterators are the zip iterators (this is more +// favorable to the pool allocator). template std::tuple mem_frugal_partition( KeyIterator key_first, @@ -276,6 +280,145 @@ std::tuple mem_frugal_partition( return std::make_tuple(key_first + first_size, value_first + first_size); } +template +void mem_frugal_groupby( + ValueIterator value_first, + ValueIterator value_last, + ValueToGroupIdOp value_to_group_id_op, + int num_groups, + size_t mem_frugal_threshold, // take the memory frugal approach (instead of thrust::sort) if # + // elements to groupby is no smaller than this value + rmm::cuda_stream_view stream_view) +{ + std::vector group_firsts{}; + std::vector group_lasts{}; + std::vector value_firsts{}; + std::vector value_lasts{}; + if (num_groups > 1) { + group_firsts.push_back(int{0}); + group_lasts.push_back(num_groups); + value_firsts.push_back(value_first); + value_lasts.push_back(value_last); + } + + auto offset_first = size_t{0}; + auto offset_last = group_firsts.size(); + while (offset_first < offset_last) { + for (size_t i = offset_first; i < offset_last; ++i) { + auto pivot = (group_firsts[i] + group_lasts[i]) / 2; + if (static_cast(thrust::distance(value_firsts[i], value_lasts[i])) < + mem_frugal_threshold) { + if (group_lasts[i] - group_firsts[i] == 2) { + thrust::partition( + rmm::exec_policy(stream_view), + value_firsts[i], + value_lasts[i], + value_group_id_less_t::value_type, + ValueToGroupIdOp>{value_to_group_id_op, pivot}); + } else { + thrust::sort(rmm::exec_policy(stream_view), + value_firsts[i], + value_lasts[i], + [value_to_group_id_op] __device__(auto lhs, auto rhs) { + return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); + }); + } + } else { + auto second_first = mem_frugal_partition( + value_firsts[i], value_lasts[i], value_to_group_id_op, pivot, stream_view); + if (pivot - group_firsts[i] > 1) { + group_firsts.push_back(group_firsts[i]); + group_lasts.push_back(pivot); + value_firsts.push_back(value_firsts[i]); + value_lasts.push_back(second_first); + } + if (group_lasts[i] - pivot > 1) { + group_firsts.push_back(pivot); + group_lasts.push_back(group_lasts[i]); + value_firsts.push_back(second_first); + value_lasts.push_back(value_lasts[i]); + } + } + } + offset_first = offset_last; + offset_last = group_firsts.size(); + } +} + +template +void mem_frugal_groupby( + KeyIterator key_first, + KeyIterator key_last, + ValueIterator value_first, + KeyToGroupIdOp key_to_group_id_op, + int num_groups, + size_t mem_frugal_threshold, // take the memory frugal approach (instead of thrust::sort) if # + // elements to groupby is no smaller than this value + rmm::cuda_stream_view stream_view) +{ + std::vector group_firsts{}; + 
std::vector group_lasts{}; + std::vector key_firsts{}; + std::vector key_lasts{}; + std::vector value_firsts{}; + if (num_groups > 1) { + group_firsts.push_back(int{0}); + group_lasts.push_back(num_groups); + key_firsts.push_back(key_first); + key_lasts.push_back(key_last); + value_firsts.push_back(value_first); + } + + auto offset_first = size_t{0}; + auto offset_last = group_firsts.size(); + while (offset_first < offset_last) { + for (size_t i = offset_first; i < offset_last; ++i) { + auto pivot = (group_firsts[i] + group_lasts[i]) / 2; + if (static_cast(thrust::distance(key_firsts[i], key_lasts[i])) < + mem_frugal_threshold) { + if (group_lasts[i] - group_firsts[i] == 2) { + auto kv_pair_first = + thrust::make_zip_iterator(thrust::make_tuple(key_firsts[i], value_firsts[i])); + thrust::partition( + rmm::exec_policy(stream_view), + kv_pair_first, + kv_pair_first + thrust::distance(key_firsts[i], key_lasts[i]), + kv_pair_group_id_less_t::value_type, + typename thrust::iterator_traits::value_type, + KeyToGroupIdOp>{key_to_group_id_op, pivot}); + } else { + thrust::sort_by_key(rmm::exec_policy(stream_view), + key_firsts[i], + key_lasts[i], + value_firsts[i], + [key_to_group_id_op] __device__(auto lhs, auto rhs) { + return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); + }); + } + } else { + auto second_first = mem_frugal_partition( + key_firsts[i], key_lasts[i], value_firsts[i], key_to_group_id_op, pivot, stream_view); + if (pivot - group_firsts[i] > 1) { + group_firsts.push_back(group_firsts[i]); + group_lasts.push_back(pivot); + key_firsts.push_back(key_firsts[i]); + key_lasts.push_back(std::get<0>(second_first)); + value_firsts.push_back(value_firsts[i]); + } + if (group_lasts[i] - pivot > 1) { + group_firsts.push_back(pivot); + group_lasts.push_back(group_lasts[i]); + key_firsts.push_back(std::get<0>(second_first)); + key_lasts.push_back(key_lasts[i]); + value_firsts.push_back(std::get<1>(second_first)); + } + } + } + offset_first = offset_last; + offset_last = group_firsts.size(); + } +} + } // namespace detail template @@ -283,33 +426,15 @@ rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [I ValueIterator tx_value_last /* [INOUT */, ValueToGroupIdOp value_to_group_id_op, int num_groups, - bool mem_frugal, + size_t mem_frugal_threshold, rmm::cuda_stream_view stream_view) { - if (mem_frugal) { - auto pivot = num_groups / 2; - auto second_first = detail::mem_frugal_partition( - tx_value_first, tx_value_last, value_to_group_id_op, pivot, stream_view); - thrust::sort(rmm::exec_policy(stream_view), - tx_value_first, - second_first, - [value_to_group_id_op] __device__(auto lhs, auto rhs) { - return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); - }); - thrust::sort(rmm::exec_policy(stream_view), - second_first, - tx_value_last, - [value_to_group_id_op] __device__(auto lhs, auto rhs) { - return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); - }); - } else { - thrust::sort(rmm::exec_policy(stream_view), - tx_value_first, - tx_value_last, - [value_to_group_id_op] __device__(auto lhs, auto rhs) { - return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); - }); - } + detail::mem_frugal_groupby(tx_value_first, + tx_value_last, + value_to_group_id_op, + num_groups, + mem_frugal_threshold, + stream_view); auto group_id_first = thrust::make_transform_iterator( tx_value_first, @@ -334,36 +459,16 @@ rmm::device_uvector groupby_and_count(VertexIterator tx_key_first /* [IN ValueIterator tx_value_first /* [INOUT */, KeyToGroupIdOp key_to_group_id_op, int 
num_groups, - bool mem_frugal, + size_t mem_frugal_threshold, rmm::cuda_stream_view stream_view) { - if (mem_frugal) { - auto pivot = num_groups / 2; - auto second_first = detail::mem_frugal_partition( - tx_key_first, tx_key_last, tx_value_first, key_to_group_id_op, pivot, stream_view); - thrust::sort_by_key(rmm::exec_policy(stream_view), - tx_key_first, - std::get<0>(second_first), - tx_value_first, - [key_to_group_id_op] __device__(auto lhs, auto rhs) { - return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); - }); - thrust::sort_by_key(rmm::exec_policy(stream_view), - std::get<0>(second_first), - tx_key_last, - std::get<1>(second_first), - [key_to_group_id_op] __device__(auto lhs, auto rhs) { - return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); - }); - } else { - thrust::sort_by_key(rmm::exec_policy(stream_view), - tx_key_first, - tx_key_last, - tx_value_first, - [key_to_group_id_op] __device__(auto lhs, auto rhs) { - return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); - }); - } + detail::mem_frugal_groupby(tx_key_first, + tx_key_last, + tx_value_first, + key_to_group_id_op, + num_groups, + mem_frugal_threshold, + stream_view); auto group_id_first = thrust::make_transform_iterator( tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); }); @@ -439,8 +544,12 @@ auto groupby_gpu_id_and_shuffle_values(raft::comms::comms_t const& comm, { auto const comm_size = comm.get_size(); - auto d_tx_value_counts = groupby_and_count( - tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), false, stream_view); + auto d_tx_value_counts = groupby_and_count(tx_value_first, + tx_value_last, + value_to_gpu_id_op, + comm.get_size(), + std::numeric_limits::max(), + stream_view); std::vector tx_counts{}; std::vector tx_offsets{}; @@ -494,7 +603,7 @@ auto groupby_gpu_id_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, tx_value_first, key_to_gpu_id_op, comm.get_size(), - false, + std::numeric_limits::max(), stream_view); std::vector tx_counts{}; diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu index b22c839e346..6e9434882ba 100644 --- a/cpp/src/detail/shuffle_wrappers.cu +++ b/cpp/src/detail/shuffle_wrappers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,20 +43,21 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, auto total_global_mem = handle.get_device_properties().totalGlobalMem; auto element_size = sizeof(vertex_t) * 2 + (d_edgelist_weights ? 
sizeof(weight_t) : size_t{0});
-  auto mem_frugal =
-    d_edgelist_majors.size() * element_size >=
-    total_global_mem /
-      5;  // if the data size exceeds 1/5 of the device memory (1/5 is a tuning parameter),
-          // groupby_and_count requires a temporary buffer comparable to the input data size; if
-          // mem_frugal is set to true, the temporary buffer size can be reduced by up to 50%
-
-  // invoke groupby_and_count and shuffle_values instead of directly calling
-  // groupby_gpu_id_and_shuffle_values to pass mem_frugal (there is no benefit in reducing peak
-  // memory as we need to allocate a receive buffer anyway), but this reduces the maximum memory
-  // allocation size by half (thrust::sort used inside the groupby_and_count allocates the entire
-  // temporary buffer in a single chunk, and the pool allocator often cannot handle a large single
-  // allocation (due to fragmentation) even when the remaining free memory in aggregate is
-  // significantly larger than the requested size).
+  auto constexpr mem_frugal_ratio =
+    0.1;  // if the expected temporary buffer size exceeds the mem_frugal_ratio of the
+          // total_global_mem, switch to the memory frugal approach (thrust::sort is used to
+          // group-by by default, and thrust::sort requires a temporary buffer comparable to the
+          // input data size)
+  auto mem_frugal_threshold =
+    static_cast<size_t>(static_cast<double>(total_global_mem / element_size) * mem_frugal_ratio);
+
+  // invoke groupby_and_count and shuffle_values instead of directly calling
+  // groupby_gpu_id_and_shuffle_values to pass mem_frugal_threshold (there is no benefit in
+  // reducing peak memory as we need to allocate a receive buffer anyway), but this reduces the
+  // maximum memory allocation size by half or more (thrust::sort used inside the
+  // groupby_and_count allocates the entire temporary buffer in a single chunk, and the pool
+  // allocator often cannot handle a large single allocation (due to fragmentation) even when the
+  // remaining free memory in aggregate is significantly larger than the requested size).
   rmm::device_uvector<vertex_t> d_rx_edgelist_majors(0, handle.get_stream());
   rmm::device_uvector<vertex_t> d_rx_edgelist_minors(0, handle.get_stream());
   std::optional<rmm::device_uvector<weight_t>> d_rx_edgelist_weights{std::nullopt};
@@ -73,7 +74,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle,
         return key_func(thrust::get<0>(val), thrust::get<1>(val));
       },
       comm_size,
-      mem_frugal,
+      mem_frugal_threshold,
       handle.get_stream());

     std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
@@ -99,7 +100,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle,
         return key_func(thrust::get<0>(val), thrust::get<1>(val));
       },
       comm_size,
-      mem_frugal,
+      mem_frugal_threshold,
       handle.get_stream());

     std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
@@ -109,8 +110,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle,
                       handle.get_stream());
     handle.sync_stream();

-    std::forward_as_tuple(
-      std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors), std::ignore) =
+    std::forward_as_tuple(std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors), std::ignore) =
       shuffle_values(comm, edge_first, h_tx_value_counts, handle.get_stream());
   }

@@ -196,12 +196,13 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
   auto total_global_mem = handle.get_device_properties().totalGlobalMem;
   auto element_size = sizeof(vertex_t) * 2 + (d_edgelist_weights ? sizeof(weight_t) : size_t{0});
-  auto mem_frugal =
-    d_edgelist_majors.size() * element_size >=
-    total_global_mem /
-      5;  // if the data size exceeds 1/5 of the device memory (1/5 is a tuning parameter),
-          // groupby_and_count requires temporary buffer comparable to the input data size, if
-          // mem_frugal is set to true, temporary buffer size can be reduced up to 50%
+  auto constexpr mem_frugal_ratio =
+    0.1;  // if the expected temporary buffer size exceeds the mem_frugal_ratio of the
+          // total_global_mem, switch to the memory frugal approach (thrust::sort is used to
+          // group-by by default, and thrust::sort requires a temporary buffer comparable to the
+          // input data size)
+  auto mem_frugal_threshold =
+    static_cast<size_t>(static_cast<double>(total_global_mem / element_size) * mem_frugal_ratio);

   auto pair_first = thrust::make_zip_iterator(
     thrust::make_tuple(d_edgelist_majors.begin(), d_edgelist_minors.begin()));
@@ -227,13 +228,13 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
                                  d_edgelist_weights->begin(),
                                  local_partition_id_gpu_id_pair_op,
                                  comm_size,
-                                 mem_frugal,
+                                 mem_frugal_threshold,
                                  handle.get_stream())
              : cugraph::groupby_and_count(pair_first,
                                           pair_first + d_edgelist_majors.size(),
                                           local_partition_id_gpu_id_pair_op,
                                           comm_size,
-                                          mem_frugal,
+                                          mem_frugal_threshold,
                                           handle.get_stream());
   } else {
     auto local_partition_id_op =
@@ -249,13 +250,13 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
                                  d_edgelist_weights->begin(),
                                  local_partition_id_op,
                                  col_comm_size,
-                                 mem_frugal,
+                                 mem_frugal_threshold,
                                  handle.get_stream())
              : cugraph::groupby_and_count(pair_first,
                                           pair_first + d_edgelist_majors.size(),
                                           local_partition_id_op,
                                           col_comm_size,
-                                          mem_frugal,
+                                          mem_frugal_threshold,
                                           handle.get_stream());
   }
 }

From 95f2295714756f4e72647f00b833dadb234b113b Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 16 Feb 2022 13:14:13 -0800
Subject: [PATCH 30/60] move counting unique local edge majors/minors inside the graph constructor to limit peak memory usage

---
 cpp/include/cugraph/graph_functions.hpp      |   3 -
 cpp/src/structure/coarsen_graph_impl.cuh     |   4 +-
 .../create_graph_from_edgelist_impl.cuh      |   4 +-
 cpp/src/structure/graph_impl.cuh             | 100 +++++++-------
 cpp/src/structure/renumber_edgelist_impl.cuh | 112 ++++++++----------
 5 files changed, 103 insertions(+), 120 deletions(-)

diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp
index 5ddc244b183..d3017ac7aaa 100644
--- a/cpp/include/cugraph/graph_functions.hpp
+++ b/cpp/include/cugraph/graph_functions.hpp
@@ -37,9 +37,6 @@ struct renumber_meta_t>
   edge_t number_of_edges{};
   partition_t<vertex_t> partition{};
   std::vector<vertex_t> segment_offsets{};
-
-  vertex_t num_local_unique_edge_majors{};
-  vertex_t num_local_unique_edge_minors{};
 };

 template
diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh
index aa2a87ccc72..2559f9ef408 100644
--- a/cpp/src/structure/coarsen_graph_impl.cuh
+++ b/cpp/src/structure/coarsen_graph_impl.cuh
@@ -421,9 +421,7 @@ coarsen_graph(
       meta.number_of_edges,
       graph_properties_t{graph_view.is_symmetric(), false},
       meta.partition,
-      meta.segment_offsets,
-      store_transposed ? meta.num_local_unique_edge_minors : meta.num_local_unique_edge_majors,
-      store_transposed ?
meta.num_local_unique_edge_majors : meta.num_local_unique_edge_minors}), + meta.segment_offsets}), std::move(renumber_map_labels)); } diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index f05f5f957c6..985c99eda36 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -288,9 +288,7 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, meta.number_of_edges, graph_properties, meta.partition, - meta.segment_offsets, - store_transposed ? meta.num_local_unique_edge_minors : meta.num_local_unique_edge_majors, - store_transposed ? meta.num_local_unique_edge_majors : meta.num_local_unique_edge_minors}), + meta.segment_offsets}), std::optional>{std::move(renumber_map_labels)}); } diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index ef64e60ac2f..c674430ac20 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -80,6 +80,29 @@ struct has_nzd_t { } }; +// can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an +// extended __device__ lambda must allow its address to be taken) +template +struct atomic_or_bitmap_t { + uint32_t* bitmaps{nullptr}; + vertex_t minor_first{}; + + __device__ void operator()(vertex_t minor) const { + auto minor_offset = minor - minor_first; + auto mask = uint32_t{1} << (minor_offset % (sizeof(uint32_t) * 8)); + atomicOr(bitmaps + (minor_offset / (sizeof(uint32_t) * 8)), mask); + } +}; + +// can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an +// extended __device__ lambda must allow its address to be taken) +template +struct popc_t { + __device__ vertex_t operator()(uint32_t bitmap) const { + return static_cast(__popc(bitmap)); + } +}; + // can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an // extended __device__ lambda must allow its address to be taken) template @@ -573,48 +596,6 @@ graph_t majors(number_of_local_edges, handle.get_stream()); - rmm::device_uvector minors(number_of_local_edges, handle.get_stream()); - size_t cur_size{0}; - for (size_t i = 0; i < edgelists.size(); ++i) { - auto p_majors = store_transposed ? edgelists[i].p_dst_vertices : edgelists[i].p_src_vertices; - auto p_minors = store_transposed ? edgelists[i].p_src_vertices : edgelists[i].p_dst_vertices; - thrust::copy(handle.get_thrust_policy(), - p_majors, - p_majors + edgelists[i].number_of_edges, - majors.begin() + cur_size); - thrust::copy(handle.get_thrust_policy(), - p_minors, - p_minors + edgelists[i].number_of_edges, - minors.begin() + cur_size); - cur_size += edgelists[i].number_of_edges; - } - thrust::sort(handle.get_thrust_policy(), majors.begin(), majors.end()); - thrust::sort(handle.get_thrust_policy(), minors.begin(), minors.end()); - auto num_local_unique_edge_majors = static_cast(thrust::distance( - majors.begin(), thrust::unique(handle.get_thrust_policy(), majors.begin(), majors.end()))); - auto num_local_unique_edge_minors = static_cast(thrust::distance( - minors.begin(), thrust::unique(handle.get_thrust_policy(), minors.begin(), minors.end()))); - // FIXME: temporarily disable this check as these are currently not used - // (row_col_properties_kv_pair_fill_ratio_threshold is set to 0.0, so (key, value) pairs for - // row/column properties will be never enabled) and we're not currently exposing this to the - // python layer. 
Should be re-enabled later once we enable the (key, value) pair feature and - // hopefully simplify the python graph creation pipeline as well (so no need to pass this - // information to the python layer). -#if 0 - if constexpr (store_transposed) { - CUGRAPH_EXPECTS(num_local_unique_edge_majors == meta.num_local_unique_edge_cols, - "Invalid input argument: num_local_unique_edge_cols is erroneous."); - CUGRAPH_EXPECTS(num_local_unique_edge_minors == meta.num_local_unique_edge_rows, - "Invalid input argument: num_local_unique_edge_rows is erroneous."); - } else { - CUGRAPH_EXPECTS(num_local_unique_edge_majors == meta.num_local_unique_edge_rows, - "Invalid input argument: num_local_unique_edge_rows is erroneous."); - CUGRAPH_EXPECTS(num_local_unique_edge_minors == meta.num_local_unique_edge_cols, - "Invalid input argument: num_local_unique_edge_cols is erroneous."); - } -#endif } // aggregate segment_offsets @@ -701,13 +682,40 @@ graph_t(adj_matrix_partition_indices_[i].size())); } + // if # unique edge rows/cols << V / row_comm_size|col_comm_size, store unique edge rows/cols to // support storing edge row/column properties in (key, value) pairs. - auto num_local_unique_edge_majors = - store_transposed ? meta.num_local_unique_edge_cols : meta.num_local_unique_edge_rows; - auto num_local_unique_edge_minors = - store_transposed ? meta.num_local_unique_edge_rows : meta.num_local_unique_edge_cols; + vertex_t num_local_unique_edge_majors{0}; + for (size_t i = 0; i < adj_matrix_partition_offsets_.size(); ++i) { + num_local_unique_edge_majors += thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(static_cast(adj_matrix_partition_offsets_[i].size() - 1)), + has_nzd_t{adj_matrix_partition_offsets_[i].data(), vertex_t{0}}); + } + + auto [minor_first, minor_last] = partition_.get_matrix_partition_minor_range(); + rmm::device_uvector minor_bitmaps( + ((minor_last - minor_first) + sizeof(uint32_t) * 8 - 1) / (sizeof(uint32_t) * 8), + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), minor_bitmaps.begin(), minor_bitmaps.end(), uint32_t{0}); + for (size_t i = 0; i < adj_matrix_partition_indices_.size(); ++i) { + thrust::for_each(handle.get_thrust_policy(), + adj_matrix_partition_indices_[i].begin(), + adj_matrix_partition_indices_[i].end(), + atomic_or_bitmap_t{minor_bitmaps.data(), minor_first}); + } + + auto count_first = thrust::make_transform_iterator(minor_bitmaps.begin(), popc_t{}); + auto num_local_unique_edge_minors = thrust::reduce( + handle.get_thrust_policy(), + count_first, + count_first + minor_bitmaps.size(), + vertex_t{0}); + + minor_bitmaps.resize(0, handle.get_stream()); + minor_bitmaps.shrink_to_fit(handle.get_stream()); vertex_t aggregate_major_size{0}; for (size_t i = 0; i < partition_.get_number_of_matrix_partitions(); ++i) { diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 959d11b783f..aeb7682f440 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -77,23 +77,21 @@ struct search_and_set_degree_t { } }; -// returns renumber map, segment_offsets, and # unique edge majors & minors +// returns renumber map and segment_offsets template -std::tuple, std::vector, vertex_t, vertex_t> -compute_renumber_map(raft::handle_t const& handle, - std::optional>&& local_vertices, - std::vector const& edgelist_majors, - std::vector const& edgelist_minors, - std::vector const& 
edgelist_edge_counts) +std::tuple, std::vector> compute_renumber_map( + raft::handle_t const& handle, + std::optional>&& local_vertices, + std::vector const& edgelist_majors, + std::vector const& edgelist_minors, + std::vector const& edgelist_edge_counts) { rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); - vertex_t num_local_unique_edge_majors{0}; - vertex_t num_local_unique_edge_minors{0}; edge_t num_local_edges = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); // 1. if local_vertices.has_value() is false, find unique vertices from edge majors (to construct - // local_vertices), unique edge majors will be counted in step 4. + // local_vertices) rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); if (!local_vertices) { @@ -127,42 +125,40 @@ compute_renumber_map(raft::handle_t const& handle, sorted_unique_majors.shrink_to_fit(handle.get_stream()); } - // 2. count unique edge minors. - // if local_vertices.has_value() is false, keep unique vertices from edge minors as well (to - // construct local_vertices) + // 2. if local_vertices.has_value() is false, find unique vertices from edge minors (to construct + // local_vertices) - rmm::device_uvector sorted_unique_minors(num_local_edges, handle.get_stream()); - size_t minor_offset{0}; - for (size_t i = 0; i < edgelist_minors.size(); ++i) { - thrust::copy(handle.get_thrust_policy(), - edgelist_minors[i], - edgelist_minors[i] + edgelist_edge_counts[i], - sorted_unique_minors.begin() + minor_offset); - thrust::sort(handle.get_thrust_policy(), - sorted_unique_minors.begin() + minor_offset, - sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]); - minor_offset += static_cast(thrust::distance( - sorted_unique_minors.begin() + minor_offset, - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin() + minor_offset, - sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]))); - } - sorted_unique_minors.resize(minor_offset, handle.get_stream()); - if (edgelist_minors.size() > 1) { - thrust::sort( - handle.get_thrust_policy(), sorted_unique_minors.begin(), sorted_unique_minors.end()); - sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin(), - sorted_unique_minors.end())), - handle.get_stream()); + rmm::device_uvector sorted_unique_minors(0, handle.get_stream()); + if (!local_vertices) { + sorted_unique_minors.resize(num_local_edges, handle.get_stream()); + size_t minor_offset{0}; + for (size_t i = 0; i < edgelist_minors.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + edgelist_minors[i], + edgelist_minors[i] + edgelist_edge_counts[i], + sorted_unique_minors.begin() + minor_offset); + thrust::sort(handle.get_thrust_policy(), + sorted_unique_minors.begin() + minor_offset, + sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]); + minor_offset += static_cast(thrust::distance( + sorted_unique_minors.begin() + minor_offset, + thrust::unique(handle.get_thrust_policy(), + sorted_unique_minors.begin() + minor_offset, + sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]))); + } + sorted_unique_minors.resize(minor_offset, handle.get_stream()); + if (edgelist_minors.size() > 1) { + thrust::sort( + handle.get_thrust_policy(), sorted_unique_minors.begin(), sorted_unique_minors.end()); + sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + 
sorted_unique_minors.begin(), + sorted_unique_minors.end())), + handle.get_stream()); + } + sorted_unique_minors.shrink_to_fit(handle.get_stream()); } - num_local_unique_edge_minors = static_cast(sorted_unique_minors.size()); - - if (local_vertices) { sorted_unique_minors.resize(0, handle.get_stream()); } - sorted_unique_minors.shrink_to_fit(handle.get_stream()); - // 3. update sorted_local_vertices. // if local_vertices.has_value() is false, reconstruct local_vertices first @@ -207,8 +203,7 @@ compute_renumber_map(raft::handle_t const& handle, } } - // 4. compute global degrees for the sorted local vertices, and count unique edge majors on the - // way + // 4. compute global degrees for the sorted local vertices rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); std::optional> stream_pool_indices{ @@ -280,8 +275,6 @@ compute_renumber_map(raft::handle_t const& handle, tmp_keys.begin(), tmp_values.begin()); - num_local_unique_edge_majors += num_unique_majors; - tmp_majors.resize(0, loop_stream); tmp_majors.shrink_to_fit(loop_stream); @@ -342,8 +335,6 @@ compute_renumber_map(raft::handle_t const& handle, tmp_keys.begin(), tmp_values.begin()); - num_local_unique_edge_majors += num_unique_majors; - tmp_majors.resize(0, handle.get_stream()); tmp_majors.shrink_to_fit(handle.get_stream()); @@ -426,10 +417,7 @@ compute_renumber_map(raft::handle_t const& handle, handle.get_stream()); handle.sync_stream(); - return std::make_tuple(std::move(sorted_local_vertices), - h_segment_offsets, - num_local_unique_edge_majors, - num_local_unique_edge_minors); + return std::make_tuple(std::move(sorted_local_vertices), h_segment_offsets); } template @@ -682,10 +670,7 @@ renumber_edgelist( // 1. compute renumber map - auto [renumber_map_labels, - vertex_partition_segment_offsets, - num_unique_edge_majors, - num_unique_edge_minors] = + auto [renumber_map_labels, vertex_partition_segment_offsets] = detail::compute_renumber_map(handle, std::move(local_vertices), edgelist_const_majors, @@ -765,7 +750,8 @@ renumber_edgelist( } } - if ((partition.get_matrix_partition_minor_size() >= number_of_edges / comm_size) && + if ((static_cast(partition.get_matrix_partition_minor_size() / load_factor) >= + static_cast(number_of_edges / comm_size)) && edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part vertex_t max_segment_size{0}; @@ -863,12 +849,8 @@ renumber_edgelist( return std::make_tuple( std::move(renumber_map_labels), - renumber_meta_t{number_of_vertices, - number_of_edges, - partition, - vertex_partition_segment_offsets, - num_unique_edge_majors, - num_unique_edge_minors}); + renumber_meta_t{ + number_of_vertices, number_of_edges, partition, vertex_partition_segment_offsets}); } template @@ -894,7 +876,7 @@ renumber_edgelist(raft::handle_t const& handle, rmm::device_uvector renumber_map_labels(0, handle.get_stream()); std::vector segment_offsets{}; - std::tie(renumber_map_labels, segment_offsets, std::ignore, std::ignore) = + std::tie(renumber_map_labels, segment_offsets) = detail::compute_renumber_map( handle, std::move(vertices), From 15aeb4ca9e9db56a2fb2c7434eb5483327e6ea78 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 16 Feb 2022 13:32:15 -0800 Subject: [PATCH 31/60] clang-format --- cpp/src/structure/create_graph_from_edgelist_impl.cuh | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh 
b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 985c99eda36..bad875d554f 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -283,12 +283,11 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, cugraph::graph_t( handle, edgelists, - cugraph::graph_meta_t{ - meta.number_of_vertices, - meta.number_of_edges, - graph_properties, - meta.partition, - meta.segment_offsets}), + cugraph::graph_meta_t{meta.number_of_vertices, + meta.number_of_edges, + graph_properties, + meta.partition, + meta.segment_offsets}), std::optional>{std::move(renumber_map_labels)}); } From e7102e09d81edbd233744bb313f6dee004fcef1f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 16 Feb 2022 13:35:42 -0800 Subject: [PATCH 32/60] copyright year --- cpp/include/cugraph/graph_functions.hpp | 2 +- .../prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh | 2 +- cpp/include/cugraph/utilities/cython.hpp | 2 +- cpp/src/components/weakly_connected_components_impl.cuh | 2 +- cpp/src/structure/coarsen_graph_impl.cuh | 2 +- cpp/src/structure/create_graph_from_edgelist_impl.cuh | 2 +- cpp/src/utilities/cython.cu | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index d3017ac7aaa..c170ce65253 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh index 968d99b7d25..c81cf2d133e 100644 --- a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cugraph/utilities/cython.hpp b/cpp/include/cugraph/utilities/cython.hpp index 260393009bd..7cc6afb8aee 100644 --- a/cpp/include/cugraph/utilities/cython.hpp +++ b/cpp/include/cugraph/utilities/cython.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 757fc9e3d23..a1f663a301c 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 2559f9ef408..6234acf5559 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index bad875d554f..ea12a3562ba 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 59241a3e913..1527ae90afd 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 7b544a6b9044bc23bb8cebf0cd61b5e8748120bd Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 16 Feb 2022 13:40:01 -0800 Subject: [PATCH 33/60] clang-format --- cpp/src/structure/graph_impl.cuh | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index e136906aef4..eff76df8a79 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -87,7 +87,8 @@ struct atomic_or_bitmap_t { uint32_t* bitmaps{nullptr}; vertex_t minor_first{}; - __device__ void operator()(vertex_t minor) const { + __device__ void operator()(vertex_t minor) const + { auto minor_offset = minor - minor_first; auto mask = uint32_t{1} << (minor_offset % (sizeof(uint32_t) * 8)); atomicOr(bitmaps + (minor_offset / (sizeof(uint32_t) * 8)), mask); @@ -98,7 +99,8 @@ struct atomic_or_bitmap_t { // extended __device__ lambda must allow its address to be taken) template struct popc_t { - __device__ vertex_t operator()(uint32_t bitmap) const { + __device__ vertex_t operator()(uint32_t bitmap) const + { return static_cast(__popc(bitmap)); } }; @@ -682,7 +684,6 @@ graph_t(adj_matrix_partition_indices_[i].size())); } - // if # unique edge rows/cols << V / row_comm_size|col_comm_size, store unique edge rows/cols to // support storing edge row/column properties in (key, value) pairs. 
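The atomic_or_bitmap_t and popc_t functors reformatted above carry the unique-edge-minor count introduced in patch 30: set one bit per observed vertex in the minor range with atomicOr, then sum per-word popcounts. A self-contained sketch of the same technique (hypothetical names; int vertices chosen for brevity):

#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>

#include <cstdint>

struct set_bit_t {
  uint32_t* bitmap{nullptr};
  int minor_first{};

  __device__ void operator()(int minor) const
  {
    auto offset = minor - minor_first;
    atomicOr(bitmap + (offset / 32), uint32_t{1} << (offset % 32));
  }
};

struct popcount_t {
  __device__ int operator()(uint32_t word) const { return __popc(word); }
};

// count distinct vertices in minors, all falling in [minor_first, minor_last):
// the bitmap needs only (minor_last - minor_first) bits of extra memory,
// instead of the O(E) scratch a sort-and-unique approach would require
int count_unique_minors_sketch(thrust::device_vector<int> const& minors,
                               int minor_first,
                               int minor_last)
{
  thrust::device_vector<uint32_t> bitmap(((minor_last - minor_first) + 31) / 32, uint32_t{0});
  thrust::for_each(minors.begin(),
                   minors.end(),
                   set_bit_t{thrust::raw_pointer_cast(bitmap.data()), minor_first});
  auto count_first = thrust::make_transform_iterator(bitmap.begin(), popcount_t{});
  return thrust::reduce(count_first, count_first + bitmap.size(), 0);
}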
@@ -691,7 +692,8 @@ graph_t(adj_matrix_partition_offsets_[i].size() - 1)), + thrust::make_counting_iterator( + static_cast(adj_matrix_partition_offsets_[i].size() - 1)), has_nzd_t{adj_matrix_partition_offsets_[i].data(), vertex_t{0}}); } @@ -709,10 +711,7 @@ graph_t{}); auto num_local_unique_edge_minors = thrust::reduce( - handle.get_thrust_policy(), - count_first, - count_first + minor_bitmaps.size(), - vertex_t{0}); + handle.get_thrust_policy(), count_first, count_first + minor_bitmaps.size(), vertex_t{0}); minor_bitmaps.resize(0, handle.get_stream()); minor_bitmaps.shrink_to_fit(handle.get_stream()); From 6b4018735c3ac58a578183830834f19794ec5731 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 16 Feb 2022 17:29:12 -0800 Subject: [PATCH 34/60] cosmetic updates --- cpp/src/structure/coarsen_graph_impl.cuh | 211 +++++++++++------------ 1 file changed, 100 insertions(+), 111 deletions(-) diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index b0f6c7eca05..ec01135f7ae 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -46,43 +46,38 @@ namespace cugraph { namespace { template -edge_t groupby_e_and_coarsen_edgelist(vertex_t* edgelist_major_vertices /* [INOUT] */, - vertex_t* edgelist_minor_vertices /* [INOUT] */, +edge_t groupby_e_and_coarsen_edgelist(vertex_t* edgelist_majors /* [INOUT] */, + vertex_t* edgelist_minors /* [INOUT] */, std::optional edgelist_weights /* [INOUT] */, edge_t number_of_edges, cudaStream_t stream) { - auto pair_first = - thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices)); + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors, edgelist_minors)); if (edgelist_weights) { thrust::sort_by_key( rmm::exec_policy(stream), pair_first, pair_first + number_of_edges, *edgelist_weights); - rmm::device_uvector tmp_edgelist_major_vertices(number_of_edges, stream); - rmm::device_uvector tmp_edgelist_minor_vertices(tmp_edgelist_major_vertices.size(), - stream); - rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream); - auto it = thrust::reduce_by_key( - rmm::exec_policy(stream), - pair_first, - pair_first + number_of_edges, - (*edgelist_weights), - thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), - tmp_edgelist_minor_vertices.begin())), - tmp_edgelist_weights.begin()); + rmm::device_uvector tmp_edgelist_majors(number_of_edges, stream); + rmm::device_uvector tmp_edgelist_minors(tmp_edgelist_majors.size(), stream); + rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_majors.size(), stream); + auto it = thrust::reduce_by_key(rmm::exec_policy(stream), + pair_first, + pair_first + number_of_edges, + (*edgelist_weights), + thrust::make_zip_iterator(thrust::make_tuple( + tmp_edgelist_majors.begin(), tmp_edgelist_minors.begin())), + tmp_edgelist_weights.begin()); auto ret = static_cast(thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it))); - auto edge_first = - thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), - tmp_edgelist_minor_vertices.begin(), - tmp_edgelist_weights.begin())); + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + tmp_edgelist_majors.begin(), tmp_edgelist_minors.begin(), tmp_edgelist_weights.begin())); thrust::copy(rmm::exec_policy(stream), edge_first, edge_first + ret, - thrust::make_zip_iterator(thrust::make_tuple( - edgelist_major_vertices, edgelist_minor_vertices, 
*edgelist_weights))); + thrust::make_zip_iterator( + thrust::make_tuple(edgelist_majors, edgelist_minors, *edgelist_weights))); return ret; } else { @@ -113,27 +108,26 @@ decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( // FIXME: it might be possible to directly create relabled & coarsened edgelist from the // compressed sparse format to save memory - rmm::device_uvector edgelist_major_vertices(matrix_partition.get_number_of_edges(), - handle.get_stream()); - rmm::device_uvector edgelist_minor_vertices(edgelist_major_vertices.size(), - handle.get_stream()); + rmm::device_uvector edgelist_majors(matrix_partition.get_number_of_edges(), + handle.get_stream()); + rmm::device_uvector edgelist_minors(edgelist_majors.size(), handle.get_stream()); auto edgelist_weights = matrix_partition.get_weights() ? std::make_optional>( - edgelist_major_vertices.size(), handle.get_stream()) + edgelist_majors.size(), handle.get_stream()) : std::nullopt; detail::decompress_matrix_partition_to_edgelist( handle, matrix_partition, - edgelist_major_vertices.data(), - edgelist_minor_vertices.data(), + edgelist_majors.data(), + edgelist_minors.data(), edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, segment_offsets); - auto pair_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors.begin(), edgelist_minors.begin())); thrust::transform(handle.get_thrust_policy(), pair_first, - pair_first + edgelist_major_vertices.size(), + pair_first + edgelist_majors.size(), pair_first, [major_label_first, minor_label_input, @@ -145,23 +139,22 @@ decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( }); auto number_of_edges = groupby_e_and_coarsen_edgelist( - edgelist_major_vertices.data(), - edgelist_minor_vertices.data(), + edgelist_majors.data(), + edgelist_minors.data(), edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, - static_cast(edgelist_major_vertices.size()), + static_cast(edgelist_majors.size()), handle.get_stream()); - edgelist_major_vertices.resize(number_of_edges, handle.get_stream()); - edgelist_major_vertices.shrink_to_fit(handle.get_stream()); - edgelist_minor_vertices.resize(number_of_edges, handle.get_stream()); - edgelist_minor_vertices.shrink_to_fit(handle.get_stream()); + edgelist_majors.resize(number_of_edges, handle.get_stream()); + edgelist_majors.shrink_to_fit(handle.get_stream()); + edgelist_minors.resize(number_of_edges, handle.get_stream()); + edgelist_minors.shrink_to_fit(handle.get_stream()); if (edgelist_weights) { (*edgelist_weights).resize(number_of_edges, handle.get_stream()); (*edgelist_weights).shrink_to_fit(handle.get_stream()); } - return std::make_tuple(std::move(edgelist_major_vertices), - std::move(edgelist_minor_vertices), - std::move(edgelist_weights)); + return std::make_tuple( + std::move(edgelist_majors), std::move(edgelist_minors), std::move(edgelist_weights)); } } // namespace @@ -213,19 +206,19 @@ coarsen_graph( copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels); } - std::vector> coarsened_edgelist_major_vertices{}; - std::vector> coarsened_edgelist_minor_vertices{}; + std::vector> coarsened_edgelist_majors{}; + std::vector> coarsened_edgelist_minors{}; auto coarsened_edgelist_weights = graph_view.is_weighted() ? 
std::make_optional>>({}) : std::nullopt; - coarsened_edgelist_major_vertices.reserve(graph_view.get_number_of_local_adj_matrix_partitions()); - coarsened_edgelist_minor_vertices.reserve(coarsened_edgelist_major_vertices.size()); + coarsened_edgelist_majors.reserve(graph_view.get_number_of_local_adj_matrix_partitions()); + coarsened_edgelist_minors.reserve(coarsened_edgelist_majors.size()); if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights).reserve(coarsened_edgelist_major_vertices.size()); + (*coarsened_edgelist_weights).reserve(coarsened_edgelist_majors.size()); } for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - coarsened_edgelist_major_vertices.emplace_back(0, handle.get_stream()); - coarsened_edgelist_minor_vertices.emplace_back(0, handle.get_stream()); + coarsened_edgelist_majors.emplace_back(0, handle.get_stream()); + coarsened_edgelist_minors.emplace_back(0, handle.get_stream()); if (coarsened_edgelist_weights) { (*coarsened_edgelist_weights).emplace_back(0, handle.get_stream()); } @@ -248,7 +241,7 @@ coarsen_graph( static_cast(i), handle.get_stream()); - auto [edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights] = + auto [edgelist_majors, edgelist_minors, edgelist_weights] = decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( handle, matrix_partition_device_view_t( @@ -259,10 +252,10 @@ coarsen_graph( // 1-2. globally shuffle - std::tie(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights) = + std::tie(edgelist_majors, edgelist_minors, edgelist_weights) = cugraph::detail::shuffle_edgelist_by_gpu_id(handle, - std::move(edgelist_major_vertices), - std::move(edgelist_minor_vertices), + std::move(edgelist_majors), + std::move(edgelist_minors), std::move(edgelist_weights)); // 1-3. append data to local adjacency matrix partitions @@ -272,7 +265,7 @@ coarsen_graph( // groupby_adj_matrix_partition_and_shuffle_values). auto counts = cugraph::detail::groupby_and_count_edgelist_by_local_partition_id( - handle, edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights); + handle, edgelist_majors, edgelist_minors, edgelist_weights); std::vector h_counts(counts.size()); raft::update_host(h_counts.data(), counts.data(), counts.size(), handle.get_stream()); @@ -283,68 +276,66 @@ coarsen_graph( for (int j = 0; j < col_comm_size; ++j) { auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( - edgelist_major_vertices.begin() + h_displacements[j], - edgelist_minor_vertices.begin() + h_displacements[j], + edgelist_majors.begin() + h_displacements[j], + edgelist_minors.begin() + h_displacements[j], edgelist_weights ? std::optional{(*edgelist_weights).data() + h_displacements[j]} : std::nullopt, h_counts[j], handle.get_stream()); - auto cur_size = coarsened_edgelist_major_vertices[j].size(); + auto cur_size = coarsened_edgelist_majors[j].size(); // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we // can reserve address space to avoid expensive reallocation. 
// https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management - coarsened_edgelist_major_vertices[j].resize(cur_size + number_of_partition_edges, - handle.get_stream()); - coarsened_edgelist_minor_vertices[j].resize(coarsened_edgelist_major_vertices[j].size(), - handle.get_stream()); + coarsened_edgelist_majors[j].resize(cur_size + number_of_partition_edges, + handle.get_stream()); + coarsened_edgelist_minors[j].resize(coarsened_edgelist_majors[j].size(), handle.get_stream()); if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights)[j].resize(coarsened_edgelist_major_vertices[j].size(), + (*coarsened_edgelist_weights)[j].resize(coarsened_edgelist_majors[j].size(), handle.get_stream()); - auto src_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(), - edgelist_minor_vertices.begin(), - (*edgelist_weights).begin())) + + auto input_edge_first = + thrust::make_zip_iterator(thrust::make_tuple( + edgelist_majors.begin(), edgelist_minors.begin(), (*edgelist_weights).begin())) + h_displacements[j]; - auto dst_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(), - coarsened_edgelist_minor_vertices[j].begin(), + auto output_edge_first = + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors[j].begin(), + coarsened_edgelist_minors[j].begin(), (*coarsened_edgelist_weights)[j].begin())) + cur_size; thrust::copy(handle.get_thrust_policy(), - src_edge_first, - src_edge_first + number_of_partition_edges, - dst_edge_first); + input_edge_first, + input_edge_first + number_of_partition_edges, + output_edge_first); } else { - auto src_edge_first = thrust::make_zip_iterator(thrust::make_tuple( - edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())) + - h_displacements[j]; - auto dst_edge_first = thrust::make_zip_iterator( - thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(), - coarsened_edgelist_minor_vertices[j].begin())) + - cur_size; + auto input_edge_first = thrust::make_zip_iterator(thrust::make_tuple( + edgelist_majors.begin(), edgelist_minors.begin())) + + h_displacements[j]; + auto output_edge_first = + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors[j].begin(), + coarsened_edgelist_minors[j].begin())) + + cur_size; thrust::copy(handle.get_thrust_policy(), - src_edge_first, - src_edge_first + number_of_partition_edges, - dst_edge_first); + input_edge_first, + input_edge_first + number_of_partition_edges, + output_edge_first); } } } - for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { + for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( - coarsened_edgelist_major_vertices[i].data(), - coarsened_edgelist_minor_vertices[i].data(), + coarsened_edgelist_majors[i].data(), + coarsened_edgelist_minors[i].data(), coarsened_edgelist_weights ? 
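// [Editor's note] A side observation on the resize()/shrink_to_fit() pairs used just above and
// throughout this series: rmm::device_uvector::resize() to a smaller size does not by itself
// release device memory, so the follow-up shrink_to_fit() is what actually returns the excess
// allocation to the pool -- in line with the patch series' focus on peak memory footprint.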
std::optional{(*coarsened_edgelist_weights)[i].data()} : std::nullopt, - static_cast(coarsened_edgelist_major_vertices[i].size()), + static_cast(coarsened_edgelist_majors[i].size()), handle.get_stream()); - coarsened_edgelist_major_vertices[i].resize(number_of_partition_edges, handle.get_stream()); - coarsened_edgelist_major_vertices[i].shrink_to_fit(handle.get_stream()); - coarsened_edgelist_minor_vertices[i].resize(number_of_partition_edges, handle.get_stream()); - coarsened_edgelist_minor_vertices[i].shrink_to_fit(handle.get_stream()); + coarsened_edgelist_majors[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_majors[i].shrink_to_fit(handle.get_stream()); + coarsened_edgelist_minors[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_minors[i].shrink_to_fit(handle.get_stream()); if (coarsened_edgelist_weights) { (*coarsened_edgelist_weights)[i].resize(number_of_partition_edges, handle.get_stream()); (*coarsened_edgelist_weights)[i].shrink_to_fit(handle.get_stream()); @@ -378,13 +369,13 @@ coarsen_graph( rmm::device_uvector renumber_map_labels(0, handle.get_stream()); renumber_meta_t meta{}; { - std::vector major_ptrs(coarsened_edgelist_major_vertices.size()); + std::vector major_ptrs(coarsened_edgelist_majors.size()); std::vector minor_ptrs(major_ptrs.size()); std::vector counts(major_ptrs.size()); - for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { - major_ptrs[i] = coarsened_edgelist_major_vertices[i].data(); - minor_ptrs[i] = coarsened_edgelist_minor_vertices[i].data(); - counts[i] = static_cast(coarsened_edgelist_major_vertices[i].size()); + for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { + major_ptrs[i] = coarsened_edgelist_majors[i].data(); + minor_ptrs[i] = coarsened_edgelist_minors[i].data(); + counts[i] = static_cast(coarsened_edgelist_majors[i].size()); } std::tie(renumber_map_labels, meta) = renumber_edgelist( handle, @@ -401,15 +392,15 @@ coarsen_graph( std::vector> edgelists{}; edgelists.resize(graph_view.get_number_of_local_adj_matrix_partitions()); for (size_t i = 0; i < edgelists.size(); ++i) { - edgelists[i].p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices[i].data() - : coarsened_edgelist_major_vertices[i].data(); - edgelists[i].p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices[i].data() - : coarsened_edgelist_minor_vertices[i].data(); + edgelists[i].p_src_vertices = + store_transposed ? coarsened_edgelist_minors[i].data() : coarsened_edgelist_majors[i].data(); + edgelists[i].p_dst_vertices = + store_transposed ? coarsened_edgelist_majors[i].data() : coarsened_edgelist_minors[i].data(); edgelists[i].p_edge_weights = coarsened_edgelist_weights ? 
std::optional{(*coarsened_edgelist_weights)[i].data()} : std::nullopt, - edgelists[i].number_of_edges = static_cast(coarsened_edgelist_major_vertices[i].size()); + edgelists[i].number_of_edges = static_cast(coarsened_edgelist_majors[i].size()); } return std::make_tuple( @@ -447,9 +438,7 @@ coarsen_graph( // currently, nothing to do } - auto [coarsened_edgelist_major_vertices, - coarsened_edgelist_minor_vertices, - coarsened_edgelist_weights] = + auto [coarsened_edgelist_majors, coarsened_edgelist_minors, coarsened_edgelist_weights] = decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( handle, matrix_partition_device_view_t( @@ -472,20 +461,20 @@ coarsen_graph( auto [renumber_map_labels, meta] = renumber_edgelist( handle, std::optional>{std::move(unique_labels)}, - coarsened_edgelist_major_vertices.data(), - coarsened_edgelist_minor_vertices.data(), - static_cast(coarsened_edgelist_major_vertices.size()), + coarsened_edgelist_majors.data(), + coarsened_edgelist_minors.data(), + static_cast(coarsened_edgelist_majors.size()), do_expensive_check); edgelist_t edgelist{}; - edgelist.p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices.data() - : coarsened_edgelist_major_vertices.data(); - edgelist.p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices.data() - : coarsened_edgelist_minor_vertices.data(); + edgelist.p_src_vertices = + store_transposed ? coarsened_edgelist_minors.data() : coarsened_edgelist_majors.data(); + edgelist.p_dst_vertices = + store_transposed ? coarsened_edgelist_majors.data() : coarsened_edgelist_minors.data(); edgelist.p_edge_weights = coarsened_edgelist_weights ? std::optional{(*coarsened_edgelist_weights).data()} : std::nullopt; - edgelist.number_of_edges = static_cast(coarsened_edgelist_major_vertices.size()); + edgelist.number_of_edges = static_cast(coarsened_edgelist_majors.size()); return std::make_tuple( std::make_unique>( From 85f39fb9840a92719849bc1b064c1a871758a393 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 16 Feb 2022 17:31:17 -0800 Subject: [PATCH 35/60] update louvain tests to take --perf option --- cpp/tests/community/louvain_test.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/cpp/tests/community/louvain_test.cpp b/cpp/tests/community/louvain_test.cpp index b86cfdee5c6..364a0b8a68e 100644 --- a/cpp/tests/community/louvain_test.cpp +++ b/cpp/tests/community/louvain_test.cpp @@ -9,6 +9,7 @@ * */ #include +#include #include #include @@ -90,15 +91,28 @@ class Tests_Louvain auto [louvain_usecase, input_usecase] = param; raft::handle_t handle{}; + HighResClock hr_clock{}; // Can't currently check correctness if we renumber bool renumber = true; if (louvain_usecase.check_correctness_) renumber = false; + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } + auto [graph, d_renumber_map_labels] = cugraph::test::construct_graph( handle, input_usecase, true, renumber); + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "construct_graph took " << elapsed_time * 1e-6 << " s.\n"; + } + auto graph_view = graph.view(); // "FIXME": remove this check once we drop support for Pascal @@ -109,6 +123,11 @@ class Tests_Louvain cudaDeviceProp device_prop; RAFT_CUDA_TRY(cudaGetDeviceProperties(&device_prop, 0)); + if (cugraph::test::g_perf) { + 
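// [Editor's note] The --perf timing pattern added in this patch brackets each measured region
// with cudaDeviceSynchronize(): kernel launches are asynchronous, so synchronizing before
// hr_clock.start() and again before hr_clock.stop() makes the host-side clock cover exactly the
// device work issued in between. The `* 1e-6` in the printouts suggests HighResClock reports
// elapsed time in microseconds.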
RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } + if (device_prop.major < 7) { EXPECT_THROW(louvain(graph_view, graph_view.get_number_of_local_vertices(), @@ -123,6 +142,13 @@ class Tests_Louvain louvain_usecase.expected_level_, louvain_usecase.expected_modularity_); } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "Louvain took " << elapsed_time * 1e-6 << " s.\n"; + } } template From 48009a6a8fb430e40e23d841ab6a2e79157ee7cf Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 01:58:00 -0800 Subject: [PATCH 36/60] fix error in comments --- cpp/include/cugraph/detail/decompress_matrix_partition.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cugraph/detail/decompress_matrix_partition.cuh b/cpp/include/cugraph/detail/decompress_matrix_partition.cuh index aa9b2897075..b419d4bdbb3 100644 --- a/cpp/include/cugraph/detail/decompress_matrix_partition.cuh +++ b/cpp/include/cugraph/detail/decompress_matrix_partition.cuh @@ -184,9 +184,9 @@ template void decompress_matrix_partition_to_edgelist( raft::handle_t const& handle, matrix_partition_device_view_t const matrix_partition, - vertex_t* edgelist_majors /* [INOUT] */, - vertex_t* edgelist_minors /* [INOUT] */, - std::optional edgelist_weights /* [INOUT] */, + vertex_t* edgelist_majors /* [OUT] */, + vertex_t* edgelist_minors /* [OUT] */, + std::optional edgelist_weights /* [OUT] */, std::optional> const& segment_offsets) { auto number_of_edges = matrix_partition.get_number_of_edges(); From 59d16a3ebdae73c118f812db9ea96a4757f35e8f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 01:58:36 -0800 Subject: [PATCH 37/60] add is_first_in_run_pair_t to graph_utils.cuh --- cpp/include/cugraph/detail/graph_utils.cuh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/include/cugraph/detail/graph_utils.cuh b/cpp/include/cugraph/detail/graph_utils.cuh index 254744d11d9..a5006c711fe 100644 --- a/cpp/include/cugraph/detail/graph_utils.cuh +++ b/cpp/include/cugraph/detail/graph_utils.cuh @@ -86,5 +86,14 @@ struct is_first_in_run_t { } }; +template +struct is_first_in_run_pair_t { + PairIterator pair_first{}; + __device__ bool operator()(size_t i) const + { + return (i == 0) || (*(pair_first + (i - 1)) != *(pair_first + i)); + } +}; + } // namespace detail } // namespace cugraph From b7d9570fd88f8284cecc8c5b7defe13cd35cc6b4 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 02:00:34 -0800 Subject: [PATCH 38/60] add R-mat symmetric cases to coarsen_graph tests --- cpp/tests/structure/coarsen_graph_test.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/tests/structure/coarsen_graph_test.cpp b/cpp/tests/structure/coarsen_graph_test.cpp index dedcb2a718d..dc9298813be 100644 --- a/cpp/tests/structure/coarsen_graph_test.cpp +++ b/cpp/tests/structure/coarsen_graph_test.cpp @@ -433,7 +433,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Combine( // enable correctness checks ::testing::Values(CoarsenGraph_Usecase{0.2, false}, CoarsenGraph_Usecase{0.2, true}), - ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false), + cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true, false)))); INSTANTIATE_TEST_SUITE_P( 
file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with @@ -457,6 +458,7 @@ INSTANTIATE_TEST_SUITE_P( // disable correctness checks for large graphs ::testing::Values(CoarsenGraph_Usecase{0.2, false, false}, CoarsenGraph_Usecase{0.2, true, false}), - ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false), + cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false)))); CUGRAPH_TEST_PROGRAM_MAIN() From 0a0c6c1fbc03b2da6cbf4056069419d99903ddae Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 02:05:57 -0800 Subject: [PATCH 39/60] if the input graph is symmetric, update coarsen_graph to work on the lower triangular part only and symmetrize to avoid (slightly) asymmetric edge weights due to the limited floating point precision --- cpp/src/structure/coarsen_graph_impl.cuh | 486 ++++++++++++++++++----- 1 file changed, 376 insertions(+), 110 deletions(-) diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index ec01135f7ae..0611d642cb3 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -45,46 +45,66 @@ namespace cugraph { namespace { +template +struct is_not_lower_triangular_t { + __device__ bool operator()(EdgeTupleType e) const + { + return thrust::get<0>(e) < thrust::get<1>(e); + } +}; + +template +struct is_not_self_loop_t { + __device__ bool operator()(EdgeTupleType e) const + { + return thrust::get<0>(e) != thrust::get<1>(e); + } +}; + template edge_t groupby_e_and_coarsen_edgelist(vertex_t* edgelist_majors /* [INOUT] */, vertex_t* edgelist_minors /* [INOUT] */, std::optional edgelist_weights /* [INOUT] */, edge_t number_of_edges, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors, edgelist_minors)); if (edgelist_weights) { thrust::sort_by_key( - rmm::exec_policy(stream), pair_first, pair_first + number_of_edges, *edgelist_weights); - - rmm::device_uvector tmp_edgelist_majors(number_of_edges, stream); - rmm::device_uvector tmp_edgelist_minors(tmp_edgelist_majors.size(), stream); - rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_majors.size(), stream); - auto it = thrust::reduce_by_key(rmm::exec_policy(stream), - pair_first, - pair_first + number_of_edges, - (*edgelist_weights), - thrust::make_zip_iterator(thrust::make_tuple( - tmp_edgelist_majors.begin(), tmp_edgelist_minors.begin())), - tmp_edgelist_weights.begin()); - auto ret = - static_cast(thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it))); + rmm::exec_policy(stream_view), pair_first, pair_first + number_of_edges, *edgelist_weights); + + auto num_uniques = + thrust::count_if(rmm::exec_policy(stream_view), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(static_cast(number_of_edges)), + detail::is_first_in_run_pair_t{pair_first}); + + rmm::device_uvector tmp_edgelist_majors(num_uniques, stream_view); + rmm::device_uvector tmp_edgelist_minors(tmp_edgelist_majors.size(), stream_view); + rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_majors.size(), stream_view); + thrust::reduce_by_key(rmm::exec_policy(stream_view), + pair_first, + pair_first + number_of_edges, + (*edgelist_weights), + thrust::make_zip_iterator(thrust::make_tuple( + tmp_edgelist_majors.begin(), tmp_edgelist_minors.begin())),
tmp_edgelist_weights.begin()); auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( tmp_edgelist_majors.begin(), tmp_edgelist_minors.begin(), tmp_edgelist_weights.begin())); - thrust::copy(rmm::exec_policy(stream), + thrust::copy(rmm::exec_policy(stream_view), edge_first, - edge_first + ret, + edge_first + num_uniques, thrust::make_zip_iterator( thrust::make_tuple(edgelist_majors, edgelist_minors, *edgelist_weights))); - return ret; + return num_uniques; } else { - thrust::sort(rmm::exec_policy(stream), pair_first, pair_first + number_of_edges); + thrust::sort(rmm::exec_policy(stream_view), pair_first, pair_first + number_of_edges); return static_cast(thrust::distance( pair_first, - thrust::unique(rmm::exec_policy(stream), pair_first, pair_first + number_of_edges))); + thrust::unique(rmm::exec_policy(stream_view), pair_first, pair_first + number_of_edges))); } } @@ -96,12 +116,24 @@ template , rmm::device_uvector, std::optional>> -decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( - raft::handle_t const& handle, - matrix_partition_device_view_t const matrix_partition, - vertex_t const* major_label_first, - AdjMatrixMinorLabelInputWrapper const minor_label_input, - std::optional> const& segment_offsets) +decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist(raft::handle_t const& + handle, + matrix_partition_device_view_t< + vertex_t, + edge_t, + weight_t, + multi_gpu> const + matrix_partition, + vertex_t const* + major_label_first, + AdjMatrixMinorLabelInputWrapper const + minor_label_input, + std::optional< + std::vector< + vertex_t>> const& + segment_offsets, + bool + lower_triangular_only) { static_assert(std::is_same_v); @@ -138,6 +170,41 @@ decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( minor_label_input.get(thrust::get<1>(val) - minor_first)); }); + if (lower_triangular_only) { + if (edgelist_weights) { + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + edgelist_majors.begin(), edgelist_minors.begin(), (*edgelist_weights).begin())); + edgelist_majors.resize( + thrust::distance( + edge_first, + thrust::remove_if( + handle.get_thrust_policy(), + edge_first, + edge_first + edgelist_majors.size(), + is_not_lower_triangular_t>{})), + handle.get_stream()); + edgelist_majors.shrink_to_fit(handle.get_stream()); + edgelist_minors.resize(edgelist_majors.size(), handle.get_stream()); + edgelist_minors.shrink_to_fit(handle.get_stream()); + (*edgelist_weights).resize(edgelist_majors.size(), handle.get_stream()); + (*edgelist_weights).shrink_to_fit(handle.get_stream()); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_majors.begin(), edgelist_minors.begin())); + edgelist_majors.resize( + thrust::distance( + edge_first, + thrust::remove_if(handle.get_thrust_policy(), + edge_first, + edge_first + edgelist_majors.size(), + is_not_lower_triangular_t>{})), + handle.get_stream()); + edgelist_majors.shrink_to_fit(handle.get_stream()); + edgelist_minors.resize(edgelist_majors.size(), handle.get_stream()); + edgelist_minors.shrink_to_fit(handle.get_stream()); + } + } + auto number_of_edges = groupby_e_and_coarsen_edgelist( edgelist_majors.data(), edgelist_minors.data(), @@ -191,7 +258,11 @@ coarsen_graph( // currently, nothing to do } - // 1. construct coarsened edge list + // 1. 
construct coarsened edge lists from each local partition (if the input graph is symmetric, + // start with only the lower triangular edges in the original graph, this is to prevent edge + // weights in the coarsened graph becoming asymmetric due to limited floating point resolution) + + bool lower_triangular_only = graph_view.is_symmetric(); std::conditional_t< store_transposed, @@ -216,17 +287,6 @@ coarsen_graph( if (coarsened_edgelist_weights) { (*coarsened_edgelist_weights).reserve(coarsened_edgelist_majors.size()); } - for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - coarsened_edgelist_majors.emplace_back(0, handle.get_stream()); - coarsened_edgelist_minors.emplace_back(0, handle.get_stream()); - if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights).emplace_back(0, handle.get_stream()); - } - } - // FIXME: we may compare performance/memory footprint with the hash_based approach especially when - // cuco::dynamic_map becomes available (so we don't need to preallocate memory assuming the worst - // case). We may be able to limit the memory requirement close to the final coarsened edgelist - // with the hash based approach. for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { // 1-1. locally construct coarsened edge list @@ -248,7 +308,8 @@ coarsen_graph( graph_view.get_matrix_partition_view(i)), major_labels.data(), adj_matrix_minor_labels.device_view(), - graph_view.get_local_adj_matrix_partition_segment_offsets(i)); + graph_view.get_local_adj_matrix_partition_segment_offsets(i), + lower_triangular_only); // 1-2. globally shuffle @@ -258,91 +319,236 @@ coarsen_graph( std::move(edgelist_minors), std::move(edgelist_weights)); - // 1-3. append data to local adjacency matrix partitions + // 1-3. groupby and coarsen again - // FIXME: we can skip this if groupby_gpuid_and_shuffle_values is updated to return sorted edge - // list based on the final matrix partition (maybe add - // groupby_adj_matrix_partition_and_shuffle_values). + auto coarsened_size = groupby_e_and_coarsen_edgelist( + edgelist_majors.data(), + edgelist_minors.data(), + edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, + edgelist_majors.size(), + handle.get_stream()); + edgelist_majors.resize(coarsened_size, handle.get_stream()); + edgelist_majors.shrink_to_fit(handle.get_stream()); + edgelist_minors.resize(edgelist_majors.size(), handle.get_stream()); + edgelist_minors.shrink_to_fit(handle.get_stream()); + if (edgelist_weights) { + (*edgelist_weights).resize(edgelist_majors.size(), handle.get_stream()); + (*edgelist_weights).shrink_to_fit(handle.get_stream()); + } - auto counts = cugraph::detail::groupby_and_count_edgelist_by_local_partition_id( - handle, edgelist_majors, edgelist_minors, edgelist_weights); + coarsened_edgelist_majors.push_back(std::move(edgelist_majors)); + coarsened_edgelist_minors.push_back(std::move(edgelist_minors)); + if (edgelist_weights) { (*coarsened_edgelist_weights).push_back(std::move(*edgelist_weights)); } - std::vector h_counts(counts.size()); - raft::update_host(h_counts.data(), counts.data(), counts.size(), handle.get_stream()); - handle.sync_stream(); + // 2.
concatenate and groupby and coarsen again (and if the input graph is symmetric, create a + // copy excluding self loops and globally shuffle) - std::vector h_displacements(h_counts.size(), size_t{0}); - std::partial_sum(h_counts.begin(), h_counts.end() - 1, h_displacements.begin() + 1); + edge_t tot_count{0}; + for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { + tot_count += coarsened_edgelist_majors[i].size(); + } - for (int j = 0; j < col_comm_size; ++j) { - auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( - edgelist_majors.begin() + h_displacements[j], - edgelist_minors.begin() + h_displacements[j], - edgelist_weights ? std::optional{(*edgelist_weights).data() + h_displacements[j]} - : std::nullopt, - h_counts[j], - handle.get_stream()); + rmm::device_uvector concatenated_edgelist_majors(tot_count, handle.get_stream()); + size_t major_offset{0}; + for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + coarsened_edgelist_majors[i].begin(), + coarsened_edgelist_majors[i].end(), + concatenated_edgelist_majors.begin() + major_offset); + major_offset += coarsened_edgelist_majors[i].size(); + coarsened_edgelist_majors[i].resize(0, handle.get_stream()); + coarsened_edgelist_majors[i].shrink_to_fit(handle.get_stream()); + } - auto cur_size = coarsened_edgelist_majors[j].size(); - // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we - // can reserve address space to avoid expensive reallocation. - // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management - coarsened_edgelist_majors[j].resize(cur_size + number_of_partition_edges, - handle.get_stream()); - coarsened_edgelist_minors[j].resize(coarsened_edgelist_majors[j].size(), handle.get_stream()); + rmm::device_uvector concatenated_edgelist_minors(tot_count, handle.get_stream()); + size_t minor_offset{0}; + for (size_t i = 0; i < coarsened_edgelist_minors.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + coarsened_edgelist_minors[i].begin(), + coarsened_edgelist_minors[i].end(), + concatenated_edgelist_minors.begin() + minor_offset); + minor_offset += coarsened_edgelist_minors[i].size(); + coarsened_edgelist_minors[i].resize(0, handle.get_stream()); + coarsened_edgelist_minors[i].shrink_to_fit(handle.get_stream()); + } - if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights)[j].resize(coarsened_edgelist_majors[j].size(), - handle.get_stream()); + std::optional> concatenated_edgelist_weights{std::nullopt}; + if (coarsened_edgelist_weights) { + concatenated_edgelist_weights = rmm::device_uvector(tot_count, handle.get_stream()); + size_t weight_offset{0}; + for (size_t i = 0; i < (*coarsened_edgelist_weights).size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + (*coarsened_edgelist_weights)[i].begin(), + (*coarsened_edgelist_weights)[i].end(), + (*concatenated_edgelist_weights).begin() + weight_offset); + weight_offset += (*coarsened_edgelist_weights)[i].size(); + (*coarsened_edgelist_weights)[i].resize(0, handle.get_stream()); + (*coarsened_edgelist_weights)[i].shrink_to_fit(handle.get_stream()); + } + } - auto input_edge_first = - thrust::make_zip_iterator(thrust::make_tuple( - edgelist_majors.begin(), edgelist_minors.begin(), (*edgelist_weights).begin())) + - h_displacements[j]; - auto output_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors[j].begin(), - coarsened_edgelist_minors[j].begin(), - (*coarsened_edgelist_weights)[j].begin())) 
+ - cur_size; - thrust::copy(handle.get_thrust_policy(), - input_edge_first, - input_edge_first + number_of_partition_edges, - output_edge_first); - } else { - auto input_edge_first = thrust::make_zip_iterator(thrust::make_tuple( - edgelist_majors.begin(), edgelist_minors.begin())) + - h_displacements[j]; - auto output_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors[j].begin(), - coarsened_edgelist_minors[j].begin())) + - cur_size; - thrust::copy(handle.get_thrust_policy(), - input_edge_first, - input_edge_first + number_of_partition_edges, - output_edge_first); - } + auto concatenated_and_coarsened_size = groupby_e_and_coarsen_edgelist( + concatenated_edgelist_majors.data(), + concatenated_edgelist_minors.data(), + concatenated_edgelist_weights + ? std::optional{(*concatenated_edgelist_weights).data()} + : std::nullopt, + concatenated_edgelist_majors.size(), + handle.get_stream()); + concatenated_edgelist_majors.resize(concatenated_and_coarsened_size, handle.get_stream()); + concatenated_edgelist_majors.shrink_to_fit(handle.get_stream()); + concatenated_edgelist_minors.resize(concatenated_edgelist_majors.size(), handle.get_stream()); + concatenated_edgelist_minors.shrink_to_fit(handle.get_stream()); + if (concatenated_edgelist_weights) { + (*concatenated_edgelist_weights) + .resize(concatenated_edgelist_majors.size(), handle.get_stream()); + (*concatenated_edgelist_weights).shrink_to_fit(handle.get_stream()); + } + + std::optional> reversed_edgelist_majors{std::nullopt}; + std::optional> reversed_edgelist_minors{std::nullopt}; + std::optional> reversed_edgelist_weights{std::nullopt}; + if (lower_triangular_only) { + if (concatenated_edgelist_weights) { + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(concatenated_edgelist_majors.begin(), + concatenated_edgelist_minors.begin(), + (*concatenated_edgelist_weights).begin())); + auto last = + thrust::partition(handle.get_thrust_policy(), + edge_first, + edge_first + concatenated_edgelist_majors.size(), + is_not_self_loop_t>{}); + reversed_edgelist_majors = + rmm::device_uvector(thrust::distance(edge_first, last), handle.get_stream()); + reversed_edgelist_minors = + rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + reversed_edgelist_weights = + rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), + edge_first, + edge_first + (*reversed_edgelist_majors).size(), + thrust::make_zip_iterator(thrust::make_tuple((*reversed_edgelist_minors).begin(), + (*reversed_edgelist_majors).begin(), + (*reversed_edgelist_weights).begin()))); + } else { + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + concatenated_edgelist_majors.begin(), concatenated_edgelist_minors.begin())); + auto last = thrust::partition(handle.get_thrust_policy(), + edge_first, + edge_first + concatenated_edgelist_majors.size(), + is_not_self_loop_t>{}); + reversed_edgelist_majors = + rmm::device_uvector(thrust::distance(edge_first, last), handle.get_stream()); + reversed_edgelist_minors = + rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edge_first, + edge_first + (*reversed_edgelist_majors).size(), + thrust::make_zip_iterator(thrust::make_tuple( + (*reversed_edgelist_minors).begin(), (*reversed_edgelist_majors).begin()))); } + + std::tie(*reversed_edgelist_majors, *reversed_edgelist_minors, reversed_edgelist_weights) = + 
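// [Editor's note] The shuffle_edgelist_by_gpu_id call that follows is what makes the
// lower-triangular trick work in multi-GPU: an edge's owning GPU is a function of its
// (major, minor) endpoint pair, so the reversed (minor, major) copies created above generally
// land on a different GPU than the originals and must be redistributed before being split back
// into the local partitions.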
cugraph::detail::shuffle_edgelist_by_gpu_id(handle, + std::move(*reversed_edgelist_majors), + std::move(*reversed_edgelist_minors), + std::move(reversed_edgelist_weights)); + } + + // 3. split concatenated edge list to local partitions + + auto concatenated_counts = + groupby_and_count_edgelist_by_local_partition_id(handle, + concatenated_edgelist_majors, + concatenated_edgelist_minors, + concatenated_edgelist_weights); + + std::vector h_concatenated_counts(concatenated_counts.size()); + raft::update_host(h_concatenated_counts.data(), + concatenated_counts.data(), + concatenated_counts.size(), + handle.get_stream()); + + std::optional> h_reversed_counts{std::nullopt}; + if (reversed_edgelist_majors) { + auto reversed_counts = groupby_and_count_edgelist_by_local_partition_id( + handle, *reversed_edgelist_majors, *reversed_edgelist_minors, reversed_edgelist_weights); + + h_reversed_counts = std::vector(reversed_counts.size()); + raft::update_host((*h_reversed_counts).data(), + reversed_counts.data(), + reversed_counts.size(), + handle.get_stream()); + } + + handle.sync_stream(); + + std::vector h_concatenated_displacements(h_concatenated_counts.size(), size_t{0}); + std::partial_sum(h_concatenated_counts.begin(), + h_concatenated_counts.end() - 1, + h_concatenated_displacements.begin() + 1); + + std::optional> h_reversed_displacements{std::nullopt}; + if (h_reversed_counts) { + h_reversed_displacements = std::vector((*h_reversed_counts).size(), size_t{0}); + std::partial_sum((*h_reversed_counts).begin(), + (*h_reversed_counts).end() - 1, + (*h_reversed_displacements).begin() + 1); } for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { - auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( - coarsened_edgelist_majors[i].data(), - coarsened_edgelist_minors[i].data(), - coarsened_edgelist_weights ? std::optional{(*coarsened_edgelist_weights)[i].data()} - : std::nullopt, - static_cast(coarsened_edgelist_majors[i].size()), + coarsened_edgelist_majors[i].resize( + h_concatenated_counts[i] + (h_reversed_counts ? 
(*h_reversed_counts)[i] : size_t{0}), handle.get_stream()); - coarsened_edgelist_majors[i].resize(number_of_partition_edges, handle.get_stream()); - coarsened_edgelist_majors[i].shrink_to_fit(handle.get_stream()); - coarsened_edgelist_minors[i].resize(number_of_partition_edges, handle.get_stream()); - coarsened_edgelist_minors[i].shrink_to_fit(handle.get_stream()); + coarsened_edgelist_minors[i].resize(coarsened_edgelist_majors[i].size(), handle.get_stream()); if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights)[i].resize(number_of_partition_edges, handle.get_stream()); - (*coarsened_edgelist_weights)[i].shrink_to_fit(handle.get_stream()); + (*coarsened_edgelist_weights)[i].resize(coarsened_edgelist_majors[i].size(), + handle.get_stream()); + } + + thrust::copy(handle.get_thrust_policy(), + concatenated_edgelist_majors.begin() + h_concatenated_displacements[i], + concatenated_edgelist_majors.begin() + + (h_concatenated_displacements[i] + h_concatenated_counts[i]), + coarsened_edgelist_majors[i].begin()); + thrust::copy(handle.get_thrust_policy(), + concatenated_edgelist_minors.begin() + h_concatenated_displacements[i], + concatenated_edgelist_minors.begin() + + (h_concatenated_displacements[i] + h_concatenated_counts[i]), + coarsened_edgelist_minors[i].begin()); + if (coarsened_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*concatenated_edgelist_weights).begin() + h_concatenated_displacements[i], + (*concatenated_edgelist_weights).begin() + + (h_concatenated_displacements[i] + h_concatenated_counts[i]), + (*coarsened_edgelist_weights)[i].begin()); + } + + if (reversed_edgelist_majors) { + thrust::copy(handle.get_thrust_policy(), + (*reversed_edgelist_majors).begin() + (*h_reversed_displacements)[i], + (*reversed_edgelist_majors).begin() + + ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), + coarsened_edgelist_majors[i].begin() + h_concatenated_counts[i]); + thrust::copy(handle.get_thrust_policy(), + (*reversed_edgelist_minors).begin() + (*h_reversed_displacements)[i], + (*reversed_edgelist_minors).begin() + + ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), + coarsened_edgelist_minors[i].begin() + h_concatenated_counts[i]); + if (coarsened_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*reversed_edgelist_weights).begin() + (*h_reversed_displacements)[i], + (*reversed_edgelist_weights).begin() + + ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), + (*coarsened_edgelist_weights)[i].begin() + h_concatenated_counts[i]); + } } } - // 3. find unique labels for this GPU + // 4. find unique labels for this GPU rmm::device_uvector unique_labels(graph_view.get_number_of_local_vertices(), handle.get_stream()); @@ -364,7 +570,7 @@ coarsen_graph( thrust::unique(handle.get_thrust_policy(), unique_labels.begin(), unique_labels.end())), handle.get_stream()); - // 4. renumber + // 5. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); renumber_meta_t meta{}; @@ -387,7 +593,7 @@ coarsen_graph( do_expensive_check); } - // 5. build a graph + // 6. 
build a graph std::vector> edgelists{}; edgelists.resize(graph_view.get_number_of_local_adj_matrix_partitions()); @@ -438,6 +644,8 @@ coarsen_graph( // currently, nothing to do } + bool lower_triangular_only = graph_view.is_symmetric(); + auto [coarsened_edgelist_majors, coarsened_edgelist_minors, coarsened_edgelist_weights] = decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( handle, @@ -445,7 +653,65 @@ coarsen_graph( graph_view.get_matrix_partition_view()), labels, detail::minor_properties_device_view_t(labels), - graph_view.get_local_adj_matrix_partition_segment_offsets(0)); + graph_view.get_local_adj_matrix_partition_segment_offsets(0), + lower_triangular_only); + + if (lower_triangular_only) { + if (coarsened_edgelist_weights) { + std::cout << "lower_triangular weighted" << std::endl; + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors.begin(), + coarsened_edgelist_minors.begin(), + (*coarsened_edgelist_weights).begin())); + auto last = + thrust::partition(handle.get_thrust_policy(), + edge_first, + edge_first + coarsened_edgelist_majors.size(), + is_not_self_loop_t>{}); + + auto cur_size = coarsened_edgelist_majors.size(); + auto reversed_size = static_cast(thrust::distance(edge_first, last)); + + coarsened_edgelist_majors.resize(cur_size + reversed_size, handle.get_stream()); + coarsened_edgelist_minors.resize(coarsened_edgelist_majors.size(), handle.get_stream()); + (*coarsened_edgelist_weights).resize(coarsened_edgelist_majors.size(), handle.get_stream()); + + edge_first = + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors.begin(), + coarsened_edgelist_minors.begin(), + (*coarsened_edgelist_weights).begin())); + thrust::copy( + handle.get_thrust_policy(), + edge_first, + edge_first + reversed_size, + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_minors.begin(), + coarsened_edgelist_majors.begin(), + (*coarsened_edgelist_weights).begin())) + + cur_size); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(coarsened_edgelist_majors.begin(), coarsened_edgelist_minors.begin())); + auto last = thrust::partition(handle.get_thrust_policy(), + edge_first, + edge_first + coarsened_edgelist_majors.size(), + is_not_self_loop_t>{}); + + auto cur_size = coarsened_edgelist_majors.size(); + auto reversed_size = static_cast(thrust::distance(edge_first, last)); + + coarsened_edgelist_majors.resize(cur_size + reversed_size, handle.get_stream()); + coarsened_edgelist_minors.resize(coarsened_edgelist_majors.size(), handle.get_stream()); + + edge_first = thrust::make_zip_iterator( + thrust::make_tuple(coarsened_edgelist_majors.begin(), coarsened_edgelist_minors.begin())); + thrust::copy(handle.get_thrust_policy(), + edge_first, + edge_first + reversed_size, + thrust::make_zip_iterator(thrust::make_tuple( + coarsened_edgelist_minors.begin(), coarsened_edgelist_majors.begin())) + + cur_size); + } + } rmm::device_uvector unique_labels(graph_view.get_number_of_vertices(), handle.get_stream()); From 86f1d110d92ddbf12d0b6319ab72f76b87b13ba1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 02:19:02 -0800 Subject: [PATCH 40/60] clang-format --- cpp/src/structure/coarsen_graph_impl.cuh | 25 +++++++----------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 0611d642cb3..02e8529d108 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ 
b/cpp/src/structure/coarsen_graph_impl.cuh @@ -116,24 +116,13 @@ template , rmm::device_uvector, std::optional>> -decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist(raft::handle_t const& - handle, - matrix_partition_device_view_t< - vertex_t, - edge_t, - weight_t, - multi_gpu> const - matrix_partition, - vertex_t const* - major_label_first, - AdjMatrixMinorLabelInputWrapper const - minor_label_input, - std::optional< - std::vector< - vertex_t>> const& - segment_offsets, - bool - lower_triangular_only) +decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( + raft::handle_t const& handle, + matrix_partition_device_view_t const matrix_partition, + vertex_t const* major_label_first, + AdjMatrixMinorLabelInputWrapper const minor_label_input, + std::optional> const& segment_offsets, + bool lower_triangular_only) { static_assert(std::is_same_v); From 75d6f81f6acd83e6a0b195f4a9c48d54fe45cad9 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 10:05:36 -0800 Subject: [PATCH 41/60] fix comments --- cpp/src/structure/coarsen_graph_impl.cuh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 02e8529d108..39ebaab8d31 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -248,8 +248,8 @@ coarsen_graph( } // 1. construct coarsened edge lists from each local partition (if the input graph is symmetric, - // start with only the lower triangular edges in the original graph, this is to prevent edge - // weights in the coarsened graph becoming asymmetric due to limited floating point resolution) + // start with only the lower triangular edges after relabeling, this is to prevent edge weights in + // the coarsened graph becoming asymmetric due to limited floating point resolution) bool lower_triangular_only = graph_view.is_symmetric(); @@ -647,7 +647,6 @@ coarsen_graph( if (coarsened_edgelist_weights) { - std::cout << "lower_triangular weighted" << std::endl; auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors.begin(), coarsened_edgelist_minors.begin(), From 29e7b84484fd48e029ff3e400ef5d965aba7f834 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 10:29:24 -0800 Subject: [PATCH 42/60] fix to make merge with enh_mg_louvain PR easier --- cpp/include/cugraph/detail/graph_utils.cuh | 7 ++++--- cpp/src/structure/coarsen_graph_impl.cuh | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/include/cugraph/detail/graph_utils.cuh b/cpp/include/cugraph/detail/graph_utils.cuh index a5006c711fe..8cd1eced921 100644 --- a/cpp/include/cugraph/detail/graph_utils.cuh +++ b/cpp/include/cugraph/detail/graph_utils.cuh @@ -86,12 +86,13 @@ struct is_first_in_run_t { } }; -template +template struct is_first_in_run_pair_t { - PairIterator pair_first{}; + vertex_t const* vertices0{nullptr}; + vertex_t const* vertices1{nullptr}; __device__ bool operator()(size_t i) const { - return (i == 0) || (*(pair_first + (i - 1)) != *(pair_first + i)); + return (i == 0) || ((vertices0[i - 1] != vertices0[i]) || (vertices1[i - 1] != vertices1[i])); } }; } // namespace detail } // namespace cugraph diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 39ebaab8d31..93eb35b437c 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -78,7 +78,7 @@ edge_t
groupby_e_and_coarsen_edgelist(vertex_t* edgelist_majors /* [INOUT] */, thrust::count_if(rmm::exec_policy(stream_view), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(static_cast(number_of_edges)), - detail::is_first_in_run_pair_t{pair_first}); + detail::is_first_in_run_pair_t{edgelist_majors, edgelist_minors}); rmm::device_uvector tmp_edgelist_majors(num_uniques, stream_view); rmm::device_uvector tmp_edgelist_minors(tmp_edgelist_majors.size(), stream_view); From 57a084332cd58e57cf6c6550d58a02667989163f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 10:31:05 -0800 Subject: [PATCH 43/60] copyright --- cpp/include/cugraph/detail/decompress_matrix_partition.cuh | 2 +- cpp/src/structure/coarsen_graph_impl.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cugraph/detail/decompress_matrix_partition.cuh b/cpp/include/cugraph/detail/decompress_matrix_partition.cuh index b419d4bdbb3..ac8864d7e8f 100644 --- a/cpp/include/cugraph/detail/decompress_matrix_partition.cuh +++ b/cpp/include/cugraph/detail/decompress_matrix_partition.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 93eb35b437c..53b2193ac46 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
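[Editor's note] A short sketch, not part of the patch series, of the exact-sizing idiom that PATCH 39 introduced and PATCH 42 just reshaped: with an edge list already sorted by (major, minor), counting the positions where a new run of keys begins gives the exact number of distinct edges, so the reduce_by_key outputs in groupby_e_and_coarsen_edgelist can be allocated without overshoot. Assuming rmm::device_uvector<vertex_t> edgelist_majors and edgelist_minors of equal length (names mirror the hunks above; nothing beyond what they show is real API):

  auto num_uniques = thrust::count_if(
    handle.get_thrust_policy(),
    thrust::make_counting_iterator(size_t{0}),
    thrust::make_counting_iterator(edgelist_majors.size()),
    cugraph::detail::is_first_in_run_pair_t<vertex_t>{edgelist_majors.data(),
                                                      edgelist_minors.data()});
  // allocate the groupby output buffers at exactly num_uniques elements
  rmm::device_uvector<vertex_t> tmp_majors(num_uniques, handle.get_stream());

The switch from a single PairIterator to two raw vertex_t const* members keeps the functor's shape the same across branches, which is presumably what the commit message means by making the merge with the enh_mg_louvain PR easier.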
From 14c47d4265c87644d28ae37b1a638e838a5be3c2 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 17:39:22 -0800 Subject: [PATCH 44/60] remove one more memory footprint bottleneck in graph object creation --- cpp/src/structure/renumber_edgelist_impl.cuh | 100 ++++++++++--------- 1 file changed, 55 insertions(+), 45 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index aeb7682f440..2a3ed5df5df 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -214,6 +214,12 @@ std::tuple, std::vector> compute_renumbe auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); + auto constexpr num_chunks = size_t{ + 2}; // tuning parameter, this trades off # binary searches (up to num_chunks times more + // binary searches can be necessary if num_unique_majors << edgelist_edge_counts[i]) and + // temporary buffer requirement (cut by num_chunks times), currently set to 2 to avoid + // peak memory usage happening in this part (especially when col_comm_size is small) + assert(edgelist_majors.size() == col_comm_size); auto edge_partition_major_sizes = @@ -226,24 +232,22 @@ std::tuple, std::vector> compute_renumbe raft::comms::op_t::SUM, handle.get_stream()); // memory footprint vs parallelism trade-off - // peak memory requirement per loop is - // min( - // (E / (comm_size * col_comm_size)) * sizeof(vertex_t) * 2, - // (E / (comm_size * col_comm_size)) * sizeof(vertex_t) + - // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)), - // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)) * 2 - // ) - // and limit temporary memory requirement to (E / comm_size) * sizeof(vertex_t) * 2 + // peak memory requirement per loop is approximately + // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)) + + // (E / (comm_size * col_comm_size)) / num_chunks * sizeof(vertex_t) * 2 + + // std::min(V/P, (E / (comm_size * col_comm_size)) / num_chunks) * (sizeof(vertex_t) + + // sizeof(edge_t)) + // and limit temporary memory requirement to (E / comm_size) * sizeof(vertex_t) auto avg_vertex_degree = thrust::get<0>(vertex_edge_counts) > 0 ? static_cast(thrust::get<1>(vertex_edge_counts)) / static_cast(thrust::get<0>(vertex_edge_counts)) : double{0.0}; - auto num_streams = - std::min(static_cast(avg_vertex_degree * - (static_cast(sizeof(vertex_t)) / - static_cast(sizeof(vertex_t) + sizeof(edge_t)))), - static_cast( - std::min(static_cast(col_comm_size), handle.get_stream_pool_size()))); + auto num_streams = static_cast( + (avg_vertex_degree * sizeof(vertex_t)) / + (static_cast(sizeof(vertex_t) + sizeof(edge_t)) + + (((avg_vertex_degree / col_comm_size) / num_chunks) * sizeof(vertex_t) * 2) + + (std::min(1.0, ((avg_vertex_degree / col_comm_size) / num_chunks)) * + (sizeof(vertex_t) + sizeof(edge_t))))); if (num_streams >= 2) { stream_pool_indices = std::vector(num_streams); std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); @@ -256,28 +260,6 @@ std::tuple, std::vector> compute_renumbe ?
handle.get_stream_from_stream_pool(i % (*stream_pool_indices).size()) : handle.get_stream(); - rmm::device_uvector tmp_majors(edgelist_edge_counts[i], loop_stream); - thrust::copy(rmm::exec_policy(loop_stream), - edgelist_majors[i], - edgelist_majors[i] + edgelist_edge_counts[i], - tmp_majors.begin()); - thrust::sort(rmm::exec_policy(loop_stream), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = thrust::count_if(rmm::exec_policy(loop_stream), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - is_first_in_run_t{tmp_majors.data()}); - rmm::device_uvector tmp_keys(num_unique_majors, loop_stream); - rmm::device_uvector tmp_values(num_unique_majors, loop_stream); - thrust::reduce_by_key(rmm::exec_policy(loop_stream), - tmp_majors.begin(), - tmp_majors.end(), - thrust::make_constant_iterator(edge_t{1}), - tmp_keys.begin(), - tmp_values.begin()); - - tmp_majors.resize(0, loop_stream); - tmp_majors.shrink_to_fit(loop_stream); - rmm::device_uvector sorted_majors(edge_partition_major_sizes[i], loop_stream); device_bcast(col_comm, sorted_local_vertices.data(), @@ -292,15 +274,43 @@ std::tuple, std::vector> compute_renumbe sorted_major_degrees.end(), edge_t{0}); - auto kv_pair_first = - thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each( - rmm::exec_policy(loop_stream), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_set_degree_t{sorted_majors.data(), - static_cast(sorted_majors.size()), - sorted_major_degrees.data()}); + rmm::device_uvector tmp_majors( + (static_cast(edgelist_edge_counts[i]) + (num_chunks - 1)) / num_chunks, + handle.get_stream()); + size_t offset{0}; + for (size_t j = 0; j < num_chunks; ++j) { + size_t this_chunk_size = + std::min(tmp_majors.size(), static_cast(edgelist_edge_counts[i]) - offset); + thrust::copy(rmm::exec_policy(loop_stream), + edgelist_majors[i] + offset, + edgelist_majors[i] + offset + this_chunk_size, + tmp_majors.begin()); + thrust::sort( + rmm::exec_policy(loop_stream), tmp_majors.begin(), tmp_majors.begin() + this_chunk_size); + auto num_unique_majors = thrust::count_if(rmm::exec_policy(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(this_chunk_size), + is_first_in_run_t{tmp_majors.data()}); + rmm::device_uvector tmp_keys(num_unique_majors, loop_stream); + rmm::device_uvector tmp_values(num_unique_majors, loop_stream); + thrust::reduce_by_key(rmm::exec_policy(loop_stream), + tmp_majors.begin(), + tmp_majors.begin() + this_chunk_size, + thrust::make_constant_iterator(edge_t{1}), + tmp_keys.begin(), + tmp_values.begin()); + + auto kv_pair_first = + thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); + thrust::for_each( + rmm::exec_policy(loop_stream), + kv_pair_first, + kv_pair_first + tmp_keys.size(), + search_and_set_degree_t{sorted_majors.data(), + static_cast(sorted_majors.size()), + sorted_major_degrees.data()}); + offset += this_chunk_size; + } device_reduce(col_comm, sorted_major_degrees.begin(), From 550576bda3d90b4c1f9dab01bd9943506d899c51 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 17:45:09 -0800 Subject: [PATCH 45/60] add graph constructors taking edge list as R-value (so they can be destroyed as soon as they are no longer necessary to reduce peak memory usage) --- cpp/include/cugraph/graph.hpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp 
index cc711af663c..216552ec770 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -97,6 +97,13 @@ class graph_t meta, bool do_expensive_check = false); + graph_t(raft::handle_t const& handle, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + graph_meta_t meta, + bool do_expensive_check = false); + /** * @brief Symmetrize this graph. * @@ -264,6 +271,13 @@ class graph_t meta, bool do_expensive_check = false); + graph_t(raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + graph_meta_t meta, + bool do_expensive_check = false); + /** * @brief Symmetrize this graph. * From 09f30721041017ea0aa0b92ce5b4a4b905a0b337 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 18 Feb 2022 01:06:25 -0800 Subject: [PATCH 46/60] cosmetic fix --- cpp/include/cugraph/graph.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 216552ec770..f50a64665af 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -98,9 +98,9 @@ class graph_t>&& edgelist_srcs, - std::vector>&& edgelist_dsts, - std::optional>>&& edgelist_weights, + std::vector>&& edgelist_src_partitions, + std::vector>&& edgelist_dst_partitions, + std::optional>>&& edge_weight_partitions, graph_meta_t meta, bool do_expensive_check = false); From 2b9d37d07c175d730ef433bbd25b36976e80d101 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 18 Feb 2022 01:08:06 -0800 Subject: [PATCH 47/60] update coarsen_graph return type --- cpp/include/cugraph/graph_functions.hpp | 8 ++--- cpp/src/community/louvain.cuh | 5 +-- cpp/src/structure/coarsen_graph_mg.cu | 36 ++++++++-------------- cpp/src/structure/coarsen_graph_sg.cu | 36 ++++++++-------------- cpp/tests/structure/coarsen_graph_test.cpp | 2 +- 5 files changed, 32 insertions(+), 55 deletions(-) diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index c170ce65253..988dd152bd9 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -351,16 +351,16 @@ symmetrize_edgelist(raft::handle_t const& handle, * @param graph_view Graph view object of the input graph to be coarsened. * @param labels Vertex labels (assigned to this process in multi-GPU) to be used in coarsening. * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). - * @return std::tuple>, rmm::device_uvector> Tuple of the coarsened graph and labels mapped to the - * vertices (assigned to this process in multi-GPU) in the coarsened graph. + * @return std::tuple, + * rmm::device_uvector> Tuple of the coarsened graph and labels mapped to the vertices + * (assigned to this process in multi-GPU) in the coarsened graph. 
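 *
 * [editorial note, not part of this patch] With the return type change below, the coarsened
 * graph is returned by value rather than behind a std::unique_ptr; a caller can write
 * auto [coarse_graph, coarse_labels] = coarsen_graph(handle, graph_view, labels); and call
 * coarse_graph.view() directly instead of dereferencing a pointer first.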
*/ template -std::tuple>, +std::tuple, rmm::device_uvector> coarsen_graph( raft::handle_t const& handle, diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 094f3bc6546..16ccf872d0c 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -137,6 +137,7 @@ class Louvain { #endif handle_(handle), dendrogram_(std::make_unique>()), + current_graph_(handle), current_graph_view_(graph_view), cluster_keys_v_(0, handle.get_stream()), cluster_weights_v_(0, handle.get_stream()), @@ -559,7 +560,7 @@ class Louvain { std::tie(current_graph_, numbering_map) = coarsen_graph(handle_, current_graph_view_, dendrogram_->current_level_begin()); - current_graph_view_ = current_graph_->view(); + current_graph_view_ = current_graph_.view(); rmm::device_uvector numbering_indices(numbering_map.size(), handle_.get_stream()); thrust::sequence(handle_.get_thrust_policy(), @@ -589,7 +590,7 @@ class Louvain { // but as we shrink the graph we'll keep the // current graph here // - std::unique_ptr current_graph_{}; + graph_t current_graph_; graph_view_t current_graph_view_; rmm::device_uvector cluster_keys_v_; diff --git a/cpp/src/structure/coarsen_graph_mg.cu b/cpp/src/structure/coarsen_graph_mg.cu index 73ca9a82b06..95506878ef1 100644 --- a/cpp/src/structure/coarsen_graph_mg.cu +++ b/cpp/src/structure/coarsen_graph_mg.cu @@ -19,85 +19,73 @@ namespace cugraph { // MG instantiation -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool 
do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, diff --git a/cpp/src/structure/coarsen_graph_sg.cu b/cpp/src/structure/coarsen_graph_sg.cu index d32dd1c744d..6cc07420957 100644 --- a/cpp/src/structure/coarsen_graph_sg.cu +++ b/cpp/src/structure/coarsen_graph_sg.cu @@ -19,85 +19,73 @@ namespace cugraph { // SG instantiation -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, diff --git a/cpp/tests/structure/coarsen_graph_test.cpp b/cpp/tests/structure/coarsen_graph_test.cpp index dc9298813be..dec7c994a69 100644 --- a/cpp/tests/structure/coarsen_graph_test.cpp +++ b/cpp/tests/structure/coarsen_graph_test.cpp @@ -324,7 +324,7 @@ class Tests_CoarsenGraph handle.get_stream()); } - auto 
coarse_graph_view = coarse_graph->view(); + auto coarse_graph_view = coarse_graph.view(); std::vector h_coarse_offsets(coarse_graph_view.get_number_of_vertices() + 1); std::vector h_coarse_indices(coarse_graph_view.get_number_of_edges()); From 35fc8d10b41375f17e8cc3fcf1641f47b8016f4f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 18 Feb 2022 01:11:04 -0800 Subject: [PATCH 48/60] refactor graph constructors and implement graph constructors taking edge list in R-values --- cpp/src/structure/graph_impl.cuh | 940 ++++++++++++++++++++----------- 1 file changed, 609 insertions(+), 331 deletions(-) diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index eff76df8a79..924679cc48a 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -268,6 +268,399 @@ bool check_no_parallel_edge(raft::handle_t const& handle, } } +template +std::enable_if_t check_graph_constructor_input_arguments( + raft::handle_t const& handle, + std::vector> const& edgelists, + graph_meta_t meta, + bool do_expensive_check) +{ + // cheap error checks + + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + CUGRAPH_EXPECTS(edgelists.size() == static_cast(col_comm_size), + "Invalid input argument: erroneous edgelists.size()."); + CUGRAPH_EXPECTS( + !(meta.segment_offsets).has_value() || + ((*(meta.segment_offsets)).size() == + (detail::num_sparse_segments_per_vertex_partition + 1)) || + ((*(meta.segment_offsets)).size() == (detail::num_sparse_segments_per_vertex_partition + 2)), + "Invalid input argument: (*(meta.segment_offsets)).size() returns an invalid value."); + + auto is_weighted = edgelists[0].p_edge_weights.has_value(); + + CUGRAPH_EXPECTS( + std::any_of(edgelists.begin(), + edgelists.end(), + [is_weighted](auto edgelist) { + return ((edgelist.number_of_edges > 0) && (edgelist.p_src_vertices == nullptr)) || + ((edgelist.number_of_edges > 0) && (edgelist.p_dst_vertices == nullptr)) || + (is_weighted && (edgelist.number_of_edges > 0) && + ((edgelist.p_edge_weights.has_value() == false) || + (*(edgelist.p_edge_weights) == nullptr))); + }) == false, + "Invalid input argument: edgelists[].p_src_vertices and edgelists[].p_dst_vertices should not " + "be nullptr if edgelists[].number_of_edges > 0 and edgelists[].p_edge_weights should be " + "neither std::nullopt nor nullptr if weighted and edgelists[].number_of_edges > 0."); + + // optional expensive checks + + if (do_expensive_check) { + edge_t number_of_local_edges{0}; + for (size_t i = 0; i < edgelists.size(); ++i) { + auto [major_first, major_last] = meta.partition.get_matrix_partition_major_range(i); + auto [minor_first, minor_last] = meta.partition.get_matrix_partition_minor_range(); + + number_of_local_edges += edgelists[i].number_of_edges; + + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + store_transposed ? edgelists[i].p_dst_vertices : edgelists[i].p_src_vertices, + store_transposed ? 
edgelists[i].p_src_vertices : edgelists[i].p_dst_vertices)); + // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved + CUGRAPH_EXPECTS(thrust::count_if(handle.get_thrust_policy(), + edge_first, + edge_first + edgelists[i].number_of_edges, + out_of_range_t{ + major_first, major_last, minor_first, minor_last}) == 0, + "Invalid input argument: edgelists[] have out-of-range values."); + } + auto number_of_local_edges_sum = host_scalar_allreduce( + comm, number_of_local_edges, raft::comms::op_t::SUM, handle.get_stream()); + CUGRAPH_EXPECTS(number_of_local_edges_sum == meta.number_of_edges, + "Invalid input argument: the sum of local edge counts does not match with " + "meta.number_of_edges."); + + CUGRAPH_EXPECTS( + meta.partition.get_vertex_partition_last(comm_size - 1) == meta.number_of_vertices, + "Invalid input argument: vertex partition should cover [0, meta.number_of_vertices)."); + + if (meta.properties.is_symmetric) { + CUGRAPH_EXPECTS( + (check_symmetric(handle, + edgelists)), + "Invalid input argument: meta.property.is_symmetric is true but the input edge list is not " + "symmetric."); + } + if (!meta.properties.is_multigraph) { + CUGRAPH_EXPECTS( + check_no_parallel_edge(handle, edgelists), + "Invalid input argument: meta.property.is_multigraph is false but the input edge list has " + "parallel edges."); + } + } +} + +template +std::enable_if_t check_graph_constructor_input_arguments( + raft::handle_t const& handle, + edgelist_t const& edgelist, + graph_meta_t meta, + bool do_expensive_check) +{ + // cheap error checks + + auto is_weighted = edgelist.p_edge_weights.has_value(); + + CUGRAPH_EXPECTS( + ((edgelist.number_of_edges == 0) || (edgelist.p_src_vertices != nullptr)) && + ((edgelist.number_of_edges == 0) || (edgelist.p_dst_vertices != nullptr)) && + (!is_weighted || (is_weighted && ((edgelist.number_of_edges == 0) || + (*(edgelist.p_edge_weights) != nullptr)))), + "Invalid input argument: edgelist.p_src_vertices and edgelist.p_dst_vertices should not be " + "nullptr if edgelist.number_of_edges > 0 and edgelist.p_edge_weights should be neither " + "std::nullopt nor nullptr if weighted and edgelist.number_of_edges > 0."); + + CUGRAPH_EXPECTS( + !meta.segment_offsets.has_value() || + ((*(meta.segment_offsets)).size() == (detail::num_sparse_segments_per_vertex_partition + 1)), + "Invalid input argument: (*(meta.segment_offsets)).size() returns an invalid value."); + + // optional expensive checks + + if (do_expensive_check) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(store_transposed ? edgelist.p_dst_vertices : edgelist.p_src_vertices, + store_transposed ? 
edgelist.p_src_vertices : edgelist.p_dst_vertices)); + // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved + CUGRAPH_EXPECTS( + thrust::count_if( + handle.get_thrust_policy(), + edge_first, + edge_first + edgelist.number_of_edges, + out_of_range_t{0, meta.number_of_vertices, 0, meta.number_of_vertices}) == 0, + "Invalid input argument: edgelist have out-of-range values."); + + if (meta.properties.is_symmetric) { + CUGRAPH_EXPECTS( + (check_symmetric( + handle, std::vector>{edgelist})), + "Invalid input argument: meta.property.is_symmetric is true but the input edge list is not " + "symmetric."); + } + if (!meta.properties.is_multigraph) { + CUGRAPH_EXPECTS( + check_no_parallel_edge(handle, + std::vector>{edgelist}), + "Invalid input argument: meta.property.is_multigraph is false but the input edge list has " + "parallel edges."); + } + } +} + +template +std::vector aggregate_segment_offsets(raft::handle_t const& handle, + std::vector const& segment_offsets) +{ + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + rmm::device_uvector d_segment_offsets(segment_offsets.size(), handle.get_stream()); + raft::update_device( + d_segment_offsets.data(), segment_offsets.data(), segment_offsets.size(), handle.get_stream()); + rmm::device_uvector d_aggregate_segment_offsets( + col_comm_size * d_segment_offsets.size(), handle.get_stream()); + col_comm.allgather(d_segment_offsets.data(), + d_aggregate_segment_offsets.data(), + d_segment_offsets.size(), + handle.get_stream()); + + std::vector h_aggregate_segment_offsets(d_aggregate_segment_offsets.size(), + vertex_t{0}); + raft::update_host(h_aggregate_segment_offsets.data(), + d_aggregate_segment_offsets.data(), + d_aggregate_segment_offsets.size(), + handle.get_stream()); + + handle.sync_stream(); // this is necessary as h_aggregate_offsets can be used right after return. + + return h_aggregate_segment_offsets; +} + +template +std::enable_if_t>, + std::optional>, + std::optional>, + std::optional>>> +update_local_sorted_unique_edge_majors_minors( + raft::handle_t const& handle, + graph_meta_t const& meta, + std::optional> const& adj_matrix_partition_segment_offsets, + std::vector> const& adj_matrix_partition_offsets, + std::vector> const& adj_matrix_partition_indices, + std::optional>> const& + adj_matrix_partition_dcs_nzd_vertices, + std::optional> const& adj_matrix_partition_dcs_nzd_vertex_counts) +{ + auto& comm = handle.get_comms(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + auto use_dcs = + meta.segment_offsets + ? ((*(meta.segment_offsets)).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) + : false; + + std::optional> local_sorted_unique_edge_majors{std::nullopt}; + std::optional> local_sorted_unique_edge_major_offsets{std::nullopt}; + + std::optional> local_sorted_unique_edge_minors{std::nullopt}; + std::optional> local_sorted_unique_edge_minor_offsets{std::nullopt}; + + // if # unique edge majors/minors << V / row_comm_size|col_comm_size, store unique edge + // majors/minors to support storing edge major/minor properties in (key, value) pairs. 
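// [editorial note, not in the original patch] Illustrative arithmetic for this heuristic,
// with assumed numbers: if a GPU's aggregate major range spans 2^27 vertices and only
// 2^25 of them actually appear as edge majors, the fill ratio is 2^25 / 2^27 = 0.25. The
// code below takes the maximum of this ratio over all GPUs; only when that maximum falls
// below detail::row_col_properties_kv_pair_fill_ratio_threshold is the sparse
// (key, value) representation chosen over a dense value array sized to the full local
// vertex range.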
+ + vertex_t num_local_unique_edge_majors{0}; + for (size_t i = 0; i < adj_matrix_partition_offsets.size(); ++i) { + num_local_unique_edge_majors += thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator( + static_cast(adj_matrix_partition_offsets[i].size() - 1)), + has_nzd_t{adj_matrix_partition_offsets[i].data(), vertex_t{0}}); + } + + auto [minor_first, minor_last] = meta.partition.get_matrix_partition_minor_range(); + rmm::device_uvector minor_bitmaps( + ((minor_last - minor_first) + sizeof(uint32_t) * 8 - 1) / (sizeof(uint32_t) * 8), + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), minor_bitmaps.begin(), minor_bitmaps.end(), uint32_t{0}); + for (size_t i = 0; i < adj_matrix_partition_indices.size(); ++i) { + thrust::for_each(handle.get_thrust_policy(), + adj_matrix_partition_indices[i].begin(), + adj_matrix_partition_indices[i].end(), + atomic_or_bitmap_t{minor_bitmaps.data(), minor_first}); + } + + auto count_first = thrust::make_transform_iterator(minor_bitmaps.begin(), popc_t{}); + auto num_local_unique_edge_minors = thrust::reduce( + handle.get_thrust_policy(), count_first, count_first + minor_bitmaps.size(), vertex_t{0}); + + minor_bitmaps.resize(0, handle.get_stream()); + minor_bitmaps.shrink_to_fit(handle.get_stream()); + + vertex_t aggregate_major_size{0}; + for (size_t i = 0; i < meta.partition.get_number_of_matrix_partitions(); ++i) { + aggregate_major_size += meta.partition.get_matrix_partition_major_size(i); + } + auto minor_size = meta.partition.get_matrix_partition_minor_size(); + auto max_major_properties_fill_ratio = host_scalar_allreduce( + comm, + static_cast(num_local_unique_edge_majors) / static_cast(aggregate_major_size), + raft::comms::op_t::MAX, + handle.get_stream()); + auto max_minor_properties_fill_ratio = host_scalar_allreduce( + comm, + static_cast(num_local_unique_edge_minors) / static_cast(minor_size), + raft::comms::op_t::MAX, + handle.get_stream()); + + if (max_major_properties_fill_ratio < detail::row_col_properties_kv_pair_fill_ratio_threshold) { + local_sorted_unique_edge_majors = + rmm::device_uvector(num_local_unique_edge_majors, handle.get_stream()); + size_t cur_size{0}; + for (size_t i = 0; i < adj_matrix_partition_offsets.size(); ++i) { + auto [major_first, major_last] = meta.partition.get_matrix_partition_major_range(i); + auto major_hypersparse_first = + use_dcs ? std::optional{major_first + + (*adj_matrix_partition_segment_offsets) + [(*(meta.segment_offsets)).size() * i + + detail::num_sparse_segments_per_vertex_partition]} + : std::nullopt; + cur_size += thrust::distance( + (*local_sorted_unique_edge_majors).data() + cur_size, + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(major_first), + thrust::make_counting_iterator(use_dcs ? 
*major_hypersparse_first : major_last), + (*local_sorted_unique_edge_majors).data() + cur_size, + has_nzd_t{adj_matrix_partition_offsets[i].data(), major_first})); + if (use_dcs) { + thrust::copy(handle.get_thrust_policy(), + (*adj_matrix_partition_dcs_nzd_vertices)[i].begin(), + (*adj_matrix_partition_dcs_nzd_vertices)[i].begin() + + (*adj_matrix_partition_dcs_nzd_vertex_counts)[i], + (*local_sorted_unique_edge_majors).data() + cur_size); + cur_size += (*adj_matrix_partition_dcs_nzd_vertex_counts)[i]; + } + } + assert(cur_size == num_local_unique_edge_majors); + + std::vector h_vertex_partition_firsts(col_comm_size - 1); + for (int i = 1; i < col_comm_size; ++i) { + h_vertex_partition_firsts[i - 1] = + meta.partition.get_vertex_partition_first(i * row_comm_size + row_comm_rank); + } + rmm::device_uvector d_vertex_partition_firsts(h_vertex_partition_firsts.size(), + handle.get_stream()); + raft::update_device(d_vertex_partition_firsts.data(), + h_vertex_partition_firsts.data(), + h_vertex_partition_firsts.size(), + handle.get_stream()); + rmm::device_uvector d_key_offsets(d_vertex_partition_firsts.size(), + handle.get_stream()); + thrust::lower_bound(handle.get_thrust_policy(), + (*local_sorted_unique_edge_majors).begin(), + (*local_sorted_unique_edge_majors).end(), + d_vertex_partition_firsts.begin(), + d_vertex_partition_firsts.end(), + d_key_offsets.begin()); + std::vector h_key_offsets(col_comm_size + 1, vertex_t{0}); + h_key_offsets.back() = static_cast((*local_sorted_unique_edge_majors).size()); + raft::update_host( + h_key_offsets.data() + 1, d_key_offsets.data(), d_key_offsets.size(), handle.get_stream()); + + local_sorted_unique_edge_major_offsets = std::move(h_key_offsets); + } + + if (max_minor_properties_fill_ratio < detail::row_col_properties_kv_pair_fill_ratio_threshold) { + local_sorted_unique_edge_minors = rmm::device_uvector(0, handle.get_stream()); + for (size_t i = 0; i < adj_matrix_partition_indices.size(); ++i) { + rmm::device_uvector tmp_minors(adj_matrix_partition_indices[i].size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + adj_matrix_partition_indices[i].begin(), + adj_matrix_partition_indices[i].end(), + tmp_minors.begin()); + thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); + tmp_minors.resize( + thrust::distance( + tmp_minors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), + handle.get_stream()); + auto cur_size = (*local_sorted_unique_edge_minors).size(); + if (cur_size == 0) { + (*local_sorted_unique_edge_minors) = std::move(tmp_minors); + } else { + (*local_sorted_unique_edge_minors) + .resize((*local_sorted_unique_edge_minors).size() + tmp_minors.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + tmp_minors.begin(), + tmp_minors.end(), + (*local_sorted_unique_edge_minors).begin() + cur_size); + } + } + thrust::sort(handle.get_thrust_policy(), + (*local_sorted_unique_edge_minors).begin(), + (*local_sorted_unique_edge_minors).end()); + (*local_sorted_unique_edge_minors) + .resize(thrust::distance((*local_sorted_unique_edge_minors).begin(), + thrust::unique(handle.get_thrust_policy(), + (*local_sorted_unique_edge_minors).begin(), + (*local_sorted_unique_edge_minors).end())), + handle.get_stream()); + (*local_sorted_unique_edge_minors).shrink_to_fit(handle.get_stream()); + + std::vector h_vertex_partition_firsts(row_comm_size - 1); + for (int i = 1; i < row_comm_size; ++i) { + h_vertex_partition_firsts[i - 1] = + 
meta.partition.get_vertex_partition_first(col_comm_rank * row_comm_size + i); + } + rmm::device_uvector d_vertex_partition_firsts(h_vertex_partition_firsts.size(), + handle.get_stream()); + raft::update_device(d_vertex_partition_firsts.data(), + h_vertex_partition_firsts.data(), + h_vertex_partition_firsts.size(), + handle.get_stream()); + rmm::device_uvector d_key_offsets(d_vertex_partition_firsts.size(), + handle.get_stream()); + thrust::lower_bound(handle.get_thrust_policy(), + (*local_sorted_unique_edge_minors).begin(), + (*local_sorted_unique_edge_minors).end(), + d_vertex_partition_firsts.begin(), + d_vertex_partition_firsts.end(), + d_key_offsets.begin()); + std::vector h_key_offsets(row_comm_size + 1, vertex_t{0}); + h_key_offsets.back() = static_cast((*local_sorted_unique_edge_minors).size()); + raft::update_host( + h_key_offsets.data() + 1, d_key_offsets.data(), d_key_offsets.size(), handle.get_stream()); + + local_sorted_unique_edge_minor_offsets = std::move(h_key_offsets); + } + + return std::make_tuple(std::move(local_sorted_unique_edge_majors), + std::move(local_sorted_unique_edge_major_offsets), + std::move(local_sorted_unique_edge_minors), + std::move(local_sorted_unique_edge_minor_offsets)); +} + // compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid template std::tuple, @@ -504,129 +897,157 @@ template graph_t>:: graph_t(raft::handle_t const& handle, - std::vector> const& edgelists, + std::vector> const& edgelists, + graph_meta_t meta, + bool do_expensive_check) + : detail::graph_base_t( + handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), + partition_(meta.partition) +{ + auto is_weighted = edgelists[0].p_edge_weights.has_value(); + auto use_dcs = + meta.segment_offsets + ? ((*(meta.segment_offsets)).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) + : false; + + check_graph_constructor_input_arguments( + handle, edgelists, meta, do_expensive_check); + + if (meta.segment_offsets) { + adj_matrix_partition_segment_offsets_ = + aggregate_segment_offsets(handle, (*meta.segment_offsets)); + } + + // compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid + + adj_matrix_partition_offsets_.reserve(edgelists.size()); + adj_matrix_partition_indices_.reserve(edgelists.size()); + if (is_weighted) { + adj_matrix_partition_weights_ = std::vector>{}; + (*adj_matrix_partition_weights_).reserve(edgelists.size()); + } + if (use_dcs) { + adj_matrix_partition_dcs_nzd_vertices_ = std::vector>{}; + adj_matrix_partition_dcs_nzd_vertex_counts_ = std::vector{}; + (*adj_matrix_partition_dcs_nzd_vertices_).reserve(edgelists.size()); + (*adj_matrix_partition_dcs_nzd_vertex_counts_).reserve(edgelists.size()); + } + for (size_t i = 0; i < edgelists.size(); ++i) { + auto [major_first, major_last] = partition_.get_matrix_partition_major_range(i); + auto [minor_first, minor_last] = partition_.get_matrix_partition_minor_range(); + auto major_hypersparse_first = + use_dcs ? 
std::optional{major_first + + (*adj_matrix_partition_segment_offsets_) + [(*(meta.segment_offsets)).size() * i + + detail::num_sparse_segments_per_vertex_partition]} + : std::nullopt; + auto [offsets, indices, weights, dcs_nzd_vertices] = + compress_edgelist(edgelists[i], + major_first, + major_hypersparse_first, + major_last, + minor_first, + minor_last, + handle.get_stream()); + + adj_matrix_partition_offsets_.push_back(std::move(offsets)); + adj_matrix_partition_indices_.push_back(std::move(indices)); + if (is_weighted) { (*adj_matrix_partition_weights_).push_back(std::move(*weights)); } + if (use_dcs) { + auto dcs_nzd_vertex_count = static_cast((*dcs_nzd_vertices).size()); + (*adj_matrix_partition_dcs_nzd_vertices_).push_back(std::move(*dcs_nzd_vertices)); + (*adj_matrix_partition_dcs_nzd_vertex_counts_).push_back(dcs_nzd_vertex_count); + } + } + + // segmented sort neighbors + + for (size_t i = 0; i < adj_matrix_partition_offsets_.size(); ++i) { + sort_adjacency_list(handle, + adj_matrix_partition_offsets_[i].data(), + adj_matrix_partition_indices_[i].data(), + adj_matrix_partition_weights_ + ? std::optional{(*adj_matrix_partition_weights_)[i].data()} + : std::nullopt, + static_cast(adj_matrix_partition_offsets_[i].size() - 1), + static_cast(adj_matrix_partition_indices_[i].size())); + } + + // update local sorted unique edge sources/destinations (only if key, value pair will be used) + + std::tie(store_transposed ? local_sorted_unique_edge_cols_ : local_sorted_unique_edge_rows_, + store_transposed ? local_sorted_unique_edge_col_offsets_ + : local_sorted_unique_edge_row_offsets_, + store_transposed ? local_sorted_unique_edge_rows_ : local_sorted_unique_edge_cols_, + store_transposed ? local_sorted_unique_edge_row_offsets_ + : local_sorted_unique_edge_col_offsets_) = + update_local_sorted_unique_edge_majors_minors( + handle, + meta, + adj_matrix_partition_segment_offsets_, + adj_matrix_partition_offsets_, + adj_matrix_partition_indices_, + adj_matrix_partition_dcs_nzd_vertices_, + adj_matrix_partition_dcs_nzd_vertex_counts_); +} + +template +graph_t>:: + graph_t(raft::handle_t const& handle, + std::vector>&& edgelist_src_partitions, + std::vector>&& edgelist_dst_partitions, + std::optional>>&& edgelist_weight_partitions, graph_meta_t meta, bool do_expensive_check) : detail::graph_base_t( handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), partition_(meta.partition) { - // cheap error checks - - auto& comm = this->get_handle_ptr()->get_comms(); - auto const comm_size = comm.get_size(); - auto& row_comm = - this->get_handle_ptr()->get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = - this->get_handle_ptr()->get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - auto default_stream_view = this->get_handle_ptr()->get_stream(); - - CUGRAPH_EXPECTS(edgelists.size() == static_cast(col_comm_size), - "Invalid input argument: erroneous edgelists.size()."); CUGRAPH_EXPECTS( - !(meta.segment_offsets).has_value() || - ((*(meta.segment_offsets)).size() == - (detail::num_sparse_segments_per_vertex_partition + 1)) || - ((*(meta.segment_offsets)).size() == (detail::num_sparse_segments_per_vertex_partition + 2)), - "Invalid input argument: (*(meta.segment_offsets)).size() returns an invalid value."); + edgelist_src_partitions.size() == 
edgelist_dst_partitions.size(), + "Invalid input argument: edgelist_src_partitions.size() != edgelist_dst_partitions.size()."); + CUGRAPH_EXPECTS(!edgelist_weight_partitions.has_value() || + (edgelist_src_partitions.size() == (*edgelist_weight_partitions).size()), + "Invalid input argument: edgelist_weight_partitions.has_value() && " + "edgelist_src_partitions.size() != (*edgelist_weight_partitions).size()."); + for (size_t i = 0; i < edgelist_src_partitions.size(); ++i) { + CUGRAPH_EXPECTS(edgelist_src_partitions[i].size() == edgelist_dst_partitions[i].size(), + "Invalid input argument: edgelist_src_partitions[].size() != " + "edgelist_dst_partitions[].size()."); + CUGRAPH_EXPECTS( + !edgelist_weight_partitions.has_value() || + (edgelist_src_partitions[i].size() == (*edgelist_weight_partitions)[i].size()), + "Invalid input argument: edgelist_weight_partitions.has_value() && " + "edgelist_src_partitions[].size() != (*edgelist_weight_partitions)[].size()."); + } - auto is_weighted = edgelists[0].p_edge_weights.has_value(); + auto is_weighted = edgelist_weight_partitions.has_value(); auto use_dcs = meta.segment_offsets ? ((*(meta.segment_offsets)).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) : false; - CUGRAPH_EXPECTS( - std::any_of(edgelists.begin(), - edgelists.end(), - [is_weighted](auto edgelist) { - return ((edgelist.number_of_edges > 0) && (edgelist.p_src_vertices == nullptr)) || - ((edgelist.number_of_edges > 0) && (edgelist.p_dst_vertices == nullptr)) || - (is_weighted && (edgelist.number_of_edges > 0) && - ((edgelist.p_edge_weights.has_value() == false) || - (*(edgelist.p_edge_weights) == nullptr))); - }) == false, - "Invalid input argument: edgelists[].p_src_vertices and edgelists[].p_dst_vertices should not " - "be nullptr if edgelists[].number_of_edges > 0 and edgelists[].p_edge_weights should be " - "neither std::nullopt nor nullptr if weighted and edgelists[].number_of_edges > 0."); - - // optional expensive checks - - if (do_expensive_check) { - edge_t number_of_local_edges{0}; - for (size_t i = 0; i < edgelists.size(); ++i) { - auto [major_first, major_last] = partition_.get_matrix_partition_major_range(i); - auto [minor_first, minor_last] = partition_.get_matrix_partition_minor_range(); - - number_of_local_edges += edgelists[i].number_of_edges; - - auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( - store_transposed ? edgelists[i].p_dst_vertices : edgelists[i].p_src_vertices, - store_transposed ? 
edgelists[i].p_src_vertices : edgelists[i].p_dst_vertices)); - // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved - CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(default_stream_view), - edge_first, - edge_first + edgelists[i].number_of_edges, - out_of_range_t{ - major_first, major_last, minor_first, minor_last}) == 0, - "Invalid input argument: edgelists[] have out-of-range values."); - } - auto number_of_local_edges_sum = host_scalar_allreduce( - comm, number_of_local_edges, raft::comms::op_t::SUM, default_stream_view.value()); - CUGRAPH_EXPECTS(number_of_local_edges_sum == this->get_number_of_edges(), - "Invalid input argument: the sum of local edge counts does not match with " - "meta.number_of_edges."); - - CUGRAPH_EXPECTS( - partition_.get_vertex_partition_last(comm_size - 1) == meta.number_of_vertices, - "Invalid input argument: vertex partition should cover [0, meta.number_of_vertices)."); - - if (this->is_symmetric()) { - CUGRAPH_EXPECTS( - (check_symmetric(handle, - edgelists)), - "Invalid input argument: meta.property.is_symmetric is true but the input edge list is not " - "symmetric."); - } - if (!this->is_multigraph()) { - CUGRAPH_EXPECTS( - check_no_parallel_edge(handle, edgelists), - "Invalid input argument: meta.property.is_multigraph is false but the input edge list has " - "parallel edges."); - } + std::vector> edgelists(edgelist_src_partitions.size()); + for (size_t i = 0; i < edgelists.size(); ++i) { + edgelists[i] = edgelist_t{ + edgelist_src_partitions[i].data(), + edgelist_dst_partitions[i].data(), + edgelist_weight_partitions + ? std::optional{(*edgelist_weight_partitions)[i].data()} + : std::nullopt, + static_cast(edgelist_src_partitions[i].size())}; } - // aggregate segment_offsets + check_graph_constructor_input_arguments( + handle, edgelists, meta, do_expensive_check); if (meta.segment_offsets) { - // FIXME: we need to add host_allgather - rmm::device_uvector d_segment_offsets((*(meta.segment_offsets)).size(), - default_stream_view); - raft::update_device(d_segment_offsets.data(), - (*(meta.segment_offsets)).data(), - (*(meta.segment_offsets)).size(), - default_stream_view.value()); - rmm::device_uvector d_aggregate_segment_offsets( - col_comm_size * d_segment_offsets.size(), default_stream_view); - col_comm.allgather(d_segment_offsets.data(), - d_aggregate_segment_offsets.data(), - d_segment_offsets.size(), - default_stream_view.value()); - adj_matrix_partition_segment_offsets_ = - std::vector(d_aggregate_segment_offsets.size(), vertex_t{0}); - raft::update_host((*adj_matrix_partition_segment_offsets_).data(), - d_aggregate_segment_offsets.data(), - d_aggregate_segment_offsets.size(), - default_stream_view.value()); - - default_stream_view - .synchronize(); // this is necessary as adj_matrix_partition_segment_offsets_ can be used - // right after return. + aggregate_segment_offsets(handle, (*meta.segment_offsets)); } // compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid @@ -659,7 +1080,15 @@ graph_t(adj_matrix_partition_indices_[i].size())); } - // if # unique edge rows/cols << V / row_comm_size|col_comm_size, store unique edge rows/cols to - // support storing edge row/column properties in (key, value) pairs. 
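[editorial note, not part of the patch] The deleted block that follows is the same unique-edge-major/minor bookkeeping shown earlier in this patch; it has been hoisted out of the constructor into update_local_sorted_unique_edge_majors_minors() so that the const-reference and the new R-value constructors share a single implementation.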
- - vertex_t num_local_unique_edge_majors{0}; - for (size_t i = 0; i < adj_matrix_partition_offsets_.size(); ++i) { - num_local_unique_edge_majors += thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator( - static_cast(adj_matrix_partition_offsets_[i].size() - 1)), - has_nzd_t{adj_matrix_partition_offsets_[i].data(), vertex_t{0}}); - } - - auto [minor_first, minor_last] = partition_.get_matrix_partition_minor_range(); - rmm::device_uvector minor_bitmaps( - ((minor_last - minor_first) + sizeof(uint32_t) * 8 - 1) / (sizeof(uint32_t) * 8), - handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), minor_bitmaps.begin(), minor_bitmaps.end(), uint32_t{0}); - for (size_t i = 0; i < adj_matrix_partition_indices_.size(); ++i) { - thrust::for_each(handle.get_thrust_policy(), - adj_matrix_partition_indices_[i].begin(), - adj_matrix_partition_indices_[i].end(), - atomic_or_bitmap_t{minor_bitmaps.data(), minor_first}); - } - - auto count_first = thrust::make_transform_iterator(minor_bitmaps.begin(), popc_t{}); - auto num_local_unique_edge_minors = thrust::reduce( - handle.get_thrust_policy(), count_first, count_first + minor_bitmaps.size(), vertex_t{0}); - - minor_bitmaps.resize(0, handle.get_stream()); - minor_bitmaps.shrink_to_fit(handle.get_stream()); - - vertex_t aggregate_major_size{0}; - for (size_t i = 0; i < partition_.get_number_of_matrix_partitions(); ++i) { - aggregate_major_size += partition_.get_matrix_partition_major_size(i); - } - auto minor_size = partition_.get_matrix_partition_minor_size(); - auto max_major_properties_fill_ratio = host_scalar_allreduce( - comm, - static_cast(num_local_unique_edge_majors) / static_cast(aggregate_major_size), - raft::comms::op_t::MAX, - handle.get_stream()); - auto max_minor_properties_fill_ratio = host_scalar_allreduce( - comm, - static_cast(num_local_unique_edge_minors) / static_cast(minor_size), - raft::comms::op_t::MAX, - handle.get_stream()); - - if (max_major_properties_fill_ratio < detail::row_col_properties_kv_pair_fill_ratio_threshold) { - rmm::device_uvector local_sorted_unique_edge_majors(num_local_unique_edge_majors, - handle.get_stream()); - size_t cur_size{0}; - for (size_t i = 0; i < adj_matrix_partition_offsets_.size(); ++i) { - auto [major_first, major_last] = partition_.get_matrix_partition_major_range(i); - auto major_hypersparse_first = - use_dcs ? std::optional{major_first + - (*adj_matrix_partition_segment_offsets_) - [(*(meta.segment_offsets)).size() * i + - detail::num_sparse_segments_per_vertex_partition]} - : std::nullopt; - cur_size += thrust::distance( - local_sorted_unique_edge_majors.data() + cur_size, - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(major_first), - thrust::make_counting_iterator(use_dcs ? 
*major_hypersparse_first : major_last), - local_sorted_unique_edge_majors.data() + cur_size, - has_nzd_t{adj_matrix_partition_offsets_[i].data(), major_first})); - if (use_dcs) { - thrust::copy(handle.get_thrust_policy(), - (*adj_matrix_partition_dcs_nzd_vertices_)[i].begin(), - (*adj_matrix_partition_dcs_nzd_vertices_)[i].begin() + - (*adj_matrix_partition_dcs_nzd_vertex_counts_)[i], - local_sorted_unique_edge_majors.data() + cur_size); - cur_size += (*adj_matrix_partition_dcs_nzd_vertex_counts_)[i]; - } - } - assert(cur_size == num_local_unique_edge_majors); - - std::vector h_vertex_partition_firsts(col_comm_size - 1); - for (int i = 1; i < col_comm_size; ++i) { - h_vertex_partition_firsts[i - 1] = - partition_.get_vertex_partition_first(i * row_comm_size + row_comm_rank); - } - rmm::device_uvector d_vertex_partition_firsts(h_vertex_partition_firsts.size(), - handle.get_stream()); - raft::update_device(d_vertex_partition_firsts.data(), - h_vertex_partition_firsts.data(), - h_vertex_partition_firsts.size(), - handle.get_stream()); - rmm::device_uvector d_key_offsets(d_vertex_partition_firsts.size(), - handle.get_stream()); - thrust::lower_bound(handle.get_thrust_policy(), - local_sorted_unique_edge_majors.begin(), - local_sorted_unique_edge_majors.end(), - d_vertex_partition_firsts.begin(), - d_vertex_partition_firsts.end(), - d_key_offsets.begin()); - std::vector h_key_offsets(col_comm_size + 1, vertex_t{0}); - h_key_offsets.back() = static_cast(local_sorted_unique_edge_majors.size()); - raft::update_host( - h_key_offsets.data() + 1, d_key_offsets.data(), d_key_offsets.size(), handle.get_stream()); - - if constexpr (store_transposed) { - local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_majors); - local_sorted_unique_edge_col_offsets_ = std::move(h_key_offsets); - } else { - local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_majors); - local_sorted_unique_edge_row_offsets_ = std::move(h_key_offsets); - } - } - - if (max_minor_properties_fill_ratio < detail::row_col_properties_kv_pair_fill_ratio_threshold) { - rmm::device_uvector local_sorted_unique_edge_minors(0, handle.get_stream()); - for (size_t i = 0; i < adj_matrix_partition_indices_.size(); ++i) { - rmm::device_uvector tmp_minors(adj_matrix_partition_indices_[i].size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - adj_matrix_partition_indices_[i].begin(), - adj_matrix_partition_indices_[i].end(), - tmp_minors.begin()); - thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); - tmp_minors.resize( - thrust::distance( - tmp_minors.begin(), - thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), - handle.get_stream()); - auto cur_size = local_sorted_unique_edge_minors.size(); - if (cur_size == 0) { - local_sorted_unique_edge_minors = std::move(tmp_minors); - } else { - local_sorted_unique_edge_minors.resize( - local_sorted_unique_edge_minors.size() + tmp_minors.size(), handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - tmp_minors.begin(), - tmp_minors.end(), - local_sorted_unique_edge_minors.begin() + cur_size); - } - } - thrust::sort(handle.get_thrust_policy(), - local_sorted_unique_edge_minors.begin(), - local_sorted_unique_edge_minors.end()); - local_sorted_unique_edge_minors.resize( - thrust::distance(local_sorted_unique_edge_minors.begin(), - thrust::unique(handle.get_thrust_policy(), - local_sorted_unique_edge_minors.begin(), - local_sorted_unique_edge_minors.end())), - handle.get_stream()); - 
local_sorted_unique_edge_minors.shrink_to_fit(handle.get_stream()); - - std::vector h_vertex_partition_firsts(row_comm_size - 1); - for (int i = 1; i < row_comm_size; ++i) { - h_vertex_partition_firsts[i - 1] = - partition_.get_vertex_partition_first(col_comm_rank * row_comm_size + i); - } - rmm::device_uvector d_vertex_partition_firsts(h_vertex_partition_firsts.size(), - handle.get_stream()); - raft::update_device(d_vertex_partition_firsts.data(), - h_vertex_partition_firsts.data(), - h_vertex_partition_firsts.size(), - handle.get_stream()); - rmm::device_uvector d_key_offsets(d_vertex_partition_firsts.size(), - handle.get_stream()); - thrust::lower_bound(handle.get_thrust_policy(), - local_sorted_unique_edge_minors.begin(), - local_sorted_unique_edge_minors.end(), - d_vertex_partition_firsts.begin(), - d_vertex_partition_firsts.end(), - d_key_offsets.begin()); - std::vector h_key_offsets(row_comm_size + 1, vertex_t{0}); - h_key_offsets.back() = static_cast(local_sorted_unique_edge_minors.size()); - raft::update_host( - h_key_offsets.data() + 1, d_key_offsets.data(), d_key_offsets.size(), handle.get_stream()); + // update local sorted unique edge sources/destinations (only if key, value pair will be used) - if constexpr (store_transposed) { - local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_minors); - local_sorted_unique_edge_row_offsets_ = std::move(h_key_offsets); - } else { - local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_minors); - local_sorted_unique_edge_col_offsets_ = std::move(h_key_offsets); - } - } + std::tie(store_transposed ? local_sorted_unique_edge_cols_ : local_sorted_unique_edge_rows_, + store_transposed ? local_sorted_unique_edge_col_offsets_ + : local_sorted_unique_edge_row_offsets_, + store_transposed ? local_sorted_unique_edge_rows_ : local_sorted_unique_edge_cols_, + store_transposed ? local_sorted_unique_edge_row_offsets_ + : local_sorted_unique_edge_col_offsets_) = + update_local_sorted_unique_edge_majors_minors( + handle, + meta, + adj_matrix_partition_segment_offsets_, + adj_matrix_partition_offsets_, + adj_matrix_partition_indices_, + adj_matrix_partition_dcs_nzd_vertices_, + adj_matrix_partition_dcs_nzd_vertex_counts_); } template (0, handle.get_stream())), segment_offsets_(meta.segment_offsets) { - // cheap error checks + check_graph_constructor_input_arguments( + handle, edgelist, meta, do_expensive_check); - auto default_stream_view = this->get_handle_ptr()->get_stream(); + // convert edge list (COO) to compressed sparse format (CSR or CSC) - auto is_weighted = edgelist.p_edge_weights.has_value(); + std::tie(offsets_, indices_, weights_, std::ignore) = + compress_edgelist(edgelist, + vertex_t{0}, + std::optional{std::nullopt}, + this->get_number_of_vertices(), + vertex_t{0}, + this->get_number_of_vertices(), + handle.get_stream()); - CUGRAPH_EXPECTS( - ((edgelist.number_of_edges == 0) || (edgelist.p_src_vertices != nullptr)) && - ((edgelist.number_of_edges == 0) || (edgelist.p_dst_vertices != nullptr)) && - (!is_weighted || (is_weighted && ((edgelist.number_of_edges == 0) || - (*(edgelist.p_edge_weights) != nullptr)))), - "Invalid input argument: edgelist.p_src_vertices and edgelist.p_dst_vertices should not be " - "nullptr if edgelist.number_of_edges > 0 and edgelist.p_edge_weights should be neither " - "std::nullopt nor nullptr if weighted and edgelist.number_of_edges > 0."); + // segmented sort neighbors + + sort_adjacency_list(handle, + offsets_.data(), + indices_.data(), + weights_ ? 
std::optional{(*weights_).data()} : std::nullopt, + static_cast(offsets_.size() - 1), + static_cast(indices_.size())); } +template +graph_t>:: + graph_t(raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + graph_meta_t meta, + bool do_expensive_check) + : detail::graph_base_t( + handle, meta.number_of_vertices, static_cast(edgelist_srcs.size()), meta.properties), + offsets_(rmm::device_uvector(0, handle.get_stream())), + indices_(rmm::device_uvector(0, handle.get_stream())), + segment_offsets_(meta.segment_offsets) +{ + CUGRAPH_EXPECTS(edgelist_srcs.size() == edgelist_dsts.size(), + "Invalid input argument: edgelist_srcs.size() != edgelist_dsts.size()."); CUGRAPH_EXPECTS( - !segment_offsets_.has_value() || - ((*segment_offsets_).size() == (detail::num_sparse_segments_per_vertex_partition + 1)), - "Invalid input argument: (*(meta.segment_offsets)).size() returns an invalid value."); + !edgelist_weights.has_value() || (edgelist_srcs.size() == (*edgelist_weights).size()), + "Invalid input argument: edgelist_weights.has_value() && edgelist_srcs.size() != " + "(*edgelist_weights).size()."); - // optional expensive checks + edgelist_t edgelist{ + edgelist_srcs.data(), + edgelist_dsts.data(), + edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, + static_cast(edgelist_srcs.size())}; - if (do_expensive_check) { - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(store_transposed ? edgelist.p_dst_vertices : edgelist.p_src_vertices, - store_transposed ? edgelist.p_src_vertices : edgelist.p_dst_vertices)); - // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved - CUGRAPH_EXPECTS(thrust::count_if( - rmm::exec_policy(default_stream_view), - edge_first, - edge_first + edgelist.number_of_edges, - out_of_range_t{ - 0, this->get_number_of_vertices(), 0, this->get_number_of_vertices()}) == 0, - "Invalid input argument: edgelist have out-of-range values."); - - if (this->is_symmetric()) { - CUGRAPH_EXPECTS( - (check_symmetric( - handle, std::vector>{edgelist})), - "Invalid input argument: meta.property.is_symmetric is true but the input edge list is not " - "symmetric."); - } - if (!this->is_multigraph()) { - CUGRAPH_EXPECTS( - check_no_parallel_edge(handle, - std::vector>{edgelist}), - "Invalid input argument: meta.property.is_multigraph is false but the input edge list has " - "parallel edges."); - } - } + check_graph_constructor_input_arguments( + handle, edgelist, meta, do_expensive_check); // convert edge list (COO) to compressed sparse format (CSR or CSC) @@ -944,7 +1214,15 @@ graph_tget_number_of_vertices(), vertex_t{0}, this->get_number_of_vertices(), - default_stream_view); + handle.get_stream()); + edgelist_srcs.resize(0, handle.get_stream()); + edgelist_srcs.shrink_to_fit(handle.get_stream()); + edgelist_dsts.resize(0, handle.get_stream()); + edgelist_dsts.shrink_to_fit(handle.get_stream()); + if (edgelist_weights) { + (*edgelist_weights).resize(0, handle.get_stream()); + (*edgelist_weights).shrink_to_fit(handle.get_stream()); + } // segmented sort neighbors From dc4a2137f8d0e2ce4c5cf17651df9245d06ac967 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 18 Feb 2022 01:11:57 -0800 Subject: [PATCH 49/60] update create_graph_from_edgelist to call graph_t taking R-value edgelists --- .../create_graph_from_edgelist_impl.cuh | 77 +++++++++++++------ 1 file changed, 55 insertions(+), 22 deletions(-)
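[editorial note, not part of the patch] The hunks below replace pointer-plus-offset views into one large COO with per-partition rmm::device_uvector copies that are then moved into graph_t. The pattern trades one transient copy for the ability to free each source buffer as soon as it has been split; a compact sketch of the idiom, where all, counts, displs, and num_parts are illustrative assumptions:

  std::vector<rmm::device_uvector<int32_t>> parts{};
  parts.reserve(num_parts);  // num_parts: assumed number of local partitions
  for (int i = 0; i < num_parts; ++i) {
    rmm::device_uvector<int32_t> tmp(counts[i], handle.get_stream());
    thrust::copy(handle.get_thrust_policy(),
                 all.begin() + displs[i],
                 all.begin() + displs[i] + counts[i],
                 tmp.begin());
    parts.push_back(std::move(tmp));
  }
  all.resize(0, handle.get_stream());  // release the original buffer early
  all.shrink_to_fit(handle.get_stream());

diff --git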
a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index ea12a3562ba..0b711936ee0 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -248,15 +248,60 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, edgelist_edge_counts.end() - 1, edgelist_displacements.begin() + 1); + // 2. split the input edges to local partitions + + std::vector> edgelist_src_partitions{}; + edgelist_src_partitions.reserve(col_comm_size); + for (int i = 0; i < col_comm_size; ++i) { + rmm::device_uvector tmp_srcs(edgelist_edge_counts[i], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_rows.begin() + edgelist_displacements[i], + edgelist_rows.begin() + edgelist_displacements[i] + edgelist_edge_counts[i], + tmp_srcs.begin()); + edgelist_src_partitions.push_back(std::move(tmp_srcs)); + } + edgelist_rows.resize(0, handle.get_stream()); + edgelist_rows.shrink_to_fit(handle.get_stream()); + + std::vector> edgelist_dst_partitions{}; + edgelist_dst_partitions.reserve(col_comm_size); + for (int i = 0; i < col_comm_size; ++i) { + rmm::device_uvector tmp_dsts(edgelist_edge_counts[i], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_cols.begin() + edgelist_displacements[i], + edgelist_cols.begin() + edgelist_displacements[i] + edgelist_edge_counts[i], + tmp_dsts.begin()); + edgelist_dst_partitions.push_back(std::move(tmp_dsts)); + } + edgelist_cols.resize(0, handle.get_stream()); + edgelist_cols.shrink_to_fit(handle.get_stream()); + + std::optional>> edgelist_weight_partitions{}; + if (edgelist_weights) { + edgelist_weight_partitions = std::vector>{}; + (*edgelist_weight_partitions).reserve(col_comm_size); + for (int i = 0; i < col_comm_size; ++i) { + rmm::device_uvector tmp_weights(edgelist_edge_counts[i], handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), + (*edgelist_weights).begin() + edgelist_displacements[i], + (*edgelist_weights).begin() + edgelist_displacements[i] + edgelist_edge_counts[i], + tmp_weights.begin()); + (*edgelist_weight_partitions).push_back(std::move(tmp_weights)); + } + (*edgelist_weights).resize(0, handle.get_stream()); + (*edgelist_weights).shrink_to_fit(handle.get_stream()); + } + // 2. renumber std::vector major_ptrs(col_comm_size); std::vector minor_ptrs(major_ptrs.size()); for (int i = 0; i < col_comm_size; ++i) { - major_ptrs[i] = (store_transposed ? edgelist_cols.begin() : edgelist_rows.begin()) + - edgelist_displacements[i]; - minor_ptrs[i] = (store_transposed ? edgelist_rows.begin() : edgelist_cols.begin()) + - edgelist_displacements[i]; + major_ptrs[i] = + store_transposed ? edgelist_dst_partitions[i].begin() : edgelist_src_partitions[i].begin(); + minor_ptrs[i] = + store_transposed ? edgelist_src_partitions[i].begin() : edgelist_dst_partitions[i].begin(); } auto [renumber_map_labels, meta] = cugraph::renumber_edgelist( handle, @@ -268,21 +313,12 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, // 3. create a graph - std::vector> edgelists(col_comm_size); - for (int i = 0; i < col_comm_size; ++i) { - edgelists[i] = cugraph::edgelist_t{ - edgelist_rows.data() + edgelist_displacements[i], - edgelist_cols.data() + edgelist_displacements[i], - edgelist_weights - ? 
std::optional{(*edgelist_weights).data() + edgelist_displacements[i]} - : std::nullopt, - static_cast(edgelist_edge_counts[i])}; - } - return std::make_tuple( cugraph::graph_t( handle, - edgelists, + std::move(edgelist_src_partitions), + std::move(edgelist_dst_partitions), + std::move(edgelist_weight_partitions), cugraph::graph_meta_t{meta.number_of_vertices, meta.number_of_edges, graph_properties, @@ -351,12 +387,9 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, return std::make_tuple( cugraph::graph_t( handle, - cugraph::edgelist_t{ - edgelist_rows.data(), - edgelist_cols.data(), - edgelist_weights ? std::optional{(*edgelist_weights).data()} - : std::nullopt, - static_cast(edgelist_rows.size())}, + std::move(edgelist_rows), + std::move(edgelist_cols), + std::move(edgelist_weights), cugraph::graph_meta_t{ num_vertices, graph_properties, From 5177e86a4845f83052eb708a3d0b01ff289b4fdb Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 18 Feb 2022 01:12:52 -0800 Subject: [PATCH 50/60] update coarsen_graph_impl to call create_graph_from_edgelist instead of directly calling graph_t constructor --- cpp/src/structure/coarsen_graph_impl.cuh | 264 +++++++---------------- 1 file changed, 76 insertions(+), 188 deletions(-) diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index e4cc48dfd99..3d314144fe3 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -223,10 +223,9 @@ template -std::enable_if_t< - multi_gpu, - std::tuple>, - rmm::device_uvector>> +std::enable_if_t, + rmm::device_uvector>> coarsen_graph( raft::handle_t const& handle, graph_view_t const& graph_view, @@ -330,8 +329,8 @@ coarsen_graph( if (edgelist_weights) { (*coarsened_edgelist_weights).push_back(std::move(*edgelist_weights)); } } - // 2. concatenate and groupby and coarsen again (and if the input graph is symmetric, create a - // copy excluding self loops and globally shuffle) + // 2. 
concatenate and groupby and coarsen again (and if the input graph is symmetric, 1) create a + // copy excluding self loops, 2) globally shuffle, and 3) concatenate again) edge_t tot_count{0}; for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { @@ -395,10 +394,11 @@ coarsen_graph( (*concatenated_edgelist_weights).shrink_to_fit(handle.get_stream()); } - std::optional> reversed_edgelist_majors{std::nullopt}; - std::optional> reversed_edgelist_minors{std::nullopt}; - std::optional> reversed_edgelist_weights{std::nullopt}; if (lower_triangular_only) { + rmm::device_uvector reversed_edgelist_majors(0, handle.get_stream()); + rmm::device_uvector reversed_edgelist_minors(0, handle.get_stream()); + std::optional> reversed_edgelist_weights{std::nullopt}; + if (concatenated_edgelist_weights) { auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(concatenated_edgelist_majors.begin(), @@ -409,18 +409,16 @@ coarsen_graph( edge_first, edge_first + concatenated_edgelist_majors.size(), is_not_self_loop_t>{}); - reversed_edgelist_majors = - rmm::device_uvector(thrust::distance(edge_first, last), handle.get_stream()); - reversed_edgelist_minors = - rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + reversed_edgelist_majors.resize(thrust::distance(edge_first, last), handle.get_stream()); + reversed_edgelist_minors.resize(reversed_edgelist_majors.size(), handle.get_stream()); reversed_edgelist_weights = - rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + rmm::device_uvector(reversed_edgelist_majors.size(), handle.get_stream()); thrust::copy( handle.get_thrust_policy(), edge_first, - edge_first + (*reversed_edgelist_majors).size(), - thrust::make_zip_iterator(thrust::make_tuple((*reversed_edgelist_minors).begin(), - (*reversed_edgelist_majors).begin(), + edge_first + reversed_edgelist_majors.size(), + thrust::make_zip_iterator(thrust::make_tuple(reversed_edgelist_minors.begin(), + reversed_edgelist_majors.begin(), (*reversed_edgelist_weights).begin()))); } else { auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( @@ -429,115 +427,53 @@ coarsen_graph( edge_first, edge_first + concatenated_edgelist_majors.size(), is_not_self_loop_t>{}); - reversed_edgelist_majors = - rmm::device_uvector(thrust::distance(edge_first, last), handle.get_stream()); - reversed_edgelist_minors = - rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + reversed_edgelist_majors.resize(thrust::distance(edge_first, last), handle.get_stream()); + reversed_edgelist_minors.resize(reversed_edgelist_majors.size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), edge_first, - edge_first + (*reversed_edgelist_majors).size(), - thrust::make_zip_iterator(thrust::make_tuple( - (*reversed_edgelist_minors).begin(), (*reversed_edgelist_majors).begin()))); + edge_first + reversed_edgelist_majors.size(), + thrust::make_zip_iterator(thrust::make_tuple(reversed_edgelist_minors.begin(), + reversed_edgelist_majors.begin()))); } - std::tie(*reversed_edgelist_majors, *reversed_edgelist_minors, reversed_edgelist_weights) = + std::tie(reversed_edgelist_majors, reversed_edgelist_minors, reversed_edgelist_weights) = cugraph::detail::shuffle_edgelist_by_gpu_id(handle, - std::move(*reversed_edgelist_majors), - std::move(*reversed_edgelist_minors), + std::move(reversed_edgelist_majors), + std::move(reversed_edgelist_minors), std::move(reversed_edgelist_weights)); - } - - // 3. 
split concatenated edge list to local partitions - - auto concatenated_counts = - groupby_and_count_edgelist_by_local_partition_id(handle, - concatenated_edgelist_majors, - concatenated_edgelist_minors, - concatenated_edgelist_weights); - - std::vector h_concatenated_counts(concatenated_counts.size()); - raft::update_host(h_concatenated_counts.data(), - concatenated_counts.data(), - concatenated_counts.size(), - handle.get_stream()); - - std::optional> h_reversed_counts{std::nullopt}; - if (reversed_edgelist_majors) { - auto reversed_counts = groupby_and_count_edgelist_by_local_partition_id( - handle, *reversed_edgelist_majors, *reversed_edgelist_minors, reversed_edgelist_weights); - - h_reversed_counts = std::vector(reversed_counts.size()); - raft::update_host((*h_reversed_counts).data(), - reversed_counts.data(), - reversed_counts.size(), - handle.get_stream()); - } - handle.sync_stream(); - - std::vector h_concatenated_displacements(h_concatenated_counts.size(), size_t{0}); - std::partial_sum(h_concatenated_counts.begin(), - h_concatenated_counts.end() - 1, - h_concatenated_displacements.begin() + 1); - - std::optional> h_reversed_displacements{std::nullopt}; - if (h_reversed_counts) { - h_reversed_displacements = std::vector((*h_reversed_counts).size(), size_t{0}); - std::partial_sum((*h_reversed_counts).begin(), - (*h_reversed_counts).end() - 1, - (*h_reversed_displacements).begin() + 1); - } - - for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { - coarsened_edgelist_majors[i].resize( - h_concatenated_counts[i] + (h_reversed_counts ? (*h_reversed_counts)[i] : size_t{0}), - handle.get_stream()); - coarsened_edgelist_minors[i].resize(coarsened_edgelist_majors[i].size(), handle.get_stream()); - if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights)[i].resize(coarsened_edgelist_majors[i].size(), - handle.get_stream()); - } + auto output_offset = concatenated_edgelist_majors.size(); + concatenated_edgelist_majors.resize( + concatenated_edgelist_majors.size() + reversed_edgelist_majors.size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), - concatenated_edgelist_majors.begin() + h_concatenated_displacements[i], - concatenated_edgelist_majors.begin() + - (h_concatenated_displacements[i] + h_concatenated_counts[i]), - coarsened_edgelist_majors[i].begin()); + reversed_edgelist_majors.begin(), + reversed_edgelist_majors.end(), + concatenated_edgelist_majors.begin() + output_offset); + reversed_edgelist_majors.resize(0, handle.get_stream()); + reversed_edgelist_majors.shrink_to_fit(handle.get_stream()); + + concatenated_edgelist_minors.resize(concatenated_edgelist_majors.size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), - concatenated_edgelist_minors.begin() + h_concatenated_displacements[i], - concatenated_edgelist_minors.begin() + - (h_concatenated_displacements[i] + h_concatenated_counts[i]), - coarsened_edgelist_minors[i].begin()); - if (coarsened_edgelist_weights) { - thrust::copy(handle.get_thrust_policy(), - (*concatenated_edgelist_weights).begin() + h_concatenated_displacements[i], - (*concatenated_edgelist_weights).begin() + - (h_concatenated_displacements[i] + h_concatenated_counts[i]), - (*coarsened_edgelist_weights)[i].begin()); - } + reversed_edgelist_minors.begin(), + reversed_edgelist_minors.end(), + concatenated_edgelist_minors.begin() + output_offset); + reversed_edgelist_minors.resize(0, handle.get_stream()); + reversed_edgelist_minors.shrink_to_fit(handle.get_stream()); - if (reversed_edgelist_majors) { - 
thrust::copy(handle.get_thrust_policy(), - (*reversed_edgelist_majors).begin() + (*h_reversed_displacements)[i], - (*reversed_edgelist_majors).begin() + - ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), - coarsened_edgelist_majors[i].begin() + h_concatenated_counts[i]); + if (concatenated_edgelist_weights) { + (*concatenated_edgelist_weights) + .resize(concatenated_edgelist_majors.size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), - (*reversed_edgelist_minors).begin() + (*h_reversed_displacements)[i], - (*reversed_edgelist_minors).begin() + - ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), - coarsened_edgelist_minors[i].begin() + h_concatenated_counts[i]); - if (coarsened_edgelist_weights) { - thrust::copy(handle.get_thrust_policy(), - (*reversed_edgelist_weights).begin() + (*h_reversed_displacements)[i], - (*reversed_edgelist_weights).begin() + - ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), - (*coarsened_edgelist_weights)[i].begin() + h_concatenated_counts[i]); - } + (*reversed_edgelist_weights).begin(), + (*reversed_edgelist_weights).end(), + (*concatenated_edgelist_weights).begin() + output_offset); + (*reversed_edgelist_weights).resize(0, handle.get_stream()); + (*reversed_edgelist_weights).shrink_to_fit(handle.get_stream()); } } - // 4. find unique labels for this GPU + // 3. find unique labels for this GPU rmm::device_uvector unique_labels(graph_view.get_number_of_local_vertices(), handle.get_stream()); @@ -559,56 +495,22 @@ coarsen_graph( thrust::unique(handle.get_thrust_policy(), unique_labels.begin(), unique_labels.end())), handle.get_stream()); - // 5. renumber + // 4. create a graph - rmm::device_uvector renumber_map_labels(0, handle.get_stream()); - renumber_meta_t meta{}; - { - std::vector major_ptrs(coarsened_edgelist_majors.size()); - std::vector minor_ptrs(major_ptrs.size()); - std::vector counts(major_ptrs.size()); - for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { - major_ptrs[i] = coarsened_edgelist_majors[i].data(); - minor_ptrs[i] = coarsened_edgelist_minors[i].data(); - counts[i] = static_cast(coarsened_edgelist_majors[i].size()); - } - std::tie(renumber_map_labels, meta) = renumber_edgelist( + auto [coarsened_graph, renumber_map] = + create_graph_from_edgelist( handle, - std::optional>{std::move(unique_labels)}, - major_ptrs, - minor_ptrs, - counts, - std::nullopt, + std::move(unique_labels), + store_transposed ? std::move(concatenated_edgelist_minors) + : std::move(concatenated_edgelist_majors), + store_transposed ? std::move(concatenated_edgelist_majors) + : std::move(concatenated_edgelist_minors), + std::move(concatenated_edgelist_weights), + graph_properties_t{graph_view.is_symmetric(), false}, + true, do_expensive_check); - } - - // 6. build a graph - - std::vector> edgelists{}; - edgelists.resize(graph_view.get_number_of_local_adj_matrix_partitions()); - for (size_t i = 0; i < edgelists.size(); ++i) { - edgelists[i].p_src_vertices = - store_transposed ? coarsened_edgelist_minors[i].data() : coarsened_edgelist_majors[i].data(); - edgelists[i].p_dst_vertices = - store_transposed ? coarsened_edgelist_majors[i].data() : coarsened_edgelist_minors[i].data(); - edgelists[i].p_edge_weights = - coarsened_edgelist_weights - ? 
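#if 0  // Illustrative sketch (not part of this patch): the append-and-free idiom the new code
// above uses in place of the old per-partition split buffers and their host-side
// displacement bookkeeping. Growing the destination once and releasing each source buffer
// right after its copy keeps at most one extra column alive at a time. Names are
// hypothetical; the rmm/thrust calls mirror the ones in the patch.
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/copy.h>

template <typename T>
void append_and_free(rmm::device_uvector<T>& dst,
                     rmm::device_uvector<T>& src,
                     rmm::cuda_stream_view stream)
{
  auto output_offset = dst.size();
  dst.resize(dst.size() + src.size(), stream);  // grow the destination once
  thrust::copy(rmm::exec_policy(stream), src.begin(), src.end(), dst.begin() + output_offset);
  src.resize(0, stream);      // eagerly release the source buffer
  src.shrink_to_fit(stream);  // so peak memory stays low
}
#endif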
std::optional{(*coarsened_edgelist_weights)[i].data()} - : std::nullopt, - edgelists[i].number_of_edges = static_cast(coarsened_edgelist_majors[i].size()); - } - return std::make_tuple( - std::make_unique>( - handle, - edgelists, - graph_meta_t{ - meta.number_of_vertices, - meta.number_of_edges, - graph_properties_t{graph_view.is_symmetric(), false}, - meta.partition, - meta.segment_offsets}), - std::move(renumber_map_labels)); + return std::make_tuple(std::move(coarsened_graph), std::move(*renumber_map)); } // single-GPU version @@ -617,10 +519,9 @@ template -std::enable_if_t< - !multi_gpu, - std::tuple>, - rmm::device_uvector>> +std::enable_if_t, + rmm::device_uvector>> coarsen_graph( raft::handle_t const& handle, graph_view_t const& graph_view, @@ -710,33 +611,20 @@ coarsen_graph( thrust::unique(handle.get_thrust_policy(), unique_labels.begin(), unique_labels.end())), handle.get_stream()); - auto [renumber_map_labels, meta] = renumber_edgelist( - handle, - std::optional>{std::move(unique_labels)}, - coarsened_edgelist_majors.data(), - coarsened_edgelist_minors.data(), - static_cast(coarsened_edgelist_majors.size()), - do_expensive_check); - - edgelist_t edgelist{}; - edgelist.p_src_vertices = - store_transposed ? coarsened_edgelist_minors.data() : coarsened_edgelist_majors.data(); - edgelist.p_dst_vertices = - store_transposed ? coarsened_edgelist_majors.data() : coarsened_edgelist_minors.data(); - edgelist.p_edge_weights = coarsened_edgelist_weights - ? std::optional{(*coarsened_edgelist_weights).data()} - : std::nullopt; - edgelist.number_of_edges = static_cast(coarsened_edgelist_majors.size()); - - return std::make_tuple( - std::make_unique>( + auto [coarsened_graph, renumber_map] = + create_graph_from_edgelist( handle, - edgelist, - graph_meta_t{ - static_cast(renumber_map_labels.size()), - graph_properties_t{graph_view.is_symmetric(), false}, - meta.segment_offsets}), - std::move(renumber_map_labels)); + std::optional>{std::move(unique_labels)}, + store_transposed ? std::move(coarsened_edgelist_minors) + : std::move(coarsened_edgelist_majors), + store_transposed ? 
std::move(coarsened_edgelist_majors) + : std::move(coarsened_edgelist_minors), + std::move(coarsened_edgelist_weights), + graph_properties_t{graph_view.is_symmetric(), false}, + true, + do_expensive_check); + + return std::make_tuple(std::move(coarsened_graph), std::move(*renumber_map)); } } // namespace detail @@ -746,7 +634,7 @@ template -std::tuple>, +std::tuple, rmm::device_uvector> coarsen_graph( raft::handle_t const& handle, From 3bdb95eb5b8a1c53255afcc25c1902a57534776e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 18 Feb 2022 17:56:09 -0800 Subject: [PATCH 51/60] additional (potential) parallelism vs peak memory trade-off in edge shuffling --- cpp/src/detail/shuffle_wrappers.cu | 51 +++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu index 6e9434882ba..7d2f2453d08 100644 --- a/cpp/src/detail/shuffle_wrappers.cu +++ b/cpp/src/detail/shuffle_wrappers.cu @@ -84,9 +84,33 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, handle.get_stream()); handle.sync_stream(); - std::forward_as_tuple( - std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors, d_rx_edgelist_weights), std::ignore) = - shuffle_values(comm, edge_first, h_tx_value_counts, handle.get_stream()); + if (d_edgelist_majors.size() > + mem_frugal_threshold) { // trade-off potential parallelism to lower peak memory + std::tie(d_rx_edgelist_majors, std::ignore) = + shuffle_values(comm, d_edgelist_majors.begin(), h_tx_value_counts, handle.get_stream()); + d_edgelist_majors.resize(0, handle.get_stream()); + d_edgelist_majors.shrink_to_fit(handle.get_stream()); + + std::tie(d_rx_edgelist_minors, std::ignore) = + shuffle_values(comm, d_edgelist_minors.begin(), h_tx_value_counts, handle.get_stream()); + d_edgelist_minors.resize(0, handle.get_stream()); + d_edgelist_minors.shrink_to_fit(handle.get_stream()); + + std::tie(d_rx_edgelist_weights, std::ignore) = + shuffle_values(comm, (*d_edgelist_weights).begin(), h_tx_value_counts, handle.get_stream()); + (*d_edgelist_weights).resize(0, handle.get_stream()); + (*d_edgelist_weights).shrink_to_fit(handle.get_stream()); + } else { + std::forward_as_tuple( + std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors, d_rx_edgelist_weights), std::ignore) = + shuffle_values(comm, edge_first, h_tx_value_counts, handle.get_stream()); + d_edgelist_majors.resize(0, handle.get_stream()); + d_edgelist_majors.shrink_to_fit(handle.get_stream()); + d_edgelist_minors.resize(0, handle.get_stream()); + d_edgelist_minors.shrink_to_fit(handle.get_stream()); + (*d_edgelist_weights).resize(0, handle.get_stream()); + (*d_edgelist_weights).shrink_to_fit(handle.get_stream()); + } } else { auto edge_first = thrust::make_zip_iterator( thrust::make_tuple(d_edgelist_majors.begin(), d_edgelist_minors.begin())); @@ -110,8 +134,25 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, handle.get_stream()); handle.sync_stream(); - std::forward_as_tuple(std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors), std::ignore) = - shuffle_values(comm, edge_first, h_tx_value_counts, handle.get_stream()); + if (d_edgelist_majors.size() > + mem_frugal_threshold) { // trade-off potential parallelism to lower peak memory + std::tie(d_rx_edgelist_majors, std::ignore) = + shuffle_values(comm, d_edgelist_majors.begin(), h_tx_value_counts, handle.get_stream()); + d_edgelist_majors.resize(0, handle.get_stream()); + d_edgelist_majors.shrink_to_fit(handle.get_stream()); + + std::tie(d_rx_edgelist_minors, std::ignore) 
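#if 0  // Illustrative sketch (not part of this patch): rough peak-memory accounting behind
// the mem_frugal branch above. One zipped shuffle keeps all three send columns and all
// three receive columns alive at once (~6 columns of E entries each); shuffling one column
// at a time and freeing each send buffer immediately caps the worst step at ~4 columns, at
// the cost of three serialized collectives instead of one (the "potential parallelism"
// being traded away). The sizes below are hypothetical.
#include <cstddef>
#include <cstdio>

int main()
{
  std::size_t num_edges = std::size_t{1} << 30;       // assumed per-GPU edge count
  std::size_t col_bytes = num_edges * 8;              // 8-byte majors/minors/weights
  std::size_t zipped_peak     = (3 + 3) * col_bytes;  // send + recv columns, all at once
  std::size_t per_column_peak = (3 + 1) * col_bytes;  // worst single per-column step
  std::printf("zipped peak ~%zu GiB, per-column peak ~%zu GiB\n",
              zipped_peak >> 30,
              per_column_peak >> 30);
  return 0;
}
#endif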
= + shuffle_values(comm, d_edgelist_minors.begin(), h_tx_value_counts, handle.get_stream()); + d_edgelist_minors.resize(0, handle.get_stream()); + d_edgelist_minors.shrink_to_fit(handle.get_stream()); + } else { + std::forward_as_tuple(std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors), std::ignore) = + shuffle_values(comm, edge_first, h_tx_value_counts, handle.get_stream()); + d_edgelist_majors.resize(0, handle.get_stream()); + d_edgelist_majors.shrink_to_fit(handle.get_stream()); + d_edgelist_minors.resize(0, handle.get_stream()); + d_edgelist_minors.shrink_to_fit(handle.get_stream()); + } } return std::make_tuple(std::move(d_rx_edgelist_majors), From edd7cddd23c53d15167904dc4030d15fbfe21d89 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 19 Feb 2022 11:11:05 -0800 Subject: [PATCH 52/60] bug fix --- cpp/include/cugraph/utilities/shuffle_comm.cuh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 309a30c78e2..cfed6c33dd3 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -125,6 +125,13 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, return std::make_tuple(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks); } +template +struct key_group_id_less_t { + KeyToGroupIdOp key_to_group_id_op{}; + int pivot{}; + __device__ bool operator()(key_type k) const { return key_to_group_id_op(k) < pivot; } +}; + template struct value_group_id_less_t { ValueToGroupIdOp value_to_group_id_op{}; @@ -231,9 +238,8 @@ std::tuple mem_frugal_partition( rmm::exec_policy(stream_view), key_first, key_last, - kv_pair_group_id_less_t::value_type, - typename thrust::iterator_traits::value_type, - KeyToGroupIdOp>{key_to_group_id_op, pivot})); + key_group_id_less_t::value_type, KeyToGroupIdOp>{ + key_to_group_id_op, pivot})); auto second_size = num_elements - first_size; auto tmp_key_buffer = From ff12636896ef34af7b8cd76935d2ef356211ebf2 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 20 Feb 2022 12:58:54 -0800 Subject: [PATCH 53/60] fix for a possible hang --- cpp/src/detail/shuffle_wrappers.cu | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu index 7d2f2453d08..26bdd21a1f9 100644 --- a/cpp/src/detail/shuffle_wrappers.cu +++ b/cpp/src/detail/shuffle_wrappers.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -51,6 +52,12 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, auto mem_frugal_threshold = static_cast(static_cast(total_global_mem / element_size) * mem_frugal_ratio); + auto mem_frugal_flag = + host_scalar_allreduce(comm, + d_edgelist_majors.size() > mem_frugal_threshold ? 
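#if 0  // Illustrative sketch (not part of this patch): the predicate fix from the "bug fix"
// commit above, reduced to a toy. thrust::count_if over a bare key range needs a *unary*
// functor on the key type; the kv_pair_group_id_less_t it replaces dereferences a
// (key, value) tuple and is only correct for zipped key/value iterators. Types are
// simplified to int here for brevity.
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

#include <vector>

struct mod_op_t {  // hypothetical key -> group id mapping
  int num_groups{};
  __host__ __device__ int operator()(int key) const { return key % num_groups; }
};

struct key_group_id_less_t {  // unary: works on keys alone
  mod_op_t key_to_group_id_op{};
  int pivot{};
  __host__ __device__ bool operator()(int k) const { return key_to_group_id_op(k) < pivot; }
};

int main()
{
  std::vector<int> h_keys{0, 1, 2, 3, 4, 5, 6, 7};
  thrust::device_vector<int> keys(h_keys.begin(), h_keys.end());
  auto first_size = thrust::count_if(
    thrust::device, keys.begin(), keys.end(), key_group_id_less_t{mod_op_t{4}, 2});
  return static_cast<int>(first_size);  // keys with group id in {0, 1}: 0, 1, 4, 5 -> 4
}
#endif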
int{1} : int{0}, + raft::comms::op_t::MAX, + handle.get_stream()); + // invoke groupby_and_count and shuffle values to pass mem_frugal_threshold instead of directly // calling groupby_gpu_id_and_shuffle_values there is no benefit in reducing peak memory as we // need to allocate a receive buffer anyways) but this reduces the maximum memory allocation size @@ -84,8 +91,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, handle.get_stream()); handle.sync_stream(); - if (d_edgelist_majors.size() > - mem_frugal_threshold) { // trade-off potential parallelism to lower peak memory + if (mem_frugal_flag) { // trade-off potential parallelism to lower peak memory std::tie(d_rx_edgelist_majors, std::ignore) = shuffle_values(comm, d_edgelist_majors.begin(), h_tx_value_counts, handle.get_stream()); d_edgelist_majors.resize(0, handle.get_stream()); @@ -134,8 +140,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, handle.get_stream()); handle.sync_stream(); - if (d_edgelist_majors.size() > - mem_frugal_threshold) { // trade-off potential parallelism to lower peak memory + if (mem_frugal_flag) { // trade-off potential parallelism to lower peak memory std::tie(d_rx_edgelist_majors, std::ignore) = shuffle_values(comm, d_edgelist_majors.begin(), h_tx_value_counts, handle.get_stream()); d_edgelist_majors.resize(0, handle.get_stream()); From a34ad17c3450efa46b440dc9a6f539cc3656b0d1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 23 Feb 2022 16:25:20 -0800 Subject: [PATCH 54/60] added temporary code to experiment performance --- .../copy_v_transform_reduce_in_out_nbr.cuh | 338 ++++++++++++++++++ cpp/src/link_analysis/pagerank_impl.cuh | 32 +- .../create_graph_from_edgelist_impl.cuh | 15 + cpp/tests/link_analysis/mg_pagerank_test.cpp | 8 + 4 files changed, 391 insertions(+), 2 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh index 63f1aae6c8a..7a47803726d 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh @@ -35,6 +35,8 @@ #include #include +#include // FIXME: delete + #include #include #include @@ -490,11 +492,19 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, col_properties_t> minor_tmp_buffer{}; // relevant only when (GraphViewType::is_multi_gpu && !update_major if constexpr (GraphViewType::is_multi_gpu && !update_major) { +#if 1 // FIXME: delete + handle.sync_stream(); + std::cout << "copy_v allocate minor_tmp_buffer" << std::endl; +#endif if constexpr (GraphViewType::is_adj_matrix_transposed) { minor_tmp_buffer = row_properties_t(handle, graph_view); } else { minor_tmp_buffer = col_properties_t(handle, graph_view); } +#if 1 // FIXME: delete + handle.sync_stream(); + std::cout << "copy_v allocate minor_tmp_buffer: SUCCESS" << std::endl; +#endif } if constexpr (!update_major) { @@ -577,6 +587,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, stream_pool_indices = std::vector(num_streams); std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); handle.sync_stream(); + std::cout << "copy_v num_streams=" << num_streams << std::endl; // FIXME: delete } } } @@ -615,6 +626,328 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, if (stream_pool_indices) { handle.sync_stream(); } +#if 1 // FIXME: delete, just experimenting (to better separate compute time vs reduction time + if (stream_pool_indices) { + auto 
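#if 0  // Illustrative sketch (not part of this patch): why mem_frugal_flag is allreduced in
// the "fix for a possible hang" commit above. shuffle_values() is collective, so every rank
// must issue the same sequence of calls; if each rank branched on its *local* edge count,
// one rank could issue three per-column shuffles while a peer issues a single zipped
// shuffle, and the mismatched collectives would deadlock. Reducing the local predicate with
// MAX sends every rank down the memory-frugal path whenever any single rank needs it.
// This restates the call above with the intermediate flag named explicitly.
auto local_flag      = d_edgelist_majors.size() > mem_frugal_threshold ? int{1} : int{0};
auto mem_frugal_flag = host_scalar_allreduce(
  comm, local_flag, raft::comms::op_t::MAX, handle.get_stream());  // same value on all ranks
#endif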
num_concurrent_loops = (*stream_pool_indices).size() / max_segments; + auto num_rounds = + (graph_view.get_number_of_local_adj_matrix_partitions() + (num_concurrent_loops - 1)) / + num_concurrent_loops; + for (size_t round = 0; round < num_rounds; ++round) { + /* computing */ + +#if 1 // FIXME: delete + if constexpr (GraphViewType::is_multi_gpu) { handle.get_comms().barrier(); } + auto core_time0 = std::chrono::steady_clock::now(); +#endif + for (size_t i = num_concurrent_loops * round; + i < std::min(static_cast(graph_view.get_number_of_local_adj_matrix_partitions()), + num_concurrent_loops * (round + 1)); + ++i) { + auto matrix_partition = + matrix_partition_device_view_t( + graph_view.get_matrix_partition_view(i)); + + auto major_init = T{}; + if constexpr (update_major) { + if constexpr (GraphViewType::is_multi_gpu) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + major_init = (static_cast(i) == col_comm_rank) ? init : T{}; + } else { + major_init = init; + } + } + + auto matrix_partition_row_value_input = adj_matrix_row_value_input; + auto matrix_partition_col_value_input = adj_matrix_col_value_input; + if constexpr (GraphViewType::is_adj_matrix_transposed) { + matrix_partition_col_value_input.set_local_adj_matrix_partition_idx(i); + } else { + matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i); + } + + auto major_buffer_first = + get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); + + std::conditional_t, + VertexValueOutputIterator> + output_buffer{}; + if constexpr (GraphViewType::is_multi_gpu) { + if constexpr (update_major) { + output_buffer = major_buffer_first; + } else { + output_buffer = minor_tmp_buffer.mutable_device_view(); + } + } else { + output_buffer = vertex_value_output_first; + } + + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + if (segment_offsets) { + static_assert(detail::num_sparse_segments_per_vertex_partition == 3); + + // FIXME: we may further improve performance by 1) individually tuning block sizes for + // different segments; and 2) adding one more segment for very high degree vertices and + // running segmented reduction + if (matrix_partition.get_dcs_nzd_vertex_count()) { + auto exec_stream = + stream_pool_indices + ? 
rmm::cuda_stream_view{pool_streams[(i * max_segments) % + (*stream_pool_indices).size()]} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((i * max_segments) % + (*stream_pool_indices).size()) */ + : handle.get_stream(); + if constexpr (update_major) { // this is necessary as we don't visit every vertex in + // the hypersparse segment in + // for_all_major_for_all_nbr_hypersparse + thrust::fill(rmm::exec_policy(exec_stream), + output_buffer + (*segment_offsets)[3], + output_buffer + (*segment_offsets)[4], + major_init); + } + if (*(matrix_partition.get_dcs_nzd_vertex_count()) > 0) { + raft::grid_1d_thread_t update_grid( + *(matrix_partition.get_dcs_nzd_vertex_count()), + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[3]; } + detail::for_all_major_for_all_nbr_hypersparse + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[3], + matrix_partition_row_value_input, + matrix_partition_col_value_input, + segment_output_buffer, + e_op, + major_init); + } + } + if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + auto exec_stream = + stream_pool_indices + ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 1) % + (*stream_pool_indices).size()]} + /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * + max_segments + 1) % (*stream_pool_indices).size()) */ + : handle.get_stream(); + raft::grid_1d_thread_t update_grid( + (*segment_offsets)[3] - (*segment_offsets)[2], + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[2]; } + detail::for_all_major_for_all_nbr_low_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[2], + matrix_partition.get_major_first() + (*segment_offsets)[3], + matrix_partition_row_value_input, + matrix_partition_col_value_input, + segment_output_buffer, + e_op, + major_init); + } + if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + auto exec_stream = + stream_pool_indices + ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 2) % + (*stream_pool_indices).size()]} + /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * + max_segments + 2) % (*stream_pool_indices).size()) */ + : handle.get_stream(); + raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[1]; } + detail::for_all_major_for_all_nbr_mid_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[1], + matrix_partition.get_major_first() + (*segment_offsets)[2], + matrix_partition_row_value_input, + matrix_partition_col_value_input, + segment_output_buffer, + e_op, + major_init); + } + if ((*segment_offsets)[1] > 0) { + auto exec_stream = + stream_pool_indices + ? 
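#if 0  // Illustrative sketch (not part of this patch): the per-segment launch granularity
// used by the kernels around here. Majors are bucketed by degree, and each bucket gets a
// matching work unit: hypersparse/low-degree majors get one *thread* each
// (grid_1d_thread_t), mid-degree majors one *warp* each (grid_1d_warp_t), and high-degree
// majors a whole *block* each (grid_1d_block_t), so a few huge adjacency lists cannot
// stall a thread-per-vertex kernel. The kernel name below is hypothetical; the grid types
// and launch form mirror the patch.
raft::grid_1d_warp_t mid_grid(num_mid_degree_majors, block_size, max_grid_size);
my_warp_per_major_kernel<<<mid_grid.num_blocks, mid_grid.block_size, 0, stream>>>(/*...*/);
#endif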
rmm::cuda_stream_view{pool_streams[(i * max_segments + 3) % + (*stream_pool_indices).size()]} + /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * + max_segments + 3) % (*stream_pool_indices).size()) */ + : handle.get_stream(); + raft::grid_1d_block_t update_grid( + (*segment_offsets)[1], + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_high_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first(), + matrix_partition.get_major_first() + (*segment_offsets)[1], + matrix_partition_row_value_input, + matrix_partition_col_value_input, + output_buffer, + e_op, + major_init); + } + } else { + if (matrix_partition.get_major_size() > 0) { + raft::grid_1d_thread_t update_grid( + matrix_partition.get_major_size(), + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_low_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first(), + matrix_partition.get_major_last(), + matrix_partition_row_value_input, + matrix_partition_col_value_input, + output_buffer, + e_op, + major_init); + } + } + } + +#if 1 // FIXME: for temporary testing + for (size_t i = 0; i < pool_streams.size(); ++i) { + CUDA_TRY(cudaStreamSynchronize(pool_streams[i])); + } +#else + handle.sync_stream_pool(*stream_pool_indices); +#endif + + /* communication */ + +#if 1 // FIXME: delete + auto core_time1 = std::chrono::steady_clock::now(); + if constexpr (GraphViewType::is_multi_gpu) { handle.get_comms().barrier(); } + auto core_time2 = std::chrono::steady_clock::now(); +#endif + if constexpr (GraphViewType::is_multi_gpu && update_major) { + ncclGroupStart(); // SIMPLE + for (size_t i = num_concurrent_loops * round; + i < + std::min(static_cast(graph_view.get_number_of_local_adj_matrix_partitions()), + num_concurrent_loops * (round + 1)); + ++i) { +#if 1 // SIMPLE + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); +#else + auto& comm = handle.get_comms(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); +#endif + + auto matrix_partition = + matrix_partition_device_view_t( + graph_view.get_matrix_partition_view(i)); + + auto major_buffer_first = + get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); + +#if 1 // SIMPLE + device_reduce(col_comm, + major_buffer_first, + vertex_value_output_first, + matrix_partition.get_major_size(), + raft::comms::op_t::SUM, + i, + handle.get_stream()); +#else + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + if (segment_offsets && stream_pool_indices) { + if ((*segment_offsets).back() - (*segment_offsets)[3] > 0) { + device_reduce(col_comm, + major_buffer_first + (*segment_offsets)[3], + vertex_value_output_first + (*segment_offsets)[3], + (*segment_offsets).back() - (*segment_offsets)[3], + raft::comms::op_t::SUM, + i, + pool_streams[(i * max_segments) % (*stream_pool_indices).size()]/* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) */); + } + if ((*segment_offsets)[3] - 
(*segment_offsets)[2] > 0) { + device_reduce(col_comm, + major_buffer_first + (*segment_offsets)[2], + vertex_value_output_first + (*segment_offsets)[2], + (*segment_offsets)[3] - (*segment_offsets)[2], + raft::comms::op_t::SUM, + i, + pool_streams[(i * max_segments + 1) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()) */); + } + if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + device_reduce(col_comm, + major_buffer_first + (*segment_offsets)[1], + vertex_value_output_first + (*segment_offsets)[1], + (*segment_offsets)[2] - (*segment_offsets)[1], + raft::comms::op_t::SUM, + i, + pool_streams[(i * max_segments + 2) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()) */); + } + if ((*segment_offsets)[1] > 0) { + device_reduce(col_comm, + major_buffer_first, + vertex_value_output_first, + (*segment_offsets)[1], + raft::comms::op_t::SUM, + i, + pool_streams[(i * max_segments + 3) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()) */); + } + } else { + device_reduce(col_comm, + major_buffer_first, + vertex_value_output_first, + matrix_partition.get_major_size(), + raft::comms::op_t::SUM, + i, + handle.get_stream()); + } +#endif + } + ncclGroupEnd(); // SIMPLE + } + +#if 1 // SIMPLE + handle.sync_stream(); // SIMPLE +#else +#if 1 // FIXME: for temporary testing + for (size_t i = 0; i < pool_streams.size(); ++i) { + CUDA_TRY(cudaStreamSynchronize(pool_streams[i])); + } +#else + handle.sync_stream_pool(*stream_pool_indices); +#endif +#endif +#if 1 // FIXME: delete + auto core_time3 = std::chrono::steady_clock::now(); + if constexpr (GraphViewType::is_multi_gpu) { handle.get_comms().barrier(); } + auto core_time4 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_total = core_time4 - core_time0; + std::chrono::duration elapsed0 = core_time1 - core_time0; + std::chrono::duration elapsed1 = core_time2 - core_time1; + std::chrono::duration elapsed2 = core_time3 - core_time2; + std::chrono::duration elapsed3 = core_time4 - core_time3; + std::cout << "copy_v core took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << ") ms." 
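#if 0  // Illustrative sketch (not part of this patch): the ncclGroupStart()/ncclGroupEnd()
// bracketing used in the experimental reduction loop above. Each device_reduce() maps to
// an NCCL reduction; issued naked, they run as one blocking collective per matrix
// partition, while grouping them lets NCCL aggregate and overlap the reductions. The
// `send_firsts`/`recv_firsts`/`counts` arrays are hypothetical; the device_reduce
// signature mirrors the patch.
ncclGroupStart();
for (size_t i = first_partition; i < last_partition; ++i) {
  device_reduce(col_comm,
                send_firsts[i],
                recv_firsts[i],
                counts[i],
                raft::comms::op_t::SUM,
                static_cast<int>(i),  // reduction root: the rank that owns partition i
                handle.get_stream());
}
ncclGroupEnd();
#endif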
<< std::endl; +#endif + } + } else { + CUGRAPH_FAIL("should not be reached."); + } +#else for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { auto matrix_partition = matrix_partition_device_view_t( @@ -826,6 +1159,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } } } +#endif if (stream_pool_indices) { #if 1 // FIXME: for temporary testing @@ -838,6 +1172,10 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } if constexpr (GraphViewType::is_multi_gpu && !update_major) { +#if 1 // FIXME: delete + handle.sync_stream(); + std::cout << "minor reduction" << std::endl; +#endif auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index e346a6892b9..eec2126aea7 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -55,6 +55,12 @@ void pagerank( bool has_initial_guess, bool do_expensive_check) { +#if 1 // FIXME: delete +size_t free_size{}; +size_t total_size{}; +CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); +std::cout << "PageRank start free_size=" << static_cast(free_size) / (1024.0 * 1024.0 * 1024.0) << " GB total_size=" << static_cast(total_size) / (1024.0 * 1024.0 * 1024.0) << " GB." << std::endl; +#endif using vertex_t = typename GraphViewType::vertex_type; using weight_t = typename GraphViewType::weight_type; @@ -78,6 +84,9 @@ void pagerank( : vertex_t{0}; // 1. check input arguments +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "PageRank check inputs" << std::endl; +#endif CUGRAPH_EXPECTS((personalization_vertices.has_value() == false) || (personalization_values.has_value() && personalization_vector_size.has_value()), @@ -142,6 +151,9 @@ void pagerank( } // 2. compute the sums of the out-going edge weights (if not provided) +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "PageRank compute out_weight_sums" << std::endl; +#endif auto tmp_vertex_out_weight_sums = precomputed_vertex_out_weight_sums ? std::nullopt @@ -152,6 +164,9 @@ void pagerank( : (*tmp_vertex_out_weight_sums).data(); // 3. initialize pagerank values +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "PageRank initialize PageRank values" << std::endl; +#endif if (has_initial_guess) { auto sum = reduce_v(handle, pull_graph_view, pageranks, result_t{0.0}); @@ -171,6 +186,9 @@ void pagerank( } // 4. sum the personalization values +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "PageRank sum personalization values" << std::endl; +#endif result_t personalization_sum{0.0}; if (aggregate_personalization_vector_size > 0) { @@ -185,6 +203,9 @@ void pagerank( } // 5. 
pagerank iteration +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "PageRank iteration" << std::endl; +#endif // old PageRank values rmm::device_uvector old_pageranks(pull_graph_view.get_number_of_local_vertices(), @@ -197,6 +218,7 @@ void pagerank( if constexpr (GraphViewType::is_multi_gpu) { handle.get_comms().barrier(); } + std::cout << "PageRank iteration " << iter << " start" << std::endl; auto time0 = std::chrono::steady_clock::now(); #endif thrust::copy(handle.get_thrust_policy(), @@ -282,6 +304,11 @@ void pagerank( #if 1 // FIXME: delete handle.sync_stream(); auto time4 = std::chrono::steady_clock::now(); + if constexpr (GraphViewType::is_multi_gpu) { + handle.get_comms().barrier(); + } + handle.sync_stream(); + auto time5 = std::chrono::steady_clock::now(); #endif auto diff_sum = transform_reduce_v( handle, @@ -292,13 +319,14 @@ void pagerank( #if 1 // FIXME: delete handle.sync_stream(); - auto time5 = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_total = time5 - time0; + auto time6 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_total = time6 - time0; std::chrono::duration elapsed0 = time1 - time0; std::chrono::duration elapsed1 = time2 - time1; std::chrono::duration elapsed2 = time3 - time2; std::chrono::duration elapsed3 = time4 - time3; std::chrono::duration elapsed4 = time5 - time4; + std::chrono::duration elapsed5 = time6 - time5; std::cout << "PageRank iter " << iter << " took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << ") ms." << std::endl; #endif iter++; diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 0b711936ee0..88cba4cef43 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -218,6 +218,9 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, // 1. groupby edges to their target local adjacency matrix partition (and further groupby within // the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex IDs). +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "create_graph 0" << std::endl; +#endif auto edge_counts = cugraph::detail::groupby_and_count_edgelist_by_local_partition_id( handle, @@ -225,6 +228,9 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, store_transposed ? edgelist_rows : edgelist_cols, edgelist_weights, true); +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "create_graph 0a" << std::endl; +#endif std::vector h_edge_counts(edge_counts.size()); raft::update_host( @@ -249,6 +255,9 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, edgelist_displacements.begin() + 1); // 2. split the input edges to local partitions +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "create_graph 1" << std::endl; +#endif std::vector> edgelist_src_partitions{}; edgelist_src_partitions.reserve(col_comm_size); @@ -294,6 +303,9 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, } // 2. 
renumber +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "create_graph 2" << std::endl; +#endif std::vector major_ptrs(col_comm_size); std::vector minor_ptrs(major_ptrs.size()); @@ -310,6 +322,9 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, minor_ptrs, edgelist_edge_counts, edgelist_intra_partition_segment_offsets); +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "create_graph 3" << std::endl; +#endif // 3. create a graph diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp index c2a9bf74e2e..9b3876b4e5d 100644 --- a/cpp/tests/link_analysis/mg_pagerank_test.cpp +++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp @@ -82,12 +82,20 @@ class Tests_MGPageRank CUGRAPH_EXPECTS((comm_size % num_gpus_per_node) == 0, "Invalid MPI configuration: in multi-node execution, # MPI processes should " "be a multiple of the number of GPUs per node."); +#if 1 + auto col_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % col_comm_size != 0) { + --col_comm_size; + } + row_comm_size = comm_size / col_comm_size; +#else auto num_nodes = comm_size / num_gpus_per_node; row_comm_size = static_cast(sqrt(static_cast(num_nodes))); while (num_nodes % row_comm_size != 0) { --row_comm_size; } row_comm_size *= num_gpus_per_node; +#endif } else { row_comm_size = static_cast(sqrt(static_cast(comm_size))); while (comm_size % row_comm_size != 0) { From b1e56e7cea8a537a070702b0c67b43015190da87 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 23 Feb 2022 16:30:12 -0800 Subject: [PATCH 55/60] additional cut in peak memory and maximum single allocation size (to avoid malloc failure due to fragmentation with the pool allocator) --- .../cugraph/utilities/shuffle_comm.cuh | 182 +++++++++++++++++- cpp/src/structure/renumber_edgelist_impl.cuh | 2 +- 2 files changed, 179 insertions(+), 5 deletions(-) diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index cfed6c33dd3..f8d2280628a 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -166,6 +166,135 @@ struct kv_pair_group_id_greater_equal_t { } }; +template +void swap_partitions(ValueIterator value_first, + ValueIterator value_last, + size_t first_partition_size, + rmm::cuda_stream_view stream_view) +{ + auto num_elements = static_cast(thrust::distance(value_first, value_last)); + auto second_partition_size = num_elements - first_partition_size; + if (first_partition_size >= second_partition_size) { + auto tmp_value_buffer = + allocate_dataframe_buffer::value_type>( + first_partition_size, stream_view); + + thrust::copy(rmm::exec_policy(stream_view), + value_first, + value_first + first_partition_size, + get_dataframe_buffer_begin(tmp_value_buffer)); + + thrust::copy(rmm::exec_policy(stream_view), + value_first + first_partition_size, + value_first + num_elements, + value_first); + + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_begin(tmp_value_buffer), + get_dataframe_buffer_end(tmp_value_buffer), + value_first + second_partition_size); + } else { + auto tmp_value_buffer = + allocate_dataframe_buffer::value_type>( + second_partition_size, stream_view); + + thrust::copy(rmm::exec_policy(stream_view), + value_first + first_partition_size, + value_first + num_elements, + get_dataframe_buffer_begin(tmp_value_buffer)); + + thrust::copy(rmm::exec_policy(stream_view), + value_first, + value_first + first_partition_size, + 
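#if 0  // Illustrative sketch (not part of this patch): the half-and-half trick that
// swap_partitions() above enables in mem_frugal_groupby below, shown with host STL.
// Partition each half of the range independently (each pass then needs only a half-sized
// scratch buffer), and rotate the middle so the two "< pivot" runs become contiguous; the
// whole range ends up partitioned while the largest temporary allocation is halved.
#include <algorithm>
#include <vector>

void two_half_partition(std::vector<int>& v, int pivot)
{
  auto half = v.begin() + v.size() / 2;
  auto less = [pivot](int x) { return x < pivot; };
  auto first_split  = std::partition(v.begin(), half, less);  // [begin, first_split) < pivot
  auto second_split = std::partition(half, v.end(), less);    // [half, second_split) < pivot
  // swap the first half's ">= pivot" tail with the second half's "< pivot" head
  std::rotate(first_split, half, second_split);
}
#endif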
value_first + (num_elements - first_partition_size)); + + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_begin(tmp_value_buffer), + get_dataframe_buffer_end(tmp_value_buffer), + value_first); + } +} + +template +void swap_partitions(KeyIterator key_first, + KeyIterator key_last, + ValueIterator value_first, + size_t first_partition_size, + rmm::cuda_stream_view stream_view) +{ + auto num_elements = static_cast(thrust::distance(key_first, key_last)); + auto second_partition_size = num_elements - first_partition_size; + if (first_partition_size >= second_partition_size) { + auto tmp_key_buffer = + allocate_dataframe_buffer::value_type>( + first_partition_size, stream_view); + auto tmp_value_buffer = + allocate_dataframe_buffer::value_type>( + first_partition_size, stream_view); + + thrust::copy(rmm::exec_policy(stream_view), + key_first, + key_first + first_partition_size, + get_dataframe_buffer_begin(tmp_key_buffer)); + thrust::copy(rmm::exec_policy(stream_view), + value_first, + value_first + first_partition_size, + get_dataframe_buffer_begin(tmp_value_buffer)); + + thrust::copy(rmm::exec_policy(stream_view), + key_first + first_partition_size, + key_first + num_elements, + key_first); + thrust::copy(rmm::exec_policy(stream_view), + value_first + first_partition_size, + value_first + num_elements, + value_first); + + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_begin(tmp_key_buffer), + get_dataframe_buffer_end(tmp_key_buffer), + key_first + second_partition_size); + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_begin(tmp_value_buffer), + get_dataframe_buffer_end(tmp_value_buffer), + value_first + second_partition_size); + } else { + auto tmp_key_buffer = + allocate_dataframe_buffer::value_type>( + second_partition_size, stream_view); + auto tmp_value_buffer = + allocate_dataframe_buffer::value_type>( + second_partition_size, stream_view); + + thrust::copy(rmm::exec_policy(stream_view), + key_first + first_partition_size, + key_first + num_elements, + get_dataframe_buffer_begin(tmp_key_buffer)); + thrust::copy(rmm::exec_policy(stream_view), + value_first + first_partition_size, + value_first + num_elements, + get_dataframe_buffer_begin(tmp_value_buffer)); + + thrust::copy(rmm::exec_policy(stream_view), + key_first, + key_first + first_partition_size, + key_first + (num_elements - first_partition_size)); + thrust::copy(rmm::exec_policy(stream_view), + value_first, + value_first + first_partition_size, + value_first + (num_elements - first_partition_size)); + + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_begin(tmp_key_buffer), + get_dataframe_buffer_end(tmp_key_buffer), + key_first); + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_begin(tmp_value_buffer), + get_dataframe_buffer_end(tmp_value_buffer), + value_first); + } +} + // Use roughly half temporary buffer than thrust::partition (if first & second partition sizes are // comparable). 
This also uses multiple smaller allocations than one single allocation (thrust::sort // does this) of the same aggregate size if the input iterators are the zip iterators (this is more @@ -330,8 +459,28 @@ void mem_frugal_groupby( }); } } else { - auto second_first = mem_frugal_partition( - value_firsts[i], value_lasts[i], value_to_group_id_op, pivot, stream_view); + ValueIterator second_first{}; + auto num_elements = static_cast(thrust::distance(value_firsts[i], value_lasts[i])); + auto first_chunk_partition_first = mem_frugal_partition(value_firsts[i], + value_firsts[i] + num_elements / 2, + value_to_group_id_op, + pivot, + stream_view); + auto second_chunk_partition_first = mem_frugal_partition(value_firsts[i] + num_elements / 2, + value_lasts[i], + value_to_group_id_op, + pivot, + stream_view); + auto no_less_size = static_cast( + thrust::distance(first_chunk_partition_first, value_firsts[i] + num_elements / 2)); + auto less_size = static_cast( + thrust::distance(value_firsts[i] + num_elements / 2, second_chunk_partition_first)); + swap_partitions(value_firsts[i] + (num_elements / 2 - no_less_size), + value_firsts[i] + (num_elements / 2 + less_size), + no_less_size, + stream_view); + + second_first = value_firsts[i] + ((num_elements / 2 - no_less_size) + less_size); if (pivot - group_firsts[i] > 1) { group_firsts.push_back(group_firsts[i]); group_lasts.push_back(pivot); @@ -402,8 +551,33 @@ void mem_frugal_groupby( }); } } else { - auto second_first = mem_frugal_partition( - key_firsts[i], key_lasts[i], value_firsts[i], key_to_group_id_op, pivot, stream_view); + std::tuple second_first{}; + auto num_elements = static_cast(thrust::distance(key_firsts[i], key_lasts[i])); + auto first_chunk_partition_first = mem_frugal_partition(key_firsts[i], + key_firsts[i] + num_elements / 2, + value_firsts[i], + key_to_group_id_op, + pivot, + stream_view); + auto second_chunk_partition_first = mem_frugal_partition(key_firsts[i] + num_elements / 2, + key_lasts[i], + value_firsts[i] + num_elements / 2, + key_to_group_id_op, + pivot, + stream_view); + auto no_less_size = static_cast(thrust::distance( + std::get<0>(first_chunk_partition_first), key_firsts[i] + num_elements / 2)); + auto less_size = static_cast(thrust::distance( + key_firsts[i] + num_elements / 2, std::get<0>(second_chunk_partition_first))); + swap_partitions(key_firsts[i] + (num_elements / 2 - no_less_size), + key_firsts[i] + (num_elements / 2 + less_size), + value_firsts[i] + (num_elements / 2 - no_less_size), + no_less_size, + stream_view); + + second_first = + std::make_tuple(key_firsts[i] + ((num_elements / 2 - no_less_size) + less_size), + value_firsts[i] + ((num_elements / 2 - no_less_size) + less_size)); if (pivot - group_firsts[i] > 1) { group_firsts.push_back(group_firsts[i]); group_lasts.push_back(pivot); diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 2a3ed5df5df..1aca1120544 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -760,7 +760,7 @@ renumber_edgelist( } } - if ((static_cast(partition.get_matrix_partition_minor_size() / load_factor) >= + if ((static_cast(partition.get_matrix_partition_minor_size() * (1.0 + 1.0 / load_factor)) >= static_cast(number_of_edges / comm_size)) && edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part From d716f6c7c030535660a3bdfeb58f2c7b53c01e81 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 8 
Mar 2022 17:02:16 -0800 Subject: [PATCH 56/60] remove temporary experimental code --- .../copy_v_transform_reduce_in_out_nbr.cuh | 447 ++---------------- 1 file changed, 42 insertions(+), 405 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh index 7a47803726d..ad76ee1fd67 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh @@ -35,8 +35,6 @@ #include #include -#include // FIXME: delete - #include #include #include @@ -492,19 +490,11 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, col_properties_t> minor_tmp_buffer{}; // relevant only when (GraphViewType::is_multi_gpu && !update_major if constexpr (GraphViewType::is_multi_gpu && !update_major) { -#if 1 // FIXME: delete - handle.sync_stream(); - std::cout << "copy_v allocate minor_tmp_buffer" << std::endl; -#endif if constexpr (GraphViewType::is_adj_matrix_transposed) { minor_tmp_buffer = row_properties_t(handle, graph_view); } else { minor_tmp_buffer = col_properties_t(handle, graph_view); } -#if 1 // FIXME: delete - handle.sync_stream(); - std::cout << "copy_v allocate minor_tmp_buffer: SUCCESS" << std::endl; -#endif } if constexpr (!update_major) { @@ -527,9 +517,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } std::optional> stream_pool_indices{std::nullopt}; -#if 1 // FIXME: for temporary testing - std::vector pool_streams{}; -#endif if constexpr (GraphViewType::is_multi_gpu) { if ((graph_view.get_local_adj_matrix_partition_segment_offsets(0)) && (handle.get_stream_pool_size() >= max_segments)) { @@ -569,25 +556,9 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } if (num_streams >= max_segments) { -#if 1 // FIXME: for temporary testing - pool_streams.resize(num_streams); - for (size_t i = 0; i < pool_streams.size() / max_segments; ++i) { - static_assert(max_segments == 4); - CUDA_TRY(cudaStreamCreateWithPriority( - &pool_streams[i * max_segments], cudaStreamNonBlocking, -2)); - CUDA_TRY(cudaStreamCreateWithPriority( - &pool_streams[i * max_segments + 1], cudaStreamNonBlocking, -2)); - CUDA_TRY(cudaStreamCreateWithPriority( - &pool_streams[i * max_segments + 2], cudaStreamNonBlocking, -1)); - CUDA_TRY(cudaStreamCreateWithPriority( - &pool_streams[i * max_segments + 3], cudaStreamNonBlocking, 0)); - } -#endif - stream_pool_indices = std::vector(num_streams); std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); handle.sync_stream(); - std::cout << "copy_v num_streams=" << num_streams << std::endl; // FIXME: delete } } } @@ -626,328 +597,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, if (stream_pool_indices) { handle.sync_stream(); } -#if 1 // FIXME: delete, just experimenting (to better separate compute time vs reduction time - if (stream_pool_indices) { - auto num_concurrent_loops = (*stream_pool_indices).size() / max_segments; - auto num_rounds = - (graph_view.get_number_of_local_adj_matrix_partitions() + (num_concurrent_loops - 1)) / - num_concurrent_loops; - for (size_t round = 0; round < num_rounds; ++round) { - /* computing */ - -#if 1 // FIXME: delete - if constexpr (GraphViewType::is_multi_gpu) { handle.get_comms().barrier(); } - auto core_time0 = std::chrono::steady_clock::now(); -#endif - for (size_t i = num_concurrent_loops * round; - i < std::min(static_cast(graph_view.get_number_of_local_adj_matrix_partitions()), - 
num_concurrent_loops * (round + 1)); - ++i) { - auto matrix_partition = - matrix_partition_device_view_t( - graph_view.get_matrix_partition_view(i)); - - auto major_init = T{}; - if constexpr (update_major) { - if constexpr (GraphViewType::is_multi_gpu) { - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - major_init = (static_cast(i) == col_comm_rank) ? init : T{}; - } else { - major_init = init; - } - } - - auto matrix_partition_row_value_input = adj_matrix_row_value_input; - auto matrix_partition_col_value_input = adj_matrix_col_value_input; - if constexpr (GraphViewType::is_adj_matrix_transposed) { - matrix_partition_col_value_input.set_local_adj_matrix_partition_idx(i); - } else { - matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i); - } - - auto major_buffer_first = - get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); - - std::conditional_t, - VertexValueOutputIterator> - output_buffer{}; - if constexpr (GraphViewType::is_multi_gpu) { - if constexpr (update_major) { - output_buffer = major_buffer_first; - } else { - output_buffer = minor_tmp_buffer.mutable_device_view(); - } - } else { - output_buffer = vertex_value_output_first; - } - - auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); - if (segment_offsets) { - static_assert(detail::num_sparse_segments_per_vertex_partition == 3); - - // FIXME: we may further improve performance by 1) individually tuning block sizes for - // different segments; and 2) adding one more segment for very high degree vertices and - // running segmented reduction - if (matrix_partition.get_dcs_nzd_vertex_count()) { - auto exec_stream = - stream_pool_indices - ? rmm::cuda_stream_view{pool_streams[(i * max_segments) % - (*stream_pool_indices).size()]} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((i * max_segments) % - (*stream_pool_indices).size()) */ - : handle.get_stream(); - if constexpr (update_major) { // this is necessary as we don't visit every vertex in - // the hypersparse segment in - // for_all_major_for_all_nbr_hypersparse - thrust::fill(rmm::exec_policy(exec_stream), - output_buffer + (*segment_offsets)[3], - output_buffer + (*segment_offsets)[4], - major_init); - } - if (*(matrix_partition.get_dcs_nzd_vertex_count()) > 0) { - raft::grid_1d_thread_t update_grid( - *(matrix_partition.get_dcs_nzd_vertex_count()), - detail::copy_v_transform_reduce_nbr_for_all_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[3]; } - detail::for_all_major_for_all_nbr_hypersparse - <<>>( - matrix_partition, - matrix_partition.get_major_first() + (*segment_offsets)[3], - matrix_partition_row_value_input, - matrix_partition_col_value_input, - segment_output_buffer, - e_op, - major_init); - } - } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - auto exec_stream = - stream_pool_indices - ? 
rmm::cuda_stream_view{pool_streams[(i * max_segments + 1) %
-                                                 (*stream_pool_indices).size()]}
-          /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i *
-             max_segments + 1) % (*stream_pool_indices).size()) */
-          : handle.get_stream();
-        raft::grid_1d_thread_t update_grid(
-          (*segment_offsets)[3] - (*segment_offsets)[2],
-          detail::copy_v_transform_reduce_nbr_for_all_block_size,
-          handle.get_device_properties().maxGridSize[0]);
-        auto segment_output_buffer = output_buffer;
-        if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[2]; }
-        detail::for_all_major_for_all_nbr_low_degree
-          <<<update_grid.num_blocks, update_grid.block_size, 0, exec_stream>>>(
-            matrix_partition,
-            matrix_partition.get_major_first() + (*segment_offsets)[2],
-            matrix_partition.get_major_first() + (*segment_offsets)[3],
-            matrix_partition_row_value_input,
-            matrix_partition_col_value_input,
-            segment_output_buffer,
-            e_op,
-            major_init);
-      }
-      if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) {
-        auto exec_stream =
-          stream_pool_indices
-            ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 2) %
-                                                 (*stream_pool_indices).size()]}
-          /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i *
-             max_segments + 2) % (*stream_pool_indices).size()) */
-          : handle.get_stream();
-        raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1],
-                                         detail::copy_v_transform_reduce_nbr_for_all_block_size,
-                                         handle.get_device_properties().maxGridSize[0]);
-        auto segment_output_buffer = output_buffer;
-        if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[1]; }
-        detail::for_all_major_for_all_nbr_mid_degree
-          <<<update_grid.num_blocks, update_grid.block_size, 0, exec_stream>>>(
-            matrix_partition,
-            matrix_partition.get_major_first() + (*segment_offsets)[1],
-            matrix_partition.get_major_first() + (*segment_offsets)[2],
-            matrix_partition_row_value_input,
-            matrix_partition_col_value_input,
-            segment_output_buffer,
-            e_op,
-            major_init);
-      }
-      if ((*segment_offsets)[1] > 0) {
-        auto exec_stream =
-          stream_pool_indices
-            ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 3) %
-                                                 (*stream_pool_indices).size()]}
-          /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i *
-             max_segments + 3) % (*stream_pool_indices).size()) */
-          : handle.get_stream();
-        raft::grid_1d_block_t update_grid(
-          (*segment_offsets)[1],
-          detail::copy_v_transform_reduce_nbr_for_all_block_size,
-          handle.get_device_properties().maxGridSize[0]);
-        detail::for_all_major_for_all_nbr_high_degree
-          <<<update_grid.num_blocks, update_grid.block_size, 0, exec_stream>>>(
-            matrix_partition,
-            matrix_partition.get_major_first(),
-            matrix_partition.get_major_first() + (*segment_offsets)[1],
-            matrix_partition_row_value_input,
-            matrix_partition_col_value_input,
-            output_buffer,
-            e_op,
-            major_init);
-      }
-    } else {
-      if (matrix_partition.get_major_size() > 0) {
-        raft::grid_1d_thread_t update_grid(
-          matrix_partition.get_major_size(),
-          detail::copy_v_transform_reduce_nbr_for_all_block_size,
-          handle.get_device_properties().maxGridSize[0]);
-        detail::for_all_major_for_all_nbr_low_degree
-          <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
-            matrix_partition,
-            matrix_partition.get_major_first(),
-            matrix_partition.get_major_last(),
-            matrix_partition_row_value_input,
-            matrix_partition_col_value_input,
-            output_buffer,
-            e_op,
-            major_init);
-      }
-    }
-  }
-
-#if 1  // FIXME: for temporary testing
-  for (size_t i = 0; i < pool_streams.size(); ++i) {
-    CUDA_TRY(cudaStreamSynchronize(pool_streams[i]));
-  }
-#else
-  handle.sync_stream_pool(*stream_pool_indices);
-#endif
-
-  /* communication */
-
-#if 1  // FIXME: delete
-  auto core_time1 = std::chrono::steady_clock::now();
-  if constexpr (GraphViewType::is_multi_gpu) { handle.get_comms().barrier(); }
-  auto core_time2 = std::chrono::steady_clock::now();
-#endif
-  if constexpr (GraphViewType::is_multi_gpu && update_major) {
-    ncclGroupStart();  // SIMPLE
-    for (size_t i = num_concurrent_loops * round;
-         i <
-         std::min(static_cast<size_t>(graph_view.get_number_of_local_adj_matrix_partitions()),
-                  num_concurrent_loops * (round + 1));
-         ++i) {
-#if 1  // SIMPLE
-      auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
-#else
-      auto& comm     = handle.get_comms();
-      auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
-      auto const row_comm_rank = row_comm.get_rank();
-      auto const row_comm_size = row_comm.get_size();
-      auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
-      auto const col_comm_rank = col_comm.get_rank();
-      auto const col_comm_size = col_comm.get_size();
-#endif
-
-      auto matrix_partition =
-        matrix_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>(
-          graph_view.get_matrix_partition_view(i));
-
-      auto major_buffer_first =
-        get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]);
-
-#if 1  // SIMPLE
-      device_reduce(col_comm,
-                    major_buffer_first,
-                    vertex_value_output_first,
-                    matrix_partition.get_major_size(),
-                    raft::comms::op_t::SUM,
-                    i,
-                    handle.get_stream());
-#else
-      auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i);
-      if (segment_offsets && stream_pool_indices) {
-        if ((*segment_offsets).back() - (*segment_offsets)[3] > 0) {
-          device_reduce(col_comm,
-                        major_buffer_first + (*segment_offsets)[3],
-                        vertex_value_output_first + (*segment_offsets)[3],
-                        (*segment_offsets).back() - (*segment_offsets)[3],
-                        raft::comms::op_t::SUM,
-                        i,
-                        pool_streams[(i * max_segments) % (*stream_pool_indices).size()]/* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) */);
-        }
-        if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) {
-          device_reduce(col_comm,
-                        major_buffer_first + (*segment_offsets)[2],
-                        vertex_value_output_first + (*segment_offsets)[2],
-                        (*segment_offsets)[3] - (*segment_offsets)[2],
-                        raft::comms::op_t::SUM,
-                        i,
-                        pool_streams[(i * max_segments + 1) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()) */);
-        }
-        if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) {
-          device_reduce(col_comm,
-                        major_buffer_first + (*segment_offsets)[1],
-                        vertex_value_output_first + (*segment_offsets)[1],
-                        (*segment_offsets)[2] - (*segment_offsets)[1],
-                        raft::comms::op_t::SUM,
-                        i,
-                        pool_streams[(i * max_segments + 2) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()) */);
-        }
-        if ((*segment_offsets)[1] > 0) {
-          device_reduce(col_comm,
-                        major_buffer_first,
-                        vertex_value_output_first,
-                        (*segment_offsets)[1],
-                        raft::comms::op_t::SUM,
-                        i,
-                        pool_streams[(i * max_segments + 3) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()) */);
-        }
-      } else {
-        device_reduce(col_comm,
-                      major_buffer_first,
-                      vertex_value_output_first,
-                      matrix_partition.get_major_size(),
-                      raft::comms::op_t::SUM,
-                      i,
-                      handle.get_stream());
-      }
-#endif
-    }
-    ncclGroupEnd();  // SIMPLE
-  }
-
-#if 1  // SIMPLE
-  handle.sync_stream();  // SIMPLE
-#else
-#if 1  // FIXME: for temporary testing
-  for (size_t i = 0; i < pool_streams.size(); ++i) {
-    CUDA_TRY(cudaStreamSynchronize(pool_streams[i]));
-  }
-#else
-  handle.sync_stream_pool(*stream_pool_indices);
-#endif
-#endif
-#if 1  // FIXME: delete
-  auto core_time3 = std::chrono::steady_clock::now();
-  if constexpr (GraphViewType::is_multi_gpu) { handle.get_comms().barrier(); }
-  auto core_time4 = std::chrono::steady_clock::now();
-  std::chrono::duration<double> elapsed_total = core_time4 - core_time0;
-  std::chrono::duration<double> elapsed0 = core_time1 - core_time0;
-  std::chrono::duration<double> elapsed1 = core_time2 - core_time1;
-  std::chrono::duration<double> elapsed2 = core_time3 - core_time2;
-  std::chrono::duration<double> elapsed3 = core_time4 - core_time3;
-  std::cout << "copy_v core took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << ") ms." << std::endl;
-#endif
-    }
-  } else {
-    CUGRAPH_FAIL("should not be reached.");
-  }
-#else
   for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) {
     auto matrix_partition =
       matrix_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>(
         graph_view.get_matrix_partition_view(i));
@@ -964,7 +613,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
       }
     }
 
-    // FIXME: need to double check whether this leads to actual copy
    auto matrix_partition_row_value_input = adj_matrix_row_value_input;
    auto matrix_partition_col_value_input = adj_matrix_col_value_input;
    if constexpr (GraphViewType::is_adj_matrix_transposed) {
@@ -1000,13 +648,10 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
     // different segments; and 2) adding one more segment for very high degree vertices and
     // running segmented reduction
     if (matrix_partition.get_dcs_nzd_vertex_count()) {
-      auto exec_stream = stream_pool_indices
-                           ? rmm::cuda_stream_view{pool_streams[(i * max_segments) %
-                                                                (*stream_pool_indices).size()]}
-                           /* FIXME for temporary testing,
-                              handle.get_stream_from_stream_pool((i * max_segments) %
-                              (*stream_pool_indices).size()) */
-                           : handle.get_stream();
+      auto exec_stream =
+        stream_pool_indices
+          ? handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size())
+          : handle.get_stream();
       if constexpr (update_major) {  // this is necessary as we don't visit every vertex in the
                                      // hypersparse segment in
                                      // for_all_major_for_all_nbr_hypersparse
@@ -1034,10 +679,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
     }
     if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) {
       auto exec_stream = stream_pool_indices
-                           ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 1) %
-                                                                (*stream_pool_indices).size()]}
-                         /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i *
-                            max_segments + 1) % (*stream_pool_indices).size()) */
+                           ? handle.get_stream_from_stream_pool((i * max_segments + 1) %
+                                                                (*stream_pool_indices).size())
                            : handle.get_stream();
       raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2],
                                          detail::copy_v_transform_reduce_nbr_for_all_block_size,
                                          handle.get_device_properties().maxGridSize[0]);
@@ -1057,10 +700,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
     }
     if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) {
       auto exec_stream = stream_pool_indices
-                           ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 2) %
-                                                                (*stream_pool_indices).size()]}
-                         /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i *
-                            max_segments + 2) % (*stream_pool_indices).size()) */
+                           ? handle.get_stream_from_stream_pool((i * max_segments + 2) %
+                                                                (*stream_pool_indices).size())
                            : handle.get_stream();
       raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1],
                                        detail::copy_v_transform_reduce_nbr_for_all_block_size,
                                        handle.get_device_properties().maxGridSize[0]);
@@ -1080,10 +721,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
     }
     if ((*segment_offsets)[1] > 0) {
       auto exec_stream = stream_pool_indices
-                           ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 3) %
-                                                                (*stream_pool_indices).size()]}
-                         /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i *
-                            max_segments + 3) % (*stream_pool_indices).size()) */
+                           ? handle.get_stream_from_stream_pool((i * max_segments + 3) %
+                                                                (*stream_pool_indices).size())
                            : handle.get_stream();
       raft::grid_1d_block_t update_grid((*segment_offsets)[1],
                                         detail::copy_v_transform_reduce_nbr_for_all_block_size,
                                         handle.get_device_properties().maxGridSize[0]);
@@ -1128,25 +767,44 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
 
       if (segment_offsets && stream_pool_indices) {
         if ((*segment_offsets).back() - (*segment_offsets)[3] > 0) {
+          device_reduce(
+            col_comm,
+            major_buffer_first + (*segment_offsets)[3],
+            vertex_value_output_first + (*segment_offsets)[3],
+            (*segment_offsets).back() - (*segment_offsets)[3],
+            raft::comms::op_t::SUM,
+            i,
+            handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()));
+        }
+        if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) {
           device_reduce(col_comm,
-                        major_buffer_first + (*segment_offsets)[3],
-                        vertex_value_output_first + (*segment_offsets)[3],
-                        (*segment_offsets).back() - (*segment_offsets)[3],
+                        major_buffer_first + (*segment_offsets)[2],
+                        vertex_value_output_first + (*segment_offsets)[2],
+                        (*segment_offsets)[3] - (*segment_offsets)[2],
                         raft::comms::op_t::SUM,
                         i,
-                        pool_streams[(i * max_segments) % (*stream_pool_indices).size()]/* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) */);
-        }
-        if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) {
-          device_reduce(
-            col_comm, major_buffer_first + (*segment_offsets)[2], vertex_value_output_first + (*segment_offsets)[2], (*segment_offsets)[3] - (*segment_offsets)[2], raft::comms::op_t::SUM, i, pool_streams[(i * max_segments + 1) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()) */);
+                        handle.get_stream_from_stream_pool((i * max_segments + 1) %
+                                                           (*stream_pool_indices).size()));
         }
         if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) {
-          device_reduce(
-            col_comm, major_buffer_first + (*segment_offsets)[1], vertex_value_output_first + (*segment_offsets)[1], (*segment_offsets)[2] - (*segment_offsets)[1], raft::comms::op_t::SUM, i, pool_streams[(i * max_segments + 2) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()) */);
+          device_reduce(col_comm,
+                        major_buffer_first + (*segment_offsets)[1],
+                        vertex_value_output_first + (*segment_offsets)[1],
+                        (*segment_offsets)[2] - (*segment_offsets)[1],
+                        raft::comms::op_t::SUM,
+                        i,
+                        handle.get_stream_from_stream_pool((i * max_segments + 2) %
+                                                           (*stream_pool_indices).size()));
         }
         if ((*segment_offsets)[1] > 0) {
-          device_reduce(
-            col_comm, major_buffer_first, vertex_value_output_first, (*segment_offsets)[1], raft::comms::op_t::SUM, i, pool_streams[(i * max_segments + 3) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()) */);
+          device_reduce(col_comm,
+                        major_buffer_first,
+                        vertex_value_output_first,
+                        (*segment_offsets)[1],
+                        raft::comms::op_t::SUM,
+                        i,
+                        handle.get_stream_from_stream_pool((i * max_segments + 3) %
+                                                           (*stream_pool_indices).size()));
        }
      } else {
        device_reduce(col_comm,
                      major_buffer_first,
                      vertex_value_output_first,
                      matrix_partition.get_major_size(),
                      raft::comms::op_t::SUM,
                      i,
                      handle.get_stream());
      }
    }
  }
-#endif
-  if (stream_pool_indices) {
-#if 1  // FIXME: for temporary testing
-    for (size_t i = 0; i < pool_streams.size(); ++i) {
-      CUDA_TRY(cudaStreamSynchronize(pool_streams[i]));
-    }
-#else
-    handle.sync_stream_pool(*stream_pool_indices);
-#endif
-  }
+  if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); }
 
   if constexpr (GraphViewType::is_multi_gpu && !update_major) {
-#if 1  // FIXME: delete
-    handle.sync_stream();
-    std::cout << "minor reduction" << std::endl;
-#endif
     auto& comm           = handle.get_comms();
     auto const comm_rank = comm.get_rank();
     auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
@@ -1237,14 +882,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
       }
     }
   }
-// FIXME: for temporary testing
-#if 1
-  if (stream_pool_indices) {
-    for (size_t i = 0; i < pool_streams.size(); ++i) {
-      CUDA_TRY(cudaStreamDestroy(pool_streams[i]));
-    }
-  }
-#endif
 }
 
 }  // namespace detail
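[Note] The refactor above replaces the hand-rolled pool_streams array with raft's stream-pool API. A minimal sketch of the resulting pattern, assuming a handle constructed with an rmm::cuda_stream_pool; the helper name launch_per_segment_work and the loop bounds are illustrative, not taken from the patch:

#include <raft/handle.hpp>
#include <rmm/cuda_stream_pool.hpp>
#include <rmm/cuda_stream_view.hpp>

// Round-robin per-segment work over the handle's stream pool, then join.
// Only get_stream_from_stream_pool/get_stream_pool_size/sync_stream_pool
// are from the diff; everything else here is a hypothetical scaffold.
void launch_per_segment_work(raft::handle_t const& handle,
                             size_t num_partitions,
                             size_t max_segments)
{
  for (size_t i = 0; i < num_partitions; ++i) {
    for (size_t s = 0; s < max_segments; ++s) {
      // same indexing scheme as the diff: (i * max_segments + s) % pool size
      rmm::cuda_stream_view stream =
        handle.get_stream_pool_size() > 0
          ? handle.get_stream_from_stream_pool((i * max_segments + s) %
                                               handle.get_stream_pool_size())
          : handle.get_stream();
      (void)stream;  // enqueue this segment's kernel and/or device_reduce here
    }
  }
  handle.sync_stream_pool();  // wait on every pool stream before consuming results
}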
From 99401f3103fea9d310e6d394b90f76ea489ce18c Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Tue, 8 Mar 2022 17:09:06 -0800
Subject: [PATCH 57/60] remove temporary experimental code

---
 cpp/src/link_analysis/pagerank_impl.cuh      | 64 ++-----------------
 cpp/tests/link_analysis/mg_pagerank_test.cpp | 63 ++-----------------
 2 files changed, 4 insertions(+), 123 deletions(-)

diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh
index eec2126aea7..d33a7e97f82 100644
--- a/cpp/src/link_analysis/pagerank_impl.cuh
+++ b/cpp/src/link_analysis/pagerank_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -55,12 +55,6 @@ void pagerank(
   bool has_initial_guess,
   bool do_expensive_check)
 {
-#if 1  // FIXME: delete
-size_t free_size{};
-size_t total_size{};
-CUDA_TRY(cudaMemGetInfo(&free_size, &total_size));
-std::cout << "PageRank start free_size=" << static_cast<double>(free_size) / (1024.0 * 1024.0 * 1024.0) << " GB total_size=" << static_cast<double>(total_size) / (1024.0 * 1024.0 * 1024.0) << " GB." << std::endl;
-#endif
   using vertex_t = typename GraphViewType::vertex_type;
   using weight_t = typename GraphViewType::weight_type;
@@ -84,9 +78,6 @@ std::cout << "PageRank start free_size=" << static_cast<double>(free_size) / (10
                                                  : vertex_t{0};
 
   // 1. check input arguments
-#if 1  // FIXME: delete
-handle.sync_stream(); std::cout << "PageRank check inputs" << std::endl;
-#endif
 
   CUGRAPH_EXPECTS((personalization_vertices.has_value() == false) ||
                     (personalization_values.has_value() && personalization_vector_size.has_value()),
@@ -151,9 +142,6 @@ handle.sync_stream(); std::cout << "PageRank check inputs" << std::endl;
   }
 
   // 2. compute the sums of the out-going edge weights (if not provided)
-#if 1  // FIXME: delete
-handle.sync_stream(); std::cout << "PageRank compute out_weight_sums" << std::endl;
-#endif
 
   auto tmp_vertex_out_weight_sums =
     precomputed_vertex_out_weight_sums
      ? std::nullopt
@@ -164,9 +152,6 @@ handle.sync_stream(); std::cout << "PageRank compute out_weight_sums" << std::en
       : (*tmp_vertex_out_weight_sums).data();
 
   // 3. initialize pagerank values
-#if 1  // FIXME: delete
-handle.sync_stream(); std::cout << "PageRank initialize PageRank values" << std::endl;
-#endif
 
   if (has_initial_guess) {
     auto sum = reduce_v(handle, pull_graph_view, pageranks, result_t{0.0});
@@ -186,9 +171,6 @@ handle.sync_stream(); std::cout << "PageRank initialize PageRank values" << std:
   }
 
   // 4. sum the personalization values
-#if 1  // FIXME: delete
-handle.sync_stream(); std::cout << "PageRank sum personalization values" << std::endl;
-#endif
 
   result_t personalization_sum{0.0};
   if (aggregate_personalization_vector_size > 0) {
@@ -203,9 +185,6 @@ handle.sync_stream(); std::cout << "PageRank sum personalization values" << std:
   }
 
   // 5. pagerank iteration
-#if 1  // FIXME: delete
-handle.sync_stream(); std::cout << "PageRank iteration" << std::endl;
-#endif
 
   // old PageRank values
   rmm::device_uvector<result_t> old_pageranks(pull_graph_view.get_number_of_local_vertices(),
                                               handle.get_stream());
   row_properties_t<GraphViewType, result_t> adj_matrix_row_pageranks(handle, pull_graph_view);
   size_t iter{0};
   while (true) {
-#if 1  // FIXME: delete
-    handle.sync_stream();
-    if constexpr (GraphViewType::is_multi_gpu) {
-      handle.get_comms().barrier();
-    }
-    std::cout << "PageRank iteration " << iter << " start" << std::endl;
-    auto time0 = std::chrono::steady_clock::now();
-#endif
     thrust::copy(handle.get_thrust_policy(),
                  pageranks,
                  pageranks + pull_graph_view.get_number_of_local_vertices(),
@@ -252,16 +223,8 @@ handle.sync_stream(); std::cout << "PageRank iteration" << std::endl;
                             return pagerank / divisor;
                           });
-#if 1  // FIXME: delete
-    handle.sync_stream();
-    auto time1 = std::chrono::steady_clock::now();
-#endif
 
     copy_to_adj_matrix_row(handle, pull_graph_view, pageranks, adj_matrix_row_pageranks);
-#if 1  // FIXME: delete
-    handle.sync_stream();
-    auto time2 = std::chrono::steady_clock::now();
-#endif
 
     auto unvarying_part = aggregate_personalization_vector_size == 0
                             ? (dangling_sum * alpha + static_cast<result_t>(1.0 - alpha)) /
                                 static_cast<result_t>(num_vertices)
@@ -278,10 +241,6 @@ handle.sync_stream(); std::cout << "PageRank iteration" << std::endl;
       unvarying_part,
       pageranks);
-#if 1  // FIXME: delete
-    handle.sync_stream();
-    auto time3 = std::chrono::steady_clock::now();
-#endif
 
     if (aggregate_personalization_vector_size > 0) {
       auto vertex_partition = vertex_partition_device_view_t<vertex_t, GraphViewType::is_multi_gpu>(
        pull_graph_view.get_vertex_partition_view());
@@ -301,15 +260,6 @@ handle.sync_stream(); std::cout << "PageRank iteration" << std::endl;
        });
    }
 
-#if 1  // FIXME: delete
-    handle.sync_stream();
-    auto time4 = std::chrono::steady_clock::now();
-    if constexpr (GraphViewType::is_multi_gpu) {
-      handle.get_comms().barrier();
-    }
-    handle.sync_stream();
-    auto time5 = std::chrono::steady_clock::now();
-#endif
    auto diff_sum = transform_reduce_v(
      handle,
      pull_graph_view,
@@ -317,18 +267,6 @@ handle.sync_stream(); std::cout << "PageRank iteration" << std::endl;
      [] __device__(auto val) { return std::abs(thrust::get<0>(val) - thrust::get<1>(val)); },
      result_t{0.0});
 
-#if 1  // FIXME: delete
-    handle.sync_stream();
-    auto time6 = std::chrono::steady_clock::now();
-    std::chrono::duration<double> elapsed_total = time6 - time0;
-    std::chrono::duration<double> elapsed0 = time1 - time0;
-    std::chrono::duration<double> elapsed1 = time2 - time1;
-    std::chrono::duration<double> elapsed2 = time3 - time2;
-    std::chrono::duration<double> elapsed3 = time4 - time3;
-    std::chrono::duration<double> elapsed4 = time5 - time4;
-    std::chrono::duration<double> elapsed5 = time6 - time5;
-    std::cout << "PageRank iter " << iter << " took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << ") ms."
-              << std::endl;
-#endif
     iter++;
 
     if (diff_sum < epsilon) {
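[Note] For reference, the iteration body whose instrumentation is deleted above implements the usual damped PageRank sweep. With damping factor alpha, vertex count |V|, per-vertex out-weight sums W(u), and dangling mass d_t, each sweep computes (a reconstruction from the surrounding context lines, e.g. unvarying_part = (dangling_sum * alpha + (1 - alpha)) / num_vertices; personalization terms omitted):

\[
  \mathrm{PR}_{t+1}(v) = \alpha \sum_{(u,v) \in E} \frac{\mathrm{PR}_t(u)}{W(u)}
  + \frac{\alpha\, d_t + (1-\alpha)}{|V|},
  \qquad d_t = \sum_{u :\, W(u) = 0} \mathrm{PR}_t(u),
\]

and the loop exits once \(\sum_v |\mathrm{PR}_{t+1}(v) - \mathrm{PR}_t(v)| < \epsilon\), matching the diff_sum check in the diff.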
diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp
index 9b3876b4e5d..1b1965582fe 100644
--- a/cpp/tests/link_analysis/mg_pagerank_test.cpp
+++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp
@@ -64,76 +64,19 @@ class Tests_MGPageRank
     auto constexpr pool_size = 64;  // FIXME: tuning parameter
     raft::handle_t handle(rmm::cuda_stream_per_thread, std::make_shared<rmm::cuda_stream_pool>(pool_size));
     HighResClock hr_clock{};
-#if 1  // FIXME: delete
-    auto time0 = std::chrono::steady_clock::now();
-#endif
 
     raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
     auto& comm           = handle.get_comms();
     auto const comm_size = comm.get_size();
     auto const comm_rank = comm.get_rank();
 
-    int row_comm_size{};
-    int num_gpus_per_node{};
-    RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node));
-    if (comm_size > num_gpus_per_node) {  // multi-node, inter-node communication bandwidth
-                                          // (Infinniband) is more likely to be a bottleneck than
-                                          // intra-node (NVLink) communication bandwidth
-      CUGRAPH_EXPECTS((comm_size % num_gpus_per_node) == 0,
-                      "Invalid MPI configuration: in multi-node execution, # MPI processes should "
-                      "be a multiple of the number of GPUs per node.");
-#if 1
-      auto col_comm_size = static_cast<int>(sqrt(static_cast<double>(comm_size)));
-      while (comm_size % col_comm_size != 0) {
-        --col_comm_size;
-      }
-      row_comm_size = comm_size / col_comm_size;
-#else
-      auto num_nodes = comm_size / num_gpus_per_node;
-      row_comm_size  = static_cast<int>(sqrt(static_cast<double>(num_nodes)));
-      while (num_nodes % row_comm_size != 0) {
-        --row_comm_size;
-      }
-      row_comm_size *= num_gpus_per_node;
-#endif
-    } else {
-      row_comm_size = static_cast<int>(sqrt(static_cast<double>(comm_size)));
-      while (comm_size % row_comm_size != 0) {
-        --row_comm_size;
-      }
+    auto row_comm_size = static_cast<int>(sqrt(static_cast<double>(comm_size)));
+    while (comm_size % row_comm_size != 0) {
+      --row_comm_size;
     }
     cugraph::partition_2d::subcomm_factory_t<cugraph::partition_2d::key_naming_t, vertex_t> subcomm_factory(handle, row_comm_size);
-#if 1  // FIXME: delete
-    {
-      rmm::device_uvector<int32_t> tx_ints(comm_size, handle.get_stream());
-      rmm::device_uvector<int32_t> rx_ints(comm_size, handle.get_stream());
-      std::vector<size_t> tx_sizes(comm_size, size_t{1});
-      std::vector<size_t> tx_offsets(comm_size);
-      std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0});
-      std::vector<int32_t> tx_ranks(comm_size);
-      std::iota(tx_ranks.begin(), tx_ranks.end(), int32_t{0});
-      auto rx_sizes   = tx_sizes;
-      auto rx_offsets = tx_offsets;
-      auto rx_ranks   = tx_ranks;
-      handle.get_comms().device_multicast_sendrecv(tx_ints.data(),
-                                                   tx_sizes,
-                                                   tx_offsets,
-                                                   tx_ranks,
-                                                   rx_ints.data(),
-                                                   rx_sizes,
-                                                   rx_offsets,
-                                                   rx_ranks,
-                                                   handle.get_stream());
-      handle.sync_stream();
-    }
-    auto time1 = std::chrono::steady_clock::now();
-    std::chrono::duration<double> elapsed = time1 - time0;
-    std::cout << "Handle initialization and 1st all-to-all (comm_size=" << comm_size
-              << ", row_comm_size=" << row_comm_size << ") took " << elapsed.count() * 1e3 << " ms."
-              << std::endl;
-#endif
 
     // 2. create MG graph
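[Note] The retained code path above sizes the 2D process grid by taking the largest divisor of comm_size that is no greater than its square root. A self-contained sketch of that search; compute_row_comm_size is a hypothetical helper, not part of the test:

#include <cassert>
#include <cmath>

// Largest divisor of comm_size that is <= sqrt(comm_size), as in the kept code path.
// The column dimension is then comm_size / row_comm_size.
int compute_row_comm_size(int comm_size)
{
  assert(comm_size > 0);
  auto row_comm_size = static_cast<int>(std::sqrt(static_cast<double>(comm_size)));
  while (comm_size % row_comm_size != 0) {
    --row_comm_size;
  }
  return row_comm_size;  // e.g., 16 GPUs -> 4 (4x4 grid), 8 GPUs -> 2 (2x4 grid)
}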
From 0664216fbfdad51dcd6d06ed77e391a72fbb4fdd Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Tue, 8 Mar 2022 17:09:50 -0800
Subject: [PATCH 58/60] fix formatting error

---
 cpp/tests/link_analysis/mg_pagerank_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp
index 1b1965582fe..bb03adf6553 100644
--- a/cpp/tests/link_analysis/mg_pagerank_test.cpp
+++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp
@@ -70,7 +70,7 @@ class Tests_MGPageRank
     auto const comm_size = comm.get_size();
     auto const comm_rank = comm.get_rank();
 
-    auto row_comm_size = static_cast(sqrt(static_cast(comm_size)));
+    auto row_comm_size = static_cast(sqrt(static_cast(comm_size)));
     while (comm_size % row_comm_size != 0) {
       --row_comm_size;
     }

From c136d3a627422264a46209b68da36233adeffd41 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Tue, 8 Mar 2022 17:11:04 -0800
Subject: [PATCH 59/60] undo copyright update for the file with no code changes

---
 cpp/src/link_analysis/pagerank_impl.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh
index d33a7e97f82..b6023d21bf2 100644
--- a/cpp/src/link_analysis/pagerank_impl.cuh
+++ b/cpp/src/link_analysis/pagerank_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From a4f6528b27e0c710d1c137e9bbcb3a7ac3e69421 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Tue, 8 Mar 2022 17:14:48 -0800
Subject: [PATCH 60/60] clang-format & copyright year

---
 cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh | 2 +-
 .../cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh | 2 +-
 cpp/tests/link_analysis/mg_pagerank_test.cpp             | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh
index f93ee1a6ae5..1d1b3810a53 100644
--- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh
+++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
index ad76ee1fd67..d6f2e9f7a34 100644
--- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
+++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp
index bb03adf6553..1f199668d6f 100644
--- a/cpp/tests/link_analysis/mg_pagerank_test.cpp
+++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp
@@ -62,7 +62,8 @@ class Tests_MGPageRank
     // 1. initialize handle
 
     auto constexpr pool_size = 64;  // FIXME: tuning parameter
-    raft::handle_t handle(rmm::cuda_stream_per_thread, std::make_shared<rmm::cuda_stream_pool>(pool_size));
+    raft::handle_t handle(rmm::cuda_stream_per_thread,
+                          std::make_shared<rmm::cuda_stream_pool>(pool_size));
     HighResClock hr_clock{};
 
     raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
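[Note] For background on the temporary mechanism this series started with and later removed (the prioritized pool_streams created via cudaStreamCreateWithPriority), a standalone sketch that queries the device's valid priority range instead of hard-coding values like -5/-3/0; the helper names are illustrative, not the patch's code:

#include <cuda_runtime.h>
#include <vector>

// Create a small pool of non-blocking streams at graded priorities.
// Numerically smaller values mean higher priority; the range is device-dependent.
std::vector<cudaStream_t> create_prioritized_streams(size_t n)
{
  int least{}, greatest{};
  cudaDeviceGetStreamPriorityRange(&least, &greatest);
  std::vector<cudaStream_t> streams(n);
  for (size_t i = 0; i < n; ++i) {
    int priority = (i < n / 2) ? greatest : least;  // first half at highest priority
    cudaStreamCreateWithPriority(&streams[i], cudaStreamNonBlocking, priority);
  }
  return streams;
}

void destroy_streams(std::vector<cudaStream_t>& streams)
{
  for (auto s : streams) {
    cudaStreamSynchronize(s);  // drain before destroy, as the patch's sync loop did
    cudaStreamDestroy(s);
  }
}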