From d57326b2295ed5cc2ee8a511dfae416e4f1d766c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 2 Dec 2024 16:37:47 -0800 Subject: [PATCH 01/21] code cosmetics --- ...r_v_random_select_transform_outgoing_e.cuh | 125 +++++++++--------- 1 file changed, 61 insertions(+), 64 deletions(-) diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 30706632ad2..e3d7f7fb98a 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -216,8 +216,7 @@ template -std::tuple>, - decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> +std::tuple>, dataframe_buffer_type_t> per_v_random_select_transform_e(raft::handle_t const& handle, GraphViewType const& graph_view, KeyBucketType const& key_list, @@ -594,7 +593,10 @@ per_v_random_select_transform_e(raft::handle_t const& handle, } // namespace detail /** - * @brief Randomly select and transform the input (tagged-)vertices' outgoing edges with biases. + * @brief Randomly select and transform the input (tagged-)vertices' outgoing edges. + * + * This function assumes that every outgoing edge of a given vertex has the same odds of being selected + * (uniform neighbor sampling). * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. - * @tparam EdgeBiasOp Type of the quinary edge operator to set-up selection bias - * values. * @tparam EdgeOp Type of the quinary edge operator. 
* @tparam T Type of the selected and transformed edge output values. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -625,12 +625,6 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not * access edge property values). - * @param e_bias_op Quinary operator takes (tagged-)edge source, edge destination, property values - * for the source, destination, and edge and returns a floating point bias value to be used in - * biased random selection. The return value should be non-negative. The bias value of 0 indicates - * that the corresponding edge cannot be selected. Assuming that the return value type is bias_t, - * the sum of the bias values for any seed vertex should not exceed - * std::numeric_limits::max(). * @param e_op Quinary operator takes (tagged-)edge source, edge destination, property values for * the source, destination, and edge and returns a value to be collected in the output. This * function is called only for the selected edges. 
@@ -652,24 +646,15 @@ per_v_random_select_transform_e(raft::handle_t const& handle, */ template -std::tuple>, - decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> +std::tuple>, dataframe_buffer_type_t> per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, KeyBucketType const& key_list, - EdgeBiasSrcValueInputWrapper edge_bias_src_value_input, - EdgeBiasDstValueInputWrapper edge_bias_dst_value_input, - EdgeBiasValueInputWrapper edge_bias_value_input, - EdgeBiasOp e_bias_op, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -680,29 +665,31 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, std::optional invalid_value, bool do_expensive_check = false) { - return detail::per_v_random_select_transform_e(handle, - graph_view, - key_list, - edge_bias_src_value_input, - edge_bias_dst_value_input, - edge_bias_value_input, - e_bias_op, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - rng_state, - K, - with_replacement, - invalid_value, - do_expensive_check); + return detail::per_v_random_select_transform_e( + handle, + graph_view, + key_list, + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + detail::constant_e_bias_op_t{}, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + rng_state, + K, + with_replacement, + invalid_value, + do_expensive_check); } /** - * @brief Randomly select and transform the input (tagged-)vertices' outgoing edges. - * - * This function assumes that every outgoing edge of a given vertex has the same odd to be selected - * (uniform neighbor sampling). + * @brief Randomly select and transform the input (tagged-)vertices' outgoing edges with biases. * * @tparam GraphViewType Type of the passed non-owning graph object. 
* @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex @@ -710,6 +697,8 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeBiasOp Type of the quinary edge operator to set-up selection bias + * values. * @tparam EdgeOp Type of the quinary edge operator. * @tparam T Type of the selected and transformed edge output values. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -731,6 +720,12 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not * access edge property values). + * @param e_bias_op Quinary operator takes (tagged-)edge source, edge destination, property values + * for the source, destination, and edge and returns a floating point bias value to be used in + * biased random selection. The return value should be non-negative. The bias value of 0 indicates + * that the corresponding edge cannot be selected. Assuming that the return value type is bias_t, + * the sum of the bias values for any seed vertex should not exceed + * std::numeric_limits::max(). * @param e_op Quinary operator takes (tagged-)edge source, edge destination, property values for * the source, destination, and edge and returns a value to be collected in the output. This * function is called only for the selected edges. 
@@ -752,16 +747,23 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, */ template -std::tuple>, - decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> +std::tuple>, dataframe_buffer_type_t> per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, KeyBucketType const& key_list, + EdgeBiasSrcValueInputWrapper edge_bias_src_value_input, + EdgeBiasDstValueInputWrapper edge_bias_dst_value_input, + EdgeBiasValueInputWrapper edge_bias_value_input, + EdgeBiasOp e_bias_op, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -772,27 +774,22 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, std::optional invalid_value, bool do_expensive_check = false) { - return detail::per_v_random_select_transform_e( - handle, - graph_view, - key_list, - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - detail::constant_e_bias_op_t{}, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - rng_state, - K, - with_replacement, - invalid_value, - do_expensive_check); + return detail::per_v_random_select_transform_e(handle, + graph_view, + key_list, + edge_bias_src_value_input, + edge_bias_dst_value_input, + edge_bias_value_input, + e_bias_op, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + rng_state, + K, + with_replacement, + invalid_value, + do_expensive_check); } } // namespace cugraph From b264ae29ecf12006d56364b770d399484217ce2a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 11 Dec 2024 16:18:49 -0800 Subject: [PATCH 02/21] code refactor --- .../sample_and_compute_local_nbr_indices.cuh | 855 ++++++++++-------- 1 file changed, 453 insertions(+), 402 deletions(-) diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh 
b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index dd0da77851b..6dd4affb9b1 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -1460,400 +1460,146 @@ shuffle_and_compute_local_nbr_values(raft::handle_t const& handle, std::move(local_frontier_sample_offsets)); } -// skip conversion if local neighbor index is cugraph::invalid_edge_id_v -template -rmm::device_uvector convert_to_unmasked_local_nbr_idx( - raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexIterator aggregate_local_frontier_major_first, - rmm::device_uvector&& local_nbr_indices, - std::optional> key_indices, - std::vector const& local_frontier_sample_offsets, - std::vector const& local_frontier_displacements, - std::vector const& local_frontier_sizes, - size_t K) -{ - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - static_assert( - std::is_same_v::value_type>); - - auto edge_mask_view = graph_view.edge_mask_view(); - - auto [aggregate_local_frontier_unique_majors, - aggregate_local_frontier_major_idx_to_unique_major_idx, - local_frontier_unique_major_displacements, - local_frontier_unique_major_sizes] = - compute_unique_keys(handle, - aggregate_local_frontier_major_first, - local_frontier_displacements, - local_frontier_sizes); - - // to avoid searching the entire neighbor list K times for high degree vertices with edge masking - auto local_frontier_unique_major_valid_local_nbr_count_inclusive_sums = - compute_valid_local_nbr_count_inclusive_sums(handle, - graph_view, - aggregate_local_frontier_unique_majors.begin(), - local_frontier_unique_major_displacements, - local_frontier_unique_major_sizes); - - auto sample_major_idx_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - cuda::proclaim_return_type( - [K, - key_indices = key_indices ? 
thrust::make_optional>( - (*key_indices).data(), (*key_indices).size()) - : thrust::nullopt] __device__(size_t i) { - return key_indices ? (*key_indices)[i] : i / K; - })); - auto pair_first = thrust::make_zip_iterator(local_nbr_indices.begin(), sample_major_idx_first); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_partition_e_mask = - edge_mask_view - ? thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - auto edge_partition_frontier_major_first = - aggregate_local_frontier_major_first + local_frontier_displacements[i]; - thrust::transform_if( - handle.get_thrust_policy(), - pair_first + local_frontier_sample_offsets[i], - pair_first + local_frontier_sample_offsets[i + 1], - local_nbr_indices.begin() + local_frontier_sample_offsets[i], - local_nbr_indices.begin() + local_frontier_sample_offsets[i], - find_nth_valid_nbr_idx_t{ - edge_partition, - edge_partition_e_mask, - edge_partition_frontier_major_first, - raft::device_span( - aggregate_local_frontier_major_idx_to_unique_major_idx.data() + - local_frontier_displacements[i], - local_frontier_sizes[i]), - thrust::make_tuple( - raft::device_span( - std::get<0>(local_frontier_unique_major_valid_local_nbr_count_inclusive_sums[i]).data(), - std::get<0>(local_frontier_unique_major_valid_local_nbr_count_inclusive_sums[i]) - .size()), - raft::device_span( - std::get<1>(local_frontier_unique_major_valid_local_nbr_count_inclusive_sums[i]).data(), - std::get<1>(local_frontier_unique_major_valid_local_nbr_count_inclusive_sums[i]) - .size()))}, - is_not_equal_t{cugraph::invalid_edge_id_v}); - } - - return std::move(local_nbr_indices); -} - -template -std::tuple, - std::optional>, - std::vector> -uniform_sample_and_compute_local_nbr_indices( +template +std::tuple /* local_nbr_indices */, + std::optional> /* 
key_indices */, + std::vector /* local_frontier_sample_offsets */> +biased_sample( raft::handle_t const& handle, - GraphViewType const& graph_view, - KeyIterator aggregate_local_frontier_key_first, std::vector const& local_frontier_displacements, std::vector const& local_frontier_sizes, + raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, + raft::host_span local_frontier_unique_key_displacements, + raft::host_span local_frontier_unique_key_sizes, + raft::device_span aggregate_local_frontier_unique_key_biases, + raft::device_span aggregate_local_frontier_unique_key_local_degree_offsets, raft::random::RngState& rng_state, size_t K, bool with_replacement) { - using edge_t = typename GraphViewType::edge_type; - using vertex_t = typename GraphViewType::vertex_type; - using key_t = typename thrust::iterator_traits::value_type; - + int minor_comm_rank{0}; int minor_comm_size{1}; - if constexpr (GraphViewType::is_multi_gpu) { + if constexpr (multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + minor_comm_rank = minor_comm.get_rank(); minor_comm_size = minor_comm.get_size(); } - auto aggregate_local_frontier_major_first = - thrust_tuple_get_or_identity(aggregate_local_frontier_key_first); - - auto edge_mask_view = graph_view.edge_mask_view(); + auto num_local_edge_partitions = local_frontier_unique_key_displacements.size(); - // 1. 
compute degrees + rmm::device_uvector local_nbr_indices(0, handle.get_stream()); + std::optional> key_indices{std::nullopt}; + std::vector local_frontier_sample_offsets{}; + if (with_replacement) { + // compute segmented inclusive sums (one segment per seed) + auto unique_key_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [offsets = raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size())] __device__(size_t i) { + return static_cast(thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i))); + })); + rmm::device_uvector + aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums( + aggregate_local_frontier_unique_key_biases.size(), handle.get_stream()); + thrust::inclusive_scan_by_key( + handle.get_thrust_policy(), + unique_key_first, + unique_key_first + aggregate_local_frontier_unique_key_biases.size(), + aggregate_local_frontier_unique_key_biases.begin(), + aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.begin()); - rmm::device_uvector frontier_degrees(0, handle.get_stream()); - std::optional> frontier_partitioned_local_degree_displacements{ - std::nullopt}; - { - auto aggregate_local_frontier_local_degrees = - compute_aggregate_local_frontier_local_degrees(handle, - graph_view, - aggregate_local_frontier_major_first, - local_frontier_displacements, - local_frontier_sizes); + auto aggregate_local_frontier_bias_local_sums = rmm::device_uvector( + local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); + for (size_t i = 0; i < num_local_edge_partitions; ++i) { + thrust::tabulate( + handle.get_thrust_policy(), + get_dataframe_buffer_begin(aggregate_local_frontier_bias_local_sums) + + local_frontier_displacements[i], + 
get_dataframe_buffer_begin(aggregate_local_frontier_bias_local_sums) + + local_frontier_displacements[i] + local_frontier_sizes[i], + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + + local_frontier_displacements[i], + local_frontier_sizes[i]), + unique_key_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data() + + local_frontier_unique_key_displacements[i], + local_frontier_unique_key_sizes[i] + 1), + aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums = + raft::device_span( + aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.data(), + aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums + .size())] __device__(size_t i) { + auto unique_key_idx = key_idx_to_unique_key_idx[i]; + auto degree = unique_key_local_degree_offsets[unique_key_idx + 1] - + unique_key_local_degree_offsets[unique_key_idx]; + if (degree > 0) { + return aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums + [unique_key_local_degree_offsets[unique_key_idx] + degree - 1]; + } else { + return bias_t{0.0}; + } + }); + } + rmm::device_uvector frontier_bias_sums(0, handle.get_stream()); + std::optional> frontier_partitioned_bias_local_sum_displacements{ + std::nullopt}; if (minor_comm_size > 1) { - std::tie(frontier_degrees, frontier_partitioned_local_degree_displacements) = + std::tie(frontier_bias_sums, frontier_partitioned_bias_local_sum_displacements) = compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( handle, - raft::device_span(aggregate_local_frontier_local_degrees.data(), - aggregate_local_frontier_local_degrees.size()), + raft::device_span(aggregate_local_frontier_bias_local_sums.data(), + aggregate_local_frontier_bias_local_sums.size()), local_frontier_displacements, local_frontier_sizes); - aggregate_local_frontier_local_degrees.resize(0, handle.get_stream()); - 
aggregate_local_frontier_local_degrees.shrink_to_fit(handle.get_stream()); + aggregate_local_frontier_bias_local_sums.resize(0, handle.get_stream()); + aggregate_local_frontier_bias_local_sums.shrink_to_fit(handle.get_stream()); } else { - frontier_degrees = std::move(aggregate_local_frontier_local_degrees); + frontier_bias_sums = std::move(aggregate_local_frontier_bias_local_sums); } - } - // 2. sample neighbor indices - - rmm::device_uvector nbr_indices(0, handle.get_stream()); - - if (with_replacement) { - if (frontier_degrees.size() > 0) { - nbr_indices.resize(frontier_degrees.size() * K, handle.get_stream()); - cugraph::legacy::ops::graph::get_sampling_index(nbr_indices.data(), - rng_state, - frontier_degrees.data(), - static_cast(frontier_degrees.size()), - static_cast(K), - with_replacement, + rmm::device_uvector sample_random_numbers(local_frontier_sizes[minor_comm_rank] * K, handle.get_stream()); - frontier_degrees.resize(0, handle.get_stream()); - frontier_degrees.shrink_to_fit(handle.get_stream()); - } - } else { - nbr_indices = compute_uniform_sampling_index_without_replacement( - handle, std::move(frontier_degrees), rng_state, K); - } - - // 3. shuffle neighbor indices - - auto [local_nbr_indices, key_indices, local_frontier_sample_offsets] = - shuffle_and_compute_local_nbr_values( - handle, - std::move(nbr_indices), - frontier_partitioned_local_degree_displacements - ? 
std::make_optional>( - (*frontier_partitioned_local_degree_displacements).data(), - (*frontier_partitioned_local_degree_displacements).size()) - : std::nullopt, - K, - cugraph::invalid_edge_id_v); + cugraph::detail::uniform_random_fill(handle.get_stream(), + sample_random_numbers.data(), + sample_random_numbers.size(), + bias_t{0.0}, + bias_t{1.0}, + rng_state); + thrust::transform( + handle.get_thrust_policy(), + sample_random_numbers.begin(), + sample_random_numbers.end(), + thrust::make_counting_iterator(size_t{0}), + sample_random_numbers.begin(), + [frontier_bias_sums = + raft::device_span(frontier_bias_sums.data(), frontier_bias_sums.size()), + K, + invalid_value = std::numeric_limits::infinity()] __device__(bias_t r, size_t i) { + // frontier_bias_sums[i / K] will be 0 if degree is 0 or all the edges have 0 bias + return frontier_bias_sums[i / K] > 0.0 ? r * frontier_bias_sums[i / K] : invalid_value; + }); - // 4. convert neighbor indices in the neighbor list considering edge mask to neighbor indices in - // the neighbor list ignoring edge mask - - if (edge_mask_view) { - local_nbr_indices = convert_to_unmasked_local_nbr_idx( - handle, - graph_view, - aggregate_local_frontier_major_first, - std::move(local_nbr_indices), - key_indices ? 
std::make_optional>((*key_indices).data(), - (*key_indices).size()) - : std::nullopt, - local_frontier_sample_offsets, - local_frontier_displacements, - local_frontier_sizes, - K); - } - - return std::make_tuple( - std::move(local_nbr_indices), std::move(key_indices), std::move(local_frontier_sample_offsets)); -} - -template -std::tuple, - std::optional>, - std::vector> -biased_sample_and_compute_local_nbr_indices( - raft::handle_t const& handle, - GraphViewType const& graph_view, - KeyIterator aggregate_local_frontier_key_first, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeBiasOp e_bias_op, - std::vector const& local_frontier_displacements, - std::vector const& local_frontier_sizes, - raft::random::RngState& rng_state, - size_t K, - bool with_replacement, - bool do_expensive_check /* check e_bias_op return values */) -{ - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using key_t = typename thrust::iterator_traits::value_type; - - using bias_t = typename edge_op_result_type::type; - - int minor_comm_rank{0}; - int minor_comm_size{1}; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - minor_comm_rank = minor_comm.get_rank(); - minor_comm_size = minor_comm.get_size(); - } - assert(minor_comm_size == graph_view.number_of_local_edge_partitions()); - - auto aggregate_local_frontier_major_first = - thrust_tuple_get_or_identity(aggregate_local_frontier_key_first); - - auto edge_mask_view = graph_view.edge_mask_view(); - - // 1. 
compute biases for unique keys (to reduce memory footprint) - - auto [aggregate_local_frontier_unique_keys, - aggregate_local_frontier_key_idx_to_unique_key_idx, - local_frontier_unique_key_displacements, - local_frontier_unique_key_sizes] = compute_unique_keys(handle, - aggregate_local_frontier_key_first, - local_frontier_displacements, - local_frontier_sizes); - - auto [aggregate_local_frontier_unique_key_biases, - aggregate_local_frontier_unique_key_local_degree_offsets] = - compute_aggregate_local_frontier_biases( - handle, - graph_view, - get_dataframe_buffer_begin(aggregate_local_frontier_unique_keys), - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_bias_op, - local_frontier_unique_key_displacements, - local_frontier_unique_key_sizes, - do_expensive_check); - - // 2. sample neighbor indices and shuffle neighbor indices - - rmm::device_uvector local_nbr_indices(0, handle.get_stream()); - std::optional> key_indices{std::nullopt}; - std::vector local_frontier_sample_offsets{}; - if (with_replacement) { - // compute segmented inclusive sums (one segment per seed) - auto unique_key_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - cuda::proclaim_return_type( - [offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data(), - aggregate_local_frontier_unique_key_local_degree_offsets.size())] __device__(size_t i) { - return static_cast(thrust::distance( - offsets.begin() + 1, - thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i))); - })); - thrust::inclusive_scan_by_key( - handle.get_thrust_policy(), - unique_key_first, - unique_key_first + aggregate_local_frontier_unique_key_biases.size(), - get_dataframe_buffer_begin(aggregate_local_frontier_unique_key_biases), - get_dataframe_buffer_begin(aggregate_local_frontier_unique_key_biases)); - - auto aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums = - 
std::move(aggregate_local_frontier_unique_key_biases); - - auto aggregate_local_frontier_bias_local_sums = rmm::device_uvector( - local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - thrust::tabulate( - handle.get_thrust_policy(), - get_dataframe_buffer_begin(aggregate_local_frontier_bias_local_sums) + - local_frontier_displacements[i], - get_dataframe_buffer_begin(aggregate_local_frontier_bias_local_sums) + - local_frontier_displacements[i] + local_frontier_sizes[i], - [key_idx_to_unique_key_idx = - raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - local_frontier_displacements[i], - local_frontier_sizes[i]), - unique_key_local_degree_offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_displacements[i], - local_frontier_unique_key_sizes[i] + 1), - aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums = - raft::device_span( - aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.data(), - aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums - .size())] __device__(size_t i) { - auto unique_key_idx = key_idx_to_unique_key_idx[i]; - auto degree = unique_key_local_degree_offsets[unique_key_idx + 1] - - unique_key_local_degree_offsets[unique_key_idx]; - if (degree > 0) { - return aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums - [unique_key_local_degree_offsets[unique_key_idx] + degree - 1]; - } else { - return bias_t{0.0}; - } - }); - } - - rmm::device_uvector frontier_bias_sums(0, handle.get_stream()); - std::optional> frontier_partitioned_bias_local_sum_displacements{ - std::nullopt}; - if (minor_comm_size > 1) { - std::tie(frontier_bias_sums, frontier_partitioned_bias_local_sum_displacements) = - compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( - 
handle, - raft::device_span(aggregate_local_frontier_bias_local_sums.data(), - aggregate_local_frontier_bias_local_sums.size()), - local_frontier_displacements, - local_frontier_sizes); - aggregate_local_frontier_bias_local_sums.resize(0, handle.get_stream()); - aggregate_local_frontier_bias_local_sums.shrink_to_fit(handle.get_stream()); - } else { - frontier_bias_sums = std::move(aggregate_local_frontier_bias_local_sums); - } - - rmm::device_uvector sample_random_numbers(local_frontier_sizes[minor_comm_rank] * K, - handle.get_stream()); - cugraph::detail::uniform_random_fill(handle.get_stream(), - sample_random_numbers.data(), - sample_random_numbers.size(), - bias_t{0.0}, - bias_t{1.0}, - rng_state); - thrust::transform( - handle.get_thrust_policy(), - sample_random_numbers.begin(), - sample_random_numbers.end(), - thrust::make_counting_iterator(size_t{0}), - sample_random_numbers.begin(), - [frontier_bias_sums = - raft::device_span(frontier_bias_sums.data(), frontier_bias_sums.size()), - K, - invalid_value = std::numeric_limits::infinity()] __device__(bias_t r, size_t i) { - // frontier_bias_sums[i / K] will be 0 if degree is 0 or all the edges have 0 bias - return frontier_bias_sums[i / K] > 0.0 ? r * frontier_bias_sums[i / K] : invalid_value; - }); - - rmm::device_uvector sample_local_random_numbers(0, handle.get_stream()); - std::tie(sample_local_random_numbers, key_indices, local_frontier_sample_offsets) = - shuffle_and_compute_local_nbr_values( - handle, - std::move(sample_random_numbers), - frontier_partitioned_bias_local_sum_displacements - ? 
std::make_optional>( - (*frontier_partitioned_bias_local_sum_displacements).data(), - (*frontier_partitioned_bias_local_sum_displacements).size()) - : std::nullopt, - K, - std::numeric_limits::infinity()); + rmm::device_uvector sample_local_random_numbers(0, handle.get_stream()); + std::tie(sample_local_random_numbers, key_indices, local_frontier_sample_offsets) = + shuffle_and_compute_local_nbr_values( + handle, + std::move(sample_random_numbers), + frontier_partitioned_bias_local_sum_displacements + ? std::make_optional>( + (*frontier_partitioned_bias_local_sum_displacements).data(), + (*frontier_partitioned_bias_local_sum_displacements).size()) + : std::nullopt, + K, + std::numeric_limits::infinity()); local_nbr_indices.resize(sample_local_random_numbers.size(), handle.get_stream()); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + for (size_t i = 0; i < num_local_edge_partitions; ++i) { thrust::tabulate( handle.get_thrust_policy(), local_nbr_indices.begin() + local_frontier_sample_offsets[i], @@ -1867,12 +1613,12 @@ biased_sample_and_compute_local_nbr_indices( (*key_indices).data() + local_frontier_sample_offsets[i], local_frontier_sample_offsets[i + 1] - local_frontier_sample_offsets[i]) : thrust::nullopt, - key_idx_to_unique_key_idx = - raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - local_frontier_displacements[i], - local_frontier_sizes[i]), + key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + + local_frontier_displacements[i], + local_frontier_sizes[i]), aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums = - raft::device_span( + raft::device_span( aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.data(), aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.size()), unique_key_local_degree_offsets = raft::device_span( @@ -1909,17 +1655,17 @@ 
biased_sample_and_compute_local_nbr_indices( { rmm::device_uvector aggregate_local_frontier_local_degrees( local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + for (size_t i = 0; i < num_local_edge_partitions; ++i) { thrust::tabulate( handle.get_thrust_policy(), aggregate_local_frontier_local_degrees.begin() + local_frontier_displacements[i], aggregate_local_frontier_local_degrees.begin() + local_frontier_displacements[i] + local_frontier_sizes[i], - [key_idx_to_unique_key_idx = - raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - local_frontier_displacements[i], - local_frontier_sizes[i]), - unique_key_local_degree_offsets = raft::device_span( + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + + local_frontier_displacements[i], + local_frontier_sizes[i]), + unique_key_local_degree_offsets = raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data() + local_frontier_unique_key_displacements[i], local_frontier_unique_key_sizes[i] + 1)] __device__(size_t i) { @@ -1984,16 +1730,16 @@ biased_sample_and_compute_local_nbr_indices( handle.get_stream()); rmm::device_scalar counter(0, handle.get_stream()); std::vector zero_bias_count_inclusive_sums(low_local_frontier_sizes.size()); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + for (size_t i = 0; i < num_local_edge_partitions; ++i) { thrust::for_each( handle.get_thrust_policy(), aggregate_low_local_frontier_indices.begin() + low_local_frontier_displacements[i], aggregate_low_local_frontier_indices.begin() + (low_local_frontier_displacements[i] + low_local_frontier_sizes[i]), - [key_idx_to_unique_key_idx = - raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - local_frontier_displacements[i], - local_frontier_sizes[i]), + 
[key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + + local_frontier_displacements[i], + local_frontier_sizes[i]), aggregate_local_frontier_unique_key_biases = raft::device_span(aggregate_local_frontier_unique_key_biases.data(), aggregate_local_frontier_unique_key_biases.size()), @@ -2053,7 +1799,7 @@ biased_sample_and_compute_local_nbr_indices( auto pair_first = thrust::make_zip_iterator(low_frontier_gathered_zero_bias_frontier_indices.begin(), low_frontier_gathered_zero_bias_nbr_indices.begin()); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + for (size_t i = 0; i < num_local_edge_partitions; ++i) { thrust::transform( handle.get_thrust_policy(), pair_first + rx_displacements[i], @@ -2157,7 +1903,7 @@ biased_sample_and_compute_local_nbr_indices( rmm::device_uvector aggregate_mid_local_frontier_local_degrees( aggregate_mid_local_frontier_indices.size(), handle.get_stream()); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + for (size_t i = 0; i < num_local_edge_partitions; ++i) { thrust::transform( handle.get_thrust_policy(), aggregate_mid_local_frontier_indices.begin() + mid_local_frontier_displacements[i], @@ -2166,7 +1912,7 @@ biased_sample_and_compute_local_nbr_indices( aggregate_mid_local_frontier_local_degrees.begin() + mid_local_frontier_displacements[i], cuda::proclaim_return_type( - [key_idx_to_unique_key_idx = raft::device_span( + [key_idx_to_unique_key_idx = raft::device_span( aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_displacements[i], local_frontier_sizes[i]), @@ -2199,19 +1945,19 @@ biased_sample_and_compute_local_nbr_indices( std::vector mid_local_frontier_degree_sum_lasts( mid_local_frontier_degree_sums.size()); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + for (size_t i = 0; i < num_local_edge_partitions; ++i) { thrust::for_each( handle.get_thrust_policy(), 
thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(mid_local_frontier_sizes[i]), - [key_idx_to_unique_key_idx = raft::device_span( + [key_idx_to_unique_key_idx = raft::device_span( aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_displacements[i], local_frontier_sizes[i]), aggregate_local_frontier_unique_key_biases = - raft::device_span(aggregate_local_frontier_unique_key_biases.data(), - aggregate_local_frontier_unique_key_biases.size()), - unique_key_local_degree_offsets = raft::device_span( + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), + unique_key_local_degree_offsets = raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data() + local_frontier_unique_key_displacements[i], local_frontier_unique_key_sizes[i] + 1), @@ -2221,7 +1967,7 @@ biased_sample_and_compute_local_nbr_indices( aggregate_mid_local_frontier_biases = raft::device_span(aggregate_mid_local_frontier_biases.data(), aggregate_mid_local_frontier_biases.size()), - aggregate_mid_local_frontier_local_degree_offsets = raft::device_span( + aggregate_mid_local_frontier_local_degree_offsets = raft::device_span( aggregate_mid_local_frontier_local_degree_offsets.data(), aggregate_mid_local_frontier_local_degree_offsets.size()), output_offset = mid_local_frontier_displacements[i]] __device__(size_t i) { @@ -2372,7 +2118,7 @@ biased_sample_and_compute_local_nbr_indices( handle.get_stream()); rmm::device_uvector aggregate_high_local_frontier_keys( aggregate_high_local_frontier_local_nbr_indices.size(), handle.get_stream()); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + for (size_t i = 0; i < num_local_edge_partitions; ++i) { rmm::device_uvector unique_key_indices_for_key_indices( high_local_frontier_sizes[i], handle.get_stream()); thrust::gather( @@ -2391,8 +2137,8 @@ biased_sample_and_compute_local_nbr_indices( 
aggregate_local_frontier_unique_key_local_degree_offsets.data() + local_frontier_unique_key_displacements[i], local_frontier_unique_key_sizes[i] + 1), - raft::device_span(aggregate_local_frontier_unique_key_biases.data(), - aggregate_local_frontier_unique_key_biases.size()), + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), std::nullopt, raft::device_span(aggregate_high_local_frontier_local_nbr_indices.data() + high_local_frontier_displacements[i] * K, @@ -2537,13 +2283,13 @@ biased_sample_and_compute_local_nbr_indices( handle.get_thrust_policy(), frontier_indices.begin(), frontier_indices.begin() + frontier_partition_offsets[1], - [key_idx_to_unique_key_idx = - raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), - aggregate_local_frontier_key_idx_to_unique_key_idx.size()), + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data(), + aggregate_local_frontier_key_idx_to_unique_key_idx.size()), aggregate_local_frontier_unique_key_biases = - raft::device_span(aggregate_local_frontier_unique_key_biases.data(), - aggregate_local_frontier_unique_key_biases.size()), - aggregate_local_frontier_unique_key_local_degree_offsets = raft::device_span( + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), + aggregate_local_frontier_unique_key_local_degree_offsets = raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data(), aggregate_local_frontier_unique_key_local_degree_offsets.size()), nbr_indices = raft::device_span(nbr_indices.data(), nbr_indices.size()), @@ -2598,7 +2344,7 @@ biased_sample_and_compute_local_nbr_indices( } std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = - shuffle_and_compute_local_nbr_values( + shuffle_and_compute_local_nbr_values( handle, std::move(nbr_indices), 
frontier_partitioned_local_degree_displacements @@ -2610,6 +2356,311 @@ biased_sample_and_compute_local_nbr_indices( cugraph::invalid_edge_id_v); } + return std::make_tuple( + std::move(local_nbr_indices), std::move(key_indices), std::move(local_frontier_sample_offsets)); +} + +// skip conversion if local neighbor index is cugraph::invalid_edge_id_v +template +rmm::device_uvector convert_to_unmasked_local_nbr_idx( + raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator aggregate_local_frontier_major_first, + rmm::device_uvector&& local_nbr_indices, + std::optional> key_indices, + std::vector const& local_frontier_sample_offsets, + std::vector const& local_frontier_displacements, + std::vector const& local_frontier_sizes, + size_t K) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + static_assert( + std::is_same_v::value_type>); + + auto edge_mask_view = graph_view.edge_mask_view(); + + auto [aggregate_local_frontier_unique_majors, + aggregate_local_frontier_major_idx_to_unique_major_idx, + local_frontier_unique_major_displacements, + local_frontier_unique_major_sizes] = + compute_unique_keys(handle, + aggregate_local_frontier_major_first, + local_frontier_displacements, + local_frontier_sizes); + + // to avoid searching the entire neighbor list K times for high degree vertices with edge masking + auto local_frontier_unique_major_valid_local_nbr_count_inclusive_sums = + compute_valid_local_nbr_count_inclusive_sums(handle, + graph_view, + aggregate_local_frontier_unique_majors.begin(), + local_frontier_unique_major_displacements, + local_frontier_unique_major_sizes); + + auto sample_major_idx_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [K, + key_indices = key_indices ? thrust::make_optional>( + (*key_indices).data(), (*key_indices).size()) + : thrust::nullopt] __device__(size_t i) { + return key_indices ? 
(*key_indices)[i] : i / K; + })); + auto pair_first = thrust::make_zip_iterator(local_nbr_indices.begin(), sample_major_idx_first); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; + + auto edge_partition_frontier_major_first = + aggregate_local_frontier_major_first + local_frontier_displacements[i]; + thrust::transform_if( + handle.get_thrust_policy(), + pair_first + local_frontier_sample_offsets[i], + pair_first + local_frontier_sample_offsets[i + 1], + local_nbr_indices.begin() + local_frontier_sample_offsets[i], + local_nbr_indices.begin() + local_frontier_sample_offsets[i], + find_nth_valid_nbr_idx_t{ + edge_partition, + edge_partition_e_mask, + edge_partition_frontier_major_first, + raft::device_span( + aggregate_local_frontier_major_idx_to_unique_major_idx.data() + + local_frontier_displacements[i], + local_frontier_sizes[i]), + thrust::make_tuple( + raft::device_span( + std::get<0>(local_frontier_unique_major_valid_local_nbr_count_inclusive_sums[i]).data(), + std::get<0>(local_frontier_unique_major_valid_local_nbr_count_inclusive_sums[i]) + .size()), + raft::device_span( + std::get<1>(local_frontier_unique_major_valid_local_nbr_count_inclusive_sums[i]).data(), + std::get<1>(local_frontier_unique_major_valid_local_nbr_count_inclusive_sums[i]) + .size()))}, + is_not_equal_t{cugraph::invalid_edge_id_v}); + } + + return std::move(local_nbr_indices); +} + +template +std::tuple, + std::optional>, + std::vector> +uniform_sample_and_compute_local_nbr_indices( + raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyIterator aggregate_local_frontier_key_first, + std::vector const& local_frontier_displacements, + std::vector const& local_frontier_sizes, + 
raft::random::RngState& rng_state, + size_t K, + bool with_replacement) +{ + using edge_t = typename GraphViewType::edge_type; + using vertex_t = typename GraphViewType::vertex_type; + using key_t = typename thrust::iterator_traits::value_type; + + int minor_comm_size{1}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + minor_comm_size = minor_comm.get_size(); + } + + auto aggregate_local_frontier_major_first = + thrust_tuple_get_or_identity(aggregate_local_frontier_key_first); + + auto edge_mask_view = graph_view.edge_mask_view(); + + // 1. compute degrees + + rmm::device_uvector frontier_degrees(0, handle.get_stream()); + std::optional> frontier_partitioned_local_degree_displacements{ + std::nullopt}; + { + auto aggregate_local_frontier_local_degrees = + compute_aggregate_local_frontier_local_degrees(handle, + graph_view, + aggregate_local_frontier_major_first, + local_frontier_displacements, + local_frontier_sizes); + + if (minor_comm_size > 1) { + std::tie(frontier_degrees, frontier_partitioned_local_degree_displacements) = + compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( + handle, + raft::device_span(aggregate_local_frontier_local_degrees.data(), + aggregate_local_frontier_local_degrees.size()), + local_frontier_displacements, + local_frontier_sizes); + aggregate_local_frontier_local_degrees.resize(0, handle.get_stream()); + aggregate_local_frontier_local_degrees.shrink_to_fit(handle.get_stream()); + } else { + frontier_degrees = std::move(aggregate_local_frontier_local_degrees); + } + } + + // 2. 
sample neighbor indices + + rmm::device_uvector nbr_indices(0, handle.get_stream()); + + if (with_replacement) { + if (frontier_degrees.size() > 0) { + nbr_indices.resize(frontier_degrees.size() * K, handle.get_stream()); + cugraph::legacy::ops::graph::get_sampling_index(nbr_indices.data(), + rng_state, + frontier_degrees.data(), + static_cast(frontier_degrees.size()), + static_cast(K), + with_replacement, + handle.get_stream()); + frontier_degrees.resize(0, handle.get_stream()); + frontier_degrees.shrink_to_fit(handle.get_stream()); + } + } else { + nbr_indices = compute_uniform_sampling_index_without_replacement( + handle, std::move(frontier_degrees), rng_state, K); + } + + // 3. shuffle neighbor indices + + auto [local_nbr_indices, key_indices, local_frontier_sample_offsets] = + shuffle_and_compute_local_nbr_values( + handle, + std::move(nbr_indices), + frontier_partitioned_local_degree_displacements + ? std::make_optional>( + (*frontier_partitioned_local_degree_displacements).data(), + (*frontier_partitioned_local_degree_displacements).size()) + : std::nullopt, + K, + cugraph::invalid_edge_id_v); + + // 4. convert neighbor indices in the neighbor list considering edge mask to neighbor indices in + // the neighbor list ignoring edge mask + + if (edge_mask_view) { + local_nbr_indices = convert_to_unmasked_local_nbr_idx( + handle, + graph_view, + aggregate_local_frontier_major_first, + std::move(local_nbr_indices), + key_indices ? 
std::make_optional>((*key_indices).data(), + (*key_indices).size()) + : std::nullopt, + local_frontier_sample_offsets, + local_frontier_displacements, + local_frontier_sizes, + K); + } + + return std::make_tuple( + std::move(local_nbr_indices), std::move(key_indices), std::move(local_frontier_sample_offsets)); +} + +template +std::tuple, + std::optional>, + std::vector> +biased_sample_and_compute_local_nbr_indices( + raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyIterator aggregate_local_frontier_key_first, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeBiasOp e_bias_op, + std::vector const& local_frontier_displacements, + std::vector const& local_frontier_sizes, + raft::random::RngState& rng_state, + size_t K, + bool with_replacement, + bool do_expensive_check /* check e_bias_op return values */) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename thrust::iterator_traits::value_type; + + using bias_t = typename edge_op_result_type::type; + + int minor_comm_rank{0}; + int minor_comm_size{1}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + minor_comm_rank = minor_comm.get_rank(); + minor_comm_size = minor_comm.get_size(); + } + assert(minor_comm_size == graph_view.number_of_local_edge_partitions()); + + auto aggregate_local_frontier_major_first = + thrust_tuple_get_or_identity(aggregate_local_frontier_key_first); + + auto edge_mask_view = graph_view.edge_mask_view(); + + // 1. 
compute biases for unique keys (to reduce memory footprint) + + auto [aggregate_local_frontier_unique_keys, + aggregate_local_frontier_key_idx_to_unique_key_idx, + local_frontier_unique_key_displacements, + local_frontier_unique_key_sizes] = compute_unique_keys(handle, + aggregate_local_frontier_key_first, + local_frontier_displacements, + local_frontier_sizes); + + auto [aggregate_local_frontier_unique_key_biases, + aggregate_local_frontier_unique_key_local_degree_offsets] = + compute_aggregate_local_frontier_biases( + handle, + graph_view, + get_dataframe_buffer_begin(aggregate_local_frontier_unique_keys), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_bias_op, + local_frontier_unique_key_displacements, + local_frontier_unique_key_sizes, + do_expensive_check); + + // 2. sample neighbor indices and shuffle neighbor indices + + auto [local_nbr_indices, key_indices, local_frontier_sample_offsets] = + biased_sample( + handle, + local_frontier_displacements, + local_frontier_sizes, + raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), + aggregate_local_frontier_key_idx_to_unique_key_idx.size()), + raft::host_span(local_frontier_unique_key_displacements.data(), + local_frontier_unique_key_displacements.size()), + raft::host_span(local_frontier_unique_key_sizes.data(), + local_frontier_unique_key_sizes.size()), + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), + raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + rng_state, + K, + with_replacement); + // 3. 
convert neighbor indices in the neighbor list considering edge mask to neighbor indices in // the neighbor list ignoring edge mask From 658f4ac2c62a819b928ce5a6b298cbca2ea411b9 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 11 Dec 2024 17:04:40 -0800 Subject: [PATCH 03/21] fix compiler warning --- cpp/src/sampling/neighbor_sampling_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/sampling/neighbor_sampling_impl.hpp b/cpp/src/sampling/neighbor_sampling_impl.hpp index ed77b330439..b3204d54a59 100644 --- a/cpp/src/sampling/neighbor_sampling_impl.hpp +++ b/cpp/src/sampling/neighbor_sampling_impl.hpp @@ -184,7 +184,7 @@ neighbor_sample_impl(raft::handle_t const& handle, std::vector level_sizes{}; - for (auto hop = 0; hop < num_hops; hop++) { + for (size_t hop = 0; hop < num_hops; ++hop) { rmm::device_uvector level_result_src(0, handle.get_stream()); rmm::device_uvector level_result_dst(0, handle.get_stream()); From 73a0533bc20a236e43fe22bbf271e16c8f99177e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 14 Dec 2024 22:05:28 -0800 Subject: [PATCH 04/21] step 1 in replace std::vector with raft::host_span in prims --- .../sample_and_compute_local_nbr_indices.cuh | 63 ++++++++++--------- ...r_v_random_select_transform_outgoing_e.cuh | 10 +-- 2 files changed, 41 insertions(+), 32 deletions(-) diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index 6dd4affb9b1..79eab41ab22 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -336,8 +336,8 @@ std::tuple std::vector> compute_unique_keys(raft::handle_t const& handle, KeyIterator aggregate_local_frontier_key_first, - std::vector const& local_frontier_displacements, - std::vector const& local_frontier_sizes) + raft::host_span local_frontier_displacements, + raft::host_span local_frontier_sizes) { using 
key_t = typename thrust::iterator_traits::value_type; @@ -411,8 +411,8 @@ std::tuple, rmm::device_uvector> compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( raft::handle_t const& handle, raft::device_span aggregate_local_frontier_local_value_sums, - std::vector const& local_frontier_displacements, - std::vector const& local_frontier_sizes) + raft::host_span local_frontier_displacements, + raft::host_span local_frontier_sizes) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto minor_comm_rank = minor_comm.get_rank(); @@ -453,8 +453,8 @@ compute_valid_local_nbr_count_inclusive_sums( raft::handle_t const& handle, GraphViewType const& graph_view, VertexIterator aggregate_local_frontier_major_first, - std::vector const& local_frontier_displacements, - std::vector const& local_frontier_sizes) + raft::host_span local_frontier_displacements, + raft::host_span local_frontier_sizes) { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; @@ -1237,8 +1237,8 @@ compute_aggregate_local_frontier_local_degrees( raft::handle_t const& handle, GraphViewType const& graph_view, VertexIterator aggregate_local_frontier_major_first, - std::vector const& local_frontier_displacements, - std::vector const& local_frontier_sizes) + raft::host_span local_frontier_displacements, + raft::host_span local_frontier_sizes) { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; @@ -1307,8 +1307,8 @@ compute_aggregate_local_frontier_biases(raft::handle_t const& handle, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, EdgeBiasOp e_bias_op, - std::vector const& local_frontier_displacements, - std::vector const& local_frontier_sizes, + raft::host_span local_frontier_displacements, + raft::host_span local_frontier_sizes, bool do_expensive_check) { using vertex_t = typename GraphViewType::vertex_type; @@ 
-1466,8 +1466,8 @@ std::tuple /* local_nbr_indices */, std::vector /* local_frontier_sample_offsets */> biased_sample( raft::handle_t const& handle, - std::vector const& local_frontier_displacements, - std::vector const& local_frontier_sizes, + raft::host_span local_frontier_displacements, + raft::host_span local_frontier_sizes, raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, raft::host_span local_frontier_unique_key_displacements, raft::host_span local_frontier_unique_key_sizes, @@ -2368,9 +2368,9 @@ rmm::device_uvector convert_to_unmasked_local VertexIterator aggregate_local_frontier_major_first, rmm::device_uvector&& local_nbr_indices, std::optional> key_indices, - std::vector const& local_frontier_sample_offsets, - std::vector const& local_frontier_displacements, - std::vector const& local_frontier_sizes, + raft::host_span local_frontier_sample_offsets, + raft::host_span local_frontier_displacements, + raft::host_span local_frontier_sizes, size_t K) { using vertex_t = typename GraphViewType::vertex_type; @@ -2391,11 +2391,14 @@ rmm::device_uvector convert_to_unmasked_local // to avoid searching the entire neighbor list K times for high degree vertices with edge masking auto local_frontier_unique_major_valid_local_nbr_count_inclusive_sums = - compute_valid_local_nbr_count_inclusive_sums(handle, - graph_view, - aggregate_local_frontier_unique_majors.begin(), - local_frontier_unique_major_displacements, - local_frontier_unique_major_sizes); + compute_valid_local_nbr_count_inclusive_sums( + handle, + graph_view, + aggregate_local_frontier_unique_majors.begin(), + raft::host_span(local_frontier_unique_major_displacements.data(), + local_frontier_unique_major_displacements.size()), + raft::host_span(local_frontier_unique_major_sizes.data(), + local_frontier_unique_major_sizes.size())); auto sample_major_idx_first = thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), @@ -2459,8 +2462,8 @@ 
uniform_sample_and_compute_local_nbr_indices( raft::handle_t const& handle, GraphViewType const& graph_view, KeyIterator aggregate_local_frontier_key_first, - std::vector const& local_frontier_displacements, - std::vector const& local_frontier_sizes, + raft::host_span local_frontier_displacements, + raft::host_span local_frontier_sizes, raft::random::RngState& rng_state, size_t K, bool with_replacement) @@ -2556,7 +2559,8 @@ uniform_sample_and_compute_local_nbr_indices( key_indices ? std::make_optional>((*key_indices).data(), (*key_indices).size()) : std::nullopt, - local_frontier_sample_offsets, + raft::host_span(local_frontier_sample_offsets.data(), + local_frontier_sample_offsets.size()), local_frontier_displacements, local_frontier_sizes, K); @@ -2583,8 +2587,8 @@ biased_sample_and_compute_local_nbr_indices( EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, EdgeBiasOp e_bias_op, - std::vector const& local_frontier_displacements, - std::vector const& local_frontier_sizes, + raft::host_span local_frontier_displacements, + raft::host_span local_frontier_sizes, raft::random::RngState& rng_state, size_t K, bool with_replacement, @@ -2635,8 +2639,10 @@ biased_sample_and_compute_local_nbr_indices( edge_dst_value_input, edge_value_input, e_bias_op, - local_frontier_unique_key_displacements, - local_frontier_unique_key_sizes, + raft::host_span(local_frontier_unique_key_displacements.data(), + local_frontier_unique_key_displacements.size()), + raft::host_span(local_frontier_unique_key_sizes.data(), + local_frontier_unique_key_sizes.size()), do_expensive_check); // 2. sample neighbor indices and shuffle neighbor indices @@ -2673,7 +2679,8 @@ biased_sample_and_compute_local_nbr_indices( key_indices ? 
std::make_optional>((*key_indices).data(), (*key_indices).size()) : std::nullopt, - local_frontier_sample_offsets, + raft::host_span(local_frontier_sample_offsets.data(), + local_frontier_sample_offsets.size()), local_frontier_displacements, local_frontier_sizes, K); diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index e3d7f7fb98a..b15afd39807 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -351,8 +351,9 @@ per_v_random_select_transform_e(raft::handle_t const& handle, graph_view, (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_key_list) : key_list.begin(), - local_key_list_displacements, - local_key_list_sizes, + raft::host_span(local_key_list_displacements.data(), + local_key_list_displacements.size()), + raft::host_span(local_key_list_sizes.data(), local_key_list_sizes.size()), rng_state, K, with_replacement); @@ -367,8 +368,9 @@ per_v_random_select_transform_e(raft::handle_t const& handle, edge_bias_dst_value_input, edge_bias_value_input, e_bias_op, - local_key_list_displacements, - local_key_list_sizes, + raft::host_span(local_key_list_displacements.data(), + local_key_list_displacements.size()), + raft::host_span(local_key_list_sizes.data(), local_key_list_sizes.size()), rng_state, K, with_replacement, From 92d18f58d80f4b8273c997af939f28bdfe67eed0 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 6 Jan 2025 16:47:47 -0800 Subject: [PATCH 05/21] step 2 in replacing std::vector with raft::host_span in prims --- .../sample_and_compute_local_nbr_indices.cuh | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index 79eab41ab22..ed828acf592 100644 --- 
a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -422,7 +422,11 @@ compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( std::tie(frontier_gathered_local_value_sums, std::ignore) = shuffle_values(minor_comm, aggregate_local_frontier_local_value_sums.begin(), +#if 1 // FIXME: better update shuffle_values to take host_span + std::vector(local_frontier_sizes.begin(), local_frontier_sizes.end()), +#else local_frontier_sizes, +#endif handle.get_stream()); rmm::device_uvector frontier_value_sums(local_frontier_sizes[minor_comm_rank], @@ -1323,15 +1327,22 @@ compute_aggregate_local_frontier_biases(raft::handle_t const& handle, EdgeBiasOp>::type; auto [aggregate_local_frontier_biases, aggregate_local_frontier_local_degree_offsets] = - transform_v_frontier_e(handle, - graph_view, - aggregate_local_frontier_key_first, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_bias_op, - local_frontier_displacements, - local_frontier_sizes); + transform_v_frontier_e( + handle, + graph_view, + aggregate_local_frontier_key_first, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_bias_op, +#if 1 // FIXME: better update shuffle_values to take host_span + std::vector(local_frontier_displacements.begin(), local_frontier_displacements.end()), + std::vector(local_frontier_sizes.begin(), local_frontier_sizes.end()) +#else + local_frontier_displacements, + local_frontier_sizes +#endif + ); if (do_expensive_check) { auto num_invalid_biases = thrust::count_if( From cd22d259b61a7bbb85e68263d0858a1922ea3ade Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 6 Jan 2025 22:57:42 -0800 Subject: [PATCH 06/21] fix misnomers --- .../sample_and_compute_local_nbr_indices.cuh | 27 ++++---- ...r_v_random_select_transform_outgoing_e.cuh | 66 +++++++++---------- 2 files changed, 45 insertions(+), 48 deletions(-) diff --git 
a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index ed828acf592..a1a6c212fe9 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -1295,14 +1295,14 @@ template + typename BiasEdgeOp> std::tuple::value_type, typename GraphViewType::vertex_type, typename EdgeSrcValueInputWrapper::value_type, typename EdgeDstValueInputWrapper::value_type, typename EdgeValueInputWrapper::value_type, - EdgeBiasOp>::type>, + BiasEdgeOp>::type>, rmm::device_uvector> compute_aggregate_local_frontier_biases(raft::handle_t const& handle, GraphViewType const& graph_view, @@ -1310,7 +1310,7 @@ compute_aggregate_local_frontier_biases(raft::handle_t const& handle, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, - EdgeBiasOp e_bias_op, + BiasEdgeOp bias_e_op, raft::host_span local_frontier_displacements, raft::host_span local_frontier_sizes, bool do_expensive_check) @@ -1324,7 +1324,7 @@ compute_aggregate_local_frontier_biases(raft::handle_t const& handle, typename EdgeSrcValueInputWrapper::value_type, typename EdgeDstValueInputWrapper::value_type, typename EdgeValueInputWrapper::value_type, - EdgeBiasOp>::type; + BiasEdgeOp>::type; auto [aggregate_local_frontier_biases, aggregate_local_frontier_local_degree_offsets] = transform_v_frontier_e( @@ -1334,7 +1334,7 @@ compute_aggregate_local_frontier_biases(raft::handle_t const& handle, edge_src_value_input, edge_dst_value_input, edge_value_input, - e_bias_op, + bias_e_op, #if 1 // FIXME: better update shuffle_values to take host_span std::vector(local_frontier_displacements.begin(), local_frontier_displacements.end()), std::vector(local_frontier_sizes.begin(), local_frontier_sizes.end()) @@ -1355,7 +1355,7 @@ compute_aggregate_local_frontier_biases(raft::handle_t const& handle, 
handle.get_comms(), num_invalid_biases, raft::comms::op_t::SUM, handle.get_stream()); } CUGRAPH_EXPECTS(num_invalid_biases == 0, - "invalid_input_argument: e_bias_op return values should be non-negative and " + "invalid_input_argument: bias_e_op return values should be non-negative and " "should not exceed std::numeirc_limits::max()."); } @@ -2586,7 +2586,7 @@ template + typename BiasEdgeOp> std::tuple, std::optional>, std::vector> @@ -2597,13 +2597,13 @@ biased_sample_and_compute_local_nbr_indices( EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, - EdgeBiasOp e_bias_op, + BiasEdgeOp bias_e_op, raft::host_span local_frontier_displacements, raft::host_span local_frontier_sizes, raft::random::RngState& rng_state, size_t K, bool with_replacement, - bool do_expensive_check /* check e_bias_op return values */) + bool do_expensive_check /* check bias_e_op return values */) { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; @@ -2614,7 +2614,7 @@ biased_sample_and_compute_local_nbr_indices( typename EdgeSrcValueInputWrapper::value_type, typename EdgeDstValueInputWrapper::value_type, typename EdgeValueInputWrapper::value_type, - EdgeBiasOp>::type; + BiasEdgeOp>::type; int minor_comm_rank{0}; int minor_comm_size{1}; @@ -2625,9 +2625,6 @@ biased_sample_and_compute_local_nbr_indices( } assert(minor_comm_size == graph_view.number_of_local_edge_partitions()); - auto aggregate_local_frontier_major_first = - thrust_tuple_get_or_identity(aggregate_local_frontier_key_first); - auto edge_mask_view = graph_view.edge_mask_view(); // 1. 
compute biases for unique keys (to reduce memory footprint) @@ -2649,7 +2646,7 @@ biased_sample_and_compute_local_nbr_indices( edge_src_value_input, edge_dst_value_input, edge_value_input, - e_bias_op, + bias_e_op, raft::host_span(local_frontier_unique_key_displacements.data(), local_frontier_unique_key_displacements.size()), raft::host_span(local_frontier_unique_key_sizes.data(), @@ -2685,7 +2682,7 @@ biased_sample_and_compute_local_nbr_indices( local_nbr_indices = convert_to_unmasked_local_nbr_idx( handle, graph_view, - aggregate_local_frontier_major_first, + thrust_tuple_get_or_identity(aggregate_local_frontier_key_first), std::move(local_nbr_indices), key_indices ? std::make_optional>((*key_indices).data(), (*key_indices).size()) diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index b15afd39807..4b512577000 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -58,7 +58,7 @@ template -struct constant_e_bias_op_t { +struct constant_bias_e_op_t { __device__ float operator()(key_t, typename GraphViewType::vertex_type, typename EdgeSrcValueInputWrapper::value_type, @@ -207,10 +207,10 @@ struct return_value_compute_offset_t { template >, dataframe_buffer_type_t sample_local_nbr_indices(0, handle.get_stream()); std::optional> sample_key_indices{std::nullopt}; std::vector local_key_list_sample_offsets{}; - if constexpr (std::is_same_v>) { std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = uniform_sample_and_compute_local_nbr_indices( @@ -364,10 +364,10 @@ per_v_random_select_transform_e(raft::handle_t const& handle, graph_view, (minor_comm_size > 1) ? 
get_dataframe_buffer_cbegin(*aggregate_local_key_list) : key_list.begin(), - edge_bias_src_value_input, - edge_bias_dst_value_input, - edge_bias_value_input, - e_bias_op, + bias_edge_src_value_input, + bias_edge_dst_value_input, + bias_edge_value_input, + bias_e_op, raft::host_span(local_key_list_displacements.data(), local_key_list_displacements.size()), raft::host_span(local_key_list_sizes.data(), local_key_list_sizes.size()), @@ -674,7 +674,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, edge_src_dummy_property_t{}.view(), edge_dst_dummy_property_t{}.view(), edge_dummy_property_t{}.view(), - detail::constant_e_bias_op_t>, dataframe_buffer_type_t(handle, graph_view, key_list, - edge_bias_src_value_input, - edge_bias_dst_value_input, - edge_bias_value_input, - e_bias_op, + bias_edge_src_value_input, + bias_edge_dst_value_input, + bias_edge_value_input, + bias_e_op, edge_src_value_input, edge_dst_value_input, edge_value_input, From 064b90d91a5077e5250c09a3b460fbcaa8899677 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 9 Jan 2025 12:36:48 -0800 Subject: [PATCH 07/21] add public primitive functions for heterogeneous sampling --- ...r_v_random_select_transform_outgoing_e.cuh | 210 ++++++++++++++---- 1 file changed, 166 insertions(+), 44 deletions(-) diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 4b512577000..dfa45cb83cc 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -215,6 +215,7 @@ template std::tuple>, dataframe_buffer_type_t> per_v_random_select_transform_e(raft::handle_t const& handle, @@ -228,8 +229,9 @@ per_v_random_select_transform_e(raft::handle_t const& handle, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, EdgeOp e_op, + EdgeTypeInputWrapper edge_type_input, raft::random::RngState& rng_state, - size_t 
K, + raft::host_span Ks, bool with_replacement, std::optional invalid_value, bool do_expensive_check) @@ -271,11 +273,18 @@ per_v_random_select_transform_e(raft::handle_t const& handle, EdgeOp>::type, T>); - CUGRAPH_EXPECTS(K >= size_t{1}, - "Invalid input argument: invalid K, K should be a positive integer."); - CUGRAPH_EXPECTS(K <= static_cast(std::numeric_limits::max()), - "Invalid input argument: the current implementation expects K to be no larger " - "than std::numeric_limits::max()."); + if constexpr (std::is_same_v) { // homogeneous + CUGRAPH_EXPECTS(Ks.size() == 1, + "Invalid input argument: Ks.size() should be 1 for homogeneous sampling."); + } + + for (size_t i = 0; i < Ks.size(); ++i) { + CUGRAPH_EXPECTS(Ks[i] >= size_t{1}, + "Invalid input argument: invalid Ks, Ks[] should be a positive integer."); + CUGRAPH_EXPECTS(Ks[i] <= static_cast(std::numeric_limits::max()), + "Invalid input argument: the current implementation expects Ks[] to be no " + "larger than std::numeric_limits::max()."); + } auto minor_comm_size = GraphViewType::is_multi_gpu @@ -345,36 +354,46 @@ per_v_random_select_transform_e(raft::handle_t const& handle, BiasEdgeDstValueInputWrapper, BiasEdgeValueInputWrapper, key_t>>) { - std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = - uniform_sample_and_compute_local_nbr_indices( - handle, - graph_view, - (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_key_list) - : key_list.begin(), - raft::host_span(local_key_list_displacements.data(), - local_key_list_displacements.size()), - raft::host_span(local_key_list_sizes.data(), local_key_list_sizes.size()), - rng_state, - K, - with_replacement); + if constexpr (std::is_same_v) { // homogeneous + std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = + uniform_sample_and_compute_local_nbr_indices( + handle, + graph_view, + (minor_comm_size > 1) ? 
get_dataframe_buffer_cbegin(*aggregate_local_key_list) + : key_list.begin(), + raft::host_span(local_key_list_displacements.data(), + local_key_list_displacements.size()), + raft::host_span(local_key_list_sizes.data(), local_key_list_sizes.size()), + rng_state, + Ks[0], + with_replacement); + } else { // heterogeneous + CUGRAPH_FAIL("unimplemented."); + } } else { - std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = - biased_sample_and_compute_local_nbr_indices( - handle, - graph_view, - (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_key_list) - : key_list.begin(), - bias_edge_src_value_input, - bias_edge_dst_value_input, - bias_edge_value_input, - bias_e_op, - raft::host_span(local_key_list_displacements.data(), - local_key_list_displacements.size()), - raft::host_span(local_key_list_sizes.data(), local_key_list_sizes.size()), - rng_state, - K, - with_replacement, - do_expensive_check); + if constexpr (std::is_same_v) { // homogeneous + std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = + biased_sample_and_compute_local_nbr_indices( + handle, + graph_view, + (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_key_list) + : key_list.begin(), + bias_edge_src_value_input, + bias_edge_dst_value_input, + bias_edge_value_input, + bias_e_op, + raft::host_span(local_key_list_displacements.data(), + local_key_list_displacements.size()), + raft::host_span(local_key_list_sizes.data(), local_key_list_sizes.size()), + rng_state, + Ks[0], + with_replacement, + do_expensive_check); + } else { // heterogeneous + CUGRAPH_FAIL("unimplemented."); + } } std::vector local_key_list_sample_counts(minor_comm_size); @@ -384,6 +403,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle, // 3. 
transform + auto K_sum = std::accumulate(Ks.begin(), Ks.end(), size_t{0}); + auto sample_e_op_results = allocate_dataframe_buffer(local_key_list_sample_offsets.back(), handle.get_stream()); for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { @@ -440,12 +461,12 @@ per_v_random_select_transform_e(raft::handle_t const& handle, e_op, cugraph::invalid_edge_id_v, to_thrust_optional(invalid_value), - K}); + K_sum}); } else { thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(key_list.size() * K), + thrust::make_counting_iterator(key_list.size() * K_sum), edge_partition_sample_e_op_result_first, transform_local_nbr_indices_t, to_thrust_optional(invalid_value), - K}); + K_sum}); } } aggregate_local_key_list = std::nullopt; @@ -474,7 +495,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, auto sample_offsets = invalid_value ? std::nullopt : std::make_optional>( key_list.size() + 1, handle.get_stream()); - assert(K <= std::numeric_limits::max()); + assert(K_sum <= std::numeric_limits::max()); if (minor_comm_size > 1) { sample_local_nbr_indices.resize(0, handle.get_stream()); sample_local_nbr_indices.shrink_to_fit(handle.get_stream()); @@ -505,7 +526,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle, sample_counts.resize(0, handle.get_stream()); sample_counts.shrink_to_fit(handle.get_stream()); - resize_dataframe_buffer(tmp_sample_e_op_results, key_list.size() * K, handle.get_stream()); + resize_dataframe_buffer( + tmp_sample_e_op_results, key_list.size() * K_sum, handle.get_stream()); thrust::fill(handle.get_thrust_policy(), get_dataframe_buffer_begin(tmp_sample_e_op_results), get_dataframe_buffer_end(tmp_sample_e_op_results), @@ -521,7 +543,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, (*sample_key_indices).size()), raft::device_span(sample_intra_partition_displacements.data(), sample_intra_partition_displacements.size()), - 
K}), + K_sum}), get_dataframe_buffer_begin(tmp_sample_e_op_results)); } else { (*sample_offsets).set_element_to_zero_async(size_t{0}, handle.get_stream()); @@ -561,7 +583,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, sample_counts.end(), count_valids_t{raft::device_span(sample_local_nbr_indices.data(), sample_local_nbr_indices.size()), - K, + K_sum, cugraph::invalid_edge_id_v}); (*sample_offsets).set_element_to_zero_async(size_t{0}, handle.get_stream()); auto typecasted_sample_count_first = @@ -683,8 +705,56 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, edge_dst_value_input, edge_value_input, e_op, + edge_dummy_property_view_t{}, rng_state, - K, + raft::host_span(&K, size_t{1}), + with_replacement, + invalid_value, + do_expensive_check); +} + +template +std::tuple>, dataframe_buffer_type_t> +per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + EdgeTypeInputWrapper edge_type_input, + raft::random::RngState& rng_state, + raft::host_span Ks, + bool with_replacement, + std::optional invalid_value, + bool do_expensive_check = false) +{ + return detail::per_v_random_select_transform_e( + handle, + graph_view, + key_list, + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + detail::constant_bias_e_op_t{}, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + edge_type_input, + rng_state, + Ks, with_replacement, invalid_value, do_expensive_check); @@ -775,6 +845,57 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, bool with_replacement, std::optional invalid_value, bool do_expensive_check = false) +{ + return detail::per_v_random_select_transform_e( + handle, + graph_view, + 
key_list, + bias_edge_src_value_input, + bias_edge_dst_value_input, + bias_edge_value_input, + bias_e_op, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + edge_dummy_property_view_t{}, + rng_state, + raft::host_span(&K, size_t{1}), + with_replacement, + invalid_value, + do_expensive_check); +} + +template +std::tuple>, dataframe_buffer_type_t> +per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + BiasEdgeSrcValueInputWrapper bias_edge_src_value_input, + BiasEdgeDstValueInputWrapper bias_edge_dst_value_input, + BiasEdgeValueInputWrapper bias_edge_value_input, + BiasEdgeOp bias_e_op, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + EdgeTypeInputWrapper edge_type_input, + raft::random::RngState& rng_state, + raft::host_span Ks, + bool with_replacement, + std::optional invalid_value, + bool do_expensive_check = false) { return detail::per_v_random_select_transform_e(handle, graph_view, @@ -787,8 +908,9 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, edge_dst_value_input, edge_value_input, e_op, + edge_type_input, rng_state, - K, + Ks, with_replacement, invalid_value, do_expensive_check); From 3d3852d7ec88d1c00be10b29f95721eabde4e545 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 13 Jan 2025 00:10:17 -0800 Subject: [PATCH 08/21] minor code refactoring --- .../sample_and_compute_local_nbr_indices.cuh | 1642 +++++++++-------- 1 file changed, 848 insertions(+), 794 deletions(-) diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index a1a6c212fe9..570b5d8fded 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -1475,7 +1475,7 @@ template 
std::tuple /* local_nbr_indices */, std::optional> /* key_indices */, std::vector /* local_frontier_sample_offsets */> -biased_sample( +homogeneous_biased_sample_with_replacement( raft::handle_t const& handle, raft::host_span local_frontier_displacements, raft::host_span local_frontier_sizes, @@ -1485,8 +1485,7 @@ biased_sample( raft::device_span aggregate_local_frontier_unique_key_biases, raft::device_span aggregate_local_frontier_unique_key_local_degree_offsets, raft::random::RngState& rng_state, - size_t K, - bool with_replacement) + size_t K) { int minor_comm_rank{0}; int minor_comm_size{1}; @@ -1501,37 +1500,216 @@ biased_sample( rmm::device_uvector local_nbr_indices(0, handle.get_stream()); std::optional> key_indices{std::nullopt}; std::vector local_frontier_sample_offsets{}; - if (with_replacement) { - // compute segmented inclusive sums (one segment per seed) - auto unique_key_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - cuda::proclaim_return_type( - [offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data(), - aggregate_local_frontier_unique_key_local_degree_offsets.size())] __device__(size_t i) { - return static_cast(thrust::distance( - offsets.begin() + 1, - thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i))); - })); - rmm::device_uvector - aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums( - aggregate_local_frontier_unique_key_biases.size(), handle.get_stream()); - thrust::inclusive_scan_by_key( + + // compute segmented inclusive sums (one segment per seed) + + auto unique_key_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [offsets = raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size())] __device__(size_t i) { + return static_cast(thrust::distance( + 
offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i))); + })); + rmm::device_uvector + aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums( + aggregate_local_frontier_unique_key_biases.size(), handle.get_stream()); + thrust::inclusive_scan_by_key( + handle.get_thrust_policy(), + unique_key_first, + unique_key_first + aggregate_local_frontier_unique_key_biases.size(), + aggregate_local_frontier_unique_key_biases.begin(), + aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.begin()); + + // sum collect local bias values (one value per seed) and collect local bias sums + + auto aggregate_local_frontier_bias_local_sums = rmm::device_uvector( + local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); + for (size_t i = 0; i < num_local_edge_partitions; ++i) { + thrust::tabulate( handle.get_thrust_policy(), - unique_key_first, - unique_key_first + aggregate_local_frontier_unique_key_biases.size(), - aggregate_local_frontier_unique_key_biases.begin(), - aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.begin()); + get_dataframe_buffer_begin(aggregate_local_frontier_bias_local_sums) + + local_frontier_displacements[i], + get_dataframe_buffer_begin(aggregate_local_frontier_bias_local_sums) + + local_frontier_displacements[i] + local_frontier_sizes[i], + [key_idx_to_unique_key_idx = + raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data() + + local_frontier_displacements[i], + local_frontier_sizes[i]), + unique_key_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data() + + local_frontier_unique_key_displacements[i], + local_frontier_unique_key_sizes[i] + 1), + aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums = + raft::device_span( + aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.data(), + 
aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums + .size())] __device__(size_t i) { + auto unique_key_idx = key_idx_to_unique_key_idx[i]; + auto degree = unique_key_local_degree_offsets[unique_key_idx + 1] - + unique_key_local_degree_offsets[unique_key_idx]; + if (degree > 0) { + return aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums + [unique_key_local_degree_offsets[unique_key_idx] + degree - 1]; + } else { + return bias_t{0.0}; + } + }); + } + + rmm::device_uvector frontier_bias_sums(0, handle.get_stream()); + std::optional> frontier_partitioned_bias_local_sum_displacements{ + std::nullopt}; + if (minor_comm_size > 1) { + std::tie(frontier_bias_sums, frontier_partitioned_bias_local_sum_displacements) = + compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( + handle, + raft::device_span(aggregate_local_frontier_bias_local_sums.data(), + aggregate_local_frontier_bias_local_sums.size()), + local_frontier_displacements, + local_frontier_sizes); + aggregate_local_frontier_bias_local_sums.resize(0, handle.get_stream()); + aggregate_local_frontier_bias_local_sums.shrink_to_fit(handle.get_stream()); + } else { + frontier_bias_sums = std::move(aggregate_local_frontier_bias_local_sums); + } + + // sample & compute local neighbor indices - auto aggregate_local_frontier_bias_local_sums = rmm::device_uvector( + rmm::device_uvector sample_random_numbers(local_frontier_sizes[minor_comm_rank] * K, + handle.get_stream()); + cugraph::detail::uniform_random_fill(handle.get_stream(), + sample_random_numbers.data(), + sample_random_numbers.size(), + bias_t{0.0}, + bias_t{1.0}, + rng_state); + thrust::transform( + handle.get_thrust_policy(), + sample_random_numbers.begin(), + sample_random_numbers.end(), + thrust::make_counting_iterator(size_t{0}), + sample_random_numbers.begin(), + [frontier_bias_sums = + raft::device_span(frontier_bias_sums.data(), frontier_bias_sums.size()), + K, + invalid_value = 
std::numeric_limits::infinity()] __device__(bias_t r, size_t i) { + // frontier_bias_sums[i / K] will be 0 if degree is 0 or all the edges have 0 bias + return frontier_bias_sums[i / K] > 0.0 ? r * frontier_bias_sums[i / K] : invalid_value; + }); + + rmm::device_uvector sample_local_random_numbers(0, handle.get_stream()); + std::tie(sample_local_random_numbers, key_indices, local_frontier_sample_offsets) = + shuffle_and_compute_local_nbr_values( + handle, + std::move(sample_random_numbers), + frontier_partitioned_bias_local_sum_displacements + ? std::make_optional>( + (*frontier_partitioned_bias_local_sum_displacements).data(), + (*frontier_partitioned_bias_local_sum_displacements).size()) + : std::nullopt, + K, + std::numeric_limits::infinity()); + + local_nbr_indices.resize(sample_local_random_numbers.size(), handle.get_stream()); + for (size_t i = 0; i < num_local_edge_partitions; ++i) { + thrust::tabulate( + handle.get_thrust_policy(), + local_nbr_indices.begin() + local_frontier_sample_offsets[i], + local_nbr_indices.begin() + local_frontier_sample_offsets[i + 1], + [K, + sample_local_random_numbers = raft::device_span( + sample_local_random_numbers.data() + local_frontier_sample_offsets[i], + local_frontier_sample_offsets[i + 1] - local_frontier_sample_offsets[i]), + key_indices = key_indices + ? 
thrust::make_optional>( + (*key_indices).data() + local_frontier_sample_offsets[i], + local_frontier_sample_offsets[i + 1] - local_frontier_sample_offsets[i]) + : thrust::nullopt, + key_idx_to_unique_key_idx = + raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data() + + local_frontier_displacements[i], + local_frontier_sizes[i]), + aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums = + raft::device_span( + aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.data(), + aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.size()), + unique_key_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data() + + local_frontier_unique_key_displacements[i], + local_frontier_unique_key_sizes[i] + 1), + invalid_random_number = std::numeric_limits::infinity(), + invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { + auto key_idx = key_indices ? (*key_indices)[i] : (i / K); + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + auto local_random_number = sample_local_random_numbers[i]; + if (local_random_number != invalid_random_number) { + auto local_degree = + static_cast(unique_key_local_degree_offsets[unique_key_idx + 1] - + unique_key_local_degree_offsets[unique_key_idx]); + auto inclusive_sum_first = + aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.begin() + + unique_key_local_degree_offsets[unique_key_idx]; + auto inclusive_sum_last = inclusive_sum_first + local_degree; + auto local_nbr_idx = static_cast(thrust::distance( + inclusive_sum_first, + thrust::upper_bound( + thrust::seq, inclusive_sum_first, inclusive_sum_last, local_random_number))); + return cuda::std::min(local_nbr_idx, local_degree - 1); + } else { + return invalid_idx; + } + }); + } + + return std::make_tuple( + std::move(local_nbr_indices), std::move(key_indices), std::move(local_frontier_sample_offsets)); +} + +template +std::tuple /* 
local_nbr_indices */, + std::optional> /* key_indices */, + std::vector /* local_frontier_sample_offsets */> +homogeneous_biased_sample_without_replacement( + raft::handle_t const& handle, + raft::host_span local_frontier_displacements, + raft::host_span local_frontier_sizes, + raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, + raft::host_span local_frontier_unique_key_displacements, + raft::host_span local_frontier_unique_key_sizes, + raft::device_span aggregate_local_frontier_unique_key_biases, + raft::device_span aggregate_local_frontier_unique_key_local_degree_offsets, + raft::random::RngState& rng_state, + size_t K) +{ + int minor_comm_rank{0}; + int minor_comm_size{1}; + if constexpr (multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + minor_comm_rank = minor_comm.get_rank(); + minor_comm_size = minor_comm.get_size(); + } + + auto num_local_edge_partitions = local_frontier_unique_key_displacements.size(); + + rmm::device_uvector local_nbr_indices(0, handle.get_stream()); + std::optional> key_indices{std::nullopt}; + std::vector local_frontier_sample_offsets{}; + + rmm::device_uvector frontier_degrees(0, handle.get_stream()); + std::optional> frontier_partitioned_local_degree_displacements{ + std::nullopt}; + { + rmm::device_uvector aggregate_local_frontier_local_degrees( local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); for (size_t i = 0; i < num_local_edge_partitions; ++i) { thrust::tabulate( handle.get_thrust_policy(), - get_dataframe_buffer_begin(aggregate_local_frontier_bias_local_sums) + - local_frontier_displacements[i], - get_dataframe_buffer_begin(aggregate_local_frontier_bias_local_sums) + - local_frontier_displacements[i] + local_frontier_sizes[i], + aggregate_local_frontier_local_degrees.begin() + local_frontier_displacements[i], + aggregate_local_frontier_local_degrees.begin() + local_frontier_displacements[i] + + local_frontier_sizes[i], 
[key_idx_to_unique_key_idx = raft::device_span( aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_displacements[i], @@ -1539,214 +1717,283 @@ biased_sample( unique_key_local_degree_offsets = raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data() + local_frontier_unique_key_displacements[i], - local_frontier_unique_key_sizes[i] + 1), - aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums = - raft::device_span( - aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.data(), - aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums - .size())] __device__(size_t i) { + local_frontier_unique_key_sizes[i] + 1)] __device__(size_t i) { auto unique_key_idx = key_idx_to_unique_key_idx[i]; - auto degree = unique_key_local_degree_offsets[unique_key_idx + 1] - - unique_key_local_degree_offsets[unique_key_idx]; - if (degree > 0) { - return aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums - [unique_key_local_degree_offsets[unique_key_idx] + degree - 1]; - } else { - return bias_t{0.0}; - } + return unique_key_local_degree_offsets[unique_key_idx + 1] - + unique_key_local_degree_offsets[unique_key_idx]; }); } - - rmm::device_uvector frontier_bias_sums(0, handle.get_stream()); - std::optional> frontier_partitioned_bias_local_sum_displacements{ - std::nullopt}; if (minor_comm_size > 1) { - std::tie(frontier_bias_sums, frontier_partitioned_bias_local_sum_displacements) = + std::tie(frontier_degrees, frontier_partitioned_local_degree_displacements) = compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( handle, - raft::device_span(aggregate_local_frontier_bias_local_sums.data(), - aggregate_local_frontier_bias_local_sums.size()), + raft::device_span(aggregate_local_frontier_local_degrees.data(), + aggregate_local_frontier_local_degrees.size()), local_frontier_displacements, local_frontier_sizes); - 
aggregate_local_frontier_bias_local_sums.resize(0, handle.get_stream()); - aggregate_local_frontier_bias_local_sums.shrink_to_fit(handle.get_stream()); } else { - frontier_bias_sums = std::move(aggregate_local_frontier_bias_local_sums); + frontier_degrees = std::move(aggregate_local_frontier_local_degrees); } + } - rmm::device_uvector sample_random_numbers(local_frontier_sizes[minor_comm_rank] * K, - handle.get_stream()); - cugraph::detail::uniform_random_fill(handle.get_stream(), - sample_random_numbers.data(), - sample_random_numbers.size(), - bias_t{0.0}, - bias_t{1.0}, - rng_state); - thrust::transform( - handle.get_thrust_policy(), - sample_random_numbers.begin(), - sample_random_numbers.end(), - thrust::make_counting_iterator(size_t{0}), - sample_random_numbers.begin(), - [frontier_bias_sums = - raft::device_span(frontier_bias_sums.data(), frontier_bias_sums.size()), - K, - invalid_value = std::numeric_limits::infinity()] __device__(bias_t r, size_t i) { - // frontier_bias_sums[i / K] will be 0 if degree is 0 or all the edges have 0 bias - return frontier_bias_sums[i / K] > 0.0 ? r * frontier_bias_sums[i / K] : invalid_value; - }); + auto [frontier_indices, frontier_partition_offsets] = partition_v_frontier( + handle, + frontier_degrees.begin(), + frontier_degrees.end(), + std::vector{static_cast(K + 1), static_cast(minor_comm_size * K * 2)}); - rmm::device_uvector sample_local_random_numbers(0, handle.get_stream()); - std::tie(sample_local_random_numbers, key_indices, local_frontier_sample_offsets) = - shuffle_and_compute_local_nbr_values( - handle, - std::move(sample_random_numbers), - frontier_partitioned_bias_local_sum_displacements - ? 
std::make_optional>( - (*frontier_partitioned_bias_local_sum_displacements).data(), - (*frontier_partitioned_bias_local_sum_displacements).size()) - : std::nullopt, - K, - std::numeric_limits::infinity()); + rmm::device_uvector nbr_indices(frontier_degrees.size() * K, handle.get_stream()); - local_nbr_indices.resize(sample_local_random_numbers.size(), handle.get_stream()); - for (size_t i = 0; i < num_local_edge_partitions; ++i) { - thrust::tabulate( - handle.get_thrust_policy(), - local_nbr_indices.begin() + local_frontier_sample_offsets[i], - local_nbr_indices.begin() + local_frontier_sample_offsets[i + 1], - [K, - sample_local_random_numbers = raft::device_span( - sample_local_random_numbers.data() + local_frontier_sample_offsets[i], - local_frontier_sample_offsets[i + 1] - local_frontier_sample_offsets[i]), - key_indices = - key_indices ? thrust::make_optional>( - (*key_indices).data() + local_frontier_sample_offsets[i], - local_frontier_sample_offsets[i + 1] - local_frontier_sample_offsets[i]) - : thrust::nullopt, - key_idx_to_unique_key_idx = raft::device_span( - aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - local_frontier_displacements[i], - local_frontier_sizes[i]), - aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums = - raft::device_span( - aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.data(), - aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.size()), - unique_key_local_degree_offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_displacements[i], - local_frontier_unique_key_sizes[i] + 1), - invalid_random_number = std::numeric_limits::infinity(), - invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { - auto key_idx = key_indices ? 
(*key_indices)[i] : (i / K); - auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; - auto local_random_number = sample_local_random_numbers[i]; - if (local_random_number != invalid_random_number) { - auto local_degree = - static_cast(unique_key_local_degree_offsets[unique_key_idx + 1] - - unique_key_local_degree_offsets[unique_key_idx]); - auto inclusive_sum_first = - aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.begin() + - unique_key_local_degree_offsets[unique_key_idx]; - auto inclusive_sum_last = inclusive_sum_first + local_degree; - auto local_nbr_idx = static_cast(thrust::distance( - inclusive_sum_first, - thrust::upper_bound( - thrust::seq, inclusive_sum_first, inclusive_sum_last, local_random_number))); - return cuda::std::min(local_nbr_idx, local_degree - 1); - } else { - return invalid_idx; - } - }); - } - } else { - rmm::device_uvector frontier_degrees(0, handle.get_stream()); - std::optional> frontier_partitioned_local_degree_displacements{ - std::nullopt}; - { - rmm::device_uvector aggregate_local_frontier_local_degrees( - local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); + if (minor_comm_size > 1) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + + std::vector low_local_frontier_sizes{}; + low_local_frontier_sizes = + host_scalar_allgather(minor_comm, frontier_partition_offsets[1], handle.get_stream()); + std::vector low_local_frontier_displacements(low_local_frontier_sizes.size()); + std::exclusive_scan(low_local_frontier_sizes.begin(), + low_local_frontier_sizes.end(), + low_local_frontier_displacements.begin(), + size_t{0}); + + if (low_local_frontier_displacements.back() + low_local_frontier_sizes.back() > 0) { + // aggregate frontier indices with their degrees in the low range + + auto aggregate_low_local_frontier_indices = rmm::device_uvector( + low_local_frontier_displacements.back() + low_local_frontier_sizes.back(), + 
handle.get_stream()); + device_allgatherv(minor_comm, + frontier_indices.begin(), + aggregate_low_local_frontier_indices.begin(), + low_local_frontier_sizes, + low_local_frontier_displacements, + handle.get_stream()); + + // collect 0 bias value neighbor indices + + rmm::device_uvector zero_bias_frontier_indices( + aggregate_low_local_frontier_indices.size() * K /* generous upper bound */, + handle.get_stream()); + rmm::device_uvector zero_bias_local_nbr_indices(zero_bias_frontier_indices.size(), + handle.get_stream()); + rmm::device_scalar counter(0, handle.get_stream()); + std::vector zero_bias_count_inclusive_sums(low_local_frontier_sizes.size()); for (size_t i = 0; i < num_local_edge_partitions; ++i) { - thrust::tabulate( + thrust::for_each( handle.get_thrust_policy(), - aggregate_local_frontier_local_degrees.begin() + local_frontier_displacements[i], - aggregate_local_frontier_local_degrees.begin() + local_frontier_displacements[i] + - local_frontier_sizes[i], + aggregate_low_local_frontier_indices.begin() + low_local_frontier_displacements[i], + aggregate_low_local_frontier_indices.begin() + + (low_local_frontier_displacements[i] + low_local_frontier_sizes[i]), [key_idx_to_unique_key_idx = raft::device_span( aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_displacements[i], local_frontier_sizes[i]), + aggregate_local_frontier_unique_key_biases = + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), unique_key_local_degree_offsets = raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data() + local_frontier_unique_key_displacements[i], - local_frontier_unique_key_sizes[i] + 1)] __device__(size_t i) { + local_frontier_unique_key_sizes[i] + 1), + zero_bias_frontier_indices = raft::device_span( + zero_bias_frontier_indices.data(), zero_bias_frontier_indices.size()), + zero_bias_local_nbr_indices = raft::device_span( + 
zero_bias_local_nbr_indices.data(), zero_bias_local_nbr_indices.size()), + input_offset = local_frontier_displacements[i], + counter = counter.data()] __device__(size_t i) { auto unique_key_idx = key_idx_to_unique_key_idx[i]; - return unique_key_local_degree_offsets[unique_key_idx + 1] - - unique_key_local_degree_offsets[unique_key_idx]; + auto start_offset = unique_key_local_degree_offsets[unique_key_idx]; + auto end_offset = unique_key_local_degree_offsets[unique_key_idx + 1]; + cuda::atomic_ref atomic_counter(*counter); + for (auto j = start_offset; j < end_offset; ++j) { + if (aggregate_local_frontier_unique_key_biases[j] == 0.0) { + auto idx = atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); + zero_bias_frontier_indices[idx] = i; + zero_bias_local_nbr_indices[idx] = j - start_offset; + } + } }); + zero_bias_count_inclusive_sums[i] = counter.value(handle.get_stream()); } - if (minor_comm_size > 1) { - std::tie(frontier_degrees, frontier_partitioned_local_degree_displacements) = - compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( - handle, - raft::device_span(aggregate_local_frontier_local_degrees.data(), - aggregate_local_frontier_local_degrees.size()), - local_frontier_displacements, - local_frontier_sizes); - } else { - frontier_degrees = std::move(aggregate_local_frontier_local_degrees); + zero_bias_frontier_indices.resize(zero_bias_count_inclusive_sums.back(), handle.get_stream()); + zero_bias_frontier_indices.shrink_to_fit(handle.get_stream()); + zero_bias_local_nbr_indices.resize(zero_bias_frontier_indices.size(), handle.get_stream()); + zero_bias_local_nbr_indices.shrink_to_fit(handle.get_stream()); + std::vector zero_bias_counts(zero_bias_count_inclusive_sums.size()); + std::adjacent_difference(zero_bias_count_inclusive_sums.begin(), + zero_bias_count_inclusive_sums.end(), + zero_bias_counts.begin()); + + rmm::device_uvector low_frontier_gathered_zero_bias_frontier_indices( + 0, handle.get_stream()); + 
rmm::device_uvector low_frontier_gathered_zero_bias_nbr_indices(0, + handle.get_stream()); + std::vector rx_counts{}; + std::forward_as_tuple(std::tie(low_frontier_gathered_zero_bias_frontier_indices, + low_frontier_gathered_zero_bias_nbr_indices), + rx_counts) = + shuffle_values(minor_comm, + thrust::make_zip_iterator(zero_bias_frontier_indices.begin(), + zero_bias_local_nbr_indices.begin()), + zero_bias_counts, + handle.get_stream()); + std::vector rx_displacements(rx_counts.size()); + std::exclusive_scan(rx_counts.begin(), rx_counts.end(), rx_displacements.begin(), size_t{0}); + + // convert local neighbor indices to global neighbor indices and sort + + auto pair_first = + thrust::make_zip_iterator(low_frontier_gathered_zero_bias_frontier_indices.begin(), + low_frontier_gathered_zero_bias_nbr_indices.begin()); + for (size_t i = 0; i < num_local_edge_partitions; ++i) { + thrust::transform( + handle.get_thrust_policy(), + pair_first + rx_displacements[i], + pair_first + rx_displacements[i] + rx_counts[i], + low_frontier_gathered_zero_bias_nbr_indices.begin() + rx_displacements[i], + cuda::proclaim_return_type( + [frontier_partitioned_local_degree_displacements = raft::device_span( + (*frontier_partitioned_local_degree_displacements).data(), + (*frontier_partitioned_local_degree_displacements).size()), + minor_comm_size, + minor_comm_rank = i] __device__(auto pair) { + auto frontier_idx = thrust::get<0>(pair); + auto local_nbr_idx = thrust::get<1>(pair); + return frontier_partitioned_local_degree_displacements[frontier_idx * + minor_comm_size + + minor_comm_rank] + + local_nbr_idx; + })); } + + thrust::sort(handle.get_thrust_policy(), + pair_first, + pair_first + low_frontier_gathered_zero_bias_frontier_indices.size()); + + // update neighbor indices excluding zero bias neighbor indices + + thrust::for_each( + handle.get_thrust_policy(), + frontier_indices.begin(), + frontier_indices.begin() + frontier_partition_offsets[1], + [sorted_zero_bias_frontier_indices = + 
raft::device_span(low_frontier_gathered_zero_bias_frontier_indices.data(), + low_frontier_gathered_zero_bias_frontier_indices.size()), + sorted_zero_bias_nbr_indices = + raft::device_span(low_frontier_gathered_zero_bias_nbr_indices.data(), + low_frontier_gathered_zero_bias_nbr_indices.size()), + frontier_degrees = + raft::device_span(frontier_degrees.data(), frontier_degrees.size()), + nbr_indices = raft::device_span(nbr_indices.data(), nbr_indices.size()), + K, + invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { + auto first = thrust::lower_bound(thrust::seq, + sorted_zero_bias_frontier_indices.begin(), + sorted_zero_bias_frontier_indices.end(), + i); + auto last = + thrust::upper_bound(thrust::seq, first, sorted_zero_bias_frontier_indices.end(), i); + auto degree = frontier_degrees[i]; + edge_t num_valid = 0; + if (thrust::distance(first, last) == 0) { + thrust::sequence(thrust::seq, + nbr_indices.begin() + i * K, + nbr_indices.begin() + i * K + degree, + edge_t{0}); + num_valid = degree; + } else { + auto start_offset = thrust::distance(sorted_zero_bias_frontier_indices.begin(), first); + auto end_offset = thrust::distance(sorted_zero_bias_frontier_indices.begin(), last); + for (size_t j = 0; j < degree; ++j) { + if (!thrust::binary_search(thrust::seq, + sorted_zero_bias_nbr_indices.begin() + start_offset, + sorted_zero_bias_nbr_indices.begin() + end_offset, + j)) { + *(nbr_indices.begin() + i * K + num_valid) = j; + ++num_valid; + } + } + } + thrust::fill(thrust::seq, + nbr_indices.begin() + i * K + num_valid, + nbr_indices.begin() + (i + 1) * K, + invalid_idx); + }); } - auto [frontier_indices, frontier_partition_offsets] = - partition_v_frontier(handle, - frontier_degrees.begin(), - frontier_degrees.end(), - std::vector{static_cast(K + 1), - static_cast(minor_comm_size * K * 2)}); + auto mid_frontier_size = frontier_partition_offsets[2] - frontier_partition_offsets[1]; + std::vector mid_local_frontier_sizes{}; + mid_local_frontier_sizes = + 
host_scalar_allgather(minor_comm, mid_frontier_size, handle.get_stream()); + std::vector mid_local_frontier_displacements(mid_local_frontier_sizes.size()); + std::exclusive_scan(mid_local_frontier_sizes.begin(), + mid_local_frontier_sizes.end(), + mid_local_frontier_displacements.begin(), + size_t{0}); + + if (mid_local_frontier_displacements.back() + mid_local_frontier_sizes.back() > 0) { + // aggregate frontier indices with their degrees in the medium range + + auto aggregate_mid_local_frontier_indices = rmm::device_uvector( + mid_local_frontier_displacements.back() + mid_local_frontier_sizes.back(), + handle.get_stream()); + device_allgatherv(minor_comm, + frontier_indices.begin() + frontier_partition_offsets[1], + aggregate_mid_local_frontier_indices.begin(), + mid_local_frontier_sizes, + mid_local_frontier_displacements, + handle.get_stream()); - rmm::device_uvector nbr_indices(frontier_degrees.size() * K, handle.get_stream()); + // compute local degrees for the aggregated frontier indices - if (minor_comm_size > 1) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - - std::vector low_local_frontier_sizes{}; - low_local_frontier_sizes = - host_scalar_allgather(minor_comm, frontier_partition_offsets[1], handle.get_stream()); - std::vector low_local_frontier_displacements(low_local_frontier_sizes.size()); - std::exclusive_scan(low_local_frontier_sizes.begin(), - low_local_frontier_sizes.end(), - low_local_frontier_displacements.begin(), - size_t{0}); - - if (low_local_frontier_displacements.back() + low_local_frontier_sizes.back() > 0) { - // aggregate frontier indices with their degrees in the low range - - auto aggregate_low_local_frontier_indices = rmm::device_uvector( - low_local_frontier_displacements.back() + low_local_frontier_sizes.back(), - handle.get_stream()); - device_allgatherv(minor_comm, - frontier_indices.begin(), - aggregate_low_local_frontier_indices.begin(), - low_local_frontier_sizes, - 
low_local_frontier_displacements, - handle.get_stream()); + rmm::device_uvector aggregate_mid_local_frontier_local_degrees( + aggregate_mid_local_frontier_indices.size(), handle.get_stream()); + for (size_t i = 0; i < num_local_edge_partitions; ++i) { + thrust::transform( + handle.get_thrust_policy(), + aggregate_mid_local_frontier_indices.begin() + mid_local_frontier_displacements[i], + aggregate_mid_local_frontier_indices.begin() + mid_local_frontier_displacements[i] + + mid_local_frontier_sizes[i], + aggregate_mid_local_frontier_local_degrees.begin() + mid_local_frontier_displacements[i], + cuda::proclaim_return_type( + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + + local_frontier_displacements[i], + local_frontier_sizes[i]), + unique_key_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data() + + local_frontier_unique_key_displacements[i], + local_frontier_unique_key_sizes[i] + 1)] __device__(size_t i) { + auto unique_key_idx = key_idx_to_unique_key_idx[i]; + return static_cast(unique_key_local_degree_offsets[unique_key_idx + 1] - + unique_key_local_degree_offsets[unique_key_idx]); + })); + } - // collect 0 bias value neighbor indices + // gather biases for the aggregated frontier indices - rmm::device_uvector zero_bias_frontier_indices( - aggregate_low_local_frontier_indices.size() * K /* generous upper bound */, + rmm::device_uvector aggregate_mid_local_frontier_biases(0, handle.get_stream()); + std::vector mid_local_frontier_degree_sums(mid_local_frontier_sizes.size()); + { + rmm::device_uvector aggregate_mid_local_frontier_local_degree_offsets( + aggregate_mid_local_frontier_local_degrees.size() + 1, handle.get_stream()); + aggregate_mid_local_frontier_local_degree_offsets.set_element_to_zero_async( + 0, handle.get_stream()); + thrust::inclusive_scan(handle.get_thrust_policy(), + aggregate_mid_local_frontier_local_degrees.begin(), + 
aggregate_mid_local_frontier_local_degrees.end(), + aggregate_mid_local_frontier_local_degree_offsets.begin() + 1); + aggregate_mid_local_frontier_biases.resize( + aggregate_mid_local_frontier_local_degree_offsets.back_element(handle.get_stream()), handle.get_stream()); - rmm::device_uvector zero_bias_local_nbr_indices(zero_bias_frontier_indices.size(), - handle.get_stream()); - rmm::device_scalar counter(0, handle.get_stream()); - std::vector zero_bias_count_inclusive_sums(low_local_frontier_sizes.size()); + + std::vector mid_local_frontier_degree_sum_lasts( + mid_local_frontier_degree_sums.size()); for (size_t i = 0; i < num_local_edge_partitions; ++i) { thrust::for_each( handle.get_thrust_policy(), - aggregate_low_local_frontier_indices.begin() + low_local_frontier_displacements[i], - aggregate_low_local_frontier_indices.begin() + - (low_local_frontier_displacements[i] + low_local_frontier_sizes[i]), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(mid_local_frontier_sizes[i]), [key_idx_to_unique_key_idx = raft::device_span( aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_displacements[i], @@ -1758,615 +2005,399 @@ biased_sample( aggregate_local_frontier_unique_key_local_degree_offsets.data() + local_frontier_unique_key_displacements[i], local_frontier_unique_key_sizes[i] + 1), - zero_bias_frontier_indices = raft::device_span( - zero_bias_frontier_indices.data(), zero_bias_frontier_indices.size()), - zero_bias_local_nbr_indices = raft::device_span( - zero_bias_local_nbr_indices.data(), zero_bias_local_nbr_indices.size()), - input_offset = local_frontier_displacements[i], - counter = counter.data()] __device__(size_t i) { - auto unique_key_idx = key_idx_to_unique_key_idx[i]; - auto start_offset = unique_key_local_degree_offsets[unique_key_idx]; - auto end_offset = unique_key_local_degree_offsets[unique_key_idx + 1]; - cuda::atomic_ref atomic_counter(*counter); - for (auto j = start_offset; j < end_offset; 
++j) { - if (aggregate_local_frontier_unique_key_biases[j] == 0.0) { - auto idx = atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); - zero_bias_frontier_indices[idx] = i; - zero_bias_local_nbr_indices[idx] = j - start_offset; - } - } + mid_local_frontier_indices = raft::device_span( + aggregate_mid_local_frontier_indices.data() + mid_local_frontier_displacements[i], + mid_local_frontier_sizes[i]), + aggregate_mid_local_frontier_biases = + raft::device_span(aggregate_mid_local_frontier_biases.data(), + aggregate_mid_local_frontier_biases.size()), + aggregate_mid_local_frontier_local_degree_offsets = raft::device_span( + aggregate_mid_local_frontier_local_degree_offsets.data(), + aggregate_mid_local_frontier_local_degree_offsets.size()), + output_offset = mid_local_frontier_displacements[i]] __device__(size_t i) { + auto unique_key_idx = key_idx_to_unique_key_idx[mid_local_frontier_indices[i]]; + thrust::copy(thrust::seq, + aggregate_local_frontier_unique_key_biases.begin() + + unique_key_local_degree_offsets[unique_key_idx], + aggregate_local_frontier_unique_key_biases.begin() + + unique_key_local_degree_offsets[unique_key_idx + 1], + aggregate_mid_local_frontier_biases.begin() + + aggregate_mid_local_frontier_local_degree_offsets[output_offset + i]); }); - zero_bias_count_inclusive_sums[i] = counter.value(handle.get_stream()); + mid_local_frontier_degree_sum_lasts[i] = + aggregate_mid_local_frontier_local_degree_offsets.element( + mid_local_frontier_displacements[i] + mid_local_frontier_sizes[i], + handle.get_stream()); } - zero_bias_frontier_indices.resize(zero_bias_count_inclusive_sums.back(), - handle.get_stream()); - zero_bias_frontier_indices.shrink_to_fit(handle.get_stream()); - zero_bias_local_nbr_indices.resize(zero_bias_frontier_indices.size(), handle.get_stream()); - zero_bias_local_nbr_indices.shrink_to_fit(handle.get_stream()); - std::vector zero_bias_counts(zero_bias_count_inclusive_sums.size()); - 
std::adjacent_difference(zero_bias_count_inclusive_sums.begin(), - zero_bias_count_inclusive_sums.end(), - zero_bias_counts.begin()); - - rmm::device_uvector low_frontier_gathered_zero_bias_frontier_indices( - 0, handle.get_stream()); - rmm::device_uvector low_frontier_gathered_zero_bias_nbr_indices( - 0, handle.get_stream()); - std::vector rx_counts{}; - std::forward_as_tuple(std::tie(low_frontier_gathered_zero_bias_frontier_indices, - low_frontier_gathered_zero_bias_nbr_indices), - rx_counts) = - shuffle_values(minor_comm, - thrust::make_zip_iterator(zero_bias_frontier_indices.begin(), - zero_bias_local_nbr_indices.begin()), - zero_bias_counts, - handle.get_stream()); - std::vector rx_displacements(rx_counts.size()); - std::exclusive_scan( - rx_counts.begin(), rx_counts.end(), rx_displacements.begin(), size_t{0}); - - // convert local neighbor indices to global neighbor indices and sort - - auto pair_first = - thrust::make_zip_iterator(low_frontier_gathered_zero_bias_frontier_indices.begin(), - low_frontier_gathered_zero_bias_nbr_indices.begin()); - for (size_t i = 0; i < num_local_edge_partitions; ++i) { - thrust::transform( - handle.get_thrust_policy(), - pair_first + rx_displacements[i], - pair_first + rx_displacements[i] + rx_counts[i], - low_frontier_gathered_zero_bias_nbr_indices.begin() + rx_displacements[i], - cuda::proclaim_return_type( - [frontier_partitioned_local_degree_displacements = raft::device_span( - (*frontier_partitioned_local_degree_displacements).data(), - (*frontier_partitioned_local_degree_displacements).size()), - minor_comm_size, - minor_comm_rank = i] __device__(auto pair) { - auto frontier_idx = thrust::get<0>(pair); - auto local_nbr_idx = thrust::get<1>(pair); - return frontier_partitioned_local_degree_displacements[frontier_idx * - minor_comm_size + - minor_comm_rank] + - local_nbr_idx; - })); - } - - thrust::sort(handle.get_thrust_policy(), - pair_first, - pair_first + low_frontier_gathered_zero_bias_frontier_indices.size()); + 
std::adjacent_difference(mid_local_frontier_degree_sum_lasts.begin(), + mid_local_frontier_degree_sum_lasts.end(), + mid_local_frontier_degree_sums.begin()); + } + aggregate_mid_local_frontier_indices.resize(0, handle.get_stream()); + aggregate_mid_local_frontier_indices.shrink_to_fit(handle.get_stream()); - // update neighbor indices excluding zero bias neighbor indices + // shuffle local degrees & biases - thrust::for_each( - handle.get_thrust_policy(), - frontier_indices.begin(), - frontier_indices.begin() + frontier_partition_offsets[1], - [sorted_zero_bias_frontier_indices = raft::device_span( - low_frontier_gathered_zero_bias_frontier_indices.data(), - low_frontier_gathered_zero_bias_frontier_indices.size()), - sorted_zero_bias_nbr_indices = - raft::device_span(low_frontier_gathered_zero_bias_nbr_indices.data(), - low_frontier_gathered_zero_bias_nbr_indices.size()), - frontier_degrees = - raft::device_span(frontier_degrees.data(), frontier_degrees.size()), - nbr_indices = raft::device_span(nbr_indices.data(), nbr_indices.size()), - K, - invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { - auto first = thrust::lower_bound(thrust::seq, - sorted_zero_bias_frontier_indices.begin(), - sorted_zero_bias_frontier_indices.end(), - i); - auto last = - thrust::upper_bound(thrust::seq, first, sorted_zero_bias_frontier_indices.end(), i); - auto degree = frontier_degrees[i]; - edge_t num_valid = 0; - if (thrust::distance(first, last) == 0) { - thrust::sequence(thrust::seq, - nbr_indices.begin() + i * K, - nbr_indices.begin() + i * K + degree, - edge_t{0}); - num_valid = degree; - } else { - auto start_offset = - thrust::distance(sorted_zero_bias_frontier_indices.begin(), first); - auto end_offset = thrust::distance(sorted_zero_bias_frontier_indices.begin(), last); - for (size_t j = 0; j < degree; ++j) { - if (!thrust::binary_search(thrust::seq, - sorted_zero_bias_nbr_indices.begin() + start_offset, - sorted_zero_bias_nbr_indices.begin() + end_offset, - j)) { 
- *(nbr_indices.begin() + i * K + num_valid) = j; - ++num_valid; - } - } - } - thrust::fill(thrust::seq, - nbr_indices.begin() + i * K + num_valid, - nbr_indices.begin() + (i + 1) * K, - invalid_idx); - }); + rmm::device_uvector mid_frontier_gathered_local_degree_offsets(0, + handle.get_stream()); + { + rmm::device_uvector mid_frontier_gathered_local_degrees(0, handle.get_stream()); + std::tie(mid_frontier_gathered_local_degrees, std::ignore) = + shuffle_values(minor_comm, + aggregate_mid_local_frontier_local_degrees.data(), + mid_local_frontier_sizes, + handle.get_stream()); + aggregate_mid_local_frontier_local_degrees.resize(0, handle.get_stream()); + aggregate_mid_local_frontier_local_degrees.shrink_to_fit(handle.get_stream()); + mid_frontier_gathered_local_degree_offsets.resize( + mid_frontier_gathered_local_degrees.size() + 1, handle.get_stream()); + mid_frontier_gathered_local_degree_offsets.set_element_to_zero_async(0, + handle.get_stream()); + thrust::inclusive_scan(handle.get_thrust_policy(), + mid_frontier_gathered_local_degrees.begin(), + mid_frontier_gathered_local_degrees.end(), + mid_frontier_gathered_local_degree_offsets.begin() + 1); } - auto mid_frontier_size = frontier_partition_offsets[2] - frontier_partition_offsets[1]; - std::vector mid_local_frontier_sizes{}; - mid_local_frontier_sizes = - host_scalar_allgather(minor_comm, mid_frontier_size, handle.get_stream()); - std::vector mid_local_frontier_displacements(mid_local_frontier_sizes.size()); - std::exclusive_scan(mid_local_frontier_sizes.begin(), - mid_local_frontier_sizes.end(), - mid_local_frontier_displacements.begin(), - size_t{0}); - - if (mid_local_frontier_displacements.back() + mid_local_frontier_sizes.back() > 0) { - // aggregate frontier indices with their degrees in the medium range - - auto aggregate_mid_local_frontier_indices = rmm::device_uvector( - mid_local_frontier_displacements.back() + mid_local_frontier_sizes.back(), - handle.get_stream()); - device_allgatherv(minor_comm, 
- frontier_indices.begin() + frontier_partition_offsets[1], - aggregate_mid_local_frontier_indices.begin(), - mid_local_frontier_sizes, - mid_local_frontier_displacements, - handle.get_stream()); + rmm::device_uvector mid_frontier_gathered_biases(0, handle.get_stream()); + std::tie(mid_frontier_gathered_biases, std::ignore) = + shuffle_values(minor_comm, + aggregate_mid_local_frontier_biases.data(), + mid_local_frontier_degree_sums, + handle.get_stream()); + aggregate_mid_local_frontier_biases.resize(0, handle.get_stream()); + aggregate_mid_local_frontier_biases.shrink_to_fit(handle.get_stream()); - // compute local degrees for the aggregated frontier indices + auto mid_frontier_degree_first = thrust::make_transform_iterator( + frontier_indices.begin() + frontier_partition_offsets[1], + cuda::proclaim_return_type( + [frontier_degrees = raft::device_span( + frontier_degrees.data(), frontier_degrees.size())] __device__(size_t i) { + return frontier_degrees[i]; + })); + rmm::device_uvector mid_frontier_degree_offsets(mid_frontier_size + 1, + handle.get_stream()); + mid_frontier_degree_offsets.set_element_to_zero_async(0, handle.get_stream()); + thrust::inclusive_scan(handle.get_thrust_policy(), + mid_frontier_degree_first, + mid_frontier_degree_first + mid_frontier_size, + mid_frontier_degree_offsets.begin() + 1); + rmm::device_uvector mid_frontier_biases(mid_frontier_gathered_biases.size(), + handle.get_stream()); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(mid_frontier_size), + [mid_frontier_gathered_local_degree_offsets = + raft::device_span(mid_frontier_gathered_local_degree_offsets.data(), + mid_frontier_gathered_local_degree_offsets.size()), + mid_frontier_gathered_biases = raft::device_span( + mid_frontier_gathered_biases.data(), mid_frontier_gathered_biases.size()), + mid_frontier_degree_offsets = raft::device_span( + mid_frontier_degree_offsets.data(), 
mid_frontier_degree_offsets.size()), + mid_frontier_biases = + raft::device_span(mid_frontier_biases.data(), mid_frontier_biases.size()), + minor_comm_size, + mid_frontier_size] __device__(size_t i) { + auto output_offset = mid_frontier_degree_offsets[i]; + for (int j = 0; j < minor_comm_size; ++j) { + auto input_offset = + mid_frontier_gathered_local_degree_offsets[mid_frontier_size * j + i]; + auto input_size = + mid_frontier_gathered_local_degree_offsets[mid_frontier_size * j + i + 1] - + input_offset; + thrust::copy(thrust::seq, + mid_frontier_gathered_biases.begin() + input_offset, + mid_frontier_gathered_biases.begin() + input_offset + input_size, + mid_frontier_biases.begin() + output_offset); + output_offset += input_size; + } + }); - rmm::device_uvector aggregate_mid_local_frontier_local_degrees( - aggregate_mid_local_frontier_indices.size(), handle.get_stream()); - for (size_t i = 0; i < num_local_edge_partitions; ++i) { - thrust::transform( - handle.get_thrust_policy(), - aggregate_mid_local_frontier_indices.begin() + mid_local_frontier_displacements[i], - aggregate_mid_local_frontier_indices.begin() + mid_local_frontier_displacements[i] + - mid_local_frontier_sizes[i], - aggregate_mid_local_frontier_local_degrees.begin() + - mid_local_frontier_displacements[i], - cuda::proclaim_return_type( - [key_idx_to_unique_key_idx = raft::device_span( - aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - local_frontier_displacements[i], - local_frontier_sizes[i]), - unique_key_local_degree_offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_displacements[i], - local_frontier_unique_key_sizes[i] + 1)] __device__(size_t i) { - auto unique_key_idx = key_idx_to_unique_key_idx[i]; - return static_cast(unique_key_local_degree_offsets[unique_key_idx + 1] - - unique_key_local_degree_offsets[unique_key_idx]); - })); - } + // now sample and update indices - // gather biases for the aggregated 
frontier indices - - rmm::device_uvector aggregate_mid_local_frontier_biases(0, handle.get_stream()); - std::vector mid_local_frontier_degree_sums(mid_local_frontier_sizes.size()); - { - rmm::device_uvector aggregate_mid_local_frontier_local_degree_offsets( - aggregate_mid_local_frontier_local_degrees.size() + 1, handle.get_stream()); - aggregate_mid_local_frontier_local_degree_offsets.set_element_to_zero_async( - 0, handle.get_stream()); - thrust::inclusive_scan(handle.get_thrust_policy(), - aggregate_mid_local_frontier_local_degrees.begin(), - aggregate_mid_local_frontier_local_degrees.end(), - aggregate_mid_local_frontier_local_degree_offsets.begin() + 1); - aggregate_mid_local_frontier_biases.resize( - aggregate_mid_local_frontier_local_degree_offsets.back_element(handle.get_stream()), - handle.get_stream()); + compute_biased_sampling_index_without_replacement( + handle, + std::nullopt, + raft::device_span(mid_frontier_degree_offsets.data(), + mid_frontier_degree_offsets.size()), + raft::device_span(mid_frontier_biases.data(), mid_frontier_biases.size()), + std::make_optional>( + frontier_indices.begin() + frontier_partition_offsets[1], mid_frontier_size), + raft::device_span(nbr_indices.data(), nbr_indices.size()), + std::nullopt, + rng_state, + K, + false); + } - std::vector mid_local_frontier_degree_sum_lasts( - mid_local_frontier_degree_sums.size()); - for (size_t i = 0; i < num_local_edge_partitions; ++i) { - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(mid_local_frontier_sizes[i]), - [key_idx_to_unique_key_idx = raft::device_span( - aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - local_frontier_displacements[i], - local_frontier_sizes[i]), - aggregate_local_frontier_unique_key_biases = - raft::device_span(aggregate_local_frontier_unique_key_biases.data(), - aggregate_local_frontier_unique_key_biases.size()), - unique_key_local_degree_offsets = raft::device_span( 
- aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_displacements[i], - local_frontier_unique_key_sizes[i] + 1), - mid_local_frontier_indices = raft::device_span( - aggregate_mid_local_frontier_indices.data() + mid_local_frontier_displacements[i], - mid_local_frontier_sizes[i]), - aggregate_mid_local_frontier_biases = - raft::device_span(aggregate_mid_local_frontier_biases.data(), - aggregate_mid_local_frontier_biases.size()), - aggregate_mid_local_frontier_local_degree_offsets = raft::device_span( - aggregate_mid_local_frontier_local_degree_offsets.data(), - aggregate_mid_local_frontier_local_degree_offsets.size()), - output_offset = mid_local_frontier_displacements[i]] __device__(size_t i) { - auto unique_key_idx = key_idx_to_unique_key_idx[mid_local_frontier_indices[i]]; - thrust::copy( - thrust::seq, - aggregate_local_frontier_unique_key_biases.begin() + - unique_key_local_degree_offsets[unique_key_idx], - aggregate_local_frontier_unique_key_biases.begin() + - unique_key_local_degree_offsets[unique_key_idx + 1], - aggregate_mid_local_frontier_biases.begin() + - aggregate_mid_local_frontier_local_degree_offsets[output_offset + i]); - }); - mid_local_frontier_degree_sum_lasts[i] = - aggregate_mid_local_frontier_local_degree_offsets.element( - mid_local_frontier_displacements[i] + mid_local_frontier_sizes[i], - handle.get_stream()); - } - std::adjacent_difference(mid_local_frontier_degree_sum_lasts.begin(), - mid_local_frontier_degree_sum_lasts.end(), - mid_local_frontier_degree_sums.begin()); - } - aggregate_mid_local_frontier_indices.resize(0, handle.get_stream()); - aggregate_mid_local_frontier_indices.shrink_to_fit(handle.get_stream()); - - // shuffle local degrees & biases - - rmm::device_uvector mid_frontier_gathered_local_degree_offsets(0, - handle.get_stream()); - { - rmm::device_uvector mid_frontier_gathered_local_degrees(0, handle.get_stream()); - std::tie(mid_frontier_gathered_local_degrees, std::ignore) = - 
shuffle_values(minor_comm, - aggregate_mid_local_frontier_local_degrees.data(), - mid_local_frontier_sizes, - handle.get_stream()); - aggregate_mid_local_frontier_local_degrees.resize(0, handle.get_stream()); - aggregate_mid_local_frontier_local_degrees.shrink_to_fit(handle.get_stream()); - mid_frontier_gathered_local_degree_offsets.resize( - mid_frontier_gathered_local_degrees.size() + 1, handle.get_stream()); - mid_frontier_gathered_local_degree_offsets.set_element_to_zero_async(0, - handle.get_stream()); - thrust::inclusive_scan(handle.get_thrust_policy(), - mid_frontier_gathered_local_degrees.begin(), - mid_frontier_gathered_local_degrees.end(), - mid_frontier_gathered_local_degree_offsets.begin() + 1); - } + auto high_frontier_size = frontier_partition_offsets[3] - frontier_partition_offsets[2]; + std::vector high_local_frontier_sizes{}; + high_local_frontier_sizes = + host_scalar_allgather(minor_comm, high_frontier_size, handle.get_stream()); + + std::vector high_local_frontier_displacements(high_local_frontier_sizes.size()); + std::exclusive_scan(high_local_frontier_sizes.begin(), + high_local_frontier_sizes.end(), + high_local_frontier_displacements.begin(), + size_t{0}); + if (high_local_frontier_displacements.back() + high_local_frontier_sizes.back() > 0) { + // aggregate frontier indices with their degrees in the high range + + auto aggregate_high_local_frontier_indices = rmm::device_uvector( + high_local_frontier_displacements.back() + high_local_frontier_sizes.back(), + handle.get_stream()); + device_allgatherv(minor_comm, + frontier_indices.begin() + frontier_partition_offsets[2], + aggregate_high_local_frontier_indices.begin(), + high_local_frontier_sizes, + high_local_frontier_displacements, + handle.get_stream()); - rmm::device_uvector mid_frontier_gathered_biases(0, handle.get_stream()); - std::tie(mid_frontier_gathered_biases, std::ignore) = - shuffle_values(minor_comm, - aggregate_mid_local_frontier_biases.data(), - 
mid_local_frontier_degree_sums, - handle.get_stream()); - aggregate_mid_local_frontier_biases.resize(0, handle.get_stream()); - aggregate_mid_local_frontier_biases.shrink_to_fit(handle.get_stream()); + // local sample and update indices - auto mid_frontier_degree_first = thrust::make_transform_iterator( - frontier_indices.begin() + frontier_partition_offsets[1], - cuda::proclaim_return_type( - [frontier_degrees = raft::device_span( - frontier_degrees.data(), frontier_degrees.size())] __device__(size_t i) { - return frontier_degrees[i]; - })); - rmm::device_uvector mid_frontier_degree_offsets(mid_frontier_size + 1, - handle.get_stream()); - mid_frontier_degree_offsets.set_element_to_zero_async(0, handle.get_stream()); - thrust::inclusive_scan(handle.get_thrust_policy(), - mid_frontier_degree_first, - mid_frontier_degree_first + mid_frontier_size, - mid_frontier_degree_offsets.begin() + 1); - rmm::device_uvector mid_frontier_biases(mid_frontier_gathered_biases.size(), - handle.get_stream()); - thrust::for_each( + rmm::device_uvector aggregate_high_local_frontier_local_nbr_indices( + (high_local_frontier_displacements.back() + high_local_frontier_sizes.back()) * K, + handle.get_stream()); + rmm::device_uvector aggregate_high_local_frontier_keys( + aggregate_high_local_frontier_local_nbr_indices.size(), handle.get_stream()); + for (size_t i = 0; i < num_local_edge_partitions; ++i) { + rmm::device_uvector unique_key_indices_for_key_indices(high_local_frontier_sizes[i], + handle.get_stream()); + thrust::gather( handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(mid_frontier_size), - [mid_frontier_gathered_local_degree_offsets = - raft::device_span(mid_frontier_gathered_local_degree_offsets.data(), - mid_frontier_gathered_local_degree_offsets.size()), - mid_frontier_gathered_biases = raft::device_span( - mid_frontier_gathered_biases.data(), mid_frontier_gathered_biases.size()), - mid_frontier_degree_offsets = 
raft::device_span( - mid_frontier_degree_offsets.data(), mid_frontier_degree_offsets.size()), - mid_frontier_biases = - raft::device_span(mid_frontier_biases.data(), mid_frontier_biases.size()), - minor_comm_size, - mid_frontier_size] __device__(size_t i) { - auto output_offset = mid_frontier_degree_offsets[i]; - for (int j = 0; j < minor_comm_size; ++j) { - auto input_offset = - mid_frontier_gathered_local_degree_offsets[mid_frontier_size * j + i]; - auto input_size = - mid_frontier_gathered_local_degree_offsets[mid_frontier_size * j + i + 1] - - input_offset; - thrust::copy(thrust::seq, - mid_frontier_gathered_biases.begin() + input_offset, - mid_frontier_gathered_biases.begin() + input_offset + input_size, - mid_frontier_biases.begin() + output_offset); - output_offset += input_size; - } - }); - - // now sample and update indices - + aggregate_high_local_frontier_indices.data() + high_local_frontier_displacements[i], + aggregate_high_local_frontier_indices.data() + high_local_frontier_displacements[i] + + high_local_frontier_sizes[i], + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + + local_frontier_displacements[i], + unique_key_indices_for_key_indices.begin()); compute_biased_sampling_index_without_replacement( handle, - std::nullopt, - raft::device_span(mid_frontier_degree_offsets.data(), - mid_frontier_degree_offsets.size()), - raft::device_span(mid_frontier_biases.data(), mid_frontier_biases.size()), std::make_optional>( - frontier_indices.begin() + frontier_partition_offsets[1], mid_frontier_size), - raft::device_span(nbr_indices.data(), nbr_indices.size()), + unique_key_indices_for_key_indices.data(), unique_key_indices_for_key_indices.size()), + raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data() + + local_frontier_unique_key_displacements[i], + local_frontier_unique_key_sizes[i] + 1), + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), 
std::nullopt, + raft::device_span(aggregate_high_local_frontier_local_nbr_indices.data() + + high_local_frontier_displacements[i] * K, + high_local_frontier_sizes[i] * K), + std::make_optional>( + aggregate_high_local_frontier_keys.data() + high_local_frontier_displacements[i] * K, + high_local_frontier_sizes[i] * K), rng_state, K, false); } - auto high_frontier_size = frontier_partition_offsets[3] - frontier_partition_offsets[2]; - std::vector high_local_frontier_sizes{}; - high_local_frontier_sizes = - host_scalar_allgather(minor_comm, high_frontier_size, handle.get_stream()); - - std::vector high_local_frontier_displacements(high_local_frontier_sizes.size()); - std::exclusive_scan(high_local_frontier_sizes.begin(), - high_local_frontier_sizes.end(), - high_local_frontier_displacements.begin(), - size_t{0}); - if (high_local_frontier_displacements.back() + high_local_frontier_sizes.back() > 0) { - // aggregate frontier indices with their degrees in the high range - - auto aggregate_high_local_frontier_indices = rmm::device_uvector( - high_local_frontier_displacements.back() + high_local_frontier_sizes.back(), - handle.get_stream()); - device_allgatherv(minor_comm, - frontier_indices.begin() + frontier_partition_offsets[2], - aggregate_high_local_frontier_indices.begin(), - high_local_frontier_sizes, - high_local_frontier_displacements, - handle.get_stream()); - - // local sample and update indices - - rmm::device_uvector aggregate_high_local_frontier_local_nbr_indices( - (high_local_frontier_displacements.back() + high_local_frontier_sizes.back()) * K, - handle.get_stream()); - rmm::device_uvector aggregate_high_local_frontier_keys( - aggregate_high_local_frontier_local_nbr_indices.size(), handle.get_stream()); - for (size_t i = 0; i < num_local_edge_partitions; ++i) { - rmm::device_uvector unique_key_indices_for_key_indices( - high_local_frontier_sizes[i], handle.get_stream()); - thrust::gather( - handle.get_thrust_policy(), - 
aggregate_high_local_frontier_indices.data() + high_local_frontier_displacements[i], - aggregate_high_local_frontier_indices.data() + high_local_frontier_displacements[i] + - high_local_frontier_sizes[i], - aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - local_frontier_displacements[i], - unique_key_indices_for_key_indices.begin()); - compute_biased_sampling_index_without_replacement( - handle, - std::make_optional>( - unique_key_indices_for_key_indices.data(), unique_key_indices_for_key_indices.size()), - raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_displacements[i], - local_frontier_unique_key_sizes[i] + 1), - raft::device_span(aggregate_local_frontier_unique_key_biases.data(), - aggregate_local_frontier_unique_key_biases.size()), - std::nullopt, - raft::device_span(aggregate_high_local_frontier_local_nbr_indices.data() + - high_local_frontier_displacements[i] * K, - high_local_frontier_sizes[i] * K), - std::make_optional>( - aggregate_high_local_frontier_keys.data() + high_local_frontier_displacements[i] * K, - high_local_frontier_sizes[i] * K), - rng_state, - K, - false); - } - - // shuffle local sampling outputs - - std::vector tx_counts(high_local_frontier_sizes); - std::transform(high_local_frontier_sizes.begin(), - high_local_frontier_sizes.end(), - tx_counts.begin(), - [K](size_t size) { return size * K; }); - rmm::device_uvector high_frontier_gathered_local_nbr_indices(0, - handle.get_stream()); - std::tie(high_frontier_gathered_local_nbr_indices, std::ignore) = - shuffle_values(minor_comm, - aggregate_high_local_frontier_local_nbr_indices.data(), - tx_counts, - handle.get_stream()); - rmm::device_uvector high_frontier_gathered_keys(0, handle.get_stream()); - std::tie(high_frontier_gathered_keys, std::ignore) = shuffle_values( - minor_comm, aggregate_high_local_frontier_keys.data(), tx_counts, handle.get_stream()); - 
aggregate_high_local_frontier_local_nbr_indices.resize(0, handle.get_stream()); - aggregate_high_local_frontier_local_nbr_indices.shrink_to_fit(handle.get_stream()); - aggregate_high_local_frontier_keys.resize(0, handle.get_stream()); - aggregate_high_local_frontier_keys.shrink_to_fit(handle.get_stream()); - - // merge local sampling outputs - - rmm::device_uvector high_frontier_nbr_indices( - high_frontier_size * minor_comm_size * K, handle.get_stream()); - rmm::device_uvector high_frontier_keys(high_frontier_nbr_indices.size(), - handle.get_stream()); - auto index_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - cuda::proclaim_return_type( - [K, minor_comm_rank, minor_comm_size, high_frontier_size] __device__(size_t i) { - auto idx = i / (K * minor_comm_size); - auto minor_comm_rank = (i % (K * minor_comm_size)) / K; - return minor_comm_rank * (high_frontier_size * K) + idx * K + (i % K); - })); - auto high_frontier_gathered_nbr_idx_first = thrust::make_transform_iterator( - thrust::counting_iterator(size_t{0}), - cuda::proclaim_return_type( - [frontier_partitioned_local_degree_displacements = raft::device_span( - (*frontier_partitioned_local_degree_displacements).data(), - (*frontier_partitioned_local_degree_displacements).size()), - high_frontier_indices = raft::device_span( - frontier_indices.data() + frontier_partition_offsets[2], high_frontier_size), - high_frontier_gathered_local_nbr_indices = - raft::device_span(high_frontier_gathered_local_nbr_indices.data(), - high_frontier_gathered_local_nbr_indices.size()), - K, - minor_comm_size, - high_frontier_size] __device__(size_t i) { - auto minor_comm_rank = static_cast(i / (high_frontier_size * K)); - auto frontier_idx = high_frontier_indices[(i % (high_frontier_size * K)) / K]; - return frontier_partitioned_local_degree_displacements[frontier_idx * - minor_comm_size + - minor_comm_rank] + - high_frontier_gathered_local_nbr_indices[i]; - })); - thrust::gather( - 
handle.get_thrust_policy(), - index_first, - index_first + high_frontier_nbr_indices.size(), - thrust::make_zip_iterator(high_frontier_gathered_nbr_idx_first, - high_frontier_gathered_keys.begin()), - thrust::make_zip_iterator(high_frontier_nbr_indices.begin(), high_frontier_keys.begin())); - high_frontier_gathered_local_nbr_indices.resize(0, handle.get_stream()); - high_frontier_gathered_local_nbr_indices.shrink_to_fit(handle.get_stream()); - high_frontier_gathered_keys.resize(0, handle.get_stream()); - high_frontier_gathered_keys.shrink_to_fit(handle.get_stream()); - - rmm::device_uvector d_tmp_storage(0, handle.get_stream()); - size_t tmp_storage_bytes{0}; - - rmm::device_uvector high_frontier_segment_sorted_nbr_indices( - high_frontier_nbr_indices.size(), handle.get_stream()); - rmm::device_uvector high_frontier_segment_sorted_keys(high_frontier_keys.size(), - handle.get_stream()); - cub::DeviceSegmentedSort::SortPairs( - static_cast(nullptr), - tmp_storage_bytes, - high_frontier_keys.data(), - high_frontier_segment_sorted_keys.data(), - high_frontier_nbr_indices.data(), - high_frontier_segment_sorted_nbr_indices.data(), - high_frontier_size * K * minor_comm_size, - high_frontier_size, - thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}), - multiplier_t{minor_comm_size * K}), - thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}), - multiplier_t{minor_comm_size * K}), - handle.get_stream()); - if (tmp_storage_bytes > d_tmp_storage.size()) { - d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); - } - cub::DeviceSegmentedSort::SortPairs( - d_tmp_storage.data(), - tmp_storage_bytes, - high_frontier_keys.data(), - high_frontier_segment_sorted_keys.data(), - high_frontier_nbr_indices.data(), - high_frontier_segment_sorted_nbr_indices.data(), - high_frontier_size * K * minor_comm_size, - high_frontier_size, - thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}), - 
multiplier_t{minor_comm_size * K}), - thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}), - multiplier_t{minor_comm_size * K}), - handle.get_stream()); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(high_frontier_size), - [high_frontier_indices = raft::device_span( + // shuffle local sampling outputs + + std::vector tx_counts(high_local_frontier_sizes); + std::transform(high_local_frontier_sizes.begin(), + high_local_frontier_sizes.end(), + tx_counts.begin(), + [K](size_t size) { return size * K; }); + rmm::device_uvector high_frontier_gathered_local_nbr_indices(0, handle.get_stream()); + std::tie(high_frontier_gathered_local_nbr_indices, std::ignore) = + shuffle_values(minor_comm, + aggregate_high_local_frontier_local_nbr_indices.data(), + tx_counts, + handle.get_stream()); + rmm::device_uvector high_frontier_gathered_keys(0, handle.get_stream()); + std::tie(high_frontier_gathered_keys, std::ignore) = shuffle_values( + minor_comm, aggregate_high_local_frontier_keys.data(), tx_counts, handle.get_stream()); + aggregate_high_local_frontier_local_nbr_indices.resize(0, handle.get_stream()); + aggregate_high_local_frontier_local_nbr_indices.shrink_to_fit(handle.get_stream()); + aggregate_high_local_frontier_keys.resize(0, handle.get_stream()); + aggregate_high_local_frontier_keys.shrink_to_fit(handle.get_stream()); + + // merge local sampling outputs + + rmm::device_uvector high_frontier_nbr_indices( + high_frontier_size * minor_comm_size * K, handle.get_stream()); + rmm::device_uvector high_frontier_keys(high_frontier_nbr_indices.size(), + handle.get_stream()); + auto index_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [K, minor_comm_rank, minor_comm_size, high_frontier_size] __device__(size_t i) { + auto idx = i / (K * minor_comm_size); + auto minor_comm_rank = (i % (K * minor_comm_size)) / K; 
+ return minor_comm_rank * (high_frontier_size * K) + idx * K + (i % K); + })); + auto high_frontier_gathered_nbr_idx_first = thrust::make_transform_iterator( + thrust::counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [frontier_partitioned_local_degree_displacements = raft::device_span( + (*frontier_partitioned_local_degree_displacements).data(), + (*frontier_partitioned_local_degree_displacements).size()), + high_frontier_indices = raft::device_span( frontier_indices.data() + frontier_partition_offsets[2], high_frontier_size), - high_frontier_segment_sorted_nbr_indices = - raft::device_span(high_frontier_segment_sorted_nbr_indices.data(), - high_frontier_segment_sorted_nbr_indices.size()), - nbr_indices = raft::device_span(nbr_indices.data(), nbr_indices.size()), + high_frontier_gathered_local_nbr_indices = + raft::device_span(high_frontier_gathered_local_nbr_indices.data(), + high_frontier_gathered_local_nbr_indices.size()), K, - minor_comm_size] __device__(size_t i) { - thrust::copy( - thrust::seq, - high_frontier_segment_sorted_nbr_indices.begin() + (i * K * minor_comm_size), - high_frontier_segment_sorted_nbr_indices.begin() + (i * K * minor_comm_size + K), - nbr_indices.begin() + high_frontier_indices[i] * K); - }); + minor_comm_size, + high_frontier_size] __device__(size_t i) { + auto minor_comm_rank = static_cast(i / (high_frontier_size * K)); + auto frontier_idx = high_frontier_indices[(i % (high_frontier_size * K)) / K]; + return frontier_partitioned_local_degree_displacements[frontier_idx * minor_comm_size + + minor_comm_rank] + + high_frontier_gathered_local_nbr_indices[i]; + })); + thrust::gather( + handle.get_thrust_policy(), + index_first, + index_first + high_frontier_nbr_indices.size(), + thrust::make_zip_iterator(high_frontier_gathered_nbr_idx_first, + high_frontier_gathered_keys.begin()), + thrust::make_zip_iterator(high_frontier_nbr_indices.begin(), high_frontier_keys.begin())); + high_frontier_gathered_local_nbr_indices.resize(0, 
handle.get_stream()); + high_frontier_gathered_local_nbr_indices.shrink_to_fit(handle.get_stream()); + high_frontier_gathered_keys.resize(0, handle.get_stream()); + high_frontier_gathered_keys.shrink_to_fit(handle.get_stream()); + + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + size_t tmp_storage_bytes{0}; + + rmm::device_uvector high_frontier_segment_sorted_nbr_indices( + high_frontier_nbr_indices.size(), handle.get_stream()); + rmm::device_uvector high_frontier_segment_sorted_keys(high_frontier_keys.size(), + handle.get_stream()); + cub::DeviceSegmentedSort::SortPairs( + static_cast(nullptr), + tmp_storage_bytes, + high_frontier_keys.data(), + high_frontier_segment_sorted_keys.data(), + high_frontier_nbr_indices.data(), + high_frontier_segment_sorted_nbr_indices.data(), + high_frontier_size * K * minor_comm_size, + high_frontier_size, + thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}), + multiplier_t{minor_comm_size * K}), + thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}), + multiplier_t{minor_comm_size * K}), + handle.get_stream()); + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); } - } else { + cub::DeviceSegmentedSort::SortPairs( + d_tmp_storage.data(), + tmp_storage_bytes, + high_frontier_keys.data(), + high_frontier_segment_sorted_keys.data(), + high_frontier_nbr_indices.data(), + high_frontier_segment_sorted_nbr_indices.data(), + high_frontier_size * K * minor_comm_size, + high_frontier_size, + thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}), + multiplier_t{minor_comm_size * K}), + thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}), + multiplier_t{minor_comm_size * K}), + handle.get_stream()); + thrust::for_each( handle.get_thrust_policy(), - frontier_indices.begin(), - frontier_indices.begin() + frontier_partition_offsets[1], - [key_idx_to_unique_key_idx = 
raft::device_span( - aggregate_local_frontier_key_idx_to_unique_key_idx.data(), - aggregate_local_frontier_key_idx_to_unique_key_idx.size()), - aggregate_local_frontier_unique_key_biases = - raft::device_span(aggregate_local_frontier_unique_key_biases.data(), - aggregate_local_frontier_unique_key_biases.size()), - aggregate_local_frontier_unique_key_local_degree_offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data(), - aggregate_local_frontier_unique_key_local_degree_offsets.size()), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(high_frontier_size), + [high_frontier_indices = raft::device_span( + frontier_indices.data() + frontier_partition_offsets[2], high_frontier_size), + high_frontier_segment_sorted_nbr_indices = + raft::device_span(high_frontier_segment_sorted_nbr_indices.data(), + high_frontier_segment_sorted_nbr_indices.size()), nbr_indices = raft::device_span(nbr_indices.data(), nbr_indices.size()), K, - invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { - auto unique_key_idx = key_idx_to_unique_key_idx[i]; - auto start_offset = - aggregate_local_frontier_unique_key_local_degree_offsets[unique_key_idx]; - auto degree = - aggregate_local_frontier_unique_key_local_degree_offsets[unique_key_idx + 1] - - start_offset; - edge_t num_valid = 0; - for (size_t j = 0; j < degree; ++j) { - auto bias = aggregate_local_frontier_unique_key_biases[start_offset + j]; - if (bias > 0.0) { - *(nbr_indices.begin() + i * K + num_valid) = j; - ++num_valid; - } - } - thrust::fill(thrust::seq, - nbr_indices.begin() + i * K + num_valid, - nbr_indices.begin() + (i + 1) * K, - invalid_idx); + minor_comm_size] __device__(size_t i) { + thrust::copy( + thrust::seq, + high_frontier_segment_sorted_nbr_indices.begin() + (i * K * minor_comm_size), + high_frontier_segment_sorted_nbr_indices.begin() + (i * K * minor_comm_size + K), + nbr_indices.begin() + high_frontier_indices[i] * K); }); - - auto 
mid_and_high_frontier_size = - frontier_partition_offsets[3] - frontier_partition_offsets[1]; - rmm::device_uvector unique_key_indices_for_key_indices(mid_and_high_frontier_size, - handle.get_stream()); - thrust::gather( - handle.get_thrust_policy(), - frontier_indices.data() + frontier_partition_offsets[1], - frontier_indices.data() + frontier_partition_offsets[1] + mid_and_high_frontier_size, - aggregate_local_frontier_key_idx_to_unique_key_idx.begin(), - unique_key_indices_for_key_indices.begin()); - compute_biased_sampling_index_without_replacement( - handle, - std::make_optional>( - unique_key_indices_for_key_indices.data(), unique_key_indices_for_key_indices.size()), - raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data(), - aggregate_local_frontier_unique_key_local_degree_offsets.size()), - raft::device_span(aggregate_local_frontier_unique_key_biases.data(), - aggregate_local_frontier_unique_key_biases.size()), - std::make_optional>( - frontier_indices.data() + frontier_partition_offsets[1], mid_and_high_frontier_size), - raft::device_span(nbr_indices.data(), nbr_indices.size()), - std::nullopt, - rng_state, - K, - false); } + } else { // minor_comm_size == 1 + // sample from low-degree vertices - std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = - shuffle_and_compute_local_nbr_values( - handle, - std::move(nbr_indices), - frontier_partitioned_local_degree_displacements - ? 
std::make_optional>( - (*frontier_partitioned_local_degree_displacements).data(), - (*frontier_partitioned_local_degree_displacements).size()) - : std::nullopt, - K, - cugraph::invalid_edge_id_v); + thrust::for_each( + handle.get_thrust_policy(), + frontier_indices.begin(), + frontier_indices.begin() + frontier_partition_offsets[1], + [key_idx_to_unique_key_idx = + raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), + aggregate_local_frontier_key_idx_to_unique_key_idx.size()), + aggregate_local_frontier_unique_key_biases = + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), + aggregate_local_frontier_unique_key_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + nbr_indices = raft::device_span(nbr_indices.data(), nbr_indices.size()), + K, + invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { + auto unique_key_idx = key_idx_to_unique_key_idx[i]; + auto start_offset = + aggregate_local_frontier_unique_key_local_degree_offsets[unique_key_idx]; + auto degree = aggregate_local_frontier_unique_key_local_degree_offsets[unique_key_idx + 1] - + start_offset; + edge_t num_valid = 0; + for (size_t j = 0; j < degree; ++j) { + auto bias = aggregate_local_frontier_unique_key_biases[start_offset + j]; + if (bias > 0.0) { + *(nbr_indices.begin() + i * K + num_valid) = j; + ++num_valid; + } + } + thrust::fill(thrust::seq, + nbr_indices.begin() + i * K + num_valid, + nbr_indices.begin() + (i + 1) * K, + invalid_idx); + }); + + // sample from mid & high-degree vertices + + auto mid_and_high_frontier_size = frontier_partition_offsets[3] - frontier_partition_offsets[1]; + rmm::device_uvector unique_key_indices_for_key_indices(mid_and_high_frontier_size, + handle.get_stream()); + thrust::gather( + handle.get_thrust_policy(), + frontier_indices.data() + 
frontier_partition_offsets[1], + frontier_indices.data() + frontier_partition_offsets[1] + mid_and_high_frontier_size, + aggregate_local_frontier_key_idx_to_unique_key_idx.begin(), + unique_key_indices_for_key_indices.begin()); + compute_biased_sampling_index_without_replacement( + handle, + std::make_optional>( + unique_key_indices_for_key_indices.data(), unique_key_indices_for_key_indices.size()), + raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), + std::make_optional>( + frontier_indices.data() + frontier_partition_offsets[1], mid_and_high_frontier_size), + raft::device_span(nbr_indices.data(), nbr_indices.size()), + std::nullopt, + rng_state, + K, + false); } + std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = + shuffle_and_compute_local_nbr_values( + handle, + std::move(nbr_indices), + frontier_partitioned_local_degree_displacements + ? std::make_optional>( + (*frontier_partitioned_local_degree_displacements).data(), + (*frontier_partitioned_local_degree_displacements).size()) + : std::nullopt, + K, + cugraph::invalid_edge_id_v); + return std::make_tuple( std::move(local_nbr_indices), std::move(key_indices), std::move(local_frontier_sample_offsets)); } @@ -2655,25 +2686,48 @@ biased_sample_and_compute_local_nbr_indices( // 2. 
sample neighbor indices and shuffle neighbor indices - auto [local_nbr_indices, key_indices, local_frontier_sample_offsets] = - biased_sample( - handle, - local_frontier_displacements, - local_frontier_sizes, - raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), - aggregate_local_frontier_key_idx_to_unique_key_idx.size()), - raft::host_span(local_frontier_unique_key_displacements.data(), - local_frontier_unique_key_displacements.size()), - raft::host_span(local_frontier_unique_key_sizes.data(), - local_frontier_unique_key_sizes.size()), - raft::device_span(aggregate_local_frontier_unique_key_biases.data(), - aggregate_local_frontier_unique_key_biases.size()), - raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data(), - aggregate_local_frontier_unique_key_local_degree_offsets.size()), - rng_state, - K, - with_replacement); + rmm::device_uvector local_nbr_indices(0, handle.get_stream()); + std::optional> key_indices{std::nullopt}; + std::vector local_frontier_sample_offsets{}; + if (with_replacement) { + std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = + homogeneous_biased_sample_with_replacement( + handle, + local_frontier_displacements, + local_frontier_sizes, + raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), + aggregate_local_frontier_key_idx_to_unique_key_idx.size()), + raft::host_span(local_frontier_unique_key_displacements.data(), + local_frontier_unique_key_displacements.size()), + raft::host_span(local_frontier_unique_key_sizes.data(), + local_frontier_unique_key_sizes.size()), + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), + raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + rng_state, + K); + } else { + std::tie(local_nbr_indices, key_indices, 
local_frontier_sample_offsets) = + homogeneous_biased_sample_without_replacement( + handle, + local_frontier_displacements, + local_frontier_sizes, + raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), + aggregate_local_frontier_key_idx_to_unique_key_idx.size()), + raft::host_span(local_frontier_unique_key_displacements.data(), + local_frontier_unique_key_displacements.size()), + raft::host_span(local_frontier_unique_key_sizes.data(), + local_frontier_unique_key_sizes.size()), + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), + raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + rng_state, + K); + } // 3. convert neighbor indices in the neighbor list considering edge mask to neighbor indices in // the neighbor list ignoring edge mask From bab55caeb0f3521d835a0c67edb502b3a1ac0b37 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 23 Jan 2025 15:20:01 -0800 Subject: [PATCH 09/21] heterogeneous biased sampling without replacement --- cpp/src/prims/detail/partition_v_frontier.cuh | 70 +- .../sample_and_compute_local_nbr_indices.cuh | 3097 +++++++++++++---- ...r_v_random_select_transform_outgoing_e.cuh | 56 +- 3 files changed, 2538 insertions(+), 685 deletions(-) diff --git a/cpp/src/prims/detail/partition_v_frontier.cuh b/cpp/src/prims/detail/partition_v_frontier.cuh index 018960d9a54..c953a67cf5b 100644 --- a/cpp/src/prims/detail/partition_v_frontier.cuh +++ b/cpp/src/prims/detail/partition_v_frontier.cuh @@ -63,7 +63,9 @@ partition_v_frontier(raft::handle_t const& handle, rmm::device_uvector indices(thrust::distance(frontier_value_first, frontier_value_last), handle.get_stream()); thrust::sequence(handle.get_thrust_policy(), indices.begin(), indices.end(), size_t{0}); - std::vector v_frontier_partition_offsets(thresholds.size() + 2); + + auto num_partitions = 
thresholds.size() + 1; + std::vector v_frontier_partition_offsets(num_partitions + 1); v_frontier_partition_offsets[0] = size_t{0}; v_frontier_partition_offsets.back() = static_cast(thrust::distance(frontier_value_first, frontier_value_last)); @@ -86,6 +88,72 @@ partition_v_frontier(raft::handle_t const& handle, return std::make_tuple(std::move(indices), std::move(v_frontier_partition_offsets)); } +// a key in the frontier has @p num_values_per_key values, the frontier is separately partitioned +// @p num_values_per_key times based on the i'th value; i = [0, @p num_values_per_key). +template +std::tuple /* indices */, + rmm::device_uvector, + std::vector /* offsets (size = value_offsets.size()) */> +partition_v_frontier_per_value_idx( + raft::handle_t const& handle, + ValueIterator frontier_value_first, + ValueIterator frontier_value_last, + raft::host_span::value_type> + thresholds /* size = num_values_per_key * (# partitions - 1), thresholds[i] marks the end + (exclusive) of the (i % num_values_per_key)'th partition value range for the (i / + num_values_per_key)'th value of each key */ + , + size_t num_values_per_key) +{ + using value_t = typename thrust::iterator_traits::value_type; + + assert((thrust::distance(frontier_value_first, frontier_value_last) % num_values_per_key) == 0); + rmm::device_uvector key_indices( + thrust::distance(frontier_value_first, frontier_value_last), handle.get_stream()); + rmm::device_uvector value_indices(key_indices.size(), handle.get_stream()); + auto index_pair_first = thrust::make_zip_iterator(key_indices.begin(), value_indices.begin()); + auto index_pair_last = thrust::make_zip_iterator(key_indices.end(), value_indices.end()); + thrust::tabulate(handle.get_thrust_policy(), + index_pair_first, + index_pair_last, + [num_values_per_key] __device__(size_t i) { + return thrust::make_tuple(i / num_values_per_key, + static_cast(i % num_values_per_key)); + }); + + auto num_partitions = thresholds.size() / num_values_per_key + 1; + 
std::vector v_frontier_partition_offsets(num_partitions + 1); + v_frontier_partition_offsets[0] = size_t{0}; + v_frontier_partition_offsets.back() = + static_cast(thrust::distance(frontier_value_first, frontier_value_last)); + + rmm::device_uvector d_thresholds(thresholds.size(), handle.get_stream()); + raft::update_device( + d_thresholds.data(), thresholds.data(), thresholds.size(), handle.get_stream()); + for (size_t i = 0; i < num_partitions - 1; ++i) { + auto false_first = thrust::partition( + handle.get_thrust_policy(), + index_pair_first, + index_pair_last, + [frontier_value_first, + thresholds = raft::device_span(d_thresholds.data(), d_thresholds.size()), + num_values_per_key, + num_partitions, + true_partition_idx = i] __device__(auto pair) { + auto key_idx = thrust::get<0>(pair); + auto value_idx = thrust::get<1>(pair); + return *(frontier_value_first + key_idx * num_values_per_key + value_idx) < + thresholds[key_idx * (num_partitions - 1) + true_partition_idx]; + }); + v_frontier_partition_offsets[1 + i] = + v_frontier_partition_offsets[i] + thrust::distance(index_pair_first, false_first); + index_pair_first = false_first; + } + + return std::make_tuple( + std::move(key_indices), std::move(value_indices), std::move(v_frontier_partition_offsets)); +} + } // namespace detail } // namespace cugraph diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index 570b5d8fded..be49ad800a6 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -80,9 +80,12 @@ struct compute_local_value_displacements_and_global_value_t { partitioned_local_value_displacements{}; // one partition per gpu in the same minor_comm raft::device_span global_values{}; int minor_comm_size{}; + size_t num_values_per_key{}; __device__ void operator()(size_t i) const { + auto key_idx = i / num_values_per_key; + auto 
value_idx = i % num_values_per_key; constexpr int buffer_size = 8; // tuning parameter value_t displacements[buffer_size]; value_t sum{0}; @@ -92,11 +95,12 @@ struct compute_local_value_displacements_and_global_value_t { displacements[j] = sum; sum += gathered_local_values[i + (round * buffer_size + j) * global_values.size()]; } - thrust::copy( - thrust::seq, - displacements, - displacements + loop_count, - partitioned_local_value_displacements.begin() + i * minor_comm_size + round * buffer_size); + thrust::copy(thrust::seq, + displacements, + displacements + loop_count, + partitioned_local_value_displacements.begin() + + key_idx * num_values_per_key * minor_comm_size + value_idx * minor_comm_size + + round * buffer_size); } global_values[i] = sum; } @@ -120,7 +124,7 @@ struct convert_pair_to_quadruplet_t { auto key_idx = thrust::get<1>(pair); auto local_nbr_value = nbr_value; int minor_comm_rank{-1}; - size_t intra_partition_offset{}; + size_t intra_partition_offset{0}; if (nbr_value != invalid_value) { auto displacement_first = partitioned_local_value_displacements.begin() + key_idx * minor_comm_size; @@ -138,6 +142,52 @@ struct convert_pair_to_quadruplet_t { } }; +// convert a (per-type neighbor value, index) pair to a (minor_comm_rank, intra-partition offset, +// per-type local neighbor value, type, key index) 5-tuple, minor_comm_rank is set to -1 if a +// neighbor value is invalid +template +struct convert_pair_to_5tuple_t { + raft::device_span + partitioned_per_type_local_value_displacements{}; // one partition per gpu in the same + // minor_comm + raft::device_span tx_counts{}; + raft::device_span K_offsets{}; + size_t K_sum; + int minor_comm_size{}; + value_t invalid_value{}; + + __device__ thrust::tuple operator()( + thrust::tuple pair) const + { + auto num_edge_types = K_offsets.size() - 1; + auto per_type_nbr_value = thrust::get<0>(pair); + auto idx = thrust::get<1>(pair); + auto key_idx = idx / K_sum; + auto type = static_cast(thrust::distance( + 
K_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, K_offsets.begin() + 1, K_offsets.end(), idx % K_sum))); + auto per_type_local_nbr_value = per_type_nbr_value; + int minor_comm_rank{-1}; + size_t intra_partition_offset{0}; + if (per_type_nbr_value != invalid_value) { + auto displacement_first = partitioned_per_type_local_value_displacements.begin() + + (key_idx * num_edge_types + type) * minor_comm_size; + minor_comm_rank = + static_cast(thrust::distance(displacement_first, + thrust::upper_bound(thrust::seq, + displacement_first, + displacement_first + minor_comm_size, + per_type_nbr_value))) - + 1; + per_type_local_nbr_value -= *(displacement_first + minor_comm_rank); + cuda::atomic_ref counter(tx_counts[minor_comm_rank]); + intra_partition_offset = counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); + } + return thrust::make_tuple( + minor_comm_rank, intra_partition_offset, per_type_local_nbr_value, type, key_idx); + } +}; + struct shuffle_index_compute_offset_t { raft::device_span minor_comm_ranks{}; raft::device_span intra_partition_displacements{}; @@ -329,72 +379,62 @@ __global__ static void compute_valid_local_nbr_count_inclusive_sums_high_local_d } } +// compute unique keys & keys to unique keys mapping (in each edge partition) template std::tuple::value_type>, rmm::device_uvector, - std::vector, std::vector> compute_unique_keys(raft::handle_t const& handle, KeyIterator aggregate_local_frontier_key_first, - raft::host_span local_frontier_displacements, - raft::host_span local_frontier_sizes) + raft::host_span local_frontier_offsets) { using key_t = typename thrust::iterator_traits::value_type; auto aggregate_local_frontier_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); - auto aggregate_local_frontier_key_idx_to_unique_key_idx = rmm::device_uvector( - local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); - auto local_frontier_unique_key_displacements = - 
std::vector(local_frontier_displacements.size()); - auto local_frontier_unique_key_sizes = std::vector(local_frontier_sizes.size()); - - auto tmp_keys = allocate_dataframe_buffer( - local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); - for (size_t i = 0; i < local_frontier_displacements.size(); ++i) { + auto aggregate_local_frontier_key_idx_to_unique_key_idx = + rmm::device_uvector(local_frontier_offsets.back(), handle.get_stream()); + auto local_frontier_unique_key_offsets = std::vector(local_frontier_offsets.size(), 0); + + auto tmp_keys = + allocate_dataframe_buffer(local_frontier_offsets.back(), handle.get_stream()); + std::vector local_frontier_unique_key_sizes(local_frontier_offsets.size() - 1); + for (size_t i = 0; i < local_frontier_unique_key_sizes.size(); ++i) { thrust::copy(handle.get_thrust_policy(), - aggregate_local_frontier_key_first + local_frontier_displacements[i], - aggregate_local_frontier_key_first + local_frontier_displacements[i] + - local_frontier_sizes[i], - get_dataframe_buffer_begin(tmp_keys) + local_frontier_displacements[i]); + aggregate_local_frontier_key_first + local_frontier_offsets[i], + aggregate_local_frontier_key_first + local_frontier_offsets[i + 1], + get_dataframe_buffer_begin(tmp_keys) + local_frontier_offsets[i]); thrust::sort(handle.get_thrust_policy(), - get_dataframe_buffer_begin(tmp_keys) + local_frontier_displacements[i], - get_dataframe_buffer_begin(tmp_keys) + local_frontier_displacements[i] + - local_frontier_sizes[i]); + get_dataframe_buffer_begin(tmp_keys) + local_frontier_offsets[i], + get_dataframe_buffer_begin(tmp_keys) + local_frontier_offsets[i + 1]); local_frontier_unique_key_sizes[i] = thrust::distance( - get_dataframe_buffer_begin(tmp_keys) + local_frontier_displacements[i], + get_dataframe_buffer_begin(tmp_keys) + local_frontier_offsets[i], thrust::unique(handle.get_thrust_policy(), - get_dataframe_buffer_begin(tmp_keys) + local_frontier_displacements[i], - 
get_dataframe_buffer_begin(tmp_keys) + local_frontier_displacements[i] + - local_frontier_sizes[i])); + get_dataframe_buffer_begin(tmp_keys) + local_frontier_offsets[i], + get_dataframe_buffer_begin(tmp_keys) + local_frontier_offsets[i + 1])); } - std::exclusive_scan(local_frontier_unique_key_sizes.begin(), + std::inclusive_scan(local_frontier_unique_key_sizes.begin(), local_frontier_unique_key_sizes.end(), - local_frontier_unique_key_displacements.begin(), - size_t{0}); - resize_dataframe_buffer( - aggregate_local_frontier_unique_keys, - local_frontier_unique_key_displacements.back() + local_frontier_unique_key_sizes.back(), - handle.get_stream()); - for (size_t i = 0; i < local_frontier_displacements.size(); ++i) { + local_frontier_unique_key_offsets.begin() + 1); + resize_dataframe_buffer(aggregate_local_frontier_unique_keys, + local_frontier_unique_key_offsets.back(), + handle.get_stream()); + for (size_t i = 0; i < local_frontier_unique_key_sizes.size(); ++i) { thrust::copy(handle.get_thrust_policy(), - get_dataframe_buffer_begin(tmp_keys) + local_frontier_displacements[i], - get_dataframe_buffer_begin(tmp_keys) + local_frontier_displacements[i] + - local_frontier_unique_key_sizes[i], + get_dataframe_buffer_begin(tmp_keys) + local_frontier_offsets[i], + get_dataframe_buffer_begin(tmp_keys) + local_frontier_offsets[i + 1], get_dataframe_buffer_begin(aggregate_local_frontier_unique_keys) + - local_frontier_unique_key_displacements[i]); + local_frontier_unique_key_offsets[i]); thrust::transform( handle.get_thrust_policy(), - aggregate_local_frontier_key_first + local_frontier_displacements[i], - aggregate_local_frontier_key_first + local_frontier_displacements[i] + - local_frontier_sizes[i], - aggregate_local_frontier_key_idx_to_unique_key_idx.begin() + local_frontier_displacements[i], + aggregate_local_frontier_key_first + local_frontier_offsets[i], + aggregate_local_frontier_key_first + local_frontier_offsets[i + 1], + 
aggregate_local_frontier_key_idx_to_unique_key_idx.begin() + local_frontier_offsets[i], cuda::proclaim_return_type( [unique_key_first = get_dataframe_buffer_begin(aggregate_local_frontier_unique_keys) + - local_frontier_unique_key_displacements[i], + local_frontier_unique_key_offsets[i], unique_key_last = get_dataframe_buffer_begin(aggregate_local_frontier_unique_keys) + - local_frontier_unique_key_displacements[i] + - local_frontier_unique_key_sizes[i]] __device__(key_t key) { + local_frontier_unique_key_offsets[i + 1]] __device__(key_t key) { return static_cast(thrust::distance( unique_key_first, thrust::find(thrust::seq, unique_key_first, unique_key_last, key))); })); @@ -402,8 +442,7 @@ compute_unique_keys(raft::handle_t const& handle, return std::make_tuple(std::move(aggregate_local_frontier_unique_keys), std::move(aggregate_local_frontier_key_idx_to_unique_key_idx), - std::move(local_frontier_unique_key_displacements), - std::move(local_frontier_unique_key_sizes)); + std::move(local_frontier_unique_key_offsets)); } template @@ -411,28 +450,35 @@ std::tuple, rmm::device_uvector> compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( raft::handle_t const& handle, raft::device_span aggregate_local_frontier_local_value_sums, - raft::host_span local_frontier_displacements, - raft::host_span local_frontier_sizes) + raft::host_span local_frontier_offsets, + size_t num_values_per_key) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto minor_comm_rank = minor_comm.get_rank(); auto minor_comm_size = minor_comm.get_size(); + std::vector tx_sizes(minor_comm_size * num_values_per_key); + for (int i = 0; i < minor_comm_size; ++i) { + tx_sizes[i] = (local_frontier_offsets[i + 1] - local_frontier_offsets[i]) * num_values_per_key; + } + rmm::device_uvector frontier_gathered_local_value_sums(0, handle.get_stream()); std::tie(frontier_gathered_local_value_sums, std::ignore) = shuffle_values(minor_comm, 
aggregate_local_frontier_local_value_sums.begin(), #if 1 // FIXME: better update shuffle_values to take host_span - std::vector(local_frontier_sizes.begin(), local_frontier_sizes.end()), + tx_sizes, #else - local_frontier_sizes, + raft::host_span(tx_sizes.data(), tx_sizes.size()), #endif handle.get_stream()); - rmm::device_uvector frontier_value_sums(local_frontier_sizes[minor_comm_rank], - handle.get_stream()); + rmm::device_uvector frontier_value_sums( + (local_frontier_offsets[minor_comm_rank + 1] - local_frontier_offsets[minor_comm_rank]) * + num_values_per_key, + handle.get_stream()); rmm::device_uvector frontier_partitioned_local_value_sum_displacements( - frontier_value_sums.size() * minor_comm_size, handle.get_stream()); + frontier_value_sums.size() * minor_comm_size * num_values_per_key, handle.get_stream()); thrust::for_each( handle.get_thrust_policy(), @@ -444,7 +490,8 @@ compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( raft::device_span(frontier_partitioned_local_value_sum_displacements.data(), frontier_partitioned_local_value_sum_displacements.size()), raft::device_span(frontier_value_sums.data(), frontier_value_sums.size()), - minor_comm_size}); + minor_comm_size, + num_values_per_key}); return std::make_tuple(std::move(frontier_value_sums), std::move(frontier_partitioned_local_value_sum_displacements)); @@ -453,12 +500,10 @@ compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( template std::vector< std::tuple, rmm::device_uvector>> -compute_valid_local_nbr_count_inclusive_sums( - raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexIterator aggregate_local_frontier_major_first, - raft::host_span local_frontier_displacements, - raft::host_span local_frontier_sizes) +compute_valid_local_nbr_count_inclusive_sums(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator aggregate_local_frontier_major_first, + raft::host_span local_frontier_offsets) { using 
vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; @@ -483,15 +528,12 @@ compute_valid_local_nbr_count_inclusive_sums( *edge_mask_view, i) : thrust::nullopt; - auto edge_partition_frontier_major_first = - aggregate_local_frontier_major_first + local_frontier_displacements[i]; - auto edge_partition_local_degrees = edge_partition.compute_local_degrees( - edge_partition_frontier_major_first, - edge_partition_frontier_major_first + local_frontier_sizes[i], + aggregate_local_frontier_major_first + local_frontier_offsets[i], + aggregate_local_frontier_major_first + local_frontier_offsets[i + 1], handle.get_stream()); - auto inclusive_sum_offsets = - rmm::device_uvector(local_frontier_sizes[i] + 1, handle.get_stream()); + auto inclusive_sum_offsets = rmm::device_uvector( + (local_frontier_offsets[i + 1] - local_frontier_offsets[i]) + 1, handle.get_stream()); inclusive_sum_offsets.set_element_to_zero_async(0, handle.get_stream()); auto size_first = thrust::make_transform_iterator( edge_partition_local_degrees.begin(), @@ -523,7 +565,8 @@ compute_valid_local_nbr_count_inclusive_sums( edge_partition_frontier_indices.begin() + frontier_partition_offsets[2], [edge_partition, edge_partition_e_mask, - edge_partition_frontier_major_first, + edge_partition_frontier_major_first = + aggregate_local_frontier_major_first + local_frontier_offsets[i], inclusive_sum_offsets = raft::device_span(inclusive_sum_offsets.data(), inclusive_sum_offsets.size()), inclusive_sums = raft::device_span(inclusive_sums.data(), @@ -560,7 +603,7 @@ compute_valid_local_nbr_count_inclusive_sums( handle.get_stream()>>>( edge_partition, *edge_partition_e_mask, - edge_partition_frontier_major_first, + aggregate_local_frontier_major_first + local_frontier_offsets[i], raft::device_span(inclusive_sum_offsets.data(), inclusive_sum_offsets.size()), raft::device_span( edge_partition_frontier_indices.data() + frontier_partition_offsets[2], @@ -579,7 +622,7 @@ 
compute_valid_local_nbr_count_inclusive_sums( handle.get_stream()>>>( edge_partition, *edge_partition_e_mask, - edge_partition_frontier_major_first, + aggregate_local_frontier_major_first + local_frontier_offsets[i], raft::device_span(inclusive_sum_offsets.data(), inclusive_sum_offsets.size()), raft::device_span( edge_partition_frontier_indices.data() + frontier_partition_offsets[3], @@ -672,19 +715,19 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( if (high_partition_size > 0) { // to limit memory footprint ((1 << 20) is a tuning parameter), std::max for forward progress // guarantee when high_partition_oversampling_K is exorbitantly large - auto seeds_to_sort_per_iteration = + auto keys_to_sort_per_iteration = std::max(static_cast(handle.get_device_properties().multiProcessorCount * (1 << 20)) / high_partition_oversampling_K, size_t{1}); rmm::device_uvector tmp_nbr_indices( - seeds_to_sort_per_iteration * high_partition_oversampling_K, handle.get_stream()); + keys_to_sort_per_iteration * high_partition_oversampling_K, handle.get_stream()); assert(high_partition_oversampling_K * 2 <= static_cast(std::numeric_limits::max())); rmm::device_uvector tmp_sample_indices( tmp_nbr_indices.size(), handle.get_stream()); // sample indices ([0, high_partition_oversampling_K)) within a segment - // (one segment per seed) + // (one segment per key) rmm::device_uvector segment_sorted_tmp_nbr_indices(tmp_nbr_indices.size(), handle.get_stream()); @@ -695,10 +738,10 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( size_t tmp_storage_bytes{0}; auto num_chunks = - (high_partition_size + seeds_to_sort_per_iteration - 1) / seeds_to_sort_per_iteration; + (high_partition_size + keys_to_sort_per_iteration - 1) / keys_to_sort_per_iteration; for (size_t i = 0; i < num_chunks; ++i) { - size_t num_segments = std::min(seeds_to_sort_per_iteration, - high_partition_size - seeds_to_sort_per_iteration * i); + size_t num_segments = + 
std::min(keys_to_sort_per_iteration, high_partition_size - keys_to_sort_per_iteration * i); rmm::device_uvector unique_counts(num_segments, handle.get_stream()); @@ -709,9 +752,8 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( std::optional> retry_segment_sorted_nbr_indices{std::nullopt}; std::optional> retry_segment_sorted_sample_indices{std::nullopt}; while (true) { - auto segment_frontier_index_first = frontier_indices.begin() + - frontier_partition_offsets[2] + - seeds_to_sort_per_iteration * i; + auto segment_frontier_index_first = + frontier_indices.begin() + frontier_partition_offsets[2] + keys_to_sort_per_iteration * i; auto segment_frontier_degree_first = thrust::make_transform_iterator( segment_frontier_index_first, indirection_t{frontier_degrees.begin()}); @@ -964,12 +1006,12 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( [K, high_partition_oversampling_K, frontier_indices = frontier_indices.begin() + frontier_partition_offsets[2] + - seeds_to_sort_per_iteration * i, + keys_to_sort_per_iteration * i, tmp_nbr_indices = tmp_nbr_indices.data(), nbr_indices = nbr_indices.data()] __device__(size_t i) { - auto seed_idx = *(frontier_indices + i / K); + auto key_idx = *(frontier_indices + i / K); auto sample_idx = static_cast(i % K); - *(nbr_indices + seed_idx * K + sample_idx) = + *(nbr_indices + key_idx * K + sample_idx) = *(tmp_nbr_indices + (i / K) * high_partition_oversampling_K + sample_idx); }); } @@ -982,7 +1024,7 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( } template -void compute_biased_sampling_index_without_replacement( +void compute_homogeneous_biased_sampling_index_without_replacement( raft::handle_t const& handle, std::optional> input_frontier_indices, // input_degree_offsets & input_biases @@ -990,7 +1032,7 @@ void compute_biased_sampling_index_without_replacement( raft::device_span input_degree_offsets, raft::device_span input_biases, // bias 0 edges can't be 
selected
   std::optional>
-    output_frontier_indices,  // output_biases is already packed if std::nullopt
+    output_frontier_indices,  // output_nbr_indices is already packed if std::nullopt
   raft::device_span output_nbr_indices,
   std::optional> output_keys,
   raft::random::RngState& rng_state,
@@ -1026,8 +1068,8 @@ void compute_biased_sampling_index_without_replacement(
     raft::update_host(
       &num_pairs,
       packed_input_degree_offsets
-        ? (*packed_input_degree_offsets).data() + ((*packed_input_degree_offsets).size() - 1)
-        : input_degree_offsets.data() + (input_degree_offsets.size() - 1),
+        ? (*packed_input_degree_offsets).data() + (*packed_input_degree_offsets).size() - 1
+        : input_degree_offsets.data() + input_degree_offsets.size() - 1,
       1,
       handle.get_stream());
     handle.sync_stream();
@@ -1078,23 +1120,22 @@ void compute_biased_sampling_index_without_replacement(
         bias_first,
         keys.begin(),
         cuda::proclaim_return_type([] __device__(bias_t r, bias_t b) {
-          return b > 0.0
-                   ? cuda::std::min(-log(r) / b, std::numeric_limits::max())
-                   : std::numeric_limits<
-                       bias_t>::infinity() /* inf used as invalid value (can't be selected) */;
+          assert(b >
+                 0.0);  // 0 bias neighbors should be pre-filtered before invoking this function
+          return cuda::std::min(-log(r) / b, std::numeric_limits::max());
         }));
     } else {
-      thrust::transform(handle.get_thrust_policy(),
-                        keys.begin(),
-                        keys.end(),
-                        input_biases.begin() + element_offsets[i],
-                        keys.begin(),
-                        cuda::proclaim_return_type([] __device__(bias_t r, bias_t b) {
-                          return b > 0.0 ? 
cuda::std::min(-log(r) / b,
-                                                            std::numeric_limits::max())
-                                          : std::numeric_limits::infinity()
-                                            /* inf used as invalid value (can't be selected) */;
-                        }));
+      thrust::transform(
+        handle.get_thrust_policy(),
+        keys.begin(),
+        keys.end(),
+        input_biases.begin() + element_offsets[i],
+        keys.begin(),
+        cuda::proclaim_return_type([] __device__(bias_t r, bias_t b) {
+          assert(b >
+                 0.0);  // 0 bias neighbors should be pre-filtered before invoking this function
+          return cuda::std::min(-log(r) / b, std::numeric_limits::max());
+        }));
     }

     rmm::device_uvector nbr_indices(keys.size(), handle.get_stream());
@@ -1235,14 +1276,252 @@ void compute_biased_sampling_index_without_replacement(
   return;
 }

+template
+void compute_heterogeneous_biased_sampling_index_without_replacement(
+  raft::handle_t const& handle,
+  std::optional>
+    input_frontier_indices,  // input_per_type_degree_offsets & input_biases are already packed if
+                             // std::nullopt
+  raft::device_span input_frontier_edge_types,
+  raft::device_span input_per_type_degree_offsets,
+  raft::device_span input_biases,  // bias 0 edges can't be selected
+  raft::device_span output_start_displacements,
+  raft::device_span output_per_type_nbr_indices,
+  std::optional> output_keys,
+  raft::random::RngState& rng_state,
+  raft::device_span K_offsets,
+  bool jump)
+{
+  auto num_edge_types = K_offsets.size() - 1;
+
+  if (jump) {  // Algorithm A-ExpJ
+    CUGRAPH_FAIL(
+      "unimplemented.");  // FIXME: this could be faster especially for high-degree vertices
+  } else {  // Algorithm A-Res
+    // update packed input degree offsets if input_frontier_indices.has_value() is true
+
+    auto packed_input_per_type_degree_offsets =
+      input_frontier_indices ? 
std::make_optional>( + (*input_frontier_indices).size() + 1, handle.get_stream()) + : std::nullopt; + if (packed_input_per_type_degree_offsets) { + (*packed_input_per_type_degree_offsets).set_element_to_zero_async(0, handle.get_stream()); + auto per_type_degree_first = thrust::make_transform_iterator( + thrust::make_zip_iterator((*input_frontier_indices).begin(), + input_frontier_edge_types.begin()), + cuda::proclaim_return_type( + [input_per_type_degree_offsets, num_edge_types] __device__(auto pair) { + auto idx = thrust::get<0>(pair); + auto type = thrust::get<1>(pair); + return input_per_type_degree_offsets[idx * num_edge_types + type + 1] - + input_per_type_degree_offsets[idx * num_edge_types + type]; + })); + thrust::inclusive_scan(handle.get_thrust_policy(), + per_type_degree_first, + per_type_degree_first + (*input_frontier_indices).size(), + (*packed_input_per_type_degree_offsets).begin() + 1); + } + + // generate (key, nbr_index) pairs + + size_t num_pairs{}; + raft::update_host( + &num_pairs, + packed_input_per_type_degree_offsets + ? (*packed_input_per_type_degree_offsets).data() + + (*packed_input_per_type_degree_offsets).size() - 1 + : input_per_type_degree_offsets.data() + input_per_type_degree_offsets.size() - 1, + 1, + handle.get_stream()); + handle.sync_stream(); + + auto approx_edges_to_process_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 18) /* tuning parameter */; + auto [chunk_offsets, element_offsets] = cugraph::detail::compute_offset_aligned_element_chunks( + handle, + raft::device_span( + packed_input_per_type_degree_offsets ? (*packed_input_per_type_degree_offsets).data() + : input_per_type_degree_offsets.data(), + packed_input_per_type_degree_offsets ? 
(*packed_input_per_type_degree_offsets).size()
+                                           : input_per_type_degree_offsets.size()),
+      num_pairs,
+      approx_edges_to_process_per_iteration);
+    auto num_chunks = chunk_offsets.size() - 1;
+    for (size_t i = 0; i < num_chunks; ++i) {
+      auto num_chunk_pairs = element_offsets[i + 1] - element_offsets[i];
+      rmm::device_uvector keys(num_chunk_pairs, handle.get_stream());
+
+      cugraph::detail::uniform_random_fill(
+        handle.get_stream(), keys.data(), keys.size(), bias_t{0.0}, bias_t{1.0}, rng_state);
+
+      if (packed_input_per_type_degree_offsets) {
+        auto bias_first = thrust::make_transform_iterator(
+          thrust::make_counting_iterator(element_offsets[i]),
+          cuda::proclaim_return_type(
+            [input_biases,
+             input_per_type_degree_offsets,
+             frontier_indices = *input_frontier_indices,
+             frontier_types = input_frontier_edge_types,
+             packed_input_per_type_degree_offsets =
+               raft::device_span((*packed_input_per_type_degree_offsets).data(),
+                                 (*packed_input_per_type_degree_offsets).size()),
+             num_edge_types] __device__(size_t i) {
+              auto it = thrust::upper_bound(thrust::seq,
+                                            packed_input_per_type_degree_offsets.begin() + 1,
+                                            packed_input_per_type_degree_offsets.end(),
+                                            i);
+              auto idx = thrust::distance(packed_input_per_type_degree_offsets.begin() + 1, it);
+              auto frontier_idx = frontier_indices[idx];
+              auto type = frontier_types[idx];
+              return input_biases[input_per_type_degree_offsets[frontier_idx * num_edge_types +
+                                                                type] +
+                                  (i - packed_input_per_type_degree_offsets[idx])];
+            }));
+        thrust::transform(
+          handle.get_thrust_policy(),
+          keys.begin(),
+          keys.end(),
+          bias_first,
+          keys.begin(),
+          cuda::proclaim_return_type([] __device__(bias_t r, bias_t b) {
+            assert(b >
+                   0.0);  // 0 bias neighbors should be pre-filtered before invoking this function
+            return cuda::std::min(-log(r) / b, std::numeric_limits::max());
+          }));
+      } else {
+        thrust::transform(
+          handle.get_thrust_policy(),
+          keys.begin(),
+          keys.end(),
+          input_biases.begin() + element_offsets[i],
+          keys.begin(),
+          
cuda::proclaim_return_type([] __device__(bias_t r, bias_t b) {
+            assert(b >
+                   0.0);  // 0 bias neighbors should be pre-filtered before invoking this function
+            return cuda::std::min(-log(r) / b, std::numeric_limits::max());
+          }));
+      }
+
+      rmm::device_uvector per_type_nbr_indices(keys.size(), handle.get_stream());
+      thrust::tabulate(handle.get_thrust_policy(),
+                       per_type_nbr_indices.begin(),
+                       per_type_nbr_indices.end(),
+                       [offsets = packed_input_per_type_degree_offsets
+                                    ? raft::device_span(
+                                        (*packed_input_per_type_degree_offsets).data(),
+                                        (*packed_input_per_type_degree_offsets).size())
+                                    : input_per_type_degree_offsets,
+                        element_offset = element_offsets[i]] __device__(size_t i) {
+                         auto it = thrust::upper_bound(
+                           thrust::seq, offsets.begin() + 1, offsets.end(), element_offset + i);
+                         auto idx = thrust::distance(offsets.begin() + 1, it);
+                         return static_cast((element_offset + i) - offsets[idx]);
+                       });
+
+      // pick top K for each frontier index
+
+      rmm::device_uvector d_tmp_storage(0, handle.get_stream());
+      size_t tmp_storage_bytes{0};
+
+      rmm::device_uvector segment_sorted_keys(keys.size(), handle.get_stream());
+      rmm::device_uvector segment_sorted_per_type_nbr_indices(per_type_nbr_indices.size(),
+                                                              handle.get_stream());
+
+      auto offset_first = thrust::make_transform_iterator(
+        (packed_input_per_type_degree_offsets ? 
(*packed_input_per_type_degree_offsets).begin() + : input_per_type_degree_offsets.begin()) + + chunk_offsets[i], + detail::shift_left_t{element_offsets[i]}); + cub::DeviceSegmentedSort::SortPairs(static_cast(nullptr), + tmp_storage_bytes, + keys.data(), + segment_sorted_keys.data(), + per_type_nbr_indices.data(), + segment_sorted_per_type_nbr_indices.data(), + keys.size(), + chunk_offsets[i + 1] - chunk_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + cub::DeviceSegmentedSort::SortPairs(d_tmp_storage.data(), + tmp_storage_bytes, + keys.data(), + segment_sorted_keys.data(), + per_type_nbr_indices.data(), + segment_sorted_per_type_nbr_indices.data(), + keys.size(), + chunk_offsets[i + 1] - chunk_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(chunk_offsets[i + 1] - chunk_offsets[i]), + [input_frontier_edge_types, + input_per_type_degree_offsets = + packed_input_per_type_degree_offsets + ? 
raft::device_span((*packed_input_per_type_degree_offsets).data(), + (*packed_input_per_type_degree_offsets).size()) + : input_per_type_degree_offsets, + chunk_offset = chunk_offsets[i], + output_start_displacements, + output_per_type_nbr_indices, + output_keys, + segment_sorted_keys = + raft::device_span(segment_sorted_keys.data(), segment_sorted_keys.size()), + segment_sorted_per_type_nbr_indices = raft::device_span( + segment_sorted_per_type_nbr_indices.data(), segment_sorted_per_type_nbr_indices.size()), + K_offsets, + invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { + auto key_idx = chunk_offset + i; + auto type = input_frontier_edge_types[key_idx]; + auto K = static_cast(K_offsets[type + 1] - K_offsets[type]); + auto per_type_degree = static_cast(input_per_type_degree_offsets[key_idx + 1] - + input_per_type_degree_offsets[key_idx]); + auto segment_sorted_input_start_offset = + input_per_type_degree_offsets[key_idx] - input_per_type_degree_offsets[chunk_offset]; + auto output_start_offset = output_start_displacements[key_idx]; + edge_t j = 0; + for (; j < cuda::std::min(per_type_degree, K); ++j) { + auto segment_sorted_input_idx = segment_sorted_input_start_offset + j; + auto output_idx = output_start_offset + j; + if (segment_sorted_keys[segment_sorted_input_idx] < + std::numeric_limits::infinity()) { + if (output_keys) { + (*output_keys)[output_idx] = segment_sorted_keys[segment_sorted_input_idx]; + } + output_per_type_nbr_indices[output_idx] = + segment_sorted_per_type_nbr_indices[segment_sorted_input_idx]; + } else { + break; + } + } + for (; j < K; ++j) { + auto output_idx = output_start_offset + j; + if (output_keys) { + (*output_keys)[output_idx] = std::numeric_limits::infinity(); + } + output_per_type_nbr_indices[output_idx] = invalid_idx; + } + }); + } + } + + return; +} + template rmm::device_uvector -compute_aggregate_local_frontier_local_degrees( - raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexIterator 
aggregate_local_frontier_major_first, - raft::host_span local_frontier_displacements, - raft::host_span local_frontier_sizes) +compute_aggregate_local_frontier_local_degrees(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator aggregate_local_frontier_major_first, + raft::host_span local_frontier_offsets) { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; @@ -1251,8 +1530,8 @@ compute_aggregate_local_frontier_local_degrees( auto edge_mask_view = graph_view.edge_mask_view(); - auto aggregate_local_frontier_local_degrees = rmm::device_uvector( - local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); + auto aggregate_local_frontier_local_degrees = + rmm::device_uvector(local_frontier_offsets.back(), handle.get_stream()); for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( @@ -1264,25 +1543,24 @@ compute_aggregate_local_frontier_local_degrees( *edge_mask_view, i) : thrust::nullopt; - auto edge_partition_frontier_major_first = - aggregate_local_frontier_major_first + local_frontier_displacements[i]; auto edge_partition_frontier_local_degrees = - !edge_partition_e_mask ? edge_partition.compute_local_degrees( - edge_partition_frontier_major_first, - edge_partition_frontier_major_first + local_frontier_sizes[i], - handle.get_stream()) - : edge_partition.compute_local_degrees_with_mask( - (*edge_partition_e_mask).value_first(), - edge_partition_frontier_major_first, - edge_partition_frontier_major_first + local_frontier_sizes[i], - handle.get_stream()); + !edge_partition_e_mask + ? 
edge_partition.compute_local_degrees( + aggregate_local_frontier_major_first + local_frontier_offsets[i], + aggregate_local_frontier_major_first + local_frontier_offsets[i + 1], + handle.get_stream()) + : edge_partition.compute_local_degrees_with_mask( + (*edge_partition_e_mask).value_first(), + aggregate_local_frontier_major_first + local_frontier_offsets[i], + aggregate_local_frontier_major_first + local_frontier_offsets[i + 1], + handle.get_stream()); // FIXME: this copy is unnecessary if edge_partition.compute_local_degrees() takes a pointer // to the output array thrust::copy(handle.get_thrust_policy(), edge_partition_frontier_local_degrees.begin(), edge_partition_frontier_local_degrees.end(), - aggregate_local_frontier_local_degrees.begin() + local_frontier_displacements[i]); + aggregate_local_frontier_local_degrees.begin() + local_frontier_offsets[i]); } return aggregate_local_frontier_local_degrees; @@ -1303,6 +1581,7 @@ std::tuple::type>, + rmm::device_uvector, rmm::device_uvector> compute_aggregate_local_frontier_biases(raft::handle_t const& handle, GraphViewType const& graph_view, @@ -1311,8 +1590,7 @@ compute_aggregate_local_frontier_biases(raft::handle_t const& handle, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, BiasEdgeOp bias_e_op, - raft::host_span local_frontier_displacements, - raft::host_span local_frontier_sizes, + raft::host_span local_frontier_offsets, bool do_expensive_check) { using vertex_t = typename GraphViewType::vertex_type; @@ -1326,6 +1604,11 @@ compute_aggregate_local_frontier_biases(raft::handle_t const& handle, typename EdgeValueInputWrapper::value_type, BiasEdgeOp>::type; + // 1. 
collect bias values from local neighbors + + std::vector local_frontier_sizes(local_frontier_offsets.size() - 1); + std::adjacent_difference( + local_frontier_offsets.begin() + 1, local_frontier_offsets.end(), local_frontier_sizes.begin()); auto [aggregate_local_frontier_biases, aggregate_local_frontier_local_degree_offsets] = transform_v_frontier_e( handle, @@ -1336,14 +1619,17 @@ compute_aggregate_local_frontier_biases(raft::handle_t const& handle, edge_value_input, bias_e_op, #if 1 // FIXME: better update shuffle_values to take host_span - std::vector(local_frontier_displacements.begin(), local_frontier_displacements.end()), - std::vector(local_frontier_sizes.begin(), local_frontier_sizes.end()) -#else - local_frontier_displacements, + std::vector(local_frontier_offsets.begin(), local_frontier_offsets.end() - 1), local_frontier_sizes +#else + raft::host_span(local_frontier_offsets.data(), + local_frontier_offsets.size() - 1), + raft::host_span(local_frontier_sizes.data(), local_frontier_sizes.size()) #endif ); + // 2. expensive check + if (do_expensive_check) { auto num_invalid_biases = thrust::count_if( handle.get_thrust_policy(), @@ -1359,149 +1645,565 @@ compute_aggregate_local_frontier_biases(raft::handle_t const& handle, "should not exceed std::numeirc_limits::max()."); } - return std::make_tuple(std::move(aggregate_local_frontier_biases), - std::move(aggregate_local_frontier_local_degree_offsets)); -} + // 3. 
exclude 0 bias neighbors & update offsets + + rmm::device_uvector aggregate_local_frontier_nz_bias_indices( + aggregate_local_frontier_biases.size(), handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + aggregate_local_frontier_nz_bias_indices.begin(), + aggregate_local_frontier_nz_bias_indices.end(), + [offsets = raft::device_span( + aggregate_local_frontier_local_degree_offsets.data(), + aggregate_local_frontier_local_degree_offsets.size())] __device__(size_t i) { + auto it = + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i); + auto idx = thrust::distance(offsets.begin() + 1, it); + return static_cast(i - offsets[idx]); + }); + + rmm::device_uvector aggregate_local_frontier_local_degrees(local_frontier_offsets.back(), + handle.get_stream()); + thrust::adjacent_difference(handle.get_thrust_policy(), + aggregate_local_frontier_local_degree_offsets.begin() + 1, + aggregate_local_frontier_local_degree_offsets.end(), + aggregate_local_frontier_local_degrees.begin()); -// drop the sample_nbr_values array elements having invalid_value if multi_gpu is true -template -std::tuple, - std::optional>, - std::vector> -shuffle_and_compute_local_nbr_values(raft::handle_t const& handle, - rmm::device_uvector&& sample_nbr_values, - std::optional> - frontier_partitioned_value_local_sum_displacements, - size_t K, - value_t invalid_value) -{ - int minor_comm_size{1}; - if constexpr (multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - minor_comm_size = minor_comm.get_size(); + { + auto pair_first = thrust::make_zip_iterator(aggregate_local_frontier_biases.begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each(handle.get_thrust_policy(), + pair_first, + pair_first + aggregate_local_frontier_biases.size(), + [offsets = raft::device_span( + aggregate_local_frontier_local_degree_offsets.data(), + aggregate_local_frontier_local_degree_offsets.size()), + degrees = 
raft::device_span( + aggregate_local_frontier_local_degrees.data(), + aggregate_local_frontier_local_degrees.size())] __device__(auto pair) { + auto bias = thrust::get<0>(pair); + if (bias == 0.0) { + auto i = thrust::get<1>(pair); + auto it = + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i); + auto idx = thrust::distance(offsets.begin() + 1, it); + cuda::atomic_ref degree(degrees[idx]); + degree.fetch_sub(size_t{1}, cuda::std::memory_order_relaxed); + } + }); } - auto sample_local_nbr_values = std::move( - sample_nbr_values); // neighbor value within an edge partition (note that each vertex's - // neighbors are distributed in minor_comm_size partitions) - std::optional> key_indices{ - std::nullopt}; // relevant only when (minor_comm_size > 1) - std::vector local_frontier_sample_offsets{}; - if (minor_comm_size > 1) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - - key_indices = rmm::device_uvector(sample_local_nbr_values.size(), handle.get_stream()); - auto minor_comm_ranks = - rmm::device_uvector(sample_local_nbr_values.size(), handle.get_stream()); - auto intra_partition_displacements = - rmm::device_uvector(sample_local_nbr_values.size(), handle.get_stream()); - rmm::device_uvector d_tx_counts(minor_comm_size, handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), d_tx_counts.begin(), d_tx_counts.end(), size_t{0}); - auto input_pair_first = thrust::make_zip_iterator( - thrust::make_tuple(sample_local_nbr_values.begin(), - thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}), - divider_t{K}))); - thrust::transform( - handle.get_thrust_policy(), - input_pair_first, - input_pair_first + sample_local_nbr_values.size(), - thrust::make_zip_iterator(thrust::make_tuple(minor_comm_ranks.begin(), - intra_partition_displacements.begin(), - sample_local_nbr_values.begin(), - (*key_indices).begin())), - convert_pair_to_quadruplet_t{ - raft::device_span( - 
(*frontier_partitioned_value_local_sum_displacements).data(), - (*frontier_partitioned_value_local_sum_displacements).size()), - raft::device_span(d_tx_counts.data(), d_tx_counts.size()), - minor_comm_size, - invalid_value}); - rmm::device_uvector tx_displacements(minor_comm_size, handle.get_stream()); - thrust::exclusive_scan( - handle.get_thrust_policy(), d_tx_counts.begin(), d_tx_counts.end(), tx_displacements.begin()); - auto tmp_sample_local_nbr_values = - rmm::device_uvector(tx_displacements.back_element(handle.get_stream()) + - d_tx_counts.back_element(handle.get_stream()), - handle.get_stream()); - auto tmp_key_indices = - rmm::device_uvector(tmp_sample_local_nbr_values.size(), handle.get_stream()); - auto pair_first = thrust::make_zip_iterator( - thrust::make_tuple(sample_local_nbr_values.begin(), (*key_indices).begin())); - thrust::scatter_if( - handle.get_thrust_policy(), - pair_first, - pair_first + sample_local_nbr_values.size(), - thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - shuffle_index_compute_offset_t{ - raft::device_span(minor_comm_ranks.data(), minor_comm_ranks.size()), - raft::device_span(intra_partition_displacements.data(), - intra_partition_displacements.size()), - raft::device_span(tx_displacements.data(), tx_displacements.size())}), - minor_comm_ranks.begin(), - thrust::make_zip_iterator( - thrust::make_tuple(tmp_sample_local_nbr_values.begin(), tmp_key_indices.begin())), - is_not_equal_t{-1}); - - sample_local_nbr_values = std::move(tmp_sample_local_nbr_values); - key_indices = std::move(tmp_key_indices); - - std::vector h_tx_counts(d_tx_counts.size()); - raft::update_host( - h_tx_counts.data(), d_tx_counts.data(), d_tx_counts.size(), handle.get_stream()); - handle.sync_stream(); + thrust::inclusive_scan(handle.get_thrust_policy(), + aggregate_local_frontier_local_degrees.begin(), + aggregate_local_frontier_local_degrees.end(), + aggregate_local_frontier_local_degree_offsets.begin() + 1); - pair_first = 
thrust::make_zip_iterator( - thrust::make_tuple(sample_local_nbr_values.begin(), (*key_indices).begin())); - auto [rx_value_buffer, rx_counts] = - shuffle_values(minor_comm, pair_first, h_tx_counts, handle.get_stream()); - - sample_local_nbr_values = std::move(std::get<0>(rx_value_buffer)); - key_indices = std::move(std::get<1>(rx_value_buffer)); - local_frontier_sample_offsets = std::vector(rx_counts.size() + 1); - local_frontier_sample_offsets[0] = size_t{0}; - std::inclusive_scan( - rx_counts.begin(), rx_counts.end(), local_frontier_sample_offsets.begin() + 1); - } else { - local_frontier_sample_offsets = std::vector{size_t{0}, sample_local_nbr_values.size()}; + { + auto pair_first = thrust::make_zip_iterator(aggregate_local_frontier_biases.begin(), + aggregate_local_frontier_nz_bias_indices.begin()); + auto pair_last = + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + aggregate_local_frontier_biases.size(), + [] __device__(auto pair) { return thrust::get<0>(pair) == 0.0; }); + aggregate_local_frontier_biases.resize(thrust::distance(pair_first, pair_last), + handle.get_stream()); + aggregate_local_frontier_nz_bias_indices.resize(thrust::distance(pair_first, pair_last), + handle.get_stream()); + aggregate_local_frontier_biases.shrink_to_fit(handle.get_stream()); + aggregate_local_frontier_nz_bias_indices.shrink_to_fit(handle.get_stream()); } - return std::make_tuple(std::move(sample_local_nbr_values), - std::move(key_indices), - std::move(local_frontier_sample_offsets)); + return std::make_tuple(std::move(aggregate_local_frontier_biases), + std::move(aggregate_local_frontier_nz_bias_indices), + std::move(aggregate_local_frontier_local_degree_offsets)); } -template -std::tuple /* local_nbr_indices */, - std::optional> /* key_indices */, - std::vector /* local_frontier_sample_offsets */> -homogeneous_biased_sample_with_replacement( +// return (bias segmented local inclusive sums, segment offsets) pairs for each key in the aggregate +// 
local frontier +template +std::tuple::value_type, + typename GraphViewType::vertex_type, + typename EdgeSrcValueInputWrapper::value_type, + typename EdgeDstValueInputWrapper::value_type, + typename EdgeValueInputWrapper::value_type, + BiasEdgeOp>::type>, + rmm::device_uvector, + rmm::device_uvector, + rmm::device_uvector> +compute_aggregate_local_frontier_bias_type_pairs( raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyIterator aggregate_local_frontier_key_first, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + BiasEdgeOp bias_e_op, + EdgeTypeInputWrapper edge_type_input, raft::host_span local_frontier_displacements, raft::host_span local_frontier_sizes, - raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, - raft::host_span local_frontier_unique_key_displacements, - raft::host_span local_frontier_unique_key_sizes, - raft::device_span aggregate_local_frontier_unique_key_biases, - raft::device_span aggregate_local_frontier_unique_key_local_degree_offsets, - raft::random::RngState& rng_state, - size_t K) + bool do_expensive_check) { - int minor_comm_rank{0}; - int minor_comm_size{1}; - if constexpr (multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - minor_comm_rank = minor_comm.get_rank(); - minor_comm_size = minor_comm.get_size(); - } - - auto num_local_edge_partitions = local_frontier_unique_key_displacements.size(); + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename thrust::iterator_traits::value_type; - rmm::device_uvector local_nbr_indices(0, handle.get_stream()); - std::optional> key_indices{std::nullopt}; - std::vector local_frontier_sample_offsets{}; + using bias_t = typename edge_op_result_type::type; + + auto [aggregate_local_frontier_bias_type_pairs, 
aggregate_local_frontier_local_degree_offsets] = + transform_v_frontier_e( + handle, + graph_view, + aggregate_local_frontier_key_first, + edge_src_value_input, + edge_dst_value_input, + view_concat(edge_value_input, edge_type_input), + [bias_e_op] __device__(auto src, auto dst, auto src_val, auto dst_val, auto e_val) { + return thrust::make_tuple(bias_e_op(src, dst, src_val, dst_val, thrust::get<0>(e_val)), + thrust::get<1>(e_val)); + }, +#if 1 // FIXME: better update shuffle_values to take host_span + std::vector(local_frontier_displacements.begin(), local_frontier_displacements.end()), + std::vector(local_frontier_sizes.begin(), local_frontier_sizes.end()) +#else + local_frontier_displacements, + local_frontier_sizes +#endif + ); + + if (do_expensive_check) { + auto num_invalid_biases = thrust::count_if( + handle.get_thrust_policy(), + std::get<0>(aggregate_local_frontier_bias_type_pairs).begin(), + std::get<0>(aggregate_local_frontier_bias_type_pairs).end(), + check_out_of_range_t{bias_t{0.0}, std::numeric_limits::max()}); + if constexpr (GraphViewType::is_multi_gpu) { + num_invalid_biases = host_scalar_allreduce( + handle.get_comms(), num_invalid_biases, raft::comms::op_t::SUM, handle.get_stream()); + } + CUGRAPH_EXPECTS(num_invalid_biases == 0, + "invalid_input_argument: bias_e_op return values should be non-negative and " + "should not exceed std::numeirc_limits::max()."); + } + + // 3. 
exclude 0 bias neighbors & update offsets + + rmm::device_uvector aggregate_local_frontier_nz_bias_indices( + std::get<0>(aggregate_local_frontier_bias_type_pairs).size(), handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + aggregate_local_frontier_nz_bias_indices.begin(), + aggregate_local_frontier_nz_bias_indices.end(), + [offsets = raft::device_span( + aggregate_local_frontier_local_degree_offsets.data(), + aggregate_local_frontier_local_degree_offsets.size())] __device__(size_t i) { + auto it = + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i); + auto idx = thrust::distance(offsets.begin() + 1, it); + return static_cast(i - offsets[idx]); + }); + + rmm::device_uvector aggregate_local_frontier_local_degrees( + local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); + thrust::adjacent_difference(handle.get_thrust_policy(), + aggregate_local_frontier_local_degree_offsets.begin() + 1, + aggregate_local_frontier_local_degree_offsets.end(), + aggregate_local_frontier_local_degrees.begin()); + + { + auto pair_first = + thrust::make_zip_iterator(std::get<0>(aggregate_local_frontier_bias_type_pairs).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each(handle.get_thrust_policy(), + pair_first, + pair_first + std::get<0>(aggregate_local_frontier_bias_type_pairs).size(), + [offsets = raft::device_span( + aggregate_local_frontier_local_degree_offsets.data(), + aggregate_local_frontier_local_degree_offsets.size()), + degrees = raft::device_span( + aggregate_local_frontier_local_degrees.data(), + aggregate_local_frontier_local_degrees.size())] __device__(auto pair) { + auto bias = thrust::get<0>(pair); + if (bias == 0.0) { + auto i = thrust::get<1>(pair); + auto it = + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i); + auto idx = thrust::distance(offsets.begin() + 1, it); + cuda::atomic_ref degree(degrees[idx]); + return degree.fetch_sub(size_t{1}, 
cuda::std::memory_order_relaxed); + } + }); + } + + thrust::inclusive_scan(handle.get_thrust_policy(), + aggregate_local_frontier_local_degrees.begin(), + aggregate_local_frontier_local_degrees.end(), + aggregate_local_frontier_local_degree_offsets.begin() + 1); + + { + auto triplet_first = + thrust::make_zip_iterator(std::get<0>(aggregate_local_frontier_bias_type_pairs).begin(), + std::get<1>(aggregate_local_frontier_bias_type_pairs).begin(), + aggregate_local_frontier_nz_bias_indices.begin()); + auto triplet_last = thrust::remove_if( + handle.get_thrust_policy(), + triplet_first, + triplet_first + std::get<0>(aggregate_local_frontier_bias_type_pairs).size(), + [] __device__(auto triplet) { return thrust::get<0>(triplet) == 0.0; }); + std::get<0>(aggregate_local_frontier_bias_type_pairs) + .resize(thrust::distance(triplet_first, triplet_last), handle.get_stream()); + std::get<1>(aggregate_local_frontier_bias_type_pairs) + .resize(thrust::distance(triplet_first, triplet_last), handle.get_stream()); + aggregate_local_frontier_nz_bias_indices.resize(thrust::distance(triplet_first, triplet_last), + handle.get_stream()); + std::get<0>(aggregate_local_frontier_bias_type_pairs).shrink_to_fit(handle.get_stream()); + std::get<1>(aggregate_local_frontier_bias_type_pairs).shrink_to_fit(handle.get_stream()); + aggregate_local_frontier_nz_bias_indices.shrink_to_fit(handle.get_stream()); + } + + return std::make_tuple(std::move(std::get<0>(aggregate_local_frontier_bias_type_pairs)), + std::move(std::get<1>(aggregate_local_frontier_bias_type_pairs)), + std::move(aggregate_local_frontier_nz_bias_indices), + std::move(aggregate_local_frontier_local_degree_offsets)); +} + +template +std::tuple, rmm::device_uvector, std::vector> +shuffle_and_compute_local_nbr_values( + raft::handle_t const& handle, + rmm::device_uvector&& sample_nbr_values, + raft::device_span frontier_partitioned_value_local_sum_displacements, + size_t K, + value_t invalid_value) +{ + auto& minor_comm = 
handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + auto sample_local_nbr_values = std::move( + sample_nbr_values); // neighbor value within an edge partition (note that each vertex's + // neighbors are distributed in minor_comm_size partitions) + rmm::device_uvector key_indices(sample_local_nbr_values.size(), handle.get_stream()); + auto minor_comm_ranks = + rmm::device_uvector(sample_local_nbr_values.size(), handle.get_stream()); + auto intra_partition_displacements = + rmm::device_uvector(sample_local_nbr_values.size(), handle.get_stream()); + + rmm::device_uvector d_tx_counts(minor_comm_size, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), d_tx_counts.begin(), d_tx_counts.end(), size_t{0}); + auto input_pair_first = + thrust::make_zip_iterator(sample_local_nbr_values.begin(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), divider_t{K})); + thrust::transform( + handle.get_thrust_policy(), + input_pair_first, + input_pair_first + sample_local_nbr_values.size(), + thrust::make_zip_iterator(thrust::make_tuple(minor_comm_ranks.begin(), + intra_partition_displacements.begin(), + sample_local_nbr_values.begin(), + key_indices.begin())), + convert_pair_to_quadruplet_t{ + raft::device_span(frontier_partitioned_value_local_sum_displacements.data(), + frontier_partitioned_value_local_sum_displacements.size()), + raft::device_span(d_tx_counts.data(), d_tx_counts.size()), + minor_comm_size, + invalid_value}); + rmm::device_uvector tx_displacements(minor_comm_size, handle.get_stream()); + thrust::exclusive_scan( + handle.get_thrust_policy(), d_tx_counts.begin(), d_tx_counts.end(), tx_displacements.begin()); + auto tmp_sample_local_nbr_values = + rmm::device_uvector(tx_displacements.back_element(handle.get_stream()) + + d_tx_counts.back_element(handle.get_stream()), + handle.get_stream()); + auto tmp_key_indices = + 
rmm::device_uvector(tmp_sample_local_nbr_values.size(), handle.get_stream()); + auto pair_first = thrust::make_zip_iterator(sample_local_nbr_values.begin(), key_indices.begin()); + thrust::scatter_if( + handle.get_thrust_policy(), + pair_first, + pair_first + sample_local_nbr_values.size(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + shuffle_index_compute_offset_t{ + raft::device_span(minor_comm_ranks.data(), minor_comm_ranks.size()), + raft::device_span(intra_partition_displacements.data(), + intra_partition_displacements.size()), + raft::device_span(tx_displacements.data(), tx_displacements.size())}), + minor_comm_ranks.begin(), + thrust::make_zip_iterator( + thrust::make_tuple(tmp_sample_local_nbr_values.begin(), tmp_key_indices.begin())), + is_not_equal_t{-1}); + + sample_local_nbr_values = std::move(tmp_sample_local_nbr_values); + key_indices = std::move(tmp_key_indices); + + std::vector h_tx_counts(d_tx_counts.size()); + raft::update_host( + h_tx_counts.data(), d_tx_counts.data(), d_tx_counts.size(), handle.get_stream()); + handle.sync_stream(); + + pair_first = thrust::make_zip_iterator( + thrust::make_tuple(sample_local_nbr_values.begin(), key_indices.begin())); + auto [rx_value_buffer, rx_counts] = + shuffle_values(minor_comm, pair_first, h_tx_counts, handle.get_stream()); + + sample_local_nbr_values = std::move(std::get<0>(rx_value_buffer)); + key_indices = std::move(std::get<1>(rx_value_buffer)); + auto local_frontier_sample_offsets = std::vector(rx_counts.size() + 1); + local_frontier_sample_offsets[0] = size_t{0}; + std::inclusive_scan( + rx_counts.begin(), rx_counts.end(), local_frontier_sample_offsets.begin() + 1); + + return std::make_tuple(std::move(sample_local_nbr_values), + std::move(key_indices), + std::move(local_frontier_sample_offsets)); +} + +template +std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::vector> +shuffle_and_compute_per_type_local_nbr_values( + raft::handle_t const& handle, 
+ rmm::device_uvector&& sample_per_type_nbr_values, + raft::device_span frontier_partitioned_per_type_value_local_sum_displacements, + raft::device_span K_offsets, + size_t K_sum, + value_t invalid_value) +{ + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + auto num_edge_types = K_offsets.size() - 1; + + auto sample_per_type_local_nbr_values = + std::move(sample_per_type_nbr_values); // neighbor value within an edge partition (note that + // each vertex's neighbors are distributed in + // minor_comm_size partitions) + rmm::device_uvector edge_types(sample_per_type_local_nbr_values.size(), + handle.get_stream()); + rmm::device_uvector key_indices(sample_per_type_local_nbr_values.size(), + handle.get_stream()); + auto minor_comm_ranks = + rmm::device_uvector(sample_per_type_local_nbr_values.size(), handle.get_stream()); + auto intra_partition_displacements = + rmm::device_uvector(sample_per_type_local_nbr_values.size(), handle.get_stream()); + + rmm::device_uvector d_tx_counts(minor_comm_size, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), d_tx_counts.begin(), d_tx_counts.end(), size_t{0}); + auto input_pair_first = thrust::make_zip_iterator(sample_per_type_local_nbr_values.begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::transform(handle.get_thrust_policy(), + input_pair_first, + input_pair_first + sample_per_type_local_nbr_values.size(), + thrust::make_zip_iterator(minor_comm_ranks.begin(), + intra_partition_displacements.begin(), + sample_per_type_local_nbr_values.begin(), + edge_types.begin(), + key_indices.begin()), + convert_pair_to_5tuple_t{ + raft::device_span( + frontier_partitioned_per_type_value_local_sum_displacements.data(), + frontier_partitioned_per_type_value_local_sum_displacements.size()), + raft::device_span(d_tx_counts.data(), d_tx_counts.size()), + K_offsets, + K_sum, + minor_comm_size, + invalid_value}); // SK, 
minor_comm_rank, disp, value, +type+, key + rmm::device_uvector tx_displacements(minor_comm_size, handle.get_stream()); + thrust::exclusive_scan( + handle.get_thrust_policy(), d_tx_counts.begin(), d_tx_counts.end(), tx_displacements.begin()); + auto tmp_sample_per_type_local_nbr_values = + rmm::device_uvector(tx_displacements.back_element(handle.get_stream()) + + d_tx_counts.back_element(handle.get_stream()), + handle.get_stream()); + auto tmp_edge_types = rmm::device_uvector( + tmp_sample_per_type_local_nbr_values.size(), handle.get_stream()); + auto tmp_key_indices = + rmm::device_uvector(tmp_sample_per_type_local_nbr_values.size(), handle.get_stream()); + auto triplet_first = thrust::make_zip_iterator( + sample_per_type_local_nbr_values.begin(), edge_types.begin(), key_indices.begin()); + thrust::scatter_if( + handle.get_thrust_policy(), + triplet_first, + triplet_first + sample_per_type_local_nbr_values.size(), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + shuffle_index_compute_offset_t{ + raft::device_span(minor_comm_ranks.data(), minor_comm_ranks.size()), + raft::device_span(intra_partition_displacements.data(), + intra_partition_displacements.size()), + raft::device_span(tx_displacements.data(), tx_displacements.size())}), + minor_comm_ranks.begin(), + thrust::make_zip_iterator(tmp_sample_per_type_local_nbr_values.begin(), + tmp_edge_types.begin(), + tmp_key_indices.begin()), + is_not_equal_t{-1}); + + sample_per_type_local_nbr_values = std::move(tmp_sample_per_type_local_nbr_values); + edge_types = std::move(tmp_edge_types); + key_indices = std::move(tmp_key_indices); + + std::vector h_tx_counts(d_tx_counts.size()); + raft::update_host( + h_tx_counts.data(), d_tx_counts.data(), d_tx_counts.size(), handle.get_stream()); + handle.sync_stream(); + + triplet_first = thrust::make_zip_iterator( + sample_per_type_local_nbr_values.begin(), edge_types.begin(), key_indices.begin()); + // SK triplet... 
+ auto [rx_value_buffer, rx_counts] = + shuffle_values(minor_comm, triplet_first, h_tx_counts, handle.get_stream()); + + sample_per_type_local_nbr_values = std::move(std::get<0>(rx_value_buffer)); + edge_types = std::move(std::get<1>(rx_value_buffer)); + key_indices = std::move(std::get<2>(rx_value_buffer)); + auto local_frontier_sample_offsets = std::vector(rx_counts.size() + 1); + local_frontier_sample_offsets[0] = size_t{0}; + std::inclusive_scan( + rx_counts.begin(), rx_counts.end(), local_frontier_sample_offsets.begin() + 1); + + return std::make_tuple(std::move(sample_per_type_local_nbr_values), + std::move(key_indices), + std::move(edge_types), + std::move(local_frontier_sample_offsets)); +} + +#if 0 +// aggregate local frontier (index, type) pairs and compute per-type local degrees (one for each +// pair) +template +thrust::tuple, + rmm::device_uvector, + std::vector, + rmm::device_uvector> +aggregate_sub_frontier_and_compute_per_type_local_degrees( + raft::handle_t const& handle, + raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, + raft::host_span local_frontier_offsets, + raft::device_span aggregate_local_frontier_unique_key_per_type_local_degree_offsets, + raft::host_span local_frontier_unique_key_offsets, + raft::device_span sub_frontier_indices, + raft::device_span sub_frontier_types, + size_t num_edge_types) +{ + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + + auto num_local_edge_partitions = local_frontier_offsets.size() - 1; + + assert(sub_frontier_indices.size() == sub_frontier_types.size()); + + auto sub_frontier_size = sub_frontier_indices.size(); + + auto sub_local_frontier_sizes = + host_scalar_allgather(minor_comm, sub_frontier_size, handle.get_stream()); + std::vector sub_local_frontier_offsets(sub_local_frontier_sizes.size() + 1); + sub_local_frontier_offsets[0] = 0; + std::inclusive_scan(sub_local_frontier_sizes.begin(), + sub_local_frontier_sizes.end(), + 
sub_local_frontier_offsets.begin() + 1); + + auto aggregate_sub_local_frontier_index_type_pairs = + allocate_dataframe_buffer>(sub_local_frontier_offsets.back(), + handle.get_stream()); + rmm::device_uvector aggregate_sub_local_frontier_per_type_local_degrees( + 0, handle.get_stream()); + if (sub_local_frontier_offsets.back() > 0) { + // aggregate frontier index type pairs + + auto aggregate_sub_local_frontier_index_type_pairs = + allocate_dataframe_buffer>( + sub_local_frontier_offsets.back(), handle.get_stream()); + device_allgatherv( + minor_comm, + thrust::make_zip_iterator(sub_frontier_indices.begin(), sub_frontier_types.begin()), + get_dataframe_buffer_beign(aggregate_sub_local_frontier_index_type_pairs), + sub_local_frontier_sizes, + std::vector(sub_local_frontier_offsets.begin(), sub_local_frontier_offsets.end() - 1), + handle.get_stream()); + + // compute per-type local degrees for the aggregated frontier indices + + aggregate_sub_local_frontier_per_type_local_degrees.resize( + size_dataframe_buffer(aggregate_sub_local_frontier_index_type_pairs), handle.get_stream()); + for (size_t i = 0; i < num_local_edge_partitions; ++i) { + thrust::transform( + handle.get_thrust_policy(), + get_dataframe_buffer_begin(aggregate_sub_local_frontier_index_type_pairs) + + sub_local_frontier_offsets[i], + get_dataframe_buffer_begin(aggregate_sub_local_frontier_index_type_pairs) + + sub_local_frontier_offsets[i + 1], + aggregate_sub_local_frontier_per_type_local_degrees.begin() + sub_local_frontier_offsets[i], + cuda::proclaim_return_type( + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), + unique_key_per_type_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i] * num_edge_types, + (local_frontier_unique_key_offsets[i + 1] - 
local_frontier_unique_key_offsets[i]) * + num_edge_types + + 1), + num_edge_types] __device__(auto pair) { + auto key_idx = thrust::get<0>(pair); + auto edge_type = thrust::get<1>(pair); + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + return static_cast( + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + edge_type + + 1] - + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + + edge_type]); + })); + } + } + + return std::make_tuple(std::move(std::get<0>(aggregate_sub_local_frontier_index_type_pairs)), + std::move(std::get<1>(aggregate_sub_local_frontier_index_type_pairs)), + std::move(sub_local_frontier_offsets), + std::move(aggregate_sub_local_frontier_per_type_local_degrees)); +} +#endif + +template +std::tuple /* local_nbr_indices */, + std::optional> /* key_indices */, + std::vector /* local_frontier_sample_offsets */> +homogeneous_biased_sample_with_replacement( + raft::handle_t const& handle, + raft::host_span local_frontier_offsets, + raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, + raft::host_span local_frontier_unique_key_offsets, + raft::device_span aggregate_local_frontier_unique_key_biases, + raft::device_span aggregate_local_frontier_unique_key_local_degree_offsets, + raft::random::RngState& rng_state, + size_t K) +{ + int minor_comm_rank{0}; + int minor_comm_size{1}; + if constexpr (multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + minor_comm_rank = minor_comm.get_rank(); + minor_comm_size = minor_comm.get_size(); + } + + auto num_local_edge_partitions = local_frontier_offsets.size() - 1; + + rmm::device_uvector local_nbr_indices(0, handle.get_stream()); + std::optional> key_indices{std::nullopt}; + std::vector local_frontier_sample_offsets{}; - // compute segmented inclusive sums (one segment per seed) + // compute segmented inclusive sums (one segment per key) auto unique_key_first = 
thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), @@ -1523,25 +2225,24 @@ homogeneous_biased_sample_with_replacement( aggregate_local_frontier_unique_key_biases.begin(), aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.begin()); - // sum collect local bias values (one value per seed) and collect local bias sums + // sum collect local bias values (one value per key) and collect local bias sums - auto aggregate_local_frontier_bias_local_sums = rmm::device_uvector( - local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); + auto aggregate_local_frontier_bias_local_sums = + rmm::device_uvector(local_frontier_offsets.back(), handle.get_stream()); for (size_t i = 0; i < num_local_edge_partitions; ++i) { thrust::tabulate( handle.get_thrust_policy(), get_dataframe_buffer_begin(aggregate_local_frontier_bias_local_sums) + - local_frontier_displacements[i], + local_frontier_offsets[i], get_dataframe_buffer_begin(aggregate_local_frontier_bias_local_sums) + - local_frontier_displacements[i] + local_frontier_sizes[i], - [key_idx_to_unique_key_idx = - raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - local_frontier_displacements[i], - local_frontier_sizes[i]), + local_frontier_offsets[i + 1], + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), unique_key_local_degree_offsets = raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_displacements[i], - local_frontier_unique_key_sizes[i] + 1), + local_frontier_unique_key_offsets[i], + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + 1), aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums = raft::device_span( 
aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.data(), @@ -1568,8 +2269,8 @@ homogeneous_biased_sample_with_replacement( handle, raft::device_span(aggregate_local_frontier_bias_local_sums.data(), aggregate_local_frontier_bias_local_sums.size()), - local_frontier_displacements, - local_frontier_sizes); + local_frontier_offsets, + 1); aggregate_local_frontier_bias_local_sums.resize(0, handle.get_stream()); aggregate_local_frontier_bias_local_sums.shrink_to_fit(handle.get_stream()); } else { @@ -1578,8 +2279,9 @@ homogeneous_biased_sample_with_replacement( // sample & compute local neighbor indices - rmm::device_uvector sample_random_numbers(local_frontier_sizes[minor_comm_rank] * K, - handle.get_stream()); + rmm::device_uvector sample_random_numbers( + (local_frontier_offsets[minor_comm_rank + 1] - local_frontier_offsets[minor_comm_rank]) * K, + handle.get_stream()); cugraph::detail::uniform_random_fill(handle.get_stream(), sample_random_numbers.data(), sample_random_numbers.size(), @@ -1592,26 +2294,30 @@ homogeneous_biased_sample_with_replacement( sample_random_numbers.end(), thrust::make_counting_iterator(size_t{0}), sample_random_numbers.begin(), - [frontier_bias_sums = - raft::device_span(frontier_bias_sums.data(), frontier_bias_sums.size()), - K, - invalid_value = std::numeric_limits::infinity()] __device__(bias_t r, size_t i) { - // frontier_bias_sums[i / K] will be 0 if degree is 0 or all the edges have 0 bias - return frontier_bias_sums[i / K] > 0.0 ? r * frontier_bias_sums[i / K] : invalid_value; - }); + cuda::proclaim_return_type( + [frontier_bias_sums = + raft::device_span(frontier_bias_sums.data(), frontier_bias_sums.size()), + K, + invalid_value = std::numeric_limits::infinity()] __device__(bias_t r, size_t i) { + // frontier_bias_sums[i / K] will be 0 if degree is 0 or all the edges have 0 bias + return frontier_bias_sums[i / K] > 0.0 ? 
r * frontier_bias_sums[i / K] : invalid_value; + })); rmm::device_uvector sample_local_random_numbers(0, handle.get_stream()); - std::tie(sample_local_random_numbers, key_indices, local_frontier_sample_offsets) = - shuffle_and_compute_local_nbr_values( - handle, - std::move(sample_random_numbers), - frontier_partitioned_bias_local_sum_displacements - ? std::make_optional>( - (*frontier_partitioned_bias_local_sum_displacements).data(), - (*frontier_partitioned_bias_local_sum_displacements).size()) - : std::nullopt, - K, - std::numeric_limits::infinity()); + if (minor_comm_size > 1) { + std::tie(sample_local_random_numbers, key_indices, local_frontier_sample_offsets) = + shuffle_and_compute_local_nbr_values( + handle, + std::move(sample_random_numbers), + raft::device_span( + (*frontier_partitioned_bias_local_sum_displacements).data(), + (*frontier_partitioned_bias_local_sum_displacements).size()), + K, + std::numeric_limits::infinity()); + } else { + sample_local_random_numbers = std::move(sample_random_numbers); + local_frontier_sample_offsets = {size_t{0}, sample_local_random_numbers.size()}; + } local_nbr_indices.resize(sample_local_random_numbers.size(), handle.get_stream()); for (size_t i = 0; i < num_local_edge_partitions; ++i) { @@ -1623,23 +2329,22 @@ homogeneous_biased_sample_with_replacement( sample_local_random_numbers = raft::device_span( sample_local_random_numbers.data() + local_frontier_sample_offsets[i], local_frontier_sample_offsets[i + 1] - local_frontier_sample_offsets[i]), - key_indices = key_indices - ? thrust::make_optional>( + key_indices = key_indices + ? 
thrust::make_optional>( (*key_indices).data() + local_frontier_sample_offsets[i], local_frontier_sample_offsets[i + 1] - local_frontier_sample_offsets[i]) - : thrust::nullopt, - key_idx_to_unique_key_idx = - raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - local_frontier_displacements[i], - local_frontier_sizes[i]), + : thrust::nullopt, + key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums = raft::device_span( aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.data(), aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.size()), unique_key_local_degree_offsets = raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_displacements[i], - local_frontier_unique_key_sizes[i] + 1), + local_frontier_unique_key_offsets[i], + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + 1), invalid_random_number = std::numeric_limits::infinity(), invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { auto key_idx = key_indices ? 
(*key_indices)[i] : (i / K); @@ -1674,11 +2379,9 @@ std::tuple /* local_nbr_indices */, std::vector /* local_frontier_sample_offsets */> homogeneous_biased_sample_without_replacement( raft::handle_t const& handle, - raft::host_span local_frontier_displacements, - raft::host_span local_frontier_sizes, + raft::host_span local_frontier_offsets, raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, - raft::host_span local_frontier_unique_key_displacements, - raft::host_span local_frontier_unique_key_sizes, + raft::host_span local_frontier_unique_key_offsets, raft::device_span aggregate_local_frontier_unique_key_biases, raft::device_span aggregate_local_frontier_unique_key_local_degree_offsets, raft::random::RngState& rng_state, @@ -1692,7 +2395,7 @@ homogeneous_biased_sample_without_replacement( minor_comm_size = minor_comm.get_size(); } - auto num_local_edge_partitions = local_frontier_unique_key_displacements.size(); + auto num_local_edge_partitions = local_frontier_offsets.size() - 1; rmm::device_uvector local_nbr_indices(0, handle.get_stream()); std::optional> key_indices{std::nullopt}; @@ -1703,21 +2406,20 @@ homogeneous_biased_sample_without_replacement( std::nullopt}; { rmm::device_uvector aggregate_local_frontier_local_degrees( - local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); + local_frontier_offsets.back(), handle.get_stream()); for (size_t i = 0; i < num_local_edge_partitions; ++i) { thrust::tabulate( handle.get_thrust_policy(), - aggregate_local_frontier_local_degrees.begin() + local_frontier_displacements[i], - aggregate_local_frontier_local_degrees.begin() + local_frontier_displacements[i] + - local_frontier_sizes[i], + aggregate_local_frontier_local_degrees.begin() + local_frontier_offsets[i], + aggregate_local_frontier_local_degrees.begin() + local_frontier_offsets[i + 1], [key_idx_to_unique_key_idx = raft::device_span( - aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - 
local_frontier_displacements[i], - local_frontier_sizes[i]), + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), unique_key_local_degree_offsets = raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_displacements[i], - local_frontier_unique_key_sizes[i] + 1)] __device__(size_t i) { + local_frontier_unique_key_offsets[i], + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + + 1)] __device__(size_t i) { auto unique_key_idx = key_idx_to_unique_key_idx[i]; return unique_key_local_degree_offsets[unique_key_idx + 1] - unique_key_local_degree_offsets[unique_key_idx]; @@ -1729,8 +2431,8 @@ homogeneous_biased_sample_without_replacement( handle, raft::device_span(aggregate_local_frontier_local_degrees.data(), aggregate_local_frontier_local_degrees.size()), - local_frontier_displacements, - local_frontier_sizes); + local_frontier_offsets, + 1); } else { frontier_degrees = std::move(aggregate_local_frontier_local_degrees); } @@ -1742,180 +2444,28 @@ homogeneous_biased_sample_without_replacement( frontier_degrees.end(), std::vector{static_cast(K + 1), static_cast(minor_comm_size * K * 2)}); - rmm::device_uvector nbr_indices(frontier_degrees.size() * K, handle.get_stream()); - if (minor_comm_size > 1) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - - std::vector low_local_frontier_sizes{}; - low_local_frontier_sizes = - host_scalar_allgather(minor_comm, frontier_partition_offsets[1], handle.get_stream()); - std::vector low_local_frontier_displacements(low_local_frontier_sizes.size()); - std::exclusive_scan(low_local_frontier_sizes.begin(), - low_local_frontier_sizes.end(), - low_local_frontier_displacements.begin(), - size_t{0}); - - if (low_local_frontier_displacements.back() + low_local_frontier_sizes.back() > 0) { - // aggregate frontier 
indices with their degrees in the low range - - auto aggregate_low_local_frontier_indices = rmm::device_uvector( - low_local_frontier_displacements.back() + low_local_frontier_sizes.back(), - handle.get_stream()); - device_allgatherv(minor_comm, - frontier_indices.begin(), - aggregate_low_local_frontier_indices.begin(), - low_local_frontier_sizes, - low_local_frontier_displacements, - handle.get_stream()); - - // collect 0 bias value neighbor indices - - rmm::device_uvector zero_bias_frontier_indices( - aggregate_low_local_frontier_indices.size() * K /* generous upper bound */, - handle.get_stream()); - rmm::device_uvector zero_bias_local_nbr_indices(zero_bias_frontier_indices.size(), - handle.get_stream()); - rmm::device_scalar counter(0, handle.get_stream()); - std::vector zero_bias_count_inclusive_sums(low_local_frontier_sizes.size()); - for (size_t i = 0; i < num_local_edge_partitions; ++i) { - thrust::for_each( - handle.get_thrust_policy(), - aggregate_low_local_frontier_indices.begin() + low_local_frontier_displacements[i], - aggregate_low_local_frontier_indices.begin() + - (low_local_frontier_displacements[i] + low_local_frontier_sizes[i]), - [key_idx_to_unique_key_idx = raft::device_span( - aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - local_frontier_displacements[i], - local_frontier_sizes[i]), - aggregate_local_frontier_unique_key_biases = - raft::device_span(aggregate_local_frontier_unique_key_biases.data(), - aggregate_local_frontier_unique_key_biases.size()), - unique_key_local_degree_offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_displacements[i], - local_frontier_unique_key_sizes[i] + 1), - zero_bias_frontier_indices = raft::device_span( - zero_bias_frontier_indices.data(), zero_bias_frontier_indices.size()), - zero_bias_local_nbr_indices = raft::device_span( - zero_bias_local_nbr_indices.data(), zero_bias_local_nbr_indices.size()), - input_offset = 
local_frontier_displacements[i], - counter = counter.data()] __device__(size_t i) { - auto unique_key_idx = key_idx_to_unique_key_idx[i]; - auto start_offset = unique_key_local_degree_offsets[unique_key_idx]; - auto end_offset = unique_key_local_degree_offsets[unique_key_idx + 1]; - cuda::atomic_ref atomic_counter(*counter); - for (auto j = start_offset; j < end_offset; ++j) { - if (aggregate_local_frontier_unique_key_biases[j] == 0.0) { - auto idx = atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); - zero_bias_frontier_indices[idx] = i; - zero_bias_local_nbr_indices[idx] = j - start_offset; - } - } - }); - zero_bias_count_inclusive_sums[i] = counter.value(handle.get_stream()); - } - zero_bias_frontier_indices.resize(zero_bias_count_inclusive_sums.back(), handle.get_stream()); - zero_bias_frontier_indices.shrink_to_fit(handle.get_stream()); - zero_bias_local_nbr_indices.resize(zero_bias_frontier_indices.size(), handle.get_stream()); - zero_bias_local_nbr_indices.shrink_to_fit(handle.get_stream()); - std::vector zero_bias_counts(zero_bias_count_inclusive_sums.size()); - std::adjacent_difference(zero_bias_count_inclusive_sums.begin(), - zero_bias_count_inclusive_sums.end(), - zero_bias_counts.begin()); - - rmm::device_uvector low_frontier_gathered_zero_bias_frontier_indices( - 0, handle.get_stream()); - rmm::device_uvector low_frontier_gathered_zero_bias_nbr_indices(0, - handle.get_stream()); - std::vector rx_counts{}; - std::forward_as_tuple(std::tie(low_frontier_gathered_zero_bias_frontier_indices, - low_frontier_gathered_zero_bias_nbr_indices), - rx_counts) = - shuffle_values(minor_comm, - thrust::make_zip_iterator(zero_bias_frontier_indices.begin(), - zero_bias_local_nbr_indices.begin()), - zero_bias_counts, - handle.get_stream()); - std::vector rx_displacements(rx_counts.size()); - std::exclusive_scan(rx_counts.begin(), rx_counts.end(), rx_displacements.begin(), size_t{0}); - - // convert local neighbor indices to global neighbor indices and 
sort - - auto pair_first = - thrust::make_zip_iterator(low_frontier_gathered_zero_bias_frontier_indices.begin(), - low_frontier_gathered_zero_bias_nbr_indices.begin()); - for (size_t i = 0; i < num_local_edge_partitions; ++i) { - thrust::transform( - handle.get_thrust_policy(), - pair_first + rx_displacements[i], - pair_first + rx_displacements[i] + rx_counts[i], - low_frontier_gathered_zero_bias_nbr_indices.begin() + rx_displacements[i], - cuda::proclaim_return_type( - [frontier_partitioned_local_degree_displacements = raft::device_span( - (*frontier_partitioned_local_degree_displacements).data(), - (*frontier_partitioned_local_degree_displacements).size()), - minor_comm_size, - minor_comm_rank = i] __device__(auto pair) { - auto frontier_idx = thrust::get<0>(pair); - auto local_nbr_idx = thrust::get<1>(pair); - return frontier_partitioned_local_degree_displacements[frontier_idx * - minor_comm_size + - minor_comm_rank] + - local_nbr_idx; - })); - } + rmm::device_uvector nbr_indices(frontier_degrees.size() * K, handle.get_stream()); - thrust::sort(handle.get_thrust_policy(), - pair_first, - pair_first + low_frontier_gathered_zero_bias_frontier_indices.size()); - - // update neighbor indices excluding zero bias neighbor indices + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + if (frontier_partition_offsets[1] > 0) { thrust::for_each( handle.get_thrust_policy(), frontier_indices.begin(), frontier_indices.begin() + frontier_partition_offsets[1], - [sorted_zero_bias_frontier_indices = - raft::device_span(low_frontier_gathered_zero_bias_frontier_indices.data(), - low_frontier_gathered_zero_bias_frontier_indices.size()), - sorted_zero_bias_nbr_indices = - raft::device_span(low_frontier_gathered_zero_bias_nbr_indices.data(), - low_frontier_gathered_zero_bias_nbr_indices.size()), - frontier_degrees = + [frontier_degrees = raft::device_span(frontier_degrees.data(), frontier_degrees.size()), nbr_indices = 
raft::device_span(nbr_indices.data(), nbr_indices.size()), K, invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { - auto first = thrust::lower_bound(thrust::seq, - sorted_zero_bias_frontier_indices.begin(), - sorted_zero_bias_frontier_indices.end(), - i); - auto last = - thrust::upper_bound(thrust::seq, first, sorted_zero_bias_frontier_indices.end(), i); - auto degree = frontier_degrees[i]; - edge_t num_valid = 0; - if (thrust::distance(first, last) == 0) { - thrust::sequence(thrust::seq, - nbr_indices.begin() + i * K, - nbr_indices.begin() + i * K + degree, - edge_t{0}); - num_valid = degree; - } else { - auto start_offset = thrust::distance(sorted_zero_bias_frontier_indices.begin(), first); - auto end_offset = thrust::distance(sorted_zero_bias_frontier_indices.begin(), last); - for (size_t j = 0; j < degree; ++j) { - if (!thrust::binary_search(thrust::seq, - sorted_zero_bias_nbr_indices.begin() + start_offset, - sorted_zero_bias_nbr_indices.begin() + end_offset, - j)) { - *(nbr_indices.begin() + i * K + num_valid) = j; - ++num_valid; - } - } - } + auto degree = frontier_degrees[i]; + thrust::sequence(thrust::seq, + nbr_indices.begin() + i * K, + nbr_indices.begin() + i * K + degree, + edge_t{0}); thrust::fill(thrust::seq, - nbr_indices.begin() + i * K + num_valid, + nbr_indices.begin() + i * K + +degree, nbr_indices.begin() + (i + 1) * K, invalid_idx); }); @@ -1925,23 +2475,23 @@ homogeneous_biased_sample_without_replacement( std::vector mid_local_frontier_sizes{}; mid_local_frontier_sizes = host_scalar_allgather(minor_comm, mid_frontier_size, handle.get_stream()); - std::vector mid_local_frontier_displacements(mid_local_frontier_sizes.size()); - std::exclusive_scan(mid_local_frontier_sizes.begin(), + std::vector mid_local_frontier_offsets(mid_local_frontier_sizes.size() + 1); + mid_local_frontier_offsets[0] = 0; + std::inclusive_scan(mid_local_frontier_sizes.begin(), mid_local_frontier_sizes.end(), - mid_local_frontier_displacements.begin(), - 
size_t{0}); + mid_local_frontier_offsets.begin() + 1); - if (mid_local_frontier_displacements.back() + mid_local_frontier_sizes.back() > 0) { + if (mid_local_frontier_offsets.back() > 0) { // aggregate frontier indices with their degrees in the medium range - auto aggregate_mid_local_frontier_indices = rmm::device_uvector( - mid_local_frontier_displacements.back() + mid_local_frontier_sizes.back(), - handle.get_stream()); + auto aggregate_mid_local_frontier_indices = + rmm::device_uvector(mid_local_frontier_offsets.back(), handle.get_stream()); device_allgatherv(minor_comm, frontier_indices.begin() + frontier_partition_offsets[1], aggregate_mid_local_frontier_indices.begin(), mid_local_frontier_sizes, - mid_local_frontier_displacements, + std::vector(mid_local_frontier_offsets.begin(), + mid_local_frontier_offsets.end() - 1), handle.get_stream()); // compute local degrees for the aggregated frontier indices @@ -1951,19 +2501,19 @@ homogeneous_biased_sample_without_replacement( for (size_t i = 0; i < num_local_edge_partitions; ++i) { thrust::transform( handle.get_thrust_policy(), - aggregate_mid_local_frontier_indices.begin() + mid_local_frontier_displacements[i], - aggregate_mid_local_frontier_indices.begin() + mid_local_frontier_displacements[i] + - mid_local_frontier_sizes[i], - aggregate_mid_local_frontier_local_degrees.begin() + mid_local_frontier_displacements[i], + aggregate_mid_local_frontier_indices.begin() + mid_local_frontier_offsets[i], + aggregate_mid_local_frontier_indices.begin() + mid_local_frontier_offsets[i + 1], + aggregate_mid_local_frontier_local_degrees.begin() + mid_local_frontier_offsets[i], cuda::proclaim_return_type( [key_idx_to_unique_key_idx = raft::device_span( aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - local_frontier_displacements[i], - local_frontier_sizes[i]), + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), unique_key_local_degree_offsets = raft::device_span( 
aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_displacements[i], - local_frontier_unique_key_sizes[i] + 1)] __device__(size_t i) { + local_frontier_unique_key_offsets[i], + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + + 1)] __device__(size_t i) { auto unique_key_idx = key_idx_to_unique_key_idx[i]; return static_cast(unique_key_local_degree_offsets[unique_key_idx + 1] - unique_key_local_degree_offsets[unique_key_idx]); @@ -1973,7 +2523,7 @@ homogeneous_biased_sample_without_replacement( // gather biases for the aggregated frontier indices rmm::device_uvector aggregate_mid_local_frontier_biases(0, handle.get_stream()); - std::vector mid_local_frontier_degree_sums(mid_local_frontier_sizes.size()); + std::vector mid_local_frontier_local_degree_sums(mid_local_frontier_sizes.size()); { rmm::device_uvector aggregate_mid_local_frontier_local_degree_offsets( aggregate_mid_local_frontier_local_degrees.size() + 1, handle.get_stream()); @@ -1987,8 +2537,6 @@ homogeneous_biased_sample_without_replacement( aggregate_mid_local_frontier_local_degree_offsets.back_element(handle.get_stream()), handle.get_stream()); - std::vector mid_local_frontier_degree_sum_lasts( - mid_local_frontier_degree_sums.size()); for (size_t i = 0; i < num_local_edge_partitions; ++i) { thrust::for_each( handle.get_thrust_policy(), @@ -1996,17 +2544,18 @@ homogeneous_biased_sample_without_replacement( thrust::make_counting_iterator(mid_local_frontier_sizes[i]), [key_idx_to_unique_key_idx = raft::device_span( aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - local_frontier_displacements[i], - local_frontier_sizes[i]), + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), aggregate_local_frontier_unique_key_biases = raft::device_span(aggregate_local_frontier_unique_key_biases.data(), aggregate_local_frontier_unique_key_biases.size()), unique_key_local_degree_offsets = 
raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_displacements[i], - local_frontier_unique_key_sizes[i] + 1), + local_frontier_unique_key_offsets[i], + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + + 1), mid_local_frontier_indices = raft::device_span( - aggregate_mid_local_frontier_indices.data() + mid_local_frontier_displacements[i], + aggregate_mid_local_frontier_indices.data() + mid_local_frontier_offsets[i], mid_local_frontier_sizes[i]), aggregate_mid_local_frontier_biases = raft::device_span(aggregate_mid_local_frontier_biases.data(), @@ -2014,7 +2563,7 @@ homogeneous_biased_sample_without_replacement( aggregate_mid_local_frontier_local_degree_offsets = raft::device_span( aggregate_mid_local_frontier_local_degree_offsets.data(), aggregate_mid_local_frontier_local_degree_offsets.size()), - output_offset = mid_local_frontier_displacements[i]] __device__(size_t i) { + output_offset = mid_local_frontier_offsets[i]] __device__(size_t i) { auto unique_key_idx = key_idx_to_unique_key_idx[mid_local_frontier_indices[i]]; thrust::copy(thrust::seq, aggregate_local_frontier_unique_key_biases.begin() + @@ -2024,14 +2573,33 @@ homogeneous_biased_sample_without_replacement( aggregate_mid_local_frontier_biases.begin() + aggregate_mid_local_frontier_local_degree_offsets[output_offset + i]); }); - mid_local_frontier_degree_sum_lasts[i] = - aggregate_mid_local_frontier_local_degree_offsets.element( - mid_local_frontier_displacements[i] + mid_local_frontier_sizes[i], - handle.get_stream()); } - std::adjacent_difference(mid_local_frontier_degree_sum_lasts.begin(), - mid_local_frontier_degree_sum_lasts.end(), - mid_local_frontier_degree_sums.begin()); + + rmm::device_uvector d_mid_local_frontier_offsets(mid_local_frontier_offsets.size(), + handle.get_stream()); + raft::update_device(d_mid_local_frontier_offsets.data(), + mid_local_frontier_offsets.data(), + 
mid_local_frontier_offsets.size(), + handle.get_stream()); + rmm::device_uvector d_lasts(num_local_edge_partitions, handle.get_stream()); + auto map_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [mid_local_frontier_offsets = raft::device_span( + d_mid_local_frontier_offsets.data(), + d_mid_local_frontier_offsets.size())] __device__(size_t i) { + return mid_local_frontier_offsets[i + 1]; + })); + thrust::gather(handle.get_thrust_policy(), + map_first, + map_first + num_local_edge_partitions, + aggregate_mid_local_frontier_local_degree_offsets.begin(), + d_lasts.begin()); + std::vector h_lasts(d_lasts.size()); + raft::update_host(h_lasts.data(), d_lasts.data(), d_lasts.size(), handle.get_stream()); + handle.sync_stream(); + std::adjacent_difference( + h_lasts.begin(), h_lasts.end(), mid_local_frontier_local_degree_sums.begin()); } aggregate_mid_local_frontier_indices.resize(0, handle.get_stream()); aggregate_mid_local_frontier_indices.shrink_to_fit(handle.get_stream()); @@ -2063,7 +2631,7 @@ homogeneous_biased_sample_without_replacement( std::tie(mid_frontier_gathered_biases, std::ignore) = shuffle_values(minor_comm, aggregate_mid_local_frontier_biases.data(), - mid_local_frontier_degree_sums, + mid_local_frontier_local_degree_sums, handle.get_stream()); aggregate_mid_local_frontier_biases.resize(0, handle.get_stream()); aggregate_mid_local_frontier_biases.shrink_to_fit(handle.get_stream()); @@ -2116,7 +2684,7 @@ homogeneous_biased_sample_without_replacement( // now sample and update indices - compute_biased_sampling_index_without_replacement( + compute_homogeneous_biased_sampling_index_without_replacement( handle, std::nullopt, raft::device_span(mid_frontier_degree_offsets.data(), @@ -2136,29 +2704,28 @@ homogeneous_biased_sample_without_replacement( high_local_frontier_sizes = host_scalar_allgather(minor_comm, high_frontier_size, handle.get_stream()); - std::vector 
high_local_frontier_displacements(high_local_frontier_sizes.size()); - std::exclusive_scan(high_local_frontier_sizes.begin(), + std::vector high_local_frontier_offsets(high_local_frontier_sizes.size() + 1); + high_local_frontier_offsets[0] = 0; + std::inclusive_scan(high_local_frontier_sizes.begin(), high_local_frontier_sizes.end(), - high_local_frontier_displacements.begin(), - size_t{0}); - if (high_local_frontier_displacements.back() + high_local_frontier_sizes.back() > 0) { + high_local_frontier_offsets.begin() + 1); + if (high_local_frontier_offsets.back() > 0) { // aggregate frontier indices with their degrees in the high range - auto aggregate_high_local_frontier_indices = rmm::device_uvector( - high_local_frontier_displacements.back() + high_local_frontier_sizes.back(), - handle.get_stream()); + auto aggregate_high_local_frontier_indices = + rmm::device_uvector(high_local_frontier_offsets.back(), handle.get_stream()); device_allgatherv(minor_comm, frontier_indices.begin() + frontier_partition_offsets[2], aggregate_high_local_frontier_indices.begin(), high_local_frontier_sizes, - high_local_frontier_displacements, + std::vector(high_local_frontier_offsets.begin(), + high_local_frontier_offsets.end() - 1), handle.get_stream()); // local sample and update indices rmm::device_uvector aggregate_high_local_frontier_local_nbr_indices( - (high_local_frontier_displacements.back() + high_local_frontier_sizes.back()) * K, - handle.get_stream()); + high_local_frontier_offsets.back() * K, handle.get_stream()); rmm::device_uvector aggregate_high_local_frontier_keys( aggregate_high_local_frontier_local_nbr_indices.size(), handle.get_stream()); for (size_t i = 0; i < num_local_edge_partitions; ++i) { @@ -2166,28 +2733,26 @@ homogeneous_biased_sample_without_replacement( handle.get_stream()); thrust::gather( handle.get_thrust_policy(), - aggregate_high_local_frontier_indices.data() + high_local_frontier_displacements[i], - aggregate_high_local_frontier_indices.data() + 
high_local_frontier_displacements[i] + - high_local_frontier_sizes[i], - aggregate_local_frontier_key_idx_to_unique_key_idx.data() + - local_frontier_displacements[i], + aggregate_high_local_frontier_indices.begin() + high_local_frontier_offsets[i], + aggregate_high_local_frontier_indices.begin() + high_local_frontier_offsets[i + 1], + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], unique_key_indices_for_key_indices.begin()); - compute_biased_sampling_index_without_replacement( + compute_homogeneous_biased_sampling_index_without_replacement( handle, std::make_optional>( unique_key_indices_for_key_indices.data(), unique_key_indices_for_key_indices.size()), raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_displacements[i], - local_frontier_unique_key_sizes[i] + 1), + local_frontier_unique_key_offsets[i], + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + 1), raft::device_span(aggregate_local_frontier_unique_key_biases.data(), aggregate_local_frontier_unique_key_biases.size()), std::nullopt, raft::device_span(aggregate_high_local_frontier_local_nbr_indices.data() + - high_local_frontier_displacements[i] * K, + high_local_frontier_offsets[i] * K, high_local_frontier_sizes[i] * K), std::make_optional>( - aggregate_high_local_frontier_keys.data() + high_local_frontier_displacements[i] * K, + aggregate_high_local_frontier_keys.data() + high_local_frontier_offsets[i] * K, high_local_frontier_sizes[i] * K), rng_state, K, @@ -2319,7 +2884,18 @@ homogeneous_biased_sample_without_replacement( nbr_indices.begin() + high_frontier_indices[i] * K); }); } + + std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = + shuffle_and_compute_local_nbr_values( + handle, + std::move(nbr_indices), + raft::device_span((*frontier_partitioned_local_degree_displacements).data(), + (*frontier_partitioned_local_degree_displacements).size()), 
+ K, + cugraph::invalid_edge_id_v); + } else { // minor_comm_size == 1 + local_nbr_indices.resize(frontier_degrees.size() * K, handle.get_stream()); // sample from low-degree vertices thrust::for_each( @@ -2335,25 +2911,19 @@ homogeneous_biased_sample_without_replacement( aggregate_local_frontier_unique_key_local_degree_offsets = raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data(), aggregate_local_frontier_unique_key_local_degree_offsets.size()), - nbr_indices = raft::device_span(nbr_indices.data(), nbr_indices.size()), + local_nbr_indices = + raft::device_span(local_nbr_indices.data(), local_nbr_indices.size()), K, invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { auto unique_key_idx = key_idx_to_unique_key_idx[i]; - auto start_offset = - aggregate_local_frontier_unique_key_local_degree_offsets[unique_key_idx]; auto degree = aggregate_local_frontier_unique_key_local_degree_offsets[unique_key_idx + 1] - - start_offset; - edge_t num_valid = 0; + aggregate_local_frontier_unique_key_local_degree_offsets[unique_key_idx]; for (size_t j = 0; j < degree; ++j) { - auto bias = aggregate_local_frontier_unique_key_biases[start_offset + j]; - if (bias > 0.0) { - *(nbr_indices.begin() + i * K + num_valid) = j; - ++num_valid; - } + *(local_nbr_indices.begin() + i * K + j) = j; } thrust::fill(thrust::seq, - nbr_indices.begin() + i * K + num_valid, - nbr_indices.begin() + (i + 1) * K, + local_nbr_indices.begin() + i * K + degree, + local_nbr_indices.begin() + (i + 1) * K, invalid_idx); }); @@ -2364,11 +2934,11 @@ homogeneous_biased_sample_without_replacement( handle.get_stream()); thrust::gather( handle.get_thrust_policy(), - frontier_indices.data() + frontier_partition_offsets[1], - frontier_indices.data() + frontier_partition_offsets[1] + mid_and_high_frontier_size, + frontier_indices.begin() + frontier_partition_offsets[1], + frontier_indices.begin() + frontier_partition_offsets[1] + mid_and_high_frontier_size, 
aggregate_local_frontier_key_idx_to_unique_key_idx.begin(), unique_key_indices_for_key_indices.begin()); - compute_biased_sampling_index_without_replacement( + compute_homogeneous_biased_sampling_index_without_replacement( handle, std::make_optional>( unique_key_indices_for_key_indices.data(), unique_key_indices_for_key_indices.size()), @@ -2379,75 +2949,909 @@ homogeneous_biased_sample_without_replacement( aggregate_local_frontier_unique_key_biases.size()), std::make_optional>( frontier_indices.data() + frontier_partition_offsets[1], mid_and_high_frontier_size), - raft::device_span(nbr_indices.data(), nbr_indices.size()), + raft::device_span(local_nbr_indices.data(), local_nbr_indices.size()), std::nullopt, rng_state, K, false); - } - std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = - shuffle_and_compute_local_nbr_values( - handle, - std::move(nbr_indices), - frontier_partitioned_local_degree_displacements - ? std::make_optional>( - (*frontier_partitioned_local_degree_displacements).data(), - (*frontier_partitioned_local_degree_displacements).size()) - : std::nullopt, - K, - cugraph::invalid_edge_id_v); + local_frontier_sample_offsets = std::vector{0, local_nbr_indices.size()}; + } return std::make_tuple( std::move(local_nbr_indices), std::move(key_indices), std::move(local_frontier_sample_offsets)); } -// skip conversion if local neighbor index is cugraph::invalid_edge_id_v -template -rmm::device_uvector convert_to_unmasked_local_nbr_idx( +template +std::tuple /* local_nbr_indices */, + std::optional> /* key_indices */, + std::vector /* local_frontier_sample_offsets */> +heterogeneous_biased_sample_without_replacement( raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexIterator aggregate_local_frontier_major_first, - rmm::device_uvector&& local_nbr_indices, - std::optional> key_indices, - raft::host_span local_frontier_sample_offsets, - raft::host_span local_frontier_displacements, - raft::host_span 
local_frontier_sizes, - size_t K) + raft::host_span local_frontier_offsets, + raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, + raft::host_span local_frontier_unique_key_offsets, + raft::device_span aggregate_local_frontier_unique_key_biases, + raft::device_span aggregate_local_frontier_unique_key_per_type_local_degree_offsets, + raft::random::RngState& rng_state, + raft::host_span Ks) { - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - static_assert( - std::is_same_v::value_type>); + int minor_comm_rank{0}; + int minor_comm_size{1}; + if constexpr (multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + minor_comm_rank = minor_comm.get_rank(); + minor_comm_size = minor_comm.get_size(); + } - auto edge_mask_view = graph_view.edge_mask_view(); + auto num_local_edge_partitions = local_frontier_offsets.size() - 1; + auto num_edge_types = static_cast(Ks.size()); - auto [aggregate_local_frontier_unique_majors, - aggregate_local_frontier_major_idx_to_unique_major_idx, - local_frontier_unique_major_displacements, - local_frontier_unique_major_sizes] = - compute_unique_keys(handle, - aggregate_local_frontier_major_first, - local_frontier_displacements, - local_frontier_sizes); + rmm::device_uvector local_nbr_indices(0, handle.get_stream()); + std::optional> key_indices{std::nullopt}; + std::vector local_frontier_sample_offsets{}; - // to avoid searching the entire neighbor list K times for high degree vertices with edge masking - auto local_frontier_unique_major_valid_local_nbr_count_inclusive_sums = - compute_valid_local_nbr_count_inclusive_sums( + rmm::device_uvector frontier_per_type_degrees(0, handle.get_stream()); + std::optional> + frontier_partitioned_per_type_local_degree_displacements{std::nullopt}; + { + rmm::device_uvector aggregate_local_frontier_per_type_local_degrees( + local_frontier_offsets.back() * num_edge_types, 
handle.get_stream()); + for (size_t i = 0; i < num_local_edge_partitions; ++i) { + thrust::tabulate( + handle.get_thrust_policy(), + aggregate_local_frontier_per_type_local_degrees.begin() + + local_frontier_offsets[i] * num_edge_types, + aggregate_local_frontier_per_type_local_degrees.begin() + + local_frontier_offsets[i + 1] * num_edge_types, + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), + unique_key_per_type_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i] * num_edge_types, + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) * + num_edge_types + + 1)] __device__(size_t i) { + auto key_idx = i / num_edge_types; + auto edge_type = static_cast(i % num_edge_types); + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + return static_cast( + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + edge_type + + 1] - + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + edge_type]); + }); + } + if (minor_comm_size > 1) { + std::tie(frontier_per_type_degrees, + frontier_partitioned_per_type_local_degree_displacements) = + compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( + handle, + raft::device_span(aggregate_local_frontier_per_type_local_degrees.data(), + aggregate_local_frontier_per_type_local_degrees.size()), + local_frontier_offsets, + num_edge_types); + } else { + frontier_per_type_degrees = std::move(aggregate_local_frontier_per_type_local_degrees); + } + } + + std::vector thresholds(num_edge_types * 2); + for (size_t i = 0; i < num_edge_types; ++i) { + thresholds[i * 2] = static_cast(Ks[i] + 1); + thresholds[i * 2 + 1] = static_cast(minor_comm_size * Ks[i] * 2); + } + auto [frontier_indices, 
frontier_edge_types, frontier_partition_offsets] = + partition_v_frontier_per_value_idx( handle, - graph_view, - aggregate_local_frontier_unique_majors.begin(), - raft::host_span(local_frontier_unique_major_displacements.data(), - local_frontier_unique_major_displacements.size()), - raft::host_span(local_frontier_unique_major_sizes.data(), - local_frontier_unique_major_sizes.size())); + frontier_per_type_degrees.begin(), + frontier_per_type_degrees.end(), + raft::host_span(thresholds.data(), thresholds.size()), + num_edge_types); - auto sample_major_idx_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - cuda::proclaim_return_type( - [K, - key_indices = key_indices ? thrust::make_optional>( - (*key_indices).data(), (*key_indices).size()) + auto K_sum = std::accumulate(Ks.begin(), Ks.end(), size_t{0}); + + std::vector h_K_offsets(Ks.size() + 1); + h_K_offsets[0] = 0; + std::inclusive_scan(Ks.begin(), Ks.end(), h_K_offsets.begin() + 1); + rmm::device_uvector d_K_offsets(h_K_offsets.size(), handle.get_stream()); + raft::update_device( + d_K_offsets.data(), h_K_offsets.data(), h_K_offsets.size(), handle.get_stream()); + + if (minor_comm_size > 1) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + + rmm::device_uvector per_type_nbr_indices( + (frontier_per_type_degrees.size() / num_edge_types) * K_sum, handle.get_stream()); + + if (frontier_partition_offsets[1] > 0) { + auto pair_first = + thrust::make_zip_iterator(frontier_indices.begin(), frontier_edge_types.begin()); + thrust::for_each( + handle.get_thrust_policy(), + pair_first, + pair_first + frontier_partition_offsets[1], + [frontier_per_type_degrees = raft::device_span( + frontier_per_type_degrees.data(), frontier_per_type_degrees.size()), + per_type_nbr_indices = + raft::device_span(per_type_nbr_indices.data(), per_type_nbr_indices.size()), + K_offsets = raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + K_sum, + invalid_idx = 
cugraph::invalid_edge_id_v] __device__(auto pair) { +            auto idx = thrust::get<0>(pair); +            auto type = thrust::get<1>(pair); +            auto per_type_degree = frontier_per_type_degrees[idx * num_edge_types + type]; +            thrust::sequence( +              thrust::seq, +              per_type_nbr_indices.begin() + idx * K_sum + K_offsets[type], +              per_type_nbr_indices.begin() + idx * K_sum + K_offsets[type] + per_type_degree, +              edge_t{0}); +            thrust::fill( +              thrust::seq, +              per_type_nbr_indices.begin() + idx * K_sum + K_offsets[type] + per_type_degree, +              per_type_nbr_indices.begin() + idx * K_sum + K_offsets[type + 1], +              invalid_idx); +          }); +    } + +    auto mid_frontier_size = frontier_partition_offsets[2] - frontier_partition_offsets[0]; +    auto mid_local_frontier_sizes = +      host_scalar_allgather(minor_comm, mid_frontier_size, handle.get_stream()); +    std::vector mid_local_frontier_offsets(mid_local_frontier_sizes.size() + 1); +    mid_local_frontier_offsets[0] = 0; +    std::inclusive_scan(mid_local_frontier_sizes.begin(), +                        mid_local_frontier_sizes.end(), +                        mid_local_frontier_offsets.begin() + 1); + +    if (mid_local_frontier_offsets.back() > 0) { +      // aggregate frontier index type pairs with their degrees in the medium range + +      auto aggregate_mid_local_frontier_index_type_pairs = +        allocate_dataframe_buffer>( +          mid_local_frontier_offsets.back(), handle.get_stream()); +      device_allgatherv( +        minor_comm, +        thrust::make_zip_iterator(frontier_indices.begin(), frontier_edge_types.begin()) + +          frontier_partition_offsets[1], +        get_dataframe_buffer_begin(aggregate_mid_local_frontier_index_type_pairs), +        mid_local_frontier_sizes, +        std::vector(mid_local_frontier_offsets.begin(), +                            mid_local_frontier_offsets.end() - 1), +        handle.get_stream()); + +      // compute per-type local degrees for the aggregated frontier index type pairs + +      rmm::device_uvector aggregate_mid_local_frontier_per_type_local_degrees( +        size_dataframe_buffer(aggregate_mid_local_frontier_index_type_pairs), handle.get_stream()); +      for (size_t i = 0; i < 
num_local_edge_partitions; ++i) { + thrust::transform( + handle.get_thrust_policy(), + get_dataframe_buffer_begin(aggregate_mid_local_frontier_index_type_pairs) + + mid_local_frontier_offsets[i], + get_dataframe_buffer_begin(aggregate_mid_local_frontier_index_type_pairs) + + mid_local_frontier_offsets[i + 1], + aggregate_mid_local_frontier_per_type_local_degrees.begin() + + mid_local_frontier_offsets[i], + cuda::proclaim_return_type( + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), + unique_key_per_type_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i] * num_edge_types, + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) * + num_edge_types + + 1), + num_edge_types] __device__(auto pair) { + auto key_idx = thrust::get<0>(pair); + auto edge_type = thrust::get<1>(pair); + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + return static_cast( + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + + edge_type + 1] - + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + + edge_type]); + })); + } + + // gather biases for the aggregated frontier index type pairs + + rmm::device_uvector aggregate_mid_local_frontier_biases(0, handle.get_stream()); + std::vector mid_local_frontier_per_type_local_degree_sums( + mid_local_frontier_sizes.size(), 0); + { + rmm::device_uvector aggregate_mid_local_frontier_per_type_local_degree_offsets( + aggregate_mid_local_frontier_per_type_local_degrees.size() + 1, handle.get_stream()); + aggregate_mid_local_frontier_per_type_local_degree_offsets.set_element_to_zero_async( + 0, handle.get_stream()); + thrust::inclusive_scan( + handle.get_thrust_policy(), + 
aggregate_mid_local_frontier_per_type_local_degrees.begin(), + aggregate_mid_local_frontier_per_type_local_degrees.end(), + aggregate_mid_local_frontier_per_type_local_degree_offsets.begin() + 1); + for (size_t i = 0; i < num_local_edge_partitions; ++i) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(mid_local_frontier_offsets[i + 1] - + mid_local_frontier_offsets[i]), + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), + aggregate_local_frontier_unique_key_biases = + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), + unique_key_per_type_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i] * num_edge_types, + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) * + num_edge_types + + 1), + mid_local_frontier_indices = raft::device_span( + std::get<0>(aggregate_mid_local_frontier_index_type_pairs).data() + + mid_local_frontier_offsets[i], + mid_local_frontier_sizes[i]), + mid_local_frontier_types = raft::device_span( + std::get<1>(aggregate_mid_local_frontier_index_type_pairs).data() + + mid_local_frontier_offsets[i], + mid_local_frontier_sizes[i]), + mid_local_frontier_per_type_local_degree_offsets = raft::device_span( + aggregate_mid_local_frontier_per_type_local_degree_offsets.data() + + mid_local_frontier_offsets[i], + mid_local_frontier_sizes[i]), + aggregate_mid_local_frontier_biases = raft::device_span( + aggregate_mid_local_frontier_biases.data(), + aggregate_mid_local_frontier_biases.size())] __device__(size_t i) { + auto unique_key_idx = key_idx_to_unique_key_idx[mid_local_frontier_indices[i]]; + auto type = 
mid_local_frontier_types[i]; + thrust::copy( + thrust::seq, + aggregate_local_frontier_unique_key_biases.begin() + + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + type], + aggregate_local_frontier_unique_key_biases.begin() + + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + type + + 1], + aggregate_mid_local_frontier_biases.begin() + + mid_local_frontier_per_type_local_degree_offsets[i]); + }); + } + std::get<0>(aggregate_mid_local_frontier_index_type_pairs).resize(0, handle.get_stream()); + std::get<1>(aggregate_mid_local_frontier_index_type_pairs).resize(0, handle.get_stream()); + std::get<0>(aggregate_mid_local_frontier_index_type_pairs) + .shrink_to_fit(handle.get_stream()); + std::get<1>(aggregate_mid_local_frontier_index_type_pairs) + .shrink_to_fit(handle.get_stream()); + + rmm::device_uvector d_mid_local_frontier_offsets(mid_local_frontier_offsets.size(), + handle.get_stream()); + raft::update_device(d_mid_local_frontier_offsets.data(), + mid_local_frontier_offsets.data(), + mid_local_frontier_offsets.size(), + handle.get_stream()); + rmm::device_uvector d_lasts(num_local_edge_partitions, handle.get_stream()); + auto map_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [mid_local_frontier_offsets = raft::device_span( + d_mid_local_frontier_offsets.data(), + d_mid_local_frontier_offsets.size())] __device__(size_t i) { + return mid_local_frontier_offsets[i + 1]; + })); + thrust::gather(handle.get_thrust_policy(), + d_mid_local_frontier_offsets.begin() + 1, + d_mid_local_frontier_offsets.end(), + aggregate_mid_local_frontier_per_type_local_degree_offsets.begin(), + d_lasts.begin()); + std::vector h_lasts(d_lasts.size()); + raft::update_host(h_lasts.data(), d_lasts.data(), d_lasts.size(), handle.get_stream()); + handle.sync_stream(); + std::adjacent_difference( + h_lasts.begin(), h_lasts.end(), 
mid_local_frontier_per_type_local_degree_sums.begin()); + } + + // shuffle local degrees & biases + + rmm::device_uvector mid_frontier_gathered_per_type_local_degree_offsets( + 0, handle.get_stream()); + { + rmm::device_uvector mid_frontier_gathered_per_type_local_degrees( + 0, handle.get_stream()); + std::tie(mid_frontier_gathered_per_type_local_degrees, std::ignore) = + shuffle_values(minor_comm, + aggregate_mid_local_frontier_per_type_local_degrees.data(), + mid_local_frontier_sizes, + handle.get_stream()); + aggregate_mid_local_frontier_per_type_local_degrees.resize(0, handle.get_stream()); + aggregate_mid_local_frontier_per_type_local_degrees.shrink_to_fit(handle.get_stream()); + mid_frontier_gathered_per_type_local_degree_offsets.resize( + mid_frontier_gathered_per_type_local_degrees.size() + 1, handle.get_stream()); + mid_frontier_gathered_per_type_local_degree_offsets.set_element_to_zero_async( + 0, handle.get_stream()); + thrust::inclusive_scan(handle.get_thrust_policy(), + mid_frontier_gathered_per_type_local_degrees.begin(), + mid_frontier_gathered_per_type_local_degrees.end(), + mid_frontier_gathered_per_type_local_degree_offsets.begin() + 1); + } + + rmm::device_uvector mid_frontier_gathered_biases(0, handle.get_stream()); + std::tie(mid_frontier_gathered_biases, std::ignore) = + shuffle_values(minor_comm, + aggregate_mid_local_frontier_biases.data(), + mid_local_frontier_per_type_local_degree_sums, + handle.get_stream()); + aggregate_mid_local_frontier_biases.resize(0, handle.get_stream()); + aggregate_mid_local_frontier_biases.shrink_to_fit(handle.get_stream()); + + auto mid_frontier_per_type_degree_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(frontier_indices.begin(), frontier_edge_types.begin()) + + frontier_partition_offsets[1], + cuda::proclaim_return_type( + [frontier_per_type_degrees = raft::device_span(frontier_per_type_degrees.data(), + frontier_per_type_degrees.size()), + num_edge_types] __device__(auto pair) { + 
return frontier_per_type_degrees[thrust::get<0>(pair) * num_edge_types + + thrust::get<1>(pair)]; + })); + rmm::device_uvector mid_frontier_per_type_degree_offsets(mid_frontier_size + 1, + handle.get_stream()); + mid_frontier_per_type_degree_offsets.set_element_to_zero_async(0, handle.get_stream()); + thrust::inclusive_scan(handle.get_thrust_policy(), + mid_frontier_per_type_degree_first, + mid_frontier_per_type_degree_first + mid_frontier_size, + mid_frontier_per_type_degree_offsets.begin() + 1); + rmm::device_uvector mid_frontier_biases(mid_frontier_gathered_biases.size(), + handle.get_stream()); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(mid_frontier_size), + [mid_frontier_gathered_per_type_local_degree_offsets = + raft::device_span(mid_frontier_gathered_per_type_local_degree_offsets.data(), + mid_frontier_gathered_per_type_local_degree_offsets.size()), + mid_frontier_gathered_biases = raft::device_span( + mid_frontier_gathered_biases.data(), mid_frontier_gathered_biases.size()), + mid_frontier_per_type_degree_offsets = + raft::device_span(mid_frontier_per_type_degree_offsets.data(), + mid_frontier_per_type_degree_offsets.size()), + mid_frontier_biases = + raft::device_span(mid_frontier_biases.data(), mid_frontier_biases.size()), + minor_comm_size, + mid_frontier_size] __device__(size_t i) { + auto output_offset = mid_frontier_per_type_degree_offsets[i]; + for (int j = 0; j < minor_comm_size; ++j) { + auto input_offset = + mid_frontier_gathered_per_type_local_degree_offsets[mid_frontier_size * j + i]; + auto input_size = + mid_frontier_gathered_per_type_local_degree_offsets[mid_frontier_size * j + i + 1] - + input_offset; + thrust::copy(thrust::seq, + mid_frontier_gathered_biases.begin() + input_offset, + mid_frontier_gathered_biases.begin() + input_offset + input_size, + mid_frontier_biases.begin() + output_offset); + output_offset += input_size; + } + }); + + // now sample and 
update indices + + rmm::device_uvector mid_frontier_output_start_displacements(mid_frontier_size, + handle.get_stream()); + auto pair_first = + thrust::make_zip_iterator(frontier_indices.begin(), frontier_edge_types.begin()) + + frontier_partition_offsets[1]; + thrust::transform( + handle.get_thrust_policy(), + pair_first, + pair_first + mid_frontier_size, + mid_frontier_output_start_displacements.begin(), + cuda::proclaim_return_type( + [K_offsets = raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + K_sum] __device__(auto pair) { + auto idx = thrust::get<0>(pair); + auto type = thrust::get<1>(pair); + return idx * K_sum + K_offsets[type]; + })); + + compute_heterogeneous_biased_sampling_index_without_replacement( + handle, + std::nullopt, + raft::device_span( + frontier_edge_types.data() + frontier_partition_offsets[1], mid_frontier_size), + raft::device_span(mid_frontier_per_type_degree_offsets.data(), + mid_frontier_per_type_degree_offsets.size()), + raft::device_span(mid_frontier_biases.data(), mid_frontier_biases.size()), + raft::device_span(mid_frontier_output_start_displacements.data(), + mid_frontier_output_start_displacements.size()), + raft::device_span(per_type_nbr_indices.data(), per_type_nbr_indices.size()), + std::nullopt, + rng_state, + raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + false); + } + + auto high_frontier_size = frontier_partition_offsets[3] - frontier_partition_offsets[2]; + + auto high_local_frontier_sizes = + host_scalar_allgather(minor_comm, high_frontier_size, handle.get_stream()); + std::vector high_local_frontier_offsets(high_local_frontier_sizes.size() + 1); + high_local_frontier_offsets[0] = 0; + std::inclusive_scan(high_local_frontier_sizes.begin(), + high_local_frontier_sizes.end(), + high_local_frontier_offsets.begin() + 1); + + if (high_local_frontier_offsets.back() > 0) { + // aggregate frontier index & type pairs with their degrees in the high range + + auto 
aggregate_high_local_frontier_index_type_pairs = + allocate_dataframe_buffer>( + high_local_frontier_offsets.back(), handle.get_stream()); + device_allgatherv( + minor_comm, + thrust::make_zip_iterator(frontier_indices.begin(), frontier_edge_types.begin()) + + frontier_partition_offsets[2], + get_dataframe_buffer_begin(aggregate_high_local_frontier_index_type_pairs), + high_local_frontier_sizes, + std::vector(high_local_frontier_offsets.begin(), + high_local_frontier_offsets.end() - 1), + handle.get_stream()); + + // local sample and update indices + + rmm::device_uvector aggregate_high_local_frontier_output_offsets( + high_local_frontier_offsets.back(), handle.get_stream()); + { + auto K_first = thrust::make_transform_iterator( + std::get<1>(aggregate_high_local_frontier_index_type_pairs).begin(), + cuda::proclaim_return_type( + [d_K_offsets = raft::device_span( + d_K_offsets.data(), d_K_offsets.size())] __device__(auto type) { + return d_K_offsets[type + 1] - d_K_offsets[type]; + })); + aggregate_high_local_frontier_output_offsets.set_element_to_zero_async(0, + handle.get_stream()); + thrust::inclusive_scan( + handle.get_thrust_policy(), + K_first, + K_first + std::get<1>(aggregate_high_local_frontier_index_type_pairs).size(), + aggregate_high_local_frontier_output_offsets.begin() + 1); + } + + rmm::device_uvector aggregate_high_local_frontier_per_type_local_nbr_indices( + aggregate_high_local_frontier_output_offsets.back_element(handle.get_stream()), + handle.get_stream()); + rmm::device_uvector aggregate_high_local_frontier_keys( + aggregate_high_local_frontier_per_type_local_nbr_indices.size(), handle.get_stream()); + for (size_t i = 0; i < num_local_edge_partitions; ++i) { + rmm::device_uvector unique_key_indices_for_key_indices(high_local_frontier_sizes[i], + handle.get_stream()); + thrust::gather( + handle.get_thrust_policy(), + std::get<0>(aggregate_high_local_frontier_index_type_pairs).begin() + + high_local_frontier_offsets[i], + 
std::get<0>(aggregate_high_local_frontier_index_type_pairs).begin() + + high_local_frontier_offsets[i + 1], + aggregate_local_frontier_key_idx_to_unique_key_idx.begin() + local_frontier_offsets[i], + unique_key_indices_for_key_indices.begin()); + compute_heterogeneous_biased_sampling_index_without_replacement( + handle, + std::make_optional>( + unique_key_indices_for_key_indices.data(), unique_key_indices_for_key_indices.size()), + raft::device_span( + std::get<1>(aggregate_high_local_frontier_index_type_pairs).data() + + high_local_frontier_offsets[i], + high_local_frontier_offsets[i + 1] - high_local_frontier_offsets[i]), + raft::device_span( + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i] * num_edge_types, + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) * + num_edge_types + + 1), + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), + raft::device_span( + aggregate_high_local_frontier_output_offsets.data() + high_local_frontier_offsets[i], + (high_local_frontier_offsets[i + 1] - high_local_frontier_offsets[i]) + 1), + raft::device_span( + aggregate_high_local_frontier_per_type_local_nbr_indices.data(), + aggregate_high_local_frontier_per_type_local_nbr_indices.size()), + std::make_optional>(aggregate_high_local_frontier_keys.data(), + aggregate_high_local_frontier_keys.size()), + rng_state, + raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + false); + } + + // shuffle local sampling outputs + + std::vector tx_counts(high_local_frontier_sizes); + { + rmm::device_uvector d_high_local_frontier_offsets( + high_local_frontier_offsets.size(), handle.get_stream()); + raft::update_device(d_high_local_frontier_offsets.data(), + high_local_frontier_offsets.data(), + high_local_frontier_offsets.size(), + handle.get_stream()); + rmm::device_uvector d_lasts(num_local_edge_partitions, 
handle.get_stream()); + thrust::gather(handle.get_thrust_policy(), + d_high_local_frontier_offsets.begin() + 1, + d_high_local_frontier_offsets.end(), + aggregate_high_local_frontier_output_offsets.begin(), + d_lasts.begin()); + std::vector h_lasts(d_lasts.size()); + raft::update_host(h_lasts.data(), d_lasts.data(), d_lasts.size(), handle.get_stream()); + handle.sync_stream(); + std::adjacent_difference(h_lasts.begin(), h_lasts.end(), tx_counts.begin()); + } + rmm::device_uvector high_frontier_gathered_per_type_local_nbr_indices( + 0, handle.get_stream()); + std::tie(high_frontier_gathered_per_type_local_nbr_indices, std::ignore) = + shuffle_values(minor_comm, + aggregate_high_local_frontier_per_type_local_nbr_indices.data(), + tx_counts, + handle.get_stream()); + rmm::device_uvector high_frontier_gathered_keys(0, handle.get_stream()); + std::tie(high_frontier_gathered_keys, std::ignore) = shuffle_values( + minor_comm, aggregate_high_local_frontier_keys.data(), tx_counts, handle.get_stream()); + aggregate_high_local_frontier_per_type_local_nbr_indices.resize(0, handle.get_stream()); + aggregate_high_local_frontier_per_type_local_nbr_indices.shrink_to_fit(handle.get_stream()); + aggregate_high_local_frontier_keys.resize(0, handle.get_stream()); + aggregate_high_local_frontier_keys.shrink_to_fit(handle.get_stream()); + + // merge local sampling outputs + + rmm::device_uvector high_frontier_output_offsets(high_frontier_size + 1, + handle.get_stream()); + { + auto K_first = thrust::make_transform_iterator( + frontier_edge_types + frontier_partition_offsets[2], + cuda::proclaim_return_type( + [K_offsets = raft::device_span( + d_K_offsets.data(), d_K_offsets.size())] __device__(auto type) { + return K_offsets[type + 1] - K_offsets[type]; + })); + high_frontier_output_offsets.set_element_to_zero_async(0, handle.get_stream()); + thrust::inclusive_scan(handle.get_thrust_policy(), + K_first, + K_first + high_frontier_size, + high_frontier_output_offsets.begin() + 1); + } + + 
rmm::device_uvector high_frontier_per_type_nbr_indices( + high_frontier_output_offsets.back_element(handle.get_stream()) * minor_comm_size, + handle.get_stream()); + rmm::device_uvector high_frontier_keys(high_frontier_per_type_nbr_indices.size(), + handle.get_stream()); + auto index_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [offsets = raft::device_span(high_frontier_output_offsets.data(), + high_frontier_output_offsets.size()), + minor_comm_size] __device__(size_t i) { + auto idx = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(offsets.begin() + 1, offsets.end(), i / minor_comm_size)); + auto K = offsets[idx + 1] - offsets[idx]; + auto minor_comm_rank = (i - offsets[idx] * minor_comm_size) / K; + return minor_comm_rank * offsets[offsets.size() - 1] + offsets[idx] + + (i - offsets[idx] * minor_comm_size) % K; + })); + auto high_frontier_gathered_per_type_nbr_idx_first = thrust::make_transform_iterator( + thrust::counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [frontier_partitioned_per_type_local_degree_displacements = + raft::device_span( + (*frontier_partitioned_per_type_local_degree_displacements).data(), + (*frontier_partitioned_per_type_local_degree_displacements).size()), + high_frontier_indices = raft::device_span( + frontier_indices.data() + frontier_partition_offsets[2], high_frontier_size), + high_frontier_edge_types = raft::device_span( + frontier_edge_types.data() + frontier_partition_offsets[2], high_frontier_size), + high_frontier_gathered_per_type_local_nbr_indices = raft::device_span( + high_frontier_gathered_per_type_local_nbr_indices.data(), + high_frontier_gathered_per_type_local_nbr_indices.size()), + offsets = raft::device_span(high_frontier_output_offsets.data(), + high_frontier_output_offsets.size()), + minor_comm_size] __device__(size_t i) { + auto minor_comm_rank = static_cast(i / offsets[offsets.size() - 1]); + auto idx = 
thrust::distance( + offsets.begin() + 1, + thrust::upper_bound( + offsets.begin() + 1, offsets.end(), i % offsets[offsets.size() - 1])); + auto frontier_idx = high_frontier_indices[idx]; + return frontier_partitioned_per_type_local_degree_displacements[frontier_idx * + minor_comm_size + + minor_comm_rank] + + high_frontier_gathered_per_type_local_nbr_indices[i]; + })); + thrust::gather(handle.get_thrust_policy(), + index_first, + index_first + high_frontier_per_type_nbr_indices.size(), + thrust::make_zip_iterator(high_frontier_gathered_per_type_nbr_idx_first, + high_frontier_gathered_keys.begin()), + thrust::make_zip_iterator(high_frontier_per_type_nbr_indices.begin(), + high_frontier_keys.begin())); + high_frontier_gathered_per_type_local_nbr_indices.resize(0, handle.get_stream()); + high_frontier_gathered_per_type_local_nbr_indices.shrink_to_fit(handle.get_stream()); + high_frontier_gathered_keys.resize(0, handle.get_stream()); + high_frontier_gathered_keys.shrink_to_fit(handle.get_stream()); + + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + size_t tmp_storage_bytes{0}; + + rmm::device_uvector high_frontier_segment_sorted_per_type_nbr_indices( + high_frontier_per_type_nbr_indices.size(), handle.get_stream()); + rmm::device_uvector high_frontier_segment_sorted_keys(high_frontier_keys.size(), + handle.get_stream()); + auto offset_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [offsets = raft::device_span(high_frontier_output_offsets.data(), + high_frontier_output_offsets.size()), + minor_comm_size] __device__(auto i) { return offsets[i] * minor_comm_size; })); + cub::DeviceSegmentedSort::SortPairs( + static_cast(nullptr), + tmp_storage_bytes, + high_frontier_keys.data(), + high_frontier_segment_sorted_keys.data(), + high_frontier_per_type_nbr_indices.data(), + high_frontier_segment_sorted_per_type_nbr_indices.data(), + high_frontier_output_offsets.back_element(handle.get_stream()) 
* minor_comm_size, + high_frontier_size, + offset_first, + offset_first + 1, + handle.get_stream()); + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + cub::DeviceSegmentedSort::SortPairs( + d_tmp_storage.data(), + tmp_storage_bytes, + high_frontier_keys.data(), + high_frontier_segment_sorted_keys.data(), + high_frontier_per_type_nbr_indices.data(), + high_frontier_segment_sorted_per_type_nbr_indices.data(), + high_frontier_output_offsets.back_element(handle.get_stream()) * minor_comm_size, + high_frontier_size, + offset_first, + offset_first + 1, + handle.get_stream()); + + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(high_frontier_size), + [high_frontier_indices = raft::device_span( + frontier_indices.data() + frontier_partition_offsets[2], high_frontier_size), + high_frontier_edge_types = raft::device_span( + frontier_edge_types.data() + frontier_partition_offsets[2], high_frontier_size), + high_frontier_segment_sorted_nbr_indices = raft::device_span( + high_frontier_segment_sorted_per_type_nbr_indices.data(), + high_frontier_segment_sorted_per_type_nbr_indices.size()), + offsets = raft::device_span(high_frontier_output_offsets.data(), + high_frontier_output_offsets.size()), + per_type_nbr_indices = + raft::device_span(per_type_nbr_indices.data(), per_type_nbr_indices.size()), + K_offsets = raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + K_sum, + minor_comm_size] __device__(size_t i) { + auto type = high_frontier_edge_types[i]; + auto K = K_offsets[type + 1] - K_offsets[type]; + thrust::copy( + thrust::seq, + high_frontier_segment_sorted_nbr_indices.begin() + offsets[i] * minor_comm_size, + high_frontier_segment_sorted_nbr_indices.begin() + offsets[i] * minor_comm_size + K, + per_type_nbr_indices.begin() + high_frontier_indices[i] * K_sum + K_offsets[type]); + }); + } + + 
rmm::device_uvector per_type_local_nbr_indices(0, handle.get_stream()); + rmm::device_uvector edge_types(0, handle.get_stream()); + std::tie(per_type_local_nbr_indices, edge_types, key_indices, local_frontier_sample_offsets) = + shuffle_and_compute_per_type_local_nbr_values( + handle, + std::move(per_type_nbr_indices), + raft::device_span( + (*frontier_partitioned_per_type_local_degree_displacements).data(), + (*frontier_partitioned_per_type_local_degree_displacements).size()), + raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + K_sum, + cugraph::invalid_edge_id_v); + + local_nbr_indices = std::move(per_type_local_nbr_indices); + auto triplet_first = thrust::make_zip_iterator( + local_nbr_indices.begin(), edge_types.begin(), (*key_indices).begin()); + for (size_t i = 0; i < num_local_edge_partitions; ++i) { + thrust::transform( + handle.get_thrust_policy(), + triplet_first + local_frontier_sample_offsets[i], + triplet_first + local_frontier_sample_offsets[i + 1], + local_nbr_indices.begin(), + cuda::proclaim_return_type( + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), + unique_key_per_type_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i] * num_edge_types, + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) * + num_edge_types + + 1)] __device__(auto triplet) { + auto per_type_local_nbr_idx = thrust::get<0>(triplet); + auto type = thrust::get<1>(triplet); + auto key_idx = thrust::get<2>(triplet); + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + auto type_start_offset = static_cast( + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + type] - + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types]); + return 
type_start_offset + per_type_local_nbr_idx; + })); + } + } else { // minor_comm_size == 1 + rmm::device_uvector per_type_local_nbr_indices(local_frontier_offsets.back() * K_sum, + handle.get_stream()); + + // sample from low-degree vertices + + if (frontier_partition_offsets[1] > 0) { + auto pair_first = + thrust::make_zip_iterator(frontier_indices.begin(), frontier_edge_types.begin()); + thrust::for_each( + handle.get_thrust_policy(), + pair_first, + pair_first + frontier_partition_offsets[1], + [frontier_per_type_degrees = raft::device_span( + frontier_per_type_degrees.data(), frontier_per_type_degrees.size()), + per_type_local_nbr_indices = raft::device_span(per_type_local_nbr_indices.data(), + per_type_local_nbr_indices.size()), + K_offsets = raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + num_edge_types, + K_sum, + invalid_idx = cugraph::invalid_edge_id_v] __device__(auto pair) { + auto key_idx = thrust::get<0>(pair); + auto type = static_cast(thrust::get<1>(pair)); + auto degree = frontier_per_type_degrees[key_idx * num_edge_types + type]; + for (size_t j = 0; j < degree; ++j) { + *(per_type_local_nbr_indices.begin() + key_idx * K_sum + K_offsets[type] + j) = j; + } + thrust::fill( + thrust::seq, + per_type_local_nbr_indices.begin() + key_idx * K_sum + K_offsets[type] + degree, + per_type_local_nbr_indices.begin() + key_idx * K_sum + K_offsets[type + 1], + invalid_idx); + }); + } + + // sample from mid & high-degree vertices + + auto mid_and_high_frontier_size = frontier_partition_offsets[3] - frontier_partition_offsets[1]; + + if (mid_and_high_frontier_size > 0) { + rmm::device_uvector unique_key_indices_for_key_indices(mid_and_high_frontier_size, + handle.get_stream()); + thrust::gather( + handle.get_thrust_policy(), + frontier_indices.begin() + frontier_partition_offsets[1], + frontier_indices.begin() + frontier_partition_offsets[1] + mid_and_high_frontier_size, + aggregate_local_frontier_key_idx_to_unique_key_idx.begin(), + 
unique_key_indices_for_key_indices.begin()); + + rmm::device_uvector mid_and_high_frontier_output_start_displacements( + mid_and_high_frontier_size, handle.get_stream()); + auto pair_first = + thrust::make_zip_iterator(frontier_indices.begin(), frontier_edge_types.begin()); + thrust::transform( + handle.get_thrust_policy(), + pair_first + frontier_partition_offsets[1], + pair_first + frontier_partition_offsets[1] + mid_and_high_frontier_size, + mid_and_high_frontier_output_start_displacements.begin(), + cuda::proclaim_return_type( + [K_offsets = raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + K_sum] __device__(auto pair) { + auto idx = thrust::get<0>(pair); + auto type = thrust::get<1>(pair); + return idx * K_sum + (K_offsets[type + 1] - K_offsets[type]); + })); + + compute_heterogeneous_biased_sampling_index_without_replacement( + handle, + std::make_optional>( + unique_key_indices_for_key_indices.data(), unique_key_indices_for_key_indices.size()), + raft::device_span(frontier_edge_types.data(), + frontier_edge_types.size()), + raft::device_span( + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.size()), + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), + std::make_optional>( + mid_and_high_frontier_output_start_displacements.data(), + mid_and_high_frontier_output_start_displacements.size()), + raft::device_span(per_type_local_nbr_indices.data(), + per_type_local_nbr_indices.size()), + std::nullopt, + rng_state, + raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + false); + } + + local_frontier_sample_offsets = std::vector{0, per_type_local_nbr_indices.size()}; + + local_nbr_indices = std::move(per_type_local_nbr_indices); + { + auto pair_first = thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{0}), + local_nbr_indices.begin()); + thrust::transform( + 
handle.get_thrust_policy(),
+      pair_first,
+      pair_first + local_nbr_indices.size(),
+      local_nbr_indices.begin(),
+      cuda::proclaim_return_type(
+        [per_type_local_degree_offsets = raft::device_span(
+           aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data(),
+           aggregate_local_frontier_unique_key_per_type_local_degree_offsets.size()),
+         K_offsets = raft::device_span(d_K_offsets.data(), d_K_offsets.size()),
+         num_edge_types, K_sum] __device__(auto pair) {
+          auto i = thrust::get<0>(pair);
+          auto per_type_local_nbr_idx = thrust::get<1>(pair);
+          auto idx = i / K_sum;
+          auto type = static_cast(thrust::distance(
+            K_offsets.begin() + 1,
+            thrust::upper_bound(thrust::seq, K_offsets.begin() + 1, K_offsets.end(), i % K_sum)));
+          auto type_start_offset =
+            static_cast(per_type_local_degree_offsets[idx * num_edge_types + type] -
+                        per_type_local_degree_offsets[idx * num_edge_types]);
+          return type_start_offset + per_type_local_nbr_idx;
+        }));
+    }
+  }
+
+  return std::make_tuple(
+    std::move(local_nbr_indices), std::move(key_indices), std::move(local_frontier_sample_offsets));
+}
+
+// skip conversion if local neighbor index is cugraph::invalid_edge_id_v
+template
+rmm::device_uvector convert_to_unmasked_local_nbr_idx(
+  raft::handle_t const& handle,
+  GraphViewType const& graph_view,
+  VertexIterator aggregate_local_frontier_major_first,
+  rmm::device_uvector&& local_nbr_indices,
+  std::optional> key_indices,
+  raft::host_span local_frontier_sample_offsets,
+  raft::host_span local_frontier_offsets,
+  size_t K)
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t = typename GraphViewType::edge_type;
+  static_assert(
+    std::is_same_v::value_type>);
+
+  auto edge_mask_view = graph_view.edge_mask_view();
+
+  auto [aggregate_local_frontier_unique_majors,
+        aggregate_local_frontier_major_idx_to_unique_major_idx,
+        local_frontier_unique_major_offsets] =
+    compute_unique_keys(handle, aggregate_local_frontier_major_first, local_frontier_offsets);
+
+  // to avoid
searching the entire neighbor list K times for high degree vertices with edge masking + auto local_frontier_unique_major_valid_local_nbr_count_inclusive_sums = + compute_valid_local_nbr_count_inclusive_sums( + handle, + graph_view, + aggregate_local_frontier_unique_majors.begin(), + raft::host_span(local_frontier_unique_major_offsets.data(), + local_frontier_unique_major_offsets.size())); + + auto sample_major_idx_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [K, + key_indices = key_indices ? thrust::make_optional>( + (*key_indices).data(), (*key_indices).size()) : thrust::nullopt] __device__(size_t i) { return key_indices ? (*key_indices)[i] : i / K; })); @@ -2464,7 +3868,7 @@ rmm::device_uvector convert_to_unmasked_local : thrust::nullopt; auto edge_partition_frontier_major_first = - aggregate_local_frontier_major_first + local_frontier_displacements[i]; + aggregate_local_frontier_major_first + local_frontier_offsets[i]; thrust::transform_if( handle.get_thrust_policy(), pair_first + local_frontier_sample_offsets[i], @@ -2478,9 +3882,8 @@ rmm::device_uvector convert_to_unmasked_local edge_partition_e_mask, edge_partition_frontier_major_first, raft::device_span( - aggregate_local_frontier_major_idx_to_unique_major_idx.data() + - local_frontier_displacements[i], - local_frontier_sizes[i]), + aggregate_local_frontier_major_idx_to_unique_major_idx.data() + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), thrust::make_tuple( raft::device_span( std::get<0>(local_frontier_unique_major_valid_local_nbr_count_inclusive_sums[i]).data(), @@ -2500,15 +3903,13 @@ template std::tuple, std::optional>, std::vector> -uniform_sample_and_compute_local_nbr_indices( - raft::handle_t const& handle, - GraphViewType const& graph_view, - KeyIterator aggregate_local_frontier_key_first, - raft::host_span local_frontier_displacements, - raft::host_span local_frontier_sizes, - 
raft::random::RngState& rng_state, - size_t K, - bool with_replacement) +uniform_sample_and_compute_local_nbr_indices(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyIterator aggregate_local_frontier_key_first, + raft::host_span local_frontier_offsets, + raft::random::RngState& rng_state, + size_t K, + bool with_replacement) { using edge_t = typename GraphViewType::edge_type; using vertex_t = typename GraphViewType::vertex_type; @@ -2531,12 +3932,8 @@ uniform_sample_and_compute_local_nbr_indices( std::optional> frontier_partitioned_local_degree_displacements{ std::nullopt}; { - auto aggregate_local_frontier_local_degrees = - compute_aggregate_local_frontier_local_degrees(handle, - graph_view, - aggregate_local_frontier_major_first, - local_frontier_displacements, - local_frontier_sizes); + auto aggregate_local_frontier_local_degrees = compute_aggregate_local_frontier_local_degrees( + handle, graph_view, aggregate_local_frontier_major_first, local_frontier_offsets); if (minor_comm_size > 1) { std::tie(frontier_degrees, frontier_partitioned_local_degree_displacements) = @@ -2544,8 +3941,8 @@ uniform_sample_and_compute_local_nbr_indices( handle, raft::device_span(aggregate_local_frontier_local_degrees.data(), aggregate_local_frontier_local_degrees.size()), - local_frontier_displacements, - local_frontier_sizes); + local_frontier_offsets, + 1); aggregate_local_frontier_local_degrees.resize(0, handle.get_stream()); aggregate_local_frontier_local_degrees.shrink_to_fit(handle.get_stream()); } else { @@ -2577,17 +3974,22 @@ uniform_sample_and_compute_local_nbr_indices( // 3. shuffle neighbor indices - auto [local_nbr_indices, key_indices, local_frontier_sample_offsets] = - shuffle_and_compute_local_nbr_values( - handle, - std::move(nbr_indices), - frontier_partitioned_local_degree_displacements - ? 
std::make_optional>( - (*frontier_partitioned_local_degree_displacements).data(), - (*frontier_partitioned_local_degree_displacements).size()) - : std::nullopt, - K, - cugraph::invalid_edge_id_v); + rmm::device_uvector local_nbr_indices(0, handle.get_stream()); + std::optional> key_indices{std::nullopt}; + std::vector local_frontier_sample_offsets{}; + if (minor_comm_size > 1) { + std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = + shuffle_and_compute_local_nbr_values( + handle, + std::move(nbr_indices), + raft::device_span((*frontier_partitioned_local_degree_displacements).data(), + (*frontier_partitioned_local_degree_displacements).size()), + K, + cugraph::invalid_edge_id_v); + } else { + local_nbr_indices = std::move(nbr_indices); + local_frontier_sample_offsets = {size_t{0}, local_nbr_indices.size()}; + } // 4. convert neighbor indices in the neighbor list considering edge mask to neighbor indices in // the neighbor list ignoring edge mask @@ -2603,8 +4005,7 @@ uniform_sample_and_compute_local_nbr_indices( : std::nullopt, raft::host_span(local_frontier_sample_offsets.data(), local_frontier_sample_offsets.size()), - local_frontier_displacements, - local_frontier_sizes, + local_frontier_offsets, K); } @@ -2629,8 +4030,7 @@ biased_sample_and_compute_local_nbr_indices( EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, BiasEdgeOp bias_e_op, - raft::host_span local_frontier_displacements, - raft::host_span local_frontier_sizes, + raft::host_span local_frontier_offsets, raft::random::RngState& rng_state, size_t K, bool with_replacement, @@ -2662,13 +4062,11 @@ biased_sample_and_compute_local_nbr_indices( auto [aggregate_local_frontier_unique_keys, aggregate_local_frontier_key_idx_to_unique_key_idx, - local_frontier_unique_key_displacements, - local_frontier_unique_key_sizes] = compute_unique_keys(handle, - aggregate_local_frontier_key_first, - local_frontier_displacements, - local_frontier_sizes); + 
local_frontier_unique_key_offsets] = + compute_unique_keys(handle, aggregate_local_frontier_key_first, local_frontier_offsets); auto [aggregate_local_frontier_unique_key_biases, + aggregate_local_frontier_unique_key_nz_bias_indices, aggregate_local_frontier_unique_key_local_degree_offsets] = compute_aggregate_local_frontier_biases( handle, @@ -2678,10 +4076,8 @@ biased_sample_and_compute_local_nbr_indices( edge_dst_value_input, edge_value_input, bias_e_op, - raft::host_span(local_frontier_unique_key_displacements.data(), - local_frontier_unique_key_displacements.size()), - raft::host_span(local_frontier_unique_key_sizes.data(), - local_frontier_unique_key_sizes.size()), + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), do_expensive_check); // 2. sample neighbor indices and shuffle neighbor indices @@ -2693,14 +4089,11 @@ biased_sample_and_compute_local_nbr_indices( std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = homogeneous_biased_sample_with_replacement( handle, - local_frontier_displacements, - local_frontier_sizes, + local_frontier_offsets, raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), aggregate_local_frontier_key_idx_to_unique_key_idx.size()), - raft::host_span(local_frontier_unique_key_displacements.data(), - local_frontier_unique_key_displacements.size()), - raft::host_span(local_frontier_unique_key_sizes.data(), - local_frontier_unique_key_sizes.size()), + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), raft::device_span(aggregate_local_frontier_unique_key_biases.data(), aggregate_local_frontier_unique_key_biases.size()), raft::device_span( @@ -2712,14 +4105,11 @@ biased_sample_and_compute_local_nbr_indices( std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = homogeneous_biased_sample_without_replacement( handle, - local_frontier_displacements, - local_frontier_sizes, + 
local_frontier_offsets, raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), aggregate_local_frontier_key_idx_to_unique_key_idx.size()), - raft::host_span(local_frontier_unique_key_displacements.data(), - local_frontier_unique_key_displacements.size()), - raft::host_span(local_frontier_unique_key_sizes.data(), - local_frontier_unique_key_sizes.size()), + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), raft::device_span(aggregate_local_frontier_unique_key_biases.data(), aggregate_local_frontier_unique_key_biases.size()), raft::device_span( @@ -2729,7 +4119,65 @@ biased_sample_and_compute_local_nbr_indices( K); } - // 3. convert neighbor indices in the neighbor list considering edge mask to neighbor indices in + // 3. remap non-zero bias local neighbor indices to local neighbor indices + + if (key_indices) { + auto pair_first = thrust::make_zip_iterator(local_nbr_indices.begin(), (*key_indices).begin()); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + thrust::transform( + handle.get_thrust_policy(), + pair_first + local_frontier_sample_offsets[i], + pair_first + local_frontier_sample_offsets[i + 1], + local_nbr_indices.begin() + local_frontier_sample_offsets[i], + cuda::proclaim_return_type( + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), + unique_key_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i], + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + 1), + unique_key_nz_bias_indices = raft::device_span( + aggregate_local_frontier_unique_key_nz_bias_indices.data(), + aggregate_local_frontier_unique_key_nz_bias_indices.size())] __device__(auto pair) { + auto nz_bias_idx = 
thrust::get<0>(pair); + auto key_idx = thrust::get<1>(pair); + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + return unique_key_nz_bias_indices[unique_key_local_degree_offsets[unique_key_idx] + + nz_bias_idx]; + })); + } + } else { + auto pair_first = thrust::make_zip_iterator(local_nbr_indices.begin(), + thrust::make_counting_iterator(size_t{0})); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + thrust::transform( + handle.get_thrust_policy(), + pair_first + local_frontier_sample_offsets[i], + pair_first + local_frontier_sample_offsets[i + 1], + local_nbr_indices.begin() + local_frontier_sample_offsets[i], + cuda::proclaim_return_type( + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), + unique_key_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i], + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + 1), + unique_key_nz_bias_indices = raft::device_span( + aggregate_local_frontier_unique_key_nz_bias_indices.data(), + aggregate_local_frontier_unique_key_nz_bias_indices.size()), + K] __device__(auto pair) { + auto nz_bias_idx = thrust::get<0>(pair); + auto key_idx = thrust::get<1>(pair) / K; + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + return unique_key_nz_bias_indices[unique_key_local_degree_offsets[unique_key_idx] + + nz_bias_idx]; + })); + } + } + + // 4. 
convert neighbor indices in the neighbor list considering edge mask to neighbor indices in // the neighbor list ignoring edge mask if (edge_mask_view) { @@ -2743,8 +4191,7 @@ biased_sample_and_compute_local_nbr_indices( : std::nullopt, raft::host_span(local_frontier_sample_offsets.data(), local_frontier_sample_offsets.size()), - local_frontier_displacements, - local_frontier_sizes, + local_frontier_offsets, K); } @@ -2752,6 +4199,330 @@ biased_sample_and_compute_local_nbr_indices( std::move(local_nbr_indices), std::move(key_indices), std::move(local_frontier_sample_offsets)); } +template +std::tuple, + std::optional>, + std::vector> +biased_sample_and_compute_local_nbr_indices( + raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyIterator aggregate_local_frontier_key_first, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + BiasEdgeOp bias_e_op, + EdgeTypeInputWrapper edge_type_input, + raft::host_span local_frontier_offsets, + raft::random::RngState& rng_state, + raft::host_span Ks, + bool with_replacement, + bool do_expensive_check /* check bias_e_op return values */) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename thrust::iterator_traits::value_type; + + using bias_t = typename edge_op_result_type::type; + using edge_type_t = typename EdgeTypeInputWrapper::value_type; + + int minor_comm_rank{0}; + int minor_comm_size{1}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + minor_comm_rank = minor_comm.get_rank(); + minor_comm_size = minor_comm.get_size(); + } + assert(minor_comm_size == graph_view.number_of_local_edge_partitions()); + + auto num_edge_types = static_cast(Ks.size()); + + auto edge_mask_view = graph_view.edge_mask_view(); + + // 1. 
compute (bias, type) pairs for unique keys (to reduce memory footprint) + + auto [aggregate_local_frontier_unique_keys, + aggregate_local_frontier_key_idx_to_unique_key_idx, + local_frontier_unique_key_offsets] = + compute_unique_keys(handle, aggregate_local_frontier_key_first, local_frontier_offsets); + + auto [aggregate_local_frontier_unique_key_biases, + aggregate_local_frontier_unique_key_types, + aggregate_local_frontier_unique_key_nz_bias_indices, + aggregate_local_frontier_unique_key_local_degree_offsets] = + compute_aggregate_local_frontier_bias_type_pairs( + handle, + graph_view, + get_dataframe_buffer_begin(aggregate_local_frontier_unique_keys), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + bias_e_op, + edge_type_input, + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), + do_expensive_check); + + // 2. Segmented-sort (index, bias, type) triplets based on types (1 segment per key) + + // to limit memory footprint ((1 << 20) is a tuning parameter) + auto approx_nbrs_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount * (1 << 20)); + + auto [h_key_offsets, h_nbr_offsets] = detail::compute_offset_aligned_element_chunks( + handle, + raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + aggregate_local_frontier_unique_key_biases.size(), + approx_nbrs_to_sort_per_iteration); + + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + + auto num_chunks = h_key_offsets.size() - 1; + for (size_t i = 0; i < num_chunks; ++i) { + size_t tmp_storage_bytes{0}; + + rmm::device_uvector segment_sorted_types(h_nbr_offsets[i + 1] - h_nbr_offsets[i], + handle.get_stream()); + rmm::device_uvector sequences(h_nbr_offsets[i + 1] - h_nbr_offsets[i], + handle.get_stream()); + thrust::sequence(handle.get_thrust_policy(), sequences.begin(), sequences.end(), 
size_t{0}); + rmm::device_uvector segment_sorted_sequences(h_nbr_offsets[i + 1] - h_nbr_offsets[i], + handle.get_stream()); + + auto offset_first = thrust::make_transform_iterator( + aggregate_local_frontier_unique_key_local_degree_offsets.data() + h_key_offsets[i], + detail::shift_left_t{h_nbr_offsets[i]}); + cub::DeviceSegmentedSort::SortPairs( + static_cast(nullptr), + tmp_storage_bytes, + aggregate_local_frontier_unique_key_types.begin() + h_nbr_offsets[i], + segment_sorted_types.begin(), + sequences.begin(), + segment_sorted_sequences.begin(), + h_nbr_offsets[i + 1] - h_nbr_offsets[i], + h_key_offsets[i + 1] - h_key_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + cub::DeviceSegmentedSort::SortPairs( + d_tmp_storage.data(), + tmp_storage_bytes, + aggregate_local_frontier_unique_key_types.begin() + h_nbr_offsets[i], + segment_sorted_types.begin(), + sequences.begin(), + segment_sorted_sequences.begin(), + h_nbr_offsets[i + 1] - h_nbr_offsets[i], + h_key_offsets[i + 1] - h_key_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + + thrust::copy(handle.get_thrust_policy(), + segment_sorted_types.begin(), + segment_sorted_types.end(), + aggregate_local_frontier_unique_key_types.begin() + h_nbr_offsets[i]); + + rmm::device_uvector segment_sorted_biases(h_nbr_offsets[i + 1] - h_nbr_offsets[i], + handle.get_stream()); + rmm::device_uvector segment_sorted_nz_bias_indices( + h_nbr_offsets[i + 1] - h_nbr_offsets[i], handle.get_stream()); + thrust::gather( + handle.get_thrust_policy(), + segment_sorted_sequences.begin(), + segment_sorted_sequences.end(), + thrust::make_zip_iterator(aggregate_local_frontier_unique_key_biases.begin(), + aggregate_local_frontier_unique_key_nz_bias_indices.begin()), + thrust::make_zip_iterator(segment_sorted_biases.begin(), + segment_sorted_nz_bias_indices.begin())); 
+    auto segment_sorted_pair_first = thrust::make_zip_iterator(
+      segment_sorted_biases.begin(), segment_sorted_nz_bias_indices.begin());
+    thrust::copy(
+      handle.get_thrust_policy(),
+      segment_sorted_pair_first,
+      segment_sorted_pair_first + segment_sorted_biases.size(),
+      thrust::make_zip_iterator(aggregate_local_frontier_unique_key_biases.begin(),
+                                aggregate_local_frontier_unique_key_nz_bias_indices.begin()));
+  }
+
+  // 3. sample neighbor indices and shuffle neighbor indices
+
+  rmm::device_uvector local_nbr_indices(0, handle.get_stream());
+  std::optional> key_indices{std::nullopt};
+  std::vector local_frontier_sample_offsets{};
+  {
+    rmm::device_uvector aggregate_local_frontier_unique_key_per_type_local_degree_offsets(
+      local_frontier_unique_key_offsets.back() * num_edge_types, handle.get_stream());
+    aggregate_local_frontier_unique_key_per_type_local_degree_offsets.set_element_to_zero_async(
+      0, handle.get_stream());
+    thrust::tabulate(handle.get_thrust_policy(),
+                     aggregate_local_frontier_unique_key_per_type_local_degree_offsets.begin() + 1,
+                     aggregate_local_frontier_unique_key_per_type_local_degree_offsets.end(),
+                     [unique_key_local_degree_offsets = raft::device_span(
+                        aggregate_local_frontier_unique_key_local_degree_offsets.data(),
+                        aggregate_local_frontier_unique_key_local_degree_offsets.size()),
+                      unique_key_types = raft::device_span(
+                        aggregate_local_frontier_unique_key_types.data(),
+                        aggregate_local_frontier_unique_key_types.size()),
+                      num_edge_types] __device__(size_t i) {
+                       auto key_idx = i / num_edge_types;
+                       auto edge_type = static_cast(i % num_edge_types);
+                       auto start_offset = unique_key_local_degree_offsets[key_idx];
+                       auto end_offset = unique_key_local_degree_offsets[key_idx + 1];
+                       auto edge_type_first = unique_key_types.begin() + start_offset;
+                       auto edge_type_last = unique_key_types.begin() + end_offset;
+                       return static_cast(thrust::distance(
+                         thrust::lower_bound(thrust::seq, edge_type_first, edge_type_last, edge_type),
+
thrust::upper_bound(thrust::seq, edge_type_first, edge_type_last, edge_type)));
+                     });
+    thrust::inclusive_scan(
+      handle.get_thrust_policy(),
+      aggregate_local_frontier_unique_key_per_type_local_degree_offsets.begin() + 1,
+      aggregate_local_frontier_unique_key_per_type_local_degree_offsets.end(),
+      aggregate_local_frontier_unique_key_per_type_local_degree_offsets.begin() + 1);
+    aggregate_local_frontier_unique_key_types.resize(0, handle.get_stream());
+    aggregate_local_frontier_unique_key_types.shrink_to_fit(handle.get_stream());
+
+    if (with_replacement) {
+#if 0
+      std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) =
+        heterogeneous_biased_sample_with_replacement(
+          handle,
+          local_frontier_offsets,
+          raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(),
+                            aggregate_local_frontier_key_idx_to_unique_key_idx.size()),
+          raft::host_span(local_frontier_unique_key_offsets.data(),
+                          local_frontier_unique_key_offsets.size()),
+          raft::device_span(aggregate_local_frontier_unique_key_biases.data(),
+                            aggregate_local_frontier_unique_key_biases.size()),
+          raft::device_span(aggregate_local_frontier_unique_key_types.data(),
+                            aggregate_local_frontier_unique_key_types.size()),
+          raft::device_span(
+            aggregate_local_frontier_unique_key_local_degree_offsets.data(),
+            aggregate_local_frontier_unique_key_local_degree_offsets.size()),
+          rng_state,
+          Ks);
+#endif
+    } else {
+#if 1
+      std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) =
+        heterogeneous_biased_sample_without_replacement(
+          handle,
+          local_frontier_offsets,
+          raft::device_span(
+            aggregate_local_frontier_key_idx_to_unique_key_idx.data(),
+            aggregate_local_frontier_key_idx_to_unique_key_idx.size()),
+          raft::host_span(local_frontier_unique_key_offsets.data(),
+                          local_frontier_unique_key_offsets.size()),
+          raft::device_span(aggregate_local_frontier_unique_key_biases.data(),
+                            aggregate_local_frontier_unique_key_biases.size()),
+          raft::device_span(
+
aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data(),
+            aggregate_local_frontier_unique_key_per_type_local_degree_offsets.size()),
+          rng_state,
+          Ks);
+#endif
+    }
+  }
+
+  // 4. Re-map local neighbor indices
+
+  auto K_sum = std::accumulate(Ks.begin(), Ks.end(), size_t{0});
+
+  if (key_indices) {
+    auto pair_first = thrust::make_zip_iterator(local_nbr_indices.begin(), (*key_indices).begin());
+    for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+      thrust::transform(
+        handle.get_thrust_policy(),
+        pair_first + local_frontier_sample_offsets[i],
+        pair_first + local_frontier_sample_offsets[i + 1],
+        local_nbr_indices.begin() + local_frontier_sample_offsets[i],
+        cuda::proclaim_return_type(
+          [key_idx_to_unique_key_idx = raft::device_span(
+             aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i],
+             local_frontier_offsets[i + 1] - local_frontier_offsets[i]),
+           unique_key_local_degree_offsets = raft::device_span(
+             aggregate_local_frontier_unique_key_local_degree_offsets.data() +
+               local_frontier_unique_key_offsets[i],
+             (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + 1),
+           unique_key_nz_bias_indices = raft::device_span(
+             aggregate_local_frontier_unique_key_nz_bias_indices.data(),
+             aggregate_local_frontier_unique_key_nz_bias_indices.size())] __device__(auto pair) {
+            auto nz_bias_idx = thrust::get<0>(pair);
+            auto key_idx = thrust::get<1>(pair);
+            auto unique_key_idx = key_idx_to_unique_key_idx[key_idx];
+            return unique_key_nz_bias_indices[unique_key_local_degree_offsets[unique_key_idx] +
+                                              nz_bias_idx];
+          }));
+    }
+  } else {
+    auto pair_first = thrust::make_zip_iterator(local_nbr_indices.begin(),
+                                                thrust::make_counting_iterator(size_t{0}));
+    for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+      thrust::transform(
+        handle.get_thrust_policy(),
+        pair_first + local_frontier_sample_offsets[i],
+        pair_first +
local_frontier_sample_offsets[i + 1], + local_nbr_indices.begin() + local_frontier_sample_offsets[i], + cuda::proclaim_return_type( + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), + unique_key_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i], + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + 1), + unique_key_nz_bias_indices = raft::device_span( + aggregate_local_frontier_unique_key_nz_bias_indices.data(), + aggregate_local_frontier_unique_key_nz_bias_indices.size()), + K_sum] __device__(auto pair) { + auto nz_bias_idx = thrust::get<0>(pair); + auto key_idx = thrust::get<1>(pair) / K_sum; + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + return unique_key_nz_bias_indices[unique_key_local_degree_offsets[unique_key_idx] + + nz_bias_idx]; + })); + } + } + + // 5. convert neighbor indices in the neighbor list considering edge mask to neighbor indices in + // the neighbor list ignoring edge mask + + if (edge_mask_view) { + local_nbr_indices = convert_to_unmasked_local_nbr_idx( + handle, + graph_view, + thrust_tuple_get_or_identity(aggregate_local_frontier_key_first), + std::move(local_nbr_indices), + key_indices ? 
std::make_optional>((*key_indices).data(), + (*key_indices).size()) + : std::nullopt, + raft::host_span(local_frontier_sample_offsets.data(), + local_frontier_sample_offsets.size()), + local_frontier_offsets, + K_sum); + } + + return std::make_tuple( + std::move(local_nbr_indices), std::move(key_indices), std::move(local_frontier_sample_offsets)); +} + } // namespace detail } // namespace cugraph diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index dfa45cb83cc..4705e50c90b 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -320,11 +320,10 @@ per_v_random_select_transform_e(raft::handle_t const& handle, } else { local_key_list_sizes = std::vector{key_list.size()}; } - std::vector local_key_list_displacements(local_key_list_sizes.size()); - std::exclusive_scan(local_key_list_sizes.begin(), - local_key_list_sizes.end(), - local_key_list_displacements.begin(), - size_t{0}); + std::vector local_key_list_offsets(local_key_list_sizes.size() + 1); + local_key_list_offsets[0] = 0; + std::inclusive_scan( + local_key_list_sizes.begin(), local_key_list_sizes.end(), local_key_list_offsets.begin() + 1); // 1. 
aggregate key_list @@ -332,14 +331,15 @@ per_v_random_select_transform_e(raft::handle_t const& handle, if (minor_comm_size > 1) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - aggregate_local_key_list = allocate_dataframe_buffer( - local_key_list_displacements.back() + local_key_list_sizes.back(), handle.get_stream()); - device_allgatherv(minor_comm, - key_list.begin(), - get_dataframe_buffer_begin(*aggregate_local_key_list), - local_key_list_sizes, - local_key_list_displacements, - handle.get_stream()); + aggregate_local_key_list = + allocate_dataframe_buffer(local_key_list_offsets.back(), handle.get_stream()); + device_allgatherv( + minor_comm, + key_list.begin(), + get_dataframe_buffer_begin(*aggregate_local_key_list), + local_key_list_sizes, + std::vector(local_key_list_offsets.begin(), local_key_list_offsets.end() - 1), + handle.get_stream()); } // 2. randomly select neighbor indices and compute local neighbor indices for every local edge @@ -362,9 +362,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle, graph_view, (minor_comm_size > 1) ? 
get_dataframe_buffer_cbegin(*aggregate_local_key_list) : key_list.begin(), - raft::host_span(local_key_list_displacements.data(), - local_key_list_displacements.size()), - raft::host_span(local_key_list_sizes.data(), local_key_list_sizes.size()), + raft::host_span(local_key_list_offsets.data(), + local_key_list_offsets.size()), rng_state, Ks[0], with_replacement); @@ -384,15 +383,30 @@ per_v_random_select_transform_e(raft::handle_t const& handle, bias_edge_dst_value_input, bias_edge_value_input, bias_e_op, - raft::host_span(local_key_list_displacements.data(), - local_key_list_displacements.size()), - raft::host_span(local_key_list_sizes.data(), local_key_list_sizes.size()), + raft::host_span(local_key_list_offsets.data(), + local_key_list_offsets.size()), rng_state, Ks[0], with_replacement, do_expensive_check); } else { // heterogeneous - CUGRAPH_FAIL("unimplemented."); + std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = + biased_sample_and_compute_local_nbr_indices( + handle, + graph_view, + (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_key_list) + : key_list.begin(), + bias_edge_src_value_input, + bias_edge_dst_value_input, + bias_edge_value_input, + bias_e_op, + edge_type_input, + raft::host_span(local_key_list_offsets.data(), + local_key_list_offsets.size()), + rng_state, + Ks, + with_replacement, + do_expensive_check); } } @@ -415,7 +429,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, auto edge_partition_key_list_first = ((minor_comm_size > 1) ? 
get_dataframe_buffer_cbegin(*aggregate_local_key_list) : key_list.begin()) + - local_key_list_displacements[i]; + local_key_list_offsets[i]; auto edge_partition_sample_local_nbr_index_first = sample_local_nbr_indices.begin() + local_key_list_sample_offsets[i]; From 026874c6b273be8d49b001269174864fbe84eb12 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 25 Jan 2025 17:22:53 -0800 Subject: [PATCH 10/21] heterogeneous biased sampling with replacement --- .../sample_and_compute_local_nbr_indices.cuh | 485 +++++++++--------- ...r_v_random_select_transform_outgoing_e.cuh | 6 +- 2 files changed, 234 insertions(+), 257 deletions(-) diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index be49ad800a6..fb8cccb80de 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -151,7 +151,7 @@ struct convert_pair_to_5tuple_t { partitioned_per_type_local_value_displacements{}; // one partition per gpu in the same // minor_comm raft::device_span tx_counts{}; - raft::device_span K_offsets{}; + raft::device_span K_offsets{}; size_t K_sum; int minor_comm_size{}; value_t invalid_value{}; @@ -1292,7 +1292,7 @@ void compute_heterogeneous_biased_sampling_index_without_replacement( raft::device_span K_offsets, bool jump) { - auto num_edge_types = K_offsets.size() - 1; + auto num_edge_types = static_cast(K_offsets.size() - 1); if (jump) { // Algorithm A-ExpJ CUGRAPH_FAIL( @@ -1985,7 +1985,7 @@ shuffle_and_compute_per_type_local_nbr_values( auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); - auto num_edge_types = K_offsets.size() - 1; + auto num_edge_types = static_cast(K_offsets.size() - 1); auto sample_per_type_local_nbr_values = std::move(sample_per_type_nbr_values); // neighbor value within an edge partition (note that @@ 
-2075,119 +2075,24 @@ shuffle_and_compute_per_type_local_nbr_values( rx_counts.begin(), rx_counts.end(), local_frontier_sample_offsets.begin() + 1); return std::make_tuple(std::move(sample_per_type_local_nbr_values), - std::move(key_indices), std::move(edge_types), + std::move(key_indices), std::move(local_frontier_sample_offsets)); } -#if 0 -// aggregate local frontier (index, type) pairs and compute per-type local degrees (one for each -// pair) -template -thrust::tuple, - rmm::device_uvector, - std::vector, - rmm::device_uvector> -aggregate_sub_frontier_and_compute_per_type_local_degrees( - raft::handle_t const& handle, - raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, - raft::host_span local_frontier_offsets, - raft::device_span aggregate_local_frontier_unique_key_per_type_local_degree_offsets, - raft::host_span local_frontier_unique_key_offsets, - raft::device_span sub_frontier_indices, - raft::device_span sub_frontier_types, - size_t num_edge_types) -{ - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - - auto num_local_edge_partitions = local_frontier_offsets.size() - 1; - - assert(sub_frontier_indices.size() == sub_frontier_types.size()); - - auto sub_frontier_size = sub_frontier_indices.size(); - - auto sub_local_frontier_sizes = - host_scalar_allgather(minor_comm, sub_frontier_size, handle.get_stream()); - std::vector sub_local_frontier_offsets(sub_local_frontier_sizes.size() + 1); - sub_local_frontier_offsets[0] = 0; - std::inclusive_scan(sub_local_frontier_sizes.begin(), - sub_local_frontier_sizes.end(), - sub_local_frontier_offsets.begin() + 1); - - auto aggregate_sub_local_frontier_index_type_pairs = - allocate_dataframe_buffer>(sub_local_frontier_offsets.back(), - handle.get_stream()); - rmm::device_uvector aggregate_sub_local_frontier_per_type_local_degrees( - 0, handle.get_stream()); - if (sub_local_frontier_offsets.back() > 0) { - // aggregate frontier index type pairs - - auto 
aggregate_sub_local_frontier_index_type_pairs = - allocate_dataframe_buffer>( - sub_local_frontier_offsets.back(), handle.get_stream()); - device_allgatherv( - minor_comm, - thrust::make_zip_iterator(sub_frontier_indices.begin(), sub_frontier_types.begin()), - get_dataframe_buffer_beign(aggregate_sub_local_frontier_index_type_pairs), - sub_local_frontier_sizes, - std::vector(sub_local_frontier_offsets.begin(), sub_local_frontier_offsets.end() - 1), - handle.get_stream()); - - // compute per-type local degrees for the aggregated frontier indices - - aggregate_sub_local_frontier_per_type_local_degrees.resize( - size_dataframe_buffer(aggregate_sub_local_frontier_index_type_pairs), handle.get_stream()); - for (size_t i = 0; i < num_local_edge_partitions; ++i) { - thrust::transform( - handle.get_thrust_policy(), - get_dataframe_buffer_begin(aggregate_sub_local_frontier_index_type_pairs) + - sub_local_frontier_offsets[i], - get_dataframe_buffer_begin(aggregate_sub_local_frontier_index_type_pairs) + - sub_local_frontier_offsets[i + 1], - aggregate_sub_local_frontier_per_type_local_degrees.begin() + sub_local_frontier_offsets[i], - cuda::proclaim_return_type( - [key_idx_to_unique_key_idx = raft::device_span( - aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], - local_frontier_offsets[i + 1] - local_frontier_offsets[i]), - unique_key_per_type_local_degree_offsets = raft::device_span( - aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data() + - local_frontier_unique_key_offsets[i] * num_edge_types, - (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) * - num_edge_types + - 1), - num_edge_types] __device__(auto pair) { - auto key_idx = thrust::get<0>(pair); - auto edge_type = thrust::get<1>(pair); - auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; - return static_cast( - unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + edge_type + - 1] - - 
unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + - edge_type]); - })); - } - } - - return std::make_tuple(std::move(std::get<0>(aggregate_sub_local_frontier_index_type_pairs)), - std::move(std::get<1>(aggregate_sub_local_frontier_index_type_pairs)), - std::move(sub_local_frontier_offsets), - std::move(aggregate_sub_local_frontier_per_type_local_degrees)); -} -#endif - -template +template std::tuple /* local_nbr_indices */, std::optional> /* key_indices */, std::vector /* local_frontier_sample_offsets */> -homogeneous_biased_sample_with_replacement( +biased_sample_with_replacement( raft::handle_t const& handle, raft::host_span local_frontier_offsets, raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, raft::host_span local_frontier_unique_key_offsets, raft::device_span aggregate_local_frontier_unique_key_biases, - raft::device_span aggregate_local_frontier_unique_key_local_degree_offsets, + raft::device_span aggregate_local_frontier_unique_key_per_type_local_degree_offsets, raft::random::RngState& rng_state, - size_t K) + raft::host_span Ks) { int minor_comm_rank{0}; int minor_comm_size{1}; @@ -2198,19 +2103,32 @@ homogeneous_biased_sample_with_replacement( } auto num_local_edge_partitions = local_frontier_offsets.size() - 1; + auto num_edge_types = static_cast(Ks.size()); rmm::device_uvector local_nbr_indices(0, handle.get_stream()); std::optional> key_indices{std::nullopt}; std::vector local_frontier_sample_offsets{}; - // compute segmented inclusive sums (one segment per key) + auto K_sum = std::accumulate(Ks.begin(), Ks.end(), size_t{0}); + + rmm::device_uvector d_K_offsets(Ks.size() + 1, handle.get_stream()); + { + std::vector h_K_offsets(d_K_offsets.size()); + h_K_offsets[0] = 0; + std::inclusive_scan(Ks.begin(), Ks.end(), h_K_offsets.begin() + 1); + raft::update_device( + d_K_offsets.data(), h_K_offsets.data(), h_K_offsets.size(), handle.get_stream()); + } + + // compute segmented inclusive sums (one segment per key 
& type pair) auto unique_key_first = thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), cuda::proclaim_return_type( [offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data(), - aggregate_local_frontier_unique_key_local_degree_offsets.size())] __device__(size_t i) { + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_per_type_local_degree_offsets + .size())] __device__(size_t i) { return static_cast(thrust::distance( offsets.begin() + 1, thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i))); @@ -2225,62 +2143,69 @@ homogeneous_biased_sample_with_replacement( aggregate_local_frontier_unique_key_biases.begin(), aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.begin()); - // sum collect local bias values (one value per key) and collect local bias sums + // sum local bias values (one value per key & type pair) and collect local bias sums - auto aggregate_local_frontier_bias_local_sums = - rmm::device_uvector(local_frontier_offsets.back(), handle.get_stream()); + auto aggregate_local_frontier_per_type_bias_local_sums = rmm::device_uvector( + local_frontier_offsets.back() * num_edge_types, handle.get_stream()); for (size_t i = 0; i < num_local_edge_partitions; ++i) { thrust::tabulate( handle.get_thrust_policy(), - get_dataframe_buffer_begin(aggregate_local_frontier_bias_local_sums) + - local_frontier_offsets[i], - get_dataframe_buffer_begin(aggregate_local_frontier_bias_local_sums) + - local_frontier_offsets[i + 1], + get_dataframe_buffer_begin(aggregate_local_frontier_per_type_bias_local_sums) + + local_frontier_offsets[i] * num_edge_types, + get_dataframe_buffer_begin(aggregate_local_frontier_per_type_bias_local_sums) + + local_frontier_offsets[i + 1] * num_edge_types, [key_idx_to_unique_key_idx = raft::device_span( aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], 
local_frontier_offsets[i + 1] - local_frontier_offsets[i]), - unique_key_local_degree_offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_offsets[i], - (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + 1), + unique_key_per_type_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i] * num_edge_types, + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) * + num_edge_types + + 1), aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums = raft::device_span( aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.data(), - aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums - .size())] __device__(size_t i) { - auto unique_key_idx = key_idx_to_unique_key_idx[i]; - auto degree = unique_key_local_degree_offsets[unique_key_idx + 1] - - unique_key_local_degree_offsets[unique_key_idx]; + aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.size()), + num_edge_types] __device__(size_t i) { + auto key_idx = i / num_edge_types; + auto type = static_cast(i % num_edge_types); + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + auto degree = + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + type + 1] - + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + type]; if (degree > 0) { return aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums - [unique_key_local_degree_offsets[unique_key_idx] + degree - 1]; + [unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + type] + + degree - 1]; } else { return bias_t{0.0}; } }); } - rmm::device_uvector frontier_bias_sums(0, handle.get_stream()); - std::optional> frontier_partitioned_bias_local_sum_displacements{ - std::nullopt}; + 
rmm::device_uvector frontier_per_type_bias_sums(0, handle.get_stream()); + std::optional> + frontier_partitioned_per_type_bias_local_sum_displacements{std::nullopt}; if (minor_comm_size > 1) { - std::tie(frontier_bias_sums, frontier_partitioned_bias_local_sum_displacements) = + std::tie(frontier_per_type_bias_sums, + frontier_partitioned_per_type_bias_local_sum_displacements) = compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( handle, - raft::device_span(aggregate_local_frontier_bias_local_sums.data(), - aggregate_local_frontier_bias_local_sums.size()), + raft::device_span(aggregate_local_frontier_per_type_bias_local_sums.data(), + aggregate_local_frontier_per_type_bias_local_sums.size()), local_frontier_offsets, - 1); - aggregate_local_frontier_bias_local_sums.resize(0, handle.get_stream()); - aggregate_local_frontier_bias_local_sums.shrink_to_fit(handle.get_stream()); + num_edge_types); + aggregate_local_frontier_per_type_bias_local_sums.resize(0, handle.get_stream()); + aggregate_local_frontier_per_type_bias_local_sums.shrink_to_fit(handle.get_stream()); } else { - frontier_bias_sums = std::move(aggregate_local_frontier_bias_local_sums); + frontier_per_type_bias_sums = std::move(aggregate_local_frontier_per_type_bias_local_sums); } // sample & compute local neighbor indices rmm::device_uvector sample_random_numbers( - (local_frontier_offsets[minor_comm_rank + 1] - local_frontier_offsets[minor_comm_rank]) * K, + (local_frontier_offsets[minor_comm_rank + 1] - local_frontier_offsets[minor_comm_rank]) * K_sum, handle.get_stream()); cugraph::detail::uniform_random_fill(handle.get_stream(), sample_random_numbers.data(), @@ -2295,25 +2220,47 @@ homogeneous_biased_sample_with_replacement( thrust::make_counting_iterator(size_t{0}), sample_random_numbers.begin(), cuda::proclaim_return_type( - [frontier_bias_sums = - raft::device_span(frontier_bias_sums.data(), frontier_bias_sums.size()), - K, + [frontier_per_type_bias_sums = raft::device_span( + 
frontier_per_type_bias_sums.data(), frontier_per_type_bias_sums.size()),
+       K_offsets = raft::device_span(d_K_offsets.data(), d_K_offsets.size()),
+       K_sum,
        invalid_value = std::numeric_limits::infinity()] __device__(bias_t r, size_t i) {
-        // frontier_bias_sums[i / K] will be 0 if degree is 0 or all the edges have 0 bias
-        return frontier_bias_sums[i / K] > 0.0 ? r * frontier_bias_sums[i / K] : invalid_value;
+        auto type = static_cast(thrust::distance(
+          K_offsets.begin() + 1,
+          thrust::upper_bound(thrust::seq, K_offsets.begin() + 1, K_offsets.end(), i % K_sum)));
+        // frontier_per_type_bias_sums[(i / K_sum) * num_edge_types + type] will be 0 if degree
+        // is 0 or all the edges have 0 bias
+        return frontier_per_type_bias_sums[(i / K_sum) * (K_offsets.size() - 1) + type] > 0.0
+                 ? r * frontier_per_type_bias_sums[(i / K_sum) * (K_offsets.size() - 1) + type]
+                 : invalid_value;
       }));
 
   rmm::device_uvector sample_local_random_numbers(0, handle.get_stream());
+  std::optional> edge_types{std::nullopt};
   if (minor_comm_size > 1) {
-    std::tie(sample_local_random_numbers, key_indices, local_frontier_sample_offsets) =
-      shuffle_and_compute_local_nbr_values(
-        handle,
-        std::move(sample_random_numbers),
-        raft::device_span(
-          (*frontier_partitioned_bias_local_sum_displacements).data(),
-          (*frontier_partitioned_bias_local_sum_displacements).size()),
-        K,
-        std::numeric_limits::infinity());
+    if (num_edge_types > 1) {
+      std::tie(
+        sample_local_random_numbers, edge_types, key_indices, local_frontier_sample_offsets) =
+        shuffle_and_compute_per_type_local_nbr_values(
+          handle,
+          std::move(sample_random_numbers),
+          raft::device_span(
+            (*frontier_partitioned_per_type_bias_local_sum_displacements).data(),
+            (*frontier_partitioned_per_type_bias_local_sum_displacements).size()),
+          raft::device_span(d_K_offsets.data(), d_K_offsets.size()),
+          K_sum,
+          std::numeric_limits::infinity());
+    } else {
+      std::tie(sample_local_random_numbers, key_indices, local_frontier_sample_offsets) =
+        shuffle_and_compute_local_nbr_values(
+          handle,
+          std::move(sample_random_numbers),
+          
raft::device_span( + (*frontier_partitioned_per_type_bias_local_sum_displacements).data(), + (*frontier_partitioned_per_type_bias_local_sum_displacements).size()), + K_sum, + std::numeric_limits::infinity()); + } } else { sample_local_random_numbers = std::move(sample_random_numbers); local_frontier_sample_offsets = {size_t{0}, sample_local_random_numbers.size()}; @@ -2325,15 +2272,17 @@ homogeneous_biased_sample_with_replacement( handle.get_thrust_policy(), local_nbr_indices.begin() + local_frontier_sample_offsets[i], local_nbr_indices.begin() + local_frontier_sample_offsets[i + 1], - [K, - sample_local_random_numbers = raft::device_span( + [sample_local_random_numbers = raft::device_span( sample_local_random_numbers.data() + local_frontier_sample_offsets[i], local_frontier_sample_offsets[i + 1] - local_frontier_sample_offsets[i]), - key_indices = key_indices - ? thrust::make_optional>( + key_indices = key_indices + ? thrust::make_optional>( (*key_indices).data() + local_frontier_sample_offsets[i], local_frontier_sample_offsets[i + 1] - local_frontier_sample_offsets[i]) - : thrust::nullopt, + : thrust::nullopt, + edge_types = edge_types ? 
thrust::make_optional>( + (*edge_types).data(), (*edge_types).size()) + : thrust::nullopt, key_idx_to_unique_key_idx = raft::device_span( aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], local_frontier_offsets[i + 1] - local_frontier_offsets[i]), @@ -2341,28 +2290,44 @@ homogeneous_biased_sample_with_replacement( raft::device_span( aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.data(), aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.size()), - unique_key_local_degree_offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_offsets[i], - (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + 1), + unique_key_per_type_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i] * num_edge_types, + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) * + num_edge_types + + 1), + K_offsets = raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + K_sum, + num_edge_types, invalid_random_number = std::numeric_limits::infinity(), invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { - auto key_idx = key_indices ? (*key_indices)[i] : (i / K); - auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; auto local_random_number = sample_local_random_numbers[i]; if (local_random_number != invalid_random_number) { - auto local_degree = - static_cast(unique_key_local_degree_offsets[unique_key_idx + 1] - - unique_key_local_degree_offsets[unique_key_idx]); + auto key_idx = key_indices ? (*key_indices)[i] : (i / K_sum); + auto type = + num_edge_types > 1 + ? (edge_types ? 
(*edge_types)[i] + : static_cast(thrust::distance( + K_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, K_offsets.begin() + 1, K_offsets.end(), i % K_sum)))) + : edge_type_t{0}; + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + auto local_degree = static_cast( + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + type + 1] - + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + type]); auto inclusive_sum_first = aggregate_local_frontier_unique_key_bias_segmented_local_inclusive_sums.begin() + - unique_key_local_degree_offsets[unique_key_idx]; + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + type]; auto inclusive_sum_last = inclusive_sum_first + local_degree; auto local_nbr_idx = static_cast(thrust::distance( inclusive_sum_first, thrust::upper_bound( thrust::seq, inclusive_sum_first, inclusive_sum_last, local_random_number))); - return cuda::std::min(local_nbr_idx, local_degree - 1); + auto type_start_offset = static_cast( + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + type] - + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types]); + return type_start_offset + cuda::std::min(local_nbr_idx, local_degree - 1); } else { return invalid_idx; } @@ -2898,62 +2863,68 @@ homogeneous_biased_sample_without_replacement( local_nbr_indices.resize(frontier_degrees.size() * K, handle.get_stream()); // sample from low-degree vertices - thrust::for_each( - handle.get_thrust_policy(), - frontier_indices.begin(), - frontier_indices.begin() + frontier_partition_offsets[1], - [key_idx_to_unique_key_idx = - raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), - aggregate_local_frontier_key_idx_to_unique_key_idx.size()), - aggregate_local_frontier_unique_key_biases = - raft::device_span(aggregate_local_frontier_unique_key_biases.data(), - aggregate_local_frontier_unique_key_biases.size()), - 
aggregate_local_frontier_unique_key_local_degree_offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data(), - aggregate_local_frontier_unique_key_local_degree_offsets.size()), - local_nbr_indices = - raft::device_span(local_nbr_indices.data(), local_nbr_indices.size()), - K, - invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { - auto unique_key_idx = key_idx_to_unique_key_idx[i]; - auto degree = aggregate_local_frontier_unique_key_local_degree_offsets[unique_key_idx + 1] - - aggregate_local_frontier_unique_key_local_degree_offsets[unique_key_idx]; - for (size_t j = 0; j < degree; ++j) { - *(local_nbr_indices.begin() + i * K + j) = j; - } - thrust::fill(thrust::seq, - local_nbr_indices.begin() + i * K + degree, - local_nbr_indices.begin() + (i + 1) * K, - invalid_idx); - }); + if (frontier_partition_offsets[1] > 0) { + thrust::for_each( + handle.get_thrust_policy(), + frontier_indices.begin(), + frontier_indices.begin() + frontier_partition_offsets[1], + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data(), + aggregate_local_frontier_key_idx_to_unique_key_idx.size()), + aggregate_local_frontier_unique_key_biases = + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), + aggregate_local_frontier_unique_key_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + local_nbr_indices = + raft::device_span(local_nbr_indices.data(), local_nbr_indices.size()), + K, + invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { + auto unique_key_idx = key_idx_to_unique_key_idx[i]; + auto degree = + aggregate_local_frontier_unique_key_local_degree_offsets[unique_key_idx + 1] - + aggregate_local_frontier_unique_key_local_degree_offsets[unique_key_idx]; + for (size_t j = 0; j < 
degree; ++j) { + *(local_nbr_indices.begin() + i * K + j) = j; + } + thrust::fill(thrust::seq, + local_nbr_indices.begin() + i * K + degree, + local_nbr_indices.begin() + (i + 1) * K, + invalid_idx); + }); + } // sample from mid & high-degree vertices auto mid_and_high_frontier_size = frontier_partition_offsets[3] - frontier_partition_offsets[1]; - rmm::device_uvector unique_key_indices_for_key_indices(mid_and_high_frontier_size, - handle.get_stream()); - thrust::gather( - handle.get_thrust_policy(), - frontier_indices.begin() + frontier_partition_offsets[1], - frontier_indices.begin() + frontier_partition_offsets[1] + mid_and_high_frontier_size, - aggregate_local_frontier_key_idx_to_unique_key_idx.begin(), - unique_key_indices_for_key_indices.begin()); - compute_homogeneous_biased_sampling_index_without_replacement( - handle, - std::make_optional>( - unique_key_indices_for_key_indices.data(), unique_key_indices_for_key_indices.size()), - raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data(), - aggregate_local_frontier_unique_key_local_degree_offsets.size()), - raft::device_span(aggregate_local_frontier_unique_key_biases.data(), - aggregate_local_frontier_unique_key_biases.size()), - std::make_optional>( - frontier_indices.data() + frontier_partition_offsets[1], mid_and_high_frontier_size), - raft::device_span(local_nbr_indices.data(), local_nbr_indices.size()), - std::nullopt, - rng_state, - K, - false); + + if (mid_and_high_frontier_size > 0) { + rmm::device_uvector unique_key_indices_for_key_indices(mid_and_high_frontier_size, + handle.get_stream()); + thrust::gather( + handle.get_thrust_policy(), + frontier_indices.begin() + frontier_partition_offsets[1], + frontier_indices.begin() + frontier_partition_offsets[1] + mid_and_high_frontier_size, + aggregate_local_frontier_key_idx_to_unique_key_idx.begin(), + unique_key_indices_for_key_indices.begin()); + compute_homogeneous_biased_sampling_index_without_replacement( + handle, + 
std::make_optional>( + unique_key_indices_for_key_indices.data(), unique_key_indices_for_key_indices.size()), + raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), + std::make_optional>( + frontier_indices.data() + frontier_partition_offsets[1], mid_and_high_frontier_size), + raft::device_span(local_nbr_indices.data(), local_nbr_indices.size()), + std::nullopt, + rng_state, + K, + false); + } local_frontier_sample_offsets = std::vector{0, local_nbr_indices.size()}; } @@ -3051,12 +3022,14 @@ heterogeneous_biased_sample_without_replacement( auto K_sum = std::accumulate(Ks.begin(), Ks.end(), size_t{0}); - std::vector h_K_offsets(Ks.size() + 1); - h_K_offsets[0] = 0; - std::inclusive_scan(Ks.begin(), Ks.end(), h_K_offsets.begin() + 1); - rmm::device_uvector d_K_offsets(h_K_offsets.size(), handle.get_stream()); - raft::update_device( - d_K_offsets.data(), h_K_offsets.data(), h_K_offsets.size(), handle.get_stream()); + rmm::device_uvector d_K_offsets(Ks.size() + 1, handle.get_stream()); + { + std::vector h_K_offsets(d_K_offsets.size()); + h_K_offsets[0] = 0; + std::inclusive_scan(Ks.begin(), Ks.end(), h_K_offsets.begin() + 1); + raft::update_device( + d_K_offsets.data(), h_K_offsets.data(), h_K_offsets.size(), handle.get_stream()); + } if (minor_comm_size > 1) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); @@ -3661,6 +3634,8 @@ heterogeneous_biased_sample_without_replacement( K_sum, cugraph::invalid_edge_id_v); + // per-type local neighbor indices => local neighbor indices + local_nbr_indices = std::move(per_type_local_nbr_indices); auto triplet_first = thrust::make_zip_iterator( local_nbr_indices.begin(), edge_types.begin(), (*key_indices).begin()); @@ -3780,6 +3755,8 @@ 
heterogeneous_biased_sample_without_replacement( local_frontier_sample_offsets = std::vector{0, per_type_local_nbr_indices.size()}; + // per-type local neighbor indices => local neighbor indices + local_nbr_indices = std::move(per_type_local_nbr_indices); { auto pair_first = thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{0}), @@ -3903,13 +3880,14 @@ template std::tuple, std::optional>, std::vector> -uniform_sample_and_compute_local_nbr_indices(raft::handle_t const& handle, - GraphViewType const& graph_view, - KeyIterator aggregate_local_frontier_key_first, - raft::host_span local_frontier_offsets, - raft::random::RngState& rng_state, - size_t K, - bool with_replacement) +homogeneous_uniform_sample_and_compute_local_nbr_indices( + raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyIterator aggregate_local_frontier_key_first, + raft::host_span local_frontier_offsets, + raft::random::RngState& rng_state, + size_t K, + bool with_replacement) { using edge_t = typename GraphViewType::edge_type; using vertex_t = typename GraphViewType::vertex_type; @@ -4022,7 +4000,7 @@ template , std::optional>, std::vector> -biased_sample_and_compute_local_nbr_indices( +homogeneous_biased_sample_and_compute_local_nbr_indices( raft::handle_t const& handle, GraphViewType const& graph_view, KeyIterator aggregate_local_frontier_key_first, @@ -4046,6 +4024,7 @@ biased_sample_and_compute_local_nbr_indices( typename EdgeDstValueInputWrapper::value_type, typename EdgeValueInputWrapper::value_type, BiasEdgeOp>::type; + using edge_type_t = int32_t; // dummy int minor_comm_rank{0}; int minor_comm_size{1}; @@ -4087,7 +4066,7 @@ biased_sample_and_compute_local_nbr_indices( std::vector local_frontier_sample_offsets{}; if (with_replacement) { std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = - homogeneous_biased_sample_with_replacement( + biased_sample_with_replacement( handle, local_frontier_offsets, 
raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(),
@@ -4100,7 +4079,7 @@ biased_sample_and_compute_local_nbr_indices(
         aggregate_local_frontier_unique_key_local_degree_offsets.data(),
         aggregate_local_frontier_unique_key_local_degree_offsets.size()),
       rng_state,
-      K);
+      raft::host_span(&K, size_t{1}));
   } else {
     std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) =
       homogeneous_biased_sample_without_replacement(
@@ -4209,7 +4188,7 @@ template ,
           std::optional>,
           std::vector>
-biased_sample_and_compute_local_nbr_indices(
+heterogeneous_biased_sample_and_compute_local_nbr_indices(
   raft::handle_t const& handle,
   GraphViewType const& graph_view,
   KeyIterator aggregate_local_frontier_key_first,
@@ -4398,27 +4377,26 @@ biased_sample_and_compute_local_nbr_indices(
     aggregate_local_frontier_unique_key_types.shrink_to_fit(handle.get_stream());
 
   if (with_replacement) {
-#if 0
-    std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) =
-      heterogeneous_biased_sample_with_replacement(
-        handle,
-        local_frontier_offsets,
-        raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(),
-                          aggregate_local_frontier_key_idx_to_unique_key_idx.size()),
-        raft::host_span(local_frontier_unique_key_offsets.data(),
-                        local_frontier_unique_key_offsets.size()),
-        raft::device_span(aggregate_local_frontier_unique_key_biases.data(),
-                          aggregate_local_frontier_unique_key_biases.size()),
-        raft::device_span(aggregate_local_frontier_unique_key_types.data(),
-                          aggregate_local_frontier_unique_key_types.size()),
-        raft::device_span(
-          aggregate_local_frontier_unique_key_local_degree_offsets.data(),
-          aggregate_local_frontier_unique_key_local_degree_offsets.size()),
-        rng_state,
-        Ks);
-#endif
+    std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) =
+      biased_sample_with_replacement(
+        handle,
+        local_frontier_offsets,
+        raft::device_span(
+          aggregate_local_frontier_key_idx_to_unique_key_idx.data(),
+          
aggregate_local_frontier_key_idx_to_unique_key_idx.size()), + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), + raft::device_span(aggregate_local_frontier_unique_key_biases.data(), + aggregate_local_frontier_unique_key_biases.size()), + raft::device_span( + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.size()), + raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + rng_state, + Ks); } else { -#if 1 std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = heterogeneous_biased_sample_without_replacement) { // homogeneous std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = - uniform_sample_and_compute_local_nbr_indices( + homogeneous_uniform_sample_and_compute_local_nbr_indices( handle, graph_view, (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_key_list) @@ -374,7 +374,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, if constexpr (std::is_same_v) { // homogeneous std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = - biased_sample_and_compute_local_nbr_indices( + homogeneous_biased_sample_and_compute_local_nbr_indices( handle, graph_view, (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_key_list) @@ -391,7 +391,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, do_expensive_check); } else { // heterogeneous std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = - biased_sample_and_compute_local_nbr_indices( + heterogeneous_biased_sample_and_compute_local_nbr_indices( handle, graph_view, (minor_comm_size > 1) ? 
get_dataframe_buffer_cbegin(*aggregate_local_key_list) From a02af7d9a700b31e045178f1dc7fcf4cfcf42277 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 29 Jan 2025 20:30:26 -0800 Subject: [PATCH 11/21] update documentation --- ...r_v_random_select_transform_outgoing_e.cuh | 207 ++++++++++++++++-- 1 file changed, 192 insertions(+), 15 deletions(-) diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 3694ce36e05..3c177c9653d 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -368,7 +368,18 @@ per_v_random_select_transform_e(raft::handle_t const& handle, Ks[0], with_replacement); } else { // heterogeneous - CUGRAPH_FAIL("unimplemented."); + std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = + heterogeneous_uniform_sample_and_compute_local_nbr_indices( + handle, + graph_view, + (minor_comm_size > 1) ? get_dataframe_buffer_cbegin(*aggregate_local_key_list) + : key_list.begin(), + edge_type_input, + raft::host_span(local_key_list_offsets.data(), + local_key_list_offsets.size()), + rng_state, + Ks, + with_replacement); } } else { if constexpr (std::is_same_v> and a dataframe buffer storing the output values of + * type @p T from the selected edges. If @p invalid_value is std::nullopt, the offset vector is + * valid and has the size of @p key_list.size() + 1. If @p invalid_value.has_value() is true, + * std::nullopt is returned (the dataframe buffer will store @p key_list.size() * @p K elements). + */ template ::max(). + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). This parameter is used to pass an edge source + * property value to @p e_op. 
Use either cugraph::edge_src_property_t::view() (if @p e_op needs to + * access source property values) or cugraph::edge_src_dummy_property_t::view() (if @p e_op does not + * access source property values). Use update_edge_src_property to fill the wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). This parameter is used to pass an edge + * source property value to @p e_op. Use either cugraph::edge_dst_property_t::view() (if @p e_op + * needs to access destination property values) or cugraph::edge_dst_dummy_property_t::view() (if @p + * e_op does not access destination property values). Use update_edge_dst_property to fill the + * wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). This parameter is used to pass an edge source property value to @p + * e_op. Use either cugraph::edge_property_t::view() (if @p e_op needs to access edge property + * values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not access edge property + * values). * @param e_op Quinary operator takes (tagged-)edge source, edge destination, property values for * the source, destination, and edge and returns a value to be collected in the output. This * function is called only for the selected edges. @@ -880,6 +971,92 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, do_expensive_check); } +/** + * @brief Randomly select (per edge type) and transform the input (tagged-)vertices' outgoing edges + * with biases. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. + * @tparam BiasEdgeSrcValueInputWrapper Type of the wrapper for edge source property values (for + * BiasEdgeOp). 
+ * @tparam BiasEdgeDstValueInputWrapper Type of the wrapper for edge destination property values + * (for BiasEdgeOp). + * @tparam BiasEdgeValueInputWrapper Type of the wrapper for edge property values (for BiasEdgeOp). + * @tparam BiasEdgeOp Type of the quinary edge operator to set-up selection bias + * values. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam EdgeTypeInputWrapper Type of the wrapper for edge type values. + * @tparam T Type of the selected and transformed edge output values. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to sample outgoing + * edges. + * @param bias_edge_src_value_input Wrapper used to access source input property values (for the + * edge sources assigned to this process in multi-GPU). This parameter is used to pass an edge + * source property value to @p bias_e_op. Use either cugraph::edge_src_property_t::view() (if @p + * e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() (if @p + * e_op does not access source property values). Use update_edge_src_property to fill the wrapper. + * @param bias_edge_dst_value_input Wrapper used to access destination input property values (for + * the edge destinations assigned to this process in multi-GPU). This parameter is used to pass an + * edge source property value to @p bias_e_op. 
Use either cugraph::edge_dst_property_t::view() (if + * @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param bias_edge_value_input Wrapper used to access edge input property values (for the edges + * assigned to this process in multi-GPU). This parameter is used to pass an edge source property + * value to @p bias_e_op. Use either cugraph::edge_property_t::view() (if @p e_op needs to access + * edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not access edge + * property values). + * @param bias_e_op Quinary operator takes (tagged-)edge source, edge destination, property values + * for the source, destination, and edge and returns a floating point bias value to be used in + * biased random selection. The return value should be non-negative. The bias value of 0 indicates + * that the corresponding edge cannot be selected. Assuming that the return value type is bias_t, + * the sum of the bias values for any seed vertex should not exceed + * std::numeric_limits::max(). + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). This parameter is used to pass an edge source + * property value to @p e_op. Use either cugraph::edge_src_property_t::view() (if @p e_op needs to + * access source property values) or cugraph::edge_src_dummy_property_t::view() (if @p e_op does not + * access source property values). Use update_edge_src_property to fill the wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). This parameter is used to pass an edge + * source property value to @p e_op. 
Use either cugraph::edge_dst_property_t::view() (if @p e_op + * needs to access destination property values) or cugraph::edge_dst_dummy_property_t::view() (if @p + * e_op does not access destination property values). Use update_edge_dst_property to fill the + * wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). This parameter is used to pass an edge source property value to @p + * e_op. Use either cugraph::edge_property_t::view() (if @p e_op needs to access edge property + * values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not access edge property + * values). + * @param e_op Quinary operator takes (tagged-)edge source, edge destination, property values for + * the source, destination, and edge and returns a value to be collected in the output. This + * function is called only for the selected edges. + * @param edge_type_input Wrapper used to access edge type value (for the edges assigned to this + * process in multi-GPU). This parameter is used in per-type (heterogeneous) sampling. Use + * cugraph::edge_property_t::view(). + * @param Ks Number of outgoing edges to select per (tagged-)vertex for each edge type (size = # + * edge types). + * @param with_replacement A flag to specify whether a single outgoing edge can be selected multiple + * times (if @p with_replacement = true) or can be selected only once (if @p with_replacement = + * false). + * @param invalid_value If @p invalid_value.has_value() is true, this value is used to fill the + * output vector for the zero out-degree vertices (if @p with_replacement = true) or the vertices + * with their out-degrees smaller than @p K (if @p with_replacement = false). If @p + * invalid_value.has_value() is false, fewer than @p K values can be returned for the vertices with + * fewer than @p K selected edges. See the return value section for additional details. 
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + * @return std::tuple Tuple of an optional offset vector of type + * std::optional> and a dataframe buffer storing the output values of + * type @p T from the selected edges. If @p invalid_value is std::nullopt, the offset vector is + * valid and has the size of @p key_list.size() + 1. If @p invalid_value.has_value() is true, + * std::nullopt is returned (the dataframe buffer will store @p key_list.size() * @p K elements). + */ template Date: Thu, 30 Jan 2025 00:39:36 -0800 Subject: [PATCH 12/21] heterogeneous uniform sampling --- .../sample_and_compute_local_nbr_indices.cuh | 2332 +++++++++++++---- 1 file changed, 1869 insertions(+), 463 deletions(-) diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index fb8cccb80de..ceb673b14f7 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -110,7 +110,7 @@ struct compute_local_value_displacements_and_global_value_t { // neighbor value, key index) quadruplet, minor_comm_rank is set to -1 if a neighbor value is // invalid template -struct convert_pair_to_quadruplet_t { +struct convert_value_key_pair_t { raft::device_span partitioned_local_value_displacements{}; // one partition per gpu in the same minor_comm raft::device_span tx_counts{}; @@ -146,7 +146,7 @@ struct convert_pair_to_quadruplet_t { // per-type local neighbor value, type, key index) 5-tuple, minor_comm_rank is set to -1 if a // neighbor value is invalid template -struct convert_pair_to_5tuple_t { +struct convert_per_type_value_key_pair_t { raft::device_span partitioned_per_type_local_value_displacements{}; // one partition per gpu in the same // minor_comm @@ -422,7 +422,8 @@ compute_unique_keys(raft::handle_t const& handle, for (size_t i = 0; i < 
local_frontier_unique_key_sizes.size(); ++i) { thrust::copy(handle.get_thrust_policy(), get_dataframe_buffer_begin(tmp_keys) + local_frontier_offsets[i], - get_dataframe_buffer_begin(tmp_keys) + local_frontier_offsets[i + 1], + get_dataframe_buffer_begin(tmp_keys) + local_frontier_offsets[i] + + local_frontier_unique_key_sizes[i], get_dataframe_buffer_begin(aggregate_local_frontier_unique_keys) + local_frontier_unique_key_offsets[i]); thrust::transform( @@ -637,81 +638,555 @@ compute_valid_local_nbr_count_inclusive_sums(raft::handle_t const& handle, return local_frontier_valid_local_nbr_count_inclusive_sums; } -template -rmm::device_uvector compute_uniform_sampling_index_without_replacement( +template +void sample_nbr_index_with_replacement( raft::handle_t const& handle, - rmm::device_uvector&& frontier_degrees, + raft::device_span frontier_degrees, + std::optional> frontier_indices, + raft::device_span nbr_indices /* [OUT] */, raft::random::RngState& rng_state, size_t K) { - edge_t mid_partition_degree_range_last = static_cast(K * 10); // tuning parameter - assert(mid_partition_degree_range_last > K); - size_t high_partition_oversampling_K = K * 2; // tuning parameter - assert(high_partition_oversampling_K > K); + auto num_keys = frontier_indices ? 
(*frontier_indices).size() : frontier_degrees.size(); - auto [frontier_indices, frontier_partition_offsets] = partition_v_frontier( - handle, - frontier_degrees.begin(), - frontier_degrees.end(), - std::vector{static_cast(K + 1), mid_partition_degree_range_last + 1}); + rmm::device_uvector sample_random_numbers(num_keys * K, handle.get_stream()); + cugraph::detail::uniform_random_fill(handle.get_stream(), + sample_random_numbers.data(), + sample_random_numbers.size(), + bias_t{0.0}, + bias_t{1.0}, + rng_state); - rmm::device_uvector nbr_indices(frontier_degrees.size() * K, handle.get_stream()); + auto pair_first = thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{0}), + sample_random_numbers.begin()); + thrust::for_each( + handle.get_thrust_policy(), + pair_first, + pair_first + num_keys * K, + [frontier_degrees, + frontier_indices = frontier_indices + ? cuda::std::optional>(*frontier_indices) + : cuda::std::nullopt, + nbr_indices, + K, + invalid_idx = cugraph::invalid_edge_id_v] __device__(auto pair) { + auto i = thrust::get<0>(pair); + auto r = thrust::get<1>(pair); + auto frontier_idx = frontier_indices ? (*frontier_indices)[i / K] : i / K; + auto degree = frontier_degrees[frontier_idx]; + auto sample_idx = invalid_idx; + if (degree > 0) { sample_idx = cuda::std::min(static_cast(r * degree), degree - 1); } + nbr_indices[frontier_idx * K + (i % K)] = sample_idx; + }); +} + +template +void sample_nbr_index_with_replacement( + raft::handle_t const& handle, + raft::device_span frontier_per_type_degrees, + std::optional, raft::device_span>> + frontier_index_type_pairs, + raft::device_span per_type_nbr_indices /* [OUT] */, + raft::random::RngState& rng_state, + raft::device_span K_offsets, + size_t K_sum) +{ + auto num_edge_types = static_cast(K_offsets.size() - 1); + + auto num_keys = frontier_index_type_pairs ? 
std::get<0>(*frontier_index_type_pairs).size() + : frontier_per_type_degrees.size(); + assert(frontier_index_type_pairs.has_value() || (num_keys % num_edge_types) == 0); + std::optional> input_r_offsets{std::nullopt}; + if (frontier_index_type_pairs) { + input_r_offsets = rmm::device_uvector(num_keys + 1, handle.get_stream()); + (*input_r_offsets).set_element_to_zero_async(0, handle.get_stream()); + auto k_first = thrust::make_transform_iterator( + std::get<1>(frontier_index_type_pairs).begin(), + cuda::proclaim_return_type( + [K_offsets] __device__(auto type) { return K_offsets[type + 1] - K_offsets[type]; })); + thrust::inclusive_scan( + handle.get_thrust_policy(), k_first, k_first + num_keys, (*input_r_offsets).begin() + 1); + } + + rmm::device_uvector sample_random_numbers( + input_r_offsets ? (*input_r_offsets).back_element(handle.get_stream()) + : (num_keys / num_edge_types) * K_sum, + handle.get_stream()); + cugraph::detail::uniform_random_fill(handle.get_stream(), + sample_random_numbers.data(), + sample_random_numbers.size(), + bias_t{0.0}, + bias_t{1.0}, + rng_state); + + auto pair_first = thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{0}), + sample_random_numbers.begin()); + if (frontier_index_type_pairs) { + thrust::for_each( + handle.get_thrust_policy(), + pair_first, + pair_first + sample_random_numbers.size(), + [frontier_per_type_degrees, + frontier_indices = std::get<0>(*frontier_index_type_pairs), + frontier_types = std::get<1>(*frontier_index_type_pairs), + input_r_offsets = + raft::device_span((*input_r_offsets).data(), (*input_r_offsets).size()), + per_type_nbr_indices, + K_offsets, + K_sum, + invalid_idx = cugraph::invalid_edge_id_v] __device__(auto pair) { + auto i = thrust::get<0>(pair); + auto r = thrust::get<1>(pair); + auto idx = thrust::distance( + input_r_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, input_r_offsets.begin() + 1, input_r_offsets.end(), i)); + auto frontier_idx = frontier_indices[idx]; + auto 
type = frontier_types[idx]; + auto degree = frontier_per_type_degrees[frontier_idx * num_edge_types + type]; + auto sample_idx = invalid_idx; + if (degree > 0) { + sample_idx = cuda::std::min(static_cast(r * degree), degree - 1); + } + per_type_nbr_indices[frontier_idx * K_sum + K_offsets[type] + (i - input_r_offsets[idx])] = + sample_idx; + }); + } else { + thrust::transform( + handle.get_thrust_policy(), + pair_first, + pair_first + sample_random_numbers.size(), + per_type_nbr_indices.begin(), + cuda::proclaim_return_type( + [frontier_per_type_degrees, + per_type_nbr_indices, + K_offsets, + K_sum, + invalid_idx = cugraph::invalid_edge_id_v] __device__(auto pair) { + auto num_edge_types = static_cast(K_offsets.size() - 1); + auto i = thrust::get<0>(pair); + auto r = thrust::get<1>(pair); + auto frontier_idx = i / K_sum; + auto type = static_cast(thrust::distance( + K_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, K_offsets.begin() + 1, K_offsets.end(), i % K_sum))); + auto degree = frontier_per_type_degrees[frontier_idx * num_edge_types + type]; + auto sample_idx = invalid_idx; + if (degree > 0) { + sample_idx = cuda::std::min(static_cast(r * degree), degree - 1); + } + return sample_idx; + })); + } +} + +template +void sample_nbr_index_without_replacement( + raft::handle_t const& handle, + raft::device_span frontier_degrees, + std::optional> frontier_indices, + raft::device_span nbr_indices /* [OUT] */, + raft::random::RngState& rng_state, + size_t K, + bool algo_r = true) +{ + auto num_keys = frontier_indices ? 
(*frontier_indices).size() : frontier_degrees.size(); + if (frontier_indices) { + thrust::for_each(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_keys * K), + [frontier_degrees, + frontier_indices = *frontier_indices, + nbr_indices, + K, + invalid_idx = cugraph::invalid_edge_id_v] __device__(auto i) { + auto frontier_idx = frontier_indices[i / K]; + auto d = static_cast(frontier_degrees[frontier_idx]); + nbr_indices[frontier_idx * K + (i % K)] = + (i % K < d) ? static_cast(i % K) : invalid_idx; + }); + } else { + thrust::tabulate( + handle.get_thrust_policy(), + nbr_indices.begin(), + nbr_indices.begin() + num_keys * K, + [frontier_degrees, K, invalid_idx = cugraph::invalid_edge_id_v] __device__(auto i) { + auto d = static_cast(frontier_degrees[i / K]); + return (i % K < d) ? static_cast(i % K) : invalid_idx; + }); + } + + if (algo_r) { // reservoir sampling, algorithm R + rmm::device_uvector input_r_offsets(num_keys + 1, handle.get_stream()); + input_r_offsets.set_element_to_zero_async(0, handle.get_stream()); + if (frontier_indices) { + auto count_first = thrust::make_transform_iterator( + (*frontier_indices).begin(), + cuda::proclaim_return_type([frontier_degrees, K] __device__(size_t i) { + auto d = static_cast(frontier_degrees[i]); + return d > K ? (d - K) : size_t{0}; + })); + thrust::inclusive_scan(handle.get_thrust_policy(), + count_first, + count_first + num_keys, + input_r_offsets.begin() + 1); + + } else { + auto count_first = thrust::make_transform_iterator( + frontier_degrees.begin(), cuda::proclaim_return_type([K] __device__(auto degree) { + auto d = static_cast(degree); + return d > K ? 
(d - K) : size_t{0}; + })); + thrust::inclusive_scan(handle.get_thrust_policy(), + count_first, + count_first + num_keys, + input_r_offsets.begin() + 1); + } + + rmm::device_uvector sample_random_numbers( + input_r_offsets.back_element(handle.get_stream()), handle.get_stream()); + cugraph::detail::uniform_random_fill(handle.get_stream(), + sample_random_numbers.data(), + sample_random_numbers.size(), + bias_t{0.0}, + bias_t{1.0}, + rng_state); - auto low_partition_size = frontier_partition_offsets[1]; - if (low_partition_size > 0) { thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(low_partition_size * K), - [K, + thrust::make_counting_iterator(sample_random_numbers.size()), + [input_r_offsets = + raft::device_span(input_r_offsets.data(), input_r_offsets.size()), + sample_random_numbers = raft::device_span(sample_random_numbers.data(), + sample_random_numbers.size()), frontier_indices = - raft::device_span(frontier_indices.data(), low_partition_size), - frontier_degrees = - raft::device_span(frontier_degrees.data(), frontier_degrees.size()), - nbr_indices = raft::device_span(nbr_indices.data(), nbr_indices.size()), - invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { - auto frontier_idx = frontier_indices[i / K]; - auto degree = frontier_degrees[frontier_idx]; - auto sample_idx = static_cast(i % K); - nbr_indices[frontier_idx * K + sample_idx] = - (sample_idx < degree) ? sample_idx : invalid_idx; + frontier_indices ? cuda::std::optional>(*frontier_indices) + : cuda::std::nullopt, + nbr_indices, + K] __device__(size_t i) { + auto idx = thrust::distance( + input_r_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, input_r_offsets.begin() + 1, input_r_offsets.end(), i)); + auto nbr_idx = K + (i - input_r_offsets[idx]); + auto r = static_cast(sample_random_numbers[i] * (nbr_idx + 1)); + if (r < K) { + auto frontier_idx = frontier_indices ? 
(*frontier_indices)[idx] : idx; + cuda::atomic_ref sample_nbr_idx( + nbr_indices[frontier_idx * K + r]); + sample_nbr_idx.fetch_max(nbr_idx, cuda::std::memory_order_relaxed); + } }); + } else { // reservoir sampling, algorithm L + size_t random_numbers_per_key = K * 10; // tuning parameter + + rmm::device_uvector retry_frontier_indices(num_keys, handle.get_stream()); + if (frontier_indices) { + auto last = thrust::copy_if( + handle.get_thrust_policy(), + (*frontier_indices).begin(), + (*frontier_indices).end(), + retry_frontier_indices.begin(), + [frontier_degrees, K] __device__(auto i) { return frontier_degrees[i] > K; }); + retry_frontier_indices.resize(thrust::distance(retry_frontier_indices.begin(), last), + handle.get_stream()); + } else { + auto last = thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_keys), + retry_frontier_indices.begin(), + [frontier_degrees, K] __device__(auto i) { return frontier_degrees[i] > K; }); + retry_frontier_indices.resize(thrust::distance(retry_frontier_indices.begin(), last), + handle.get_stream()); + } + + rmm::device_uvector sample_random_numbers( + retry_frontier_indices.size() * random_numbers_per_key, handle.get_stream()); + rmm::device_uvector cur_is(retry_frontier_indices.size(), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), cur_is.begin(), cur_is.end(), static_cast(K)); + rmm::device_uvector cur_ws(retry_frontier_indices.size(), handle.get_stream()); + cugraph::detail::uniform_random_fill( + handle.get_stream(), + cur_ws.data(), + cur_ws.size(), + std::numeric_limits::min() /* to prevent log(0), which is undefined */, + bias_t{1.0}, + rng_state); + thrust::transform(handle.get_thrust_policy(), + cur_ws.begin(), + cur_ws.end(), + cur_ws.begin(), + cuda::proclaim_return_type([K] __device__(auto r) { + return exp(log(r) / K); // log(r) <= 0, 0.0 <= exp(log(r)/K) <= 1.0 + })); + + while (retry_frontier_indices.size() > 0) { + 
std::cout << "retry_frontier_indices.size()=" << retry_frontier_indices.size() << " K=" << K + << std::endl; + cugraph::detail::uniform_random_fill( + handle.get_stream(), + sample_random_numbers.data(), + retry_frontier_indices.size() * random_numbers_per_key, + std::numeric_limits::min() /* to prevent log(0), which is undefined */, + bias_t{1.0}, + rng_state); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(retry_frontier_indices.size()), + [frontier_degrees, + nbr_indices, + sample_random_numbers = raft::device_span(sample_random_numbers.data(), + sample_random_numbers.size()), + retry_frontier_indices = + raft::device_span(retry_frontier_indices.data(), retry_frontier_indices.size()), + cur_is = raft::device_span(cur_is.data(), cur_is.size()), + cur_ws = raft::device_span(cur_ws.data(), cur_ws.size()), + K, + random_numbers_per_key, + invalid_frontier_idx = std::numeric_limits::max(), + eps = std::numeric_limits::min()] __device__(size_t idx) { + auto frontier_idx = retry_frontier_indices[idx]; + auto i = cur_is[idx]; + auto w = cur_ws[idx]; + size_t r_idx{0}; + auto degree = frontier_degrees[frontier_idx]; + + while ((i < degree) && (r_idx < random_numbers_per_key)) { + auto r = sample_random_numbers[idx * random_numbers_per_key + r_idx++]; + auto inc = + floor(log(r) / + cuda::std::min( + log(cuda::std::max(1.0 - w, eps /* to prevent log(0), which is undefined */)), + -eps) /* to prevent divide by 0 */) + + 1.0; + i += (inc < static_cast(degree - i)) ? 
static_cast(inc) : (degree - i); + if (i < degree) { + r = sample_random_numbers[idx * random_numbers_per_key + r_idx++]; + nbr_indices[frontier_idx * K + cuda::std::min(static_cast(r * K), K - 1)] = i; + r = sample_random_numbers[idx * random_numbers_per_key + r_idx++]; + w *= exp(log(r) / K); + } + } + if (i < degree) { + cur_is[idx] = i; + cur_ws[idx] = w; + } else { + retry_frontier_indices[idx] = invalid_frontier_idx; + } + }); + auto triplet_first = + thrust::make_zip_iterator(retry_frontier_indices.begin(), cur_is.begin(), cur_ws.begin()); + auto last = + thrust::remove_if(handle.get_thrust_policy(), + triplet_first, + triplet_first + retry_frontier_indices.size(), + [invalid_frontier_idx = std::numeric_limits::max()] __device__( + auto tup) { return thrust::get<0>(tup) == invalid_frontier_idx; }); + retry_frontier_indices.resize(thrust::distance(triplet_first, last), handle.get_stream()); + cur_is.resize(retry_frontier_indices.size(), handle.get_stream()); + cur_ws.resize(retry_frontier_indices.size(), handle.get_stream()); + } } +} + +template +void sample_nbr_index_without_replacement( + raft::handle_t const& handle, + raft::device_span frontier_per_type_degrees, + std::optional, raft::device_span>> + frontier_index_type_pairs, + raft::device_span per_type_nbr_indices /* [OUT] */, + raft::random::RngState& rng_state, + raft::device_span K_offsets, + size_t K_sum, + bool algo_r = true) +{ + auto num_edge_types = static_cast(K_offsets.size() - 1); + + auto num_keys = frontier_index_type_pairs ? 
std::get<0>(*frontier_index_type_pairs).size() + : frontier_per_type_degrees.size(); + assert(frontier_index_type_pairs.has_value() || (num_keys % num_edge_types) == 0); + + if (frontier_index_type_pairs) { + rmm::device_uvector sample_size_offsets(num_keys + 1, handle.get_stream()); + sample_size_offsets.set_element_to_zero_async(0, handle.get_stream()); + auto k_first = thrust::make_transform_iterator( + std::get<1>(frontier_index_type_pairs).begin(), + cuda::proclaim_return_type( + [K_offsets] __device__(auto type) { return K_offsets[type + 1] - K_offsets[type]; })); + thrust::inclusive_scan( + handle.get_thrust_policy(), k_first, k_first + num_keys, sample_size_offsets.begin() + 1); - auto mid_partition_size = frontier_partition_offsets[2] - frontier_partition_offsets[1]; - if (mid_partition_size > 0) { - // FIXME: tmp_degrees & tmp_nbr_indices can be avoided if we customize - // cugraph::legacy::ops::get_sampling_index - rmm::device_uvector tmp_degrees(mid_partition_size, handle.get_stream()); - rmm::device_uvector tmp_nbr_indices(mid_partition_size * K, handle.get_stream()); - thrust::gather(handle.get_thrust_policy(), - frontier_indices.begin() + frontier_partition_offsets[1], - frontier_indices.begin() + frontier_partition_offsets[2], - frontier_degrees.begin(), - tmp_degrees.begin()); - cugraph::legacy::ops::graph::get_sampling_index(tmp_nbr_indices.data(), - rng_state, - tmp_degrees.data(), - mid_partition_size, - static_cast(K), - false, - handle.get_stream()); thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(mid_partition_size * K), - [K, - frontier_indices = raft::device_span( - frontier_indices.data() + frontier_partition_offsets[1], mid_partition_size), - tmp_nbr_indices = tmp_nbr_indices.data(), - nbr_indices = nbr_indices.data()] __device__(size_t i) { - auto frontier_idx = frontier_indices[i / K]; - auto sample_idx = static_cast(i % K); - nbr_indices[frontier_idx * K + 
sample_idx] = tmp_nbr_indices[i]; + thrust::make_counting_iterator(sample_size_offsets.back_element(handle.get_stream())), + [sample_size_offsets = + raft::device_span(sample_size_offsets.data(), sample_size_offsets.size()), + frontier_per_type_degrees, + frontier_indices = std::get<0>(*frontier_index_type_pairs), + types = std::get<1>(*frontier_index_type_pairs), + per_type_nbr_indices, + K_offsets, + K_sum, + invalid_idx = cugraph::invalid_edge_id_v] __device__(auto i) { + auto idx = thrust::distance( + sample_size_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, sample_size_offsets.begin() + 1, sample_size_offsets.end(), i)); + auto frontier_idx = frontier_indices[idx]; + auto type = types[idx]; + auto d = frontier_per_type_degrees[frontier_idx * num_edge_types + type]; + auto K = K_offsets[type + 1] - K_offsets[type]; + auto sample_idx = i - sample_size_offsets[idx]; + per_type_nbr_indices[frontier_idx * K_sum + K_offsets[type] + sample_idx] = + (sample_idx < d) ? sample_idx : invalid_idx; + }); + } else { + thrust::tabulate( + handle.get_thrust_policy(), + per_type_nbr_indices.begin(), + per_type_nbr_indices.begin() + (num_keys / num_edge_types) * K_sum, + [frontier_per_type_degrees, + K_offsets, + K_sum, + invalid_idx = cugraph::invalid_edge_id_v] __device__(auto i) { + auto frontier_idx = i / K_sum; + auto type = static_cast(thrust::distance( + K_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, K_offsets.begin() + 1, K_offsets.end(), i % K_sum))); + auto d = frontier_per_type_degrees[frontier_idx * num_edge_types + type]; + auto K = K_offsets[type + 1] - K_offsets[type]; + auto sample_idx = static_cast((i % K_sum) - K_offsets[type]); + return sample_idx < d ? 
static_cast(sample_idx) : invalid_idx; + }); + } + + if (algo_r) { // reservoir sampling, algorithm R + rmm::device_uvector input_r_offsets(num_keys + 1, handle.get_stream()); + input_r_offsets.set_element_to_zero_async(0, handle.get_stream()); + if (frontier_index_type_pairs) { + auto count_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(std::get<0>(*frontier_index_type_pairs).begin(), + std::get<1>(*frontier_index_type_pairs).begin()), + cuda::proclaim_return_type( + [frontier_per_type_degrees, K_offsets] __device__(auto pair) { + auto num_edge_types = static_cast(K_offsets.size() - 1); + auto frontier_idx = thrust::get<0>(pair); + auto type = thrust::get<1>(pair); + auto d = + static_cast(frontier_per_type_degrees[frontier_idx * num_edge_types + type]); + auto K = K_offsets[type + 1] - K_offsets[type]; + return d > K ? (d - K) : size_t{0}; + })); + thrust::inclusive_scan(handle.get_thrust_policy(), + count_first, + count_first + num_keys, + input_r_offsets.begin() + 1); + } else { + auto count_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [frontier_per_type_degrees, K_offsets] __device__(auto i) { + auto num_edge_types = static_cast(K_offsets.size() - 1); + auto d = static_cast(frontier_per_type_degrees[i]); + auto type = static_cast(i % num_edge_types); + auto K = K_offsets[type + 1] - K_offsets[type]; + return d > K ? 
(d - K) : size_t{0}; + })); + thrust::inclusive_scan(handle.get_thrust_policy(), + count_first, + count_first + num_keys, + input_r_offsets.begin() + 1); + } + + rmm::device_uvector sample_random_numbers( + input_r_offsets.back_element(handle.get_stream()), handle.get_stream()); + cugraph::detail::uniform_random_fill(handle.get_stream(), + sample_random_numbers.data(), + sample_random_numbers.size(), + bias_t{0.0}, + bias_t{1.0}, + rng_state); + + // based on reservoir sampling, algorithm R + + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(sample_random_numbers.size()), + [frontier_per_type_degrees, + frontier_indices = frontier_index_type_pairs + ? cuda::std::optional>( + std::get<0>(*frontier_index_type_pairs)) + : cuda::std::nullopt, + types = frontier_index_type_pairs + ? cuda::std::optional>( + std::get<1>(*frontier_index_type_pairs)) + : cuda::std::nullopt, + per_type_nbr_indices, + input_r_offsets = + raft::device_span(input_r_offsets.data(), input_r_offsets.size()), + sample_random_numbers = raft::device_span(sample_random_numbers.data(), + sample_random_numbers.size()), + K_offsets, + K_sum, + invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { + auto idx = thrust::distance( + input_r_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, input_r_offsets.begin() + 1, input_r_offsets.end(), i)); + auto type = types ? (*types)[idx] : static_cast(idx % num_edge_types); + auto K = K_offsets[type + 1] - K_offsets[type]; + auto per_type_nbr_idx = K + (i - input_r_offsets[idx]); + auto r = static_cast(sample_random_numbers[i] * (per_type_nbr_idx + 1)); + if (r < K) { + auto frontier_idx = frontier_indices ? 
(*frontier_indices)[idx] : idx / num_edge_types; + cuda::atomic_ref sample_nbr_idx( + per_type_nbr_indices[frontier_idx * K_sum + K_offsets[type] + r]); + sample_nbr_idx.fetch_max(per_type_nbr_idx, cuda::std::memory_order_relaxed); + } }); + } else { // reservoir sampling, algorithm L + CUGRAPH_FAIL("unimplemented."); + } +} + +template +rmm::device_uvector compute_homogeneous_uniform_sampling_index_without_replacement( + raft::handle_t const& handle, + raft::device_span frontier_degrees, + raft::random::RngState& rng_state, + size_t K) +{ + using bias_t = double; + + edge_t low_partition_degree_range_last = static_cast(K * 10); // tuning parameter + assert(low_partition_degree_range_last > K); + size_t high_partition_oversampling_K = std::max(K * 2, K + 16); // tuning parameter + assert(high_partition_oversampling_K > K); + + auto [frontier_indices, frontier_partition_offsets] = + partition_v_frontier(handle, + frontier_degrees.begin(), + frontier_degrees.end(), + std::vector{low_partition_degree_range_last + 1}); + + rmm::device_uvector nbr_indices(frontier_degrees.size() * K, handle.get_stream()); + + auto low_partition_size = frontier_partition_offsets[1]; + if (low_partition_size > 0) { + sample_nbr_index_without_replacement( + handle, + frontier_degrees, + std::make_optional>(frontier_indices.data(), + low_partition_size), + raft::device_span(nbr_indices.data(), nbr_indices.size()), + rng_state, + K); } - auto high_partition_size = frontier_partition_offsets[3] - frontier_partition_offsets[2]; + auto high_partition_size = frontier_partition_offsets[2] - frontier_partition_offsets[1]; if (high_partition_size > 0) { // to limit memory footprint ((1 << 20) is a tuning parameter), std::max for forward progress // guarantee when high_partition_oversampling_K is exorbitantly large @@ -745,15 +1220,17 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( rmm::device_uvector unique_counts(num_segments, handle.get_stream()); - std::optional> 
retry_segment_indices{std::nullopt}; - std::optional> retry_degrees{std::nullopt}; - std::optional> retry_nbr_indices{std::nullopt}; - std::optional> retry_sample_indices{std::nullopt}; - std::optional> retry_segment_sorted_nbr_indices{std::nullopt}; - std::optional> retry_segment_sorted_sample_indices{std::nullopt}; while (true) { + std::optional> retry_segment_indices{std::nullopt}; + std::optional> retry_degrees{std::nullopt}; + std::optional> retry_nbr_indices{std::nullopt}; + std::optional> retry_sample_indices{std::nullopt}; + std::optional> retry_segment_sorted_nbr_indices{std::nullopt}; + std::optional> retry_segment_sorted_sample_indices{ + std::nullopt}; + auto segment_frontier_index_first = - frontier_indices.begin() + frontier_partition_offsets[2] + keys_to_sort_per_iteration * i; + frontier_indices.begin() + frontier_partition_offsets[1] + keys_to_sort_per_iteration * i; auto segment_frontier_degree_first = thrust::make_transform_iterator( segment_frontier_index_first, indirection_t{frontier_degrees.begin()}); @@ -777,6 +1254,15 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( } if (retry_segment_indices) { + #if 1 + sample_nbr_index_with_replacement( + handle, + raft::device_span((*retry_degrees).data(), (*retry_degrees).size()), + std::nullopt, + raft::device_span((*retry_nbr_indices).data(), (*retry_nbr_indices).size()), + rng_state, + high_partition_oversampling_K); + #else cugraph::legacy::ops::graph::get_sampling_index( (*retry_nbr_indices).data(), rng_state, @@ -785,6 +1271,7 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( static_cast(high_partition_oversampling_K), true, handle.get_stream()); + #endif } else { // FIXME: this temporary is unnecessary if we update get_sampling_index to take a thrust // iterator @@ -793,6 +1280,15 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( segment_frontier_degree_first, segment_frontier_degree_first + num_segments, 
tmp_degrees.begin()); + #if 1 + sample_nbr_index_with_replacement( + handle, + raft::device_span(tmp_degrees.data(), tmp_degrees.size()), + std::nullopt, + raft::device_span(tmp_nbr_indices.data(), tmp_nbr_indices.size()), + rng_state, + high_partition_oversampling_K); + #else cugraph::legacy::ops::graph::get_sampling_index( tmp_nbr_indices.data(), rng_state, @@ -801,6 +1297,7 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( static_cast(high_partition_oversampling_K), true, handle.get_stream()); + #endif } if (retry_segment_indices) { @@ -1005,111 +1502,523 @@ rmm::device_uvector compute_uniform_sampling_index_without_replacement( thrust::make_counting_iterator(num_segments * K), [K, high_partition_oversampling_K, - frontier_indices = frontier_indices.begin() + frontier_partition_offsets[2] + + frontier_indices = frontier_indices.begin() + frontier_partition_offsets[1] + keys_to_sort_per_iteration * i, - tmp_nbr_indices = tmp_nbr_indices.data(), - nbr_indices = nbr_indices.data()] __device__(size_t i) { + tmp_nbr_indices = + raft::device_span(tmp_nbr_indices.data(), tmp_nbr_indices.size()), + nbr_indices = + raft::device_span(nbr_indices.data(), nbr_indices.size())] __device__(size_t i) { auto key_idx = *(frontier_indices + i / K); auto sample_idx = static_cast(i % K); - *(nbr_indices + key_idx * K + sample_idx) = - *(tmp_nbr_indices + (i / K) * high_partition_oversampling_K + sample_idx); + nbr_indices[key_idx * K + sample_idx] = + tmp_nbr_indices[(i / K) * high_partition_oversampling_K + sample_idx]; }); } } - frontier_degrees.resize(0, handle.get_stream()); - frontier_degrees.shrink_to_fit(handle.get_stream()); - return nbr_indices; } -template -void compute_homogeneous_biased_sampling_index_without_replacement( +template +rmm::device_uvector compute_heterogeneous_uniform_sampling_index_without_replacement( raft::handle_t const& handle, - std::optional> - input_frontier_indices, // input_degree_offsets & input_biases - // are already 
packed if std::nullopt - raft::device_span input_degree_offsets, - raft::device_span input_biases, // bias 0 edges can't be selected - std::optional> - output_frontier_indices, // output_nbr_indices is already packed if std::nullopt - raft::device_span output_nbr_indices, - std::optional> output_keys, + raft::device_span frontier_per_type_degrees, raft::random::RngState& rng_state, - size_t K, - bool jump) + raft::device_span K_offsets, + size_t K_sum) { - if (jump) { // Algorithm A-ExpJ - CUGRAPH_FAIL( - "unimplemented."); // FIXME: this could be faster especially for high-degree vertices - } else { // Algorithm A-Res - // update packed input degree offsets if input_frontier_indices.has_value() is true + using bias_t = double; - auto packed_input_degree_offsets = - input_frontier_indices ? std::make_optional>( - (*input_frontier_indices).size() + 1, handle.get_stream()) - : std::nullopt; - if (packed_input_degree_offsets) { - (*packed_input_degree_offsets).set_element_to_zero_async(0, handle.get_stream()); - auto degree_first = thrust::make_transform_iterator( - (*input_frontier_indices).begin(), - cuda::proclaim_return_type([input_degree_offsets] __device__(size_t i) { - return input_degree_offsets[i + 1] - input_degree_offsets[i]; - })); - thrust::inclusive_scan(handle.get_thrust_policy(), - degree_first, - degree_first + (*input_frontier_indices).size(), - (*packed_input_degree_offsets).begin() + 1); + size_t max_K{0}; + { + std::vector h_K_offsets(K_offsets.size()); + raft::update_host(h_K_offsets.data(), K_offsets.data(), K_offsets.size(), handle.get_stream()); + handle.sync_stream(); + for (size_t i = 0; i < h_K_offsets.size() - 1; ++i) { + max_K = std::max(max_K, h_K_offsets[i + 1] - h_K_offsets[i]); } + } - // generate (key, nbr_index) pairs + edge_t low_partition_degree_range_last = static_cast(max_K * 10); // tuning parameter + assert(low_partition_degree_range_last > max_K); + size_t high_partition_oversampling_K = std::max(max_K * 2, max_K + 16); // 
tuning parameter + assert(high_partition_oversampling_K > max_K); - size_t num_pairs{}; - raft::update_host( - &num_pairs, - packed_input_degree_offsets - ? (*packed_input_degree_offsets).data() + (*packed_input_degree_offsets).size() - 1 - : input_degree_offsets.data() + input_degree_offsets.size() - 1, - 1, - handle.get_stream()); - handle.sync_stream(); + auto num_edge_types = static_cast(K_offsets.size() - 1); - auto approx_edges_to_process_per_iteration = - static_cast(handle.get_device_properties().multiProcessorCount) * - (1 << 18) /* tuning parameter */; - auto [chunk_offsets, element_offsets] = cugraph::detail::compute_offset_aligned_element_chunks( + std::vector thresholds(num_edge_types); + for (size_t i = 0; i < num_edge_types; ++i) { + thresholds[i] = low_partition_degree_range_last + 1; + } + + auto [frontier_indices, frontier_edge_types, frontier_partition_offsets] = + partition_v_frontier_per_value_idx( handle, - raft::device_span( - packed_input_degree_offsets ? (*packed_input_degree_offsets).data() - : input_degree_offsets.data(), - packed_input_degree_offsets ? 
(*packed_input_degree_offsets).size() - : input_degree_offsets.size()), - num_pairs, - approx_edges_to_process_per_iteration); - auto num_chunks = chunk_offsets.size() - 1; - for (size_t i = 0; i < num_chunks; ++i) { - auto num_chunk_pairs = element_offsets[i + 1] - element_offsets[i]; - rmm::device_uvector keys(num_chunk_pairs, handle.get_stream()); + frontier_per_type_degrees.begin(), + frontier_per_type_degrees.end(), + raft::host_span(thresholds.data(), thresholds.size()), + num_edge_types); - cugraph::detail::uniform_random_fill( - handle.get_stream(), keys.data(), keys.size(), bias_t{0.0}, bias_t{1.0}, rng_state); + rmm::device_uvector per_type_nbr_indices( + (frontier_per_type_degrees.size() / num_edge_types) * K_sum, handle.get_stream()); - if (packed_input_degree_offsets) { - auto bias_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(element_offsets[i]), - cuda::proclaim_return_type( - [input_biases, - input_degree_offsets, - frontier_indices = *input_frontier_indices, - packed_input_degree_offsets = raft::device_span( - (*packed_input_degree_offsets).data(), - (*packed_input_degree_offsets).size())] __device__(size_t i) { - auto it = thrust::upper_bound(thrust::seq, - packed_input_degree_offsets.begin() + 1, - packed_input_degree_offsets.end(), - i); - auto idx = thrust::distance(packed_input_degree_offsets.begin() + 1, it); - auto frontier_idx = frontier_indices[idx]; + auto low_partition_size = frontier_partition_offsets[1]; + if (low_partition_size > 0) { + sample_nbr_index_without_replacement( + handle, + frontier_per_type_degrees, + std::make_optional(std::make_tuple( + raft::device_span(frontier_indices.data(), frontier_indices.size()), + raft::device_span(frontier_edge_types.data(), + frontier_edge_types.size()))), + raft::device_span(per_type_nbr_indices.data(), per_type_nbr_indices.size()), + rng_state, + K_offsets, + K_sum); + } + + auto high_partition_size = frontier_partition_offsets[2] - frontier_partition_offsets[1]; 
+ if (high_partition_size > 0) { + // to limit memory footprint ((1 << 20) is a tuning parameter), std::max for forward progress + // guarantee when high_partition_oversampling_K is exorbitantly large + auto keys_to_sort_per_iteration = + std::max(static_cast(handle.get_device_properties().multiProcessorCount * (1 << 20)) / + high_partition_oversampling_K, + size_t{1}); + + rmm::device_uvector tmp_per_type_nbr_indices( + keys_to_sort_per_iteration * high_partition_oversampling_K, handle.get_stream()); + assert(high_partition_oversampling_K * 2 <= + static_cast(std::numeric_limits::max())); + rmm::device_uvector tmp_sample_indices( + tmp_per_type_nbr_indices.size(), + handle.get_stream()); // sample indices ([0, high_partition_oversampling_K)) within a + // segment (one segment per key) + + rmm::device_uvector segment_sorted_tmp_per_type_nbr_indices( + tmp_per_type_nbr_indices.size(), handle.get_stream()); + rmm::device_uvector segment_sorted_tmp_sample_indices(tmp_per_type_nbr_indices.size(), + handle.get_stream()); + + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + size_t tmp_storage_bytes{0}; + + auto num_chunks = + (high_partition_size + keys_to_sort_per_iteration - 1) / keys_to_sort_per_iteration; + for (size_t i = 0; i < num_chunks; ++i) { + size_t num_segments = + std::min(keys_to_sort_per_iteration, high_partition_size - keys_to_sort_per_iteration * i); + + rmm::device_uvector unique_counts(num_segments, handle.get_stream()); + + auto segment_frontier_index_first = + frontier_indices.begin() + frontier_partition_offsets[1] + keys_to_sort_per_iteration * i; + auto segment_frontier_type_first = frontier_edge_types.begin() + + frontier_partition_offsets[1] + + keys_to_sort_per_iteration * i; + while (true) { + std::optional> retry_segment_indices{std::nullopt}; + std::optional> retry_per_type_degrees{std::nullopt}; + std::optional> retry_per_type_nbr_indices{std::nullopt}; + std::optional> retry_sample_indices{std::nullopt}; + std::optional> 
retry_segment_sorted_per_type_nbr_indices{ + std::nullopt}; + std::optional> retry_segment_sorted_sample_indices{ + std::nullopt}; + + auto segment_frontier_per_type_degree_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(frontier_indices.begin(), frontier_edge_types.begin()) + + frontier_partition_offsets[1] + keys_to_sort_per_iteration * i, + cuda::proclaim_return_type( + [frontier_per_type_degrees, num_edge_types] __device__(auto pair) { + return frontier_per_type_degrees[thrust::get<0>(pair) * num_edge_types + + thrust::get<1>(pair)]; + })); + + if (retry_segment_indices) { + retry_per_type_degrees = + rmm::device_uvector((*retry_segment_indices).size(), handle.get_stream()); + thrust::gather(handle.get_thrust_policy(), + (*retry_segment_indices).begin(), + (*retry_segment_indices).end(), + segment_frontier_per_type_degree_first, + (*retry_per_type_degrees).begin()); + retry_per_type_nbr_indices = rmm::device_uvector( + (*retry_segment_indices).size() * high_partition_oversampling_K, handle.get_stream()); + retry_sample_indices = + rmm::device_uvector((*retry_per_type_nbr_indices).size(), handle.get_stream()); + retry_segment_sorted_per_type_nbr_indices = + rmm::device_uvector((*retry_per_type_nbr_indices).size(), handle.get_stream()); + retry_segment_sorted_sample_indices = + rmm::device_uvector((*retry_per_type_nbr_indices).size(), handle.get_stream()); + } + + if (retry_segment_indices) { + sample_nbr_index_with_replacement( + handle, + raft::device_span((*retry_per_type_degrees).data(), + (*retry_per_type_degrees).size()), + std::nullopt, + raft::device_span((*retry_per_type_nbr_indices).data(), + (*retry_per_type_nbr_indices).size()), + rng_state, + high_partition_oversampling_K); + } else { + rmm::device_uvector tmp_per_type_degrees(num_segments, handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + segment_frontier_per_type_degree_first, + segment_frontier_per_type_degree_first + num_segments, + 
tmp_per_type_degrees.begin()); + sample_nbr_index_with_replacement( + handle, + raft::device_span(tmp_per_type_degrees.data(), + tmp_per_type_degrees.size()), + std::nullopt, + raft::device_span(tmp_per_type_nbr_indices.data(), + tmp_per_type_nbr_indices.size()), + rng_state, + high_partition_oversampling_K); + } + + if (retry_segment_indices) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator((*retry_segment_indices).size() * + high_partition_oversampling_K), + [high_partition_oversampling_K, + unique_counts = unique_counts.data(), + segment_sorted_tmp_per_type_nbr_indices = + segment_sorted_tmp_per_type_nbr_indices.data(), + retry_segment_indices = (*retry_segment_indices).data(), + retry_per_type_nbr_indices = (*retry_per_type_nbr_indices).data(), + retry_sample_indices = (*retry_sample_indices).data()] __device__(size_t i) { + auto segment_idx = retry_segment_indices[i / high_partition_oversampling_K]; + auto sample_idx = static_cast(i % high_partition_oversampling_K); + auto unique_count = unique_counts[segment_idx]; + auto output_first = thrust::make_zip_iterator( + thrust::make_tuple(retry_per_type_nbr_indices, retry_sample_indices)); + // sample index for the previously selected neighbor indices should be smaller than + // the new candidates to ensure that the previously selected neighbor indices will + // be selected again + if (sample_idx < unique_count) { + *(output_first + i) = + thrust::make_tuple(segment_sorted_tmp_per_type_nbr_indices + [segment_idx * high_partition_oversampling_K + sample_idx], + static_cast(sample_idx)); + } else { + *(output_first + i) = + thrust::make_tuple(retry_per_type_nbr_indices[i], + high_partition_oversampling_K + (sample_idx - unique_count)); + } + }); + } else { + thrust::tabulate( + handle.get_thrust_policy(), + tmp_sample_indices.begin(), + tmp_sample_indices.begin() + num_segments * high_partition_oversampling_K, + 
[high_partition_oversampling_K] __device__(size_t i) { + return static_cast(i % high_partition_oversampling_K); + }); + } + + // sort the (sample neighbor index, sample index) pairs (key: sample neighbor index) + + cub::DeviceSegmentedSort::SortPairs( + static_cast(nullptr), + tmp_storage_bytes, + retry_segment_indices ? (*retry_per_type_nbr_indices).data() + : tmp_per_type_nbr_indices.data(), + retry_segment_indices ? (*retry_segment_sorted_per_type_nbr_indices).data() + : segment_sorted_tmp_per_type_nbr_indices.data(), + retry_segment_indices ? (*retry_sample_indices).data() : tmp_sample_indices.data(), + retry_segment_indices ? (*retry_segment_sorted_sample_indices).data() + : segment_sorted_tmp_sample_indices.data(), + (retry_segment_indices ? (*retry_segment_indices).size() : num_segments) * + high_partition_oversampling_K, + retry_segment_indices ? (*retry_segment_indices).size() : num_segments, + thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}), + multiplier_t{high_partition_oversampling_K}), + thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}), + multiplier_t{high_partition_oversampling_K}), + handle.get_stream()); + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + cub::DeviceSegmentedSort::SortPairs( + d_tmp_storage.data(), + tmp_storage_bytes, + retry_segment_indices ? (*retry_per_type_nbr_indices).data() + : tmp_per_type_nbr_indices.data(), + retry_segment_indices ? (*retry_segment_sorted_per_type_nbr_indices).data() + : segment_sorted_tmp_per_type_nbr_indices.data(), + retry_segment_indices ? (*retry_sample_indices).data() : tmp_sample_indices.data(), + retry_segment_indices ? (*retry_segment_sorted_sample_indices).data() + : segment_sorted_tmp_sample_indices.data(), + (retry_segment_indices ? (*retry_segment_indices).size() : num_segments) * + high_partition_oversampling_K, + retry_segment_indices ? 
(*retry_segment_indices).size() : num_segments, + thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}), + multiplier_t{high_partition_oversampling_K}), + thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}), + multiplier_t{high_partition_oversampling_K}), + handle.get_stream()); + + // count the number of unique neighbor indices + + if (retry_segment_indices) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator((*retry_segment_indices).size()), + [high_partition_oversampling_K, + unique_counts = unique_counts.data(), + retry_segment_indices = (*retry_segment_indices).data(), + retry_segment_sorted_pair_first = thrust::make_zip_iterator( + thrust::make_tuple((*retry_segment_sorted_per_type_nbr_indices).begin(), + (*retry_segment_sorted_sample_indices).begin())), + segment_sorted_pair_first = thrust::make_zip_iterator(thrust::make_tuple( + segment_sorted_tmp_per_type_nbr_indices.begin(), + segment_sorted_tmp_sample_indices.begin()))] __device__(size_t i) { + auto unique_count = static_cast(thrust::distance( + retry_segment_sorted_pair_first + high_partition_oversampling_K * i, + thrust::unique( + thrust::seq, + retry_segment_sorted_pair_first + high_partition_oversampling_K * i, + retry_segment_sorted_pair_first + high_partition_oversampling_K * (i + 1), + [] __device__(auto lhs, auto rhs) { + return thrust::get<0>(lhs) == thrust::get<0>(rhs); + }))); + auto segment_idx = retry_segment_indices[i]; + unique_counts[segment_idx] = unique_count; + thrust::copy( + thrust::seq, + retry_segment_sorted_pair_first + high_partition_oversampling_K * i, + retry_segment_sorted_pair_first + high_partition_oversampling_K * i + unique_count, + segment_sorted_pair_first + high_partition_oversampling_K * segment_idx); + }); + } else { + thrust::tabulate( + handle.get_thrust_policy(), + unique_counts.begin(), + unique_counts.end(), + [high_partition_oversampling_K, + 
segment_sorted_pair_first = thrust::make_zip_iterator(thrust::make_tuple( + segment_sorted_tmp_per_type_nbr_indices.begin(), + segment_sorted_tmp_sample_indices.begin()))] __device__(size_t i) { + return static_cast(thrust::distance( + segment_sorted_pair_first + high_partition_oversampling_K * i, + thrust::unique(thrust::seq, + segment_sorted_pair_first + high_partition_oversampling_K * i, + segment_sorted_pair_first + high_partition_oversampling_K * (i + 1), + [] __device__(auto lhs, auto rhs) { + return thrust::get<0>(lhs) == thrust::get<0>(rhs); + }))); + }); + } + + auto pair_first = + thrust::make_zip_iterator(unique_counts.begin(), segment_frontier_type_first); + auto num_retry_segments = thrust::count_if( + handle.get_thrust_policy(), + pair_first, + pair_first + unique_counts.size(), + [K_offsets] __device__(auto pair) { + auto count = thrust::get<0>(pair); + auto type = thrust::get<1>(pair); + return count < static_cast(K_offsets[type + 1] - K_offsets[type]); + }); + if (num_retry_segments > 0) { + retry_segment_indices = + rmm::device_uvector(num_retry_segments, handle.get_stream()); + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_segments), + (*retry_segment_indices).begin(), + [unique_counts = + raft::device_span(unique_counts.data(), unique_counts.size()), + segment_frontier_type_first, + K_offsets] __device__(size_t i) { + auto type = *(segment_frontier_type_first + i); + return unique_counts[i] < static_cast(K_offsets[type + 1] - K_offsets[type]); + }); + } else { + break; + } + } + + // sort the segment-sorted (sample index, sample per-type neighbor index) pairs (key: sample + // index) + + cub::DeviceSegmentedSort::SortPairs( + static_cast(nullptr), + tmp_storage_bytes, + segment_sorted_tmp_sample_indices.data(), + tmp_sample_indices.data(), + segment_sorted_tmp_per_type_nbr_indices.data(), + tmp_per_type_nbr_indices.data(), + num_segments * 
high_partition_oversampling_K, + num_segments, + thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}), + multiplier_t{high_partition_oversampling_K}), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [high_partition_oversampling_K, unique_counts = unique_counts.data()] __device__( + size_t i) { return i * high_partition_oversampling_K + unique_counts[i]; })), + handle.get_stream()); + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + cub::DeviceSegmentedSort::SortPairs( + d_tmp_storage.data(), + tmp_storage_bytes, + segment_sorted_tmp_sample_indices.data(), + tmp_sample_indices.data(), + segment_sorted_tmp_per_type_nbr_indices.data(), + tmp_per_type_nbr_indices.data(), + num_segments * high_partition_oversampling_K, + num_segments, + thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}), + multiplier_t{high_partition_oversampling_K}), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [high_partition_oversampling_K, unique_counts = unique_counts.data()] __device__( + size_t i) { return i * high_partition_oversampling_K + unique_counts[i]; })), + handle.get_stream()); + + // copy the neighbor indices back to nbr_indices + + rmm::device_uvector output_count_offsets(num_segments + 1, handle.get_stream()); + output_count_offsets.set_element_to_zero_async(0, handle.get_stream()); + auto k_first = thrust::make_transform_iterator( + segment_frontier_type_first, + cuda::proclaim_return_type( + [K_offsets] __device__(auto type) { return K_offsets[type + 1] - K_offsets[type]; })); + thrust::inclusive_scan(handle.get_thrust_policy(), + k_first, + k_first + num_segments, + output_count_offsets.begin() + 1); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + 
thrust::make_counting_iterator(output_count_offsets.back_element(handle.get_stream())), + [high_partition_oversampling_K, + segment_frontier_index_first, + segment_frontier_type_first, + tmp_per_type_nbr_indices = raft::device_span( + tmp_per_type_nbr_indices.data(), tmp_per_type_nbr_indices.size()), + output_count_offsets = raft::device_span(output_count_offsets.data(), + output_count_offsets.size()), + per_type_nbr_indices = + raft::device_span(per_type_nbr_indices.data(), per_type_nbr_indices.size()), + K_offsets, + K_sum] __device__(size_t i) { + auto idx = thrust::distance( + output_count_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, output_count_offsets.begin() + 1, output_count_offsets.end(), i)); + auto frontier_idx = *(segment_frontier_index_first + idx); + auto type = *(segment_frontier_type_first + idx); + auto sample_idx = static_cast(i - output_count_offsets[idx]); + *(per_type_nbr_indices + frontier_idx * K_sum + K_offsets[type] + sample_idx) = + *(tmp_per_type_nbr_indices + idx * high_partition_oversampling_K + sample_idx); + }); + } + } + + return per_type_nbr_indices; +} + +template +void compute_homogeneous_biased_sampling_index_without_replacement( + raft::handle_t const& handle, + std::optional> + input_frontier_indices, // input_degree_offsets & input_biases + // are already packed if std::nullopt + raft::device_span input_degree_offsets, + raft::device_span input_biases, // bias 0 edges can't be selected + std::optional> + output_frontier_indices, // output_nbr_indices is already packed if std::nullopt + raft::device_span output_nbr_indices, + std::optional> output_keys, + raft::random::RngState& rng_state, + size_t K, + bool jump) +{ + if (jump) { // Algorithm A-ExpJ + CUGRAPH_FAIL( + "unimplemented."); // FIXME: this could be faster especially for high-degree vertices + } else { // Algorithm A-Res + // update packed input degree offsets if input_frontier_indices.has_value() is true + + auto packed_input_degree_offsets = + 
input_frontier_indices ? std::make_optional>( + (*input_frontier_indices).size() + 1, handle.get_stream()) + : std::nullopt; + if (packed_input_degree_offsets) { + (*packed_input_degree_offsets).set_element_to_zero_async(0, handle.get_stream()); + auto degree_first = thrust::make_transform_iterator( + (*input_frontier_indices).begin(), + cuda::proclaim_return_type([input_degree_offsets] __device__(size_t i) { + return input_degree_offsets[i + 1] - input_degree_offsets[i]; + })); + thrust::inclusive_scan(handle.get_thrust_policy(), + degree_first, + degree_first + (*input_frontier_indices).size(), + (*packed_input_degree_offsets).begin() + 1); + } + + // generate (key, nbr_index) pairs + + size_t num_pairs{}; + raft::update_host( + &num_pairs, + packed_input_degree_offsets + ? (*packed_input_degree_offsets).data() + (*packed_input_degree_offsets).size() - 1 + : input_degree_offsets.data() + input_degree_offsets.size() - 1, + 1, + handle.get_stream()); + handle.sync_stream(); + + auto approx_edges_to_process_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 18) /* tuning parameter */; + auto [chunk_offsets, element_offsets] = cugraph::detail::compute_offset_aligned_element_chunks( + handle, + raft::device_span( + packed_input_degree_offsets ? (*packed_input_degree_offsets).data() + : input_degree_offsets.data(), + packed_input_degree_offsets ? 
(*packed_input_degree_offsets).size() + : input_degree_offsets.size()), + num_pairs, + approx_edges_to_process_per_iteration); + auto num_chunks = chunk_offsets.size() - 1; + for (size_t i = 0; i < num_chunks; ++i) { + auto num_chunk_pairs = element_offsets[i + 1] - element_offsets[i]; + rmm::device_uvector keys(num_chunk_pairs, handle.get_stream()); + + cugraph::detail::uniform_random_fill( + handle.get_stream(), keys.data(), keys.size(), bias_t{0.0}, bias_t{1.0}, rng_state); + + if (packed_input_degree_offsets) { + auto bias_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(element_offsets[i]), + cuda::proclaim_return_type( + [input_biases, + input_degree_offsets, + frontier_indices = *input_frontier_indices, + packed_input_degree_offsets = raft::device_span( + (*packed_input_degree_offsets).data(), + (*packed_input_degree_offsets).size())] __device__(size_t i) { + auto it = thrust::upper_bound(thrust::seq, + packed_input_degree_offsets.begin() + 1, + packed_input_degree_offsets.end(), + i); + auto idx = thrust::distance(packed_input_degree_offsets.begin() + 1, it); + auto frontier_idx = frontier_indices[idx]; return input_biases[input_degree_offsets[frontier_idx] + (i - packed_input_degree_offsets[idx])]; })); @@ -1566,8 +2475,55 @@ compute_aggregate_local_frontier_local_degrees(raft::handle_t const& handle, return aggregate_local_frontier_local_degrees; } -// return (bias segmented local inclusive sums, segment offsets) pairs for each key in th eaggregate -// local frontier +template +rmm::device_uvector compute_aggregate_local_frontier_per_type_local_degrees( + raft::handle_t const& handle, + raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, + raft::host_span local_frontier_offsets, + raft::device_span aggregate_local_frontier_unique_key_edge_types, + raft::device_span aggregate_local_frontier_unique_key_local_degree_offsets, + raft::host_span local_frontier_unique_key_offsets, + size_t num_edge_types) +{ + auto 
aggregate_local_frontier_per_type_local_degrees = rmm::device_uvector( + local_frontier_offsets.back() * num_edge_types, handle.get_stream()); + for (size_t i = 0; i < local_frontier_offsets.size() - 1; ++i) { + thrust::tabulate( + handle.get_thrust_policy(), + aggregate_local_frontier_per_type_local_degrees.begin() + + local_frontier_offsets[i] * num_edge_types, + aggregate_local_frontier_per_type_local_degrees.begin() + + local_frontier_offsets[i + 1] * num_edge_types, + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), + aggregate_local_frontier_unique_key_edge_types = + raft::device_span(aggregate_local_frontier_unique_key_edge_types.data(), + aggregate_local_frontier_unique_key_edge_types.size()), + unique_key_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i], + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + 1), + num_edge_types] __device__(size_t i) { + auto key_idx = i / num_edge_types; + auto edge_type = static_cast(i % num_edge_types); + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + auto start_offset = unique_key_local_degree_offsets[key_idx]; + auto end_offset = unique_key_local_degree_offsets[key_idx + 1]; + auto edge_type_first = + aggregate_local_frontier_unique_key_edge_types.begin() + start_offset; + auto edge_type_last = aggregate_local_frontier_unique_key_edge_types.begin() + end_offset; + return static_cast(thrust::distance( + thrust::lower_bound(thrust::seq, edge_type_first, edge_type_last, edge_type), + thrust::upper_bound(thrust::seq, edge_type_first, edge_type_last, edge_type))); + }); + } + + return aggregate_local_frontier_per_type_local_degrees; +} + +// return (bias values, local neighbor indices with non-zero bias values, segment offsets) pairs 
 for
+// each key in the aggregate local frontier
 template 
+std::tuple,
+ rmm::device_uvector>
+compute_aggregate_local_frontier_edge_types(
+ raft::handle_t const& handle,
+ GraphViewType const& graph_view,
+ KeyIterator aggregate_local_frontier_key_first,
+ EdgeTypeInputWrapper edge_type_input,
+ raft::host_span local_frontier_displacements,
+ raft::host_span local_frontier_sizes)
+{
+ using vertex_t = typename GraphViewType::vertex_type;
+ using edge_t = typename GraphViewType::edge_type;
+ using key_t = typename thrust::iterator_traits::value_type;
+
+ auto [aggregate_local_frontier_types, aggregate_local_frontier_local_degree_offsets] =
+ transform_v_frontier_e(
+ handle,
+ graph_view,
+ aggregate_local_frontier_key_first,
+ edge_src_dummy_property_t{}.view(),
+ edge_dst_dummy_property_t{}.view(),
+ edge_type_input,
+ [] __device__(auto, auto, auto, auto, auto e_val) { return e_val; },
+#if 1 // FIXME: better update shuffle_values to take host_span
+ std::vector(local_frontier_displacements.begin(), local_frontier_displacements.end()),
+ std::vector(local_frontier_sizes.begin(), local_frontier_sizes.end())
+#else
+ local_frontier_displacements,
+ local_frontier_sizes
+#endif
+ );
+
+ return std::make_tuple(std::move(aggregate_local_frontier_types),
+ std::move(aggregate_local_frontier_local_degree_offsets));
+}
+
 template 
 std::tuple, rmm::device_uvector, std::vector>
 shuffle_and_compute_local_nbr_values(
@@ -1912,7 +2912,7 @@ shuffle_and_compute_local_nbr_values(
 intra_partition_displacements.begin(),
 sample_local_nbr_values.begin(),
 key_indices.begin())),
- convert_pair_to_quadruplet_t{
+ convert_value_key_pair_t{
 raft::device_span(frontier_partitioned_value_local_sum_displacements.data(),
 frontier_partitioned_value_local_sum_displacements.size()),
 raft::device_span(d_tx_counts.data(), d_tx_counts.size()),
@@ -2012,7 +3012,7 @@ shuffle_and_compute_per_type_local_nbr_values(
 sample_per_type_local_nbr_values.begin(),
 edge_types.begin(),
 key_indices.begin()),
- 
convert_pair_to_5tuple_t{ + convert_per_type_value_key_pair_t{ raft::device_span( frontier_partitioned_per_type_value_local_sum_displacements.data(), frontier_partitioned_per_type_value_local_sum_displacements.size()), @@ -2080,17 +3080,100 @@ shuffle_and_compute_per_type_local_nbr_values( std::move(local_frontier_sample_offsets)); } +template +rmm::device_uvector compute_local_nbr_indices_from_per_type_local_nbr_indices( + raft::handle_t const& handle, + raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, + raft::host_span local_frontier_offsets, + raft::device_span aggregate_local_frontier_unique_key_per_type_local_degree_offsets, + raft::host_span local_frontier_unique_key_offsets, + std::optional, raft::device_span>> + edge_type_key_idx_pairs, + rmm::device_uvector&& per_type_local_nbr_indices, + raft::host_span local_frontier_sample_offsets, + raft::device_span K_offsets, + size_t K_sum) +{ + auto num_edge_types = static_cast(K_offsets.size() - 1); + + auto local_nbr_indices = std::move(per_type_local_nbr_indices); + if (edge_type_key_idx_pairs) { + auto triplet_first = thrust::make_zip_iterator(local_nbr_indices.begin(), + std::get<0>(*edge_type_key_idx_pairs).begin(), + std::get<1>(*edge_type_key_idx_pairs).begin()); + for (size_t i = 0; i < local_frontier_sample_offsets.size() - 1; ++i) { + thrust::transform( + handle.get_thrust_policy(), + triplet_first + local_frontier_sample_offsets[i], + triplet_first + local_frontier_sample_offsets[i + 1], + local_nbr_indices.begin(), + cuda::proclaim_return_type( + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), + unique_key_per_type_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i] * num_edge_types, + (local_frontier_unique_key_offsets[i + 1] - 
local_frontier_unique_key_offsets[i]) * + num_edge_types + + 1), + num_edge_types] __device__(auto triplet) { + auto per_type_local_nbr_idx = thrust::get<0>(triplet); + auto type = thrust::get<1>(triplet); + auto key_idx = thrust::get<2>(triplet); + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + auto type_start_offset = static_cast( + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + type] - + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types]); + return type_start_offset + per_type_local_nbr_idx; + })); + } + } else { + auto pair_first = thrust::make_zip_iterator(local_nbr_indices.begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::transform( + handle.get_thrust_policy(), + pair_first, + pair_first + local_nbr_indices.size(), + local_nbr_indices.begin(), + cuda::proclaim_return_type( + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data(), + aggregate_local_frontier_key_idx_to_unique_key_idx.size()), + unique_key_per_type_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.size()), + K_offsets, + K_sum] __device__(auto pair) { + auto num_edge_types = static_cast(K_offsets.size() - 1); + auto per_type_local_nbr_idx = thrust::get<0>(pair); + auto i = thrust::get<1>(pair); + auto key_idx = i / K_sum; + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + auto type = static_cast(thrust::distance( + K_offsets.begin() + 1, + thrust::upper_bound(thrust::seq, K_offsets.begin() + 1, K_offsets.end(), i % K_sum))); + auto type_start_offset = static_cast( + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + type] - + unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types]); + return type_start_offset + per_type_local_nbr_idx; + })); + } + + return local_nbr_indices; +} + 
template std::tuple /* local_nbr_indices */, std::optional> /* key_indices */, std::vector /* local_frontier_sample_offsets */> biased_sample_with_replacement( raft::handle_t const& handle, - raft::host_span local_frontier_offsets, raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, - raft::host_span local_frontier_unique_key_offsets, + raft::host_span local_frontier_offsets, raft::device_span aggregate_local_frontier_unique_key_biases, raft::device_span aggregate_local_frontier_unique_key_per_type_local_degree_offsets, + raft::host_span local_frontier_unique_key_offsets, raft::random::RngState& rng_state, raft::host_span Ks) { @@ -2202,7 +3285,7 @@ biased_sample_with_replacement( frontier_per_type_bias_sums = std::move(aggregate_local_frontier_per_type_bias_local_sums); } - // sample & compute local neighbor indices + // generate & shuffle random numbers rmm::device_uvector sample_random_numbers( (local_frontier_offsets[minor_comm_rank + 1] - local_frontier_offsets[minor_comm_rank]) * K_sum, @@ -2266,6 +3349,8 @@ biased_sample_with_replacement( local_frontier_sample_offsets = {size_t{0}, sample_local_random_numbers.size()}; } + // compute local neighbor indices from shuffled per-tyep local random numbers + local_nbr_indices.resize(sample_local_random_numbers.size(), handle.get_stream()); for (size_t i = 0; i < num_local_edge_partitions; ++i) { thrust::tabulate( @@ -2344,11 +3429,11 @@ std::tuple /* local_nbr_indices */, std::vector /* local_frontier_sample_offsets */> homogeneous_biased_sample_without_replacement( raft::handle_t const& handle, - raft::host_span local_frontier_offsets, raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, - raft::host_span local_frontier_unique_key_offsets, + raft::host_span local_frontier_offsets, raft::device_span aggregate_local_frontier_unique_key_biases, raft::device_span aggregate_local_frontier_unique_key_local_degree_offsets, + raft::host_span local_frontier_unique_key_offsets, 
raft::random::RngState& rng_state, size_t K) { @@ -2939,11 +4024,11 @@ std::tuple /* local_nbr_indices */, std::vector /* local_frontier_sample_offsets */> heterogeneous_biased_sample_without_replacement( raft::handle_t const& handle, - raft::host_span local_frontier_offsets, raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, - raft::host_span local_frontier_unique_key_offsets, + raft::host_span local_frontier_offsets, raft::device_span aggregate_local_frontier_unique_key_biases, raft::device_span aggregate_local_frontier_unique_key_per_type_local_degree_offsets, + raft::host_span local_frontier_unique_key_offsets, raft::random::RngState& rng_state, raft::host_span Ks) { @@ -3031,6 +4116,8 @@ heterogeneous_biased_sample_without_replacement( d_K_offsets.data(), h_K_offsets.data(), h_K_offsets.size(), handle.get_stream()); } + rmm::device_uvector per_type_local_nbr_indices(0, handle.get_stream()); + std::optional> edge_types{std::nullopt}; if (minor_comm_size > 1) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); @@ -3086,7 +4173,7 @@ heterogeneous_biased_sample_without_replacement( minor_comm, thrust::make_zip_iterator(frontier_indices.begin(), frontier_edge_types.begin()) + frontier_partition_offsets[1], - get_dataframe_buffer_beign(aggregate_mid_local_frontier_index_type_pairs), + get_dataframe_buffer_begin(aggregate_mid_local_frontier_index_type_pairs), mid_local_frontier_sizes, std::vector(mid_local_frontier_offsets.begin(), mid_local_frontier_offsets.end() - 1), @@ -3621,53 +4708,18 @@ heterogeneous_biased_sample_without_replacement( }); } - rmm::device_uvector per_type_local_nbr_indices(0, handle.get_stream()); - rmm::device_uvector edge_types(0, handle.get_stream()); std::tie(per_type_local_nbr_indices, edge_types, key_indices, local_frontier_sample_offsets) = shuffle_and_compute_per_type_local_nbr_values( - handle, - std::move(per_type_nbr_indices), - raft::device_span( - 
(*frontier_partitioned_per_type_local_degree_displacements).data(), - (*frontier_partitioned_per_type_local_degree_displacements).size()), - raft::device_span(d_K_offsets.data(), d_K_offsets.size()), - K_sum, - cugraph::invalid_edge_id_v); - - // per-type local neighbor indices => local neighbor indices - - local_nbr_indices = std::move(per_type_local_nbr_indices); - auto triplet_first = thrust::make_zip_iterator( - local_nbr_indices.begin(), edge_types.begin(), (*key_indices).begin()); - for (size_t i = 0; i < num_local_edge_partitions; ++i) { - thrust::transform( - handle.get_thrust_policy(), - triplet_first + local_frontier_sample_offsets[i], - triplet_first + local_frontier_sample_offsets[i + 1], - local_nbr_indices.begin(), - cuda::proclaim_return_type( - [key_idx_to_unique_key_idx = raft::device_span( - aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], - local_frontier_offsets[i + 1] - local_frontier_offsets[i]), - unique_key_per_type_local_degree_offsets = raft::device_span( - aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data() + - local_frontier_unique_key_offsets[i] * num_edge_types, - (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) * - num_edge_types + - 1)] __device__(auto triplet) { - auto per_type_local_nbr_idx = thrust::get<0>(triplet); - auto type = thrust::get<1>(triplet); - auto key_idx = thrust::get<2>(triplet); - auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; - auto type_start_offset = static_cast( - unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types + type] - - unique_key_per_type_local_degree_offsets[unique_key_idx * num_edge_types]); - return type_start_offset + per_type_local_nbr_idx; - })); - } + handle, + std::move(per_type_nbr_indices), + raft::device_span( + (*frontier_partitioned_per_type_local_degree_displacements).data(), + (*frontier_partitioned_per_type_local_degree_displacements).size()), + 
raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + K_sum, + cugraph::invalid_edge_id_v); } else { // minor_comm_size == 1 - rmm::device_uvector per_type_local_nbr_indices(local_frontier_offsets.back() * K_sum, - handle.get_stream()); + per_type_local_nbr_indices.resize(local_frontier_offsets.back() * K_sum, handle.get_stream()); // sample from low-degree vertices @@ -3754,40 +4806,104 @@ heterogeneous_biased_sample_without_replacement( } local_frontier_sample_offsets = std::vector{0, per_type_local_nbr_indices.size()}; + } - // per-type local neighbor indices => local neighbor indices + // per-type local neighbor indices => local neighbor indices - local_nbr_indices = std::move(per_type_local_nbr_indices); - { - auto pair_first = thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{0}), - local_nbr_indices.begin()); + assert(edge_types.has_value() == key_indices.has_value()); + local_nbr_indices = + compute_local_nbr_indices_from_per_type_local_nbr_indices( + handle, + raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), + aggregate_local_frontier_key_idx_to_unique_key_idx.size()), + local_frontier_offsets, + raft::device_span( + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.size()), + local_frontier_unique_key_offsets, + edge_types ? 
std::make_optional(std::make_tuple( + raft::device_span(edge_types.data(), edge_types.size()), + raft::device_span((*key_indices).data(), (*key_indices).size()))) + : std::nullopt, + std::move(per_type_local_nbr_indices), + local_frontier_sample_offsets, + raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + K_sum); + + return std::make_tuple( + std::move(local_nbr_indices), std::move(key_indices), std::move(local_frontier_sample_offsets)); +} + +template +rmm::device_uvector remap_local_nbr_indices( + raft::handle_t const& handle, + raft::device_span aggregate_local_frontier_key_idx_to_unique_key_idx, + raft::host_span local_frontier_offsets, + raft::device_span aggregate_local_frontier_unique_key_org_indices, + raft::device_span aggregate_local_frontier_unique_key_local_degree_offsets, + raft::host_span local_frontier_unique_key_offsets, + rmm::device_uvector&& local_nbr_indices, + std::optional> key_indices, + raft::host_span local_frontier_sample_offsets, + size_t K) +{ + if (key_indices) { + auto pair_first = thrust::make_zip_iterator(local_nbr_indices.begin(), (*key_indices).begin()); + for (size_t i = 0; i < local_frontier_offsets.size() - 1; ++i) { thrust::transform( handle.get_thrust_policy(), - pair_first, - pair_first + local_nbr_indices.size(), - local_nbr_indices.begin(), + pair_first + local_frontier_sample_offsets[i], + pair_first + local_frontier_sample_offsets[i + 1], + local_nbr_indices.begin() + local_frontier_sample_offsets[i], cuda::proclaim_return_type( - [per_type_local_degree_offsets = raft::device_span( - aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data(), - aggregate_local_frontier_unique_key_per_type_local_degree_offsets.size()), - K_offsets = raft::device_span(d_K_offsets.data(), d_K_offsets.size()), - K_sum] __device__(auto pair) { - auto i = thrust::get<0>(pair); - auto per_type_local_nbr_idx = thrust::get<1>(pair); - auto idx = i / K_sum; - auto type = static_cast(thrust::distance( - K_offsets.begin() + 
1, - thrust::upper_bound(thrust::seq, K_offsets.begin() + 1, K_offsets.end(), i % K_sum))); - auto type_start_offset = - static_cast(per_type_local_degree_offsets[idx * num_edge_types + type] - - per_type_local_degree_offsets[idx * num_edge_types]); - return type_start_offset + per_type_local_nbr_idx; + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), + unique_key_local_degree_offsets = raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i], + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + 1), + aggregate_local_frontier_unique_key_org_indices = raft::device_span( + aggregate_local_frontier_unique_key_org_indices.data(), + aggregate_local_frontier_unique_key_org_indices.size())] __device__(auto pair) { + auto local_nbr_idx = thrust::get<0>(pair); + auto key_idx = thrust::get<1>(pair); + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + return aggregate_local_frontier_unique_key_org_indices + [unique_key_local_degree_offsets[unique_key_idx] + local_nbr_idx]; + })); + } + } else { + auto pair_first = thrust::make_zip_iterator(local_nbr_indices.begin(), + thrust::make_counting_iterator(size_t{0})); + for (size_t i = 0; i < local_frontier_offsets.size() - 1; ++i) { + thrust::transform( + handle.get_thrust_policy(), + pair_first + local_frontier_sample_offsets[i], + pair_first + local_frontier_sample_offsets[i + 1], + local_nbr_indices.begin() + local_frontier_sample_offsets[i], + cuda::proclaim_return_type( + [key_idx_to_unique_key_idx = raft::device_span( + aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], + local_frontier_offsets[i + 1] - local_frontier_offsets[i]), + unique_key_local_degree_offsets = raft::device_span( + 
aggregate_local_frontier_unique_key_local_degree_offsets.data() + + local_frontier_unique_key_offsets[i], + (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + 1), + aggregate_local_frontier_unique_key_org_indices = raft::device_span( + aggregate_local_frontier_unique_key_org_indices.data(), + aggregate_local_frontier_unique_key_org_indices.size()), + K] __device__(auto pair) { + auto local_nbr_idx = thrust::get<0>(pair); + auto key_idx = thrust::get<1>(pair) / K; + auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; + return aggregate_local_frontier_unique_key_org_indices + [unique_key_local_degree_offsets[unique_key_idx] + local_nbr_idx]; })); } } - return std::make_tuple( - std::move(local_nbr_indices), std::move(key_indices), std::move(local_frontier_sample_offsets)); + return std::move(local_nbr_indices); } // skip conversion if local neighbor index is cugraph::invalid_edge_id_v @@ -3814,7 +4930,8 @@ rmm::device_uvector convert_to_unmasked_local local_frontier_unique_major_offsets] = compute_unique_keys(handle, aggregate_local_frontier_major_first, local_frontier_offsets); - // to avoid searching the entire neighbor list K times for high degree vertices with edge masking + // to avoid searching the entire neighbor list K times for high degree vertices with edge + // masking auto local_frontier_unique_major_valid_local_nbr_count_inclusive_sums = compute_valid_local_nbr_count_inclusive_sums( handle, @@ -3889,94 +5006,419 @@ homogeneous_uniform_sample_and_compute_local_nbr_indices( size_t K, bool with_replacement) { - using edge_t = typename GraphViewType::edge_type; using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; using key_t = typename thrust::iterator_traits::value_type; + using bias_t = double; + + int minor_comm_size{1}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + 
minor_comm_size = minor_comm.get_size(); + } + assert(minor_comm_size == graph_view.number_of_local_edge_partitions()); + + auto aggregate_local_frontier_major_first = + thrust_tuple_get_or_identity(aggregate_local_frontier_key_first); + + auto edge_mask_view = graph_view.edge_mask_view(); + + // 1. compute degrees + + rmm::device_uvector frontier_degrees(0, handle.get_stream()); + std::optional> frontier_partitioned_local_degree_displacements{ + std::nullopt}; + { + auto aggregate_local_frontier_local_degrees = compute_aggregate_local_frontier_local_degrees( + handle, graph_view, aggregate_local_frontier_major_first, local_frontier_offsets); + + if (minor_comm_size > 1) { + std::tie(frontier_degrees, frontier_partitioned_local_degree_displacements) = + compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( + handle, + raft::device_span(aggregate_local_frontier_local_degrees.data(), + aggregate_local_frontier_local_degrees.size()), + local_frontier_offsets, + 1); + aggregate_local_frontier_local_degrees.resize(0, handle.get_stream()); + aggregate_local_frontier_local_degrees.shrink_to_fit(handle.get_stream()); + } else { + frontier_degrees = std::move(aggregate_local_frontier_local_degrees); + } + } + + // 2. 
sample neighbor indices + + rmm::device_uvector nbr_indices(0, handle.get_stream()); + + if (with_replacement) { + if (frontier_degrees.size() > 0) { + nbr_indices.resize(frontier_degrees.size() * K, handle.get_stream()); + sample_nbr_index_with_replacement( + handle, + raft::device_span(frontier_degrees.data(), frontier_degrees.size()), + std::nullopt, + raft::device_span(nbr_indices.data(), nbr_indices.size()), + rng_state, + K); + frontier_degrees.resize(0, handle.get_stream()); + frontier_degrees.shrink_to_fit(handle.get_stream()); + } + } else { + nbr_indices = compute_homogeneous_uniform_sampling_index_without_replacement( + handle, + raft::device_span(frontier_degrees.data(), frontier_degrees.size()), + rng_state, + K); + } + frontier_degrees.resize(0, handle.get_stream()); + frontier_degrees.shrink_to_fit(handle.get_stream()); + + // 3. shuffle neighbor indices + + rmm::device_uvector local_nbr_indices(0, handle.get_stream()); + std::optional> key_indices{std::nullopt}; + std::vector local_frontier_sample_offsets{}; + if (minor_comm_size > 1) { + std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = + shuffle_and_compute_local_nbr_values( + handle, + std::move(nbr_indices), + raft::device_span((*frontier_partitioned_local_degree_displacements).data(), + (*frontier_partitioned_local_degree_displacements).size()), + K, + cugraph::invalid_edge_id_v); + } else { + local_nbr_indices = std::move(nbr_indices); + local_frontier_sample_offsets = {size_t{0}, local_nbr_indices.size()}; + } + + // 4. convert neighbor indices in the neighbor list considering edge mask to neighbor indices in + // the neighbor list ignoring edge mask + + if (edge_mask_view) { + local_nbr_indices = convert_to_unmasked_local_nbr_idx( + handle, + graph_view, + aggregate_local_frontier_major_first, + std::move(local_nbr_indices), + key_indices ? 
std::make_optional>((*key_indices).data(), + (*key_indices).size()) + : std::nullopt, + raft::host_span(local_frontier_sample_offsets.data(), + local_frontier_sample_offsets.size()), + local_frontier_offsets, + K); + } + + return std::make_tuple( + std::move(local_nbr_indices), std::move(key_indices), std::move(local_frontier_sample_offsets)); +} + +template +std::tuple, + std::optional>, + std::vector> +heterogeneous_uniform_sample_and_compute_local_nbr_indices( + raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyIterator aggregate_local_frontier_key_first, + EdgeTypeInputWrapper edge_type_input, + raft::host_span local_frontier_offsets, + raft::random::RngState& rng_state, + raft::host_span Ks, + bool with_replacement) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename thrust::iterator_traits::value_type; + using edge_type_t = typename EdgeTypeInputWrapper::value_type; + using bias_t = double; + int minor_comm_rank{0}; int minor_comm_size{1}; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + minor_comm_rank = minor_comm.get_rank(); minor_comm_size = minor_comm.get_size(); } + assert(minor_comm_size == graph_view.number_of_local_edge_partitions()); + + auto num_edge_types = static_cast(Ks.size()); + + auto edge_mask_view = graph_view.edge_mask_view(); + + auto K_sum = std::accumulate(Ks.begin(), Ks.end(), size_t{0}); + + rmm::device_uvector d_K_offsets(Ks.size() + 1, handle.get_stream()); + { + std::vector h_K_offsets(d_K_offsets.size()); + h_K_offsets[0] = 0; + std::inclusive_scan(Ks.begin(), Ks.end(), h_K_offsets.begin() + 1); + raft::update_device( + d_K_offsets.data(), h_K_offsets.data(), h_K_offsets.size(), handle.get_stream()); + } + + // 1. 
compute types for unique keys (to reduce memory footprint) + + auto [aggregate_local_frontier_unique_keys, + aggregate_local_frontier_key_idx_to_unique_key_idx, + local_frontier_unique_key_offsets] = + compute_unique_keys(handle, aggregate_local_frontier_key_first, local_frontier_offsets); + + auto [aggregate_local_frontier_unique_key_edge_types, + aggregate_local_frontier_unique_key_local_degree_offsets] = + compute_aggregate_local_frontier_edge_types( + handle, + graph_view, + get_dataframe_buffer_begin(aggregate_local_frontier_unique_keys), + edge_type_input, + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size())); + + // 2. Segment-sort (index, type) pairs on types (1 segment per key) + + rmm::device_uvector aggregate_local_frontier_unique_key_org_indices( + aggregate_local_frontier_unique_key_edge_types.size(), handle.get_stream()); + + { + // to limit memory footprint ((1 << 20) is a tuning parameter) + auto approx_nbrs_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount * (1 << 20)); + + auto [h_key_offsets, h_nbr_offsets] = detail::compute_offset_aligned_element_chunks( + handle, + raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + aggregate_local_frontier_unique_key_edge_types.size(), + approx_nbrs_to_sort_per_iteration); + + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + + auto num_chunks = h_key_offsets.size() - 1; + for (size_t i = 0; i < num_chunks; ++i) { + size_t tmp_storage_bytes{0}; + + rmm::device_uvector segment_sorted_types(h_nbr_offsets[i + 1] - h_nbr_offsets[i], + handle.get_stream()); + rmm::device_uvector nbr_indices(h_nbr_offsets[i + 1] - h_nbr_offsets[i], + handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + nbr_indices.begin(), + nbr_indices.end(), + [offsets = raft::device_span( + 
aggregate_local_frontier_unique_key_local_degree_offsets.data() + h_key_offsets[i], + (h_key_offsets[i + 1] - h_key_offsets[i]) + 1), + start_offset = h_nbr_offsets[i]] __device__(size_t i) { + auto idx = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), start_offset + i)); + return static_cast((start_offset + i) - offsets[idx]); + }); + raft::device_span segment_sorted_nbr_indices( + aggregate_local_frontier_unique_key_org_indices.data() + h_nbr_offsets[i], + h_nbr_offsets[i + 1] - h_nbr_offsets[i]); + + auto offset_first = thrust::make_transform_iterator( + aggregate_local_frontier_unique_key_local_degree_offsets.data() + h_key_offsets[i], + detail::shift_left_t{h_nbr_offsets[i]}); + cub::DeviceSegmentedSort::SortPairs( + static_cast(nullptr), + tmp_storage_bytes, + aggregate_local_frontier_unique_key_edge_types.begin() + h_nbr_offsets[i], + segment_sorted_types.begin(), + nbr_indices.begin(), + segment_sorted_nbr_indices.begin(), + h_nbr_offsets[i + 1] - h_nbr_offsets[i], + h_key_offsets[i + 1] - h_key_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + cub::DeviceSegmentedSort::SortPairs( + d_tmp_storage.data(), + tmp_storage_bytes, + aggregate_local_frontier_unique_key_edge_types.begin() + h_nbr_offsets[i], + segment_sorted_types.begin(), + nbr_indices.begin(), + segment_sorted_nbr_indices.begin(), + h_nbr_offsets[i + 1] - h_nbr_offsets[i], + h_key_offsets[i + 1] - h_key_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + } + } - auto aggregate_local_frontier_major_first = - thrust_tuple_get_or_identity(aggregate_local_frontier_key_first); - - auto edge_mask_view = graph_view.edge_mask_view(); + // 3. sample neighbor indices and shuffle neighbor indices - // 1. 
compute degrees + rmm::device_uvector local_nbr_indices(0, handle.get_stream()); + std::optional> key_indices{std::nullopt}; + std::vector local_frontier_sample_offsets{}; - rmm::device_uvector frontier_degrees(0, handle.get_stream()); - std::optional> frontier_partitioned_local_degree_displacements{ - std::nullopt}; { - auto aggregate_local_frontier_local_degrees = compute_aggregate_local_frontier_local_degrees( - handle, graph_view, aggregate_local_frontier_major_first, local_frontier_offsets); + auto aggregate_local_frontier_per_type_local_degrees = + compute_aggregate_local_frontier_per_type_local_degrees( + handle, + raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), + aggregate_local_frontier_key_idx_to_unique_key_idx.size()), + local_frontier_offsets, + raft::device_span(aggregate_local_frontier_unique_key_edge_types.data(), + aggregate_local_frontier_unique_key_edge_types.size()), + raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + local_frontier_unique_key_offsets, + num_edge_types); + rmm::device_uvector frontier_per_type_degrees(0, handle.get_stream()); + std::optional> + frontier_partitioned_per_type_local_degree_displacements(0, handle.get_stream()); if (minor_comm_size > 1) { - std::tie(frontier_degrees, frontier_partitioned_local_degree_displacements) = + std::tie(frontier_per_type_degrees, + frontier_partitioned_per_type_local_degree_displacements) = compute_frontier_value_sums_and_partitioned_local_value_sum_displacements( handle, - raft::device_span(aggregate_local_frontier_local_degrees.data(), - aggregate_local_frontier_local_degrees.size()), + raft::device_span(aggregate_local_frontier_per_type_local_degrees.data(), + aggregate_local_frontier_per_type_local_degrees.size()), local_frontier_offsets, - 1); - aggregate_local_frontier_local_degrees.resize(0, handle.get_stream()); - 
aggregate_local_frontier_local_degrees.shrink_to_fit(handle.get_stream()); + num_edge_types); } else { - frontier_degrees = std::move(aggregate_local_frontier_local_degrees); + frontier_per_type_degrees = std::move(aggregate_local_frontier_per_type_local_degrees); } - } - // 2. sample neighbor indices + rmm::device_uvector per_type_nbr_indices(0, handle.get_stream()); - rmm::device_uvector nbr_indices(0, handle.get_stream()); + if (with_replacement) { + per_type_nbr_indices.resize((frontier_per_type_degrees.size() / num_edge_types) * K_sum, + handle.get_stream()); + sample_nbr_index_with_replacement( + handle, + raft::device_span(frontier_per_type_degrees.data(), + frontier_per_type_degrees.size()), + std::nullopt, + raft::device_span(per_type_nbr_indices.data(), per_type_nbr_indices.size()), + rng_state, + raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + K_sum); + } else { + per_type_nbr_indices = + compute_heterogeneous_uniform_sampling_index_without_replacement( + handle, + raft::device_span(frontier_per_type_degrees.data(), + frontier_per_type_degrees.size()), + rng_state, + raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + K_sum); + } - if (with_replacement) { - if (frontier_degrees.size() > 0) { - nbr_indices.resize(frontier_degrees.size() * K, handle.get_stream()); - cugraph::legacy::ops::graph::get_sampling_index(nbr_indices.data(), - rng_state, - frontier_degrees.data(), - static_cast(frontier_degrees.size()), - static_cast(K), - with_replacement, - handle.get_stream()); - frontier_degrees.resize(0, handle.get_stream()); - frontier_degrees.shrink_to_fit(handle.get_stream()); + rmm::device_uvector per_type_local_nbr_indices(0, handle.get_stream()); + std::optional> edge_types{std::nullopt}; + if (minor_comm_size > 1) { + std::tie(per_type_local_nbr_indices, edge_types, key_indices, local_frontier_sample_offsets) = + shuffle_and_compute_per_type_local_nbr_values( + handle, + std::move(per_type_nbr_indices), + raft::device_span( + 
(*frontier_partitioned_per_type_local_degree_displacements).data(), + (*frontier_partitioned_per_type_local_degree_displacements).size()), + raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + K_sum, + cugraph::invalid_edge_id_v); + } else { + per_type_local_nbr_indices = std::move(per_type_nbr_indices); + local_frontier_sample_offsets = {size_t{0}, per_type_local_nbr_indices.size()}; } - } else { - nbr_indices = compute_uniform_sampling_index_without_replacement( - handle, std::move(frontier_degrees), rng_state, K); - } - // 3. shuffle neighbor indices + rmm::device_uvector aggregate_local_frontier_unique_key_per_type_local_degree_offsets( + local_frontier_unique_key_offsets.back() * num_edge_types + 1, handle.get_stream()); + { + rmm::device_uvector aggregate_local_frontier_unique_key_indices( + local_frontier_unique_key_offsets.back(), handle.get_stream()); + for (size_t i = 0; i < local_frontier_unique_key_offsets.size() - 1; ++i) { + thrust::sequence(handle.get_thrust_policy(), + aggregate_local_frontier_unique_key_indices.begin() + + local_frontier_unique_key_offsets[i], + aggregate_local_frontier_unique_key_indices.begin() + + local_frontier_unique_key_offsets[i + 1], + size_t{0}); + } - rmm::device_uvector local_nbr_indices(0, handle.get_stream()); - std::optional> key_indices{std::nullopt}; - std::vector local_frontier_sample_offsets{}; - if (minor_comm_size > 1) { - std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = - shuffle_and_compute_local_nbr_values( + auto aggregate_local_frontier_unique_key_per_type_local_degrees = + compute_aggregate_local_frontier_per_type_local_degrees( + handle, + raft::device_span(aggregate_local_frontier_unique_key_indices.data(), + aggregate_local_frontier_unique_key_indices.size()), + local_frontier_unique_key_offsets, + raft::device_span( + aggregate_local_frontier_unique_key_edge_types.data(), + aggregate_local_frontier_unique_key_edge_types.size()), + raft::device_span( + 
aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + local_frontier_unique_key_offsets, + num_edge_types); + + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.set_element_to_zero_async( + 0, handle.get_stream()); + thrust::inclusive_scan( + handle.get_thrust_policy(), + aggregate_local_frontier_unique_key_per_type_local_degrees.begin(), + aggregate_local_frontier_unique_key_per_type_local_degrees.end(), + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.begin() + 1); + } + + assert(edge_types.has_value() == key_indices.has_value()); + local_nbr_indices = + compute_local_nbr_indices_from_per_type_local_nbr_indices( handle, - std::move(nbr_indices), - raft::device_span((*frontier_partitioned_local_degree_displacements).data(), - (*frontier_partitioned_local_degree_displacements).size()), - K, - cugraph::invalid_edge_id_v); - } else { - local_nbr_indices = std::move(nbr_indices); - local_frontier_sample_offsets = {size_t{0}, local_nbr_indices.size()}; + raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), + aggregate_local_frontier_key_idx_to_unique_key_idx.size()), + local_frontier_offsets, + raft::device_span( + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_per_type_local_degree_offsets.size()), + local_frontier_unique_key_offsets, + edge_types + ? std::make_optional(std::make_tuple( + raft::device_span(edge_types.data(), edge_types.size()), + raft::device_span((*key_indices).data(), (*key_indices).size()))) + : std::nullopt, + std::move(per_type_local_nbr_indices), + local_frontier_sample_offsets, + raft::device_span(d_K_offsets.data(), d_K_offsets.size()), + K_sum); } - // 4. convert neighbor indices in the neighbor list considering edge mask to neighbor indices in + // 4. 
Re-map local neighbor indices + + local_nbr_indices = remap_local_nbr_indices( + handle, + raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), + aggregate_local_frontier_key_idx_to_unique_key_idx.size()), + local_frontier_offsets, + raft::device_span(aggregate_local_frontier_unique_key_org_indices.data(), + aggregate_local_frontier_unique_key_org_indices.size()), + raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + std::move(local_nbr_indices), + raft::host_span(local_frontier_sample_offsets.data(), + local_frontier_sample_offsets.size()), + K_sum); + + // 5. convert neighbor indices in the neighbor list considering edge mask to neighbor indices in // the neighbor list ignoring edge mask if (edge_mask_view) { local_nbr_indices = convert_to_unmasked_local_nbr_idx( handle, graph_view, - aggregate_local_frontier_major_first, + thrust_tuple_get_or_identity(aggregate_local_frontier_key_first), std::move(local_nbr_indices), key_indices ? 
std::make_optional>((*key_indices).data(), (*key_indices).size()) @@ -3984,7 +5426,7 @@ homogeneous_uniform_sample_and_compute_local_nbr_indices( raft::host_span(local_frontier_sample_offsets.data(), local_frontier_sample_offsets.size()), local_frontier_offsets, - K); + K_sum); } return std::make_tuple( @@ -4018,7 +5460,7 @@ homogeneous_biased_sample_and_compute_local_nbr_indices( using edge_t = typename GraphViewType::edge_type; using key_t = typename thrust::iterator_traits::value_type; - using bias_t = typename edge_op_result_type( handle, - local_frontier_offsets, raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), aggregate_local_frontier_key_idx_to_unique_key_idx.size()), - raft::host_span(local_frontier_unique_key_offsets.data(), - local_frontier_unique_key_offsets.size()), + local_frontier_offsets, raft::device_span(aggregate_local_frontier_unique_key_biases.data(), aggregate_local_frontier_unique_key_biases.size()), raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data(), aggregate_local_frontier_unique_key_local_degree_offsets.size()), + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), rng_state, raft::host_span(&K, size_t{1})); } else { std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = homogeneous_biased_sample_without_replacement( handle, - local_frontier_offsets, raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), aggregate_local_frontier_key_idx_to_unique_key_idx.size()), - raft::host_span(local_frontier_unique_key_offsets.data(), - local_frontier_unique_key_offsets.size()), + local_frontier_offsets, raft::device_span(aggregate_local_frontier_unique_key_biases.data(), aggregate_local_frontier_unique_key_biases.size()), raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data(), aggregate_local_frontier_unique_key_local_degree_offsets.size()), + 
raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), rng_state, K); } @@ -4130,11 +5572,12 @@ homogeneous_biased_sample_and_compute_local_nbr_indices( auto pair_first = thrust::make_zip_iterator(local_nbr_indices.begin(), thrust::make_counting_iterator(size_t{0})); for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - thrust::transform( + thrust::transform_if( handle.get_thrust_policy(), pair_first + local_frontier_sample_offsets[i], pair_first + local_frontier_sample_offsets[i + 1], local_nbr_indices.begin() + local_frontier_sample_offsets[i], + local_nbr_indices.begin() + local_frontier_sample_offsets[i], cuda::proclaim_return_type( [key_idx_to_unique_key_idx = raft::device_span( aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], @@ -4152,7 +5595,8 @@ homogeneous_biased_sample_and_compute_local_nbr_indices( auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; return unique_key_nz_bias_indices[unique_key_local_degree_offsets[unique_key_idx] + nz_bias_idx]; - })); + }), + is_not_equal_t{cugraph::invalid_edge_id_v}); } } @@ -4236,7 +5680,7 @@ hetrogeneous_biased_sample_and_compute_local_nbr_indices( compute_unique_keys(handle, aggregate_local_frontier_key_first, local_frontier_offsets); auto [aggregate_local_frontier_unique_key_biases, - aggregate_local_frontier_unique_key_types, + aggregate_local_frontier_unique_key_edge_types, aggregate_local_frontier_unique_key_nz_bias_indices, aggregate_local_frontier_unique_key_local_degree_offsets] = compute_aggregate_local_frontier_bias_type_pairs( @@ -4254,88 +5698,90 @@ hetrogeneous_biased_sample_and_compute_local_nbr_indices( // 2. 
Segmented-sort (index, bias, type) triplets based on types (1 segment per key) - // to limit memory footprint ((1 << 20) is a tuning parameter) - auto approx_nbrs_to_sort_per_iteration = - static_cast(handle.get_device_properties().multiProcessorCount * (1 << 20)); + { + // to limit memory footprint ((1 << 20) is a tuning parameter) + auto approx_nbrs_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount * (1 << 20)); - auto [h_key_offsets, h_nbr_offsets] = detail::compute_offset_aligned_element_chunks( - handle, - raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data(), - aggregate_local_frontier_unique_key_local_degree_offsets.size()), - aggregate_local_frontier_unique_key_biases.size(), - approx_nbrs_to_sort_per_iteration); + auto [h_key_offsets, h_nbr_offsets] = detail::compute_offset_aligned_element_chunks( + handle, + raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + aggregate_local_frontier_unique_key_biases.size(), + approx_nbrs_to_sort_per_iteration); - rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); - auto num_chunks = h_key_offsets.size() - 1; - for (size_t i = 0; i < num_chunks; ++i) { - size_t tmp_storage_bytes{0}; + auto num_chunks = h_key_offsets.size() - 1; + for (size_t i = 0; i < num_chunks; ++i) { + size_t tmp_storage_bytes{0}; - rmm::device_uvector segment_sorted_types(h_nbr_offsets[i + 1] - h_nbr_offsets[i], - handle.get_stream()); - rmm::device_uvector sequences(h_nbr_offsets[i + 1] - h_nbr_offsets[i], - handle.get_stream()); - thrust::sequence(handle.get_thrust_policy(), sequences.begin(), sequences.end(), size_t{0}); - rmm::device_uvector segment_sorted_sequences(h_nbr_offsets[i + 1] - h_nbr_offsets[i], - handle.get_stream()); - - auto offset_first = thrust::make_transform_iterator( - 
aggregate_local_frontier_unique_key_local_degree_offsets.data() + h_key_offsets[i], - detail::shift_left_t{h_nbr_offsets[i]}); - cub::DeviceSegmentedSort::SortPairs( - static_cast(nullptr), - tmp_storage_bytes, - aggregate_local_frontier_unique_key_types.begin() + h_nbr_offsets[i], - segment_sorted_types.begin(), - sequences.begin(), - segment_sorted_sequences.begin(), - h_nbr_offsets[i + 1] - h_nbr_offsets[i], - h_key_offsets[i + 1] - h_key_offsets[i], - offset_first, - offset_first + 1, - handle.get_stream()); - if (tmp_storage_bytes > d_tmp_storage.size()) { - d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); - } - cub::DeviceSegmentedSort::SortPairs( - d_tmp_storage.data(), - tmp_storage_bytes, - aggregate_local_frontier_unique_key_types.begin() + h_nbr_offsets[i], - segment_sorted_types.begin(), - sequences.begin(), - segment_sorted_sequences.begin(), - h_nbr_offsets[i + 1] - h_nbr_offsets[i], - h_key_offsets[i + 1] - h_key_offsets[i], - offset_first, - offset_first + 1, - handle.get_stream()); + rmm::device_uvector segment_sorted_types(h_nbr_offsets[i + 1] - h_nbr_offsets[i], + handle.get_stream()); + rmm::device_uvector sequences(h_nbr_offsets[i + 1] - h_nbr_offsets[i], + handle.get_stream()); + thrust::sequence(handle.get_thrust_policy(), sequences.begin(), sequences.end(), size_t{0}); + rmm::device_uvector segment_sorted_sequences(h_nbr_offsets[i + 1] - h_nbr_offsets[i], + handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - segment_sorted_types.begin(), - segment_sorted_types.end(), - aggregate_local_frontier_unique_key_types.begin() + h_nbr_offsets[i]); + auto offset_first = thrust::make_transform_iterator( + aggregate_local_frontier_unique_key_local_degree_offsets.data() + h_key_offsets[i], + detail::shift_left_t{h_nbr_offsets[i]}); + cub::DeviceSegmentedSort::SortPairs( + static_cast(nullptr), + tmp_storage_bytes, + aggregate_local_frontier_unique_key_edge_types.begin() + h_nbr_offsets[i], + 
segment_sorted_types.begin(), + sequences.begin(), + segment_sorted_sequences.begin(), + h_nbr_offsets[i + 1] - h_nbr_offsets[i], + h_key_offsets[i + 1] - h_key_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + cub::DeviceSegmentedSort::SortPairs( + d_tmp_storage.data(), + tmp_storage_bytes, + aggregate_local_frontier_unique_key_edge_types.begin() + h_nbr_offsets[i], + segment_sorted_types.begin(), + sequences.begin(), + segment_sorted_sequences.begin(), + h_nbr_offsets[i + 1] - h_nbr_offsets[i], + h_key_offsets[i + 1] - h_key_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); - rmm::device_uvector segment_sorted_biases(h_nbr_offsets[i + 1] - h_nbr_offsets[i], - handle.get_stream()); - rmm::device_uvector segment_sorted_nz_bias_indices( - h_nbr_offsets[i + 1] - h_nbr_offsets[i], handle.get_stream()); - thrust::gather( - handle.get_thrust_policy(), - segment_sorted_sequences.begin(), - segment_sorted_sequences.end(), - thrust::make_zip_iterator(aggregate_local_frontier_unique_key_biases.begin(), - aggregate_local_frontier_unique_key_nz_bias_indices.begin()), - thrust::make_zip_iterator(segment_sorted_biases.begin(), - segment_sorted_nz_bias_indices.begin())); - auto segment_sorted_pair_first = thrust::make_zip_iterator( - segment_sorted_biases.begin(), segment_sorted_nz_bias_indices.begin()); - thrust::copy( - handle.get_thrust_policy(), - segment_sorted_pair_first, - segment_sorted_pair_first + segment_sorted_biases.size(), - thrust::make_zip_iterator(aggregate_local_frontier_unique_key_biases.begin(), - aggregate_local_frontier_unique_key_nz_bias_indices.begin())); + thrust::copy(handle.get_thrust_policy(), + segment_sorted_types.begin(), + segment_sorted_types.end(), + aggregate_local_frontier_unique_key_edge_types.begin() + h_nbr_offsets[i]); + + rmm::device_uvector 
segment_sorted_biases(h_nbr_offsets[i + 1] - h_nbr_offsets[i], + handle.get_stream()); + rmm::device_uvector segment_sorted_nz_bias_indices( + h_nbr_offsets[i + 1] - h_nbr_offsets[i], handle.get_stream()); + thrust::gather( + handle.get_thrust_policy(), + segment_sorted_sequences.begin(), + segment_sorted_sequences.end(), + thrust::make_zip_iterator(aggregate_local_frontier_unique_key_biases.begin(), + aggregate_local_frontier_unique_key_nz_bias_indices.begin()), + thrust::make_zip_iterator(segment_sorted_biases.begin(), + segment_sorted_nz_bias_indices.begin())); + auto segment_sorted_pair_first = thrust::make_zip_iterator( + segment_sorted_biases.begin(), segment_sorted_nz_bias_indices.begin()); + thrust::copy( + handle.get_thrust_policy(), + segment_sorted_pair_first, + segment_sorted_pair_first + segment_sorted_biases.size(), + thrust::make_zip_iterator(aggregate_local_frontier_unique_key_biases.begin(), + aggregate_local_frontier_unique_key_nz_bias_indices.begin())); + } } // 3. sample neighbor indices and shuffle neighbor indices @@ -4344,56 +5790,58 @@ hetrogeneous_biased_sample_and_compute_local_nbr_indices( std::optional> key_indices{std::nullopt}; std::vector local_frontier_sample_offsets{}; { + rmm::device_uvector aggregate_local_frontier_unique_key_indices( + local_frontier_unique_key_offsets.back(), handle.get_stream()); + for (size_t i = 0; i < local_frontier_unique_key_offsets.size() - 1; ++i) { + thrust::sequence( + handle.get_thrust_policy(), + aggregate_local_frontier_unique_key_indices.begin() + local_frontier_unique_key_offsets[i], + aggregate_local_frontier_unique_key_indices.begin() + + local_frontier_unique_key_offsets[i + 1], + size_t{0}); + } + + auto aggregate_local_frontier_unique_key_per_type_local_degrees = + compute_aggregate_local_frontier_per_type_local_degrees( + handle, + raft::device_span(aggregate_local_frontier_unique_key_indices.data(), + aggregate_local_frontier_unique_key_indices.size()), + local_frontier_unique_key_offsets, 
+ raft::device_span(aggregate_local_frontier_unique_key_edge_types.data(), + aggregate_local_frontier_unique_key_edge_types.size()), + raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + local_frontier_unique_key_offsets, + num_edge_types); + rmm::device_uvector aggregate_local_frontier_unique_key_per_type_local_degree_offsets( - local_frontier_unique_key_offsets.back() * num_edge_types, handle.get_stream()); + aggregate_local_frontier_unique_key_per_type_local_degrees.size() + 1, handle.get_stream()); aggregate_local_frontier_unique_key_per_type_local_degree_offsets.set_element_to_zero_async( 0, handle.get_stream()); - thrust::tabulate(handle.get_thrust_policy(), - aggregate_local_frontier_unique_key_per_type_local_degree_offsets.begin() + 1, - aggregate_local_frontier_unique_key_per_type_local_degree_offsets.end(), - [unique_key_local_degree_offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data(), - aggregate_local_frontier_unique_key_local_degree_offsets.size()), - unique_key_types = raft::device_span( - aggregate_local_frontier_unique_key_types.data(), - aggregate_local_frontier_unique_key_types.size()), - num_edge_types] __device__(size_t i) { - auto key_idx = i / num_edge_types; - auto edge_type = static_cast(i % num_edge_types); - auto start_offset = unique_key_local_degree_offsets[key_idx]; - auto end_offset = unique_key_local_degree_offsets[key_idx + 1]; - auto edge_type_first = unique_key_types.begin() + start_offset; - auto edge_type_last = unique_key_types.begin() + end_offset; - return static_cast(thrust::distance( - thrust::lower_bound(edge_type_first, edge_type_last, edge_type), - thrust::upper_bound(edge_type_first, edge_type_last, edge_type))); - }); thrust::inclusive_scan( handle.get_thrust_policy(), - aggregate_local_frontier_unique_key_per_type_local_degree_offsets.begin() + 1, - 
aggregate_local_frontier_unique_key_per_type_local_degree_offsets.end(), + aggregate_local_frontier_unique_key_per_type_local_degrees.begin(), + aggregate_local_frontier_unique_key_per_type_local_degrees.end(), aggregate_local_frontier_unique_key_per_type_local_degree_offsets.begin() + 1); - aggregate_local_frontier_unique_key_types.resize(0, handle.get_stream()); - aggregate_local_frontier_unique_key_types.shrink_to_fit(handle.get_stream()); + aggregate_local_frontier_unique_key_edge_types.resize(0, handle.get_stream()); + aggregate_local_frontier_unique_key_edge_types.shrink_to_fit(handle.get_stream()); if (with_replacement) { std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = biased_sample_with_replacement( handle, - local_frontier_offsets, raft::device_span( aggregate_local_frontier_key_idx_to_unique_key_idx.data(), aggregate_local_frontier_key_idx_to_unique_key_idx.size()), - raft::host_span(local_frontier_unique_key_offsets.data(), - local_frontier_unique_key_offsets.size()), + local_frontier_offsets, raft::device_span(aggregate_local_frontier_unique_key_biases.data(), aggregate_local_frontier_unique_key_biases.size()), raft::device_span( aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data(), aggregate_local_frontier_unique_key_per_type_local_degree_offsets.size()), - raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data(), - aggregate_local_frontier_unique_key_local_degree_offsets.size()), + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), rng_state, Ks); } else { @@ -4402,17 +5850,17 @@ hetrogeneous_biased_sample_and_compute_local_nbr_indices( bias_t, GraphViewType::is_multi_gpu>( handle, - local_frontier_offsets, raft::device_span( aggregate_local_frontier_key_idx_to_unique_key_idx.data(), aggregate_local_frontier_key_idx_to_unique_key_idx.size()), - raft::host_span(local_frontier_unique_key_offsets.data(), - 
local_frontier_unique_key_offsets.size()), + local_frontier_offsets, raft::device_span(aggregate_local_frontier_unique_key_biases.data(), aggregate_local_frontier_unique_key_biases.size()), raft::device_span( aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data(), aggregate_local_frontier_unique_key_per_type_local_degree_offsets.size()), + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), rng_state, Ks); } @@ -4422,61 +5870,19 @@ hetrogeneous_biased_sample_and_compute_local_nbr_indices( auto K_sum = std::accumulate(Ks.begin(), Ks.end(), size_t{0}); - if (key_indices) { - auto pair_first = thrust::make_zip_iterator(local_nbr_indices.begin(), (*key_indices).begin()); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - thrust::transform( - handle.get_thrust_policy(), - pair_first + local_frontier_sample_offsets[i], - pair_first + local_frontier_sample_offsets[i + 1], - local_nbr_indices.begin() + local_frontier_sample_offsets[i], - cuda::proclaim_return_type( - [key_idx_to_unique_key_idx = raft::device_span( - aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], - local_frontier_offsets[i + 1] - local_frontier_offsets[i]), - unique_key_local_degree_offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_offsets[i], - (local_frontier_unique_key_offsets[i + 1], local_frontier_unique_key_offsets[i]) + 1), - unique_key_nz_bias_indices = raft::device_span( - aggregate_local_frontier_unique_key_nz_bias_indices.data(), - aggregate_local_frontier_unique_key_nz_bias_indices.size())] __device__(auto pair) { - auto nz_bias_idx = thrust::get<0>(pair); - auto key_idx = thrust::get<1>(pair); - auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; - return unique_key_nz_bias_indices[unique_key_local_degree_offsets[unique_key_idx] + - nz_bias_idx]; - })); - } - } else { - auto 
pair_first = thrust::make_zip_iterator(local_nbr_indices.begin(), - thrust::make_counting_iterator(size_t{0})); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - thrust::transform( - handle.get_thrust_policy(), - pair_first + local_frontier_sample_offsets[i], - pair_first + local_frontier_sample_offsets[i + 1], - local_nbr_indices.begin() + local_frontier_sample_offsets[i], - cuda::proclaim_return_type( - [key_idx_to_unique_key_idx = raft::device_span( - aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], - local_frontier_offsets[i + 1] - local_frontier_offsets[i]), - unique_key_local_degree_offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets.data() + - local_frontier_unique_key_offsets[i], - (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) + 1), - unique_key_nz_bias_indices = raft::device_span( - aggregate_local_frontier_unique_key_nz_bias_indices.data(), - aggregate_local_frontier_unique_key_nz_bias_indices.size()), - K_sum] __device__(auto pair) { - auto nz_bias_idx = thrust::get<0>(pair); - auto key_idx = thrust::get<1>(pair) / K_sum; - auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; - return unique_key_nz_bias_indices[unique_key_local_degree_offsets[unique_key_idx] + - nz_bias_idx]; - })); - } - } + local_nbr_indices = remap_local_nbr_indices( + handle, + raft::device_span(aggregate_local_frontier_key_idx_to_unique_key_idx.data(), + aggregate_local_frontier_key_idx_to_unique_key_idx.size()), + local_frontier_offsets, + raft::device_span(aggregate_local_frontier_unique_key_nz_bias_indices.data(), + aggregate_local_frontier_unique_key_nz_bias_indices.size()), + raft::device_span( + aggregate_local_frontier_unique_key_local_degree_offsets.data(), + aggregate_local_frontier_unique_key_local_degree_offsets.size()), + std::move(local_nbr_indices), + local_frontier_sample_offsets, + K_sum); // 5. 
convert neighbor indices in the neighbor list considering edge mask to neighbor indices in // the neighbor list ignoring edge mask From 99b2a65a0744c6ed021dea63b6dc9e781b55de6a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 30 Jan 2025 00:39:59 -0800 Subject: [PATCH 13/21] remove unnecessary include --- cpp/src/structure/select_random_vertices_impl.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/structure/select_random_vertices_impl.hpp b/cpp/src/structure/select_random_vertices_impl.hpp index d7502b3f6da..65b84de8952 100644 --- a/cpp/src/structure/select_random_vertices_impl.hpp +++ b/cpp/src/structure/select_random_vertices_impl.hpp @@ -16,7 +16,6 @@ #pragma once #include "detail/graph_partition_utils.cuh" -#include "from_cugraph_ops/sampling.hpp" #include #include From 24c2f97d0e16d25e89f5d375e332f03a439fb24e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 30 Jan 2025 00:51:29 -0800 Subject: [PATCH 14/21] remove from_cugraph_ops --- cpp/CMakeLists.txt | 1 - cpp/src/from_cugraph_ops/algo_R.cuh | 239 ---------------- cpp/src/from_cugraph_ops/device.cuh | 16 -- cpp/src/from_cugraph_ops/device_atomics.cuh | 73 ----- cpp/src/from_cugraph_ops/device_core.hpp | 49 ---- cpp/src/from_cugraph_ops/device_dim.cuh | 132 --------- .../from_cugraph_ops/device_smem_helper.cuh | 270 ------------------ .../device_warp_collectives.cuh | 98 ------- cpp/src/from_cugraph_ops/macros.hpp | 50 ---- cpp/src/from_cugraph_ops/sampling.hpp | 59 ---- cpp/src/from_cugraph_ops/sampling_index.cu | 36 --- cpp/src/from_cugraph_ops/sampling_index.cuh | 174 ----------- .../sample_and_compute_local_nbr_indices.cuh | 23 -- ...r_v_random_select_transform_outgoing_e.cuh | 1 - 14 files changed, 1221 deletions(-) delete mode 100644 cpp/src/from_cugraph_ops/algo_R.cuh delete mode 100644 cpp/src/from_cugraph_ops/device.cuh delete mode 100644 cpp/src/from_cugraph_ops/device_atomics.cuh delete mode 100644 cpp/src/from_cugraph_ops/device_core.hpp delete mode 100644 
cpp/src/from_cugraph_ops/device_dim.cuh delete mode 100644 cpp/src/from_cugraph_ops/device_smem_helper.cuh delete mode 100644 cpp/src/from_cugraph_ops/device_warp_collectives.cuh delete mode 100644 cpp/src/from_cugraph_ops/macros.hpp delete mode 100644 cpp/src/from_cugraph_ops/sampling.hpp delete mode 100644 cpp/src/from_cugraph_ops/sampling_index.cu delete mode 100644 cpp/src/from_cugraph_ops/sampling_index.cuh diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ad30b3769d7..57004959cfa 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -379,7 +379,6 @@ set(CUGRAPH_SOURCES src/centrality/betweenness_centrality_mg_v64_e64.cu src/centrality/betweenness_centrality_mg_v32_e32.cu src/tree/legacy/mst.cu - src/from_cugraph_ops/sampling_index.cu src/components/weakly_connected_components_sg_v64_e64.cu src/components/weakly_connected_components_sg_v32_e32.cu src/components/weakly_connected_components_mg_v64_e64.cu diff --git a/cpp/src/from_cugraph_ops/algo_R.cuh b/cpp/src/from_cugraph_ops/algo_R.cuh deleted file mode 100644 index 031a7d2ceb9..00000000000 --- a/cpp/src/from_cugraph_ops/algo_R.cuh +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. 
- */ - -#pragma once - -#include "device.cuh" - -#include - -#include -#include -#include - -#include - -namespace cugraph::ops::graph { - -// single warp-separated field of type IdxT -template -using smem_algo_r_t = utils::smem_unit_simple_t<1, IdxT>; - -template -__device__ __forceinline__ void warp_algo_r_index(IdxT* smem, - IdxT pop_size, - IdxT idx_offset, - int sample_size, - raft::random::DeviceState& rng_state) -{ - auto lane = utils::lane_id(); - // first 'sample_size' are just copied - CUGRAPH_OPS_UNROLL - for (int i = lane; i < sample_size; i += utils::WARP_SIZE) { - smem[i] = idx_offset + i; - } - auto sample_size_idxt = IdxT{sample_size}; - if (sample_size_idxt >= pop_size) return; - - // we must synchronize here since we have just written to smem - utils::warp_sync(); - // TODO(mjoux): when we support more warps per node enable this - //__syncthreads(); - - auto idx_end = idx_offset + pop_size; - auto n = idx_offset + sample_size_idxt; - auto flat_id = uint64_t{threadIdx.x + blockIdx.x * blockDim.x}; - GenT gen(rng_state, flat_id); - CUGRAPH_OPS_UNROLL - for (auto nidx = n + IdxT{lane}; nidx < idx_end; nidx += IdxT{utils::WARP_SIZE}) { - // nidx - idx_offset inclusive (necessary for correctness of algo R) - auto end = nidx - idx_offset + 1; - raft::random::UniformIntDistParams int_params{}; - int_params.start = IdxT{0}; - int_params.end = IdxT{end}; - int_params.diff = static_cast(end); - IdxT idx; - raft::random::custom_next(gen, &idx, int_params, 0, 0 /* idx / stride unused */); - if (idx < sample_size_idxt) { - // using atomic max instead of exch here because it leads to the same - // output as the sequential algorithm (DGL does this, too) - // Additionally, we use the index instead of the neighbor ID here - // since this allows copying over other node/edge-related data - // (useful for heterogeneous graphs for example) - utils::atomic_max(smem + idx, nidx); - } - } - // must synchronize to make smem valid - utils::warp_sync(); - // TODO(mjoux): 
when we support more warps per node enable this - //__syncthreads(); -} - -template -__device__ __forceinline__ void warp_algo_r(IdxT* smem, - IdxT row_id, - const IdxT* nodes, - const IdxT* fg_offsets, - int sample_size, - IdxT& node_id, - IdxT& node_start, - IdxT& node_end, - raft::random::DeviceState& rng_state) -{ - auto lane = utils::lane_id(); - if (nodes == nullptr) { - node_id = row_id; - if (lane == 0) - node_start = fg_offsets[node_id]; - else if (lane == 1) - node_end = fg_offsets[node_id + 1]; - node_start = utils::shfl(node_start, 0); - node_end = utils::shfl(node_end, 1); - } else { - if (lane == 0) { - node_id = nodes[row_id]; - node_start = fg_offsets[node_id]; - node_end = fg_offsets[node_id + 1]; - } - node_id = utils::shfl(node_id, 0); - node_start = utils::shfl(node_start, 0); - node_end = utils::shfl(node_end, 0); - } - auto pop_size = node_end - node_start; - warp_algo_r_index(smem, pop_size, node_start, sample_size, rng_state); -} - -// TODO(mjoux): support configuring n_warps_per_node in template -template -CUGRAPH_OPS_KERNEL void algo_r_kernel(raft::random::DeviceState rng_state, - IdxT* neighbors, - IdxT* counts, - // edge_types / node_types should be non-const - // probably detected if `!IS_HG` - // NOLINTNEXTLINE(readability-non-const-parameter) - int32_t* edge_types, - // NOLINTNEXTLINE(readability-non-const-parameter) - int32_t* node_types, - const IdxT* offsets, - const IdxT* indices, - const int32_t* g_edge_types, - const int32_t* g_node_types, - const IdxT* nodes, - IdxT n_dst_nodes, - int sample_size) -{ - auto lane = utils::lane_id(); - auto warp = utils::warp_id(); // 1D block with X dim - auto row_id = warp + static_cast(blockIdx.x) * IdxT{N_WARPS}; - if (row_id >= n_dst_nodes) { return; } - IdxT* s_idx; - smem_algo_r_t smem{}; - int32_t smem_sizes[] = {sample_size}; - smem.set_ptrs(warp, N_WARPS, smem_sizes, s_idx); - IdxT node_id, node_start, node_end; - warp_algo_r( - s_idx, row_id, nodes, offsets, sample_size, node_id, 
node_start, node_end, rng_state); - - IdxT count = 0; - for (int i = lane; i < sample_size; i += utils::WARP_SIZE) { - auto nidx = s_idx[i]; - // checking for node_end here because sample_size may be larger than - // the total number of neighbors of the node - auto val = nidx < node_end ? indices[nidx] : cugraph::invalid_idx::value; - // TODO(mjoux) it's possible that we break the ELLPACK format here since - // if we set val to invalid, we should add it to end of list, rather - // than simply at index "i". This is ignored for now since the case - // where SAMPLE_SELF := false is rare and unconventional - if (!SAMPLE_SELF && val == node_id) val = cugraph::invalid_idx::value; - auto local_id = row_id * IdxT{sample_size} + i; - neighbors[local_id] = val; - if (val != cugraph::invalid_idx::value) { - ++count; - if (IS_HG) edge_types[local_id] = g_edge_types[nidx]; - } - } - if (IS_HG && lane == 0) node_types[row_id] = g_node_types[node_id]; - if (counts != nullptr) { - count = utils::warp_reduce(count); - if (lane == 0) { counts[row_id] = count; } - } -} - -template -void algo_r_impl(IdxT* neighbors, - IdxT* counts, - int32_t* edge_types, - int32_t* node_types, - raft::random::RngState& rng, - const IdxT* offsets, - const IdxT* indices, - const int32_t* g_edge_types, - const int32_t* g_node_types, - const IdxT* nodes, - IdxT n_dst_nodes, - IdxT g_n_dst_nodes, - IdxT sample_size, - IdxT max_val, - cudaStream_t stream) -{ - if (nodes == nullptr) { n_dst_nodes = g_n_dst_nodes; } - ASSERT(n_dst_nodes <= g_n_dst_nodes, - "Algo R: expected n_dst_nodes <= graph.n_dst_nodes (%ld > %ld)", - long(n_dst_nodes), - long(g_n_dst_nodes)); - ASSERT( - static_cast(sample_size) + 2 < static_cast(std::numeric_limits::max()), - "Expected sample size [+2] to be lower than INT_MAX"); - static constexpr int TPB = 512; - static constexpr int N_WARPS = TPB / utils::WARP_SIZE; - auto n_blks = utils::ceil_div(n_dst_nodes, N_WARPS); - int sample_size_i = static_cast(sample_size); - int32_t 
smem_sizes[] = {sample_size_i}; - size_t smem_size = smem_algo_r_t::get_size(N_WARPS, smem_sizes); - if (static_cast(max_val) < std::numeric_limits::max()) { - // we'll use the 32-bit based method for generating random integers - // as we most likely do not need less bias - RAFT_CALL_RNG_FUNC( - rng, - (algo_r_kernel<<>>), - neighbors, - counts, - edge_types, - node_types, - offsets, - indices, - g_edge_types, - g_node_types, - nodes, - n_dst_nodes, - sample_size_i); - } else { - RAFT_CALL_RNG_FUNC( - rng, - (algo_r_kernel<<>>), - neighbors, - counts, - edge_types, - node_types, - offsets, - indices, - g_edge_types, - g_node_types, - nodes, - n_dst_nodes, - sample_size_i); - } - // update the rng state (this is a pessimistic update as it is difficult to - // compute the number of RNG calls done per thread!) - auto thread_rs = utils::ceil_div( - std::max(IdxT{0}, std::min(max_val, g_n_dst_nodes) - sample_size), utils::WARP_SIZE); - rng.advance(static_cast(n_blks * TPB), thread_rs); - RAFT_CUDA_TRY(cudaGetLastError()); -} - -} // namespace cugraph::ops::graph diff --git a/cpp/src/from_cugraph_ops/device.cuh b/cpp/src/from_cugraph_ops/device.cuh deleted file mode 100644 index f7d37c62f35..00000000000 --- a/cpp/src/from_cugraph_ops/device.cuh +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. 
- */ - -#pragma once - -#include "device_atomics.cuh" -#include "device_core.hpp" -#include "device_dim.cuh" -#include "device_smem_helper.cuh" -#include "device_warp_collectives.cuh" -#include "macros.hpp" diff --git a/cpp/src/from_cugraph_ops/device_atomics.cuh b/cpp/src/from_cugraph_ops/device_atomics.cuh deleted file mode 100644 index b8be7614284..00000000000 --- a/cpp/src/from_cugraph_ops/device_atomics.cuh +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - */ - -#pragma once - -#include -#include - -#include - -namespace cugraph::ops::utils { - -/** - * @defgroup AtomicMax Device atomic max operation - * - * @{ - */ -template -__device__ inline DataT atomic_max(DataT* address, DataT val) -{ - return atomicMax(address, val); -} -template <> -__device__ inline float atomic_max(float* address, float val) -{ - using u32_t = unsigned int; - auto* address_as_u32 = reinterpret_cast(address); - u32_t old = *address_as_u32, assumed; - do { - assumed = old; - old = atomicCAS(address_as_u32, assumed, __float_as_uint(max(val, __uint_as_float(assumed)))); - } while (assumed != old); - return __uint_as_float(old); -} -template <> -__device__ inline double atomic_max(double* address, double val) -{ - using u64_t = unsigned long long; // NOLINT(google-runtime-int) - auto* address_as_ull = reinterpret_cast(address); - u64_t old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS( - address_as_ull, assumed, __double_as_longlong(max(val, __longlong_as_double(assumed)))); - } while (assumed != old); - return __longlong_as_double(old); -} -template <> -__device__ inline int64_t atomic_max(int64_t* address, int64_t val) -{ - using u64_t = unsigned long long; // NOLINT(google-runtime-int) - auto* val_as_u64 = reinterpret_cast(&val); - 
auto* address_as_u64 = reinterpret_cast(address); - auto ret = atomicMax(address_as_u64, *val_as_u64); - return *reinterpret_cast(&ret); -} -template <> -__device__ inline uint64_t atomic_max(uint64_t* address, uint64_t val) -{ - using u64_t = unsigned long long; // NOLINT(google-runtime-int) - auto* val_as_u64 = reinterpret_cast(&val); - auto* address_as_u64 = reinterpret_cast(address); - auto ret = atomicMax(address_as_u64, *val_as_u64); - return *reinterpret_cast(&ret); -} -/** @} */ - -} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_core.hpp b/cpp/src/from_cugraph_ops/device_core.hpp deleted file mode 100644 index b548d2d4d1f..00000000000 --- a/cpp/src/from_cugraph_ops/device_core.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - */ - -#pragma once - -#include "macros.hpp" - -namespace cugraph::ops::utils { - -/** number of threads per warp */ -static constexpr int WARP_SIZE = 32; - -/** minimum CUDA version required for warp shfl sync functions */ -static constexpr int CUDA_VER_WARP_SHFL = 9000; - -/** - * @brief Provide a ceiling division operation ie. ceil(a / b) - * - * @tparam IntT supposed to be only integers for now! - * - * @param[in] a dividend - * @param[in] b divisor - */ -template -constexpr CUGRAPH_OPS_HD IntT ceil_div(IntT a, IntT b) -{ - return (a + b - 1) / b; -} - -/** - * @brief Provide an alignment function ie. ceil(a / b) * b - * - * @tparam IntT supposed to be only integers for now! 
- * - * @param[in] a dividend - * @param[in] b divisor - */ -template -constexpr CUGRAPH_OPS_HD IntT align_to(IntT a, IntT b) -{ - return ceil_div(a, b) * b; -} - -} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_dim.cuh b/cpp/src/from_cugraph_ops/device_dim.cuh deleted file mode 100644 index 275d0edd485..00000000000 --- a/cpp/src/from_cugraph_ops/device_dim.cuh +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - */ - -#pragma once - -#include "device_core.hpp" - -namespace cugraph::ops::utils { - -/** get the lane id of the current thread */ -__device__ __forceinline__ int lane_id() -{ - int id; - asm("mov.s32 %0, %%laneid;" : "=r"(id)); - return id; -} - -/** - * get the flat id of the current thread (within block) - * template parameters allow to control which CTA dimensions are used - */ -template -__device__ __forceinline__ int flat_id() -{ - if (!USE_X && !USE_Y && !USE_Z) - return 0; // weird case, but if we get here, we should have 1 thread - if (!USE_X && !USE_Y && USE_Z) return threadIdx.z; - if (!USE_X && USE_Y && !USE_Z) return threadIdx.y; - if (!USE_X && USE_Y && USE_Z) return threadIdx.y + threadIdx.z * blockDim.y; - if (USE_X && !USE_Y && !USE_Z) return threadIdx.x; - if (USE_X && !USE_Y && USE_Z) return threadIdx.x + threadIdx.z * blockDim.x; - if (USE_X && USE_Y && !USE_Z) return threadIdx.x + threadIdx.y * blockDim.x; - // USE_X && USE_Y && USE_Z - return threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; -} - -/** - * get the number of warps of the current block - * template parameters allow to control which CTA dimensions are used - */ -template -__device__ __forceinline__ int num_warps() -{ - if (!USE_X && !USE_Y && !USE_Z) - return 1; // weird case, but if 
we get here, we should have 1 thread - if (!USE_X && !USE_Y && USE_Z) return ceil_div(blockDim.z, WARP_SIZE); - if (!USE_X && USE_Y && !USE_Z) return ceil_div(blockDim.y, WARP_SIZE); - if (!USE_X && USE_Y && USE_Z) return ceil_div(blockDim.y * blockDim.z, WARP_SIZE); - if (USE_X && !USE_Y && !USE_Z) return ceil_div(blockDim.x, WARP_SIZE); - if (USE_X && !USE_Y && USE_Z) return ceil_div(blockDim.x * blockDim.z, WARP_SIZE); - if (USE_X && USE_Y && !USE_Z) return ceil_div(blockDim.x * blockDim.y, WARP_SIZE); - // USE_X && USE_Y && USE_Z - return ceil_div(blockDim.x * blockDim.y * blockDim.z, WARP_SIZE); -} - -/** - * get the warp id of the current thread - * template parameters allow to control which CTA dimensions are used - * @note: this only makes sense if the first used dimension of the CTA size - * is a multiple of WARP_SIZE. If this is not the case, use - * `flat_id<...>() / WARP_SIZE` to get the warp id of the current thread - */ -template -__device__ __forceinline__ int warp_id() -{ - if (!USE_X && !USE_Y && !USE_Z) - return 0; // weird case, but if we get here, we should have 1 thread - if (!USE_X && !USE_Y && USE_Z) return threadIdx.z / WARP_SIZE; - if (!USE_X && USE_Y && !USE_Z) return threadIdx.y / WARP_SIZE; - if (!USE_X && USE_Y && USE_Z) - return threadIdx.y / WARP_SIZE + threadIdx.z * num_warps(); - if (USE_X && !USE_Y && !USE_Z) return threadIdx.x / WARP_SIZE; - if (USE_X && !USE_Y && USE_Z) - return threadIdx.x / WARP_SIZE + threadIdx.z * num_warps(); - if (USE_X && USE_Y && !USE_Z) - return threadIdx.x / WARP_SIZE + threadIdx.y * num_warps(); - // USE_X && USE_Y && USE_Z - return threadIdx.x / WARP_SIZE + threadIdx.y * num_warps() + - threadIdx.z * blockDim.y * num_warps(); -} - -/** - * get the block dimension of the current executing block - * template parameters allow to control which CTA dimensions are used - */ -template -__device__ __forceinline__ int block_dim() -{ - if (!USE_X && !USE_Y && !USE_Z) - return 1; // weird case, but if we get 
here, we should have 1 thread - if (!USE_X && !USE_Y && USE_Z) return blockDim.z; - if (!USE_X && USE_Y && !USE_Z) return blockDim.y; - if (!USE_X && USE_Y && USE_Z) return blockDim.y * blockDim.z; - if (USE_X && !USE_Y && !USE_Z) return blockDim.x; - if (USE_X && !USE_Y && USE_Z) return blockDim.x * blockDim.z; - if (USE_X && USE_Y && !USE_Z) return blockDim.x * blockDim.y; - // USE_X && USE_Y && USE_Z - return blockDim.x * blockDim.y * blockDim.z; -} - -/** - * get the flat id of the current thread (within device/grid) - * template parameters allow to control which grid and block/CTA dimensions are used - */ -template -__device__ __forceinline__ int flat_grid_id() -{ - auto b_id = flat_id(); - auto b_dim = block_dim(); - if (!G_USE_X && !G_USE_Y && !G_USE_Z) - return 0; // weird case, but if we get here, we should have 1 thread - if (!G_USE_X && !G_USE_Y && G_USE_Z) return blockIdx.z * b_dim + b_id; - if (!G_USE_X && G_USE_Y && !G_USE_Z) return blockIdx.y * b_dim + b_id; - if (!G_USE_X && G_USE_Y && G_USE_Z) return blockIdx.y * b_dim + blockIdx.z * blockDim.z + b_id; - if (G_USE_X && !G_USE_Y && !G_USE_Z) return blockIdx.x * b_dim + b_id; - if (G_USE_X && !G_USE_Y && G_USE_Z) return blockIdx.x * b_dim + blockIdx.z * blockDim.z + b_id; - if (G_USE_X && G_USE_Y && !G_USE_Z) return blockIdx.x * b_dim + blockIdx.y * blockDim.y + b_id; - // G_USE_X && G_USE_Y && G_USE_Z - return blockIdx.x * b_dim + blockIdx.y * blockDim.y * blockDim.z + blockIdx.z * blockDim.z + b_id; -} - -} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_smem_helper.cuh b/cpp/src/from_cugraph_ops/device_smem_helper.cuh deleted file mode 100644 index f1b5be071d9..00000000000 --- a/cpp/src/from_cugraph_ops/device_smem_helper.cuh +++ /dev/null @@ -1,270 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. 
and - * international Copyright laws. - */ - -#pragma once - -#include "device_core.hpp" - -#include -#include -#include - -namespace cugraph::ops::utils { - -// The following struct must be used to transmit the size and alignment of -// a field to the shared memory helpers below. -// By default, the alignment is just like the alignment of the original data type. -template -struct field_type { - using data_t = DataT; - static constexpr int32_t BYTES = static_cast(sizeof(DataT)); - static constexpr int32_t ALIGNMENT = ALIGN > 0 ? ALIGN : alignof(DataT); -}; - -// Imagine we have 2 fields of data in shared memory, one for ints, one for doubles. -// The intended usage of the following class in simple cases is as follows: -// 1. specify the type somewhere for both host and kernel code: -// using special_smem_name_t = smem_helper< 0, 0, field_type, field_type >; -// /* can be simplified to the following: */ -// using special_smem_name_t = smem_simple_t< int, double >; -// 2. in host code, get the size of shared memory: -// int32_t smem_sizes[] = {n_ints, n_doubles}; -// /* note: sizes are always in number of elements, not bytes */ -// /* sizes always have type `int32_t` */ -// auto size = special_smem_name_t::get_size(sizes); -// 3. in device code, call the empty constructor: -// special_smem_name_t helper {}; -// int* s_ints; -// double* s_doubles; -// int32_t smem_sizes[] = {n_ints, n_doubles}; -// helper.set_ptrs(sizes, s_ints, s_doubles); -// -// For more complicated use cases, it is often useful to create a struct overloading -// operator[] and passing that to the `get_size` or `set_ptrs` helpers. -// The struct can also be used to directly pass the size information from -// host code (launch) to the kernel, avoiding duplication of calculating sizes. -// Be aware that this overload must have a `__host__ __device__` signature. 
-// Here is an example struct for the above use case: -// struct sizes_t { -// int32_t n_ints, n_doubles; -// __host__ __device__ sizes_t() = delete; -// __host__ __device__ sizes_t(int32_t _n_ints, int32_t _n_doubles) : -// n_ints(_n_ints), n_doubles(_n_doubles) {} -// -// /* you may also just return int32_t here instead of const int32_t& */ -// __host__ __device__ const int32_t& operator[](int idx) const -// { -// return idx == 0 ? n_ints : n_doubles; -// } -// }; -// -// The ALIGN_INIT template parameter is important for correctness: -// By default (ALIGN_INIT=0), we assume that all alignments are powers of 2, -// and we set ALIGN_INIT to the max alignment of the fields. If you want more -// control, you can set it yourself, but we always assume that it is a multiple -// of all alignment values of the fields. -// -// The N_UNIT_FIELDS template parameters allows specifying sub-spaces -// for a given number of "units" (often warps) such that the first -// `N_UNIT_FIELDS` fields are reserved sub-spaces per unit. -// In this case, the `get_size` and `set_ptrs` methods are modified such that -// you have to specify the number of units, and for `set_ptrs` the unit ID -// as well. -// This is useful for reserving exclusive shared memory per warp for example. -// Each unit (warp) will have its sub-space (containing the `N_UNIT_FIELDS` -// fields) aligned to the initial alignment as described above. 
-template -class smem_helper { - public: - static constexpr size_t N_ARGS = sizeof...(FieldsT); - - protected: - static_assert(N_ARGS > 0, "smem_helper: must have at least one field type"); - static_assert(N_UNIT_FIELDS >= 0, "smem_helper: #unit fields must be non-negative"); - static_assert(N_UNIT_FIELDS <= N_ARGS, - "smem_helper: #unit fields must be smaller than #field types"); - // following static assertion for FieldsT to not be scalar types is based on - // https://stackoverflow.com/a/28253503/4134127 - template - struct bool_pack; - template - using all_true_t = std::is_same, bool_pack>; - static_assert(all_true_t::value...>::value, - "smem_helper: the given field template types must be of type `field_type` and " - "cannot be scalars"); - - template - __host__ __device__ static constexpr typename std::enable_if<(IDX < N_ARGS), int32_t>::type - max_align() - { - using f_t = typename std::tuple_element>::type; - static_assert(f_t::ALIGNMENT > 0, "field alignments must be greater than 0"); - return max_align() > f_t::ALIGNMENT ? max_align() : f_t::ALIGNMENT; - } - template - __host__ __device__ static constexpr typename std::enable_if<(IDX >= N_ARGS), int32_t>::type - max_align() - { - return -1; - } - - // this is assumed to be a multiple of all alignments - static constexpr int32_t ALIGN_BASE = ALIGN_INIT > 0 ? ALIGN_INIT : max_align<0>(); - - // here we exploit that the base pointer must be aligned to 16 bytes. - // if 16 is a multiple of ALIGN_BASE, that means we don't have any overhead. - // if ALIGN_BASE is a multiple of 16, it means that we need at most - // ALIGN_BASE - 16 extra bytes, otherwise it's ALIGN_BASE - 1 - static constexpr int32_t SIZE_OVERHEAD = 16 % ALIGN_BASE == 0 ? 0 - : ALIGN_BASE % 16 == 0 ? 
ALIGN_BASE - 16 - : ALIGN_BASE - 1; - - public: - // cannot easily use "= default" here for host-only code - // NOLINTNEXTLINE(modernize-use-equals-default) - __host__ __device__ smem_helper() - { -#if defined(__CUDA_ARCH__) - // must be aligned to 16 bytes on all supported architectures - // (don't have a reference for this at the moment!) - extern __shared__ uint8_t smem[]; - // align manually to `ALIGN_BASE`: this avoids the `__align(X)__` attribute - // which can cause issues if this is used in the same compilation unit - // with different types / alignments. - // In any case, the compiler/hardware cannot do a better job at providing - // an aligned pointer than we can do manually. - auto smem_aligned = align_to(reinterpret_cast(smem), uintptr_t(ALIGN_BASE)); - base_ptr_ = reinterpret_cast(smem_aligned); -#endif - } - - template - __host__ __device__ static inline typename std::enable_if<(N <= 0), int32_t>::type get_size( - const SizeT& sizes) - { - auto current_total = 0; // base pointer must be aligned to ALIGN_BASE - size_helper<1>(current_total, sizes); - return SIZE_OVERHEAD + current_total; - } - - template - __host__ __device__ static inline typename std::enable_if<(N > 0), int32_t>::type get_size( - const int32_t n_units, const SizeT& sizes) - { - auto current_total = 0; // base pointer must be aligned to all alignments - unit_size_helper<1>(current_total, sizes); - // since the unit size is aligned to ALIGN_BASE, every base pointer for - // each unit as well as the base pointer after all units is aligned to - // ALIGN_BASE: since that is a multiple of all alignments, we can safely - // continue adding the sizes afterwards - auto unit_size = align_to(current_total, ALIGN_BASE); - current_total = 0; // base pointer must be aligned to all alignments - size_helper(current_total, sizes); - return SIZE_OVERHEAD + unit_size * n_units + current_total; - } - - template - __device__ inline typename std::enable_if<(N <= 0)>::type set_ptrs( - const SizeT& sizes, 
typename FieldsT::data_t*&... ptrs) const - { - return ptrs_helper<1>(0, 0, 0, 0, sizes, ptrs...); - } - - template - __device__ inline typename std::enable_if<(N > 0)>::type set_ptrs( - const int32_t& unit_id, - const int32_t& n_units, - const SizeT& sizes, - typename FieldsT::data_t*&... ptrs) const - { - auto current_total = 0; // base pointer must be aligned to all alignments - unit_size_helper<1>(current_total, sizes); - // see explanation in `get_size` for what aligning to ALIGN_BASE means - auto unit_size = align_to(current_total, ALIGN_BASE); - return ptrs_helper<1>(0, unit_id, unit_size, n_units, sizes, ptrs...); - } - - protected: - template - __host__ __device__ static inline void single_size(int32_t& current_total, const SizeT& sizes) - { - using next_field_t = typename std::tuple_element<(NEXT < N_ARGS ? NEXT : N_ARGS - 1), - std::tuple>::type; - using this_field_t = typename std::tuple_element<(NEXT < N_ARGS ? NEXT - 1 : N_ARGS - 1), - std::tuple>::type; - static constexpr int32_t ALIGN = - NEXT == N_UNIT_FIELDS || NEXT >= N_ARGS ? 
1 : next_field_t::ALIGNMENT; - current_total = align_to(current_total + sizes[NEXT - 1] * this_field_t::BYTES, ALIGN); - } - - // parentheses in `enable_if` here are used to help the parser understand "<>" - template - __host__ __device__ static inline typename std::enable_if<(NEXT <= N_ARGS)>::type size_helper( - int32_t& current_total, const SizeT& sizes) - { - single_size(current_total, sizes); - size_helper(current_total, sizes); - } - template - __host__ __device__ static inline typename std::enable_if<(NEXT > N_ARGS)>::type size_helper( - int32_t& /* current_total */, const SizeT& /* sizes */) - { - } - - template - __host__ __device__ static inline typename std::enable_if<(NEXT <= N_UNIT_FIELDS)>::type - unit_size_helper(int32_t& current_total, const SizeT& sizes) - { - single_size(current_total, sizes); - unit_size_helper(current_total, sizes); - } - template - __host__ __device__ static inline typename std::enable_if<(NEXT > N_UNIT_FIELDS)>::type - unit_size_helper(int32_t& /* current_total */, const SizeT& /* sizes */) - { - } - - template - __device__ inline void ptrs_helper(const int32_t& /* offset */, - const int32_t& /* unit_id */, - const int32_t& /* unit_size */, - const int32_t& /* n_units */, - const SizeT& /* sizes */) const - { - } - template - __device__ inline void ptrs_helper(const int32_t& offset, - const int32_t& unit_id, - const int32_t& unit_size, - const int32_t& n_units, - const SizeT& sizes, - PtrT*& ptr, - PtrsT*&... ptrs) const - { - // see `get_size`: base_ptr_ + u_off is always aligned to all alignments - // (whether for each individual unit or after all units) - auto u_off = NEXT <= N_UNIT_FIELDS ? 
unit_id * unit_size : n_units * unit_size; - ptr = reinterpret_cast(base_ptr_ + (u_off + offset)); - int32_t next_offset = offset; - if (NEXT == N_UNIT_FIELDS) - next_offset = 0; // pointer after all unit fields is aligned to all alignments - else - single_size(next_offset, sizes); - ptrs_helper(next_offset, unit_id, unit_size, n_units, sizes, ptrs...); - } - - uint8_t* base_ptr_{nullptr}; -}; - -template -using smem_simple_t = smem_helper<0, 0, field_type...>; - -template -using smem_unit_simple_t = smem_helper<0, N_UNIT_FIELDS, field_type...>; - -} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/device_warp_collectives.cuh b/cpp/src/from_cugraph_ops/device_warp_collectives.cuh deleted file mode 100644 index 198b3be2f12..00000000000 --- a/cpp/src/from_cugraph_ops/device_warp_collectives.cuh +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - */ - -#pragma once - -#include "device_core.hpp" -#include "device_dim.cuh" -#include "macros.hpp" - -#include - -namespace cugraph::ops::utils { - -/** - * @brief get a bit mask for the `n_threads` lowest threads of a warp - * - * @param[in] n_threads number of threads in the mask - * - * @return the bit mask - */ -__host__ __device__ constexpr uint32_t low_thread_mask(int n_threads) -{ - return n_threads >= WARP_SIZE ? 0xffffffffU : (1U << n_threads) - 1U; -} - -/** - * apply a warp-wide sync (useful from Volta+ archs) - * - * @tparam NP number of participating threads - * - * @note This works on Pascal and earlier archs as well, but all threads with - * lane id <= NP must enter this function together and in convergence. 
- */ -template -__device__ inline void warp_sync() -{ - __syncwarp(low_thread_mask(NP)); -} - -/** - * @brief Shuffle the data inside a warp - * - * @tparam DataT the data type (currently assumed to be 4B) - * - * @param[in] val value to be shuffled - * @param[in] src_lane lane from where to shuffle - * @param[in] width lane width - * @param[in] mask mask of participating threads (Volta+) - * - * @return the shuffled data - */ -template -__device__ inline DataT shfl(DataT val, - int src_lane, - int width = WARP_SIZE, - uint32_t mask = 0xffffffffU) -{ - static_assert(CUDART_VERSION >= CUDA_VER_WARP_SHFL, - "Expected CUDA >= 9 for warp synchronous shuffle"); - return __shfl_sync(mask, val, src_lane, width); -} - -/** - * @brief Warp-level sum reduction - * - * @tparam DataT data type - * @tparam NP number of participating threads. - * must be a power of 2 and at most warp size - * - * @param[in] val input value - * - * @return only the lane0 will contain valid reduced result - * - * @note Why not cub? Because cub doesn't seem to allow working with arbitrary - * number of warps in a block. - * - * @note All threads with lane id <= NP must enter this function together - * - * TODO(mjoux) Expand this to support arbitrary reduction ops - */ -template -__device__ inline DataT warp_reduce(DataT val) -{ - static constexpr uint32_t MASK = low_thread_mask(NP); - CUGRAPH_OPS_UNROLL - for (int i = NP / 2; i > 0; i >>= 1) { - DataT tmp = shfl(val, lane_id() + i, NP, MASK); - val += tmp; - } - return val; -} - -} // namespace cugraph::ops::utils diff --git a/cpp/src/from_cugraph_ops/macros.hpp b/cpp/src/from_cugraph_ops/macros.hpp deleted file mode 100644 index 0ff08af0b1a..00000000000 --- a/cpp/src/from_cugraph_ops/macros.hpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. 
and - * international Copyright laws. - */ - -#define CUGRAPH_OPS_STRINGIFY_DETAIL(x) #x -#define CUGRAPH_OPS_STRINGIFY(x) CUGRAPH_OPS_STRINGIFY_DETAIL(x) - -#define CUGRAPH_OPS_UNROLL _Pragma("unroll") -#if defined(__clang__) && defined(__CUDA__) -// clang wants pragma unroll without parentheses -#define CUGRAPH_OPS_UNROLL_N(n) _Pragma(CUGRAPH_OPS_STRINGIFY(unroll n)) -#else -// nvcc / nvrtc want pragma unroll with parentheses -#define CUGRAPH_OPS_UNROLL_N(n) _Pragma(CUGRAPH_OPS_STRINGIFY(unroll(n))) -#endif - -#if defined(__clang__) -#define CUGRAPH_OPS_CONSTEXPR_D constexpr -#else -#define CUGRAPH_OPS_CONSTEXPR_D constexpr __device__ -#endif - -#if defined(__CUDACC__) || defined(__CUDA__) -#define CUGRAPH_OPS_HD __host__ __device__ -#else -#define CUGRAPH_OPS_HD -#endif - -// The CUGRAPH_OPS_KERNEL specificies that a kernel has hidden visibility -// -// cugraph-ops needs to ensure that the visibility of its CUGRAPH_OPS_KERNEL function -// templates have hidden visibility ( default is weak visibility). -// -// When kernels have weak visibility it means that if two dynamic libraries -// both contain identical instantiations of a kernel/template, then the linker -// will discard one of the two instantiations and use only one of them. -// -// Do to unique requirements of how the CUDA works this de-deduplication -// can lead to the wrong kernels being called ( SM version being wrong ), -// silently no kernel being called at all, or cuda runtime errors being -// thrown. -// -// https://github.com/rapidsai/raft/issues/1722 -#ifndef CUGRAPH_OPS_KERNEL -#define CUGRAPH_OPS_KERNEL __global__ static -#endif diff --git a/cpp/src/from_cugraph_ops/sampling.hpp b/cpp/src/from_cugraph_ops/sampling.hpp deleted file mode 100644 index 5663b8d9c03..00000000000 --- a/cpp/src/from_cugraph_ops/sampling.hpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. 
- * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - */ - -#pragma once - -// FIXME: This is only here for the prims... -// Need to look how Seunghwa fixed this in his PR -#include - -#include - -#include - -#include - -namespace cugraph::legacy::ops::graph { - -/** - * @brief Generate indexes given population sizes and a sample size, - * with or without replacement - * - * @param[out] index The (dense) index matrix. [on device] - * [dim = `n_sizes x sample_size`] - * In case `replace` is `false`, this may contain - * `ops::graph::INVALID_ID` - * if no index could be generated. - * @param[inout] rng RAFT RngState state object - * @param[in] sizes Input array of population sizes [on device] - * [len = `n_sizes`] - * @param[in] n_sizes number of sizes to sample from. - * @param[in] sample_size max number of indexes to be sampled per element - * in `sizes`. Assumed to be <= 384 at the moment. - * @param[in] replace If `true`, sample with replacement, otherwise - * without replacement. - * @param[in] stream cuda stream - * - @{ - */ -void get_sampling_index(int32_t* index, - raft::random::RngState& rng, - const int32_t* sizes, - int32_t n_sizes, - int32_t sample_size, - bool replace, - cudaStream_t stream); -void get_sampling_index(int64_t* index, - raft::random::RngState& rng, - const int64_t* sizes, - int64_t n_sizes, - int32_t sample_size, - bool replace, - cudaStream_t stream); - -} // namespace cugraph::legacy::ops::graph diff --git a/cpp/src/from_cugraph_ops/sampling_index.cu b/cpp/src/from_cugraph_ops/sampling_index.cu deleted file mode 100644 index fb1f4ac3f1e..00000000000 --- a/cpp/src/from_cugraph_ops/sampling_index.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. 
- * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - */ - -#include "sampling.hpp" -#include "sampling_index.cuh" - -namespace cugraph::legacy::ops::graph { - -void get_sampling_index(int32_t* index, - raft::random::RngState& rng, - const int32_t* sizes, - int32_t n_sizes, - int32_t sample_size, - bool replace, - cudaStream_t stream) -{ - get_sampling_index_impl(index, rng, sizes, n_sizes, sample_size, replace, stream); -} - -void get_sampling_index(int64_t* index, - raft::random::RngState& rng, - const int64_t* sizes, - int64_t n_sizes, - int32_t sample_size, - bool replace, - cudaStream_t stream) -{ - get_sampling_index_impl(index, rng, sizes, n_sizes, sample_size, replace, stream); -} - -} // namespace cugraph::legacy::ops::graph diff --git a/cpp/src/from_cugraph_ops/sampling_index.cuh b/cpp/src/from_cugraph_ops/sampling_index.cuh deleted file mode 100644 index 9ac574315bb..00000000000 --- a/cpp/src/from_cugraph_ops/sampling_index.cuh +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - */ - -#pragma once - -#include "algo_R.cuh" -#include "sampling.hpp" - -#include -#include - -#include - -namespace cugraph::legacy::ops::graph { - -namespace utils = cugraph::ops::utils; - -template -using smem_algo_r_t = utils::smem_unit_simple_t<1, IdxT>; - -template -CUGRAPH_OPS_KERNEL void index_replace_kernel(raft::random::DeviceState rng_state, - IdxT* index, - const IdxT* sizes, - IdxT n_sizes, - int sample_size) -{ - using rand_t = std::make_unsigned_t; - // a warp-wide implementation. 
- auto lane = cugraph::ops::utils::lane_id(); - auto warp = utils::warp_id(); // 1D block with X dim - auto n_warps = utils::num_warps(); // 1D block with X dim - auto row_id = warp + static_cast(blockIdx.x) * IdxT{n_warps}; - if (row_id >= n_sizes) return; - // 1. load population size (once per warp) - IdxT size = IdxT{0}; - if (lane == 0) size = sizes[row_id]; - - // 2. shuffle it to all threads in warp - size = utils::shfl(size, 0); - - // 3. check valid size: possible early-out - if (size <= 0) { - CUGRAPH_OPS_UNROLL - for (auto i = lane; i < sample_size; i += utils::WARP_SIZE) { - index[row_id * IdxT{sample_size} + IdxT{i}] = cugraph::invalid_idx::value; - } - return; - } - - // 4. every thread generates its indexes - auto flat_id = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - GenT gen(rng_state, flat_id); - raft::random::UniformIntDistParams int_params{}; - int_params.start = IdxT{0}; - int_params.end = size; - int_params.diff = static_cast(size); - CUGRAPH_OPS_UNROLL - for (auto i = lane; i < sample_size; i += utils::WARP_SIZE) { - IdxT idx = IdxT{0}; - raft::random::custom_next(gen, &idx, int_params, 0, 0 /* idx / stride unused */); - - // 5. 
output index - index[row_id * IdxT{sample_size} + IdxT{i}] = idx; - } -} - -template -void get_sampling_index_replace(IdxT* index, - raft::random::RngState& rng, - const IdxT* sizes, - IdxT n_sizes, - int32_t sample_size, - cudaStream_t stream) -{ - // keep thread per block fairly low since we can expect sample_size < warp_size - // thus we want to have as many blocks as possible to increase parallelism - static constexpr int TPB = 128; - static constexpr int N_WARPS = TPB / utils::WARP_SIZE; - auto n_blks = utils::ceil_div(n_sizes, N_WARPS); - RAFT_CALL_RNG_FUNC( - rng, (index_replace_kernel<<>>), index, sizes, n_sizes, sample_size); - auto thread_rs = utils::ceil_div(IdxT{sample_size}, utils::WARP_SIZE); - rng.advance(static_cast(n_blks * TPB), thread_rs * sizeof(IdxT) / sizeof(int32_t)); - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -CUGRAPH_OPS_KERNEL void index_algo_r_kernel(raft::random::DeviceState rng_state, - IdxT* index, - const IdxT* sizes, - IdxT n_sizes, - int sample_size) -{ - using rand_t = std::make_unsigned_t; - // a warp-wide implementation. - auto lane = utils::lane_id(); - auto warp = utils::warp_id(); // 1D block with X dim - auto row_id = warp + static_cast(blockIdx.x) * IdxT{N_WARPS}; - if (row_id >= n_sizes) return; - IdxT* s_idx; - smem_algo_r_t smem{}; - int32_t smem_sizes[] = {sample_size}; - smem.set_ptrs(warp, N_WARPS, smem_sizes, s_idx); - // 1. load population size (once per warp) - IdxT size = IdxT{0}; - if (lane == 0) size = sizes[row_id]; - - // 2. shuffle it to all threads in warp - size = utils::shfl(size, 0); - - // 3. Get algo R indexes per warp - cugraph::ops::graph::warp_algo_r_index( - s_idx, size, IdxT{0}, sample_size, rng_state); - - CUGRAPH_OPS_UNROLL - for (auto i = lane; i < sample_size; i += utils::WARP_SIZE) { - // 4. output index - // still need to check if the index is actually valid - auto idx = s_idx[i]; - index[row_id * IdxT{sample_size} + IdxT{i}] = - idx >= size ? 
cugraph::invalid_idx::value : idx; - } -} - -template -void get_sampling_index_reservoir(IdxT* index, - raft::random::RngState& rng, - const IdxT* sizes, - IdxT n_sizes, - int32_t sample_size, - cudaStream_t stream) -{ - // same TPB as in algo R: increased SM occupancy is most important here - static constexpr int TPB = 512; - static constexpr int N_WARPS = TPB / utils::WARP_SIZE; - auto n_blks = utils::ceil_div(n_sizes, N_WARPS); - int32_t smem_sizes[] = {sample_size}; - size_t smem_size = smem_algo_r_t::get_size(N_WARPS, smem_sizes); - RAFT_CALL_RNG_FUNC(rng, - (index_algo_r_kernel<<>>), - index, - sizes, - n_sizes, - sample_size); - auto thread_rs = utils::ceil_div( - std::max(IdxT{0}, std::min(std::numeric_limits::max(), n_sizes) - IdxT{sample_size}), - utils::WARP_SIZE); - rng.advance(static_cast(n_blks * TPB), thread_rs * sizeof(IdxT) / sizeof(int32_t)); - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void get_sampling_index_impl(IdxT* index, - raft::random::RngState& rng, - const IdxT* sizes, - IdxT n_sizes, - int32_t sample_size, - bool replace, - cudaStream_t stream) -{ - if (replace) { - get_sampling_index_replace(index, rng, sizes, n_sizes, sample_size, stream); - } else { - get_sampling_index_reservoir(index, rng, sizes, n_sizes, sample_size, stream); - } -} - -} // namespace cugraph::legacy::ops::graph diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index ceb673b14f7..dcbef7632c2 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -15,7 +15,6 @@ */ #pragma once -#include "from_cugraph_ops/sampling.hpp" #include "prims/detail/partition_v_frontier.cuh" #include "prims/detail/transform_v_frontier_e.cuh" #include "prims/property_op_utils.cuh" @@ -1254,7 +1253,6 @@ rmm::device_uvector compute_homogeneous_uniform_sampling_index_without_r } if (retry_segment_indices) { - 
#if 1 sample_nbr_index_with_replacement( handle, raft::device_span((*retry_degrees).data(), (*retry_degrees).size()), @@ -1262,16 +1260,6 @@ rmm::device_uvector compute_homogeneous_uniform_sampling_index_without_r raft::device_span((*retry_nbr_indices).data(), (*retry_nbr_indices).size()), rng_state, high_partition_oversampling_K); - #else - cugraph::legacy::ops::graph::get_sampling_index( - (*retry_nbr_indices).data(), - rng_state, - (*retry_degrees).begin(), - (*retry_degrees).size(), - static_cast(high_partition_oversampling_K), - true, - handle.get_stream()); - #endif } else { // FIXME: this temporary is unnecessary if we update get_sampling_index to take a thrust // iterator @@ -1280,7 +1268,6 @@ rmm::device_uvector compute_homogeneous_uniform_sampling_index_without_r segment_frontier_degree_first, segment_frontier_degree_first + num_segments, tmp_degrees.begin()); - #if 1 sample_nbr_index_with_replacement( handle, raft::device_span(tmp_degrees.data(), tmp_degrees.size()), @@ -1288,16 +1275,6 @@ rmm::device_uvector compute_homogeneous_uniform_sampling_index_without_r raft::device_span(tmp_nbr_indices.data(), tmp_nbr_indices.size()), rng_state, high_partition_oversampling_K); - #else - cugraph::legacy::ops::graph::get_sampling_index( - tmp_nbr_indices.data(), - rng_state, - tmp_degrees.data(), - num_segments, - static_cast(high_partition_oversampling_K), - true, - handle.get_stream()); - #endif } if (retry_segment_indices) { diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 3c177c9653d..c604b0e739e 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -15,7 +15,6 @@ */ #pragma once -#include "from_cugraph_ops/sampling.hpp" #include "prims/detail/sample_and_compute_local_nbr_indices.cuh" #include "prims/property_op_utils.cuh" From 5545dafc8fc3275e4c891c40021f8fc5910a6b85 Mon Sep 17 00:00:00 
2001 From: Seunghwa Kang Date: Thu, 30 Jan 2025 09:49:33 -0800 Subject: [PATCH 15/21] fix build error --- cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index 98eaab2cb4a..86f9fd570f5 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -3341,10 +3341,10 @@ biased_sample_with_replacement( ? cuda::std::make_optional>( (*key_indices).data() + local_frontier_sample_offsets[i], local_frontier_sample_offsets[i + 1] - local_frontier_sample_offsets[i]) - : thrust::nullopt, + : cuda::std::nullopt, edge_types = edge_types ? cuda::std::make_optional>( (*edge_types).data(), (*edge_types).size()) - : thrust::nullopt, + : cuda::std::nullopt, key_idx_to_unique_key_idx = raft::device_span( aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], local_frontier_offsets[i + 1] - local_frontier_offsets[i]), From d72a0a66ee1c9b26167167f7a710aa14604376f0 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 30 Jan 2025 10:15:26 -0800 Subject: [PATCH 16/21] copyright year --- cpp/src/structure/select_random_vertices_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/structure/select_random_vertices_impl.hpp b/cpp/src/structure/select_random_vertices_impl.hpp index 65b84de8952..eb3678030a5 100644 --- a/cpp/src/structure/select_random_vertices_impl.hpp +++ b/cpp/src/structure/select_random_vertices_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 5f88e93f9bb328ff78e1fad9ee04397d083b1f64 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 30 Jan 2025 13:34:20 -0800 Subject: [PATCH 17/21] udpate mg_graph_to_sg_graph to support edge types --- .../mg_betweenness_centrality_test.cpp | 5 ++- .../mg_edge_betweenness_centrality_test.cpp | 5 ++- .../mg_eigenvector_centrality_test.cpp | 5 ++- .../centrality/mg_katz_centrality_test.cpp | 5 ++- cpp/tests/community/mg_ecg_test.cpp | 5 ++- .../community/mg_edge_triangle_count_test.cpp | 6 ++-- cpp/tests/community/mg_egonet_test.cu | 9 ++++- cpp/tests/community/mg_k_truss_test.cpp | 10 ++++-- cpp/tests/community/mg_leiden_test.cpp | 5 ++- cpp/tests/community/mg_louvain_test.cpp | 5 ++- .../community/mg_triangle_count_test.cpp | 6 ++-- .../community/mg_weighted_matching_test.cpp | 5 ++- .../mg_weakly_connected_components_test.cpp | 6 ++-- cpp/tests/cores/mg_core_number_test.cpp | 6 ++-- cpp/tests/cores/mg_k_core_test.cpp | 12 +++++-- cpp/tests/link_analysis/mg_hits_test.cpp | 5 ++- cpp/tests/link_analysis/mg_pagerank_test.cpp | 5 ++- cpp/tests/mtmg/threaded_test_louvain.cu | 6 +++- cpp/tests/prims/mg_count_if_e.cu | 5 ++- cpp/tests/prims/mg_count_if_v.cu | 5 ++- cpp/tests/prims/mg_extract_transform_e.cu | 6 ++-- ...extract_transform_v_frontier_outgoing_e.cu | 6 ++-- ...r_v_pair_transform_dst_nbr_intersection.cu | 5 ++- ...transform_dst_nbr_weighted_intersection.cu | 5 ++- ...rm_reduce_dst_key_aggregated_outgoing_e.cu | 5 ++- ..._v_transform_reduce_incoming_outgoing_e.cu | 5 ++- cpp/tests/prims/mg_reduce_v.cu | 5 ++- ...st_nbr_intersection_of_e_endpoints_by_v.cu | 5 ++- cpp/tests/prims/mg_transform_reduce_e.cu | 5 ++- .../mg_transform_reduce_e_by_src_dst_key.cu | 5 ++- cpp/tests/prims/mg_transform_reduce_v.cu | 5 ++- ...orm_reduce_v_frontier_outgoing_e_by_dst.cu | 6 ++-- cpp/tests/structure/mg_coarsen_graph_test.cpp | 8 +++-- ..._count_self_loops_and_multi_edges_test.cpp | 5 ++- ...has_edge_and_compute_multiplicity_test.cpp | 3 +- 
.../structure/mg_induced_subgraph_test.cu | 9 ++++- cpp/tests/structure/mg_symmetrize_test.cpp | 5 ++- .../structure/mg_transpose_storage_test.cpp | 5 ++- cpp/tests/structure/mg_transpose_test.cpp | 5 ++- cpp/tests/traversal/mg_bfs_test.cpp | 6 ++-- .../traversal/mg_extract_bfs_paths_test.cu | 8 +++-- cpp/tests/traversal/mg_k_hop_nbrs_test.cpp | 6 ++-- cpp/tests/traversal/mg_sssp_test.cpp | 5 ++- cpp/tests/utilities/conversion_utilities.hpp | 10 +++++- .../utilities/conversion_utilities_impl.cuh | 36 +++++++++++++------ .../utilities/conversion_utilities_mg.cu | 16 +++++++++ 46 files changed, 241 insertions(+), 70 deletions(-) diff --git a/cpp/tests/centrality/mg_betweenness_centrality_test.cpp b/cpp/tests/centrality/mg_betweenness_centrality_test.cpp index 35f6a5157ff..e85c7abba85 100644 --- a/cpp/tests/centrality/mg_betweenness_centrality_test.cpp +++ b/cpp/tests/centrality/mg_betweenness_centrality_test.cpp @@ -60,6 +60,8 @@ class Tests_MGBetweennessCentrality template void run_current_test(std::tuple const& param) { + using edge_type_t = int32_t; + auto [betweenness_usecase, input_usecase] = param; HighResTimer hr_timer{}; @@ -152,12 +154,13 @@ class Tests_MGBetweennessCentrality std::optional< cugraph::edge_property_t, weight_t>> sg_edge_weights{std::nullopt}; - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp b/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp index ff100a33e40..614492e6a86 100644 --- a/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp +++ b/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp @@ -60,6 +60,8 @@ class 
Tests_MGEdgeBetweennessCentrality template void run_current_test(std::tuple const& param) { + using edge_type_t = int32_t; + constexpr bool do_expensive_check = false; auto [betweenness_usecase, input_usecase] = param; @@ -142,12 +144,13 @@ class Tests_MGEdgeBetweennessCentrality std::optional< cugraph::edge_property_t, weight_t>> sg_edge_weights{std::nullopt}; - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::optional>{std::nullopt}, false); diff --git a/cpp/tests/centrality/mg_eigenvector_centrality_test.cpp b/cpp/tests/centrality/mg_eigenvector_centrality_test.cpp index 0e2a0e37c1b..a4d13a888b1 100644 --- a/cpp/tests/centrality/mg_eigenvector_centrality_test.cpp +++ b/cpp/tests/centrality/mg_eigenvector_centrality_test.cpp @@ -63,6 +63,8 @@ class Tests_MGEigenvectorCentrality void run_current_test(EigenvectorCentrality_Usecase const& eigenvector_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. 
create MG graph @@ -144,12 +146,13 @@ class Tests_MGEigenvectorCentrality std::optional< cugraph::edge_property_t, weight_t>> sg_edge_weights{std::nullopt}; - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/centrality/mg_katz_centrality_test.cpp b/cpp/tests/centrality/mg_katz_centrality_test.cpp index 5ccade18c18..b829a70b772 100644 --- a/cpp/tests/centrality/mg_katz_centrality_test.cpp +++ b/cpp/tests/centrality/mg_katz_centrality_test.cpp @@ -61,6 +61,8 @@ class Tests_MGKatzCentrality void run_current_test(KatzCentrality_Usecase const& katz_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. 
create MG graph @@ -151,12 +153,13 @@ class Tests_MGKatzCentrality std::optional< cugraph::edge_property_t, weight_t>> sg_edge_weights{std::nullopt}; - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/community/mg_ecg_test.cpp b/cpp/tests/community/mg_ecg_test.cpp index 14d1697744e..d8cbc988c9e 100644 --- a/cpp/tests/community/mg_ecg_test.cpp +++ b/cpp/tests/community/mg_ecg_test.cpp @@ -70,6 +70,8 @@ class Tests_MGEcg : public ::testing::TestWithParam void run_current_test(std::tuple const& param) { + using edge_type_t = int32_t; + auto [ecg_usecase, input_usecase] = param; HighResTimer hr_timer{}; @@ -127,12 +129,13 @@ class Tests_MGEcg : public ::testing::TestWithParam, weight_t>> sg_edge_weights{std::nullopt}; - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::optional>{std::nullopt}, false); // crate a SG graph with MG graph vertex IDs diff --git a/cpp/tests/community/mg_edge_triangle_count_test.cpp b/cpp/tests/community/mg_edge_triangle_count_test.cpp index 5f00d4bbc5c..073a2a5f193 100644 --- a/cpp/tests/community/mg_edge_triangle_count_test.cpp +++ b/cpp/tests/community/mg_edge_triangle_count_test.cpp @@ -62,7 +62,8 @@ class Tests_MGEdgeTriangleCount void run_current_test(EdgeTriangleCount_Usecase const& edge_triangle_count_usecase, input_usecase_t const& input_usecase) { - using weight_t = float; + using weight_t = float; + using edge_type_t = int32_t; HighResTimer hr_timer{}; @@ 
-123,13 +124,14 @@ class Tests_MGEdgeTriangleCount std::optional< cugraph::edge_property_t, edge_t>> d_sg_cugraph_results{std::nullopt}; - std::tie(sg_graph, std::ignore, d_sg_cugraph_results, std::ignore) = + std::tie(sg_graph, std::ignore, d_sg_cugraph_results, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, // FIXME: Update 'create_graph_from_edgelist' to support int32_t and int64_t values std::make_optional(d_mg_cugraph_results.view()), + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/community/mg_egonet_test.cu b/cpp/tests/community/mg_egonet_test.cu index 130e01e8df9..0fcf6b97746 100644 --- a/cpp/tests/community/mg_egonet_test.cu +++ b/cpp/tests/community/mg_egonet_test.cu @@ -60,6 +60,8 @@ class Tests_MGEgonet template void run_current_test(std::tuple const& param) { + using edge_type_t = int32_t; + auto [egonet_usecase, input_usecase] = param; HighResTimer hr_timer{}; @@ -199,12 +201,17 @@ class Tests_MGEgonet triplet_first + d_mg_aggregate_edgelist_src.size()); } - auto [sg_graph, sg_edge_weights, sg_edge_ids, sg_number_map] = + cugraph::graph_t sg_graph(*handle_); + std::optional< + cugraph::edge_property_t, weight_t>> + sg_edge_weights{std::nullopt}; + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/community/mg_k_truss_test.cpp b/cpp/tests/community/mg_k_truss_test.cpp index d3463e73a6f..a1867e6bf25 100644 --- a/cpp/tests/community/mg_k_truss_test.cpp +++ b/cpp/tests/community/mg_k_truss_test.cpp @@ -63,7 +63,8 @@ class Tests_MGKTruss template void run_current_test(KTruss_Usecase const& k_truss_usecase, input_usecase_t 
const& input_usecase) { - using weight_t = float; + using edge_type_t = int32_t; + using weight_t = float; HighResTimer hr_timer{}; @@ -158,12 +159,17 @@ class Tests_MGKTruss } // 3-1. Convert to SG graph - auto [sg_graph, sg_edge_weights, sg_edge_ids, sg_number_map] = + cugraph::graph_t sg_graph(*handle_); + std::optional< + cugraph::edge_property_t, weight_t>> + sg_edge_weights{std::nullopt}; + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/community/mg_leiden_test.cpp b/cpp/tests/community/mg_leiden_test.cpp index 081c5cb6dfe..a35b5cbb20d 100644 --- a/cpp/tests/community/mg_leiden_test.cpp +++ b/cpp/tests/community/mg_leiden_test.cpp @@ -80,6 +80,8 @@ class Tests_MGLeiden weight_t theta, weight_t mg_modularity) { + using edge_type_t = int32_t; + auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); @@ -87,12 +89,13 @@ class Tests_MGLeiden std::optional< cugraph::edge_property_t, weight_t>> sg_edge_weights{std::nullopt}; - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::optional>{std::nullopt}, false); // crate an SG graph with MG graph vertex IDs diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp index 4aebd26c256..5aa8f3cf54d 100644 --- a/cpp/tests/community/mg_louvain_test.cpp +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -78,6 +78,8 @@ class Tests_MGLouvain weight_t resolution, weight_t mg_modularity) { + using edge_type_t = int32_t; + auto& comm = handle.get_comms(); auto const 
comm_rank = comm.get_rank(); @@ -85,12 +87,13 @@ class Tests_MGLouvain std::optional< cugraph::edge_property_t, weight_t>> sg_edge_weights{std::nullopt}; - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::optional>{std::nullopt}, false); // crate an SG graph with MG graph vertex IDs diff --git a/cpp/tests/community/mg_triangle_count_test.cpp b/cpp/tests/community/mg_triangle_count_test.cpp index b541933ca4d..e1e0997f2e3 100644 --- a/cpp/tests/community/mg_triangle_count_test.cpp +++ b/cpp/tests/community/mg_triangle_count_test.cpp @@ -64,7 +64,8 @@ class Tests_MGTriangleCount void run_current_test(TriangleCount_Usecase const& triangle_count_usecase, input_usecase_t const& input_usecase) { - using weight_t = float; + using weight_t = float; + using edge_type_t = int32_t; HighResTimer hr_timer{}; @@ -178,12 +179,13 @@ class Tests_MGTriangleCount d_mg_triangle_counts.size())); cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/community/mg_weighted_matching_test.cpp b/cpp/tests/community/mg_weighted_matching_test.cpp index 4e57450ace7..ee3b66f85df 100644 --- a/cpp/tests/community/mg_weighted_matching_test.cpp +++ b/cpp/tests/community/mg_weighted_matching_test.cpp @@ -57,6 +57,8 @@ class Tests_MGWeightedMatching template void run_current_test(std::tuple const& param) { + using edge_type_t = int32_t; + auto [weighted_matching_usecase, input_usecase] = 
param; HighResTimer hr_timer{}; @@ -143,12 +145,13 @@ class Tests_MGWeightedMatching std::optional< cugraph::edge_property_t, weight_t>> sg_edge_weights{std::nullopt}; - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::optional>(std::nullopt), false); diff --git a/cpp/tests/components/mg_weakly_connected_components_test.cpp b/cpp/tests/components/mg_weakly_connected_components_test.cpp index bb3bc826a71..2d8494d244c 100644 --- a/cpp/tests/components/mg_weakly_connected_components_test.cpp +++ b/cpp/tests/components/mg_weakly_connected_components_test.cpp @@ -61,7 +61,8 @@ class Tests_MGWeaklyConnectedComponents WeaklyConnectedComponents_Usecase const& weakly_connected_components_usecase, input_usecase_t const& input_usecase) { - using weight_t = float; + using weight_t = float; + using edge_type_t = int32_t; HighResTimer hr_timer{}; @@ -125,12 +126,13 @@ class Tests_MGWeaklyConnectedComponents raft::device_span(d_mg_components.data(), d_mg_components.size())); cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/cores/mg_core_number_test.cpp b/cpp/tests/cores/mg_core_number_test.cpp index f6b73be7b09..cad0fffd302 100644 --- a/cpp/tests/cores/mg_core_number_test.cpp +++ b/cpp/tests/cores/mg_core_number_test.cpp @@ -67,7 +67,8 @@ class Tests_MGCoreNumber void run_current_test(CoreNumber_Usecase const& core_number_usecase, input_usecase_t const& 
input_usecase) { - using weight_t = float; + using edge_type_t = int32_t; + using weight_t = float; HighResTimer hr_timer{}; @@ -143,12 +144,13 @@ class Tests_MGCoreNumber raft::device_span(d_mg_core_numbers.data(), d_mg_core_numbers.size())); cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/cores/mg_k_core_test.cpp b/cpp/tests/cores/mg_k_core_test.cpp index 8ebbff6f5a4..0e84a86b47c 100644 --- a/cpp/tests/cores/mg_k_core_test.cpp +++ b/cpp/tests/cores/mg_k_core_test.cpp @@ -60,11 +60,12 @@ class Tests_MGKCore : public ::testing::TestWithParam void run_current_test(std::tuple const& param) { + using weight_t = float; + using edge_type_t = int32_t; + constexpr bool renumber = true; auto [k_core_usecase, input_usecase] = param; - using weight_t = float; - HighResTimer hr_timer{}; if (cugraph::test::g_perf) { @@ -160,12 +161,17 @@ class Tests_MGKCore : public ::testing::TestWithParam>{std::nullopt}, raft::device_span(d_mg_core_numbers.data(), d_mg_core_numbers.size())); - auto [sg_graph, sg_edge_weights, sg_edge_ids, sg_number_map] = + cugraph::graph_t sg_graph(*handle_); + std::optional< + cugraph::edge_property_t, weight_t>> + sg_edge_weights{std::nullopt}; + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/link_analysis/mg_hits_test.cpp b/cpp/tests/link_analysis/mg_hits_test.cpp index 83e76472260..8fe65d0f7f4 
100644 --- a/cpp/tests/link_analysis/mg_hits_test.cpp +++ b/cpp/tests/link_analysis/mg_hits_test.cpp @@ -57,6 +57,8 @@ class Tests_MGHits : public ::testing::TestWithParam void run_current_test(Hits_Usecase const& hits_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. create MG graph @@ -186,12 +188,13 @@ class Tests_MGHits : public ::testing::TestWithParam, weight_t>> sg_edge_weights{std::nullopt}; - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp index f6ca6b03192..b005e4655df 100644 --- a/cpp/tests/link_analysis/mg_pagerank_test.cpp +++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp @@ -64,6 +64,8 @@ class Tests_MGPageRank void run_current_test(PageRank_Usecase const& pagerank_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. 
create MG graph @@ -202,12 +204,13 @@ class Tests_MGPageRank std::optional< cugraph::edge_property_t, weight_t>> sg_edge_weights{std::nullopt}; - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*d_mg_renumber_map).data(), (*d_mg_renumber_map).size()), false); diff --git a/cpp/tests/mtmg/threaded_test_louvain.cu b/cpp/tests/mtmg/threaded_test_louvain.cu index a2ec9244b38..7347e6c110e 100644 --- a/cpp/tests/mtmg/threaded_test_louvain.cu +++ b/cpp/tests/mtmg/threaded_test_louvain.cu @@ -397,13 +397,15 @@ class Tests_Multithreaded auto thread_handle = instance_manager->get_handle(); if (thread_handle.get_rank() == 0) { - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( thread_handle.raft_handle(), graph_view.get(thread_handle), edge_weights ? std::make_optional(edge_weights->get(thread_handle).view()) : std::nullopt, std::optional>{std::nullopt}, + std::optional>{ + std::nullopt}, std::optional>{std::nullopt}, false); // create an SG graph with MG graph vertex IDs } else { @@ -413,6 +415,8 @@ class Tests_Multithreaded edge_weights ? 
std::make_optional(edge_weights->get(thread_handle).view()) : std::nullopt, std::optional>{std::nullopt}, + std::optional>{ + std::nullopt}, std::optional>{std::nullopt}, false); // create an SG graph with MG graph vertex IDs } diff --git a/cpp/tests/prims/mg_count_if_e.cu b/cpp/tests/prims/mg_count_if_e.cu index 8796383f45d..86960dd9e5d 100644 --- a/cpp/tests/prims/mg_count_if_e.cu +++ b/cpp/tests/prims/mg_count_if_e.cu @@ -78,6 +78,8 @@ class Tests_MGCountIfE bool store_transposed> void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. create MG graph @@ -149,12 +151,13 @@ class Tests_MGCountIfE if (prims_usecase.check_correctness) { cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/prims/mg_count_if_v.cu b/cpp/tests/prims/mg_count_if_v.cu index 0d399f52acd..39bc7e58c72 100644 --- a/cpp/tests/prims/mg_count_if_v.cu +++ b/cpp/tests/prims/mg_count_if_v.cu @@ -74,6 +74,8 @@ class Tests_MGCountIfV template void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. 
create MG graph @@ -123,12 +125,13 @@ class Tests_MGCountIfV if (prims_usecase.check_correctness) { cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/prims/mg_extract_transform_e.cu b/cpp/tests/prims/mg_extract_transform_e.cu index 27e3f471c5b..6db8064be19 100644 --- a/cpp/tests/prims/mg_extract_transform_e.cu +++ b/cpp/tests/prims/mg_extract_transform_e.cu @@ -109,7 +109,8 @@ class Tests_MGExtractTransformE template void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) { - using result_t = int32_t; + using edge_type_t = int32_t; + using result_t = int32_t; static_assert(std::is_same_v || cugraph::is_arithmetic_or_thrust_tuple_of_arithmetic::value); @@ -211,12 +212,13 @@ class Tests_MGExtractTransformE } cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*d_mg_renumber_map_labels).data(), (*d_mg_renumber_map_labels).size()), false); diff --git a/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu b/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu index 0c625da0a6d..1c1594fd9d8 100644 --- a/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu +++ b/cpp/tests/prims/mg_extract_transform_v_frontier_outgoing_e.cu @@ -140,7 +140,8 @@ class Tests_MGExtractTransformVFrontierOutgoingE typename output_payload_t> void 
run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) { - using result_t = int32_t; + using edge_type_t = int32_t; + using result_t = int32_t; using key_t = std::conditional_t, vertex_t, thrust::tuple>; @@ -283,12 +284,13 @@ class Tests_MGExtractTransformVFrontierOutgoingE } cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*d_mg_renumber_map_labels).data(), (*d_mg_renumber_map_labels).size()), false); diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu index 76545fb04cd..7915c32a6ce 100644 --- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu +++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu @@ -86,6 +86,8 @@ class Tests_MGPerVPairTransformDstNbrIntersection template void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; auto const comm_rank = handle_->get_comms().get_rank(); @@ -231,12 +233,13 @@ class Tests_MGPerVPairTransformDstNbrIntersection *handle_, std::get<1>(mg_result_buffer).data(), std::get<1>(mg_result_buffer).size()); cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git 
a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu index 1cba7885ce6..c013fd889eb 100644 --- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu +++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu @@ -105,6 +105,8 @@ class Tests_MGPerVPairTransformDstNbrIntersection template void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; auto const comm_rank = handle_->get_comms().get_rank(); @@ -264,7 +266,7 @@ class Tests_MGPerVPairTransformDstNbrIntersection weight_t>> sg_edge_weight{std::nullopt}; - std::tie(sg_graph, sg_edge_weight, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weight, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, @@ -272,6 +274,7 @@ class Tests_MGPerVPairTransformDstNbrIntersection ? 
std::make_optional(mg_edge_weight_view) : std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu index 040e0a6d716..99c5a5ea81b 100644 --- a/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_transform_reduce_dst_key_aggregated_outgoing_e.cu @@ -95,6 +95,8 @@ class Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE template void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; auto const comm_rank = handle_->get_comms().get_rank(); @@ -296,12 +298,13 @@ class Tests_MGPerVTransformReduceDstKeyAggregatedOutgoingE std::optional< cugraph::edge_property_t, weight_t>> sg_edge_weights{std::nullopt}; - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu index 57d77f6c4bd..884fedfe698 100644 --- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu @@ -99,6 +99,8 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE bool store_transposed> void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; 
// 1. create MG graph @@ -270,12 +272,13 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE if (prims_usecase.check_correctness) { cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/prims/mg_reduce_v.cu b/cpp/tests/prims/mg_reduce_v.cu index ebd557da004..5c6a4da6c67 100644 --- a/cpp/tests/prims/mg_reduce_v.cu +++ b/cpp/tests/prims/mg_reduce_v.cu @@ -75,6 +75,8 @@ class Tests_MGReduceV bool store_transposed> void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. create MG graph @@ -163,12 +165,13 @@ class Tests_MGReduceV if (prims_usecase.check_correctness) { cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu b/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu index cb86a96d78b..9fc17b07a07 100644 --- a/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu +++ b/cpp/tests/prims/mg_transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cu @@ -83,6 +83,8 @@ class Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV template void run_current_test(Prims_Usecase const& 
prims_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; auto const comm_rank = handle_->get_comms().get_rank(); @@ -173,12 +175,13 @@ class Tests_MGTransformReduceDstNbrIntersectionOfEEndpointsByV raft::device_span(mg_result_buffer.data(), mg_result_buffer.size())); cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/prims/mg_transform_reduce_e.cu b/cpp/tests/prims/mg_transform_reduce_e.cu index b5dcfaa7aa7..6d7ee50f51e 100644 --- a/cpp/tests/prims/mg_transform_reduce_e.cu +++ b/cpp/tests/prims/mg_transform_reduce_e.cu @@ -79,6 +79,8 @@ class Tests_MGTransformReduceE bool store_transposed> void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. 
create MG graph @@ -159,12 +161,13 @@ class Tests_MGTransformReduceE if (prims_usecase.check_correctness) { cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu b/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu index 830b48acade..d36fe7a5474 100644 --- a/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu +++ b/cpp/tests/prims/mg_transform_reduce_e_by_src_dst_key.cu @@ -80,6 +80,8 @@ class Tests_MGTransformReduceEBySrcDstKey bool store_transposed> void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. 
create MG graph @@ -238,12 +240,13 @@ class Tests_MGTransformReduceEBySrcDstKey cugraph::get_dataframe_buffer_begin(mg_aggregate_by_dst_values)); cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/prims/mg_transform_reduce_v.cu b/cpp/tests/prims/mg_transform_reduce_v.cu index 1e5cb7207b1..ad2c85f5d3c 100644 --- a/cpp/tests/prims/mg_transform_reduce_v.cu +++ b/cpp/tests/prims/mg_transform_reduce_v.cu @@ -83,6 +83,8 @@ class Tests_MGTransformReduceV bool store_transposed> void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. 
create MG graph @@ -169,12 +171,13 @@ class Tests_MGTransformReduceV if (prims_usecase.check_correctness) { cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu index acc89491e56..5f9e5fb54dc 100644 --- a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu +++ b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu @@ -112,7 +112,8 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst typename payload_t> void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) { - using property_t = int32_t; + using edge_type_t = int32_t; + using property_t = int32_t; using key_t = std::conditional_t, vertex_t, thrust::tuple>; @@ -299,12 +300,13 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst } cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/structure/mg_coarsen_graph_test.cpp b/cpp/tests/structure/mg_coarsen_graph_test.cpp index deb4c287183..dfc9800552f 100644 --- a/cpp/tests/structure/mg_coarsen_graph_test.cpp +++ b/cpp/tests/structure/mg_coarsen_graph_test.cpp @@ -247,6 +247,8 @@ class Tests_MGCoarsenGraph void 
run_current_test(CoarsenGraph_Usecase const& coarsen_graph_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. create MG graph @@ -330,12 +332,13 @@ class Tests_MGCoarsenGraph cugraph::edge_property_t, weight_t>> sg_edge_weights{std::nullopt}; - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::optional>{std::nullopt}, false); @@ -344,12 +347,13 @@ class Tests_MGCoarsenGraph cugraph::edge_property_t, weight_t>> sg_coarse_edge_weights{std::nullopt}; - std::tie(sg_coarse_graph, sg_coarse_edge_weights, std::ignore, std::ignore) = + std::tie(sg_coarse_graph, sg_coarse_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_coarse_graph_view, mg_coarse_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::optional>{std::nullopt}, false); diff --git a/cpp/tests/structure/mg_count_self_loops_and_multi_edges_test.cpp b/cpp/tests/structure/mg_count_self_loops_and_multi_edges_test.cpp index 8b78e16c404..e27ca925222 100644 --- a/cpp/tests/structure/mg_count_self_loops_and_multi_edges_test.cpp +++ b/cpp/tests/structure/mg_count_self_loops_and_multi_edges_test.cpp @@ -63,6 +63,8 @@ class Tests_MGCountSelfLoopsAndMultiEdges CountSelfLoopsAndMultiEdges_Usecase const& count_self_loops_and_multi_edges_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. create MG graph @@ -126,12 +128,13 @@ class Tests_MGCountSelfLoopsAndMultiEdges // 3-1. 
aggregate MG results cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp index 92b66d0260c..5951e0f0fb2 100644 --- a/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp +++ b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp @@ -211,12 +211,13 @@ class Tests_MGHasEdgeAndComputeMultiplicity d_mg_edge_multiplicities.size())); cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/structure/mg_induced_subgraph_test.cu b/cpp/tests/structure/mg_induced_subgraph_test.cu index 2958686c945..4b2e198df25 100644 --- a/cpp/tests/structure/mg_induced_subgraph_test.cu +++ b/cpp/tests/structure/mg_induced_subgraph_test.cu @@ -65,6 +65,8 @@ class Tests_MGInducedSubgraph void run_current_test( std::tuple const& param) { + using edge_type_t = int32_t; + auto [induced_subgraph_usecase, input_usecase] = param; HighResTimer hr_timer{}; @@ -224,12 +226,17 @@ class Tests_MGInducedSubgraph true, handle_->get_stream()); - auto [sg_graph, sg_edge_weights, sg_edge_ids, sg_number_map] = + cugraph::graph_t sg_graph(*handle_); + std::optional< + cugraph::edge_property_t, weight_t>> + 
sg_edge_weights{std::nullopt}; + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::optional>{std::nullopt}, false); diff --git a/cpp/tests/structure/mg_symmetrize_test.cpp b/cpp/tests/structure/mg_symmetrize_test.cpp index 4d56a6e2a95..8ed507fb40e 100644 --- a/cpp/tests/structure/mg_symmetrize_test.cpp +++ b/cpp/tests/structure/mg_symmetrize_test.cpp @@ -60,6 +60,8 @@ class Tests_MGSymmetrize void run_current_test(Symmetrize_Usecase const& symmetrize_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. create MG graph @@ -89,12 +91,13 @@ class Tests_MGSymmetrize weight_t>> sg_edge_weights{std::nullopt}; if (symmetrize_usecase.check_correctness) { - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph.view(), mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/structure/mg_transpose_storage_test.cpp b/cpp/tests/structure/mg_transpose_storage_test.cpp index 41c50b396fb..cb85c2922fe 100644 --- a/cpp/tests/structure/mg_transpose_storage_test.cpp +++ b/cpp/tests/structure/mg_transpose_storage_test.cpp @@ -58,6 +58,8 @@ class Tests_MGTransposeStorage void run_current_test(TransposeStorage_Usecase const& transpose_storage_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. 
create MG graph @@ -87,12 +89,13 @@ class Tests_MGTransposeStorage weight_t>> sg_edge_weights{std::nullopt}; if (transpose_storage_usecase.check_correctness) { - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph.view(), mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/structure/mg_transpose_test.cpp b/cpp/tests/structure/mg_transpose_test.cpp index 28f5c000d3d..b0c265e8916 100644 --- a/cpp/tests/structure/mg_transpose_test.cpp +++ b/cpp/tests/structure/mg_transpose_test.cpp @@ -58,6 +58,8 @@ class Tests_MGTranspose void run_current_test(Transpose_Usecase const& transpose_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. create MG graph @@ -87,12 +89,13 @@ class Tests_MGTranspose weight_t>> sg_edge_weights{std::nullopt}; if (transpose_usecase.check_correctness) { - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph.view(), mg_edge_weights ? 
std::make_optional((*mg_edge_weights).view()) : std::nullopt, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/traversal/mg_bfs_test.cpp b/cpp/tests/traversal/mg_bfs_test.cpp index 3cd712798e3..e97eb201b18 100644 --- a/cpp/tests/traversal/mg_bfs_test.cpp +++ b/cpp/tests/traversal/mg_bfs_test.cpp @@ -62,7 +62,8 @@ class Tests_MGBFS : public ::testing::TestWithParam void run_current_test(BFS_Usecase const& bfs_usecase, input_usecase_t const& input_usecase) { - using weight_t = float; + using edge_type_t = int32_t; + using weight_t = float; bool constexpr renumber = true; bool constexpr test_weighted = false; @@ -188,12 +189,13 @@ class Tests_MGBFS : public ::testing::TestWithParam sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/traversal/mg_extract_bfs_paths_test.cu b/cpp/tests/traversal/mg_extract_bfs_paths_test.cu index 1ef5c282c1c..843009b966f 100644 --- a/cpp/tests/traversal/mg_extract_bfs_paths_test.cu +++ b/cpp/tests/traversal/mg_extract_bfs_paths_test.cu @@ -69,9 +69,10 @@ class Tests_MGExtractBFSPaths void run_current_test(ExtractBFSPaths_Usecase const& extract_bfs_paths_usecase, input_usecase_t const& input_usecase) { - constexpr bool renumber = true; + using weight_t = float; + using edge_type_t = int32_t; - using weight_t = float; + constexpr bool renumber = true; HighResTimer hr_timer{}; @@ -237,12 +238,13 @@ class Tests_MGExtractBFSPaths cugraph::test::device_gatherv(*handle_, d_mg_paths.data(), d_mg_paths.size()); cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, 
std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/traversal/mg_k_hop_nbrs_test.cpp b/cpp/tests/traversal/mg_k_hop_nbrs_test.cpp index 4e4b7d38ccd..2a060c8ee45 100644 --- a/cpp/tests/traversal/mg_k_hop_nbrs_test.cpp +++ b/cpp/tests/traversal/mg_k_hop_nbrs_test.cpp @@ -65,7 +65,8 @@ class Tests_MGKHopNbrs void run_current_test(KHopNbrs_Usecase const& k_hop_nbrs_usecase, input_usecase_t const& input_usecase) { - using weight_t = float; + using edge_type_t = int32_t; + using weight_t = float; HighResTimer hr_timer{}; @@ -178,12 +179,13 @@ class Tests_MGKHopNbrs *handle_, raft::device_span(d_mg_nbrs.data(), d_mg_nbrs.size())); cugraph::graph_t sg_graph(*handle_); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore) = + std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, std::optional>{std::nullopt}, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/traversal/mg_sssp_test.cpp b/cpp/tests/traversal/mg_sssp_test.cpp index bc3f5870f6c..c81dccf8649 100644 --- a/cpp/tests/traversal/mg_sssp_test.cpp +++ b/cpp/tests/traversal/mg_sssp_test.cpp @@ -62,6 +62,8 @@ class Tests_MGSSSP : public ::testing::TestWithParam void run_current_test(SSSP_Usecase const& sssp_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; // 1. 
create MG graph @@ -176,12 +178,13 @@ class Tests_MGSSSP : public ::testing::TestWithParam, weight_t>> sg_edge_weights{std::nullopt}; - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + std::optional>{std::nullopt}, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); diff --git a/cpp/tests/utilities/conversion_utilities.hpp b/cpp/tests/utilities/conversion_utilities.hpp index 24a8ecbe4fd..6d840577b4a 100644 --- a/cpp/tests/utilities/conversion_utilities.hpp +++ b/cpp/tests/utilities/conversion_utilities.hpp @@ -215,7 +215,11 @@ graph_to_host_csc( std::optional> renumber_map); // Only the rank 0 GPU holds the valid data -template +template std::tuple< cugraph::graph_t, std::optional< @@ -224,12 +228,16 @@ std::tuple< std::optional< cugraph::edge_property_t, edge_t>>, + std::optional< + cugraph::edge_property_t, + edge_type_t>>, std::optional>> mg_graph_to_sg_graph( raft::handle_t const& handle, cugraph::graph_view_t const& graph_view, std::optional> edge_weight_view, std::optional> edge_id_view, + std::optional> edge_type_view, std::optional> renumber_map, bool renumber); diff --git a/cpp/tests/utilities/conversion_utilities_impl.cuh b/cpp/tests/utilities/conversion_utilities_impl.cuh index 40fe6fe8695..b74035999f1 100644 --- a/cpp/tests/utilities/conversion_utilities_impl.cuh +++ b/cpp/tests/utilities/conversion_utilities_impl.cuh @@ -279,17 +279,24 @@ graph_to_host_csc( true>(handle, graph_view, edge_weight_view, renumber_map); } -template +template std::tuple< cugraph::graph_t, std::optional, weight_t>>, std::optional, edge_t>>, + std::optional< + edge_property_t, edge_type_t>>, std::optional>> mg_graph_to_sg_graph( raft::handle_t const& handle, cugraph::graph_view_t const& graph_view, std::optional> edge_weight_view, 
std::optional> edge_id_view, + std::optional> edge_type_view, std::optional> renumber_map, bool renumber) { @@ -297,14 +304,10 @@ mg_graph_to_sg_graph( rmm::device_uvector d_dst(0, handle.get_stream()); std::optional> d_wgt{std::nullopt}; std::optional> d_edge_id{std::nullopt}; + std::optional> d_edge_type{std::nullopt}; - std::tie(d_src, d_dst, d_wgt, d_edge_id, std::ignore) = cugraph::decompress_to_edgelist( - handle, - graph_view, - edge_weight_view, - edge_id_view, - std::optional>{std::nullopt}, - renumber_map); + std::tie(d_src, d_dst, d_wgt, d_edge_id, d_edge_type) = cugraph::decompress_to_edgelist( + handle, graph_view, edge_weight_view, edge_id_view, edge_type_view, renumber_map); d_src = cugraph::test::device_gatherv( handle, raft::device_span{d_src.data(), d_src.size()}); @@ -316,6 +319,9 @@ mg_graph_to_sg_graph( if (d_edge_id) *d_edge_id = cugraph::test::device_gatherv( handle, raft::device_span{d_edge_id->data(), d_edge_id->size()}); + if (d_edge_type) + *d_edge_type = cugraph::test::device_gatherv( + handle, raft::device_span{d_edge_type->data(), d_edge_type->size()}); rmm::device_uvector vertices(0, handle.get_stream()); if (renumber_map) { vertices = cugraph::test::device_gatherv(handle, *renumber_map); } @@ -325,6 +331,9 @@ mg_graph_to_sg_graph( sg_edge_weights{std::nullopt}; std::optional, edge_t>> sg_edge_ids{std::nullopt}; + std::optional< + edge_property_t, edge_type_t>> + sg_edge_types{std::nullopt}; std::optional> sg_number_map; if (handle.get_comms().get_rank() == 0) { if (!renumber_map) { @@ -333,15 +342,15 @@ mg_graph_to_sg_graph( handle.get_stream(), vertices.data(), vertices.size(), vertex_t{0}); } - std::tie(sg_graph, sg_edge_weights, sg_edge_ids, std::ignore, sg_number_map) = cugraph:: - create_graph_from_edgelist( + std::tie(sg_graph, sg_edge_weights, sg_edge_ids, sg_edge_types, sg_number_map) = cugraph:: + create_graph_from_edgelist( handle, std::make_optional(std::move(vertices)), std::move(d_src), std::move(d_dst), 
std::move(d_wgt), std::move(d_edge_id), - std::nullopt, + std::move(d_edge_type), cugraph::graph_properties_t{graph_view.is_symmetric(), graph_view.is_multigraph()}, renumber); } else { @@ -357,11 +366,16 @@ mg_graph_to_sg_graph( (*d_edge_id).resize(0, handle.get_stream()); (*d_edge_id).shrink_to_fit(handle.get_stream()); } + if (d_edge_type) { + (*d_edge_type).resize(0, handle.get_stream()); + (*d_edge_type).shrink_to_fit(handle.get_stream()); + } } return std::make_tuple(std::move(sg_graph), std::move(sg_edge_weights), std::move(sg_edge_ids), + std::move(sg_edge_types), std::move(sg_number_map)); } diff --git a/cpp/tests/utilities/conversion_utilities_mg.cu b/cpp/tests/utilities/conversion_utilities_mg.cu index 6c5db5b6c57..41fc6669d57 100644 --- a/cpp/tests/utilities/conversion_utilities_mg.cu +++ b/cpp/tests/utilities/conversion_utilities_mg.cu @@ -262,12 +262,14 @@ template std::tuple< cugraph::graph_t, std::optional, float>>, std::optional, int32_t>>, + std::optional, int32_t>>, std::optional>> mg_graph_to_sg_graph( raft::handle_t const& handle, cugraph::graph_view_t const& graph_view, std::optional> edge_weight_view, std::optional> edge_id_view, + std::optional> edge_type_view, std::optional> renumber_map, bool renumber); @@ -275,12 +277,14 @@ template std::tuple< cugraph::graph_t, std::optional, float>>, std::optional, int64_t>>, + std::optional, int32_t>>, std::optional>> mg_graph_to_sg_graph( raft::handle_t const& handle, cugraph::graph_view_t const& graph_view, std::optional> edge_weight_view, std::optional> edge_id_view, + std::optional> edge_type_view, std::optional> renumber_map, bool renumber); @@ -288,12 +292,14 @@ template std::tuple< cugraph::graph_t, std::optional, double>>, std::optional, int32_t>>, + std::optional, int32_t>>, std::optional>> mg_graph_to_sg_graph( raft::handle_t const& handle, cugraph::graph_view_t const& graph_view, std::optional> edge_weight_view, std::optional> edge_id_view, + std::optional> edge_type_view, std::optional> 
renumber_map, bool renumber); @@ -301,12 +307,14 @@ template std::tuple< cugraph::graph_t, std::optional, double>>, std::optional, int64_t>>, + std::optional, int32_t>>, std::optional>> mg_graph_to_sg_graph( raft::handle_t const& handle, cugraph::graph_view_t const& graph_view, std::optional> edge_weight_view, std::optional> edge_id_view, + std::optional> edge_type_view, std::optional> renumber_map, bool renumber); @@ -314,12 +322,14 @@ template std::tuple< cugraph::graph_t, std::optional, float>>, std::optional, int32_t>>, + std::optional, int32_t>>, std::optional>> mg_graph_to_sg_graph( raft::handle_t const& handle, cugraph::graph_view_t const& graph_view, std::optional> edge_weight_view, std::optional> edge_id_view, + std::optional> edge_type_view, std::optional> renumber_map, bool renumber); @@ -327,12 +337,14 @@ template std::tuple< cugraph::graph_t, std::optional, float>>, std::optional, int64_t>>, + std::optional, int32_t>>, std::optional>> mg_graph_to_sg_graph( raft::handle_t const& handle, cugraph::graph_view_t const& graph_view, std::optional> edge_weight_view, std::optional> edge_id_view, + std::optional> edge_type_view, std::optional> renumber_map, bool renumber); @@ -340,12 +352,14 @@ template std::tuple< cugraph::graph_t, std::optional, double>>, std::optional, int32_t>>, + std::optional, int32_t>>, std::optional>> mg_graph_to_sg_graph( raft::handle_t const& handle, cugraph::graph_view_t const& graph_view, std::optional> edge_weight_view, std::optional> edge_id_view, + std::optional> edge_type_view, std::optional> renumber_map, bool renumber); @@ -353,12 +367,14 @@ template std::tuple< cugraph::graph_t, std::optional, double>>, std::optional, int64_t>>, + std::optional, int32_t>>, std::optional>> mg_graph_to_sg_graph( raft::handle_t const& handle, cugraph::graph_view_t const& graph_view, std::optional> edge_weight_view, std::optional> edge_id_view, + std::optional> edge_type_view, std::optional> renumber_map, bool renumber); From 
481ab0482f7b76436ec042a40f72f70390c784d0 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 30 Jan 2025 13:43:31 -0800 Subject: [PATCH 18/21] fix a compiler warning --- cpp/src/community/k_truss_impl.cuh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/community/k_truss_impl.cuh b/cpp/src/community/k_truss_impl.cuh index 25a1cf63f4d..1871da48876 100644 --- a/cpp/src/community/k_truss_impl.cuh +++ b/cpp/src/community/k_truss_impl.cuh @@ -144,14 +144,16 @@ struct extract_low_to_high_degree_edges_from_endpoints_t { } else if (dst_out_degree < src_out_degree) { return cuda::std::optional>{ thrust::make_tuple(dst, src, count[idx])}; - } else { - if ((src_out_degree == dst_out_degree) && (src < dst) /* tie-breaking using vertex ID */) { + } else { // src_out_degree == dst_out_degree + if (src < dst /* tie-breaking using vertex ID */) { return cuda::std::optional>{ thrust::make_tuple(src, dst, count[idx])}; - } else if ((src_out_degree == dst_out_degree) && - (src > dst) /* tie-breaking using vertex ID */) { + } else if (src > dst /* tie-breaking using vertex ID */) { return cuda::std::optional>{ thrust::make_tuple(dst, src, count[idx])}; + } else { // src == dst (self-loop) + assert(false); // should not be reached as we pre-excluded self-loops + return cuda::std::nullopt; } } } else { From 1b397b3a7cfcb8026114a9a088cdfaf7f0e7c73b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 30 Jan 2025 13:47:16 -0800 Subject: [PATCH 19/21] better fix of the compiler warning --- cpp/src/community/k_truss_impl.cuh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/src/community/k_truss_impl.cuh b/cpp/src/community/k_truss_impl.cuh index 1871da48876..f152ef4564b 100644 --- a/cpp/src/community/k_truss_impl.cuh +++ b/cpp/src/community/k_truss_impl.cuh @@ -148,12 +148,9 @@ struct extract_low_to_high_degree_edges_from_endpoints_t { if (src < dst /* tie-breaking using vertex ID */) { return cuda::std::optional>{ 
thrust::make_tuple(src, dst, count[idx])}; - } else if (src > dst /* tie-breaking using vertex ID */) { + } else { return cuda::std::optional>{ thrust::make_tuple(dst, src, count[idx])}; - } else { // src == dst (self-loop) - assert(false); // should not be reached as we pre-excluded self-loops - return cuda::std::nullopt; } } } else { From f88f98422a7cf36e890cb6d9b35c1af885a010d1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 30 Jan 2025 17:37:32 -0800 Subject: [PATCH 20/21] fix build errors --- cpp/src/prims/detail/partition_v_frontier.cuh | 2 +- .../sample_and_compute_local_nbr_indices.cuh | 244 ++++++++++-------- 2 files changed, 133 insertions(+), 113 deletions(-) diff --git a/cpp/src/prims/detail/partition_v_frontier.cuh b/cpp/src/prims/detail/partition_v_frontier.cuh index 5a58e146790..0fbe61c3cc0 100644 --- a/cpp/src/prims/detail/partition_v_frontier.cuh +++ b/cpp/src/prims/detail/partition_v_frontier.cuh @@ -96,7 +96,7 @@ partition_v_frontier_per_value_idx( raft::handle_t const& handle, ValueIterator frontier_value_first, ValueIterator frontier_value_last, - raft::host_span::value_type> + raft::host_span::value_type const> thresholds /* size = num_values_per_key * (# partitions - 1), thresholds[i] marks the end (exclusive) of the (i % num_values_per_key)'th partition value range for the (i / num_values_per_key)'th value of each key */ diff --git a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh index 86f9fd570f5..cac11cd5914 100644 --- a/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/src/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -700,7 +700,7 @@ void sample_nbr_index_with_replacement( input_r_offsets = rmm::device_uvector(num_keys + 1, handle.get_stream()); (*input_r_offsets).set_element_to_zero_async(0, handle.get_stream()); auto k_first = thrust::make_transform_iterator( - std::get<1>(frontier_index_type_pairs).begin(), + 
std::get<1>(*frontier_index_type_pairs).begin(), cuda::proclaim_return_type( [K_offsets] __device__(auto type) { return K_offsets[type + 1] - K_offsets[type]; })); thrust::inclusive_scan( @@ -734,9 +734,10 @@ void sample_nbr_index_with_replacement( K_offsets, K_sum, invalid_idx = cugraph::invalid_edge_id_v] __device__(auto pair) { - auto i = thrust::get<0>(pair); - auto r = thrust::get<1>(pair); - auto idx = thrust::distance( + auto num_edge_types = static_cast(K_offsets.size() - 1); + auto i = thrust::get<0>(pair); + auto r = thrust::get<1>(pair); + auto idx = thrust::distance( input_r_offsets.begin() + 1, thrust::upper_bound(thrust::seq, input_r_offsets.begin() + 1, input_r_offsets.end(), i)); auto frontier_idx = frontier_indices[idx]; @@ -911,13 +912,14 @@ void sample_nbr_index_without_replacement( std::numeric_limits::min() /* to prevent log(0), which is undefined */, bias_t{1.0}, rng_state); - thrust::transform(handle.get_thrust_policy(), - cur_ws.begin(), - cur_ws.end(), - cur_ws.begin(), - cuda::proclaim_return_type([K] __device__(auto r) { - return exp(log(r) / K); // log(r) <= 0, 0.0 <= exp(log(r)/K) <= 1.0 - })); + thrust::transform( + handle.get_thrust_policy(), + cur_ws.begin(), + cur_ws.end(), + cur_ws.begin(), + cuda::proclaim_return_type([K] __device__(auto r) { + return cuda::std::exp(cuda::std::log(r) / K); // log(r) <= 0, 0.0 <= exp(log(r)/K) <= 1.0 + })); while (retry_frontier_indices.size() > 0) { std::cout << "retry_frontier_indices.size()=" << retry_frontier_indices.size() << " K=" << K @@ -954,17 +956,17 @@ void sample_nbr_index_without_replacement( while ((i < degree) && (r_idx < random_numbers_per_key)) { auto r = sample_random_numbers[idx * random_numbers_per_key + r_idx++]; auto inc = - floor(log(r) / - cuda::std::min( - log(cuda::std::max(1.0 - w, eps /* to prevent log(0), which is undefined */)), - -eps) /* to prevent divide by 0 */) + + floor(cuda::std::log(r) / + cuda::std::min(cuda::std::log(cuda::std::max( + 1.0 - w, eps /* to 
prevent log(0), which is undefined */)), + -eps) /* to prevent divide by 0 */) + 1.0; i += (inc < static_cast(degree - i)) ? static_cast(inc) : (degree - i); if (i < degree) { r = sample_random_numbers[idx * random_numbers_per_key + r_idx++]; nbr_indices[frontier_idx * K + cuda::std::min(static_cast(r * K), K - 1)] = i; r = sample_random_numbers[idx * random_numbers_per_key + r_idx++]; - w *= exp(log(r) / K); + w *= cuda::std::exp(cuda::std::log(r) / K); } } if (i < degree) { @@ -1011,7 +1013,7 @@ void sample_nbr_index_without_replacement( rmm::device_uvector sample_size_offsets(num_keys + 1, handle.get_stream()); sample_size_offsets.set_element_to_zero_async(0, handle.get_stream()); auto k_first = thrust::make_transform_iterator( - std::get<1>(frontier_index_type_pairs).begin(), + std::get<1>(*frontier_index_type_pairs).begin(), cuda::proclaim_return_type( [K_offsets] __device__(auto type) { return K_offsets[type + 1] - K_offsets[type]; })); thrust::inclusive_scan( @@ -1030,7 +1032,8 @@ void sample_nbr_index_without_replacement( K_offsets, K_sum, invalid_idx = cugraph::invalid_edge_id_v] __device__(auto i) { - auto idx = thrust::distance( + auto num_edge_types = static_cast(K_offsets.size() - 1); + auto idx = thrust::distance( sample_size_offsets.begin() + 1, thrust::upper_bound( thrust::seq, sample_size_offsets.begin() + 1, sample_size_offsets.end(), i)); @@ -1051,13 +1054,14 @@ void sample_nbr_index_without_replacement( K_offsets, K_sum, invalid_idx = cugraph::invalid_edge_id_v] __device__(auto i) { - auto frontier_idx = i / K_sum; - auto type = static_cast(thrust::distance( + auto num_edge_types = static_cast(K_offsets.size() - 1); + auto frontier_idx = i / K_sum; + auto type = static_cast(thrust::distance( K_offsets.begin() + 1, thrust::upper_bound(thrust::seq, K_offsets.begin() + 1, K_offsets.end(), i % K_sum))); - auto d = frontier_per_type_degrees[frontier_idx * num_edge_types + type]; - auto K = K_offsets[type + 1] - K_offsets[type]; - auto sample_idx = 
static_cast((i % K_sum) - K_offsets[type]); + auto d = frontier_per_type_degrees[frontier_idx * num_edge_types + type]; + auto K = K_offsets[type + 1] - K_offsets[type]; + auto sample_idx = static_cast((i % K_sum) - K_offsets[type]); return sample_idx < d ? static_cast(sample_idx) : invalid_idx; }); } @@ -1132,7 +1136,8 @@ void sample_nbr_index_without_replacement( K_offsets, K_sum, invalid_idx = cugraph::invalid_edge_id_v] __device__(size_t i) { - auto idx = thrust::distance( + auto num_edge_types = static_cast(K_offsets.size() - 1); + auto idx = thrust::distance( input_r_offsets.begin() + 1, thrust::upper_bound(thrust::seq, input_r_offsets.begin() + 1, input_r_offsets.end(), i)); auto type = types ? (*types)[idx] : static_cast(idx % num_edge_types); @@ -1523,9 +1528,9 @@ rmm::device_uvector compute_heterogeneous_uniform_sampling_index_without auto num_edge_types = static_cast(K_offsets.size() - 1); - std::vector thresholds(num_edge_types); - for (size_t i = 0; i < num_edge_types; ++i) { - thresholds[i] = low_partition_degree_range_last + 1; + std::vector thresholds(num_edge_types); + for (edge_type_t i = 0; i < num_edge_types; ++i) { + thresholds[i] = static_cast(low_partition_degree_range_last + 1); } auto [frontier_indices, frontier_edge_types, frontier_partition_offsets] = @@ -1541,7 +1546,7 @@ rmm::device_uvector compute_heterogeneous_uniform_sampling_index_without auto low_partition_size = frontier_partition_offsets[1]; if (low_partition_size > 0) { - sample_nbr_index_without_replacement( + sample_nbr_index_without_replacement( handle, frontier_per_type_degrees, std::make_optional(std::make_tuple( @@ -1900,8 +1905,8 @@ rmm::device_uvector compute_heterogeneous_uniform_sampling_index_without auto frontier_idx = *(segment_frontier_index_first + idx); auto type = *(segment_frontier_type_first + idx); auto sample_idx = static_cast(i - output_count_offsets[idx]); - *(per_type_nbr_indices + frontier_idx * K_sum + K_offsets[type] + sample_idx) = - 
*(tmp_per_type_nbr_indices + idx * high_partition_oversampling_K + sample_idx); + *(per_type_nbr_indices.begin() + frontier_idx * K_sum + K_offsets[type] + sample_idx) = + *(tmp_per_type_nbr_indices.begin() + idx * high_partition_oversampling_K + sample_idx); }); } } @@ -2008,7 +2013,7 @@ void compute_homogeneous_biased_sampling_index_without_replacement( cuda::proclaim_return_type([] __device__(bias_t r, bias_t b) { assert(b > 0.0); // 0 bias neighbors shold be pre-filtered before invoking this function - return cuda::std::min(-log(r) / b, std::numeric_limits::max()); + return cuda::std::min(-cuda::std::log(r) / b, std::numeric_limits::max()); })); } else { thrust::transform( @@ -2020,7 +2025,7 @@ void compute_homogeneous_biased_sampling_index_without_replacement( cuda::proclaim_return_type([] __device__(bias_t r, bias_t b) { assert(b > 0.0); // 0 bias neighbors shold be pre-filtered before invoking this function - return cuda::std::min(-log(r) / b, std::numeric_limits::max()); + return cuda::std::min(-cuda::std::log(r) / b, std::numeric_limits::max()); })); } @@ -2162,7 +2167,7 @@ void compute_homogeneous_biased_sampling_index_without_replacement( return; } -template +template void compute_heterogeneous_biased_sampling_index_without_replacement( raft::handle_t const& handle, std::optional> @@ -2273,7 +2278,7 @@ void compute_heterogeneous_biased_sampling_index_without_replacement( cuda::proclaim_return_type([] __device__(bias_t r, bias_t b) { assert(b > 0.0); // 0 bias neighbors shold be pre-filtered before invoking this function - return cuda::std::min(-log(r) / b, std::numeric_limits::max()); + return cuda::std::min(-cuda::std::log(r) / b, std::numeric_limits::max()); })); } else { thrust::transform( @@ -2285,7 +2290,7 @@ void compute_heterogeneous_biased_sampling_index_without_replacement( cuda::proclaim_return_type([] __device__(bias_t r, bias_t b) { assert(b > 0.0); // 0 bias neighbors shold be pre-filtered before invoking this function - return 
cuda::std::min(-log(r) / b, std::numeric_limits::max()); + return cuda::std::min(-cuda::std::log(r) / b, std::numeric_limits::max()); })); } @@ -2474,9 +2479,9 @@ rmm::device_uvector compute_aggregate_local_frontier_per_type_local_degr [key_idx_to_unique_key_idx = raft::device_span( aggregate_local_frontier_key_idx_to_unique_key_idx.data() + local_frontier_offsets[i], local_frontier_offsets[i + 1] - local_frontier_offsets[i]), - aggregate_local_frontier_unique_key_edge_types = - raft::device_span(aggregate_local_frontier_unique_key_edge_types.data(), - aggregate_local_frontier_unique_key_edge_types.size()), + aggregate_local_frontier_unique_key_edge_types = raft::device_span( + aggregate_local_frontier_unique_key_edge_types.data(), + aggregate_local_frontier_unique_key_edge_types.size()), unique_key_local_degree_offsets = raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data() + local_frontier_unique_key_offsets[i], @@ -2679,14 +2684,14 @@ compute_aggregate_local_frontier_bias_type_pairs( EdgeValueInputWrapper edge_value_input, BiasEdgeOp bias_e_op, EdgeTypeInputWrapper edge_type_input, - raft::host_span local_frontier_displacements, - raft::host_span local_frontier_sizes, + raft::host_span local_frontier_offsets, bool do_expensive_check) { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; using key_t = typename thrust::iterator_traits::value_type; + using edge_type_t = typename EdgeTypeInputWrapper::value_type; using bias_t = typename edge_op_result_type::type; + std::vector local_frontier_sizes(local_frontier_offsets.size() - 1); + std::adjacent_difference( + local_frontier_offsets.begin() + 1, local_frontier_offsets.end(), local_frontier_sizes.begin()); auto [aggregate_local_frontier_bias_type_pairs, aggregate_local_frontier_local_degree_offsets] = transform_v_frontier_e( handle, @@ -2702,16 +2710,17 @@ compute_aggregate_local_frontier_bias_type_pairs( edge_src_value_input, 
edge_dst_value_input, view_concat(edge_value_input, edge_type_input), - [bias_e_op] __device__(auto src, auto dst, auto src_val, auto dst_val, auto e_val) { + cuda::proclaim_return_type>([bias_e_op] __device__(auto src, auto dst, auto src_val, auto dst_val, auto e_val) { return thrust::make_tuple(bias_e_op(src, dst, src_val, dst_val, thrust::get<0>(e_val)), thrust::get<1>(e_val)); - }, + }), #if 1 // FIXME: better update shuffle_values to take host_span - std::vector(local_frontier_displacements.begin(), local_frontier_displacements.end()), - std::vector(local_frontier_sizes.begin(), local_frontier_sizes.end()) -#else - local_frontier_displacements, + std::vector(local_frontier_offsets.begin(), local_frontier_offsets.end() - 1), local_frontier_sizes +#else + raft::host_span(local_frontier_offsets.data(), + local_frontier_offsets.size() - 1), + raft::host_span(local_frontier_sizes.data(), local_frontier_sizes.size()) #endif ); @@ -2746,8 +2755,8 @@ compute_aggregate_local_frontier_bias_type_pairs( return static_cast(i - offsets[idx]); }); - rmm::device_uvector aggregate_local_frontier_local_degrees( - local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); + rmm::device_uvector aggregate_local_frontier_local_degrees(local_frontier_offsets.back(), + handle.get_stream()); thrust::adjacent_difference(handle.get_thrust_policy(), aggregate_local_frontier_local_degree_offsets.begin() + 1, aggregate_local_frontier_local_degree_offsets.end(), @@ -2773,7 +2782,7 @@ compute_aggregate_local_frontier_bias_type_pairs( thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i); auto idx = thrust::distance(offsets.begin() + 1, it); cuda::atomic_ref degree(degrees[idx]); - return degree.fetch_sub(size_t{1}, cuda::std::memory_order_relaxed); + degree.fetch_sub(size_t{1}, cuda::std::memory_order_relaxed); } }); } @@ -2811,27 +2820,22 @@ compute_aggregate_local_frontier_bias_type_pairs( } // return (edge types, segment offsets) pairs for 
each key in the aggregate local frontier -template +template std::tuple, rmm::device_uvector> -compute_aggregate_local_frontier_edge_types( - raft::handle_t const& handle, - GraphViewType const& graph_view, - KeyIterator aggregate_local_frontier_key_first, - EdgeTypeInputWrapper edge_type_input, - raft::host_span local_frontier_displacements, - raft::host_span local_frontier_sizes) +compute_aggregate_local_frontier_edge_types(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyIterator aggregate_local_frontier_key_first, + EdgeTypeInputWrapper edge_type_input, + raft::host_span local_frontier_offsets) { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; using key_t = typename thrust::iterator_traits::value_type; + std::vector local_frontier_sizes(local_frontier_offsets.size() - 1); + std::adjacent_difference( + local_frontier_offsets.begin() + 1, local_frontier_offsets.end(), local_frontier_sizes.begin()); auto [aggregate_local_frontier_types, aggregate_local_frontier_local_degree_offsets] = transform_v_frontier_e( handle, @@ -2842,11 +2846,12 @@ compute_aggregate_local_frontier_edge_types( edge_type_input, [] __device__(auto, auto, auto, auto, auto e_val) { return e_val; }, #if 1 // FIXME: better update shuffle_values to take host_span - std::vector(local_frontier_displacements.begin(), local_frontier_displacements.end()), - std::vector(local_frontier_sizes.begin(), local_frontier_sizes.end()) -#else - local_frontier_displacements, + std::vector(local_frontier_offsets.begin(), local_frontier_offsets.end() - 1), local_frontier_sizes +#else + raft::host_span(local_frontier_offsets.data(), + local_frontier_offsets.size() - 1), + raft::host_span(local_frontier_sizes.data(), local_frontier_sizes.size()), #endif ); @@ -3995,7 +4000,7 @@ homogeneous_biased_sample_without_replacement( std::move(local_nbr_indices), std::move(key_indices), std::move(local_frontier_sample_offsets)); } -template +template 
std::tuple /* local_nbr_indices */, std::optional> /* key_indices */, std::vector /* local_frontier_sample_offsets */> @@ -4045,7 +4050,8 @@ heterogeneous_biased_sample_without_replacement( local_frontier_unique_key_offsets[i] * num_edge_types, (local_frontier_unique_key_offsets[i + 1] - local_frontier_unique_key_offsets[i]) * num_edge_types + - 1)] __device__(size_t i) { + 1), + num_edge_types] __device__(size_t i) { auto key_idx = i / num_edge_types; auto edge_type = static_cast(i % num_edge_types); auto unique_key_idx = key_idx_to_unique_key_idx[key_idx]; @@ -4070,7 +4076,7 @@ heterogeneous_biased_sample_without_replacement( } std::vector thresholds(num_edge_types * 2); - for (size_t i = 0; i < num_edge_types; ++i) { + for (edge_type_t i = 0; i < num_edge_types; ++i) { thresholds[i * 2] = static_cast(Ks[i] + 1); thresholds[i * 2 + 1] = static_cast(minor_comm_size * Ks[i] * 2); } @@ -4115,6 +4121,7 @@ heterogeneous_biased_sample_without_replacement( K_offsets = raft::device_span(d_K_offsets.data(), d_K_offsets.size()), K_sum, invalid_idx = cugraph::invalid_edge_id_v] __device__(auto pair) { + auto num_edge_types = static_cast(K_offsets.size() - 1); auto idx = thrust::get<0>(pair); auto type = thrust::get<1>(pair); auto per_type_degree = frontier_per_type_degrees[idx * num_edge_types + type]; @@ -4238,9 +4245,10 @@ heterogeneous_biased_sample_without_replacement( aggregate_mid_local_frontier_per_type_local_degree_offsets.data() + mid_local_frontier_offsets[i], mid_local_frontier_sizes[i]), - aggregate_mid_local_frontier_biases = raft::device_span( - aggregate_mid_local_frontier_biases.data(), - aggregate_mid_local_frontier_biases.size())] __device__(size_t i) { + aggregate_mid_local_frontier_biases = + raft::device_span(aggregate_mid_local_frontier_biases.data(), + aggregate_mid_local_frontier_biases.size()), + num_edge_types] __device__(size_t i) { auto unique_key_idx = key_idx_to_unique_key_idx[mid_local_frontier_indices[i]]; auto type = 
mid_local_frontier_types[i]; thrust::copy( @@ -4268,14 +4276,6 @@ heterogeneous_biased_sample_without_replacement( mid_local_frontier_offsets.size(), handle.get_stream()); rmm::device_uvector d_lasts(num_local_edge_partitions, handle.get_stream()); - auto map_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - cuda::proclaim_return_type( - [mid_local_frontier_offsets = raft::device_span( - d_mid_local_frontier_offsets.data(), - d_mid_local_frontier_offsets.size())] __device__(size_t i) { - return mid_local_frontier_offsets[i + 1]; - })); thrust::gather(handle.get_thrust_policy(), d_mid_local_frontier_offsets.begin() + 1, d_mid_local_frontier_offsets.end(), @@ -4391,7 +4391,7 @@ heterogeneous_biased_sample_without_replacement( return idx * K_sum + K_offsets[type]; })); - compute_heterogeneous_biased_sampling_index_without_replacement( + compute_heterogeneous_biased_sampling_index_without_replacement( handle, std::nullopt, raft::device_span( @@ -4471,9 +4471,9 @@ heterogeneous_biased_sample_without_replacement( high_local_frontier_offsets[i + 1], aggregate_local_frontier_key_idx_to_unique_key_idx.begin() + local_frontier_offsets[i], unique_key_indices_for_key_indices.begin()); - compute_heterogeneous_biased_sampling_index_without_replacement( + compute_heterogeneous_biased_sampling_index_without_replacement( handle, std::make_optional>( unique_key_indices_for_key_indices.data(), unique_key_indices_for_key_indices.size()), @@ -4544,7 +4544,7 @@ heterogeneous_biased_sample_without_replacement( handle.get_stream()); { auto K_first = thrust::make_transform_iterator( - frontier_edge_types + frontier_partition_offsets[2], + frontier_edge_types.begin() + frontier_partition_offsets[2], cuda::proclaim_return_type( [K_offsets = raft::device_span( d_K_offsets.data(), d_K_offsets.size())] __device__(auto type) { @@ -4570,7 +4570,7 @@ heterogeneous_biased_sample_without_replacement( minor_comm_size] __device__(size_t i) { auto idx = 
thrust::distance( offsets.begin() + 1, - thrust::upper_bound(offsets.begin() + 1, offsets.end(), i / minor_comm_size)); + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i / minor_comm_size)); auto K = offsets[idx + 1] - offsets[idx]; auto minor_comm_rank = (i - offsets[idx] * minor_comm_size) / K; return minor_comm_rank * offsets[offsets.size() - 1] + offsets[idx] + @@ -4596,7 +4596,7 @@ heterogeneous_biased_sample_without_replacement( auto minor_comm_rank = static_cast(i / offsets[offsets.size() - 1]); auto idx = thrust::distance( offsets.begin() + 1, - thrust::upper_bound( + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i % offsets[offsets.size() - 1])); auto frontier_idx = high_frontier_indices[idx]; return frontier_partitioned_per_type_local_degree_displacements[frontier_idx * @@ -4760,7 +4760,7 @@ heterogeneous_biased_sample_without_replacement( return idx * K_sum + (K_offsets[type + 1] - K_offsets[type]); })); - compute_heterogeneous_biased_sampling_index_without_replacement( + compute_heterogeneous_biased_sampling_index_without_replacement( handle, std::make_optional>( unique_key_indices_for_key_indices.data(), unique_key_indices_for_key_indices.size()), @@ -4771,9 +4771,8 @@ heterogeneous_biased_sample_without_replacement( aggregate_local_frontier_unique_key_per_type_local_degree_offsets.size()), raft::device_span(aggregate_local_frontier_unique_key_biases.data(), aggregate_local_frontier_unique_key_biases.size()), - std::make_optional>( - mid_and_high_frontier_output_start_displacements.data(), - mid_and_high_frontier_output_start_displacements.size()), + raft::device_span(mid_and_high_frontier_output_start_displacements.data(), + mid_and_high_frontier_output_start_displacements.size()), raft::device_span(per_type_local_nbr_indices.data(), per_type_local_nbr_indices.size()), std::nullopt, @@ -4798,12 +4797,14 @@ heterogeneous_biased_sample_without_replacement( 
aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data(), aggregate_local_frontier_unique_key_per_type_local_degree_offsets.size()), local_frontier_unique_key_offsets, - edge_types ? std::make_optional(std::make_tuple( - raft::device_span(edge_types.data(), edge_types.size()), - raft::device_span((*key_indices).data(), (*key_indices).size()))) - : std::nullopt, + edge_types + ? std::make_optional(std::make_tuple( + raft::device_span((*edge_types).data(), (*edge_types).size()), + raft::device_span((*key_indices).data(), (*key_indices).size()))) + : std::nullopt, std::move(per_type_local_nbr_indices), - local_frontier_sample_offsets, + raft::host_span(local_frontier_sample_offsets.data(), + local_frontier_sample_offsets.size()), raft::device_span(d_K_offsets.data(), d_K_offsets.size()), K_sum); @@ -5179,14 +5180,14 @@ heterogeneous_uniform_sample_and_compute_local_nbr_indices( rmm::device_uvector segment_sorted_types(h_nbr_offsets[i + 1] - h_nbr_offsets[i], handle.get_stream()); - rmm::device_uvector nbr_indices(h_nbr_offsets[i + 1] - h_nbr_offsets[i], + rmm::device_uvector nbr_indices(h_nbr_offsets[i + 1] - h_nbr_offsets[i], handle.get_stream()); thrust::tabulate( handle.get_thrust_policy(), nbr_indices.begin(), nbr_indices.end(), [offsets = raft::device_span( - aggregate_local_frontier_unique_key_local_degree_offsets + h_key_offsets[i], + aggregate_local_frontier_unique_key_local_degree_offsets.data() + h_key_offsets[i], (h_key_offsets[i + 1] - h_key_offsets[i]) + 1), start_offset = h_nbr_offsets[i]] __device__(size_t i) { auto idx = thrust::distance( @@ -5195,7 +5196,7 @@ heterogeneous_uniform_sample_and_compute_local_nbr_indices( return static_cast((start_offset + i) - offsets[idx]); }); raft::device_span segment_sorted_nbr_indices( - aggregate_local_frontier_unique_key_org_indices.dta() + h_nbr_offsets[i], + aggregate_local_frontier_unique_key_org_indices.data() + h_nbr_offsets[i], h_nbr_offsets[i + 1] - h_nbr_offsets[i]); auto offset_first = 
thrust::make_transform_iterator( @@ -5249,12 +5250,13 @@ heterogeneous_uniform_sample_and_compute_local_nbr_indices( raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data(), aggregate_local_frontier_unique_key_local_degree_offsets.size()), - local_frontier_unique_key_offsets, + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), num_edge_types); rmm::device_uvector frontier_per_type_degrees(0, handle.get_stream()); std::optional> - frontier_partitioned_per_type_local_degree_displacements(0, handle.get_stream()); + frontier_partitioned_per_type_local_degree_displacements{std::nullopt}; if (minor_comm_size > 1) { std::tie(frontier_per_type_degrees, frontier_partitioned_per_type_local_degree_displacements) = @@ -5297,7 +5299,7 @@ heterogeneous_uniform_sample_and_compute_local_nbr_indices( std::optional> edge_types{std::nullopt}; if (minor_comm_size > 1) { std::tie(per_type_local_nbr_indices, edge_types, key_indices, local_frontier_sample_offsets) = - shuffle_and_compute_per_type_local_nbr_values( + shuffle_and_compute_per_type_local_nbr_values( handle, std::move(per_type_nbr_indices), raft::device_span( @@ -5330,14 +5332,16 @@ heterogeneous_uniform_sample_and_compute_local_nbr_indices( handle, raft::device_span(aggregate_local_frontier_unique_key_indices.data(), aggregate_local_frontier_unique_key_indices.size()), - local_frontier_unique_key_offsets, + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), raft::device_span( aggregate_local_frontier_unique_key_edge_types.data(), aggregate_local_frontier_unique_key_edge_types.size()), raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data(), aggregate_local_frontier_unique_key_local_degree_offsets.size()), - local_frontier_unique_key_offsets, + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), num_edge_types); 
aggregate_local_frontier_unique_key_per_type_local_degree_offsets.set_element_to_zero_async( @@ -5359,14 +5363,16 @@ heterogeneous_uniform_sample_and_compute_local_nbr_indices( raft::device_span( aggregate_local_frontier_unique_key_per_type_local_degree_offsets.data(), aggregate_local_frontier_unique_key_per_type_local_degree_offsets.size()), - local_frontier_unique_key_offsets, + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), edge_types ? std::make_optional(std::make_tuple( - raft::device_span(edge_types.data(), edge_types.size()), + raft::device_span((*edge_types).data(), (*edge_types).size()), raft::device_span((*key_indices).data(), (*key_indices).size()))) : std::nullopt, std::move(per_type_local_nbr_indices), - local_frontier_sample_offsets, + raft::host_span(local_frontier_sample_offsets.data(), + local_frontier_sample_offsets.size()), raft::device_span(d_K_offsets.data(), d_K_offsets.size()), K_sum); } @@ -5383,7 +5389,12 @@ heterogeneous_uniform_sample_and_compute_local_nbr_indices( raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data(), aggregate_local_frontier_unique_key_local_degree_offsets.size()), + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), std::move(local_nbr_indices), + key_indices ? 
std::make_optional>((*key_indices).data(), + (*key_indices).size()) + : std::nullopt, raft::host_span(local_frontier_sample_offsets.data(), local_frontier_sample_offsets.size()), K_sum); @@ -5609,7 +5620,7 @@ template , std::optional>, std::vector> -hetrogeneous_biased_sample_and_compute_local_nbr_indices( +heterogeneous_biased_sample_and_compute_local_nbr_indices( raft::handle_t const& handle, GraphViewType const& graph_view, KeyIterator aggregate_local_frontier_key_first, @@ -5783,13 +5794,15 @@ hetrogeneous_biased_sample_and_compute_local_nbr_indices( handle, raft::device_span(aggregate_local_frontier_unique_key_indices.data(), aggregate_local_frontier_unique_key_indices.size()), - local_frontier_unique_key_offsets, + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), raft::device_span(aggregate_local_frontier_unique_key_edge_types.data(), aggregate_local_frontier_unique_key_edge_types.size()), raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data(), aggregate_local_frontier_unique_key_local_degree_offsets.size()), - local_frontier_unique_key_offsets, + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), num_edge_types); rmm::device_uvector aggregate_local_frontier_unique_key_per_type_local_degree_offsets( @@ -5824,6 +5837,7 @@ hetrogeneous_biased_sample_and_compute_local_nbr_indices( } else { std::tie(local_nbr_indices, key_indices, local_frontier_sample_offsets) = heterogeneous_biased_sample_without_replacement( handle, @@ -5857,8 +5871,14 @@ hetrogeneous_biased_sample_and_compute_local_nbr_indices( raft::device_span( aggregate_local_frontier_unique_key_local_degree_offsets.data(), aggregate_local_frontier_unique_key_local_degree_offsets.size()), + raft::host_span(local_frontier_unique_key_offsets.data(), + local_frontier_unique_key_offsets.size()), std::move(local_nbr_indices), - local_frontier_sample_offsets, + key_indices ? 
std::make_optional>((*key_indices).data(), + (*key_indices).size()) + : std::nullopt, + raft::host_span(local_frontier_sample_offsets.data(), + local_frontier_sample_offsets.size()), K_sum); // 5. convert neighbor indices in the neighbor list considering edge mask to neighbor indices in From 88880221c3a9336515b6f1a98b37acff53ca490f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 30 Jan 2025 19:20:51 -0800 Subject: [PATCH 21/21] add heterogeneous sampling primitive tests --- ...er_v_random_select_transform_outgoing_e.cu | 525 +++++++++++++----- 1 file changed, 380 insertions(+), 145 deletions(-) diff --git a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu index 30a53cd15a4..2b6372802c3 100644 --- a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu @@ -59,11 +59,12 @@ struct e_bias_op_t { } }; -template +template struct e_op_t { using result_t = decltype(cugraph::thrust_tuple_cat(thrust::tuple{}, cugraph::to_thrust_tuple(property_t{}), - cugraph::to_thrust_tuple(property_t{}))); + cugraph::to_thrust_tuple(property_t{}), + cugraph::to_thrust_tuple(edge_type_t{}))); __device__ result_t operator()(vertex_t src, vertex_t dst, @@ -78,14 +79,15 @@ struct e_op_t { thrust::get<0>(src_prop), thrust::get<1>(src_prop), thrust::get<0>(dst_prop), - thrust::get<1>(dst_prop)); + thrust::get<1>(dst_prop), + edge_type_t{0}); } else { - return thrust::make_tuple(src, dst, src_prop, dst_prop); + return thrust::make_tuple(src, dst, src_prop, dst_prop, edge_type_t{0}); } } - __device__ result_t - operator()(vertex_t src, vertex_t dst, property_t src_prop, property_t dst_prop, weight_t w) const + __device__ result_t operator()( + vertex_t src, vertex_t dst, property_t src_prop, property_t dst_prop, edge_type_t type) const { if constexpr (cugraph::is_thrust_tuple_of_arithmetic::value) { 
static_assert(thrust::tuple_size::value == size_t{2}); @@ -94,16 +96,17 @@ struct e_op_t { thrust::get<0>(src_prop), thrust::get<1>(src_prop), thrust::get<0>(dst_prop), - thrust::get<1>(dst_prop)); + thrust::get<1>(dst_prop), + type); } else { - return thrust::make_tuple(src, dst, src_prop, dst_prop); + return thrust::make_tuple(src, dst, src_prop, dst_prop, type); } } }; struct Prims_Usecase { size_t num_seeds{0}; - size_t K{0}; + std::vector Ks{}; bool with_replacement{false}; bool use_invalid_value{false}; bool use_weight_as_bias{false}; @@ -129,6 +132,8 @@ class Tests_MGPerVRandomSelectTransformOutgoingE template void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase) { + using edge_type_t = int32_t; + HighResTimer hr_timer{}; auto const comm_rank = handle_->get_comms().get_rank(); @@ -164,6 +169,15 @@ class Tests_MGPerVRandomSelectTransformOutgoingE mg_graph_view.attach_edge_mask((*edge_mask).view()); } + std::optional> mg_edge_types{ + std::nullopt}; + if (prims_usecase.Ks.size() > 1) { + mg_edge_types = cugraph::test::generate::edge_property( + *handle_, mg_graph_view, static_cast(prims_usecase.Ks.size())); + } + auto mg_edge_type_view = + mg_edge_types ? std::make_optional((*mg_edge_types).view()) : std::nullopt; + if (mg_edge_weight_view && prims_usecase.inject_zero_bias) { cugraph::transform_e( *handle_, @@ -219,7 +233,8 @@ class Tests_MGPerVRandomSelectTransformOutgoingE using result_t = decltype(cugraph::thrust_tuple_cat(thrust::tuple{}, cugraph::to_thrust_tuple(property_t{}), - cugraph::to_thrust_tuple(property_t{}))); + cugraph::to_thrust_tuple(property_t{}), + cugraph::to_thrust_tuple(edge_type_t{}))); std::optional invalid_value{std::nullopt}; if (prims_usecase.use_invalid_value) { @@ -235,34 +250,67 @@ class Tests_MGPerVRandomSelectTransformOutgoingE } auto [mg_sample_offsets, mg_sample_e_op_results] = - prims_usecase.use_weight_as_bias ? 
cugraph::per_v_random_select_transform_outgoing_e( - *handle_, - mg_graph_view, - mg_vertex_frontier.bucket(bucket_idx_cur), - cugraph::edge_src_dummy_property_t{}.view(), - cugraph::edge_dst_dummy_property_t{}.view(), - *mg_edge_weight_view, - e_bias_op_t{}, - mg_src_prop.view(), - mg_dst_prop.view(), - *mg_edge_weight_view, - e_op_t{}, - rng_state, - prims_usecase.K, - prims_usecase.with_replacement, - invalid_value) - : cugraph::per_v_random_select_transform_outgoing_e( - *handle_, - mg_graph_view, - mg_vertex_frontier.bucket(bucket_idx_cur), - mg_src_prop.view(), - mg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - rng_state, - prims_usecase.K, - prims_usecase.with_replacement, - invalid_value); + (prims_usecase.Ks.size() == 1) + ? (prims_usecase.use_weight_as_bias + ? cugraph::per_v_random_select_transform_outgoing_e( + *handle_, + mg_graph_view, + mg_vertex_frontier.bucket(bucket_idx_cur), + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + *mg_edge_weight_view, + e_bias_op_t{}, + mg_src_prop.view(), + mg_dst_prop.view(), + cugraph::edge_dummy_property_t{}.view(), + e_op_t{}, + rng_state, + prims_usecase.Ks[0], + prims_usecase.with_replacement, + invalid_value) + : cugraph::per_v_random_select_transform_outgoing_e( + *handle_, + mg_graph_view, + mg_vertex_frontier.bucket(bucket_idx_cur), + mg_src_prop.view(), + mg_dst_prop.view(), + cugraph::edge_dummy_property_t{}.view(), + e_op_t{}, + rng_state, + prims_usecase.Ks[0], + prims_usecase.with_replacement, + invalid_value)) + : (prims_usecase.use_weight_as_bias + ? 
cugraph::per_v_random_select_transform_outgoing_e( + *handle_, + mg_graph_view, + mg_vertex_frontier.bucket(bucket_idx_cur), + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + *mg_edge_weight_view, + e_bias_op_t{}, + mg_src_prop.view(), + mg_dst_prop.view(), + *mg_edge_type_view, + e_op_t{}, + *mg_edge_type_view, + rng_state, + raft::host_span(prims_usecase.Ks.data(), prims_usecase.Ks.size()), + prims_usecase.with_replacement, + invalid_value) + : cugraph::per_v_random_select_transform_outgoing_e( + *handle_, + mg_graph_view, + mg_vertex_frontier.bucket(bucket_idx_cur), + mg_src_prop.view(), + mg_dst_prop.view(), + *mg_edge_type_view, + e_op_t{}, + *mg_edge_type_view, + rng_state, + raft::host_span(prims_usecase.Ks.data(), prims_usecase.Ks.size()), + prims_usecase.with_replacement, + invalid_value)); if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement @@ -334,27 +382,35 @@ class Tests_MGPerVRandomSelectTransformOutgoingE cugraph::test::device_gatherv(*handle_, std::get<3>(mg_sample_e_op_results).data(), std::get<3>(mg_sample_e_op_results).size()); + std::get<4>(mg_aggregate_sample_e_op_results) = + cugraph::test::device_gatherv(*handle_, + std::get<4>(mg_sample_e_op_results).data(), + std::get<4>(mg_sample_e_op_results).size()); if constexpr (cugraph::is_thrust_tuple_of_arithmetic::value) { - std::get<4>(mg_aggregate_sample_e_op_results) = - cugraph::test::device_gatherv(*handle_, - std::get<4>(mg_sample_e_op_results).data(), - std::get<4>(mg_sample_e_op_results).size()); std::get<5>(mg_aggregate_sample_e_op_results) = cugraph::test::device_gatherv(*handle_, std::get<5>(mg_sample_e_op_results).data(), std::get<5>(mg_sample_e_op_results).size()); + std::get<6>(mg_aggregate_sample_e_op_results) = + cugraph::test::device_gatherv(*handle_, + std::get<6>(mg_sample_e_op_results).data(), + std::get<6>(mg_sample_e_op_results).size()); } cugraph::graph_t 
sg_graph(*handle_); std::optional< cugraph::edge_property_t, weight_t>> sg_edge_weights{std::nullopt}; - std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore) = + std::optional, + edge_type_t>> + sg_edge_types{std::nullopt}; + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( *handle_, mg_graph_view, mg_edge_weight_view, std::optional>{std::nullopt}, + mg_edge_type_view, std::make_optional>((*mg_renumber_map).data(), (*mg_renumber_map).size()), false); @@ -374,6 +430,8 @@ class Tests_MGPerVRandomSelectTransformOutgoingE auto sg_graph_view = sg_graph.view(); auto sg_edge_weight_view = sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt; + auto sg_edge_type_view = + sg_edge_types ? std::make_optional((*sg_edge_types).view()) : std::nullopt; rmm::device_uvector sg_offsets(sg_graph_view.number_of_vertices() + vertex_t{1}, handle_->get_stream()); @@ -399,6 +457,23 @@ class Tests_MGPerVRandomSelectTransformOutgoingE handle_->get_thrust_policy(), firsts[0], firsts[0] + counts[0], (*sg_biases).begin()); } + std::optional> sg_edge_types{std::nullopt}; + if (sg_edge_type_view) { + auto firsts = (*sg_edge_type_view).value_firsts(); + auto counts = (*sg_edge_type_view).edge_counts(); + assert(firsts.size() == 1); + assert(counts.size() == 1); + sg_edge_types = rmm::device_uvector(counts[0], handle_->get_stream()); + thrust::copy(handle_->get_thrust_policy(), + firsts[0], + firsts[0] + counts[0], + (*sg_edge_types).begin()); + } + + rmm::device_uvector Ks(prims_usecase.Ks.size(), handle_->get_stream()); + raft::update_device( + Ks.data(), prims_usecase.Ks.data(), prims_usecase.Ks.size(), handle_->get_stream()); + auto K_sum = std::reduce(Ks.begin(), Ks.end()); auto num_invalids = static_cast(thrust::count_if( handle_->get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), @@ -413,19 +488,24 @@ class Tests_MGPerVRandomSelectTransformOutgoingE sg_indices = 
sg_indices.begin(), sg_biases = sg_biases ? cuda::std::make_optional((*sg_biases).begin()) : cuda::std::nullopt, - K = prims_usecase.K, + sg_edge_types = sg_edge_types ? cuda::std::make_optional((*sg_edge_types).begin()) + : cuda::std::nullopt, + Ks = raft::device_span(Ks.data(), Ks.size()), + K_sum, with_replacement = prims_usecase.with_replacement, invalid_value = invalid_value ? cuda::std::make_optional(*invalid_value) : cuda::std::nullopt, property_transform = cugraph::test::detail::vertex_property_transform{ hash_bin_count}] __device__(size_t i) { + auto num_edge_types = static_cast(Ks.size()); + auto v = *(frontier_vertex_first + i); // check sample_offsets - auto offset_first = sample_offsets ? *(*sample_offsets + i) : K * i; - auto offset_last = sample_offsets ? *(*sample_offsets + (i + 1)) : K * (i + 1); + auto offset_first = sample_offsets ? *(*sample_offsets + i) : K_sum * i; + auto offset_last = sample_offsets ? *(*sample_offsets + (i + 1)) : K_sum * (i + 1); if (!sample_offsets) { size_t num_valids{0}; for (size_t j = offset_first; j < offset_last; ++j) { @@ -439,7 +519,7 @@ class Tests_MGPerVRandomSelectTransformOutgoingE } offset_last = offset_first + num_valids; } - auto count = offset_last - offset_first; + auto sample_count = offset_last - offset_first; auto out_degree = *(sg_offsets + v + 1) - *(sg_offsets + v); if (sg_biases) { @@ -449,11 +529,70 @@ class Tests_MGPerVRandomSelectTransformOutgoingE [] __device__(auto bias) { return bias > 0.0; }); } if (with_replacement) { - if ((out_degree > 0 && count != K) || (out_degree == 0 && count != 0)) { + if ((out_degree > 0 && sample_count != K_sum) || + (out_degree == 0 && sample_count != 0)) { return true; } } else { - if (count != std::min(static_cast(out_degree), K)) { return true; } + if (sample_count != std::min(static_cast(out_degree), K_sum)) { return true; } + } + + // check per-type sample counts + + if (num_edge_types > 1) { + edge_type_t constexpr array_size = 8; + cuda::std::array 
per_type_out_degrees{}; + cuda::std::array per_type_sample_counts{}; + auto num_chunks = (num_edge_types + array_size - edge_type_t{1}) / array_size; + for (edge_type_t c = 0; c < num_chunks; ++c) { + thrust::fill( + thrust::seq, per_type_out_degrees.begin(), per_type_out_degrees.end(), edge_t{0}); + thrust::fill(thrust::seq, + per_type_sample_counts.begin(), + per_type_sample_counts.end(), + edge_t{0}); + + assert(sg_edge_types); + for (auto offset = *(sg_offsets + v); offset < *(sg_offsets + v + 1); ++offset) { + auto type = *((*sg_edge_types) + offset); + if (type >= c * array_size && + type < cuda::std::min((c + 1) * array_size, num_edge_types)) { + if (!sg_biases || (*(*sg_biases + offset) > 0.0)) { + ++per_type_out_degrees[type - c * array_size]; + } + } + } + for (auto offset = offset_first; offset < offset_last; ++offset) { + auto e_op_result = *(sample_e_op_result_first + offset); + auto type = thrust::get::value - 1>(e_op_result); + if (type >= c * array_size && + type < cuda::std::min((c + 1) * array_size, num_edge_types)) { + ++per_type_sample_counts[type - c * array_size]; + } + } + if (with_replacement) { + for (edge_type_t t = 0; + t < cuda::std::min(array_size, num_edge_types - c * array_size); + ++t) { + if ((per_type_out_degrees[t] > 0 && + per_type_sample_counts[t] != + static_cast(Ks[c * array_size + t])) || + (per_type_out_degrees[t] == 0 && per_type_sample_counts[t] != 0)) { + return true; + } + } + } else { + for (edge_type_t t = 0; + t < cuda::std::min(array_size, num_edge_types - c * array_size); + ++t) { + if (per_type_sample_counts[t] != + cuda::std::min(per_type_out_degrees[t], + static_cast(Ks[c * array_size + t]))) { + return true; + } + } + } + } } // check sample_e_op_results @@ -603,60 +742,108 @@ INSTANTIATE_TEST_SUITE_P( file_test, Tests_MGPerVRandomSelectTransformOutgoingE_File, ::testing::Combine( - ::testing::Values(Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false, false}, - Prims_Usecase{size_t{1000}, 
size_t{4}, false, false, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true, true}), + ::testing::Values(Prims_Usecase{size_t{1000}, {4}, false, false, false, false, false}, + Prims_Usecase{size_t{1000}, {4}, false, false, false, false, true}, + Prims_Usecase{size_t{1000}, {4}, false, false, true, false, false}, + Prims_Usecase{size_t{1000}, {4}, false, false, true, true, false}, + 
Prims_Usecase{size_t{1000}, {4}, false, false, true, false, true}, + Prims_Usecase{size_t{1000}, {4}, false, false, true, true, true}, + Prims_Usecase{size_t{1000}, {4}, false, true, false, false, false}, + Prims_Usecase{size_t{1000}, {4}, false, true, false, false, true}, + Prims_Usecase{size_t{1000}, {4}, false, true, true, false, false}, + Prims_Usecase{size_t{1000}, {4}, false, true, true, true, false}, + Prims_Usecase{size_t{1000}, {4}, false, true, true, false, true}, + Prims_Usecase{size_t{1000}, {4}, false, true, true, true, true}, + Prims_Usecase{size_t{1000}, {4}, true, false, false, false, false}, + Prims_Usecase{size_t{1000}, {4}, true, false, false, false, true}, + Prims_Usecase{size_t{1000}, {4}, true, false, true, false, false}, + Prims_Usecase{size_t{1000}, {4}, true, false, true, true, false}, + Prims_Usecase{size_t{1000}, {4}, true, false, true, false, true}, + Prims_Usecase{size_t{1000}, {4}, true, false, true, true, true}, + Prims_Usecase{size_t{1000}, {4}, true, true, false, false, false}, + Prims_Usecase{size_t{1000}, {4}, true, true, false, false, true}, + Prims_Usecase{size_t{1000}, {4}, true, true, true, false, false}, + Prims_Usecase{size_t{1000}, {4}, true, true, true, true, false}, + Prims_Usecase{size_t{1000}, {4}, true, true, true, false, true}, + Prims_Usecase{size_t{1000}, {4}, true, true, true, true, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, false, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, false, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, true, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, true, true, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, true, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, true, true, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, false, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, false, false, true}, + Prims_Usecase{size_t{1000}, {2, 
2, 0}, false, true, true, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, true, true, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, true, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, true, true, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, false, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, false, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, true, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, true, true, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, true, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, true, true, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, false, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, false, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, true, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, true, true, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, true, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, true, true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); INSTANTIATE_TEST_SUITE_P( file_large_test, Tests_MGPerVRandomSelectTransformOutgoingE_File, ::testing::Combine( - ::testing::Values(Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false, true}, - Prims_Usecase{size_t{1000}, 
size_t{4}, false, true, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true, true}), + ::testing::Values(Prims_Usecase{size_t{1000}, {4}, false, false, false, false, false}, + Prims_Usecase{size_t{1000}, {4}, false, false, false, false, true}, + Prims_Usecase{size_t{1000}, {4}, false, false, true, false, false}, + Prims_Usecase{size_t{1000}, {4}, false, false, true, true, false}, + Prims_Usecase{size_t{1000}, {4}, false, false, true, false, true}, + Prims_Usecase{size_t{1000}, {4}, false, false, true, true, true}, + Prims_Usecase{size_t{1000}, {4}, false, true, false, false, false}, + Prims_Usecase{size_t{1000}, {4}, false, true, false, false, true}, + Prims_Usecase{size_t{1000}, {4}, false, true, true, false, false}, + Prims_Usecase{size_t{1000}, {4}, false, true, true, true, false}, + Prims_Usecase{size_t{1000}, {4}, false, true, true, false, true}, + Prims_Usecase{size_t{1000}, {4}, false, true, true, true, 
true}, + Prims_Usecase{size_t{1000}, {4}, true, false, false, false, false}, + Prims_Usecase{size_t{1000}, {4}, true, false, false, false, true}, + Prims_Usecase{size_t{1000}, {4}, true, false, true, false, false}, + Prims_Usecase{size_t{1000}, {4}, true, false, true, true, false}, + Prims_Usecase{size_t{1000}, {4}, true, false, true, false, true}, + Prims_Usecase{size_t{1000}, {4}, true, false, true, true, true}, + Prims_Usecase{size_t{1000}, {4}, true, true, false, false, false}, + Prims_Usecase{size_t{1000}, {4}, true, true, false, false, true}, + Prims_Usecase{size_t{1000}, {4}, true, true, true, false, false}, + Prims_Usecase{size_t{1000}, {4}, true, true, true, true, false}, + Prims_Usecase{size_t{1000}, {4}, true, true, true, false, true}, + Prims_Usecase{size_t{1000}, {4}, true, true, true, true, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, false, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, false, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, true, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, true, true, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, true, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, true, true, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, false, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, false, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, true, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, true, true, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, true, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, true, true, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, false, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, false, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, true, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, 
false, true, true, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, true, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, true, true, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, false, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, false, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, true, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, true, true, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, true, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, true, true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); @@ -665,30 +852,54 @@ INSTANTIATE_TEST_SUITE_P( rmat_small_test, Tests_MGPerVRandomSelectTransformOutgoingE_Rmat, ::testing::Combine( - ::testing::Values(Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, false, true, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, false, true, true, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, 
false, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, false, true, true, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, false, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true, false}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, false, true}, - Prims_Usecase{size_t{1000}, size_t{4}, true, true, true, true, true}), + ::testing::Values(Prims_Usecase{size_t{1000}, {4}, false, false, false, false, false}, + Prims_Usecase{size_t{1000}, {4}, false, false, false, false, true}, + Prims_Usecase{size_t{1000}, {4}, false, false, true, false, false}, + Prims_Usecase{size_t{1000}, {4}, false, false, true, true, false}, + Prims_Usecase{size_t{1000}, {4}, false, false, true, false, true}, + Prims_Usecase{size_t{1000}, {4}, false, false, true, true, true}, + Prims_Usecase{size_t{1000}, {4}, false, true, false, false, false}, + Prims_Usecase{size_t{1000}, {4}, false, true, false, false, true}, + Prims_Usecase{size_t{1000}, {4}, false, true, true, false, false}, + Prims_Usecase{size_t{1000}, {4}, false, true, true, true, false}, + Prims_Usecase{size_t{1000}, {4}, false, true, true, false, true}, + Prims_Usecase{size_t{1000}, {4}, false, true, true, true, true}, + Prims_Usecase{size_t{1000}, {4}, true, false, false, false, false}, + Prims_Usecase{size_t{1000}, {4}, true, false, false, false, true}, + Prims_Usecase{size_t{1000}, {4}, true, false, true, false, false}, + Prims_Usecase{size_t{1000}, {4}, true, false, true, true, false}, + Prims_Usecase{size_t{1000}, {4}, true, false, true, false, true}, + Prims_Usecase{size_t{1000}, {4}, true, 
false, true, true, true}, + Prims_Usecase{size_t{1000}, {4}, true, true, false, false, false}, + Prims_Usecase{size_t{1000}, {4}, true, true, false, false, true}, + Prims_Usecase{size_t{1000}, {4}, true, true, true, false, false}, + Prims_Usecase{size_t{1000}, {4}, true, true, true, true, false}, + Prims_Usecase{size_t{1000}, {4}, true, true, true, false, true}, + Prims_Usecase{size_t{1000}, {4}, true, true, true, true, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, false, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, false, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, true, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, true, true, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, true, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, false, true, true, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, false, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, false, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, true, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, true, true, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, true, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, false, true, true, true, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, false, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, false, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, true, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, true, true, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, true, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, false, true, true, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, false, false, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, false, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, true, false, 
false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, true, true, false}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, true, false, true}, + Prims_Usecase{size_t{1000}, {2, 2, 0}, true, true, true, true, true}), ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); INSTANTIATE_TEST_SUITE_P( @@ -700,30 +911,54 @@ INSTANTIATE_TEST_SUITE_P( Tests_MGPerVRandomSelectTransformOutgoingE_Rmat, ::testing::Combine( ::testing::Values( - Prims_Usecase{size_t{10000000}, size_t{25}, false, false, false, false, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, false, false, false, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, false, true, false, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, false, true, true, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, false, true, false, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, false, true, true, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, true, false, false, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, true, false, false, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, true, true, false, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, true, true, true, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, true, true, false, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, false, true, true, true, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, false, false, false, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, false, false, false, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, false, true, false, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, false, true, true, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, false, true, false, true, false}, - Prims_Usecase{size_t{10000000}, 
size_t{25}, true, false, true, true, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, true, false, false, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, true, false, false, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, true, true, false, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, true, true, true, false, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, true, true, false, true, false}, - Prims_Usecase{size_t{10000000}, size_t{25}, true, true, true, true, true, false}), + Prims_Usecase{size_t{10000000}, {25}, false, false, false, false, false, false}, + Prims_Usecase{size_t{10000000}, {25}, false, false, false, false, true, false}, + Prims_Usecase{size_t{10000000}, {25}, false, false, true, false, false, false}, + Prims_Usecase{size_t{10000000}, {25}, false, false, true, true, false, false}, + Prims_Usecase{size_t{10000000}, {25}, false, false, true, false, true, false}, + Prims_Usecase{size_t{10000000}, {25}, false, false, true, true, true, false}, + Prims_Usecase{size_t{10000000}, {25}, false, true, false, false, false, false}, + Prims_Usecase{size_t{10000000}, {25}, false, true, false, false, true, false}, + Prims_Usecase{size_t{10000000}, {25}, false, true, true, false, false, false}, + Prims_Usecase{size_t{10000000}, {25}, false, true, true, true, false, false}, + Prims_Usecase{size_t{10000000}, {25}, false, true, true, false, true, false}, + Prims_Usecase{size_t{10000000}, {25}, false, true, true, true, true, false}, + Prims_Usecase{size_t{10000000}, {25}, true, false, false, false, false, false}, + Prims_Usecase{size_t{10000000}, {25}, true, false, false, false, true, false}, + Prims_Usecase{size_t{10000000}, {25}, true, false, true, false, false, false}, + Prims_Usecase{size_t{10000000}, {25}, true, false, true, true, false, false}, + Prims_Usecase{size_t{10000000}, {25}, true, false, true, false, true, false}, + Prims_Usecase{size_t{10000000}, {25}, true, false, true, 
true, true, false}, + Prims_Usecase{size_t{10000000}, {25}, true, true, false, false, false, false}, + Prims_Usecase{size_t{10000000}, {25}, true, true, false, false, true, false}, + Prims_Usecase{size_t{10000000}, {25}, true, true, true, false, false, false}, + Prims_Usecase{size_t{10000000}, {25}, true, true, true, true, false, false}, + Prims_Usecase{size_t{10000000}, {25}, true, true, true, false, true, false}, + Prims_Usecase{size_t{10000000}, {25}, true, true, true, true, true, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, false, false, false, false, false, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, false, false, false, false, true, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, false, false, true, false, false, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, false, false, true, true, false, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, false, false, true, false, true, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, false, false, true, true, true, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, false, true, false, false, false, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, false, true, false, false, true, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, false, true, true, false, false, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, false, true, true, true, false, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, false, true, true, false, true, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, false, true, true, true, true, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, true, false, false, false, false, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, true, false, false, false, true, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, true, false, true, false, false, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, true, false, true, true, false, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, true, false, true, false, true, false}, + 
Prims_Usecase{size_t{10000000}, {10, 0, 15}, true, false, true, true, true, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, true, true, false, false, false, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, true, true, false, false, true, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, true, true, true, false, false, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, true, true, true, true, false, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, true, true, true, false, true, false}, + Prims_Usecase{size_t{10000000}, {10, 0, 15}, true, true, true, true, true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN()