diff --git a/cpp/include/cugraph/dendrogram.hpp b/cpp/include/cugraph/dendrogram.hpp index aa0802e80b3..2640944dc09 100644 --- a/cpp/include/cugraph/dendrogram.hpp +++ b/cpp/include/cugraph/dendrogram.hpp @@ -27,10 +27,11 @@ class Dendrogram { public: void add_level(vertex_t first_index, vertex_t num_verts, - cudaStream_t stream, + rmm::cuda_stream_view stream_view, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { - level_ptr_.push_back(std::make_unique>(num_verts, stream, mr)); + level_ptr_.push_back( + std::make_unique>(num_verts, stream_view, mr)); level_first_index_.push_back(first_index); } diff --git a/cpp/include/cugraph/utilities/collect_comm.cuh b/cpp/include/cugraph/utilities/collect_comm.cuh index ddc5621e929..76eff8afc71 100644 --- a/cpp/include/cugraph/utilities/collect_comm.cuh +++ b/cpp/include/cugraph/utilities/collect_comm.cuh @@ -50,7 +50,7 @@ collect_values_for_keys(raft::comms::comms_t const &comm, VertexIterator1 collect_key_first, VertexIterator1 collect_key_last, KeyToGPUIdOp key_to_gpu_id_op, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { using vertex_t = typename std::iterator_traits::value_type; static_assert( @@ -66,7 +66,7 @@ collect_values_for_keys(raft::comms::comms_t const &comm, // 1. build a cuco::static_map object for the map k, v pairs. auto poly_alloc = rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); - auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(poly_alloc, cudaStream_t{nullptr}); + auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(poly_alloc, stream_view); auto kv_map_ptr = std::make_unique< cuco::static_map>( // cuco::static_map requires at least one empty slot @@ -84,37 +84,37 @@ collect_values_for_keys(raft::comms::comms_t const &comm, // 2. 
collect values for the unique keys in [collect_key_first, collect_key_last) rmm::device_uvector unique_keys(thrust::distance(collect_key_first, collect_key_last), - stream); + stream_view); thrust::copy( - rmm::exec_policy(stream)->on(stream), collect_key_first, collect_key_last, unique_keys.begin()); - thrust::sort(rmm::exec_policy(stream)->on(stream), unique_keys.begin(), unique_keys.end()); + rmm::exec_policy(stream_view), collect_key_first, collect_key_last, unique_keys.begin()); + thrust::sort(rmm::exec_policy(stream_view), unique_keys.begin(), unique_keys.end()); unique_keys.resize( thrust::distance( unique_keys.begin(), - thrust::unique(rmm::exec_policy(stream)->on(stream), unique_keys.begin(), unique_keys.end())), - stream); + thrust::unique(rmm::exec_policy(stream_view), unique_keys.begin(), unique_keys.end())), + stream_view); - rmm::device_uvector values_for_unique_keys(0, stream); + rmm::device_uvector values_for_unique_keys(0, stream_view); { - rmm::device_uvector rx_unique_keys(0, stream); + rmm::device_uvector rx_unique_keys(0, stream_view); std::vector rx_value_counts{}; std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values( comm, unique_keys.begin(), unique_keys.end(), [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, - stream); + stream_view); - rmm::device_uvector values_for_rx_unique_keys(rx_unique_keys.size(), stream); + rmm::device_uvector values_for_rx_unique_keys(rx_unique_keys.size(), stream_view); - CUDA_TRY(cudaStreamSynchronize(stream)); // cuco::static_map currently does not take stream + stream_view.synchronize(); // cuco::static_map currently does not take stream kv_map_ptr->find( rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin()); - rmm::device_uvector rx_values_for_unique_keys(0, stream); + rmm::device_uvector rx_values_for_unique_keys(0, stream_view); std::tie(rx_values_for_unique_keys, std::ignore) = - shuffle_values(comm, 
values_for_rx_unique_keys.begin(), rx_value_counts, stream); + shuffle_values(comm, values_for_rx_unique_keys.begin(), rx_value_counts, stream_view); values_for_unique_keys = std::move(rx_values_for_unique_keys); } @@ -122,7 +122,7 @@ collect_values_for_keys(raft::comms::comms_t const &comm, // 3. re-build a cuco::static_map object for the k, v pairs in unique_keys, // values_for_unique_keys. - CUDA_TRY(cudaStreamSynchronize(stream)); // cuco::static_map currently does not take stream + stream_view.synchronize(); // cuco::static_map currently does not take stream kv_map_ptr.reset(); @@ -143,7 +143,7 @@ collect_values_for_keys(raft::comms::comms_t const &comm, // 4. find values for [collect_key_first, collect_key_last) auto value_buffer = allocate_dataframe_buffer( - thrust::distance(collect_key_first, collect_key_last), stream); + thrust::distance(collect_key_first, collect_key_last), stream_view); kv_map_ptr->find( collect_key_first, collect_key_last, get_dataframe_buffer_begin(value_buffer)); @@ -165,7 +165,7 @@ collect_values_for_unique_keys(raft::comms::comms_t const &comm, VertexIterator1 collect_unique_key_first, VertexIterator1 collect_unique_key_last, KeyToGPUIdOp key_to_gpu_id_op, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { using vertex_t = typename std::iterator_traits::value_type; static_assert( @@ -181,7 +181,7 @@ collect_values_for_unique_keys(raft::comms::comms_t const &comm, // 1. build a cuco::static_map object for the map k, v pairs. auto poly_alloc = rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); - auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(poly_alloc, cudaStream_t{nullptr}); + auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(poly_alloc, stream_view); auto kv_map_ptr = std::make_unique< cuco::static_map>( // cuco::static_map requires at least one empty slot @@ -199,33 +199,33 @@ collect_values_for_unique_keys(raft::comms::comms_t const &comm, // 2. 
collect values for the unique keys in [collect_unique_key_first, collect_unique_key_last) rmm::device_uvector unique_keys( - thrust::distance(collect_unique_key_first, collect_unique_key_last), stream); - thrust::copy(rmm::exec_policy(stream)->on(stream), + thrust::distance(collect_unique_key_first, collect_unique_key_last), stream_view); + thrust::copy(rmm::exec_policy(stream_view), collect_unique_key_first, collect_unique_key_last, unique_keys.begin()); - rmm::device_uvector values_for_unique_keys(0, stream); + rmm::device_uvector values_for_unique_keys(0, stream_view); { - rmm::device_uvector rx_unique_keys(0, stream); + rmm::device_uvector rx_unique_keys(0, stream_view); std::vector rx_value_counts{}; std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values( comm, unique_keys.begin(), unique_keys.end(), [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, - stream); + stream_view); - rmm::device_uvector values_for_rx_unique_keys(rx_unique_keys.size(), stream); + rmm::device_uvector values_for_rx_unique_keys(rx_unique_keys.size(), stream_view); - CUDA_TRY(cudaStreamSynchronize(stream)); // cuco::static_map currently does not take stream + stream_view.synchronize(); // cuco::static_map currently does not take stream kv_map_ptr->find( rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin()); - rmm::device_uvector rx_values_for_unique_keys(0, stream); + rmm::device_uvector rx_values_for_unique_keys(0, stream_view); std::tie(rx_values_for_unique_keys, std::ignore) = - shuffle_values(comm, values_for_rx_unique_keys.begin(), rx_value_counts, stream); + shuffle_values(comm, values_for_rx_unique_keys.begin(), rx_value_counts, stream_view); values_for_unique_keys = std::move(rx_values_for_unique_keys); } @@ -233,7 +233,7 @@ collect_values_for_unique_keys(raft::comms::comms_t const &comm, // 3. re-build a cuco::static_map object for the k, v pairs in unique_keys, // values_for_unique_keys. 
- CUDA_TRY(cudaStreamSynchronize(stream)); // cuco::static_map currently does not take stream + stream_view.synchronize(); // cuco::static_map currently does not take stream kv_map_ptr.reset(); @@ -254,7 +254,7 @@ collect_values_for_unique_keys(raft::comms::comms_t const &comm, // 4. find values for [collect_unique_key_first, collect_unique_key_last) auto value_buffer = allocate_dataframe_buffer( - thrust::distance(collect_unique_key_first, collect_unique_key_last), stream); + thrust::distance(collect_unique_key_first, collect_unique_key_last), stream_view); kv_map_ptr->find(collect_unique_key_first, collect_unique_key_last, get_dataframe_buffer_begin(value_buffer)); diff --git a/cpp/include/cugraph/utilities/dataframe_buffer.cuh b/cpp/include/cugraph/utilities/dataframe_buffer.cuh index beaf4cabe00..d730a3afcff 100644 --- a/cpp/include/cugraph/utilities/dataframe_buffer.cuh +++ b/cpp/include/cugraph/utilities/dataframe_buffer.cuh @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -31,49 +32,50 @@ namespace experimental { namespace detail { template -auto allocate_dataframe_buffer_tuple_element_impl(size_t buffer_size, cudaStream_t stream) +auto allocate_dataframe_buffer_tuple_element_impl(size_t buffer_size, + rmm::cuda_stream_view stream_view) { using element_t = typename thrust::tuple_element::type; - return rmm::device_uvector(buffer_size, stream); + return rmm::device_uvector(buffer_size, stream_view); } template auto allocate_dataframe_buffer_tuple_impl(std::index_sequence, size_t buffer_size, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { return std::make_tuple( - allocate_dataframe_buffer_tuple_element_impl(buffer_size, stream)...); + allocate_dataframe_buffer_tuple_element_impl(buffer_size, stream_view)...); } template struct resize_dataframe_buffer_tuple_iterator_element_impl { - void run(BufferType& buffer, size_t new_buffer_size, cudaStream_t stream) + void run(BufferType& buffer, size_t new_buffer_size, 
rmm::cuda_stream_view stream_view) { - std::get(buffer).resize(new_buffer_size, stream); + std::get(buffer).resize(new_buffer_size, stream_view); resize_dataframe_buffer_tuple_iterator_element_impl().run( - buffer, new_buffer_size, stream); + buffer, new_buffer_size, stream_view); } }; template struct resize_dataframe_buffer_tuple_iterator_element_impl { - void run(BufferType& buffer, size_t new_buffer_size, cudaStream_t stream) {} + void run(BufferType& buffer, size_t new_buffer_size, rmm::cuda_stream_view stream_view) {} }; template struct shrink_to_fit_dataframe_buffer_tuple_iterator_element_impl { - void run(BufferType& buffer, cudaStream_t stream) + void run(BufferType& buffer, rmm::cuda_stream_view stream_view) { - std::get(buffer).shrink_to_fit(stream); + std::get(buffer).shrink_to_fit(stream_view); shrink_to_fit_dataframe_buffer_tuple_iterator_element_impl() - .run(buffer, stream); + .run(buffer, stream_view); } }; template struct shrink_to_fit_dataframe_buffer_tuple_iterator_element_impl { - void run(BufferType& buffer, cudaStream_t stream) {} + void run(BufferType& buffer, rmm::cuda_stream_view stream_view) {} }; template @@ -108,57 +110,61 @@ auto get_dataframe_buffer_end_tuple_impl(std::index_sequence, BufferType& } // namespace detail template ::value>* = nullptr> -auto allocate_dataframe_buffer(size_t buffer_size, cudaStream_t stream) +auto allocate_dataframe_buffer(size_t buffer_size, rmm::cuda_stream_view stream_view) { - return rmm::device_uvector(buffer_size, stream); + return rmm::device_uvector(buffer_size, stream_view); } template ::value>* = nullptr> -auto allocate_dataframe_buffer(size_t buffer_size, cudaStream_t stream) +auto allocate_dataframe_buffer(size_t buffer_size, rmm::cuda_stream_view stream_view) { size_t constexpr tuple_size = thrust::tuple_size::value; return detail::allocate_dataframe_buffer_tuple_impl( - std::make_index_sequence(), buffer_size, stream); + std::make_index_sequence(), buffer_size, stream_view); } template 
::value>* = nullptr> -void resize_dataframe_buffer(BufferType& buffer, size_t new_buffer_size, cudaStream_t stream) +void resize_dataframe_buffer(BufferType& buffer, + size_t new_buffer_size, + rmm::cuda_stream_view stream_view) { - buffer.resize(new_buffer_size, stream); + buffer.resize(new_buffer_size, stream_view); } template ::value>* = nullptr> -void resize_dataframe_buffer(BufferType& buffer, size_t new_buffer_size, cudaStream_t stream) +void resize_dataframe_buffer(BufferType& buffer, + size_t new_buffer_size, + rmm::cuda_stream_view stream_view) { size_t constexpr tuple_size = thrust::tuple_size::value; detail:: resize_dataframe_buffer_tuple_iterator_element_impl() - .run(buffer, new_buffer_size, stream); + .run(buffer, new_buffer_size, stream_view); } template ::value>* = nullptr> -void shrink_to_fit_dataframe_buffer(BufferType& buffer, cudaStream_t stream) +void shrink_to_fit_dataframe_buffer(BufferType& buffer, rmm::cuda_stream_view stream_view) { - buffer.shrink_to_fit(stream); + buffer.shrink_to_fit(stream_view); } template ::value>* = nullptr> -void shrink_to_fit_dataframe_buffer(BufferType& buffer, cudaStream_t stream) +void shrink_to_fit_dataframe_buffer(BufferType& buffer, rmm::cuda_stream_view stream_view) { size_t constexpr tuple_size = thrust::tuple_size::value; detail::shrink_to_fit_dataframe_buffer_tuple_iterator_element_impl() - .run(buffer, stream); + .run(buffer, stream_view); } template #include +#include #include #include @@ -175,7 +176,7 @@ device_sendrecv_impl(raft::comms::comms_t const& comm, OutputIterator output_first, size_t rx_count, int src, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { // no-op } @@ -191,7 +192,7 @@ device_sendrecv_impl(raft::comms::comms_t const& comm, OutputIterator output_first, size_t rx_count, int src, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { using value_type = typename std::iterator_traits::value_type; static_assert( @@ -202,7 +203,7 @@ 
device_sendrecv_impl(raft::comms::comms_t const& comm, iter_to_raw_ptr(output_first), rx_count, src, - stream); + stream_view.value()); } template @@ -214,7 +215,7 @@ struct device_sendrecv_tuple_iterator_element_impl { OutputIterator output_first, size_t rx_count, int src, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { using output_value_t = typename thrust:: tuple_element::value_type>::type; @@ -228,9 +229,9 @@ struct device_sendrecv_tuple_iterator_element_impl { tuple_element_output_first, rx_count, src, - stream); + stream_view.value()); device_sendrecv_tuple_iterator_element_impl().run( - comm, input_first, tx_count, dst, output_first, rx_count, src, stream); + comm, input_first, tx_count, dst, output_first, rx_count, src, stream_view); } }; @@ -243,7 +244,7 @@ struct device_sendrecv_tuple_iterator_element_impl const& rx_counts, std::vector const& rx_offsets, std::vector const& rx_src_ranks, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { // no-op } @@ -277,7 +278,7 @@ device_multicast_sendrecv_impl(raft::comms::comms_t const& comm, std::vector const& rx_counts, std::vector const& rx_offsets, std::vector const& rx_src_ranks, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { using value_type = typename std::iterator_traits::value_type; static_assert( @@ -290,7 +291,7 @@ device_multicast_sendrecv_impl(raft::comms::comms_t const& comm, rx_counts, rx_offsets, rx_src_ranks, - stream); + stream_view.value()); } template @@ -304,7 +305,7 @@ struct device_multicast_sendrecv_tuple_iterator_element_impl { std::vector const& rx_counts, std::vector const& rx_offsets, std::vector const& rx_src_ranks, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { using output_value_t = typename thrust:: tuple_element::value_type>::type; @@ -320,7 +321,7 @@ struct device_multicast_sendrecv_tuple_iterator_element_impl { rx_counts, rx_offsets, rx_src_ranks, - stream); + stream_view); 
device_multicast_sendrecv_tuple_iterator_element_impl() .run(comm, input_first, @@ -331,7 +332,7 @@ struct device_multicast_sendrecv_tuple_iterator_element_impl { rx_counts, rx_offsets, rx_src_ranks, - stream); + stream_view); } }; @@ -346,7 +347,7 @@ struct device_multicast_sendrecv_tuple_iterator_element_impl const& rx_counts, std::vector const& rx_offsets, std::vector const& rx_src_ranks, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { } }; @@ -358,7 +359,7 @@ device_bcast_impl(raft::comms::comms_t const& comm, OutputIterator output_first, size_t count, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { // no-op } @@ -372,14 +373,14 @@ device_bcast_impl(raft::comms::comms_t const& comm, OutputIterator output_first, size_t count, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert(std::is_same::value_type, typename std::iterator_traits::value_type>::value); if (comm.get_rank() == root) { - comm.bcast(iter_to_raw_ptr(input_first), count, root, stream); + comm.bcast(iter_to_raw_ptr(input_first), count, root, stream_view.value()); } else { - comm.bcast(iter_to_raw_ptr(output_first), count, root, stream); + comm.bcast(iter_to_raw_ptr(output_first), count, root, stream_view.value()); } } @@ -390,16 +391,16 @@ struct device_bcast_tuple_iterator_element_impl { OutputIterator output_first, size_t count, int root, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { device_bcast_impl(comm, thrust::get(input_first.get_iterator_tuple()), thrust::get(output_first.get_iterator_tuple()), count, root, - stream); + stream_view); device_bcast_tuple_iterator_element_impl().run( - comm, input_first, output_first, count, root, stream); + comm, input_first, output_first, count, root, stream_view); } }; @@ -410,7 +411,7 @@ struct device_bcast_tuple_iterator_element_impl::value_type, typename std::iterator_traits::value_type>::value); - comm.allreduce(iter_to_raw_ptr(input_first), 
iter_to_raw_ptr(output_first), count, op, stream); + comm.allreduce( + iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, op, stream_view.value()); } template @@ -450,16 +452,16 @@ struct device_allreduce_tuple_iterator_element_impl { OutputIterator output_first, size_t count, raft::comms::op_t op, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { device_allreduce_impl(comm, thrust::get(input_first.get_iterator_tuple()), thrust::get(output_first.get_iterator_tuple()), count, op, - stream); + stream_view); device_allreduce_tuple_iterator_element_impl().run( - comm, input_first, output_first, count, op, stream); + comm, input_first, output_first, count, op, stream_view); } }; @@ -470,7 +472,7 @@ struct device_allreduce_tuple_iterator_element_impl::value_type, typename std::iterator_traits::value_type>::value); - comm.reduce(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, op, root, stream); + comm.reduce(iter_to_raw_ptr(input_first), + iter_to_raw_ptr(output_first), + count, + op, + root, + stream_view.value()); } template @@ -513,7 +520,7 @@ struct device_reduce_tuple_iterator_element_impl { size_t count, raft::comms::op_t op, int root, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { device_reduce_impl(comm, thrust::get(input_first.get_iterator_tuple()), @@ -521,9 +528,9 @@ struct device_reduce_tuple_iterator_element_impl { count, op, root, - stream); + stream_view); device_reduce_tuple_iterator_element_impl().run( - comm, input_first, output_first, count, op, root, stream); + comm, input_first, output_first, count, op, root, stream_view); } }; @@ -535,7 +542,7 @@ struct device_reduce_tuple_iterator_element_impl const& recvcounts, std::vector const& displacements, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { // no-op } @@ -561,7 +568,7 @@ device_allgatherv_impl(raft::comms::comms_t const& comm, OutputIterator output_first, std::vector const& recvcounts, std::vector 
const& displacements, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert(std::is_same::value_type, typename std::iterator_traits::value_type>::value); @@ -569,7 +576,7 @@ device_allgatherv_impl(raft::comms::comms_t const& comm, iter_to_raw_ptr(output_first), recvcounts.data(), displacements.data(), - stream); + stream_view.value()); } template @@ -579,16 +586,16 @@ struct device_allgatherv_tuple_iterator_element_impl { OutputIterator output_first, std::vector const& recvcounts, std::vector const& displacements, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { device_allgatherv_impl(comm, thrust::get(input_first.get_iterator_tuple()), thrust::get(output_first.get_iterator_tuple()), recvcounts, displacements, - stream); + stream_view); device_allgatherv_tuple_iterator_element_impl().run( - comm, input_first, output_first, recvcounts, displacements, stream); + comm, input_first, output_first, recvcounts, displacements, stream_view); } }; @@ -599,7 +606,7 @@ struct device_allgatherv_tuple_iterator_element_impl const& recvcounts, std::vector const& displacements, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { } }; @@ -613,7 +620,7 @@ device_gatherv_impl(raft::comms::comms_t const& comm, std::vector const& recvcounts, std::vector const& displacements, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { // no-op } @@ -629,7 +636,7 @@ device_gatherv_impl(raft::comms::comms_t const& comm, std::vector const& recvcounts, std::vector const& displacements, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert(std::is_same::value_type, typename std::iterator_traits::value_type>::value); @@ -639,7 +646,7 @@ device_gatherv_impl(raft::comms::comms_t const& comm, recvcounts.data(), displacements.data(), root, - stream); + stream_view.value()); } template @@ -651,7 +658,7 @@ struct device_gatherv_tuple_iterator_element_impl { std::vector const& recvcounts, 
std::vector const& displacements, int root, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { device_gatherv_impl(comm, thrust::get(input_first.get_iterator_tuple()), @@ -660,9 +667,9 @@ struct device_gatherv_tuple_iterator_element_impl { recvcounts, displacements, root, - stream); + stream_view); device_gatherv_tuple_iterator_element_impl().run( - comm, input_first, output_first, sendcount, recvcounts, displacements, root, stream); + comm, input_first, output_first, sendcount, recvcounts, displacements, root, stream_view); } }; @@ -675,7 +682,7 @@ struct device_gatherv_tuple_iterator_element_impl const& recvcounts, std::vector const& displacements, int root, - cudaStream_t stream) const + rmm::cuda_stream_view stream_view) const { } }; @@ -772,10 +779,10 @@ device_sendrecv(raft::comms::comms_t const& comm, OutputIterator output_first, size_t rx_count, int src, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { detail::device_sendrecv_impl( - comm, input_first, tx_count, dst, output_first, rx_count, src, stream); + comm, input_first, tx_count, dst, output_first, rx_count, src, stream_view); } template @@ -790,7 +797,7 @@ device_sendrecv(raft::comms::comms_t const& comm, OutputIterator output_first, size_t rx_count, int src, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert( thrust::tuple_size::value_type>::value == @@ -806,7 +813,7 @@ device_sendrecv(raft::comms::comms_t const& comm, OutputIterator, size_t{0}, tuple_size>() - .run(comm, input_first, tx_count, dst, output_first, rx_count, src, stream); + .run(comm, input_first, tx_count, dst, output_first, rx_count, src, stream_view); } template @@ -822,7 +829,7 @@ device_multicast_sendrecv(raft::comms::comms_t const& comm, std::vector const& rx_counts, std::vector const& rx_offsets, std::vector const& rx_src_ranks, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { detail::device_multicast_sendrecv_impl(comm, input_first, @@ -833,7 +840,7 @@ 
device_multicast_sendrecv(raft::comms::comms_t const& comm, rx_counts, rx_offsets, rx_src_ranks, - stream); + stream_view); } template @@ -850,7 +857,7 @@ device_multicast_sendrecv(raft::comms::comms_t const& comm, std::vector const& rx_counts, std::vector const& rx_offsets, std::vector const& rx_src_ranks, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert( thrust::tuple_size::value_type>::value == @@ -875,7 +882,7 @@ device_multicast_sendrecv(raft::comms::comms_t const& comm, rx_counts, rx_offsets, rx_src_ranks, - stream); + stream_view); } template @@ -887,9 +894,9 @@ device_bcast(raft::comms::comms_t const& comm, OutputIterator output_first, size_t count, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { - detail::device_bcast_impl(comm, input_first, output_first, count, root, stream); + detail::device_bcast_impl(comm, input_first, output_first, count, root, stream_view); } template @@ -902,7 +909,7 @@ device_bcast(raft::comms::comms_t const& comm, OutputIterator output_first, size_t count, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert( thrust::tuple_size::value_type>::value == @@ -913,7 +920,7 @@ device_bcast(raft::comms::comms_t const& comm, detail:: device_bcast_tuple_iterator_element_impl() - .run(comm, input_first, output_first, count, root, stream); + .run(comm, input_first, output_first, count, root, stream_view); } template @@ -925,9 +932,9 @@ device_allreduce(raft::comms::comms_t const& comm, OutputIterator output_first, size_t count, raft::comms::op_t op, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { - detail::device_allreduce_impl(comm, input_first, output_first, count, op, stream); + detail::device_allreduce_impl(comm, input_first, output_first, count, op, stream_view); } template @@ -940,7 +947,7 @@ device_allreduce(raft::comms::comms_t const& comm, OutputIterator output_first, size_t count, raft::comms::op_t op, - cudaStream_t stream) + 
rmm::cuda_stream_view stream_view) { static_assert( thrust::tuple_size::value_type>::value == @@ -953,7 +960,7 @@ device_allreduce(raft::comms::comms_t const& comm, OutputIterator, size_t{0}, tuple_size>() - .run(comm, input_first, output_first, count, op, stream); + .run(comm, input_first, output_first, count, op, stream_view); } template @@ -966,9 +973,9 @@ device_reduce(raft::comms::comms_t const& comm, size_t count, raft::comms::op_t op, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { - detail::device_reduce_impl(comm, input_first, output_first, count, op, root, stream); + detail::device_reduce_impl(comm, input_first, output_first, count, op, root, stream_view); } template @@ -982,7 +989,7 @@ device_reduce(raft::comms::comms_t const& comm, size_t count, raft::comms::op_t op, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert( thrust::tuple_size::value_type>::value == @@ -995,7 +1002,7 @@ device_reduce(raft::comms::comms_t const& comm, OutputIterator, size_t{0}, tuple_size>() - .run(comm, input_first, output_first, count, op, root, stream); + .run(comm, input_first, output_first, count, op, root, stream_view); } template @@ -1007,10 +1014,10 @@ device_allgatherv(raft::comms::comms_t const& comm, OutputIterator output_first, std::vector const& recvcounts, std::vector const& displacements, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { detail::device_allgatherv_impl( - comm, input_first, output_first, recvcounts, displacements, stream); + comm, input_first, output_first, recvcounts, displacements, stream_view); } template @@ -1023,7 +1030,7 @@ device_allgatherv(raft::comms::comms_t const& comm, OutputIterator output_first, std::vector const& recvcounts, std::vector const& displacements, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert( thrust::tuple_size::value_type>::value == @@ -1036,7 +1043,7 @@ device_allgatherv(raft::comms::comms_t const& comm, OutputIterator, 
size_t{0}, tuple_size>() - .run(comm, input_first, output_first, recvcounts, displacements, stream); + .run(comm, input_first, output_first, recvcounts, displacements, stream_view); } template @@ -1050,10 +1057,10 @@ device_gatherv(raft::comms::comms_t const& comm, std::vector const& recvcounts, std::vector const& displacements, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { detail::device_gatherv_impl( - comm, input_first, output_first, sendcount, recvcounts, displacements, root, stream); + comm, input_first, output_first, sendcount, recvcounts, displacements, root, stream_view); } template @@ -1068,7 +1075,7 @@ device_gatherv(raft::comms::comms_t const& comm, std::vector const& recvcounts, std::vector const& displacements, int root, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { static_assert( thrust::tuple_size::value_type>::value == @@ -1081,7 +1088,7 @@ device_gatherv(raft::comms::comms_t const& comm, OutputIterator, size_t{0}, tuple_size>() - .run(comm, input_first, output_first, sendcount, recvcounts, displacements, root, stream); + .run(comm, input_first, output_first, sendcount, recvcounts, displacements, root, stream_view); } } // namespace experimental diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 009dde845b5..e4f7067cfdf 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -18,9 +18,9 @@ #include #include -#include #include #include +#include #include #include @@ -46,11 +46,11 @@ inline std::tuple, std::vector> compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const &comm, rmm::device_uvector const &d_tx_value_counts, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { auto const comm_size = comm.get_size(); - rmm::device_uvector d_rx_value_counts(comm_size, stream); + rmm::device_uvector d_rx_value_counts(comm_size, stream_view); // FIXME: this needs to be replaced 
with AlltoAll once NCCL 2.8 is released. std::vector tx_counts(comm_size, size_t{1}); @@ -72,12 +72,12 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const &comm, rx_counts, rx_offsets, rx_src_ranks, - stream); + stream_view); - raft::update_host(tx_counts.data(), d_tx_value_counts.data(), comm_size, stream); - raft::update_host(rx_counts.data(), d_rx_value_counts.data(), comm_size, stream); + raft::update_host(tx_counts.data(), d_tx_value_counts.data(), comm_size, stream_view.value()); + raft::update_host(rx_counts.data(), d_rx_value_counts.data(), comm_size, stream_view.value()); - CUDA_TRY(cudaStreamSynchronize(stream)); // rx_counts should be up-to-date + stream_view.synchronize(); std::partial_sum(tx_counts.begin(), tx_counts.end() - 1, tx_offsets.begin() + 1); std::partial_sum(rx_counts.begin(), rx_counts.end() - 1, rx_offsets.begin() + 1); @@ -115,9 +115,9 @@ rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [I ValueIterator tx_value_last /* [INOUT */, ValueToGPUIdOp value_to_group_id_op, int num_groups, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { - thrust::sort(rmm::exec_policy(stream)->on(stream), + thrust::sort(rmm::exec_policy(stream_view), tx_value_first, tx_value_last, [value_to_group_id_op] __device__(auto lhs, auto rhs) { @@ -127,19 +127,19 @@ rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [I auto group_id_first = thrust::make_transform_iterator( tx_value_first, [value_to_group_id_op] __device__(auto value) { return value_to_group_id_op(value); }); - rmm::device_uvector d_tx_dst_ranks(num_groups, stream); - rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream); + rmm::device_uvector d_tx_dst_ranks(num_groups, stream_view); + rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream_view); auto last = - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream_view), group_id_first, group_id_first + 
thrust::distance(tx_value_first, tx_value_last), thrust::make_constant_iterator(size_t{1}), d_tx_dst_ranks.begin(), d_tx_value_counts.begin()); if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) { - rmm::device_uvector d_counts(num_groups, stream); - thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); - thrust::scatter(rmm::exec_policy(stream)->on(stream), + rmm::device_uvector d_counts(num_groups, stream_view); + thrust::fill(rmm::exec_policy(stream_view), d_counts.begin(), d_counts.end(), size_t{0}); + thrust::scatter(rmm::exec_policy(stream_view), d_tx_value_counts.begin(), thrust::get<1>(last), d_tx_dst_ranks.begin(), @@ -156,9 +156,9 @@ rmm::device_uvector groupby_and_count(VertexIterator tx_key_first /* [IN ValueIterator tx_value_first /* [INOUT */, KeyToGPUIdOp key_to_group_id_op, int num_groups, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { - thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), + thrust::sort_by_key(rmm::exec_policy(stream_view), tx_key_first, tx_key_last, tx_value_first, @@ -168,18 +168,18 @@ rmm::device_uvector groupby_and_count(VertexIterator tx_key_first /* [IN auto group_id_first = thrust::make_transform_iterator( tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); }); - rmm::device_uvector d_tx_dst_ranks(num_groups, stream); - rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream); - auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + rmm::device_uvector d_tx_dst_ranks(num_groups, stream_view); + rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream_view); + auto last = thrust::reduce_by_key(rmm::exec_policy(stream_view), group_id_first, group_id_first + thrust::distance(tx_key_first, tx_key_last), thrust::make_constant_iterator(size_t{1}), d_tx_dst_ranks.begin(), d_tx_value_counts.begin()); if (thrust::distance(d_tx_dst_ranks.begin(), 
thrust::get<0>(last)) < num_groups) { - rmm::device_uvector d_counts(num_groups, stream); - thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); - thrust::scatter(rmm::exec_policy(stream)->on(stream), + rmm::device_uvector d_counts(num_groups, stream_view); + thrust::fill(rmm::exec_policy(stream_view), d_counts.begin(), d_counts.end(), size_t{0}); + thrust::scatter(rmm::exec_policy(stream_view), d_tx_value_counts.begin(), thrust::get<1>(last), d_tx_dst_ranks.begin(), @@ -194,12 +194,13 @@ template auto shuffle_values(raft::comms::comms_t const &comm, TxValueIterator tx_value_first, std::vector const &tx_value_counts, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { auto const comm_size = comm.get_size(); - rmm::device_uvector d_tx_value_counts(comm_size, stream); - raft::update_device(d_tx_value_counts.data(), tx_value_counts.data(), comm_size, stream); + rmm::device_uvector d_tx_value_counts(comm_size, stream_view); + raft::update_device( + d_tx_value_counts.data(), tx_value_counts.data(), comm_size, stream_view.value()); std::vector tx_counts{}; std::vector tx_offsets{}; @@ -208,11 +209,11 @@ auto shuffle_values(raft::comms::comms_t const &comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); auto rx_value_buffer = allocate_dataframe_buffer::value_type>( - rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream); + rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). 
@@ -227,7 +228,7 @@ auto shuffle_values(raft::comms::comms_t const &comm, rx_counts, rx_offsets, rx_src_ranks, - stream); + stream_view); if (rx_counts.size() < static_cast(comm_size)) { std::vector tmp_rx_counts(comm_size, size_t{0}); @@ -246,12 +247,12 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const &comm, ValueIterator tx_value_first /* [INOUT */, ValueIterator tx_value_last /* [INOUT */, ValueToGPUIdOp value_to_gpu_id_op, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { auto const comm_size = comm.get_size(); - auto d_tx_value_counts = - groupby_and_count(tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream); + auto d_tx_value_counts = groupby_and_count( + tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream_view); std::vector tx_counts{}; std::vector tx_offsets{}; @@ -260,11 +261,11 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const &comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); auto rx_value_buffer = allocate_dataframe_buffer::value_type>( - rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream); + rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). 
@@ -279,7 +280,7 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const &comm, rx_counts, rx_offsets, rx_src_ranks, - stream); + stream_view); if (rx_counts.size() < static_cast(comm_size)) { std::vector tmp_rx_counts(comm_size, size_t{0}); @@ -298,12 +299,12 @@ auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const &comm, VertexIterator tx_key_last /* [INOUT */, ValueIterator tx_value_first /* [INOUT */, KeyToGPUIdOp key_to_gpu_id_op, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { auto const comm_size = comm.get_size(); auto d_tx_value_counts = groupby_and_count( - tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, comm.get_size(), stream); + tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, comm.get_size(), stream_view); std::vector tx_counts{}; std::vector tx_offsets{}; @@ -312,13 +313,13 @@ auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const &comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); rmm::device_uvector::value_type> rx_keys( - rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream); + rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); auto rx_value_buffer = allocate_dataframe_buffer::value_type>( - rx_keys.size(), stream); + rx_keys.size(), stream_view); // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). 
@@ -331,7 +332,7 @@ auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const &comm, rx_counts, rx_offsets, rx_src_ranks, - stream); + stream_view); // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). @@ -346,7 +347,7 @@ auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const &comm, rx_counts, rx_offsets, rx_src_ranks, - stream); + stream_view); if (rx_counts.size() < static_cast(comm_size)) { std::vector tmp_rx_counts(comm_size, size_t{0}); diff --git a/cpp/src/centrality/betweenness_centrality.cu b/cpp/src/centrality/betweenness_centrality.cu index cdee2140382..32dddd203db 100644 --- a/cpp/src/centrality/betweenness_centrality.cu +++ b/cpp/src/centrality/betweenness_centrality.cu @@ -23,7 +23,9 @@ #include #include #include + #include +#include #include #include "betweenness_centrality.cuh" @@ -227,15 +229,13 @@ void BC::compute_single_source(vertex_t so // the traversal, this value is avalaible within the bfs implementation and // there could be a way to access it directly and avoid both replace and the // max - thrust::replace(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::replace(rmm::exec_policy(handle_.get_stream_view()), distances_, distances_ + number_of_vertices_, std::numeric_limits::max(), static_cast(-1)); - auto current_max_depth = - thrust::max_element(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), - distances_, - distances_ + number_of_vertices_); + auto current_max_depth = thrust::max_element( + rmm::exec_policy(handle_.get_stream_view()), distances_, distances_ + number_of_vertices_); vertex_t max_depth = 0; CUDA_TRY(cudaMemcpy(&max_depth, current_max_depth, sizeof(vertex_t), cudaMemcpyDeviceToHost)); // Step 2) Dependency accumulation @@ -265,7 +265,7 @@ void BC::accumulate(vertex_t source_vertex template void BC::initialize_dependencies() { - 
thrust::fill(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), deltas_, deltas_ + number_of_vertices_, static_cast(0)); @@ -315,16 +315,13 @@ template ::add_reached_endpoints_to_source_betweenness( vertex_t source_vertex) { - vertex_t number_of_unvisited_vertices = - thrust::count(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), - distances_, - distances_ + number_of_vertices_, - -1); + vertex_t number_of_unvisited_vertices = thrust::count( + rmm::exec_policy(handle_.get_stream_view()), distances_, distances_ + number_of_vertices_, -1); vertex_t number_of_visited_vertices_except_source = number_of_vertices_ - number_of_unvisited_vertices - 1; rmm::device_vector buffer(1); buffer[0] = number_of_visited_vertices_except_source; - thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::transform(rmm::exec_policy(handle_.get_stream_view()), buffer.begin(), buffer.end(), betweenness_ + source_vertex, @@ -335,7 +332,7 @@ void BC::add_reached_endpoints_to_source_b template void BC::add_vertices_dependencies_to_betweenness() { - thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::transform(rmm::exec_policy(handle_.get_stream_view()), deltas_, deltas_ + number_of_vertices_, betweenness_, @@ -420,7 +417,7 @@ void BC::apply_rescale_factor_to_betweenne { size_t result_size = number_of_vertices_; if (is_edge_betweenness_) result_size = number_of_edges_; - thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::transform(rmm::exec_policy(handle_.get_stream_view()), betweenness_, betweenness_ + result_size, thrust::make_constant_iterator(rescale_factor), diff --git a/cpp/src/community/ecg.cu b/cpp/src/community/ecg.cu index ef171d127fe..b990055c16c 100644 --- a/cpp/src/community/ecg.cu +++ b/cpp/src/community/ecg.cu @@ -20,8 +20,8 @@ #include #include -#include #include 
+#include #include @@ -90,16 +90,15 @@ struct update_functor { * responsible for freeing the allocated memory using ALLOC_FREE_TRY(). */ template -void get_permutation_vector(T size, T seed, T *permutation, cudaStream_t stream) +void get_permutation_vector(T size, T seed, T *permutation, rmm::cuda_stream_view stream_view) { - rmm::device_uvector randoms_v(size, stream); + rmm::device_uvector randoms_v(size, stream_view); thrust::counting_iterator index(seed); - thrust::transform( - rmm::exec_policy(stream)->on(stream), index, index + size, randoms_v.begin(), prg()); - thrust::sequence(rmm::exec_policy(stream)->on(stream), permutation, permutation + size, 0); + thrust::transform(rmm::exec_policy(stream_view), index, index + size, randoms_v.begin(), prg()); + thrust::sequence(rmm::exec_policy(stream_view), permutation, permutation + size, 0); thrust::sort_by_key( - rmm::exec_policy(stream)->on(stream), randoms_v.begin(), randoms_v.end(), permutation); + rmm::exec_policy(stream_view), randoms_v.begin(), randoms_v.end(), permutation); } template @@ -117,10 +116,12 @@ class EcgLouvain : public cugraph::Louvain { void initialize_dendrogram_level(vertex_t num_vertices) override { - this->dendrogram_->add_level(0, num_vertices, this->stream_); + this->dendrogram_->add_level(0, num_vertices, this->handle_.get_stream_view()); - get_permutation_vector( - num_vertices, seed_, this->dendrogram_->current_level_begin(), this->stream_); + get_permutation_vector(num_vertices, + seed_, + this->dendrogram_->current_level_begin(), + this->handle_.get_stream_view()); } private: @@ -146,11 +147,9 @@ void ecg(raft::handle_t const &handle, "Invalid input argument: clustering is NULL, should be a device pointer to " "memory for storing the result"); - cudaStream_t stream{0}; + rmm::device_uvector ecg_weights_v(graph.number_of_edges, handle.get_stream_view()); - rmm::device_uvector ecg_weights_v(graph.number_of_edges, handle.get_stream()); - - 
thrust::copy(rmm::exec_policy(stream)->on(stream), + thrust::copy(rmm::exec_policy(handle.get_stream_view()), graph.edge_data, graph.edge_data + graph.number_of_edges, ecg_weights_v.data()); @@ -172,17 +171,18 @@ void ecg(raft::handle_t const &handle, dim3 grid, block; block.x = 512; grid.x = min(vertex_t{CUDA_MAX_BLOCKS}, (graph.number_of_edges / 512 + 1)); - match_check_kernel<<>>(graph.number_of_edges, - graph.number_of_vertices, - graph.offsets, - graph.indices, - runner.get_dendrogram().get_level_ptr_nocheck(0), - ecg_weights_v.data()); + match_check_kernel<<>>( + graph.number_of_edges, + graph.number_of_vertices, + graph.offsets, + graph.indices, + runner.get_dendrogram().get_level_ptr_nocheck(0), + ecg_weights_v.data()); } // Set weights = min_weight + (1 - min-weight)*sum/ensemble_size update_functor uf(min_weight, ensemble_size); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(handle.get_stream_view()), ecg_weights_v.begin(), ecg_weights_v.end(), ecg_weights_v.begin(), diff --git a/cpp/src/community/leiden.cuh b/cpp/src/community/leiden.cuh index 4ffb7c20eb2..e7e358777a4 100644 --- a/cpp/src/community/leiden.cuh +++ b/cpp/src/community/leiden.cuh @@ -42,10 +42,12 @@ class Leiden : public Louvain { this->timer_start("update_clustering_constrained"); rmm::device_uvector next_cluster_v(this->dendrogram_->current_level_size(), - this->stream_); - rmm::device_uvector delta_Q_v(graph.number_of_edges, this->stream_); - rmm::device_uvector cluster_hash_v(graph.number_of_edges, this->stream_); - rmm::device_uvector old_cluster_sum_v(graph.number_of_vertices, this->stream_); + this->handle_.get_stream_view()); + rmm::device_uvector delta_Q_v(graph.number_of_edges, this->handle_.get_stream_view()); + rmm::device_uvector cluster_hash_v(graph.number_of_edges, + this->handle_.get_stream_view()); + rmm::device_uvector old_cluster_sum_v(graph.number_of_vertices, + this->handle_.get_stream_view()); vertex_t const 
*d_src_indices = this->src_indices_v_.data(); vertex_t const *d_dst_indices = graph.indices; @@ -56,7 +58,7 @@ class Leiden : public Louvain { weight_t *d_delta_Q = delta_Q_v.data(); vertex_t *d_constraint = constraint_v_.data(); - thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_), + thrust::copy(rmm::exec_policy(this->handle_.get_stream_view()), this->dendrogram_->current_level_begin(), this->dendrogram_->current_level_end(), next_cluster_v.data()); @@ -79,7 +81,7 @@ class Leiden : public Louvain { // Filter out positive delta_Q values for nodes not in the same constraint group thrust::for_each( - rmm::exec_policy(this->stream_)->on(this->stream_), + rmm::exec_policy(this->handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_edges), [d_src_indices, d_dst_indices, d_constraint, d_delta_Q] __device__(vertex_t i) { @@ -95,14 +97,14 @@ class Leiden : public Louvain { new_Q = this->modularity(total_edge_weight, resolution, graph, next_cluster_v.data()); if (new_Q > cur_Q) { - thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_), + thrust::copy(rmm::exec_policy(this->handle_.get_stream_view()), next_cluster_v.begin(), next_cluster_v.end(), this->dendrogram_->current_level_begin()); } } - this->timer_stop(this->stream_); + this->timer_stop(this->handle_.get_stream_view()); return cur_Q; } @@ -110,7 +112,7 @@ class Leiden : public Louvain { { size_t num_level{0}; - weight_t total_edge_weight = thrust::reduce(rmm::exec_policy(this->stream_)->on(this->stream_), + weight_t total_edge_weight = thrust::reduce(rmm::exec_policy(this->handle_.get_stream_view()), this->weights_v_.begin(), this->weights_v_.end()); @@ -132,9 +134,10 @@ class Leiden : public Louvain { // // Initialize every cluster to reference each vertex to itself // - this->dendrogram_->add_level(0, current_graph.number_of_vertices, this->stream_); + this->dendrogram_->add_level( + 0, current_graph.number_of_vertices, 
this->handle_.get_stream_view()); - thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_), + thrust::sequence(rmm::exec_policy(this->handle_.get_stream_view()), this->dendrogram_->current_level_begin(), this->dendrogram_->current_level_end()); diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 8fa2b81783a..2c6cf31a61e 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -23,6 +23,7 @@ #include #include +#include //#define TIMING @@ -52,29 +53,28 @@ class Louvain { // to change the logic to populate this properly // in generate_superverticies_graph. // - offsets_v_(graph.number_of_vertices + 1, handle.get_stream()), - indices_v_(graph.number_of_edges, handle.get_stream()), - weights_v_(graph.number_of_edges, handle.get_stream()), - src_indices_v_(graph.number_of_edges, handle.get_stream()), - vertex_weights_v_(graph.number_of_vertices, handle.get_stream()), - cluster_weights_v_(graph.number_of_vertices, handle.get_stream()), - tmp_arr_v_(graph.number_of_vertices, handle.get_stream()), - cluster_inverse_v_(graph.number_of_vertices, handle.get_stream()), + offsets_v_(graph.number_of_vertices + 1, handle.get_stream_view()), + indices_v_(graph.number_of_edges, handle.get_stream_view()), + weights_v_(graph.number_of_edges, handle.get_stream_view()), + src_indices_v_(graph.number_of_edges, handle.get_stream_view()), + vertex_weights_v_(graph.number_of_vertices, handle.get_stream_view()), + cluster_weights_v_(graph.number_of_vertices, handle.get_stream_view()), + tmp_arr_v_(graph.number_of_vertices, handle.get_stream_view()), + cluster_inverse_v_(graph.number_of_vertices, handle.get_stream_view()), number_of_vertices_(graph.number_of_vertices), - number_of_edges_(graph.number_of_edges), - stream_(handle.get_stream()) + number_of_edges_(graph.number_of_edges) { - thrust::copy(rmm::exec_policy(stream_)->on(stream_), + thrust::copy(rmm::exec_policy(handle_.get_stream_view()), graph.offsets, graph.offsets 
+ graph.number_of_vertices + 1, offsets_v_.begin()); - thrust::copy(rmm::exec_policy(stream_)->on(stream_), + thrust::copy(rmm::exec_policy(handle_.get_stream_view()), graph.indices, graph.indices + graph.number_of_edges, indices_v_.begin()); - thrust::copy(rmm::exec_policy(stream_)->on(stream_), + thrust::copy(rmm::exec_policy(handle_.get_stream_view()), graph.edge_data, graph.edge_data + graph.number_of_edges, weights_v_.begin()); @@ -89,17 +89,19 @@ class Louvain { { vertex_t n_verts = graph.number_of_vertices; - rmm::device_uvector inc(n_verts, stream_); - rmm::device_uvector deg(n_verts, stream_); + rmm::device_uvector inc(n_verts, handle_.get_stream_view()); + rmm::device_uvector deg(n_verts, handle_.get_stream_view()); - thrust::fill(rmm::exec_policy(stream_)->on(stream_), inc.begin(), inc.end(), weight_t{0.0}); - thrust::fill(rmm::exec_policy(stream_)->on(stream_), deg.begin(), deg.end(), weight_t{0.0}); + thrust::fill( + rmm::exec_policy(handle_.get_stream_view()), inc.begin(), inc.end(), weight_t{0.0}); + thrust::fill( + rmm::exec_policy(handle_.get_stream_view()), deg.begin(), deg.end(), weight_t{0.0}); // FIXME: Already have weighted degree computed in main loop, // could pass that in rather than computing d_deg... 
which // would save an atomicAdd (synchronization) // - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::for_each(rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_vertices), [d_inc = inc.data(), @@ -123,7 +125,7 @@ class Louvain { }); weight_t Q = thrust::transform_reduce( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_vertices), [d_deg = deg.data(), d_inc = inc.data(), total_edge_weight, resolution] __device__( @@ -146,8 +148,8 @@ class Louvain { virtual weight_t operator()(size_t max_level, weight_t resolution) { - weight_t total_edge_weight = - thrust::reduce(rmm::exec_policy(stream_)->on(stream_), weights_v_.begin(), weights_v_.end()); + weight_t total_edge_weight = thrust::reduce( + rmm::exec_policy(handle_.get_stream_view()), weights_v_.begin(), weights_v_.end()); weight_t best_modularity = weight_t{-1}; @@ -193,10 +195,10 @@ class Louvain { #endif } - void timer_stop(cudaStream_t stream) + void timer_stop(rmm::cuda_stream_view stream_view) { #ifdef TIMING - CUDA_TRY(cudaStreamSynchronize(stream)); + stream_view.synchronize(); hr_timer_.stop(); #endif } @@ -210,9 +212,9 @@ class Louvain { virtual void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(0, num_vertices, stream_); + dendrogram_->add_level(0, num_vertices, handle_.get_stream_view()); - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), + thrust::sequence(rmm::exec_policy(handle_.get_stream_view()), dendrogram_->current_level_begin(), dendrogram_->current_level_end()); } @@ -232,7 +234,7 @@ class Louvain { // MNMG: copy_v_transform_reduce_out_nbr, then copy // thrust::for_each( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), 
thrust::make_counting_iterator(graph.number_of_vertices), [d_offsets, d_indices, d_weights, d_vertex_weights, d_cluster_weights] __device__( @@ -244,7 +246,7 @@ class Louvain { d_cluster_weights[src] = sum; }); - timer_stop(stream_); + timer_stop(handle_.get_stream_view()); } virtual weight_t update_clustering(weight_t total_edge_weight, @@ -253,17 +255,19 @@ class Louvain { { timer_start("update_clustering"); - rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), stream_); - rmm::device_uvector delta_Q_v(graph.number_of_edges, stream_); - rmm::device_uvector cluster_hash_v(graph.number_of_edges, stream_); - rmm::device_uvector old_cluster_sum_v(graph.number_of_vertices, stream_); + rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), + handle_.get_stream_view()); + rmm::device_uvector delta_Q_v(graph.number_of_edges, handle_.get_stream_view()); + rmm::device_uvector cluster_hash_v(graph.number_of_edges, handle_.get_stream_view()); + rmm::device_uvector old_cluster_sum_v(graph.number_of_vertices, + handle_.get_stream_view()); vertex_t *d_cluster = dendrogram_->current_level_begin(); weight_t const *d_vertex_weights = vertex_weights_v_.data(); weight_t *d_cluster_weights = cluster_weights_v_.data(); weight_t *d_delta_Q = delta_Q_v.data(); - thrust::copy(rmm::exec_policy(stream_)->on(stream_), + thrust::copy(rmm::exec_policy(handle_.get_stream_view()), dendrogram_->current_level_begin(), dendrogram_->current_level_end(), next_cluster_v.data()); @@ -291,14 +295,14 @@ class Louvain { new_Q = modularity(total_edge_weight, resolution, graph, next_cluster_v.data()); if (new_Q > cur_Q) { - thrust::copy(rmm::exec_policy(stream_)->on(stream_), + thrust::copy(rmm::exec_policy(handle_.get_stream_view()), next_cluster_v.begin(), next_cluster_v.end(), dendrogram_->current_level_begin()); } } - timer_stop(stream_); + timer_stop(handle_.get_stream_view()); return cur_Q; } @@ -320,18 +324,20 @@ class Louvain { weight_t *d_old_cluster_sum = 
old_cluster_sum_v.data(); weight_t *d_new_cluster_sum = d_delta_Q; - thrust::fill(rmm::exec_policy(stream_)->on(stream_), + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), cluster_hash_v.begin(), cluster_hash_v.end(), vertex_t{-1}); - thrust::fill( - rmm::exec_policy(stream_)->on(stream_), delta_Q_v.begin(), delta_Q_v.end(), weight_t{0.0}); - thrust::fill(rmm::exec_policy(stream_)->on(stream_), + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), + delta_Q_v.begin(), + delta_Q_v.end(), + weight_t{0.0}); + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), old_cluster_sum_v.begin(), old_cluster_sum_v.end(), weight_t{0.0}); - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::for_each(rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_edges), [d_src_indices = src_indices_v_.data(), @@ -370,7 +376,7 @@ class Louvain { }); thrust::for_each( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_edges), [total_edge_weight, @@ -409,16 +415,19 @@ class Louvain { rmm::device_uvector &delta_Q_v, bool up_down) { - rmm::device_uvector temp_vertices_v(graph.number_of_vertices, stream_); - rmm::device_uvector temp_cluster_v(graph.number_of_vertices, stream_); - rmm::device_uvector temp_delta_Q_v(graph.number_of_vertices, stream_); - - thrust::fill(rmm::exec_policy(stream_)->on(stream_), + rmm::device_uvector temp_vertices_v(graph.number_of_vertices, + handle_.get_stream_view()); + rmm::device_uvector temp_cluster_v(graph.number_of_vertices, + handle_.get_stream_view()); + rmm::device_uvector temp_delta_Q_v(graph.number_of_vertices, + handle_.get_stream_view()); + + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), temp_cluster_v.begin(), temp_cluster_v.end(), vertex_t{-1}); - thrust::fill(rmm::exec_policy(stream_)->on(stream_), + 
thrust::fill(rmm::exec_policy(handle_.get_stream_view()), temp_delta_Q_v.begin(), temp_delta_Q_v.end(), weight_t{0}); @@ -430,7 +439,7 @@ class Louvain { thrust::make_zip_iterator(thrust::make_tuple(temp_cluster_v.begin(), temp_delta_Q_v.begin())); auto cluster_reduce_end = - thrust::reduce_by_key(rmm::exec_policy(stream_)->on(stream_), + thrust::reduce_by_key(rmm::exec_policy(handle_.get_stream_view()), src_indices_v_.begin(), src_indices_v_.end(), cluster_reduce_iterator, @@ -449,7 +458,7 @@ class Louvain { vertex_t final_size = thrust::distance(temp_vertices_v.data(), cluster_reduce_end.first); - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::for_each(rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(final_size), [up_down, @@ -480,12 +489,12 @@ class Louvain { // renumber the clusters to the range 0..(num_clusters-1) vertex_t num_clusters = renumber_clusters(); - cluster_weights_v_.resize(num_clusters, stream_); + cluster_weights_v_.resize(num_clusters, handle_.get_stream_view()); // shrink our graph to represent the graph of supervertices generate_superverticies_graph(graph, num_clusters); - timer_stop(stream_); + timer_stop(handle_.get_stream_view()); } vertex_t renumber_clusters() @@ -499,7 +508,7 @@ class Louvain { // // New technique. 
Initialize cluster_inverse_v_ to 0 // - thrust::fill(rmm::exec_policy(stream_)->on(stream_), + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), cluster_inverse_v_.begin(), cluster_inverse_v_.end(), vertex_t{0}); @@ -510,7 +519,7 @@ class Louvain { auto first_1 = thrust::make_constant_iterator(1); auto last_1 = first_1 + old_num_clusters; - thrust::scatter(rmm::exec_policy(stream_)->on(stream_), + thrust::scatter(rmm::exec_policy(handle_.get_stream_view()), first_1, last_1, dendrogram_->current_level_begin(), @@ -520,47 +529,47 @@ class Louvain { // Now we'll copy all of the clusters that have a value of 1 into a temporary array // auto copy_end = thrust::copy_if( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(old_num_clusters), tmp_arr_v_.begin(), [d_cluster_inverse] __device__(const vertex_t idx) { return d_cluster_inverse[idx] == 1; }); vertex_t new_num_clusters = thrust::distance(tmp_arr_v_.begin(), copy_end); - tmp_arr_v_.resize(new_num_clusters, stream_); + tmp_arr_v_.resize(new_num_clusters, handle_.get_stream_view()); // // Now we can set each value in cluster_inverse of a cluster to its index // - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::for_each(rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(new_num_clusters), [d_cluster_inverse, d_tmp_array] __device__(const vertex_t idx) { d_cluster_inverse[d_tmp_array[idx]] = idx; }); - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::for_each(rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(old_num_clusters), [d_cluster, d_cluster_inverse] __device__(vertex_t i) { d_cluster[i] = d_cluster_inverse[d_cluster[i]]; }); - cluster_inverse_v_.resize(new_num_clusters, stream_); + cluster_inverse_v_.resize(new_num_clusters, 
handle_.get_stream_view()); return new_num_clusters; } void generate_superverticies_graph(graph_t &graph, vertex_t num_clusters) { - rmm::device_uvector new_src_v(graph.number_of_edges, stream_); - rmm::device_uvector new_dst_v(graph.number_of_edges, stream_); - rmm::device_uvector new_weight_v(graph.number_of_edges, stream_); + rmm::device_uvector new_src_v(graph.number_of_edges, handle_.get_stream_view()); + rmm::device_uvector new_dst_v(graph.number_of_edges, handle_.get_stream_view()); + rmm::device_uvector new_weight_v(graph.number_of_edges, handle_.get_stream_view()); // // Renumber the COO // - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::for_each(rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_edges), [d_old_src = src_indices_v_.data(), @@ -576,12 +585,12 @@ class Louvain { }); thrust::stable_sort_by_key( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream_view()), new_dst_v.begin(), new_dst_v.end(), thrust::make_zip_iterator(thrust::make_tuple(new_src_v.begin(), new_weight_v.begin()))); thrust::stable_sort_by_key( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream_view()), new_src_v.begin(), new_src_v.end(), thrust::make_zip_iterator(thrust::make_tuple(new_dst_v.begin(), new_weight_v.begin()))); @@ -594,7 +603,7 @@ class Louvain { thrust::make_zip_iterator(thrust::make_tuple(new_src_v.begin(), new_dst_v.begin())); auto new_start = thrust::make_zip_iterator(thrust::make_tuple(src_indices_v_.data(), graph.indices)); - auto new_end = thrust::reduce_by_key(rmm::exec_policy(stream_)->on(stream_), + auto new_end = thrust::reduce_by_key(rmm::exec_policy(handle_.get_stream_view()), start, start + graph.number_of_edges, new_weight_v.begin(), @@ -606,20 +615,21 @@ class Louvain { graph.number_of_edges = thrust::distance(new_start, new_end.first); graph.number_of_vertices = num_clusters; - detail::fill_offset( - 
src_indices_v_.data(), graph.offsets, num_clusters, graph.number_of_edges, stream_); - CHECK_CUDA(stream_); + detail::fill_offset(src_indices_v_.data(), + graph.offsets, + num_clusters, + graph.number_of_edges, + handle_.get_stream_view()); - src_indices_v_.resize(graph.number_of_edges, stream_); - indices_v_.resize(graph.number_of_edges, stream_); - weights_v_.resize(graph.number_of_edges, stream_); + src_indices_v_.resize(graph.number_of_edges, handle_.get_stream_view()); + indices_v_.resize(graph.number_of_edges, handle_.get_stream_view()); + weights_v_.resize(graph.number_of_edges, handle_.get_stream_view()); } protected: raft::handle_t const &handle_; vertex_t number_of_vertices_; edge_t number_of_edges_; - cudaStream_t stream_; std::unique_ptr> dendrogram_; diff --git a/cpp/src/converters/COOtoCSR.cuh b/cpp/src/converters/COOtoCSR.cuh index 2876f1ccf52..7dcf28cbb0f 100644 --- a/cpp/src/converters/COOtoCSR.cuh +++ b/cpp/src/converters/COOtoCSR.cuh @@ -30,8 +30,8 @@ #include #include -#include #include +#include #include #include @@ -55,38 +55,38 @@ namespace detail { * @tparam WT Type of edge weights. Supported value : float or double. 
* * @param[in] graph The input graph object - * @param[in] stream The cuda stream for kernel calls + * @param[in] stream_view The cuda stream for kernel calls * * @param[out] result Total number of vertices */ template -VT sort(GraphCOOView &graph, cudaStream_t stream) +VT sort(GraphCOOView &graph, rmm::cuda_stream_view stream_view) { VT max_src_id; VT max_dst_id; if (graph.has_data()) { thrust::stable_sort_by_key( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream_view), graph.dst_indices, graph.dst_indices + graph.number_of_edges, thrust::make_zip_iterator(thrust::make_tuple(graph.src_indices, graph.edge_data))); CUDA_TRY(cudaMemcpy( &max_dst_id, &(graph.dst_indices[graph.number_of_edges - 1]), sizeof(VT), cudaMemcpyDefault)); thrust::stable_sort_by_key( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream_view), graph.src_indices, graph.src_indices + graph.number_of_edges, thrust::make_zip_iterator(thrust::make_tuple(graph.dst_indices, graph.edge_data))); CUDA_TRY(cudaMemcpy( &max_src_id, &(graph.src_indices[graph.number_of_edges - 1]), sizeof(VT), cudaMemcpyDefault)); } else { - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), + thrust::stable_sort_by_key(rmm::exec_policy(stream_view), graph.dst_indices, graph.dst_indices + graph.number_of_edges, graph.src_indices); CUDA_TRY(cudaMemcpy( &max_dst_id, &(graph.dst_indices[graph.number_of_edges - 1]), sizeof(VT), cudaMemcpyDefault)); - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), + thrust::stable_sort_by_key(rmm::exec_policy(stream_view), graph.src_indices, graph.src_indices + graph.number_of_edges, graph.dst_indices); @@ -97,14 +97,15 @@ VT sort(GraphCOOView &graph, cudaStream_t stream) } template -void fill_offset( - VT *source, ET *offsets, VT number_of_vertices, ET number_of_edges, cudaStream_t stream) +void fill_offset(VT *source, + ET *offsets, + VT number_of_vertices, + ET number_of_edges, + rmm::cuda_stream_view stream_view) { - 
thrust::fill(rmm::exec_policy(stream)->on(stream), - offsets, - offsets + number_of_vertices + 1, - number_of_edges); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::fill( + rmm::exec_policy(stream_view), offsets, offsets + number_of_vertices + 1, number_of_edges); + thrust::for_each(rmm::exec_policy(stream_view), thrust::make_counting_iterator(1), thrust::make_counting_iterator(number_of_edges), [source, offsets] __device__(ET index) { @@ -116,7 +117,7 @@ void fill_offset( off[src[0]] = ET{0}; auto iter = thrust::make_reverse_iterator(offsets + number_of_vertices + 1); - thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::inclusive_scan(rmm::exec_policy(stream_view), iter, iter + number_of_vertices + 1, iter, @@ -127,15 +128,15 @@ template rmm::device_buffer create_offset(VT *source, VT number_of_vertices, ET number_of_edges, - cudaStream_t stream, + rmm::cuda_stream_view stream_view, rmm::mr::device_memory_resource *mr) { // Offset array needs an extra element at the end to contain the ending offsets // of the last vertex - rmm::device_buffer offsets_buffer(sizeof(ET) * (number_of_vertices + 1), stream, mr); + rmm::device_buffer offsets_buffer(sizeof(ET) * (number_of_vertices + 1), stream_view, mr); ET *offsets = static_cast(offsets_buffer.data()); - fill_offset(source, offsets, number_of_vertices, number_of_edges, stream); + fill_offset(source, offsets, number_of_vertices, number_of_edges, stream_view); return offsets_buffer; } @@ -146,13 +147,13 @@ template std::unique_ptr> coo_to_csr(GraphCOOView const &graph, rmm::mr::device_memory_resource *mr) { - cudaStream_t stream{nullptr}; + rmm::cuda_stream_view stream_view; - GraphCOO temp_graph(graph, stream, mr); + GraphCOO temp_graph(graph, stream_view.value(), mr); GraphCOOView temp_graph_view = temp_graph.view(); - VT total_vertex_count = detail::sort(temp_graph_view, stream); + VT total_vertex_count = detail::sort(temp_graph_view, stream_view); rmm::device_buffer offsets = 
detail::create_offset( - temp_graph.src_indices(), total_vertex_count, temp_graph.number_of_edges(), stream, mr); + temp_graph.src_indices(), total_vertex_count, temp_graph.number_of_edges(), stream_view, mr); auto coo_contents = temp_graph.release(); GraphSparseContents csr_contents{ total_vertex_count, @@ -167,11 +168,14 @@ std::unique_ptr> coo_to_csr(GraphCOOView const template void coo_to_csr_inplace(GraphCOOView &graph, GraphCSRView &result) { - cudaStream_t stream{nullptr}; - - detail::sort(graph, stream); - detail::fill_offset( - graph.src_indices, result.offsets, graph.number_of_vertices, graph.number_of_edges, stream); + rmm::cuda_stream_view stream_view; + + detail::sort(graph, stream_view); + detail::fill_offset(graph.src_indices, + result.offsets, + graph.number_of_vertices, + graph.number_of_edges, + stream_view); CUDA_TRY(cudaMemcpy( result.indices, graph.dst_indices, sizeof(VT) * graph.number_of_edges, cudaMemcpyDefault)); diff --git a/cpp/src/experimental/graph.cu b/cpp/src/experimental/graph.cu index ad6f51d75fe..2a6a60e5280 100644 --- a/cpp/src/experimental/graph.cu +++ b/cpp/src/experimental/graph.cu @@ -20,10 +20,10 @@ #include #include -#include #include #include #include +#include #include #include @@ -68,13 +68,13 @@ std:: vertex_t minor_first, vertex_t minor_last, bool is_weighted, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { - rmm::device_uvector offsets((major_last - major_first) + 1, stream); - rmm::device_uvector indices(edgelist.number_of_edges, stream); - rmm::device_uvector weights(is_weighted ? edgelist.number_of_edges : 0, stream); - thrust::fill(rmm::exec_policy(stream)->on(stream), offsets.begin(), offsets.end(), edge_t{0}); - thrust::fill(rmm::exec_policy(stream)->on(stream), indices.begin(), indices.end(), vertex_t{0}); + rmm::device_uvector offsets((major_last - major_first) + 1, stream_view); + rmm::device_uvector indices(edgelist.number_of_edges, stream_view); + rmm::device_uvector weights(is_weighted ? 
edgelist.number_of_edges : 0, stream_view); + thrust::fill(rmm::exec_policy(stream_view), offsets.begin(), offsets.end(), edge_t{0}); + thrust::fill(rmm::exec_policy(stream_view), indices.begin(), indices.end(), vertex_t{0}); // FIXME: need to performance test this code with R-mat graphs having highly-skewed degree // distribution. If there is a small number of vertices with very large degrees, atomicAdd can @@ -91,7 +91,7 @@ std:: auto p_indices = indices.data(); auto p_weights = is_weighted ? weights.data() : static_cast(nullptr); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(stream_view), store_transposed ? edgelist.p_dst_vertices : edgelist.p_src_vertices, store_transposed ? edgelist.p_dst_vertices + edgelist.number_of_edges : edgelist.p_src_vertices + edgelist.number_of_edges, @@ -100,12 +100,12 @@ std:: }); thrust::exclusive_scan( - rmm::exec_policy(stream)->on(stream), offsets.begin(), offsets.end(), offsets.begin()); + rmm::exec_policy(stream_view), offsets.begin(), offsets.end(), offsets.begin()); if (is_weighted) { auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( edgelist.p_src_vertices, edgelist.p_dst_vertices, edgelist.p_edge_weights)); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(stream_view), edge_first, edge_first + edgelist.number_of_edges, [p_offsets, p_indices, p_weights, major_first] __device__(auto e) { @@ -128,7 +128,7 @@ std:: } else { auto edge_first = thrust::make_zip_iterator( thrust::make_tuple(edgelist.p_src_vertices, edgelist.p_dst_vertices)); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(stream_view), edge_first, edge_first + edgelist.number_of_edges, [p_offsets, p_indices, p_weights, major_first] __device__(auto e) { @@ -185,7 +185,7 @@ graph_tget_handle_ptr()->get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); auto const 
col_comm_size = col_comm.get_size(); - auto default_stream = this->get_handle_ptr()->get_stream(); + auto default_stream_view = this->get_handle_ptr()->get_stream_view(); CUGRAPH_EXPECTS(edgelists.size() > 0, "Invalid input argument: edgelists.size() should be non-zero."); @@ -226,7 +226,7 @@ graph_ton(default_stream), + CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(default_stream_view), edge_first, edge_first + edgelists[i].number_of_edges, out_of_range_t{ @@ -234,7 +234,7 @@ graph_tget_number_of_edges(), "Invalid input argument: the sum of local edge counts does not match with number_of_edges."); @@ -257,9 +257,9 @@ graph_t offsets(0, default_stream); - rmm::device_uvector indices(0, default_stream); - rmm::device_uvector weights(0, default_stream); + rmm::device_uvector offsets(0, default_stream_view); + rmm::device_uvector indices(0, default_stream_view); + rmm::device_uvector weights(0, default_stream_view); std::tie(offsets, indices, weights) = edgelist_to_compressed_sparse(edgelists[i], major_first, @@ -267,7 +267,7 @@ graph_tget_handle_ptr()->get_stream()); + default_stream_view); adj_matrix_partition_offsets_.push_back(std::move(offsets)); adj_matrix_partition_indices_.push_back(std::move(indices)); if (properties.is_weighted) { adj_matrix_partition_weights_.push_back(std::move(weights)); } @@ -282,7 +282,7 @@ graph_ton(default_stream), + CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream_view), degrees.begin(), degrees.end(), thrust::greater{}), @@ -294,26 +294,26 @@ graph_t::max())); rmm::device_uvector d_thresholds(detail::num_segments_per_vertex_partition - 1, - default_stream); + default_stream_view); std::vector h_thresholds = { static_cast(detail::mid_degree_threshold * col_comm_size), static_cast(detail::low_degree_threshold * col_comm_size)}; raft::update_device( - d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), default_stream); + d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), 
default_stream_view.value()); rmm::device_uvector segment_offsets(detail::num_segments_per_vertex_partition + 1, - default_stream); + default_stream_view); // temporaries are necessary because the &&-overload of device_uvector is deleted // Note that we must sync `default_stream` before these temporaries go out of scope to // avoid use after free. (The syncs are at the end of this function) auto zero_vertex = vertex_t{0}; auto vertex_count = static_cast(degrees.size()); - segment_offsets.set_element_async(0, zero_vertex, default_stream); + segment_offsets.set_element_async(0, zero_vertex, default_stream_view); segment_offsets.set_element_async( - detail::num_segments_per_vertex_partition, vertex_count, default_stream); + detail::num_segments_per_vertex_partition, vertex_count, default_stream_view); - thrust::upper_bound(rmm::exec_policy(default_stream)->on(default_stream), + thrust::upper_bound(rmm::exec_policy(default_stream_view), degrees.begin(), degrees.end(), d_thresholds.begin(), @@ -322,23 +322,22 @@ graph_t{}); rmm::device_uvector aggregate_segment_offsets(col_comm_size * segment_offsets.size(), - default_stream); + default_stream_view); col_comm.allgather(segment_offsets.data(), aggregate_segment_offsets.data(), segment_offsets.size(), - default_stream); + default_stream_view.value()); adj_matrix_partition_segment_offsets_.resize(aggregate_segment_offsets.size()); raft::update_host(adj_matrix_partition_segment_offsets_.data(), aggregate_segment_offsets.data(), aggregate_segment_offsets.size(), - default_stream); + default_stream_view.value()); - auto status = col_comm.sync_stream( - default_stream); // this is necessary as degrees, d_thresholds, and segment_offsets will - // become out-of-scope once control flow exits this block and - // adj_matrix_partition_segment_offsets_ can be used right after return. 
- CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); + default_stream_view + .synchronize(); // this is necessary as degrees, d_thresholds, and segment_offsets will + // become out-of-scope once control flow exits this block and + // adj_matrix_partition_segment_offsets_ can be used right after return. } // optional expensive checks (part 3/3) @@ -366,13 +365,13 @@ graph_t( handle, number_of_vertices, edgelist.number_of_edges, properties), - offsets_(rmm::device_uvector(0, handle.get_stream())), - indices_(rmm::device_uvector(0, handle.get_stream())), - weights_(rmm::device_uvector(0, handle.get_stream())) + offsets_(rmm::device_uvector(0, handle.get_stream_view())), + indices_(rmm::device_uvector(0, handle.get_stream_view())), + weights_(rmm::device_uvector(0, handle.get_stream_view())) { // cheap error checks - auto default_stream = this->get_handle_ptr()->get_stream(); + auto default_stream_view = this->get_handle_ptr()->get_stream_view(); CUGRAPH_EXPECTS( ((edgelist.number_of_edges == 0) || (edgelist.p_src_vertices != nullptr)) && @@ -392,7 +391,7 @@ graph_ton(default_stream), + rmm::exec_policy(default_stream_view), edge_first, edge_first + edgelist.number_of_edges, out_of_range_t{ @@ -415,7 +414,7 @@ graph_tget_number_of_vertices(), properties.is_weighted, - this->get_handle_ptr()->get_stream()); + default_stream_view); // update degree-based segment offsets (to be used for graph analytics kernel optimization) @@ -428,7 +427,7 @@ graph_ton(default_stream), + thrust::is_sorted(rmm::exec_policy(default_stream_view), degree_first, degree_first + this->get_number_of_vertices(), thrust::greater{}), @@ -440,26 +439,26 @@ graph_t::max())); rmm::device_uvector d_thresholds(detail::num_segments_per_vertex_partition - 1, - default_stream); + default_stream_view); std::vector h_thresholds = {static_cast(detail::mid_degree_threshold), static_cast(detail::low_degree_threshold)}; raft::update_device( - d_thresholds.data(), 
h_thresholds.data(), h_thresholds.size(), default_stream); + d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), default_stream_view.value()); rmm::device_uvector segment_offsets(detail::num_segments_per_vertex_partition + 1, - default_stream); + default_stream_view); // temporaries are necessary because the &&-overload of device_uvector is deleted // Note that we must sync `default_stream` before these temporaries go out of scope to // avoid use after free. (The syncs are at the end of this function) auto zero_vertex = vertex_t{0}; auto vertex_count = static_cast(this->get_number_of_vertices()); - segment_offsets.set_element_async(0, zero_vertex, default_stream); + segment_offsets.set_element_async(0, zero_vertex, default_stream_view); segment_offsets.set_element_async( - detail::num_segments_per_vertex_partition, vertex_count, default_stream); + detail::num_segments_per_vertex_partition, vertex_count, default_stream_view); - thrust::upper_bound(rmm::exec_policy(default_stream)->on(default_stream), + thrust::upper_bound(rmm::exec_policy(default_stream_view), degree_first, degree_first + this->get_number_of_vertices(), d_thresholds.begin(), @@ -468,11 +467,13 @@ graph_t{}); segment_offsets_.resize(segment_offsets.size()); - raft::update_host( - segment_offsets_.data(), segment_offsets.data(), segment_offsets.size(), default_stream); + raft::update_host(segment_offsets_.data(), + segment_offsets.data(), + segment_offsets.size(), + default_stream_view.value()); - CUDA_TRY(cudaStreamSynchronize( - default_stream)); // this is necessary as segment_offsets_ can be used right after return. + default_stream_view + .synchronize(); // this is necessary as segment_offsets_ can be used right after return. 
} // optional expensive checks (part 3/3) diff --git a/cpp/src/experimental/graph_view.cu b/cpp/src/experimental/graph_view.cu index 3dc5dee4756..b8fcbb8ccba 100644 --- a/cpp/src/experimental/graph_view.cu +++ b/cpp/src/experimental/graph_view.cu @@ -22,9 +22,9 @@ #include #include -#include #include #include +#include #include #include @@ -210,7 +210,7 @@ graph_view_tget_handle_ptr()->get_stream(); + auto default_stream_view = this->get_handle_ptr()->get_stream_view(); auto const row_comm_rank = this->get_handle_ptr() ->get_subcomm(cugraph::partition_2d::key_naming_t().row_name()) @@ -228,7 +228,7 @@ graph_view_ton(default_stream), + thrust::is_sorted(rmm::exec_policy(default_stream_view), adj_matrix_partition_offsets[i], adj_matrix_partition_offsets[i] + (major_last - major_first + 1)), "Internal Error: adj_matrix_partition_offsets[] is not sorted."); @@ -236,20 +236,20 @@ graph_view_ton(default_stream), + thrust::count_if(rmm::exec_policy(default_stream_view), adj_matrix_partition_indices[i], adj_matrix_partition_indices[i] + number_of_local_edges, out_of_range_t{minor_first, minor_last}) == 0, "Internal Error: adj_matrix_partition_indices[] have out-of-range vertex IDs."); } number_of_local_edges_sum = host_scalar_allreduce( - this->get_handle_ptr()->get_comms(), number_of_local_edges_sum, default_stream); + this->get_handle_ptr()->get_comms(), number_of_local_edges_sum, default_stream_view.value()); CUGRAPH_EXPECTS(number_of_local_edges_sum == this->get_number_of_edges(), "Internal Error: the sum of local edges counts does not match with " "number_of_local_edges."); @@ -257,7 +257,7 @@ graph_view_ton(default_stream), + thrust::is_sorted(rmm::exec_policy(default_stream_view), degrees.begin(), degrees.end(), thrust::greater{}), @@ -332,16 +332,16 @@ graph_view_tget_handle_ptr()->get_stream(); + auto default_stream_view = this->get_handle_ptr()->get_stream_view(); - CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream)->on(default_stream), + 
CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream_view), offsets, offsets + (this->get_number_of_vertices() + 1)), "Internal Error: offsets is not sorted."); // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved CUGRAPH_EXPECTS( - thrust::count_if(rmm::exec_policy(default_stream)->on(default_stream), + thrust::count_if(rmm::exec_policy(default_stream_view), indices, indices + this->get_number_of_edges(), out_of_range_t{0, this->get_number_of_vertices()}) == 0, @@ -351,7 +351,7 @@ graph_view_t{offsets}); - CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream)->on(default_stream), + CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream_view), degree_first, degree_first + this->get_number_of_vertices(), thrust::greater{}), @@ -531,9 +531,8 @@ graph_view_ton(handle.get_stream()), - in_degrees.begin(), - in_degrees.end()); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), in_degrees.begin(), in_degrees.end()); rmm::device_scalar ret(edge_t{0}, handle.get_stream()); device_allreduce(handle.get_comms(), it != in_degrees.end() ? it : ret.data(), @@ -558,9 +557,8 @@ edge_t graph_view_ton(handle.get_stream()), - in_degrees.begin(), - in_degrees.end()); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), in_degrees.begin(), in_degrees.end()); edge_t ret{0}; if (it != in_degrees.end()) { raft::update_host(&ret, it, 1, handle.get_stream()); } handle.get_stream_view().synchronize(); @@ -577,9 +575,8 @@ graph_view_ton(handle.get_stream()), - out_degrees.begin(), - out_degrees.end()); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), out_degrees.begin(), out_degrees.end()); rmm::device_scalar ret(edge_t{0}, handle.get_stream()); device_allreduce(handle.get_comms(), it != out_degrees.end() ? 
it : ret.data(), @@ -604,9 +601,8 @@ edge_t graph_view_ton(handle.get_stream()), - out_degrees.begin(), - out_degrees.end()); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), out_degrees.begin(), out_degrees.end()); edge_t ret{0}; if (it != out_degrees.end()) { raft::update_host(&ret, it, 1, handle.get_stream()); } handle.get_stream_view().synchronize(); @@ -623,9 +619,8 @@ graph_view_ton(handle.get_stream()), - in_weight_sums.begin(), - in_weight_sums.end()); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), in_weight_sums.begin(), in_weight_sums.end()); rmm::device_scalar ret(weight_t{0.0}, handle.get_stream()); device_allreduce(handle.get_comms(), it != in_weight_sums.end() ? it : ret.data(), @@ -650,9 +645,8 @@ weight_t graph_view_ton(handle.get_stream()), - in_weight_sums.begin(), - in_weight_sums.end()); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), in_weight_sums.begin(), in_weight_sums.end()); weight_t ret{0.0}; if (it != in_weight_sums.end()) { raft::update_host(&ret, it, 1, handle.get_stream()); } handle.get_stream_view().synchronize(); @@ -669,9 +663,8 @@ graph_view_ton(handle.get_stream()), - out_weight_sums.begin(), - out_weight_sums.end()); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), out_weight_sums.begin(), out_weight_sums.end()); rmm::device_scalar ret(weight_t{0.0}, handle.get_stream()); device_allreduce(handle.get_comms(), it != out_weight_sums.end() ? 
it : ret.data(), @@ -696,9 +689,8 @@ weight_t graph_view_t< std::enable_if_t>::compute_max_out_weight_sum(raft::handle_t const& handle) const { auto out_weight_sums = compute_out_weight_sums(handle); - auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - out_weight_sums.begin(), - out_weight_sums.end()); + auto it = thrust::max_element( + rmm::exec_policy(handle.get_stream_view()), out_weight_sums.begin(), out_weight_sums.end()); weight_t ret{0.0}; if (it != out_weight_sums.end()) { raft::update_host(&ret, it, 1, handle.get_stream()); } handle.get_stream_view().synchronize(); diff --git a/cpp/src/experimental/induced_subgraph.cu b/cpp/src/experimental/induced_subgraph.cu index 062bf18cd95..af96103c486 100644 --- a/cpp/src/experimental/induced_subgraph.cu +++ b/cpp/src/experimental/induced_subgraph.cu @@ -20,9 +20,9 @@ #include #include -#include #include #include +#include #include #include @@ -73,18 +73,17 @@ extract_induced_subgraphs( raft::update_host(&should_be_zero, subgraph_offsets, 1, handle.get_stream()); raft::update_host( &num_aggregate_subgraph_vertices, subgraph_offsets + num_subgraphs, 1, handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + handle.get_stream_view().synchronize(); CUGRAPH_EXPECTS(should_be_zero == 0, "Invalid input argument: subgraph_offsets[0] should be 0."); - CUGRAPH_EXPECTS( - thrust::is_sorted(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - subgraph_offsets, - subgraph_offsets + (num_subgraphs + 1)), - "Invalid input argument: subgraph_offsets is not sorted."); + CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(handle.get_stream_view()), + subgraph_offsets, + subgraph_offsets + (num_subgraphs + 1)), + "Invalid input argument: subgraph_offsets is not sorted."); vertex_partition_device_t> vertex_partition(graph_view); - CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + 
CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream_view()), subgraph_vertices, subgraph_vertices + num_aggregate_subgraph_vertices, [vertex_partition] __device__(auto v) { @@ -95,7 +94,7 @@ extract_induced_subgraphs( CUGRAPH_EXPECTS( thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rmm::exec_policy(handle.get_stream_view()), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(num_subgraphs), [subgraph_offsets, subgraph_vertices] __device__(auto i) { @@ -119,28 +118,28 @@ extract_induced_subgraphs( if (multi_gpu) { CUGRAPH_FAIL("Unimplemented."); - return std::make_tuple(rmm::device_uvector(0, handle.get_stream()), - rmm::device_uvector(0, handle.get_stream()), - rmm::device_uvector(0, handle.get_stream()), - rmm::device_uvector(0, handle.get_stream())); + return std::make_tuple(rmm::device_uvector(0, handle.get_stream_view()), + rmm::device_uvector(0, handle.get_stream_view()), + rmm::device_uvector(0, handle.get_stream_view()), + rmm::device_uvector(0, handle.get_stream_view())); } else { // 2-1. Phase 1: calculate memory requirements size_t num_aggregate_subgraph_vertices{}; raft::update_host( &num_aggregate_subgraph_vertices, subgraph_offsets + num_subgraphs, 1, handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + handle.get_stream_view().synchronize(); rmm::device_uvector subgraph_vertex_output_offsets( num_aggregate_subgraph_vertices + 1, - handle.get_stream()); // for each element of subgraph_vertices + handle.get_stream_view()); // for each element of subgraph_vertices matrix_partition_device_t> matrix_partition(graph_view, 0); // count the numbers of the induced subgraph edges for each vertex in the aggregate subgraph // vertex list. 
thrust::transform( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rmm::exec_policy(handle.get_stream_view()), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(num_aggregate_subgraph_vertices), subgraph_vertex_output_offsets.begin(), @@ -166,7 +165,7 @@ extract_induced_subgraphs( return thrust::binary_search(thrust::seq, vertex_first, vertex_last, nbr); }); }); - thrust::exclusive_scan(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::exclusive_scan(rmm::exec_policy(handle.get_stream_view()), subgraph_vertex_output_offsets.begin(), subgraph_vertex_output_offsets.end(), subgraph_vertex_output_offsets.begin()); @@ -176,19 +175,19 @@ extract_induced_subgraphs( subgraph_vertex_output_offsets.data() + num_aggregate_subgraph_vertices, 1, handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + handle.get_stream_view().synchronize(); // 2-2. Phase 2: find the edges in the induced subgraphs - rmm::device_uvector edge_majors(num_aggregate_edges, handle.get_stream()); - rmm::device_uvector edge_minors(num_aggregate_edges, handle.get_stream()); + rmm::device_uvector edge_majors(num_aggregate_edges, handle.get_stream_view()); + rmm::device_uvector edge_minors(num_aggregate_edges, handle.get_stream_view()); rmm::device_uvector edge_weights( - graph_view.is_weighted() ? num_aggregate_edges : size_t{0}, handle.get_stream()); + graph_view.is_weighted() ? 
num_aggregate_edges : size_t{0}, handle.get_stream_view()); // fill the edge list buffer (to be returned) for each vetex in the aggregate subgraph vertex // list (use the offsets computed in the Phase 1) thrust::for_each( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rmm::exec_policy(handle.get_stream_view()), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(num_aggregate_subgraph_vertices), [subgraph_offsets, @@ -244,8 +243,8 @@ extract_induced_subgraphs( } }); - rmm::device_uvector subgraph_edge_offsets(num_subgraphs + 1, handle.get_stream()); - thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rmm::device_uvector subgraph_edge_offsets(num_subgraphs + 1, handle.get_stream_view()); + thrust::gather(rmm::exec_policy(handle.get_stream_view()), subgraph_offsets, subgraph_offsets + (num_subgraphs + 1), subgraph_vertex_output_offsets.begin(), diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 6205f13e94d..e77480dce50 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -61,12 +61,12 @@ class Louvain { handle_(handle), dendrogram_(std::make_unique>()), current_graph_view_(graph_view), - cluster_keys_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), - cluster_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), - vertex_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), - src_vertex_weights_cache_v_(0, handle.get_stream()), - src_cluster_cache_v_(0, handle.get_stream()), - dst_cluster_cache_v_(0, handle.get_stream()) + cluster_keys_v_(graph_view.get_number_of_local_vertices(), handle.get_stream_view()), + cluster_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream_view()), + vertex_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream_view()), + src_vertex_weights_cache_v_(0, handle.get_stream_view()), + 
src_cluster_cache_v_(0, handle.get_stream_view()), + dst_cluster_cache_v_(0, handle.get_stream_view()) { } @@ -122,16 +122,16 @@ class Louvain { #endif } - void timer_stop(cudaStream_t stream) + void timer_stop(rmm::cuda_stream_view stream_view) { #ifdef TIMING if (graph_view_t::is_multi_gpu) { if (handle.get_comms().get_rank() == 0) { - CUDA_TRY(cudaStreamSynchronize(stream)); + stream_view.synchronize(); hr_timer_.stop(); } } else { - CUDA_TRY(cudaStreamSynchronize(stream)); + stream_view.synchronize(); hr_timer_.stop(); } #endif @@ -152,9 +152,9 @@ class Louvain { void initialize_dendrogram_level(vertex_t num_vertices) { dendrogram_->add_level( - current_graph_view_.get_local_vertex_first(), num_vertices, handle_.get_stream()); + current_graph_view_.get_local_vertex_first(), num_vertices, handle_.get_stream_view()); - thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::sequence(rmm::exec_policy(handle_.get_stream_view()), dendrogram_->current_level_begin(), dendrogram_->current_level_end(), current_graph_view_.get_local_vertex_first()); @@ -164,7 +164,7 @@ class Louvain { weight_t modularity(weight_t total_edge_weight, weight_t resolution) { weight_t sum_degree_squared = thrust::transform_reduce( - rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + rmm::exec_policy(handle_.get_stream_view()), cluster_weights_v_.begin(), cluster_weights_v_.end(), [] __device__(weight_t p) { return p * p; }, @@ -201,10 +201,10 @@ class Louvain { timer_start("compute_vertex_and_cluster_weights"); vertex_weights_v_ = current_graph_view_.compute_out_weight_sums(handle_); - cluster_keys_v_.resize(vertex_weights_v_.size(), handle_.get_stream()); - cluster_weights_v_.resize(vertex_weights_v_.size(), handle_.get_stream()); + cluster_keys_v_.resize(vertex_weights_v_.size(), handle_.get_stream_view()); + cluster_weights_v_.resize(vertex_weights_v_.size(), handle_.get_stream_view()); - 
thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::sequence(rmm::exec_policy(handle_.get_stream_view()), cluster_keys_v_.begin(), cluster_keys_v_.end(), current_graph_view_.get_local_vertex_first()); @@ -219,8 +219,8 @@ class Louvain { if (graph_view_t::is_multi_gpu) { auto const comm_size = handle_.get_comms().get_size(); - rmm::device_uvector rx_keys_v(0, handle_.get_stream()); - rmm::device_uvector rx_weights_v(0, handle_.get_stream()); + rmm::device_uvector rx_keys_v(0, handle_.get_stream_view()); + rmm::device_uvector rx_weights_v(0, handle_.get_stream_view()); auto pair_first = thrust::make_zip_iterator( thrust::make_tuple(cluster_keys_v_.begin(), cluster_weights_v_.begin())); @@ -233,13 +233,13 @@ class Louvain { [key_func = cugraph::experimental::detail::compute_gpu_id_from_vertex_t{ comm_size}] __device__(auto val) { return key_func(thrust::get<0>(val)); }, - handle_.get_stream()); + handle_.get_stream_view()); cluster_keys_v_ = std::move(rx_keys_v); cluster_weights_v_ = std::move(rx_weights_v); } - timer_stop(handle_.get_stream()); + timer_stop(handle_.get_stream_view()); } template @@ -247,7 +247,7 @@ class Louvain { { if (graph_view_t::is_multi_gpu) { src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), - handle_.get_stream()); + handle_.get_stream_view()); copy_to_adj_matrix_row(handle_, current_graph_view_, input.begin(), src_cache_v.begin()); return src_cache_v.begin(); } else { @@ -260,7 +260,7 @@ class Louvain { { if (graph_view_t::is_multi_gpu) { dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols(), - handle_.get_stream()); + handle_.get_stream_view()); copy_to_adj_matrix_col(handle_, current_graph_view_, input.begin(), dst_cache_v.begin()); return dst_cache_v.begin(); } else { @@ -273,7 +273,7 @@ class Louvain { timer_start("update_clustering"); rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), - 
handle_.get_stream()); + handle_.get_stream_view()); raft::copy(next_cluster_v.begin(), dendrogram_->current_level_begin(), @@ -308,7 +308,7 @@ class Louvain { } } - timer_stop(handle_.get_stream()); + timer_stop(handle_.get_stream_view()); return cur_Q; } @@ -317,7 +317,7 @@ class Louvain { { auto output_buffer = cugraph::experimental::allocate_dataframe_buffer>( - current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream_view()); experimental::copy_v_transform_reduce_out_nbr( handle_, @@ -340,7 +340,7 @@ class Louvain { output_buffer)); thrust::transform( - rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + rmm::exec_policy(handle_.get_stream_view()), cugraph::experimental::get_dataframe_buffer_begin>( output_buffer), cugraph::experimental::get_dataframe_buffer_begin>( @@ -350,7 +350,7 @@ class Louvain { [] __device__(auto p) { return thrust::get<1>(p); }); thrust::transform( - rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + rmm::exec_policy(handle_.get_stream_view()), cugraph::experimental::get_dataframe_buffer_begin>( output_buffer), cugraph::experimental::get_dataframe_buffer_begin>( @@ -400,12 +400,12 @@ class Louvain { map_key_last = cluster_keys_v_.end(); map_value_first = cluster_weights_v_.begin(); } else { - thrust::sort_by_key(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::sort_by_key(rmm::exec_policy(handle_.get_stream_view()), cluster_keys_v_.begin(), cluster_keys_v_.end(), cluster_weights_v_.begin()); - thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::transform(rmm::exec_policy(handle_.get_stream_view()), next_cluster_v.begin(), next_cluster_v.end(), src_cluster_weights_v.begin(), @@ -473,7 +473,7 @@ class Louvain { output_buffer)); thrust::transform( - rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + 
rmm::exec_policy(handle_.get_stream_view()), next_cluster_v.begin(), next_cluster_v.end(), cugraph::experimental::get_dataframe_buffer_begin>( @@ -514,7 +514,7 @@ class Louvain { current_graph_view_ = current_graph_->view(); rmm::device_uvector numbering_indices(numbering_map.size(), handle_.get_stream()); - thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::sequence(rmm::exec_policy(handle_.get_stream_view()), numbering_indices.begin(), numbering_indices.end(), current_graph_view_.get_local_vertex_first()); diff --git a/cpp/src/experimental/relabel.cu b/cpp/src/experimental/relabel.cu index 7e7a4d64b3e..9e5692f933e 100644 --- a/cpp/src/experimental/relabel.cu +++ b/cpp/src/experimental/relabel.cu @@ -22,10 +22,10 @@ #include #include -#include #include #include #include +#include #include #include @@ -63,40 +63,41 @@ void relabel(raft::handle_t const& handle, // find unique old labels (to be relabeled) - rmm::device_uvector unique_old_labels(num_labels, handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rmm::device_uvector unique_old_labels(num_labels, handle.get_stream_view()); + thrust::copy(rmm::exec_policy(handle.get_stream_view()), labels, labels + num_labels, unique_old_labels.data()); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::sort(rmm::exec_policy(handle.get_stream_view()), unique_old_labels.begin(), unique_old_labels.end()); unique_old_labels.resize( - thrust::distance( - unique_old_labels.begin(), - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_old_labels.begin(), - unique_old_labels.end())), - handle.get_stream()); - unique_old_labels.shrink_to_fit(handle.get_stream()); + thrust::distance(unique_old_labels.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream_view()), + unique_old_labels.begin(), + unique_old_labels.end())), + handle.get_stream_view()); + 
unique_old_labels.shrink_to_fit(handle.get_stream_view()); // collect new labels for the unique old labels - rmm::device_uvector new_labels_for_unique_old_labels(0, handle.get_stream()); + rmm::device_uvector new_labels_for_unique_old_labels(0, handle.get_stream_view()); { // shuffle the old_new_label_pairs based on applying the compute_gpu_id_from_vertex_t functor // to the old labels - rmm::device_uvector rx_label_pair_old_labels(0, handle.get_stream()); - rmm::device_uvector rx_label_pair_new_labels(0, handle.get_stream()); + rmm::device_uvector rx_label_pair_old_labels(0, handle.get_stream_view()); + rmm::device_uvector rx_label_pair_new_labels(0, handle.get_stream_view()); { - rmm::device_uvector label_pair_old_labels(num_label_pairs, handle.get_stream()); - rmm::device_uvector label_pair_new_labels(num_label_pairs, handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rmm::device_uvector label_pair_old_labels(num_label_pairs, + handle.get_stream_view()); + rmm::device_uvector label_pair_new_labels(num_label_pairs, + handle.get_stream_view()); + thrust::copy(rmm::exec_policy(handle.get_stream_view()), std::get<0>(old_new_label_pairs), std::get<0>(old_new_label_pairs) + num_label_pairs, label_pair_old_labels.begin()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::copy(rmm::exec_policy(handle.get_stream_view()), std::get<1>(old_new_label_pairs), std::get<1>(old_new_label_pairs) + num_label_pairs, label_pair_new_labels.begin()); @@ -109,13 +110,12 @@ void relabel(raft::handle_t const& handle, pair_first, pair_first + num_label_pairs, [key_func] __device__(auto val) { return key_func(thrust::get<0>(val)); }, - handle.get_stream()); + handle.get_stream_view()); } // update intermediate relabel map - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // cuco::static_map currently does not take stream + handle.get_stream_view().synchronize(); // cuco::static_map 
currently does not take stream auto poly_alloc = rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); @@ -134,28 +134,27 @@ void relabel(raft::handle_t const& handle, thrust::make_tuple(rx_label_pair_old_labels.begin(), rx_label_pair_new_labels.begin())); relabel_map.insert(pair_first, pair_first + rx_label_pair_old_labels.size()); - rx_label_pair_old_labels.resize(0, handle.get_stream()); - rx_label_pair_new_labels.resize(0, handle.get_stream()); - rx_label_pair_old_labels.shrink_to_fit(handle.get_stream()); - rx_label_pair_new_labels.shrink_to_fit(handle.get_stream()); + rx_label_pair_old_labels.resize(0, handle.get_stream_view()); + rx_label_pair_new_labels.resize(0, handle.get_stream_view()); + rx_label_pair_old_labels.shrink_to_fit(handle.get_stream_view()); + rx_label_pair_new_labels.shrink_to_fit(handle.get_stream_view()); // shuffle unique_old_labels, relabel using the intermediate relabel map, and shuffle back { - rmm::device_uvector rx_unique_old_labels(0, handle.get_stream()); + rmm::device_uvector rx_unique_old_labels(0, handle.get_stream_view()); std::vector rx_value_counts{}; std::tie(rx_unique_old_labels, rx_value_counts) = groupby_gpuid_and_shuffle_values( handle.get_comms(), unique_old_labels.begin(), unique_old_labels.end(), [key_func] __device__(auto val) { return key_func(val); }, - handle.get_stream()); + handle.get_stream_view()); - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // cuco::static_map currently does not take stream + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream if (skip_missing_labels) { - thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::transform(rmm::exec_policy(handle.get_stream_view()), rx_unique_old_labels.begin(), rx_unique_old_labels.end(), rx_unique_old_labels.begin(), @@ -173,8 +172,11 @@ void relabel(raft::handle_t const& handle, // corresponding old labels } - std::tie(new_labels_for_unique_old_labels, 
std::ignore) = shuffle_values( - handle.get_comms(), rx_unique_old_labels.begin(), rx_value_counts, handle.get_stream()); + std::tie(new_labels_for_unique_old_labels, std::ignore) = + shuffle_values(handle.get_comms(), + rx_unique_old_labels.begin(), + rx_value_counts, + handle.get_stream_view()); } } @@ -211,7 +213,7 @@ void relabel(raft::handle_t const& handle, thrust::make_tuple(std::get<0>(old_new_label_pairs), std::get<1>(old_new_label_pairs))); relabel_map.insert(pair_first, pair_first + num_label_pairs); if (skip_missing_labels) { - thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::transform(rmm::exec_policy(handle.get_stream_view()), labels, labels + num_labels, labels, @@ -228,7 +230,7 @@ void relabel(raft::handle_t const& handle, if (do_expensive_check && !skip_missing_labels) { CUGRAPH_EXPECTS( - thrust::count(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::count(rmm::exec_policy(handle.get_stream_view()), labels, labels + num_labels, invalid_vertex_id::value) == 0, diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu index d6e3f8c93f6..afd7bce772e 100644 --- a/cpp/src/experimental/renumber_edgelist.cu +++ b/cpp/src/experimental/renumber_edgelist.cu @@ -23,10 +23,10 @@ #include #include -#include #include #include #include +#include #include #include @@ -83,16 +83,16 @@ rmm::device_uvector compute_renumber_map( { rmm::device_uvector sorted_major_labels(edgelist_edge_counts[i], handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::copy(rmm::exec_policy(handle.get_stream_view()), edgelist_major_vertices[i], edgelist_major_vertices[i] + edgelist_edge_counts[i], sorted_major_labels.begin()); // FIXME: better refactor this sort-count_if-reduce_by_key routine for reuse - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + 
thrust::sort(rmm::exec_policy(handle.get_stream_view()), sorted_major_labels.begin(), sorted_major_labels.end()); auto num_unique_labels = - thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::count_if(rmm::exec_policy(handle.get_stream_view()), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(sorted_major_labels.size()), [labels = sorted_major_labels.data()] __device__(auto i) { @@ -100,7 +100,7 @@ rmm::device_uvector compute_renumber_map( }); tmp_major_labels.resize(num_unique_labels, handle.get_stream()); tmp_major_counts.resize(tmp_major_labels.size(), handle.get_stream()); - thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream_view()), sorted_major_labels.begin(), sorted_major_labels.end(), thrust::make_constant_iterator(edge_t{1}), @@ -146,20 +146,19 @@ rmm::device_uvector compute_renumber_map( } if (multi_gpu) { // FIXME: better refactor this sort-count_if-reduce_by_key routine for reuse - thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::sort_by_key(rmm::exec_policy(handle.get_stream_view()), major_labels.begin(), major_labels.end(), major_counts.begin()); - auto num_unique_labels = - thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(major_labels.size()), - [labels = major_labels.data()] __device__(auto i) { - return (i == 0) || (labels[i - 1] != labels[i]); - }); + auto num_unique_labels = thrust::count_if(rmm::exec_policy(handle.get_stream_view()), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(major_labels.size()), + [labels = major_labels.data()] __device__(auto i) { + return (i == 0) || (labels[i - 1] != labels[i]); + }); rmm::device_uvector tmp_major_labels(num_unique_labels, handle.get_stream()); rmm::device_uvector 
tmp_major_counts(tmp_major_labels.size(), handle.get_stream()); - thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream_view()), major_labels.begin(), major_labels.end(), major_counts.begin(), @@ -177,20 +176,18 @@ rmm::device_uvector compute_renumber_map( rmm::device_uvector minor_labels(minor_displs.back() + edgelist_edge_counts.back(), handle.get_stream()); for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) { - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::copy(rmm::exec_policy(handle.get_stream_view()), edgelist_minor_vertices[i], edgelist_minor_vertices[i] + edgelist_edge_counts[i], minor_labels.begin() + minor_displs[i]); } - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - minor_labels.begin(), - minor_labels.end()); - minor_labels.resize( - thrust::distance(minor_labels.begin(), - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - minor_labels.begin(), - minor_labels.end())), - handle.get_stream()); + thrust::sort( + rmm::exec_policy(handle.get_stream_view()), minor_labels.begin(), minor_labels.end()); + minor_labels.resize(thrust::distance(minor_labels.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream_view()), + minor_labels.begin(), + minor_labels.end())), + handle.get_stream()); if (multi_gpu) { auto& comm = handle.get_comms(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -216,15 +213,13 @@ rmm::device_uvector compute_renumber_map( [key_func = detail::compute_gpu_id_from_vertex_t{row_comm_size}] __device__( auto val) { return key_func(val); }, handle.get_stream()); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_minor_labels.begin(), - rx_minor_labels.end()); + thrust::sort( + rmm::exec_policy(handle.get_stream_view()), rx_minor_labels.begin(), 
rx_minor_labels.end()); rx_minor_labels.resize( - thrust::distance( - rx_minor_labels.begin(), - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_minor_labels.begin(), - rx_minor_labels.end())), + thrust::distance(rx_minor_labels.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream_view()), + rx_minor_labels.begin(), + rx_minor_labels.end())), handle.get_stream()); minor_labels = std::move(rx_minor_labels); } @@ -241,14 +236,14 @@ rmm::device_uvector compute_renumber_map( comm.barrier(); // currently, this is ncclAllReduce #endif } - minor_labels.shrink_to_fit(handle.get_stream()); + minor_labels.shrink_to_fit(handle.get_stream_view()); // 3. merge major and minor labels and vertex labels rmm::device_uvector merged_labels(major_labels.size() + minor_labels.size(), - handle.get_stream()); - rmm::device_uvector merged_counts(merged_labels.size(), handle.get_stream()); - thrust::merge_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + handle.get_stream_view()); + rmm::device_uvector merged_counts(merged_labels.size(), handle.get_stream_view()); + thrust::merge_by_key(rmm::exec_policy(handle.get_stream_view()), major_labels.begin(), major_labels.end(), minor_labels.begin(), @@ -267,13 +262,12 @@ rmm::device_uvector compute_renumber_map( rmm::device_uvector labels(merged_labels.size(), handle.get_stream()); rmm::device_uvector counts(labels.size(), handle.get_stream()); - auto pair_it = - thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - merged_labels.begin(), - merged_labels.end(), - merged_counts.begin(), - labels.begin(), - counts.begin()); + auto pair_it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream_view()), + merged_labels.begin(), + merged_labels.end(), + merged_counts.begin(), + labels.begin(), + counts.begin()); merged_labels.resize(0, handle.get_stream()); merged_counts.resize(0, handle.get_stream()); 
merged_labels.shrink_to_fit(handle.get_stream()); @@ -289,14 +283,14 @@ rmm::device_uvector compute_renumber_map( if (optional_vertex_span) { auto [vertices, num_vertices] = *optional_vertex_span; auto num_isolated_vertices = thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rmm::exec_policy(handle.get_stream_view()), vertices, vertices + num_vertices, [label_first = labels.begin(), label_last = labels.end()] __device__(auto v) { return !thrust::binary_search(thrust::seq, label_first, label_last, v); }); isolated_vertices.resize(num_isolated_vertices, handle.get_stream()); - thrust::copy_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::copy_if(rmm::exec_policy(handle.get_stream_view()), vertices, vertices + num_vertices, isolated_vertices.begin(), @@ -308,11 +302,11 @@ rmm::device_uvector compute_renumber_map( if (isolated_vertices.size() > 0) { labels.resize(labels.size() + isolated_vertices.size(), handle.get_stream()); counts.resize(labels.size(), handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::copy(rmm::exec_policy(handle.get_stream_view()), isolated_vertices.begin(), isolated_vertices.end(), labels.end() - isolated_vertices.size()); - thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::fill(rmm::exec_policy(handle.get_stream_view()), counts.end() - isolated_vertices.size(), counts.end(), edge_t{0}); @@ -320,7 +314,7 @@ rmm::device_uvector compute_renumber_map( // 6. 
sort by degree - thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::sort_by_key(rmm::exec_policy(handle.get_stream_view()), counts.begin(), counts.end(), labels.begin(), @@ -341,16 +335,16 @@ void expensive_check_edgelist( if (optional_vertex_span) { auto [vertices, num_vertices] = *optional_vertex_span; sorted_local_vertices.resize(num_vertices, handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::copy(rmm::exec_policy(handle.get_stream_view()), vertices, vertices + num_vertices, sorted_local_vertices.begin()); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::sort(rmm::exec_policy(handle.get_stream_view()), sorted_local_vertices.begin(), sorted_local_vertices.end()); CUGRAPH_EXPECTS(static_cast(thrust::distance( sorted_local_vertices.begin(), - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::unique(rmm::exec_policy(handle.get_stream_view()), sorted_local_vertices.begin(), sorted_local_vertices.end()))) == sorted_local_vertices.size(), "Invalid input argument: local_vertices should not have duplicates."); @@ -375,7 +369,7 @@ void expensive_check_edgelist( auto [local_vertices, num_local_vertices] = *optional_vertex_span; CUGRAPH_EXPECTS( thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rmm::exec_policy(handle.get_stream_view()), local_vertices, local_vertices + num_local_vertices, [comm_rank, @@ -390,7 +384,7 @@ void expensive_check_edgelist( thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i])); CUGRAPH_EXPECTS( thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rmm::exec_policy(handle.get_stream_view()), edge_first, edge_first + edgelist_edge_counts[i], [comm_size, @@ -442,7 +436,7 @@ void expensive_check_edgelist( recvcounts, displacements, handle.get_stream()); - 
thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::sort(rmm::exec_policy(handle.get_stream_view()), sorted_major_vertices.begin(), sorted_major_vertices.end()); } @@ -472,7 +466,7 @@ void expensive_check_edgelist( recvcounts, displacements, handle.get_stream()); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::sort(rmm::exec_policy(handle.get_stream_view()), sorted_minor_vertices.begin(), sorted_minor_vertices.end()); } @@ -492,7 +486,7 @@ void expensive_check_edgelist( thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i])); CUGRAPH_EXPECTS( thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rmm::exec_policy(handle.get_stream_view()), edge_first, edge_first + edgelist_edge_counts[i], [num_major_vertices = static_cast(sorted_major_vertices.size()), @@ -521,7 +515,7 @@ void expensive_check_edgelist( thrust::make_tuple(edgelist_major_vertices[0], edgelist_minor_vertices[0])); CUGRAPH_EXPECTS( thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rmm::exec_policy(handle.get_stream_view()), edge_first, edge_first + edgelist_edge_counts[0], [sorted_local_vertices = sorted_local_vertices.data(), diff --git a/cpp/src/experimental/renumber_utils.cu b/cpp/src/experimental/renumber_utils.cu index 9cd2b9a1408..dc2d44a139a 100644 --- a/cpp/src/experimental/renumber_utils.cu +++ b/cpp/src/experimental/renumber_utils.cu @@ -50,17 +50,16 @@ void renumber_ext_vertices(raft::handle_t const& handle, if (do_expensive_check) { rmm::device_uvector labels(local_int_vertex_last - local_int_vertex_first, - handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + handle.get_stream_view()); + thrust::copy(rmm::exec_policy(handle.get_stream_view()), renumber_map_labels, renumber_map_labels + labels.size(), labels.begin()); - thrust::sort( - 
rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), labels.begin(), labels.end()); - CUGRAPH_EXPECTS(thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - labels.begin(), - labels.end()) == labels.end(), - "Invalid input arguments: renumber_map_labels have duplicate elements."); + thrust::sort(rmm::exec_policy(handle.get_stream_view()), labels.begin(), labels.end()); + CUGRAPH_EXPECTS( + thrust::unique(rmm::exec_policy(handle.get_stream_view()), labels.begin(), labels.end()) == + labels.end(), + "Invalid input arguments: renumber_map_labels have duplicate elements."); } auto poly_alloc = rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); @@ -75,26 +74,26 @@ void renumber_ext_vertices(raft::handle_t const& handle, auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); - rmm::device_uvector sorted_unique_ext_vertices(num_vertices, handle.get_stream()); + rmm::device_uvector sorted_unique_ext_vertices(num_vertices, + handle.get_stream_view()); sorted_unique_ext_vertices.resize( thrust::distance( sorted_unique_ext_vertices.begin(), - thrust::copy_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::copy_if(rmm::exec_policy(handle.get_stream_view()), vertices, vertices + num_vertices, sorted_unique_ext_vertices.begin(), [] __device__(auto v) { return v != invalid_vertex_id::value; })), - handle.get_stream()); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + handle.get_stream_view()); + thrust::sort(rmm::exec_policy(handle.get_stream_view()), sorted_unique_ext_vertices.begin(), sorted_unique_ext_vertices.end()); sorted_unique_ext_vertices.resize( - thrust::distance( - sorted_unique_ext_vertices.begin(), - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - sorted_unique_ext_vertices.begin(), - sorted_unique_ext_vertices.end())), - handle.get_stream()); + thrust::distance(sorted_unique_ext_vertices.begin(), + 
thrust::unique(rmm::exec_policy(handle.get_stream_view()), + sorted_unique_ext_vertices.begin(), + sorted_unique_ext_vertices.end())), + handle.get_stream_view()); auto int_vertices_for_sorted_unique_ext_vertices = collect_values_for_unique_keys( comm, @@ -104,7 +103,7 @@ void renumber_ext_vertices(raft::handle_t const& handle, sorted_unique_ext_vertices.begin(), sorted_unique_ext_vertices.end(), detail::compute_gpu_id_from_vertex_t{comm_size}, - handle.get_stream()); + handle.get_stream_view()); handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream @@ -145,10 +144,10 @@ void renumber_ext_vertices(raft::handle_t const& handle, } if (do_expensive_check) { - rmm::device_uvector contains(num_vertices, handle.get_stream()); + rmm::device_uvector contains(num_vertices, handle.get_stream_view()); renumber_map_ptr->contains(vertices, vertices + num_vertices, contains.begin()); auto vc_pair_first = thrust::make_zip_iterator(thrust::make_tuple(vertices, contains.begin())); - CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream_view()), vc_pair_first, vc_pair_first + num_vertices, [] __device__(auto pair) { @@ -177,7 +176,7 @@ void unrenumber_local_int_vertices( { if (do_expensive_check) { CUGRAPH_EXPECTS( - thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::count_if(rmm::exec_policy(handle.get_stream_view()), vertices, vertices + num_vertices, [local_int_vertex_first, local_int_vertex_last] __device__(auto v) { @@ -188,7 +187,7 @@ void unrenumber_local_int_vertices( "+ num_vertices)."); } - thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::transform(rmm::exec_policy(handle.get_stream_view()), vertices, vertices + num_vertices, vertices, @@ -213,7 +212,7 @@ void unrenumber_int_vertices(raft::handle_t const& handle, if (do_expensive_check) { 
CUGRAPH_EXPECTS( - thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::count_if(rmm::exec_policy(handle.get_stream_view()), vertices, vertices + num_vertices, [int_vertex_last = vertex_partition_lasts.back()] __device__(auto v) { @@ -228,36 +227,36 @@ void unrenumber_int_vertices(raft::handle_t const& handle, auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); - rmm::device_uvector sorted_unique_int_vertices(num_vertices, handle.get_stream()); + rmm::device_uvector sorted_unique_int_vertices(num_vertices, + handle.get_stream_view()); sorted_unique_int_vertices.resize( thrust::distance( sorted_unique_int_vertices.begin(), - thrust::copy_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::copy_if(rmm::exec_policy(handle.get_stream_view()), vertices, vertices + num_vertices, sorted_unique_int_vertices.begin(), [] __device__(auto v) { return v != invalid_vertex_id::value; })), - handle.get_stream()); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + handle.get_stream_view()); + thrust::sort(rmm::exec_policy(handle.get_stream_view()), sorted_unique_int_vertices.begin(), sorted_unique_int_vertices.end()); sorted_unique_int_vertices.resize( - thrust::distance( - sorted_unique_int_vertices.begin(), - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - sorted_unique_int_vertices.begin(), - sorted_unique_int_vertices.end())), - handle.get_stream()); + thrust::distance(sorted_unique_int_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream_view()), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end())), + handle.get_stream_view()); rmm::device_uvector d_vertex_partition_lasts(vertex_partition_lasts.size(), - handle.get_stream()); + handle.get_stream_view()); raft::update_device(d_vertex_partition_lasts.data(), vertex_partition_lasts.data(), vertex_partition_lasts.size(), handle.get_stream()); 
rmm::device_uvector d_tx_int_vertex_offsets(d_vertex_partition_lasts.size(), - handle.get_stream()); - thrust::lower_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + handle.get_stream_view()); + thrust::lower_bound(rmm::exec_policy(handle.get_stream_view()), sorted_unique_int_vertices.begin(), sorted_unique_int_vertices.end(), d_vertex_partition_lasts.begin(), @@ -272,13 +271,13 @@ void unrenumber_int_vertices(raft::handle_t const& handle, std::adjacent_difference( h_tx_int_vertex_counts.begin(), h_tx_int_vertex_counts.end(), h_tx_int_vertex_counts.begin()); - rmm::device_uvector rx_int_vertices(0, handle.get_stream()); + rmm::device_uvector rx_int_vertices(0, handle.get_stream_view()); std::vector rx_int_vertex_counts{}; std::tie(rx_int_vertices, rx_int_vertex_counts) = shuffle_values( - comm, sorted_unique_int_vertices.begin(), h_tx_int_vertex_counts, handle.get_stream()); + comm, sorted_unique_int_vertices.begin(), h_tx_int_vertex_counts, handle.get_stream_view()); auto tx_ext_vertices = std::move(rx_int_vertices); - thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::transform(rmm::exec_policy(handle.get_stream_view()), tx_ext_vertices.begin(), tx_ext_vertices.end(), tx_ext_vertices.begin(), @@ -287,9 +286,9 @@ void unrenumber_int_vertices(raft::handle_t const& handle, }); rmm::device_uvector rx_ext_vertices_for_sorted_unique_int_vertices( - 0, handle.get_stream()); + 0, handle.get_stream_view()); std::tie(rx_ext_vertices_for_sorted_unique_int_vertices, std::ignore) = - shuffle_values(comm, tx_ext_vertices.begin(), rx_int_vertex_counts, handle.get_stream()); + shuffle_values(comm, tx_ext_vertices.begin(), rx_int_vertex_counts, handle.get_stream_view()); handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream diff --git a/cpp/src/generators/generate_rmat_edgelist.cu b/cpp/src/generators/generate_rmat_edgelist.cu index 638d18b1831..40df2fa5568 100644 --- 
a/cpp/src/generators/generate_rmat_edgelist.cu +++ b/cpp/src/generators/generate_rmat_edgelist.cu @@ -17,10 +17,10 @@ #include #include -#include #include #include #include +#include #include #include @@ -53,10 +53,10 @@ std::tuple, rmm::device_uvector> generat auto max_edges_to_generate_per_iteration = static_cast(handle.get_device_properties().multiProcessorCount) * 1024; rmm::device_uvector rands( - std::min(num_edges, max_edges_to_generate_per_iteration) * 2 * scale, handle.get_stream()); + std::min(num_edges, max_edges_to_generate_per_iteration) * 2 * scale, handle.get_stream_view()); - rmm::device_uvector srcs(num_edges, handle.get_stream()); - rmm::device_uvector dsts(num_edges, handle.get_stream()); + rmm::device_uvector srcs(num_edges, handle.get_stream_view()); + rmm::device_uvector dsts(num_edges, handle.get_stream_view()); size_t num_edges_generated{0}; while (num_edges_generated < num_edges) { @@ -67,7 +67,7 @@ std::tuple, rmm::device_uvector> generat rng.uniform( rands.data(), num_edges_to_generate * 2 * scale, 0.0f, 1.0f, handle.get_stream()); thrust::transform( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rmm::exec_policy(handle.get_stream_view()), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(num_edges_to_generate), pair_first, diff --git a/cpp/src/layout/barnes_hut.hpp b/cpp/src/layout/barnes_hut.hpp index ca62eda3716..4cbd8fbd668 100644 --- a/cpp/src/layout/barnes_hut.hpp +++ b/cpp/src/layout/barnes_hut.hpp @@ -16,8 +16,9 @@ #pragma once -#include -#include +#include "bh_kernels.hpp" +#include "fa2_kernels.hpp" +#include "utils.hpp" #include #include @@ -26,9 +27,10 @@ #include #include -#include "bh_kernels.hpp" -#include "fa2_kernels.hpp" -#include "utils.hpp" +#include + +#include +#include namespace cugraph { namespace detail { @@ -52,7 +54,7 @@ void barnes_hut(raft::handle_t const &handle, bool verbose = false, internals::GraphBasedDimRedCallback *callback = nullptr) { - 
rmm::cuda_stream_view stream(handle.get_stream()); + rmm::cuda_stream_view stream_view(handle.get_stream_view()); const edge_t e = graph.number_of_edges; const vertex_t n = graph.number_of_vertices; @@ -67,34 +69,34 @@ void barnes_hut(raft::handle_t const &handle, // Allocate more space //--------------------------------------------------- - rmm::device_uvector d_limiter(1, stream); - rmm::device_uvector d_maxdepthd(1, stream); - rmm::device_uvector d_bottomd(1, stream); - rmm::device_uvector d_radiusd(1, stream); + rmm::device_uvector d_limiter(1, stream_view); + rmm::device_uvector d_maxdepthd(1, stream_view); + rmm::device_uvector d_bottomd(1, stream_view); + rmm::device_uvector d_radiusd(1, stream_view); unsigned *limiter = d_limiter.data(); int *maxdepthd = d_maxdepthd.data(); int *bottomd = d_bottomd.data(); float *radiusd = d_radiusd.data(); - InitializationKernel<<<1, 1, 0, stream.value()>>>(limiter, maxdepthd, radiusd); - CHECK_CUDA(stream.value()); + InitializationKernel<<<1, 1, 0, stream_view.value()>>>(limiter, maxdepthd, radiusd); + CHECK_CUDA(stream_view.value()); const int FOUR_NNODES = 4 * nnodes; const int FOUR_N = 4 * n; const float theta_squared = theta * theta; const int NNODES = nnodes; - rmm::device_uvector d_startl(nnodes + 1, stream); - rmm::device_uvector d_childl((nnodes + 1) * 4, stream); + rmm::device_uvector d_startl(nnodes + 1, stream_view); + rmm::device_uvector d_childl((nnodes + 1) * 4, stream_view); // FA2 requires degree + 1 - rmm::device_uvector d_massl(nnodes + 1, stream); - thrust::fill(rmm::exec_policy(stream), d_massl.begin(), d_massl.end(), 1); + rmm::device_uvector d_massl(nnodes + 1, stream_view); + thrust::fill(rmm::exec_policy(stream_view), d_massl.begin(), d_massl.end(), 1); - rmm::device_uvector d_maxxl(blocks * FACTOR1, stream); - rmm::device_uvector d_maxyl(blocks * FACTOR1, stream); - rmm::device_uvector d_minxl(blocks * FACTOR1, stream); - rmm::device_uvector d_minyl(blocks * FACTOR1, stream); + rmm::device_uvector 
d_maxxl(blocks * FACTOR1, stream_view); + rmm::device_uvector d_maxyl(blocks * FACTOR1, stream_view); + rmm::device_uvector d_minxl(blocks * FACTOR1, stream_view); + rmm::device_uvector d_minyl(blocks * FACTOR1, stream_view); // Actual mallocs int *startl = d_startl.data(); @@ -107,21 +109,21 @@ void barnes_hut(raft::handle_t const &handle, float *minyl = d_minyl.data(); // SummarizationKernel - rmm::device_uvector d_countl(nnodes + 1, stream); + rmm::device_uvector d_countl(nnodes + 1, stream_view); int *countl = d_countl.data(); // SortKernel - rmm::device_uvector d_sortl(nnodes + 1, stream); + rmm::device_uvector d_sortl(nnodes + 1, stream_view); int *sortl = d_sortl.data(); // RepulsionKernel - rmm::device_uvector d_rep_forces((nnodes + 1) * 2, stream); + rmm::device_uvector d_rep_forces((nnodes + 1) * 2, stream_view); float *rep_forces = d_rep_forces.data(); - rmm::device_uvector d_radius_squared(1, stream); + rmm::device_uvector d_radius_squared(1, stream_view); float *radiusd_squared = d_radius_squared.data(); - rmm::device_uvector d_nodes_pos((nnodes + 1) * 2, stream); + rmm::device_uvector d_nodes_pos((nnodes + 1) * 2, stream_view); float *nodes_pos = d_nodes_pos.data(); // Initialize positions with random values @@ -129,10 +131,11 @@ void barnes_hut(raft::handle_t const &handle, // Copy start x and y positions. 
if (x_start && y_start) { - raft::copy(nodes_pos, x_start, n, stream.value()); - raft::copy(nodes_pos + nnodes + 1, y_start, n, stream.value()); + raft::copy(nodes_pos, x_start, n, stream_view.value()); + raft::copy(nodes_pos + nnodes + 1, y_start, n, stream_view.value()); } else { - random_vector(nodes_pos, (nnodes + 1) * 2, random_state, stream.value()); + raft::random::Rng rng(random_state); + rng.uniform(nodes_pos, (nnodes + 1) * 2, -100.0f, 100.0f, stream_view.value()); } // Allocate arrays for force computation @@ -141,24 +144,24 @@ void barnes_hut(raft::handle_t const &handle, float *swinging{nullptr}; float *traction{nullptr}; - rmm::device_uvector d_attract(n * 2, stream); - rmm::device_uvector d_old_forces(n * 2, stream); - rmm::device_uvector d_swinging(n, stream); - rmm::device_uvector d_traction(n, stream); + rmm::device_uvector d_attract(n * 2, stream_view); + rmm::device_uvector d_old_forces(n * 2, stream_view); + rmm::device_uvector d_swinging(n, stream_view); + rmm::device_uvector d_traction(n, stream_view); attract = d_attract.data(); old_forces = d_old_forces.data(); swinging = d_swinging.data(); traction = d_traction.data(); - thrust::fill(rmm::exec_policy(stream), d_old_forces.begin(), d_old_forces.end(), 0.f); + thrust::fill(rmm::exec_policy(stream_view), d_old_forces.begin(), d_old_forces.end(), 0.f); // Sort COO for coalesced memory access. - sort(graph, stream.value()); - CHECK_CUDA(stream.value()); + sort(graph, stream_view.value()); + CHECK_CUDA(stream_view.value()); graph.degree(massl, cugraph::DegreeDirection::OUT); - CHECK_CUDA(stream.value()); + CHECK_CUDA(stream_view.value()); const vertex_t *row = graph.src_indices; const vertex_t *col = graph.dst_indices; @@ -172,7 +175,7 @@ void barnes_hut(raft::handle_t const &handle, // If outboundAttractionDistribution active, compensate. 
if (outbound_attraction_distribution) { - int sum = thrust::reduce(rmm::exec_policy(stream), d_massl.begin(), d_massl.begin() + n); + int sum = thrust::reduce(rmm::exec_policy(stream_view), d_massl.begin(), d_massl.begin() + n); outbound_att_compensation = sum / (float)n; } @@ -195,70 +198,71 @@ void barnes_hut(raft::handle_t const &handle, for (int iter = 0; iter < max_iter; ++iter) { // Reset force values - thrust::fill(rmm::exec_policy(stream), d_rep_forces.begin(), d_rep_forces.end(), 0.f); - thrust::fill(rmm::exec_policy(stream), d_attract.begin(), d_attract.end(), 0.f); - thrust::fill(rmm::exec_policy(stream), d_swinging.begin(), d_swinging.end(), 0.f); - thrust::fill(rmm::exec_policy(stream), d_traction.begin(), d_traction.end(), 0.f); + thrust::fill(rmm::exec_policy(stream_view), d_rep_forces.begin(), d_rep_forces.end(), 0.f); + thrust::fill(rmm::exec_policy(stream_view), d_attract.begin(), d_attract.end(), 0.f); + thrust::fill(rmm::exec_policy(stream_view), d_swinging.begin(), d_swinging.end(), 0.f); + thrust::fill(rmm::exec_policy(stream_view), d_traction.begin(), d_traction.end(), 0.f); - ResetKernel<<<1, 1, 0, stream.value()>>>(radiusd_squared, bottomd, NNODES, radiusd); - CHECK_CUDA(stream.value()); + ResetKernel<<<1, 1, 0, stream_view.value()>>>(radiusd_squared, bottomd, NNODES, radiusd); + CHECK_CUDA(stream_view.value()); // Compute bounding box arround all bodies - BoundingBoxKernel<<>>(startl, - childl, - massl, - nodes_pos, - nodes_pos + nnodes + 1, - maxxl, - maxyl, - minxl, - minyl, - FOUR_NNODES, - NNODES, - n, - limiter, - radiusd); - CHECK_CUDA(stream.value()); - - ClearKernel1<<>>(childl, FOUR_NNODES, FOUR_N); - CHECK_CUDA(stream.value()); + BoundingBoxKernel<<>>( + startl, + childl, + massl, + nodes_pos, + nodes_pos + nnodes + 1, + maxxl, + maxyl, + minxl, + minyl, + FOUR_NNODES, + NNODES, + n, + limiter, + radiusd); + CHECK_CUDA(stream_view.value()); + + ClearKernel1<<>>(childl, FOUR_NNODES, FOUR_N); + CHECK_CUDA(stream_view.value()); // 
Build quadtree - TreeBuildingKernel<<>>( + TreeBuildingKernel<<>>( childl, nodes_pos, nodes_pos + nnodes + 1, NNODES, n, maxdepthd, bottomd, radiusd); - CHECK_CUDA(stream.value()); + CHECK_CUDA(stream_view.value()); - ClearKernel2<<>>(startl, massl, NNODES, bottomd); - CHECK_CUDA(stream.value()); + ClearKernel2<<>>(startl, massl, NNODES, bottomd); + CHECK_CUDA(stream_view.value()); // Summarizes mass and position for each cell, bottom up approach - SummarizationKernel<<>>( + SummarizationKernel<<>>( countl, childl, massl, nodes_pos, nodes_pos + nnodes + 1, NNODES, n, bottomd); - CHECK_CUDA(stream.value()); + CHECK_CUDA(stream_view.value()); // Group closed bodies together, used to speed up Repulsion kernel - SortKernel<<>>( + SortKernel<<>>( sortl, countl, startl, childl, NNODES, n, bottomd); - CHECK_CUDA(stream.value()); + CHECK_CUDA(stream_view.value()); // Force computation O(n . log(n)) - RepulsionKernel<<>>(scaling_ratio, - theta, - epssq, - sortl, - childl, - massl, - nodes_pos, - nodes_pos + nnodes + 1, - rep_forces, - rep_forces + nnodes + 1, - theta_squared, - NNODES, - FOUR_NNODES, - n, - radiusd_squared, - maxdepthd); - CHECK_CUDA(stream.value()); + RepulsionKernel<<>>(scaling_ratio, + theta, + epssq, + sortl, + childl, + massl, + nodes_pos, + nodes_pos + nnodes + 1, + rep_forces, + rep_forces + nnodes + 1, + theta_squared, + NNODES, + FOUR_NNODES, + n, + radiusd_squared, + maxdepthd); + CHECK_CUDA(stream_view.value()); apply_gravity(nodes_pos, nodes_pos + nnodes + 1, @@ -269,7 +273,7 @@ void barnes_hut(raft::handle_t const &handle, strong_gravity_mode, scaling_ratio, n, - stream.value()); + stream_view.value()); apply_attraction(row, col, @@ -284,7 +288,7 @@ void barnes_hut(raft::handle_t const &handle, lin_log_mode, edge_weight_influence, outbound_att_compensation, - stream.value()); + stream_view.value()); compute_local_speed(rep_forces, rep_forces + nnodes + 1, @@ -296,28 +300,30 @@ void barnes_hut(raft::handle_t const &handle, swinging, traction, n, 
- stream.value()); + stream_view.value()); // Compute global swinging and traction values - const float s = thrust::reduce(rmm::exec_policy(stream), d_swinging.begin(), d_swinging.end()); + const float s = + thrust::reduce(rmm::exec_policy(stream_view), d_swinging.begin(), d_swinging.end()); - const float t = thrust::reduce(rmm::exec_policy(stream), d_traction.begin(), d_traction.end()); + const float t = + thrust::reduce(rmm::exec_policy(stream_view), d_traction.begin(), d_traction.end()); // Compute global speed based on gloab and local swinging and traction. adapt_speed(jitter_tolerance, &jt, &speed, &speed_efficiency, s, t, n); // Update positions - apply_forces_bh<<>>(nodes_pos, - nodes_pos + nnodes + 1, - attract, - attract + n, - rep_forces, - rep_forces + nnodes + 1, - old_forces, - old_forces + n, - swinging, - speed, - n); + apply_forces_bh<<>>(nodes_pos, + nodes_pos + nnodes + 1, + attract, + attract + n, + rep_forces, + rep_forces + nnodes + 1, + old_forces, + old_forces + n, + swinging, + speed, + n); if (callback) callback->on_epoch_end(nodes_pos); @@ -329,8 +335,8 @@ void barnes_hut(raft::handle_t const &handle, } // Copy nodes positions into final output pos - raft::copy(pos, nodes_pos, n, stream.value()); - raft::copy(pos + n, nodes_pos + nnodes + 1, n, stream.value()); + raft::copy(pos, nodes_pos, n, stream_view.value()); + raft::copy(pos + n, nodes_pos + nnodes + 1, n, stream_view.value()); if (callback) callback->on_train_end(nodes_pos); } diff --git a/cpp/src/layout/exact_fa2.hpp b/cpp/src/layout/exact_fa2.hpp index a82b7a5faff..567aa8c90c6 100644 --- a/cpp/src/layout/exact_fa2.hpp +++ b/cpp/src/layout/exact_fa2.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include "exact_repulsion.hpp" #include "fa2_kernels.hpp" @@ -50,7 +51,7 @@ void exact_fa2(raft::handle_t const &handle, bool verbose = false, internals::GraphBasedDimRedCallback *callback = nullptr) { - rmm::cuda_stream_view stream(handle.get_stream()); + auto stream_view = 
handle.get_stream_view(); const edge_t e = graph.number_of_edges; const vertex_t n = graph.number_of_vertices; @@ -61,15 +62,15 @@ void exact_fa2(raft::handle_t const &handle, float *d_swinging{nullptr}; float *d_traction{nullptr}; - rmm::device_uvector repel(n * 2, stream); - rmm::device_uvector attract(n * 2, stream); - rmm::device_uvector old_forces(n * 2, stream); - thrust::fill(rmm::exec_policy(stream), old_forces.begin(), old_forces.end(), 0.f); + rmm::device_uvector repel(n * 2, stream_view); + rmm::device_uvector attract(n * 2, stream_view); + rmm::device_uvector old_forces(n * 2, stream_view); + thrust::fill(rmm::exec_policy(stream_view), old_forces.begin(), old_forces.end(), 0.f); // FA2 requires degree + 1. - rmm::device_uvector mass(n, stream); - thrust::fill(rmm::exec_policy(stream), mass.begin(), mass.end(), 1); - rmm::device_uvector swinging(n, stream); - rmm::device_uvector traction(n, stream); + rmm::device_uvector mass(n, stream_view); + thrust::fill(rmm::exec_policy(stream_view), mass.begin(), mass.end(), 1); + rmm::device_uvector swinging(n, stream_view); + rmm::device_uvector traction(n, stream_view); d_repel = repel.data(); d_attract = attract.data(); @@ -78,20 +79,21 @@ void exact_fa2(raft::handle_t const &handle, d_swinging = swinging.data(); d_traction = traction.data(); - int random_state = 0; - random_vector(pos, n * 2, random_state, stream.value()); + int seed{0}; + raft::random::Rng rng(seed); + rng.uniform(pos, n * 2, -100.0f, 100.0f, handle.get_stream()); if (x_start && y_start) { - raft::copy(pos, x_start, n, stream.value()); - raft::copy(pos + n, y_start, n, stream.value()); + raft::copy(pos, x_start, n, stream_view.value()); + raft::copy(pos + n, y_start, n, stream_view.value()); } // Sort COO for coalesced memory access. 
- sort(graph, stream.value()); - CHECK_CUDA(stream.value()); + sort(graph, stream_view.value()); + CHECK_CUDA(stream_view.value()); graph.degree(d_mass, cugraph::DegreeDirection::OUT); - CHECK_CUDA(stream.value()); + CHECK_CUDA(stream_view.value()); const vertex_t *row = graph.src_indices; const vertex_t *col = graph.dst_indices; @@ -103,7 +105,7 @@ void exact_fa2(raft::handle_t const &handle, float jt = 0.f; if (outbound_attraction_distribution) { - int sum = thrust::reduce(rmm::exec_policy(stream), mass.begin(), mass.end()); + int sum = thrust::reduce(rmm::exec_policy(stream_view), mass.begin(), mass.end()); outbound_att_compensation = sum / (float)n; } @@ -114,14 +116,14 @@ void exact_fa2(raft::handle_t const &handle, for (int iter = 0; iter < max_iter; ++iter) { // Reset force arrays - thrust::fill(rmm::exec_policy(stream), repel.begin(), repel.end(), 0.f); - thrust::fill(rmm::exec_policy(stream), attract.begin(), attract.end(), 0.f); - thrust::fill(rmm::exec_policy(stream), swinging.begin(), swinging.end(), 0.f); - thrust::fill(rmm::exec_policy(stream), traction.begin(), traction.end(), 0.f); + thrust::fill(rmm::exec_policy(stream_view), repel.begin(), repel.end(), 0.f); + thrust::fill(rmm::exec_policy(stream_view), attract.begin(), attract.end(), 0.f); + thrust::fill(rmm::exec_policy(stream_view), swinging.begin(), swinging.end(), 0.f); + thrust::fill(rmm::exec_policy(stream_view), traction.begin(), traction.end(), 0.f); // Exact repulsion apply_repulsion( - pos, pos + n, d_repel, d_repel + n, d_mass, scaling_ratio, n, stream.value()); + pos, pos + n, d_repel, d_repel + n, d_mass, scaling_ratio, n, stream_view.value()); apply_gravity(pos, pos + n, @@ -132,7 +134,7 @@ void exact_fa2(raft::handle_t const &handle, strong_gravity_mode, scaling_ratio, n, - stream.value()); + stream_view.value()); apply_attraction(row, col, @@ -147,7 +149,7 @@ void exact_fa2(raft::handle_t const &handle, lin_log_mode, edge_weight_influence, outbound_att_compensation, - 
stream.value()); + stream_view.value()); compute_local_speed(d_repel, d_repel + n, @@ -159,11 +161,11 @@ void exact_fa2(raft::handle_t const &handle, d_swinging, d_traction, n, - stream.value()); + stream_view.value()); // Compute global swinging and traction values. - const float s = thrust::reduce(rmm::exec_policy(stream), swinging.begin(), swinging.end()); - const float t = thrust::reduce(rmm::exec_policy(stream), traction.begin(), traction.end()); + const float s = thrust::reduce(rmm::exec_policy(stream_view), swinging.begin(), swinging.end()); + const float t = thrust::reduce(rmm::exec_policy(stream_view), traction.begin(), traction.end()); adapt_speed(jitter_tolerance, &jt, &speed, &speed_efficiency, s, t, n); @@ -178,7 +180,7 @@ void exact_fa2(raft::handle_t const &handle, d_swinging, speed, n, - stream.value()); + stream_view.value()); if (callback) callback->on_epoch_end(pos); diff --git a/cpp/src/layout/utils.hpp b/cpp/src/layout/utils.hpp index 335b8ea986c..822459c7751 100644 --- a/cpp/src/layout/utils.hpp +++ b/cpp/src/layout/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,22 +23,6 @@ namespace cugraph { namespace detail { -struct prg { - __host__ __device__ float operator()(int n) - { - thrust::default_random_engine rng; - thrust::uniform_real_distribution dist(-100.f, 100.f); - rng.discard(n); - return dist(rng); - } -}; - -void random_vector(float *vec, int n, int seed, cudaStream_t stream) -{ - thrust::counting_iterator index(seed); - thrust::transform(rmm::exec_policy(stream)->on(stream), index, index + n, vec, prg()); -} - /** helper method to get multi-processor count parameter */ inline int getMultiProcessorCount() { diff --git a/cpp/src/link_prediction/jaccard.cu b/cpp/src/link_prediction/jaccard.cu index b93ad0bd0b3..071302aed9a 100644 --- a/cpp/src/link_prediction/jaccard.cu +++ b/cpp/src/link_prediction/jaccard.cu @@ -19,11 +19,13 @@ * @file jaccard.cu * ---------------------------------------------------------------------------**/ -#include #include #include #include +#include +#include + namespace cugraph { namespace detail { @@ -208,6 +210,7 @@ int jaccard(vertex_t n, weight_t *weight_s, weight_t *weight_j) { + rmm::cuda_stream_view stream_view; dim3 nthreads, nblocks; int y = 4; @@ -221,9 +224,9 @@ int jaccard(vertex_t n, // launch kernel jaccard_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - cudaDeviceSynchronize(); - fill(e, weight_i, weight_t{0.0}); + <<>>(n, csrPtr, csrInd, weight_in, work); + + thrust::fill(rmm::exec_policy(stream_view), weight_i, weight_i + e, weight_t{0.0}); // setup launch configuration nthreads.x = 32 / y; @@ -234,8 +237,8 @@ int jaccard(vertex_t n, nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; // launch kernel - jaccard_is - <<>>(n, csrPtr, csrInd, weight_in, work, weight_i, weight_s); + jaccard_is<<>>( + n, csrPtr, csrInd, weight_in, work, weight_i, weight_s); // setup launch configuration nthreads.x = min(e, edge_t{CUDA_MAX_KERNEL_THREADS}); @@ -247,7 +250,7 @@ int jaccard(vertex_t n, // launch kernel jaccard_jw - <<>>(e, weight_i, 
weight_s, weight_j); + <<>>(e, weight_i, weight_s, weight_j); return 0; } diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh index 44a6e9e83aa..81cad454a17 100644 --- a/cpp/src/sampling/random_walks.cuh +++ b/cpp/src/sampling/random_walks.cuh @@ -26,8 +26,8 @@ #include #include -#include #include +#include #include #include @@ -160,7 +160,7 @@ struct rrandom_gen_t { void generate_col_indices(device_vec_t& d_col_indx) const { thrust::transform_if( - rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + rmm::exec_policy(handle_.get_stream_view()), d_ptr_random_, d_ptr_random_ + num_paths_, // input1 d_ptr_out_degs_, // input2 @@ -264,7 +264,7 @@ struct col_indx_extract_ton(handle_.get_stream()), + rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_paths_), // input1 d_v_col_indx.begin(), // input2 @@ -376,7 +376,7 @@ struct random_walker_t { // intialize path sizes to 1, as they contain at least one vertex each: // the initial set: d_src_init_v; // - thrust::copy_n(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::copy_n(rmm::exec_policy(handle_.get_stream_view()), thrust::make_constant_iterator(1), num_paths_, d_sizes.begin()); @@ -390,7 +390,7 @@ struct random_walker_t { auto map_it_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), dlambda); - thrust::scatter(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::scatter(rmm::exec_policy(handle_.get_stream_view()), d_src_init_v.begin(), d_src_init_v.end(), map_it_begin, @@ -485,7 +485,7 @@ struct random_walker_t { bool all_paths_stopped(device_vec_t const& d_crt_out_degs) const { auto how_many_stopped = - thrust::count_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::count_if(rmm::exec_policy(handle_.get_stream_view()), d_crt_out_degs.begin(), d_crt_out_degs.end(), [] __device__(auto crt_out_deg) { return 
crt_out_deg == 0; }); @@ -517,19 +517,17 @@ struct random_walker_t { return (col_indx >= ptr_d_sizes[row_indx] - 1); }; - auto new_end_v = - thrust::remove_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), - d_coalesced_v.begin(), - d_coalesced_v.end(), - thrust::make_counting_iterator(0), - predicate_v); + auto new_end_v = thrust::remove_if(rmm::exec_policy(handle_.get_stream_view()), + d_coalesced_v.begin(), + d_coalesced_v.end(), + thrust::make_counting_iterator(0), + predicate_v); - auto new_end_w = - thrust::remove_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), - d_coalesced_w.begin(), - d_coalesced_w.end(), - thrust::make_counting_iterator(0), - predicate_w); + auto new_end_w = thrust::remove_if(rmm::exec_policy(handle_.get_stream_view()), + d_coalesced_w.begin(), + d_coalesced_w.end(), + thrust::make_counting_iterator(0), + predicate_w); handle_.get_stream_view().synchronize(); @@ -565,7 +563,7 @@ struct random_walker_t { auto map_it_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), dlambda); - thrust::gather(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::gather(rmm::exec_policy(handle_.get_stream_view()), map_it_begin, map_it_begin + nelems, d_src.begin(), @@ -612,7 +610,7 @@ struct random_walker_t { auto map_it_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), dlambda); - thrust::scatter_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::scatter_if(rmm::exec_policy(handle_.get_stream_view()), d_src.begin(), d_src.end(), map_it_begin, @@ -651,7 +649,7 @@ struct random_walker_t { device_vec_t& d_sizes) const { thrust::transform_if( - rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + rmm::exec_policy(handle_.get_stream_view()), d_sizes.begin(), d_sizes.end(), // input d_crt_out_degs.begin(), // stencil @@ -669,12 +667,12 @@ struct random_walker_t { void init_padding(device_vec_t& d_coalesced_v, 
device_vec_t& d_coalesced_w) const { - thrust::fill(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), d_coalesced_v.begin(), d_coalesced_v.end(), vertex_padding_value_); - thrust::fill(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::fill(rmm::exec_policy(handle_.get_stream_view()), d_coalesced_w.begin(), d_coalesced_w.end(), weight_padding_value_); @@ -742,13 +740,12 @@ random_walks_impl(raft::handle_t const& handle, vertex_t num_vertices = graph.get_number_of_vertices(); - auto how_many_valid = - thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - d_v_start.begin(), - d_v_start.end(), - [num_vertices] __device__(auto crt_vertex) { - return (crt_vertex >= 0) && (crt_vertex < num_vertices); - }); + auto how_many_valid = thrust::count_if(rmm::exec_policy(handle.get_stream_view()), + d_v_start.begin(), + d_v_start.end(), + [num_vertices] __device__(auto crt_vertex) { + return (crt_vertex >= 0) && (crt_vertex < num_vertices); + }); CUGRAPH_EXPECTS(static_cast(how_many_valid) == d_v_start.size(), "Invalid set of starting vertices."); @@ -912,12 +909,11 @@ struct coo_convertor_t { // and edge_paths_sz == 0 don't contribute // anything): // - auto new_end_it = - thrust::copy_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), - d_sizes.begin(), - d_sizes.end(), - d_sz_w_scan.begin(), - [] __device__(auto sz_value) { return sz_value > 1; }); + auto new_end_it = thrust::copy_if(rmm::exec_policy(handle_.get_stream_view()), + d_sizes.begin(), + d_sizes.end(), + d_sz_w_scan.begin(), + [] __device__(auto sz_value) { return sz_value > 1; }); // resize to new_end: // @@ -929,7 +925,7 @@ struct coo_convertor_t { // edge_path_sz = (vertex_path_sz-1): // thrust::transform_exclusive_scan( - rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + rmm::exec_policy(handle_.get_stream_view()), d_sz_w_scan.begin(), 
d_sz_w_scan.end(), d_sz_w_scan.begin(), @@ -944,10 +940,8 @@ struct coo_convertor_t { device_const_vector_view& d_sizes) const { device_vec_t d_scan(num_paths_, handle_.get_stream()); - thrust::inclusive_scan(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), - d_sizes.begin(), - d_sizes.end(), - d_scan.begin()); + thrust::inclusive_scan( + rmm::exec_policy(handle_.get_stream_view()), d_sizes.begin(), d_sizes.end(), d_scan.begin()); index_t total_sz{0}; CUDA_TRY(cudaMemcpy( @@ -957,7 +951,7 @@ struct coo_convertor_t { // initialize stencil to all 1's: // - thrust::copy_n(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::copy_n(rmm::exec_policy(handle_.get_stream_view()), thrust::make_constant_iterator(1), d_stencil.size(), d_stencil.begin()); @@ -967,7 +961,7 @@ struct coo_convertor_t { // and the next one starts, hence there cannot be an edge // between a path ending vertex and next path starting vertex; // - thrust::scatter(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::scatter(rmm::exec_policy(handle_.get_stream_view()), thrust::make_constant_iterator(0), thrust::make_constant_iterator(0) + num_paths_, d_scan.begin(), @@ -990,7 +984,7 @@ struct coo_convertor_t { // in stencil is not 0; (if it is, there's no "next" // or dst index, because the path has ended); // - thrust::copy_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::copy_if(rmm::exec_policy(handle_.get_stream_view()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(total_sz_v - 1), valid_src_indx.begin(), @@ -1009,7 +1003,7 @@ struct coo_convertor_t { // generated at the previous step; // thrust::transform( - rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + rmm::exec_policy(handle_.get_stream_view()), valid_src_indx.begin(), valid_src_indx.end(), thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_dst_v.begin())), // start_zip @@ -1134,12 +1128,12 @@ 
query_rw_sizes_offsets(raft::handle_t const& handle, index_t num_paths, index_t rmm::device_uvector d_weight_sizes(num_paths, handle.get_stream()); rmm::device_uvector d_weight_offsets(num_paths, handle.get_stream()); - thrust::exclusive_scan(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::exclusive_scan(rmm::exec_policy(handle.get_stream_view()), ptr_d_sizes, ptr_d_sizes + num_paths, d_vertex_offsets.begin()); - thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::transform(rmm::exec_policy(handle.get_stream_view()), ptr_d_sizes, ptr_d_sizes + num_paths, d_weight_sizes.begin(), @@ -1147,7 +1141,7 @@ query_rw_sizes_offsets(raft::handle_t const& handle, index_t num_paths, index_t handle.get_stream_view().synchronize(); - thrust::exclusive_scan(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::exclusive_scan(rmm::exec_policy(handle.get_stream_view()), d_weight_sizes.begin(), d_weight_sizes.end(), d_weight_offsets.begin()); diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu index 9f683af8209..93bb0a69d23 100644 --- a/cpp/src/structure/graph.cu +++ b/cpp/src/structure/graph.cu @@ -19,6 +19,7 @@ #include #include +#include namespace { @@ -26,11 +27,11 @@ template void degree_from_offsets(vertex_t number_of_vertices, edge_t const *offsets, edge_t *degree, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { // Computes out-degree for x = 0 and x = 2 thrust::for_each( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream_view), thrust::make_counting_iterator(0), thrust::make_counting_iterator(number_of_vertices), [offsets, degree] __device__(vertex_t v) { degree[v] = offsets[v + 1] - offsets[v]; }); @@ -42,15 +43,15 @@ void degree_from_vertex_ids(const raft::handle_t *handle, edge_t number_of_edges, vertex_t const *indices, edge_t *degree, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { - 
thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(stream_view), thrust::make_counting_iterator(0), thrust::make_counting_iterator(number_of_edges), [indices, degree] __device__(edge_t e) { atomicAdd(degree + indices[e], 1); }); if ((handle != nullptr) && (handle->comms_initialized())) { auto &comm = handle->get_comms(); - comm.allreduce(degree, degree, number_of_vertices, raft::comms::op_t::SUM, stream); + comm.allreduce(degree, degree, number_of_vertices, raft::comms::op_t::SUM, stream_view.value()); } } @@ -118,7 +119,7 @@ void GraphCompressedSparseBaseView::degree(ET *degree, DegreeDirecti // (e.g. if you have a CSC and you want in-degree (x=1) then pass // the offsets/indices and request an out-degree (x=2)) // - cudaStream_t stream{nullptr}; + rmm::cuda_stream_view stream_view; if (direction != DegreeDirection::IN) { if ((GraphViewBase::handle != nullptr) && @@ -127,7 +128,8 @@ void GraphCompressedSparseBaseView::degree(ET *degree, DegreeDirecti // source indexing for // the allreduce to work } - degree_from_offsets(GraphViewBase::number_of_vertices, offsets, degree, stream); + degree_from_offsets( + GraphViewBase::number_of_vertices, offsets, degree, stream_view); } if (direction != DegreeDirection::OUT) { @@ -136,7 +138,7 @@ void GraphCompressedSparseBaseView::degree(ET *degree, DegreeDirecti GraphViewBase::number_of_edges, indices, degree, - stream); + stream_view); } } diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh index 4eeab9376fa..76e8dc32611 100644 --- a/cpp/src/utilities/graph_utils.cuh +++ b/cpp/src/utilities/graph_utils.cuh @@ -16,8 +16,9 @@ #include #include -#include #include +#include +#include #include #include @@ -80,20 +81,6 @@ __inline__ __device__ value_t parallel_prefix_sum(count_t n, index_t const *ind, return last; } -// dot -template -T dot(size_t n, T *x, T *y) -{ - cudaStream_t stream{nullptr}; - T result = 
thrust::inner_product(rmm::exec_policy(stream)->on(stream), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::device_pointer_cast(y), - 0.0f); - CHECK_CUDA(stream); - return result; -} - // axpy template struct axpy_functor : public thrust::binary_function { @@ -105,8 +92,8 @@ struct axpy_functor : public thrust::binary_function { template void axpy(size_t n, T a, T *x, T *y) { - cudaStream_t stream{nullptr}; - thrust::transform(rmm::exec_policy(stream)->on(stream), + rmm::cuda_stream_view stream_view; + thrust::transform(rmm::exec_policy(stream_view), thrust::device_pointer_cast(x), thrust::device_pointer_cast(x + n), thrust::device_pointer_cast(y), @@ -124,9 +111,9 @@ struct square { template T nrm2(size_t n, T *x) { - cudaStream_t stream{nullptr}; + rmm::cuda_stream_view stream_view; T init = 0; - T result = std::sqrt(thrust::transform_reduce(rmm::exec_policy(stream)->on(stream), + T result = std::sqrt(thrust::transform_reduce(rmm::exec_policy(stream_view), thrust::device_pointer_cast(x), thrust::device_pointer_cast(x + n), square(), @@ -139,8 +126,8 @@ T nrm2(size_t n, T *x) template T nrm1(size_t n, T *x) { - cudaStream_t stream{nullptr}; - T result = thrust::reduce(rmm::exec_policy(stream)->on(stream), + rmm::cuda_stream_view stream_view; + T result = thrust::reduce(rmm::exec_policy(stream_view), thrust::device_pointer_cast(x), thrust::device_pointer_cast(x + n)); - CHECK_CUDA(stream); + CHECK_CUDA(stream_view.value()); @@ -150,8 +137,8 @@ T nrm1(size_t n, T *x) template void scal(size_t n, T val, T *x) { - cudaStream_t stream{nullptr}; - thrust::transform(rmm::exec_policy(stream)->on(stream), + rmm::cuda_stream_view stream_view; + thrust::transform(rmm::exec_policy(stream_view), thrust::device_pointer_cast(x), thrust::device_pointer_cast(x + n), thrust::make_constant_iterator(val), @@ -163,8 +150,8 @@ void scal(size_t n, T val, T *x) template void addv(size_t n, T val, T *x) { - cudaStream_t stream{nullptr}; -
thrust::transform(rmm::exec_policy(stream)->on(stream), + rmm::cuda_stream_view stream_view; + thrust::transform(rmm::exec_policy(stream_view), thrust::device_pointer_cast(x), thrust::device_pointer_cast(x + n), thrust::make_constant_iterator(val), @@ -176,19 +163,19 @@ void addv(size_t n, T val, T *x) template void fill(size_t n, T *x, T value) { - cudaStream_t stream{nullptr}; - thrust::fill(rmm::exec_policy(stream)->on(stream), + rmm::cuda_stream_view stream_view; + thrust::fill(rmm::exec_policy(stream_view), thrust::device_pointer_cast(x), thrust::device_pointer_cast(x + n), value); - CHECK_CUDA(stream); + CHECK_CUDA(stream_view.value()); } template void scatter(size_t n, T *src, T *dst, M *map) { - cudaStream_t stream{nullptr}; - thrust::scatter(rmm::exec_policy(stream)->on(stream), + rmm::cuda_stream_view stream_view; + thrust::scatter(rmm::exec_policy(stream_view), thrust::device_pointer_cast(src), thrust::device_pointer_cast(src + n), thrust::device_pointer_cast(map), @@ -216,8 +203,8 @@ void copy(size_t n, T *x, T *res) { thrust::device_ptr dev_ptr(x); thrust::device_ptr res_ptr(res); - cudaStream_t stream{nullptr}; - thrust::copy_n(rmm::exec_policy(stream)->on(stream), dev_ptr, n, res_ptr); + rmm::cuda_stream_view stream_view; + thrust::copy_n(rmm::exec_policy(stream_view), dev_ptr, n, res_ptr); - CHECK_CUDA(stream); + CHECK_CUDA(stream_view.value()); } @@ -236,8 +223,8 @@ struct dangling_functor : public thrust::unary_function { template void update_dangling_nodes(size_t n, T *dangling_nodes, T damping_factor) { - cudaStream_t stream{nullptr}; - thrust::transform_if(rmm::exec_policy(stream)->on(stream), + rmm::cuda_stream_view stream_view; + thrust::transform_if(rmm::exec_policy(stream_view), thrust::device_pointer_cast(dangling_nodes), thrust::device_pointer_cast(dangling_nodes + n), thrust::device_pointer_cast(dangling_nodes), @@ -332,8 +319,8 @@ void HT_matrix_csc_coo(const IndexType n, ValueType *val, ValueType *bookmark) { - cudaStream_t stream{nullptr}; - rmm::device_vector degree(n,
0); + rmm::cuda_stream_view stream_view; + rmm::device_uvector degree(n, stream_view); dim3 nthreads, nblocks; nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS); @@ -343,8 +330,8 @@ void HT_matrix_csc_coo(const IndexType n, nblocks.y = 1; nblocks.z = 1; degree_coo - <<>>(n, e, csrInd, degree.data().get()); - CHECK_CUDA(stream); + <<>>(n, e, csrInd, degree.data()); + CHECK_CUDA(stream_view.value()); int y = 4; nthreads.x = 32 / y; nthreads.y = 32 / nthreads.x; nthreads.z = 8; nblocks.x = 1; nblocks.y = 1; nblocks.z = min((n + nthreads.z - 1) / nthreads.z, CUDA_MAX_BLOCKS); // 1; equi_prob3 - <<>>(n, e, csrPtr, csrInd, val, degree.data().get()); - CHECK_CUDA(stream); + <<>>(n, e, csrPtr, csrInd, val, degree.data()); + CHECK_CUDA(stream_view.value()); ValueType a = 0.0; fill(n, bookmark, a); @@ -368,96 +355,8 @@ void HT_matrix_csc_coo(const IndexType n, nblocks.y = 1; nblocks.z = 1; flag_leafs_kernel - <<>>(n, degree.data().get(), bookmark); - CHECK_CUDA(stream); -} - -template -__global__ void permute_vals_kernel(const IndexType e, - IndexType *perm, - ValueType *in, - ValueType *out) -{ - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) - out[i] = in[perm[i]]; -} - -template -void permute_vals( - const IndexType e, IndexType *perm, ValueType *in, ValueType *out, cudaStream_t stream = nullptr) -{ - int nthreads = min(e, CUDA_MAX_KERNEL_THREADS); - int nblocks = min((e + nthreads - 1) / nthreads, CUDA_MAX_BLOCKS); - permute_vals_kernel<<>>(e, perm, in, out); -} - -// This will remove duplicate along with sorting -// This will sort the COO Matrix, row will be sorted and each column of same row will be sorted.
-template -void remove_duplicate( - IndexType *src, IndexType *dest, ValueType *val, SizeT &nnz, cudaStream_t stream = nullptr) -{ - if (val != NULL) { - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), - thrust::raw_pointer_cast(val), - thrust::raw_pointer_cast(val) + nnz, - thrust::make_zip_iterator(thrust::make_tuple( - thrust::raw_pointer_cast(src), thrust::raw_pointer_cast(dest)))); - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), - thrust::raw_pointer_cast(dest), - thrust::raw_pointer_cast(dest + nnz), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::raw_pointer_cast(src), thrust::raw_pointer_cast(val)))); - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), - thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(src + nnz), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::raw_pointer_cast(dest), thrust::raw_pointer_cast(val)))); - - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - typedef thrust::tuple ZipIteratorTuple; - typedef thrust::zip_iterator ZipZipIterator; - - ZipZipIterator newEnd = - thrust::unique(rmm::exec_policy(stream)->on(stream), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::raw_pointer_cast(src), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::raw_pointer_cast(dest), thrust::raw_pointer_cast(val))))), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::raw_pointer_cast(src + nnz), - thrust::make_zip_iterator(thrust::make_tuple(dest + nnz, val + nnz))))); - - ZipIteratorTuple endTuple = newEnd.get_iterator_tuple(); - IndexType *row_end = thrust::get<0>(endTuple); - - nnz = ((size_t)row_end - (size_t)src) / sizeof(IndexType); - } else { - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), - thrust::raw_pointer_cast(dest), - thrust::raw_pointer_cast(dest + nnz), - thrust::raw_pointer_cast(src)); - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), - thrust::raw_pointer_cast(src), - 
thrust::raw_pointer_cast(src + nnz), - thrust::raw_pointer_cast(dest)); - - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator newEnd = - thrust::unique(rmm::exec_policy(stream)->on(stream), - thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(dest))), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::raw_pointer_cast(src + nnz), thrust::raw_pointer_cast(dest + nnz)))); - - IteratorTuple endTuple = newEnd.get_iterator_tuple(); - IndexType *row_end = thrust::get<0>(endTuple); - - nnz = ((size_t)row_end - (size_t)src) / sizeof(IndexType); - } + <<>>(n, degree.data(), bookmark); + CHECK_CUDA(stream_view.value()); } template @@ -500,12 +399,12 @@ bool has_negative_val(DistType *arr, size_t n) { // custom kernel with boolean bitwise reduce may be // faster. - cudaStream_t stream{nullptr}; - DistType result = *thrust::min_element(rmm::exec_policy(stream)->on(stream), + rmm::cuda_stream_view stream_view; + DistType result = *thrust::min_element(rmm::exec_policy(stream_view), thrust::device_pointer_cast(arr), thrust::device_pointer_cast(arr + n)); - CHECK_CUDA(stream); + CHECK_CUDA(stream_view.value()); return (result < 0); }