From 092cf49f66fdf4f7676452ae94f9a5ccc5ea3c3b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 26 Jan 2022 15:33:20 -0800 Subject: [PATCH 01/60] enable multi-stream execution and overlapping communication with computation (currently with the temporary mechanism to support stream priorities, eventually, rmm should be updated to support this) --- .../copy_v_transform_reduce_in_out_nbr.cuh | 197 +++++++++++++----- cpp/tests/link_analysis/mg_pagerank_test.cpp | 2 +- 2 files changed, 141 insertions(+), 58 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh index 6251c269697..a98013ac996 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh @@ -475,6 +475,17 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, T init, VertexValueOutputIterator vertex_value_output_first) { +// FIXME: for temporary testing +#if 1 + cudaStream_t high_stream0{}; + cudaStream_t high_stream1{}; + cudaStream_t mid_stream{}; + cudaStream_t low_stream{}; + CUDA_TRY(cudaStreamCreateWithPriority(&high_stream0, cudaStreamNonBlocking, -5)); + CUDA_TRY(cudaStreamCreateWithPriority(&high_stream1, cudaStreamNonBlocking, -5)); + CUDA_TRY(cudaStreamCreateWithPriority(&mid_stream, cudaStreamNonBlocking, -3)); + CUDA_TRY(cudaStreamCreateWithPriority(&low_stream, cudaStreamNonBlocking, 0)); +#endif constexpr auto update_major = (in == GraphViewType::is_adj_matrix_transposed); using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; @@ -559,34 +570,87 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, output_buffer = vertex_value_output_first; } auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + std::optional> stream_pool_indices{std::nullopt}; if (segment_offsets) { - // FIXME: we may further improve performance by 1) concurrently running kernels on different - // segments; 2) individually tuning block sizes for different segments; and 3) adding one more - // segment for very high degree vertices and running segmented reduction static_assert(detail::num_sparse_segments_per_vertex_partition == 3); - if ((*segment_offsets)[1] > 0) { - raft::grid_1d_block_t update_grid((*segment_offsets)[1], - detail::copy_v_transform_reduce_nbr_for_all_block_size, - handle.get_device_properties().maxGridSize[0]); - detail::for_all_major_for_all_nbr_high_degree - <<>>( + + auto num_segments = detail::num_sparse_segments_per_vertex_partition + + (matrix_partition.get_dcs_nzd_vertex_count() ? size_t{1} : size_t{0}); + if (GraphViewType::is_multi_gpu && handle.get_stream_pool_size() >= num_segments) { + stream_pool_indices = std::vector(num_segments); + std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); + handle.sync_stream(); + } + + // FIXME: we may further improve performance by 1) individually tuning block sizes for + // different segments; and 2) adding one more segment for very high degree vertices and + // running segmented reduction + if (matrix_partition.get_dcs_nzd_vertex_count()) { + auto exec_stream = stream_pool_indices + ? 
rmm::cuda_stream_view{high_stream0} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((*stream_pool_indices)[0]) */ + : handle.get_stream(); + if constexpr (update_major) { // this is necessary as we don't visit every vertex in the + // hypersparse segment in + // for_all_major_for_all_nbr_hypersparse + thrust::fill(rmm::exec_policy(exec_stream), + output_buffer + (*segment_offsets)[3], + output_buffer + (*segment_offsets)[4], + major_init); + } + if (*(matrix_partition.get_dcs_nzd_vertex_count()) > 0) { + raft::grid_1d_thread_t update_grid(*(matrix_partition.get_dcs_nzd_vertex_count()), + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[3]; } + detail::for_all_major_for_all_nbr_hypersparse + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[3], + matrix_partition_row_value_input, + matrix_partition_col_value_input, + segment_output_buffer, + e_op, + major_init); + } + } + if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + auto exec_stream = stream_pool_indices ? rmm::cuda_stream_view{high_stream1} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() + ? 1 : 0]) */ + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[2]; } + detail::for_all_major_for_all_nbr_low_degree + <<>>( matrix_partition, - matrix_partition.get_major_first(), - matrix_partition.get_major_first() + (*segment_offsets)[1], + matrix_partition.get_major_first() + (*segment_offsets)[2], + matrix_partition.get_major_first() + (*segment_offsets)[3], matrix_partition_row_value_input, matrix_partition_col_value_input, - output_buffer, + segment_output_buffer, e_op, major_init); } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + auto exec_stream = stream_pool_indices ? rmm::cuda_stream_view{mid_stream} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() + ? 
2 : 1]) */ + : handle.get_stream(); raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); auto segment_output_buffer = output_buffer; if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[1]; } detail::for_all_major_for_all_nbr_mid_degree - <<>>( + <<>>( matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[1], matrix_partition.get_major_first() + (*segment_offsets)[2], @@ -596,49 +660,26 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, e_op, major_init); } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], - detail::copy_v_transform_reduce_nbr_for_all_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[2]; } - detail::for_all_major_for_all_nbr_low_degree - <<>>( + if ((*segment_offsets)[1] > 0) { + auto exec_stream = stream_pool_indices ? rmm::cuda_stream_view{low_stream} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() + ? 3 : 2]) */ + : handle.get_stream(); + raft::grid_1d_block_t update_grid((*segment_offsets)[1], + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_high_degree + <<>>( matrix_partition, - matrix_partition.get_major_first() + (*segment_offsets)[2], - matrix_partition.get_major_first() + (*segment_offsets)[3], + matrix_partition.get_major_first(), + matrix_partition.get_major_first() + (*segment_offsets)[1], matrix_partition_row_value_input, matrix_partition_col_value_input, - segment_output_buffer, + output_buffer, e_op, major_init); } - if (matrix_partition.get_dcs_nzd_vertex_count()) { - if constexpr (update_major) { // this is necessary as we don't visit every vertex in the - // hypersparse segment in - // for_all_major_for_all_nbr_hypersparse - thrust::fill(handle.get_thrust_policy(), - output_buffer + (*segment_offsets)[3], - output_buffer + (*segment_offsets)[4], - major_init); - } - if (*(matrix_partition.get_dcs_nzd_vertex_count()) > 0) { - raft::grid_1d_thread_t update_grid(*(matrix_partition.get_dcs_nzd_vertex_count()), - detail::copy_v_transform_reduce_nbr_for_all_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[3]; } - detail::for_all_major_for_all_nbr_hypersparse - <<>>( - matrix_partition, - matrix_partition.get_major_first() + (*segment_offsets)[3], - matrix_partition_row_value_input, - matrix_partition_col_value_input, - segment_output_buffer, - e_op, - major_init); - } - } } else { if (matrix_partition.get_major_size() > 0) { raft::grid_1d_thread_t update_grid(matrix_partition.get_major_size(), @@ -666,13 +707,48 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - device_reduce(col_comm, - major_buffer_first, - vertex_value_output_first, - matrix_partition.get_major_size(), - raft::comms::op_t::SUM, - i, - handle.get_stream()); + if (segment_offsets && stream_pool_indices) { + if ((*segment_offsets).back() - (*segment_offsets)[3] > 
0) { + device_reduce(col_comm, + major_buffer_first + (*segment_offsets)[3], + vertex_value_output_first + (*segment_offsets)[3], + (*segment_offsets).back() - (*segment_offsets)[3], + raft::comms::op_t::SUM, + i, + high_stream0/* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[0]) */); + } + if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + device_reduce( + col_comm, major_buffer_first + (*segment_offsets)[2], vertex_value_output_first + (*segment_offsets)[2], (*segment_offsets)[3] - (*segment_offsets)[2], raft::comms::op_t::SUM, i, high_stream1 /* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() ? 1 : 0]) */); + } + if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + device_reduce( + col_comm, major_buffer_first + (*segment_offsets)[1], vertex_value_output_first + (*segment_offsets)[1], (*segment_offsets)[2] - (*segment_offsets)[1], raft::comms::op_t::SUM, i, mid_stream /* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() ? 2 : 1]) */); + } + if ((*segment_offsets)[1] > 0) { + device_reduce( + col_comm, major_buffer_first, vertex_value_output_first, (*segment_offsets)[1], raft::comms::op_t::SUM, i, low_stream /* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() ? 3 : 2]) */); + } + } else { + device_reduce(col_comm, + major_buffer_first, + vertex_value_output_first, + matrix_partition.get_major_size(), + raft::comms::op_t::SUM, + i, + handle.get_stream()); + } + } + + if (stream_pool_indices) { +#if 1 // FIXME: for temporary testing + CUDA_TRY(cudaStreamSynchronize(high_stream0)); + CUDA_TRY(cudaStreamSynchronize(high_stream1)); + CUDA_TRY(cudaStreamSynchronize(mid_stream)); + CUDA_TRY(cudaStreamSynchronize(low_stream)); +#else + handle.sync_stream_pool(*stream_pool_indices); +#endif } } @@ -738,6 +814,13 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } } } +// FIXME: for temporary testing +#if 1 + CUDA_TRY(cudaStreamDestroy(low_stream)); + CUDA_TRY(cudaStreamDestroy(mid_stream)); + CUDA_TRY(cudaStreamDestroy(high_stream1)); + CUDA_TRY(cudaStreamDestroy(high_stream0)); +#endif } } // namespace detail diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp index adcd0c94a8f..df264f2e0e1 100644 --- a/cpp/tests/link_analysis/mg_pagerank_test.cpp +++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp @@ -61,7 +61,7 @@ class Tests_MGPageRank { // 1. 
initialize handle

-    raft::handle_t handle{};
+    raft::handle_t handle(rmm::cuda_stream_per_thread, std::make_shared<rmm::cuda_stream_pool>());
     HighResClock hr_clock{};

     raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);

From 6102cd1b5f57f831cb6d47ea30c21e3b1e744cb0 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Thu, 27 Jan 2022 10:16:37 -0800
Subject: [PATCH 02/60] update group_by_and_count to not use reduce_by_key
 (which is expensive and also seems to have an issue with 2^31 or more
 elements)

---
 .../cugraph/utilities/shuffle_comm.cuh        | 64 +++++++++----------
 1 file changed, 31 insertions(+), 33 deletions(-)

diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh
index 5fd78dc00ee..3840de019fc 100644
--- a/cpp/include/cugraph/utilities/shuffle_comm.cuh
+++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh
@@ -22,6 +22,7 @@
 #include 

 #include 
+#include 
 #include 
 #include 
 #include 
@@ -36,6 +37,21 @@ namespace cugraph {

 namespace detail {

+template <typename GroupIdIterator>
+struct compute_group_id_count_pair_t {
+  GroupIdIterator group_id_first{};
+  GroupIdIterator group_id_last{};
+
+  __device__ thrust::tuple<int, size_t> operator()(size_t i) const
+  {
+    auto lower_it =
+      thrust::lower_bound(thrust::seq, group_id_first, group_id_last, static_cast<int>(i));
+    auto upper_it = thrust::upper_bound(thrust::seq, lower_it, group_id_last, static_cast<int>(i));
+    return thrust::make_tuple(static_cast<int>(i),
+                              static_cast<size_t>(thrust::distance(lower_it, upper_it)));
+  }
+};
+
 // inline to suppress a complaint about ODR violation
 inline std::tuple,
                   std::vector,
@@ -128,23 +144,14 @@ rmm::device_uvector<size_t> groupby_and_count(ValueIterator tx_value_first /* [I
     [value_to_group_id_op] __device__(auto value) { return value_to_group_id_op(value); });
   rmm::device_uvector<int> d_tx_dst_ranks(num_groups, stream_view);
   rmm::device_uvector<size_t> d_tx_value_counts(d_tx_dst_ranks.size(), stream_view);
-  auto last =
-    thrust::reduce_by_key(rmm::exec_policy(stream_view),
-                          group_id_first,
-                          group_id_first + thrust::distance(tx_value_first, tx_value_last),
-                          thrust::make_constant_iterator(size_t{1}),
-                          d_tx_dst_ranks.begin(),
-                          d_tx_value_counts.begin());
-  if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) {
-    rmm::device_uvector<size_t> d_counts(num_groups, stream_view);
-    thrust::fill(rmm::exec_policy(stream_view), d_counts.begin(), d_counts.end(), size_t{0});
-    thrust::scatter(rmm::exec_policy(stream_view),
-                    d_tx_value_counts.begin(),
-                    thrust::get<1>(last),
-                    d_tx_dst_ranks.begin(),
-                    d_counts.begin());
-    d_tx_value_counts = std::move(d_counts);
-  }
+  auto rank_count_pair_first = thrust::make_zip_iterator(
+    thrust::make_tuple(d_tx_dst_ranks.begin(), d_tx_value_counts.begin()));
+  thrust::tabulate(
+    rmm::exec_policy(stream_view),
+    rank_count_pair_first,
+    rank_count_pair_first + num_groups,
+    detail::compute_group_id_count_pair_t<decltype(group_id_first)>{
+      group_id_first, group_id_first + thrust::distance(tx_value_first, tx_value_last)});
   return d_tx_value_counts;
 }

@@ -169,22 +176,13 @@ rmm::device_uvector<size_t> groupby_and_count(VertexIterator tx_key_first /* [IN
     tx_key_first,
     [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); });
   rmm::device_uvector<int> d_tx_dst_ranks(num_groups, stream_view);
   rmm::device_uvector<size_t> d_tx_value_counts(d_tx_dst_ranks.size(), stream_view);
-  auto last = thrust::reduce_by_key(rmm::exec_policy(stream_view),
-                                    group_id_first,
-                                    group_id_first + thrust::distance(tx_key_first, tx_key_last),
-                                    thrust::make_constant_iterator(size_t{1}),
-                                    d_tx_dst_ranks.begin(),
-                                    d_tx_value_counts.begin());
-  if
(thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) { - rmm::device_uvector d_counts(num_groups, stream_view); - thrust::fill(rmm::exec_policy(stream_view), d_counts.begin(), d_counts.end(), size_t{0}); - thrust::scatter(rmm::exec_policy(stream_view), - d_tx_value_counts.begin(), - thrust::get<1>(last), - d_tx_dst_ranks.begin(), - d_counts.begin()); - d_tx_value_counts = std::move(d_counts); - } + auto rank_count_pair_first = thrust::make_zip_iterator( + thrust::make_tuple(d_tx_dst_ranks.begin(), d_tx_value_counts.begin())); + thrust::tabulate(rmm::exec_policy(stream_view), + rank_count_pair_first, + rank_count_pair_first + num_groups, + detail::compute_group_id_count_pair_t{ + group_id_first, group_id_first + thrust::distance(tx_key_first, tx_key_last)}); return d_tx_value_counts; } From 5146b198b4617396f8b11664df8736e4f6e374d7 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 27 Jan 2022 11:51:52 -0800 Subject: [PATCH 03/60] add time measurements (should be undone) --- cpp/src/structure/graph_impl.cuh | 34 +++++++++++++++ cpp/src/structure/renumber_edgelist_impl.cuh | 34 +++++++++++++++ cpp/tests/link_analysis/mg_pagerank_test.cpp | 22 ++++++++++ cpp/tests/utilities/test_graphs.hpp | 44 ++++++++++++++++++++ 4 files changed, 134 insertions(+) diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index e969bb4a6a3..6aeb41a915b 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -486,6 +486,13 @@ graph_tget_handle_ptr()->get_comms(); @@ -618,6 +625,10 @@ graph_t elapsed_total = time5 - time0; + std::chrono::duration elapsed0 = time1 - time0; + std::chrono::duration elapsed1 = time2 - time1; + std::chrono::duration elapsed2 = time3 - time2; + std::chrono::duration elapsed3 = time4 - time3; + std::chrono::duration elapsed4 = time5 - time4; + std::cout << "Graph constructor took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << ") ms." << std::endl; +#endif } template >> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check) { +#if 1 // FIXME: delete + handle.sync_stream(); + if constexpr (multi_gpu) { + handle.get_comms().barrier(); + } + auto time0 = std::chrono::steady_clock::now(); +#endif auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); auto const comm_rank = comm.get_rank(); @@ -613,6 +620,10 @@ renumber_edgelist( // 1. compute renumber map +#if 1 // FIXME: delete + handle.sync_stream(); + auto time1 = std::chrono::steady_clock::now(); +#endif auto [renumber_map_labels, vertex_partition_segment_offsets, num_unique_edge_majors, @@ -626,6 +637,10 @@ renumber_edgelist( // 2. 
initialize partition_t object, number_of_vertices, and number_of_edges for the coarsened // graph +#if 1 // FIXME: delete + handle.sync_stream(); + auto time2 = std::chrono::steady_clock::now(); +#endif auto vertex_counts = host_scalar_allgather( comm, static_cast(renumber_map_labels.size()), handle.get_stream()); std::vector vertex_partition_offsets(comm_size + 1, 0); @@ -649,6 +664,10 @@ renumber_edgelist( // FIXME: compare this hash based approach with a binary search based approach in both memory // footprint and execution time +#if 1 // FIXME: delete + handle.sync_stream(); + auto time3 = std::chrono::steady_clock::now(); +#endif { vertex_t max_matrix_partition_major_size{0}; for (size_t i = 0; i < edgelist_majors.size(); ++i) { @@ -696,6 +715,10 @@ renumber_edgelist( } } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time4 = std::chrono::steady_clock::now(); +#endif if ((partition.get_matrix_partition_minor_size() >= number_of_edges / comm_size) && edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part @@ -791,6 +814,17 @@ renumber_edgelist( handle.get_stream()); } } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time5 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_total = time5 - time0; + std::chrono::duration elapsed0 = time1 - time0; + std::chrono::duration elapsed1 = time2 - time1; + std::chrono::duration elapsed2 = time3 - time2; + std::chrono::duration elapsed3 = time4 - time3; + std::chrono::duration elapsed4 = time5 - time4; + std::cout << "Renumber took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << ") ms." << std::endl; +#endif return std::make_tuple( std::move(renumber_map_labels), diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp index adcd0c94a8f..4a6dd08dabd 100644 --- a/cpp/tests/link_analysis/mg_pagerank_test.cpp +++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp @@ -63,6 +63,9 @@ class Tests_MGPageRank raft::handle_t handle{}; HighResClock hr_clock{}; +#if 1 // FIXME: delete + auto time0 = std::chrono::steady_clock::now(); +#endif raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); auto& comm = handle.get_comms(); @@ -75,6 +78,25 @@ class Tests_MGPageRank } cugraph::partition_2d::subcomm_factory_t subcomm_factory(handle, row_comm_size); +#if 1 // FIXME: delete + { + rmm::device_uvector tx_ints(comm_size, handle.get_stream()); + rmm::device_uvector rx_ints(comm_size, handle.get_stream()); + std::vector tx_sizes(comm_size, size_t{1}); + std::vector tx_offsets(comm_size); + std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0}); + std::vector tx_ranks(comm_size); + std::iota(tx_ranks.begin(), tx_ranks.end(), int32_t{0}); + auto rx_sizes = tx_sizes; + auto rx_offsets = tx_offsets; + auto rx_ranks = tx_ranks; + handle.get_comms().device_multicast_sendrecv(tx_ints.data(), tx_sizes, tx_offsets, tx_ranks, rx_ints.data(), rx_sizes, rx_offsets, rx_ranks, handle.get_stream()); + handle.sync_stream(); + } + auto time1 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed = time1 - time0; + std::cout << "Handle initialization and 1st all-to-all took " << elapsed.count() * 1e3 << " ms." << std::endl; +#endif // 2. 
create MG graph diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 9fa4cee9f7a..8818d9633bf 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -147,6 +147,13 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { static_cast(std::numeric_limits::max()), "Invalid template parameter: (scale_, edge_factor_) too large for edge_t"); +#if 1 // FIXME: delete + handle.sync_stream(); + if constexpr (multi_gpu) { + handle.get_comms().barrier(); + } + auto time0 = std::chrono::steady_clock::now(); +#endif std::vector partition_ids(1); size_t num_partitions; @@ -191,6 +198,10 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { } } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time1 = std::chrono::steady_clock::now(); +#endif rmm::device_uvector src_v(0, handle.get_stream()); rmm::device_uvector dst_v(0, handle.get_stream()); auto weights_v = test_weighted @@ -247,13 +258,25 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { } } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time2 = std::chrono::steady_clock::now(); +#endif translate(handle, src_v, dst_v); +#if 1 // FIXME: delete + handle.sync_stream(); + auto time3 = std::chrono::steady_clock::now(); +#endif if (undirected_) std::tie(src_v, dst_v, weights_v) = cugraph::symmetrize_edgelist_from_triangular( handle, std::move(src_v), std::move(dst_v), std::move(weights_v)); +#if 1 // FIXME: delete + handle.sync_stream(); + auto time4 = std::chrono::steady_clock::now(); +#endif if (multi_gpu) { std::tie(store_transposed ? dst_v : src_v, store_transposed ? src_v : dst_v, weights_v) = cugraph::detail::shuffle_edgelist_by_gpu_id( @@ -263,6 +286,10 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { std::move(weights_v)); } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time5 = std::chrono::steady_clock::now(); +#endif rmm::device_uvector vertices_v(0, handle.get_stream()); for (size_t i = 0; i < partition_ids.size(); ++i) { auto id = partition_ids[i]; @@ -276,10 +303,27 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { partition_vertex_firsts[i]); } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time6 = std::chrono::steady_clock::now(); +#endif if constexpr (multi_gpu) { vertices_v = cugraph::detail::shuffle_vertices_by_gpu_id(handle, std::move(vertices_v)); } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time7 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_total = time7 - time0; + std::chrono::duration elapsed0 = time1 - time0; + std::chrono::duration elapsed1 = time2 - time1; + std::chrono::duration elapsed2 = time3 - time2; + std::chrono::duration elapsed3 = time4 - time3; + std::chrono::duration elapsed4 = time5 - time4; + std::chrono::duration elapsed5 = time6 - time5; + std::chrono::duration elapsed6 = time7 - time6; + std::cout << "Edge generation took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << "," << elapsed5.count() * 1e3 << "," << elapsed6.count() * 1e3 << ") ms." 
<< std::endl; +#endif return std::make_tuple( std::move(src_v), std::move(dst_v), From 965f0cddc0b2379b9480285d066824b8fbff89de Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 31 Jan 2022 11:32:52 -0800 Subject: [PATCH 04/60] cosmetic updates --- cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh index 508294c9e89..f93ee1a6ae5 100644 --- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh @@ -170,7 +170,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)); - if (col_comm_rank == i) { + if (i == col_comm_rank) { auto vertex_partition = vertex_partition_device_view_t( graph_view.get_vertex_partition_view()); @@ -365,7 +365,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, matrix_partition_device_view_t( graph_view.get_matrix_partition_view(size_t{0})); for (int i = 0; i < row_comm_size; ++i) { - if (row_comm_rank == i) { + if (i == row_comm_rank) { auto vertex_partition = vertex_partition_device_view_t( graph_view.get_vertex_partition_view()); From 7c02fcf0ee8661621b6e45d9e260058082ee31c9 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 31 Jan 2022 17:52:04 -0800 Subject: [PATCH 05/60] improve weak scaling behavior of renumber --- cpp/src/structure/renumber_edgelist_impl.cuh | 582 +++++++++++-------- 1 file changed, 348 insertions(+), 234 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 94ab7f3a495..2d501b6f1cc 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -53,240 +54,328 @@ compute_renumber_map(raft::handle_t const& handle, std::vector const& edgelist_minors, std::vector const& edgelist_edge_counts) { - // FIXME: compare this sort based approach with hash based approach in both speed and memory - // footprint +#if 1 // FIXME: delete + handle.sync_stream(); + if constexpr (multi_gpu) { + rmm::device_uvector dummy(1, handle.get_stream()); + handle.get_comms().allreduce( + dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); + } + auto time0 = std::chrono::steady_clock::now(); +#endif + rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); + vertex_t num_local_unique_edge_majors{0}; + vertex_t num_local_unique_edge_minors{0}; - // 1. acquire (unique major label, count) pairs + edge_t num_local_edges = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); - rmm::device_uvector major_labels(0, handle.get_stream()); - rmm::device_uvector major_counts(0, handle.get_stream()); - vertex_t num_local_unique_edge_majors{0}; - for (size_t i = 0; i < edgelist_majors.size(); ++i) { - rmm::device_uvector tmp_major_labels(0, handle.get_stream()); - rmm::device_uvector tmp_major_counts(0, handle.get_stream()); - { - rmm::device_uvector sorted_major_labels(edgelist_edge_counts[i], - handle.get_stream()); + // 1. 
if local_vertices.has_value() is false, keep unique vertices from edge majors as well (to + // construct local_vertices) + + rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); + if (!local_vertices) { + sorted_unique_majors.resize(num_local_edges, handle.get_stream()); + size_t major_offset{0}; + for (size_t i = 0; i < edgelist_majors.size(); ++i) { thrust::copy(handle.get_thrust_policy(), edgelist_majors[i], edgelist_majors[i] + edgelist_edge_counts[i], - sorted_major_labels.begin()); - // FIXME: better refactor this sort-count_if-reduce_by_key routine for reuse - thrust::sort( - handle.get_thrust_policy(), sorted_major_labels.begin(), sorted_major_labels.end()); - auto num_unique_labels = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(sorted_major_labels.size()), - [labels = sorted_major_labels.data()] __device__(auto i) { - return (i == 0) || (labels[i - 1] != labels[i]); - }); - tmp_major_labels.resize(num_unique_labels, handle.get_stream()); - tmp_major_counts.resize(tmp_major_labels.size(), handle.get_stream()); - thrust::reduce_by_key(handle.get_thrust_policy(), - sorted_major_labels.begin(), - sorted_major_labels.end(), - thrust::make_constant_iterator(edge_t{1}), - tmp_major_labels.begin(), - tmp_major_counts.begin()); + sorted_unique_majors.begin() + major_offset); + thrust::sort(handle.get_thrust_policy(), + sorted_unique_majors.begin() + major_offset, + sorted_unique_majors.begin() + major_offset + edgelist_edge_counts[i]); + major_offset += static_cast(thrust::distance( + sorted_unique_majors.begin() + major_offset, + thrust::unique(handle.get_thrust_policy(), + sorted_unique_majors.begin() + major_offset, + sorted_unique_majors.begin() + major_offset + edgelist_edge_counts[i]))); } - num_local_unique_edge_majors += static_cast(tmp_major_labels.size()); - - if (multi_gpu) { - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - rmm::device_uvector rx_major_labels(0, handle.get_stream()); - rmm::device_uvector rx_major_counts(0, handle.get_stream()); - auto rx_sizes = host_scalar_gather( - col_comm, tmp_major_labels.size(), static_cast(i), handle.get_stream()); - std::vector rx_displs{}; - if (static_cast(i) == col_comm_rank) { - rx_displs.assign(col_comm_size, size_t{0}); - std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); - rx_major_labels.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); - rx_major_counts.resize(rx_major_labels.size(), handle.get_stream()); - } - device_gatherv(col_comm, - thrust::make_zip_iterator( - thrust::make_tuple(tmp_major_labels.begin(), tmp_major_counts.begin())), - thrust::make_zip_iterator( - thrust::make_tuple(rx_major_labels.begin(), rx_major_counts.begin())), - tmp_major_labels.size(), - rx_sizes, - rx_displs, - static_cast(i), - handle.get_stream()); - if (static_cast(i) == col_comm_rank) { - major_labels = std::move(rx_major_labels); - major_counts = std::move(rx_major_counts); - } - } else { - assert(i == 0); - major_labels = std::move(tmp_major_labels); - major_counts = std::move(tmp_major_counts); + sorted_unique_majors.resize(major_offset, handle.get_stream()); + + if (edgelist_majors.size() > 1) { + thrust::sort( + handle.get_thrust_policy(), sorted_unique_majors.begin(), sorted_unique_majors.end()); + 
sorted_unique_majors.resize(thrust::distance(sorted_unique_majors.begin(), + thrust::unique(handle.get_thrust_policy(), + sorted_unique_majors.begin(), + sorted_unique_majors.end())), + handle.get_stream()); } - } - if (multi_gpu) { - // FIXME: better refactor this sort-count_if-reduce_by_key routine for reuse - thrust::sort_by_key( - handle.get_thrust_policy(), major_labels.begin(), major_labels.end(), major_counts.begin()); - auto num_unique_labels = thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(major_labels.size()), - [labels = major_labels.data()] __device__(auto i) { - return (i == 0) || (labels[i - 1] != labels[i]); - }); - rmm::device_uvector tmp_major_labels(num_unique_labels, handle.get_stream()); - rmm::device_uvector tmp_major_counts(tmp_major_labels.size(), handle.get_stream()); - thrust::reduce_by_key(handle.get_thrust_policy(), - major_labels.begin(), - major_labels.end(), - major_counts.begin(), - tmp_major_labels.begin(), - tmp_major_counts.begin()); - major_labels = std::move(tmp_major_labels); - major_counts = std::move(tmp_major_counts); + sorted_unique_majors.shrink_to_fit(handle.get_stream()); } - // 2. acquire unique minor labels + // 2. count unique edge minors. + // if local_vertices.has_value() is false, keep unique vertices from edge minors as well (to + // construct local_vertices) - std::vector minor_displs(edgelist_minors.size(), edge_t{0}); - std::partial_sum( - edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1); - rmm::device_uvector minor_labels(minor_displs.back() + edgelist_edge_counts.back(), - handle.get_stream()); - vertex_t minor_offset{0}; +#if 1 // FIXME: delete + handle.sync_stream(); + auto time1 = std::chrono::steady_clock::now(); +#endif + rmm::device_uvector sorted_unique_minors(num_local_edges, handle.get_stream()); + size_t minor_offset{0}; for (size_t i = 0; i < edgelist_minors.size(); ++i) { thrust::copy(handle.get_thrust_policy(), edgelist_minors[i], edgelist_minors[i] + edgelist_edge_counts[i], - minor_labels.begin() + minor_offset); + sorted_unique_minors.begin() + minor_offset); thrust::sort(handle.get_thrust_policy(), - minor_labels.begin() + minor_offset, - minor_labels.begin() + minor_offset + edgelist_edge_counts[i]); - minor_offset += thrust::distance( - minor_labels.begin() + minor_offset, + sorted_unique_minors.begin() + minor_offset, + sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]); + minor_offset += static_cast(thrust::distance( + sorted_unique_minors.begin() + minor_offset, thrust::unique(handle.get_thrust_policy(), - minor_labels.begin() + minor_offset, - minor_labels.begin() + minor_offset + edgelist_edge_counts[i])); + sorted_unique_minors.begin() + minor_offset, + sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]))); } - minor_labels.resize(minor_offset, handle.get_stream()); - thrust::sort(handle.get_thrust_policy(), minor_labels.begin(), minor_labels.end()); - minor_labels.resize( - thrust::distance( - minor_labels.begin(), - thrust::unique(handle.get_thrust_policy(), minor_labels.begin(), minor_labels.end())), - handle.get_stream()); - auto num_local_unique_edge_minors = static_cast(minor_labels.size()); - if (multi_gpu) { + sorted_unique_minors.resize(minor_offset, handle.get_stream()); + if (edgelist_minors.size() > 1) { + thrust::sort( + handle.get_thrust_policy(), sorted_unique_minors.begin(), sorted_unique_minors.end()); + 
sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + sorted_unique_minors.begin(), + sorted_unique_minors.end())), + handle.get_stream()); + } + + num_local_unique_edge_minors = static_cast(sorted_unique_minors.size()); + + if (local_vertices) { sorted_unique_minors.resize(0, handle.get_stream()); } + sorted_unique_minors.shrink_to_fit(handle.get_stream()); + + // 3. update sorted_local_vertices. + // if local_vertices.has_value() is false, reconstruct local_vertices first + +#if 1 // FIXME: delete + handle.sync_stream(); + auto time2 = std::chrono::steady_clock::now(); +#endif + if (!local_vertices) { + sorted_local_vertices.resize(sorted_unique_majors.size() + sorted_unique_minors.size(), + handle.get_stream()); + + thrust::merge(handle.get_thrust_policy(), + sorted_unique_majors.begin(), + sorted_unique_majors.end(), + sorted_unique_minors.begin(), + sorted_unique_minors.end(), + sorted_local_vertices.begin()); + + sorted_unique_majors.resize(0, handle.get_stream()); + sorted_unique_majors.shrink_to_fit(handle.get_stream()); + sorted_unique_minors.resize(0, handle.get_stream()); + sorted_unique_minors.shrink_to_fit(handle.get_stream()); + + sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + sorted_local_vertices.begin(), + sorted_local_vertices.end())), + handle.get_stream()); + sorted_local_vertices.shrink_to_fit(handle.get_stream()); + + if constexpr (multi_gpu) { + sorted_local_vertices = + cugraph::detail::shuffle_vertices_by_gpu_id(handle, std::move(sorted_local_vertices)); + } + } else { + sorted_local_vertices = std::move(*local_vertices); + thrust::sort( + handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); + } + + // 4. 
compute global degrees for the sorted local vertices, and count unique edge majors on the + // way + +#if 1 // FIXME: delete + handle.sync_stream(); + auto time3 = std::chrono::steady_clock::now(); +#endif + rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); + std::optional> stream_pool_indices{std::nullopt}; // FIXME: move this inside the if statement + if constexpr (multi_gpu) { auto& comm = handle.get_comms(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + assert(edgelist_majors.size() == col_comm_size); - if (row_comm_size > 1) { - rmm::device_uvector rx_minor_labels(0, handle.get_stream()); - std::tie(rx_minor_labels, std::ignore) = groupby_gpuid_and_shuffle_values( - row_comm, - minor_labels.begin(), - minor_labels.end(), - [key_func = detail::compute_gpu_id_from_vertex_t{row_comm_size}] __device__( - auto val) { return key_func(val); }, + auto edge_partition_major_sizes = + host_scalar_allgather(col_comm, sorted_local_vertices.size(), handle.get_stream()); + + if ((col_comm_size >= 2) && (handle.get_stream_pool_size() >= 2)) { + auto vertex_edge_counts = host_scalar_allreduce( + comm, + thrust::make_tuple(static_cast(sorted_local_vertices.size()), num_local_edges), + raft::comms::op_t::SUM, handle.get_stream()); - thrust::sort(handle.get_thrust_policy(), rx_minor_labels.begin(), rx_minor_labels.end()); - rx_minor_labels.resize(thrust::distance(rx_minor_labels.begin(), - thrust::unique(handle.get_thrust_policy(), - rx_minor_labels.begin(), - rx_minor_labels.end())), - handle.get_stream()); - minor_labels = std::move(rx_minor_labels); + // memory footprint vs parallelism trade-off + // peak memory requirement per loop is + // min( + // (E / (comm_size * col_comm_size)) * sizeof(vertex_t) * 2, + // (E / (comm_size * col_comm_size)) * sizeof(vertex_t) + + // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)), + // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)) * 2 + // ) + // and limit temporary memory requirement to (E / comm_size) * sizeof(vertex_t) * 2 + auto avg_vertex_degree = thrust::get<0>(vertex_edge_counts) > 0 + ? static_cast(thrust::get<1>(vertex_edge_counts)) / + static_cast(thrust::get<0>(vertex_edge_counts)) + : double{0.0}; + auto num_streams = + std::min(static_cast(avg_vertex_degree * + (static_cast(sizeof(vertex_t)) / + static_cast(sizeof(vertex_t) + sizeof(edge_t)))), + static_cast( + std::min(static_cast(col_comm_size), handle.get_stream_pool_size()))); + if (num_streams >= 2) { + stream_pool_indices = std::vector(num_streams); + std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); + handle.sync_stream(); + } } - } - minor_labels.shrink_to_fit(handle.get_stream()); - - // 3. 
merge major and minor labels and vertex labels - - rmm::device_uvector merged_labels(major_labels.size() + minor_labels.size(), - handle.get_stream()); - rmm::device_uvector merged_counts(merged_labels.size(), handle.get_stream()); - thrust::merge_by_key(handle.get_thrust_policy(), - major_labels.begin(), - major_labels.end(), - minor_labels.begin(), - minor_labels.end(), - major_counts.begin(), - thrust::make_constant_iterator(edge_t{0}), - merged_labels.begin(), - merged_counts.begin()); - - major_labels.resize(0, handle.get_stream()); - major_counts.resize(0, handle.get_stream()); - minor_labels.resize(0, handle.get_stream()); - major_labels.shrink_to_fit(handle.get_stream()); - major_counts.shrink_to_fit(handle.get_stream()); - minor_labels.shrink_to_fit(handle.get_stream()); - - rmm::device_uvector labels(merged_labels.size(), handle.get_stream()); - rmm::device_uvector counts(labels.size(), handle.get_stream()); - auto pair_it = thrust::reduce_by_key(handle.get_thrust_policy(), - merged_labels.begin(), - merged_labels.end(), - merged_counts.begin(), - labels.begin(), - counts.begin()); - merged_labels.resize(0, handle.get_stream()); - merged_counts.resize(0, handle.get_stream()); - merged_labels.shrink_to_fit(handle.get_stream()); - merged_counts.shrink_to_fit(handle.get_stream()); - labels.resize(thrust::distance(labels.begin(), thrust::get<0>(pair_it)), handle.get_stream()); - counts.resize(labels.size(), handle.get_stream()); - labels.shrink_to_fit(handle.get_stream()); - counts.shrink_to_fit(handle.get_stream()); - - auto num_non_isolated_vertices = static_cast(labels.size()); - - // 4. if local_vertices.has_value() == true, append isolated vertices + stream_pool_indices = std::nullopt; // FIXME: delete - if (local_vertices) { - rmm::device_uvector isolated_vertices(0, handle.get_stream()); - - auto num_isolated_vertices = thrust::count_if( - handle.get_thrust_policy(), - (*local_vertices).begin(), - (*local_vertices).end(), - [label_first = labels.begin(), label_last = labels.end()] __device__(auto v) { - return !thrust::binary_search(thrust::seq, label_first, label_last, v); - }); - isolated_vertices.resize(num_isolated_vertices, handle.get_stream()); - thrust::copy_if(handle.get_thrust_policy(), - (*local_vertices).begin(), - (*local_vertices).end(), - isolated_vertices.begin(), - [label_first = labels.begin(), label_last = labels.end()] __device__(auto v) { - return !thrust::binary_search(thrust::seq, label_first, label_last, v); - }); - (*local_vertices).resize(0, handle.get_stream()); - (*local_vertices).shrink_to_fit(handle.get_stream()); - - if (isolated_vertices.size() > 0) { - labels.resize(labels.size() + isolated_vertices.size(), handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - isolated_vertices.begin(), - isolated_vertices.end(), - labels.end() - isolated_vertices.size()); + for (int i = 0; i < col_comm_size; ++i) { + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool( + (*stream_pool_indices)[i % (*stream_pool_indices).size()]) + : handle.get_stream(); + + rmm::device_uvector tmp_majors(edgelist_edge_counts[i], loop_stream); + thrust::copy(rmm::exec_policy(loop_stream), + edgelist_majors[i], + edgelist_majors[i] + edgelist_edge_counts[i], + tmp_majors.begin()); + thrust::sort(rmm::exec_policy(loop_stream), tmp_majors.begin(), tmp_majors.end()); + auto num_unique_majors = + thrust::count_if(rmm::exec_policy(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(tmp_majors.size()), + [majors = tmp_majors.data()] __device__(auto idx) { + return (idx == 0) || (majors[idx - 1] != majors[idx]); + }); + rmm::device_uvector tmp_keys(num_unique_majors, loop_stream); + rmm::device_uvector tmp_values(num_unique_majors, loop_stream); + thrust::reduce_by_key(rmm::exec_policy(loop_stream), + tmp_majors.begin(), + tmp_majors.end(), + thrust::make_constant_iterator(edge_t{1}), + tmp_keys.begin(), + tmp_values.begin()); + + num_local_unique_edge_majors += num_unique_majors; + + tmp_majors.resize(0, loop_stream); + tmp_majors.shrink_to_fit(loop_stream); + + rmm::device_uvector sorted_majors(edge_partition_major_sizes[i], loop_stream); + device_bcast(col_comm, + sorted_local_vertices.data(), + sorted_majors.data(), + edge_partition_major_sizes[i], + static_cast(i), + loop_stream); + + rmm::device_uvector sorted_major_degrees(sorted_majors.size(), loop_stream); + thrust::fill(rmm::exec_policy(loop_stream), + sorted_major_degrees.begin(), + sorted_major_degrees.end(), + edge_t{0}); + + auto kv_pair_first = + thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); + thrust::for_each( + rmm::exec_policy(loop_stream), + kv_pair_first, + kv_pair_first + tmp_keys.size(), + [sorted_major_first = sorted_majors.begin(), + sorted_major_last = sorted_majors.end(), + degrees = sorted_major_degrees.begin()] __device__(auto pair) { + auto it = thrust::lower_bound( + thrust::seq, sorted_major_first, sorted_major_last, thrust::get<0>(pair)); + *(degrees + thrust::distance(sorted_major_first, it)) = thrust::get<1>(pair); + }); + + device_reduce(col_comm, + sorted_major_degrees.begin(), + sorted_major_degrees.begin(), + edge_partition_major_sizes[i], + raft::comms::op_t::SUM, + i, + loop_stream); + if (i == col_comm_rank) { sorted_local_vertex_degrees = std::move(sorted_major_degrees); } } + + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + } else { + assert(edgelist_majors.size() == 1); + + rmm::device_uvector tmp_majors(edgelist_edge_counts[0], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_majors[0], + edgelist_majors[0] + edgelist_edge_counts[0], + tmp_majors.begin()); + thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); + auto num_unique_majors = + thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(tmp_majors.size()), + [majors = tmp_majors.data()] __device__(auto idx) { + return (idx == 0) || (majors[idx - 1] != majors[idx]); + }); + rmm::device_uvector tmp_keys(num_unique_majors, handle.get_stream()); + rmm::device_uvector tmp_values(num_unique_majors, handle.get_stream()); + thrust::reduce_by_key(handle.get_thrust_policy(), + tmp_majors.begin(), + tmp_majors.end(), + thrust::make_constant_iterator(edge_t{1}), + tmp_keys.begin(), + tmp_values.begin()); + + num_local_unique_edge_majors += num_unique_majors; + + 
tmp_majors.resize(0, handle.get_stream()); + tmp_majors.shrink_to_fit(handle.get_stream()); + + sorted_local_vertex_degrees.resize(sorted_local_vertices.size(), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + sorted_local_vertex_degrees.begin(), + sorted_local_vertex_degrees.end(), + edge_t{0}); + + auto kv_pair_first = + thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); + thrust::for_each(handle.get_thrust_policy(), + kv_pair_first, + kv_pair_first + tmp_keys.size(), + [sorted_major_first = sorted_local_vertices.begin(), + sorted_major_last = sorted_local_vertices.end(), + degrees = sorted_local_vertex_degrees.begin()] __device__(auto pair) { + auto it = thrust::lower_bound( + thrust::seq, sorted_major_first, sorted_major_last, thrust::get<0>(pair)); + *(degrees + thrust::distance(sorted_major_first, it)) = thrust::get<1>(pair); + }); } - // 5. sort non-isolated vertices by degree + // 4. sort local vertices by degree (descending) +#if 1 // FIXME: delete + handle.sync_stream(); + auto time4 = std::chrono::steady_clock::now(); +#endif thrust::sort_by_key(handle.get_thrust_policy(), - counts.begin(), - counts.begin() + num_non_isolated_vertices, - labels.begin(), + sorted_local_vertex_degrees.begin(), + sorted_local_vertex_degrees.end(), + sorted_local_vertices.begin(), thrust::greater()); - // 6. compute segment_offsets + // 5. compute segment_offsets +#if 1 // FIXME: delete + handle.sync_stream(); + auto time5 = std::chrono::steady_clock::now(); +#endif static_assert(detail::num_sparse_segments_per_vertex_partition == 3); static_assert((detail::low_degree_threshold <= detail::mid_degree_threshold) && @@ -320,14 +409,14 @@ compute_renumber_map(raft::handle_t const& handle, handle.get_stream()); auto zero_vertex = vertex_t{0}; - auto vertex_count = static_cast(labels.size()); + auto vertex_count = static_cast(sorted_local_vertices.size()); d_segment_offsets.set_element_async(0, zero_vertex, handle.get_stream()); d_segment_offsets.set_element_async( num_segments_per_vertex_partition, vertex_count, handle.get_stream()); thrust::upper_bound(handle.get_thrust_policy(), - counts.begin(), - counts.end(), + sorted_local_vertex_degrees.begin(), + sorted_local_vertex_degrees.end(), d_thresholds.begin(), d_thresholds.end(), d_segment_offsets.begin() + 1, @@ -340,7 +429,25 @@ compute_renumber_map(raft::handle_t const& handle, handle.get_stream()); handle.sync_stream(); - return std::make_tuple(std::move(labels), +#if 1 // FIXME: delete + handle.sync_stream(); + auto time6 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_total = time6 - time0; + std::chrono::duration elapsed0 = time1 - time0; + std::chrono::duration elapsed1 = time2 - time1; + std::chrono::duration elapsed2 = time3 - time2; + std::chrono::duration elapsed3 = time4 - time3; + std::chrono::duration elapsed4 = time5 - time4; + std::chrono::duration elapsed5 = time6 - time5; + std::cout << "Compute renumber map (num_streams:" + << (stream_pool_indices ? (*stream_pool_indices).size() : size_t{0}) << ") took " + << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," + << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," + << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << "," + << elapsed5.count() * 1e3 << ") ms." 
<< std::endl; +#endif + + return std::make_tuple(std::move(sorted_local_vertices), h_segment_offsets, num_local_unique_edge_majors, num_local_unique_edge_minors); @@ -436,17 +543,19 @@ void expensive_check_edgelist( rmm::device_uvector sorted_majors(0, handle.get_stream()); { - auto recvcounts = - host_scalar_allgather(col_comm, (*sorted_local_vertices).size(), handle.get_stream()); - std::vector displacements(recvcounts.size(), size_t{0}); - std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); - sorted_majors.resize(displacements.back() + recvcounts.back(), handle.get_stream()); - device_allgatherv(col_comm, - (*sorted_local_vertices).data(), - sorted_majors.data(), - recvcounts, - displacements, - handle.get_stream()); + auto major_size = + host_scalar_bcast(col_comm, + static_cast(i) == col_comm_rank ? (*sorted_local_vertices).size() + : size_t{0} /* dummy */, + i, + handle.get_stream()); + sorted_majors.resize(major_size, handle.get_stream()); + device_bcast(col_comm, + (*sorted_local_vertices).begin(), + sorted_majors.begin(), + major_size, + i, + handle.get_stream()); thrust::sort(handle.get_thrust_policy(), sorted_majors.begin(), sorted_majors.end()); } @@ -556,10 +665,12 @@ renumber_edgelist( std::optional>> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check) { -#if 1 // FIXME: delete +#if 1 // FIXME: delete handle.sync_stream(); if constexpr (multi_gpu) { - handle.get_comms().barrier(); + rmm::device_uvector dummy(1, handle.get_stream()); + handle.get_comms().allreduce( + dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); } auto time0 = std::chrono::steady_clock::now(); #endif @@ -620,7 +731,7 @@ renumber_edgelist( // 1. compute renumber map -#if 1 // FIXME: delete +#if 1 // FIXME: delete handle.sync_stream(); auto time1 = std::chrono::steady_clock::now(); #endif @@ -637,7 +748,7 @@ renumber_edgelist( // 2. initialize partition_t object, number_of_vertices, and number_of_edges for the coarsened // graph -#if 1 // FIXME: delete +#if 1 // FIXME: delete handle.sync_stream(); auto time2 = std::chrono::steady_clock::now(); #endif @@ -664,7 +775,7 @@ renumber_edgelist( // FIXME: compare this hash based approach with a binary search based approach in both memory // footprint and execution time -#if 1 // FIXME: delete +#if 1 // FIXME: delete handle.sync_stream(); auto time3 = std::chrono::steady_clock::now(); #endif @@ -715,7 +826,7 @@ renumber_edgelist( } } -#if 1 // FIXME: delete +#if 1 // FIXME: delete handle.sync_stream(); auto time4 = std::chrono::steady_clock::now(); #endif @@ -814,16 +925,19 @@ renumber_edgelist( handle.get_stream()); } } -#if 1 // FIXME: delete +#if 1 // FIXME: delete handle.sync_stream(); - auto time5 = std::chrono::steady_clock::now(); + auto time5 = std::chrono::steady_clock::now(); std::chrono::duration elapsed_total = time5 - time0; - std::chrono::duration elapsed0 = time1 - time0; - std::chrono::duration elapsed1 = time2 - time1; - std::chrono::duration elapsed2 = time3 - time2; - std::chrono::duration elapsed3 = time4 - time3; - std::chrono::duration elapsed4 = time5 - time4; - std::cout << "Renumber took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << ") ms." 
<< std::endl;
+  std::cout << "Renumber took " << elapsed_total.count() * 1e3 << " ms, breakdown=("
+            << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << ","
+            << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << ","
+            << elapsed4.count() * 1e3 << ") ms." << std::endl;
 #endif

   return std::make_tuple(

From 0744160e841bf2566c9725edec7635ea3ddf8d3d Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Tue, 1 Feb 2022 01:09:26 -0800
Subject: [PATCH 06/60] move is_first_in_run_t to graph_utils.cuh

---
 cpp/include/cugraph/detail/graph_utils.cuh             |  8 ++++++++
 .../copy_v_transform_reduce_key_aggregated_out_nbr.cuh | 10 ----------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/cpp/include/cugraph/detail/graph_utils.cuh b/cpp/include/cugraph/detail/graph_utils.cuh
index 2d9ee2b2547..ca918c53a62 100644
--- a/cpp/include/cugraph/detail/graph_utils.cuh
+++ b/cpp/include/cugraph/detail/graph_utils.cuh
@@ -77,5 +77,13 @@ struct compute_partition_id_from_edge_t {
   }
 };

+template <typename vertex_t>
+struct is_first_in_run_t {
+  vertex_t const* vertices{nullptr};
+  __device__ bool operator()(size_t i) const {
+    return (i == 0) || (vertices[i - 1] != vertices[i]);
+  }
+};
+
 } // namespace detail
 } // namespace cugraph
diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
index 4cf6ce5b4cb..1dee131a000 100644
--- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
+++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
@@ -99,16 +99,6 @@ struct call_key_aggregated_e_op_t {
   }
 };

-// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
-template <typename vertex_t>
-struct is_first_in_run_t {
-  vertex_t const* major_vertices{nullptr};
-  __device__ bool operator()(size_t i) const
-  {
-    return ((i == 0) || (major_vertices[i] != major_vertices[i - 1])) ?
true : false; - } -}; - // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used template struct is_valid_vertex_t { From 077008bc3770440bd90bfa54a169152afd749ef1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 1 Feb 2022 01:11:00 -0800 Subject: [PATCH 07/60] avoid using device lambdas --- cpp/src/structure/renumber_edgelist_impl.cuh | 134 ++++++++++--------- 1 file changed, 69 insertions(+), 65 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 2d501b6f1cc..3f7891a7c4e 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -45,6 +45,38 @@ namespace cugraph { namespace detail { +template +struct check_edge_src_and_dst_t { + vertex_t const* sorted_majors{nullptr}; + vertex_t num_majors{0}; + vertex_t const* sorted_minors{nullptr}; + vertex_t num_minors{0}; + + __device__ bool operator()(thrust::tuple e) const + { + return !thrust::binary_search( + thrust::seq, sorted_majors, sorted_majors + num_majors, thrust::get<0>(e)) || + !thrust::binary_search( + thrust::seq, sorted_minors, sorted_minors + num_minors, thrust::get<1>(e)); + } +}; + +template +struct search_and_set_degree_t { + vertex_t const* sorted_vertices{nullptr}; + vertex_t num_vertices{0}; + edge_t* degrees{nullptr}; + + __device__ void operator()(thrust::tuple vertex_degree_pair) const + { + auto it = thrust::lower_bound(thrust::seq, + sorted_vertices, + sorted_vertices + num_vertices, + thrust::get<0>(vertex_degree_pair)); + *(degrees + thrust::distance(sorted_vertices, it)) = thrust::get<1>(vertex_degree_pair); + } +}; + // returns renumber map, segment_offsets, and # unique edge majors & minors template std::tuple, std::vector, vertex_t, vertex_t> @@ -192,7 +224,8 @@ compute_renumber_map(raft::handle_t const& handle, auto time3 = std::chrono::steady_clock::now(); #endif rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); - std::optional> stream_pool_indices{std::nullopt}; // FIXME: move this inside the if statement + std::optional> stream_pool_indices{ + std::nullopt}; // FIXME: move this inside the if statement if constexpr (multi_gpu) { auto& comm = handle.get_comms(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); @@ -249,13 +282,10 @@ compute_renumber_map(raft::handle_t const& handle, edgelist_majors[i] + edgelist_edge_counts[i], tmp_majors.begin()); thrust::sort(rmm::exec_policy(loop_stream), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = - thrust::count_if(rmm::exec_policy(loop_stream), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - [majors = tmp_majors.data()] __device__(auto idx) { - return (idx == 0) || (majors[idx - 1] != majors[idx]); - }); + auto num_unique_majors = thrust::count_if(rmm::exec_policy(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(tmp_majors.size()), + is_first_in_run_t{tmp_majors.data()}); rmm::device_uvector tmp_keys(num_unique_majors, loop_stream); rmm::device_uvector tmp_values(num_unique_majors, loop_stream); thrust::reduce_by_key(rmm::exec_policy(loop_stream), @@ -286,17 +316,12 @@ compute_renumber_map(raft::handle_t const& handle, auto kv_pair_first = thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each( - rmm::exec_policy(loop_stream), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - 
[sorted_major_first = sorted_majors.begin(), - sorted_major_last = sorted_majors.end(), - degrees = sorted_major_degrees.begin()] __device__(auto pair) { - auto it = thrust::lower_bound( - thrust::seq, sorted_major_first, sorted_major_last, thrust::get<0>(pair)); - *(degrees + thrust::distance(sorted_major_first, it)) = thrust::get<1>(pair); - }); + thrust::for_each(rmm::exec_policy(loop_stream), + kv_pair_first, + kv_pair_first + tmp_keys.size(), + search_and_set_degree_t{sorted_majors.data(), + static_cast(sorted_majors.size()), + sorted_major_degrees.data()}); device_reduce(col_comm, sorted_major_degrees.begin(), @@ -318,13 +343,10 @@ compute_renumber_map(raft::handle_t const& handle, edgelist_majors[0] + edgelist_edge_counts[0], tmp_majors.begin()); thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - [majors = tmp_majors.data()] __device__(auto idx) { - return (idx == 0) || (majors[idx - 1] != majors[idx]); - }); + auto num_unique_majors = thrust::count_if(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(tmp_majors.size()), + is_first_in_run_t{tmp_majors.data()}); rmm::device_uvector tmp_keys(num_unique_majors, handle.get_stream()); rmm::device_uvector tmp_values(num_unique_majors, handle.get_stream()); thrust::reduce_by_key(handle.get_thrust_policy(), @@ -350,13 +372,9 @@ compute_renumber_map(raft::handle_t const& handle, thrust::for_each(handle.get_thrust_policy(), kv_pair_first, kv_pair_first + tmp_keys.size(), - [sorted_major_first = sorted_local_vertices.begin(), - sorted_major_last = sorted_local_vertices.end(), - degrees = sorted_local_vertex_degrees.begin()] __device__(auto pair) { - auto it = thrust::lower_bound( - thrust::seq, sorted_major_first, sorted_major_last, thrust::get<0>(pair)); - *(degrees + thrust::distance(sorted_major_first, it)) = thrust::get<1>(pair); - }); + search_and_set_degree_t{sorted_local_vertices.data(), + static_cast(sorted_local_vertices.size()), + sorted_major_degrees.data()}); } // 4. 
sort local vertices by degree (descending) @@ -577,22 +595,16 @@ void expensive_check_edgelist( auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[i], edgelist_minors[i])); - CUGRAPH_EXPECTS( - thrust::count_if( - handle.get_thrust_policy(), - edge_first, - edge_first + edgelist_edge_counts[i], - [num_majors = static_cast(sorted_majors.size()), - sorted_majors = sorted_majors.data(), - num_minors = static_cast(sorted_minors.size()), - sorted_minors = sorted_minors.data()] __device__(auto e) { - return !thrust::binary_search( - thrust::seq, sorted_majors, sorted_majors + num_majors, thrust::get<0>(e)) || - !thrust::binary_search( - thrust::seq, sorted_minors, sorted_minors + num_minors, thrust::get<1>(e)); - }) == 0, - "Invalid input argument: edgelist_majors and/or edgelist_minors have " - "invalid vertex ID(s)."); + CUGRAPH_EXPECTS(thrust::count_if(handle.get_thrust_policy(), + edge_first, + edge_first + edgelist_edge_counts[i], + check_edge_src_and_dst_t{ + sorted_majors.data(), + static_cast(sorted_majors.size()), + sorted_minors.data(), + static_cast(sorted_minors.size())}) == 0, + "Invalid input argument: edgelist_majors and/or edgelist_minors have " + "invalid vertex ID(s)."); } if (edgelist_intra_partition_segment_offsets) { @@ -623,22 +635,14 @@ void expensive_check_edgelist( auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[0], edgelist_minors[0])); CUGRAPH_EXPECTS( - thrust::count_if( - handle.get_thrust_policy(), - edge_first, - edge_first + edgelist_edge_counts[0], - [sorted_local_vertices = (*sorted_local_vertices).data(), - num_sorted_local_vertices = - static_cast((*sorted_local_vertices).size())] __device__(auto e) { - return !thrust::binary_search(thrust::seq, - sorted_local_vertices, - sorted_local_vertices + num_sorted_local_vertices, - thrust::get<0>(e)) || - !thrust::binary_search(thrust::seq, - sorted_local_vertices, - sorted_local_vertices + num_sorted_local_vertices, - thrust::get<1>(e)); - }) == 0, + thrust::count_if(handle.get_thrust_policy(), + edge_first, + edge_first + edgelist_edge_counts[0], + check_edge_src_and_dst_t{ + (*sorted_local_vertices).data(), + static_cast((*sorted_local_vertices).size()), + (*sorted_local_vertices).data(), + static_cast((*sorted_local_vertices).size())}) == 0, "Invalid input argument: edgelist_majors and/or edgelist_minors have " "invalid vertex ID(s)."); } From 6a0dfa1850cf3aa912f39b1a4ca8659232ac8fdc Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 1 Feb 2022 12:54:50 -0800 Subject: [PATCH 08/60] fix compile errors --- cpp/src/structure/renumber_edgelist_impl.cuh | 26 +++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 3f7891a7c4e..e186571d38d 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -316,12 +316,13 @@ compute_renumber_map(raft::handle_t const& handle, auto kv_pair_first = thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each(rmm::exec_policy(loop_stream), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_set_degree_t{sorted_majors.data(), - static_cast(sorted_majors.size()), - sorted_major_degrees.data()}); + thrust::for_each( + rmm::exec_policy(loop_stream), + kv_pair_first, + kv_pair_first + tmp_keys.size(), + search_and_set_degree_t{sorted_majors.data(), + static_cast(sorted_majors.size()), + 
sorted_major_degrees.data()}); device_reduce(col_comm, sorted_major_degrees.begin(), @@ -369,12 +370,13 @@ compute_renumber_map(raft::handle_t const& handle, auto kv_pair_first = thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each(handle.get_thrust_policy(), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_set_degree_t{sorted_local_vertices.data(), - static_cast(sorted_local_vertices.size()), - sorted_major_degrees.data()}); + thrust::for_each( + handle.get_thrust_policy(), + kv_pair_first, + kv_pair_first + tmp_keys.size(), + search_and_set_degree_t{sorted_local_vertices.data(), + static_cast(sorted_local_vertices.size()), + sorted_local_vertex_degrees.data()}); } // 4. sort local vertices by degree (descending) From 41645aa63b2a0999b0e639a034f422a7dd41b4d0 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 1 Feb 2022 16:49:02 -0800 Subject: [PATCH 09/60] code cleanup --- cpp/src/structure/renumber_edgelist_impl.cuh | 146 ++++++++++--------- 1 file changed, 74 insertions(+), 72 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index e186571d38d..f1358ddfddb 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -101,8 +101,8 @@ compute_renumber_map(raft::handle_t const& handle, edge_t num_local_edges = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); - // 1. if local_vertices.has_value() is false, keep unique vertices from edge majors as well (to - // construct local_vertices) + // 1. if local_vertices.has_value() is false, find unique vertices from edge majors (to construct + // local_vertices) unique edge majors will be counted in step 4. rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); if (!local_vertices) { @@ -183,7 +183,11 @@ compute_renumber_map(raft::handle_t const& handle, handle.sync_stream(); auto time2 = std::chrono::steady_clock::now(); #endif - if (!local_vertices) { + if (local_vertices) { + sorted_local_vertices = std::move(*local_vertices); + thrust::sort( + handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); + } else { sorted_local_vertices.resize(sorted_unique_majors.size() + sorted_unique_minors.size(), handle.get_stream()); @@ -209,11 +213,15 @@ compute_renumber_map(raft::handle_t const& handle, if constexpr (multi_gpu) { sorted_local_vertices = cugraph::detail::shuffle_vertices_by_gpu_id(handle, std::move(sorted_local_vertices)); + thrust::sort( + handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); + sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + sorted_local_vertices.begin(), + sorted_local_vertices.end())), + handle.get_stream()); + sorted_local_vertices.shrink_to_fit(handle.get_stream()); } - } else { - sorted_local_vertices = std::move(*local_vertices); - thrust::sort( - handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); } // 4. compute global degrees for the sorted local vertices, and count unique edge majors on the @@ -272,8 +280,7 @@ compute_renumber_map(raft::handle_t const& handle, for (int i = 0; i < col_comm_size; ++i) { auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool( - (*stream_pool_indices)[i % (*stream_pool_indices).size()]) + ? 
handle.get_stream_from_stream_pool(i % (*stream_pool_indices).size()) : handle.get_stream(); rmm::device_uvector tmp_majors(edgelist_edge_counts[i], loop_stream); @@ -305,7 +312,7 @@ compute_renumber_map(raft::handle_t const& handle, sorted_local_vertices.data(), sorted_majors.data(), edge_partition_major_sizes[i], - static_cast(i), + i, loop_stream); rmm::device_uvector sorted_major_degrees(sorted_majors.size(), loop_stream); @@ -517,20 +524,6 @@ void expensive_check_edgelist( "Invalid input argument: both edgelist_majors.size() & " "edgelist_minors.size() should coincide with col_comm_size."); - if (sorted_local_vertices) { - CUGRAPH_EXPECTS( - thrust::count_if( - handle.get_thrust_policy(), - (*sorted_local_vertices).begin(), - (*sorted_local_vertices).end(), - [comm_rank, - key_func = - detail::compute_gpu_id_from_vertex_t{comm_size}] __device__(auto val) { - return key_func(val) != comm_rank; - }) == 0, - "Invalid input argument: local_vertices should be pre-shuffled."); - } - for (size_t i = 0; i < edgelist_majors.size(); ++i) { auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[i], edgelist_minors[i])); @@ -557,46 +550,75 @@ void expensive_check_edgelist( "Invalid input argument: edgelist_majors & edgelist_minors should be " "pre-shuffled."); - if (sorted_local_vertices) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + if (edgelist_intra_partition_segment_offsets) { + for (int j = 0; j < row_comm_size; ++j) { + CUGRAPH_EXPECTS( + thrust::count_if( + handle.get_thrust_policy(), + edgelist_minors[i] + (*edgelist_intra_partition_segment_offsets)[i][j], + edgelist_minors[i] + (*edgelist_intra_partition_segment_offsets)[i][j + 1], + [row_comm_size, + col_comm_rank, + j, + gpu_id_key_func = + detail::compute_gpu_id_from_vertex_t{comm_size}] __device__(auto minor) { + return gpu_id_key_func(minor) != col_comm_rank * row_comm_size + j; + }) == 0, + "Invalid input argument: if edgelist_intra_partition_segment_offsets.has_value() is " + "true, edgelist_majors & edgelist_minors should be properly grouped " + "within each local partition."); + } + } + } + + if (sorted_local_vertices) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + + CUGRAPH_EXPECTS( + thrust::count_if( + handle.get_thrust_policy(), + (*sorted_local_vertices).begin(), + (*sorted_local_vertices).end(), + [comm_rank, + key_func = + detail::compute_gpu_id_from_vertex_t{comm_size}] __device__(auto val) { + return key_func(val) != comm_rank; + }) == 0, + "Invalid input argument: local_vertices should be pre-shuffled."); + auto major_sizes = + host_scalar_allgather(col_comm, (*sorted_local_vertices).size(), handle.get_stream()); + + rmm::device_uvector sorted_minors(0, handle.get_stream()); + auto recvcounts = + host_scalar_allgather(row_comm, (*sorted_local_vertices).size(), handle.get_stream()); + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + sorted_minors.resize(displacements.back() + recvcounts.back(), handle.get_stream()); + device_allgatherv(row_comm, + (*sorted_local_vertices).data(), + sorted_minors.data(), + recvcounts, + displacements, + handle.get_stream()); + thrust::sort(handle.get_thrust_policy(), sorted_minors.begin(), 
sorted_minors.end()); + + for (size_t i = 0; i < edgelist_majors.size(); ++i) { rmm::device_uvector sorted_majors(0, handle.get_stream()); { - auto major_size = - host_scalar_bcast(col_comm, - static_cast(i) == col_comm_rank ? (*sorted_local_vertices).size() - : size_t{0} /* dummy */, - i, - handle.get_stream()); - sorted_majors.resize(major_size, handle.get_stream()); + sorted_majors.resize(major_sizes[i], handle.get_stream()); device_bcast(col_comm, (*sorted_local_vertices).begin(), sorted_majors.begin(), - major_size, + major_sizes[i], i, handle.get_stream()); - thrust::sort(handle.get_thrust_policy(), sorted_majors.begin(), sorted_majors.end()); - } - - rmm::device_uvector sorted_minors(0, handle.get_stream()); - { - auto recvcounts = - host_scalar_allgather(row_comm, (*sorted_local_vertices).size(), handle.get_stream()); - std::vector displacements(recvcounts.size(), size_t{0}); - std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); - sorted_minors.resize(displacements.back() + recvcounts.back(), handle.get_stream()); - device_allgatherv(row_comm, - (*sorted_local_vertices).data(), - sorted_minors.data(), - recvcounts, - displacements, - handle.get_stream()); - thrust::sort(handle.get_thrust_policy(), sorted_minors.begin(), sorted_minors.end()); } auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors[i], edgelist_minors[i])); + CUGRAPH_EXPECTS(thrust::count_if(handle.get_thrust_policy(), edge_first, edge_first + edgelist_edge_counts[i], @@ -608,26 +630,6 @@ void expensive_check_edgelist( "Invalid input argument: edgelist_majors and/or edgelist_minors have " "invalid vertex ID(s)."); } - - if (edgelist_intra_partition_segment_offsets) { - for (int j = 0; j < row_comm_size; ++j) { - CUGRAPH_EXPECTS( - thrust::count_if( - handle.get_thrust_policy(), - edgelist_minors[i] + (*edgelist_intra_partition_segment_offsets)[i][j], - edgelist_minors[i] + (*edgelist_intra_partition_segment_offsets)[i][j + 1], - [row_comm_size, - col_comm_rank, - j, - gpu_id_key_func = - detail::compute_gpu_id_from_vertex_t{comm_size}] __device__(auto minor) { - return gpu_id_key_func(minor) != col_comm_rank * row_comm_size + j; - }) == 0, - "Invalid input argument: if edgelist_intra_partition_segment_offsets.has_value() is " - "true, edgelist_majors & edgelist_minors should be properly grouped " - "within each local partition."); - } - } } } else { assert(edgelist_majors.size() == 1); From 6b4b6828c5b8ca6fd6100d15d35689ca36a31e18 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 1 Feb 2022 17:13:56 -0800 Subject: [PATCH 10/60] update copy_v_transform_reduce_in_out_nbr to process multiple edge partitions in parallel --- .../copy_v_transform_reduce_in_out_nbr.cuh | 199 ++++++++++++------ 1 file changed, 139 insertions(+), 60 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh index a98013ac996..fbd339fdf32 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh @@ -35,6 +35,7 @@ #include #include +#include #include #include @@ -475,21 +476,12 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, T init, VertexValueOutputIterator vertex_value_output_first) { -// FIXME: for temporary testing -#if 1 - cudaStream_t high_stream0{}; - cudaStream_t high_stream1{}; - cudaStream_t mid_stream{}; - cudaStream_t low_stream{}; - 
CUDA_TRY(cudaStreamCreateWithPriority(&high_stream0, cudaStreamNonBlocking, -5)); - CUDA_TRY(cudaStreamCreateWithPriority(&high_stream1, cudaStreamNonBlocking, -5)); - CUDA_TRY(cudaStreamCreateWithPriority(&mid_stream, cudaStreamNonBlocking, -3)); - CUDA_TRY(cudaStreamCreateWithPriority(&low_stream, cudaStreamNonBlocking, 0)); -#endif constexpr auto update_major = (in == GraphViewType::is_adj_matrix_transposed); - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using weight_t = typename GraphViewType::weight_type; + [[maybe_unused]] constexpr auto max_segments = + detail::num_sparse_segments_per_vertex_partition + size_t{1}; + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); @@ -524,15 +516,81 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } } + std::optional> stream_pool_indices{std::nullopt}; + size_t num_concurrent_loops{1}; // FIXME: this can go inside the loop after temporary testing +#if 1 // FIXME: for temporary testing + std::vector high_streams0{}; + std::vector high_streams1{}; + std::vector mid_streams{}; + std::vector low_streams{}; +#endif + if constexpr (GraphViewType::is_multi_gpu) { + if (handle.get_stream_pool_size() >= max_segments) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + // memory footprint vs parallelism trade-off + // peak memory requirement per loop is + // update_major ? V / comm_size * sizeof(T) : 0 + // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) + + num_concurrent_loops = + std::min(static_cast(col_comm_size), handle.get_stream_pool_size() / max_segments); + if constexpr (update_major) { + size_t value_size{0}; + if constexpr (is_thrust_tuple_of_arithmetic::value) { + auto elem_sizes = compute_thrust_tuple_element_sizes{}(); + value_size = std::reduce(elem_sizes.begin(), elem_sizes.end()); + } else { + value_size = sizeof(T); + } + + auto avg_vertex_degree = graph_view.get_number_of_vertices() > 0 + ? (static_cast(graph_view.get_number_of_edges()) / + static_cast(graph_view.get_number_of_vertices())) + : double{0.0}; + + num_concurrent_loops = + std::min(static_cast(avg_vertex_degree * (static_cast(sizeof(vertex_t)) / + static_cast(value_size))), + num_concurrent_loops); + } + +#if 1 // FIXME: for temporary testing + high_streams0.resize(num_concurrent_loops); + high_streams1.resize(num_concurrent_loops); + mid_streams.resize(num_concurrent_loops); + low_streams.resize(num_concurrent_loops); + for (size_t i = 0; i < num_concurrent_loops; ++i) { + CUDA_TRY(cudaStreamCreateWithPriority(&high_streams0[i], cudaStreamNonBlocking, -2)); + CUDA_TRY(cudaStreamCreateWithPriority(&high_streams1[i], cudaStreamNonBlocking, -2)); + CUDA_TRY(cudaStreamCreateWithPriority(&mid_streams[i], cudaStreamNonBlocking, -1)); + CUDA_TRY(cudaStreamCreateWithPriority(&low_streams[i], cudaStreamNonBlocking, 0)); + } +#endif + + stream_pool_indices = std::vector(num_concurrent_loops * max_segments); + std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); + handle.sync_stream(); + } + } + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + auto loop_stream = stream_pool_indices + ? 
rmm::cuda_stream_view{high_streams0[i % num_concurrent_loops]} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((i * + max_segments) % + (*stream_pool_indices).size()) */ + : handle.get_stream(); + auto matrix_partition = matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)); auto major_tmp_buffer_size = GraphViewType::is_multi_gpu && update_major ? matrix_partition.get_major_size() : vertex_t{0}; - auto major_tmp_buffer = - allocate_dataframe_buffer(major_tmp_buffer_size, handle.get_stream()); + auto major_tmp_buffer = allocate_dataframe_buffer(major_tmp_buffer_size, loop_stream); auto major_buffer_first = get_dataframe_buffer_begin(major_tmp_buffer); auto major_init = T{}; @@ -569,28 +627,28 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } else { output_buffer = vertex_value_output_first; } + + if (stream_pool_indices) { + CUDA_TRY(cudaStreamSynchronize( + high_streams0[i % num_concurrent_loops])); /* FIXME for temporary testing, + handle.sync_stream_pool(std::vector{(i * max_segments) % + (*stream_pool_indices).size()}); */ + } + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); - std::optional> stream_pool_indices{std::nullopt}; if (segment_offsets) { static_assert(detail::num_sparse_segments_per_vertex_partition == 3); - auto num_segments = detail::num_sparse_segments_per_vertex_partition + - (matrix_partition.get_dcs_nzd_vertex_count() ? size_t{1} : size_t{0}); - if (GraphViewType::is_multi_gpu && handle.get_stream_pool_size() >= num_segments) { - stream_pool_indices = std::vector(num_segments); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); - } - // FIXME: we may further improve performance by 1) individually tuning block sizes for // different segments; and 2) adding one more segment for very high degree vertices and // running segmented reduction if (matrix_partition.get_dcs_nzd_vertex_count()) { auto exec_stream = stream_pool_indices - ? rmm::cuda_stream_view{high_stream0} + ? rmm::cuda_stream_view{high_streams0[i % num_concurrent_loops]} /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((*stream_pool_indices)[0]) */ - : handle.get_stream(); + handle.get_stream_from_stream_pool((i * max_segments) % + (*stream_pool_indices).size()) */ + : loop_stream; if constexpr (update_major) { // this is necessary as we don't visit every vertex in the // hypersparse segment in // for_all_major_for_all_nbr_hypersparse @@ -617,11 +675,12 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } } if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - auto exec_stream = stream_pool_indices ? rmm::cuda_stream_view{high_stream1} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() - ? 1 : 0]) */ - : handle.get_stream(); + auto exec_stream = stream_pool_indices + ? 
rmm::cuda_stream_view{high_streams1[i % num_concurrent_loops]} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((i * max_segments + 1) % + (*stream_pool_indices).size()) */ + : loop_stream; raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -639,11 +698,12 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, major_init); } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { - auto exec_stream = stream_pool_indices ? rmm::cuda_stream_view{mid_stream} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() - ? 2 : 1]) */ - : handle.get_stream(); + auto exec_stream = stream_pool_indices + ? rmm::cuda_stream_view{mid_streams[i % num_concurrent_loops]} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((i * max_segments + 2) % + (*stream_pool_indices).size()) */ + : loop_stream; raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -661,11 +721,12 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, major_init); } if ((*segment_offsets)[1] > 0) { - auto exec_stream = stream_pool_indices ? rmm::cuda_stream_view{low_stream} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() - ? 3 : 2]) */ - : handle.get_stream(); + auto exec_stream = stream_pool_indices + ? rmm::cuda_stream_view{low_streams[i % num_concurrent_loops]} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((i * max_segments + 3) % + (*stream_pool_indices).size()) */ + : loop_stream; raft::grid_1d_block_t update_grid((*segment_offsets)[1], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -686,7 +747,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); detail::for_all_major_for_all_nbr_low_degree - <<>>( + <<>>( matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_last(), @@ -715,19 +776,19 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, (*segment_offsets).back() - (*segment_offsets)[3], raft::comms::op_t::SUM, i, - high_stream0/* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[0]) */); + high_streams0[i % num_concurrent_loops]/* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) */); } if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { device_reduce( - col_comm, major_buffer_first + (*segment_offsets)[2], vertex_value_output_first + (*segment_offsets)[2], (*segment_offsets)[3] - (*segment_offsets)[2], raft::comms::op_t::SUM, i, high_stream1 /* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() ? 
1 : 0]) */); + col_comm, major_buffer_first + (*segment_offsets)[2], vertex_value_output_first + (*segment_offsets)[2], (*segment_offsets)[3] - (*segment_offsets)[2], raft::comms::op_t::SUM, i, high_streams1[i % num_concurrent_loops] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()) */); } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { device_reduce( - col_comm, major_buffer_first + (*segment_offsets)[1], vertex_value_output_first + (*segment_offsets)[1], (*segment_offsets)[2] - (*segment_offsets)[1], raft::comms::op_t::SUM, i, mid_stream /* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() ? 2 : 1]) */); + col_comm, major_buffer_first + (*segment_offsets)[1], vertex_value_output_first + (*segment_offsets)[1], (*segment_offsets)[2] - (*segment_offsets)[1], raft::comms::op_t::SUM, i, mid_streams[i % num_concurrent_loops] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()) */); } if ((*segment_offsets)[1] > 0) { device_reduce( - col_comm, major_buffer_first, vertex_value_output_first, (*segment_offsets)[1], raft::comms::op_t::SUM, i, low_stream /* FIXME for temporary testing, handle.get_stream_from_stream_pool((*stream_pool_indices)[matrix_partition.get_dcs_nzd_vertex_count() ? 3 : 2]) */); + col_comm, major_buffer_first, vertex_value_output_first, (*segment_offsets)[1], raft::comms::op_t::SUM, i, low_streams[i % num_concurrent_loops] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()) */); } } else { device_reduce(col_comm, @@ -736,20 +797,28 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, matrix_partition.get_major_size(), raft::comms::op_t::SUM, i, - handle.get_stream()); + loop_stream); } } + } - if (stream_pool_indices) { + if (stream_pool_indices) { #if 1 // FIXME: for temporary testing - CUDA_TRY(cudaStreamSynchronize(high_stream0)); - CUDA_TRY(cudaStreamSynchronize(high_stream1)); - CUDA_TRY(cudaStreamSynchronize(mid_stream)); - CUDA_TRY(cudaStreamSynchronize(low_stream)); + for (size_t i = 0; i < high_streams0.size(); ++i) { + CUDA_TRY(cudaStreamSynchronize(high_streams0[i])); + } + for (size_t i = 0; i < high_streams1.size(); ++i) { + CUDA_TRY(cudaStreamSynchronize(high_streams1[i])); + } + for (size_t i = 0; i < mid_streams.size(); ++i) { + CUDA_TRY(cudaStreamSynchronize(mid_streams[i])); + } + for (size_t i = 0; i < low_streams.size(); ++i) { + CUDA_TRY(cudaStreamSynchronize(low_streams[i])); + } #else - handle.sync_stream_pool(*stream_pool_indices); + handle.sync_stream_pool(*stream_pool_indices); #endif - } } if constexpr (GraphViewType::is_multi_gpu && !update_major) { @@ -816,10 +885,20 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } // FIXME: for temporary testing #if 1 - CUDA_TRY(cudaStreamDestroy(low_stream)); - CUDA_TRY(cudaStreamDestroy(mid_stream)); - CUDA_TRY(cudaStreamDestroy(high_stream1)); - CUDA_TRY(cudaStreamDestroy(high_stream0)); + if (stream_pool_indices) { + for (size_t i = 0; i < low_streams.size(); ++i) { + CUDA_TRY(cudaStreamDestroy(low_streams[i])); + } + for (size_t i = 0; i < mid_streams.size(); ++i) { + CUDA_TRY(cudaStreamDestroy(mid_streams[i])); + } + for (size_t i = 0; i < high_streams1.size(); ++i) { + CUDA_TRY(cudaStreamDestroy(high_streams1[i])); + } + for (size_t i = 0; i < high_streams0.size(); ++i) { + 
CUDA_TRY(cudaStreamDestroy(high_streams0[i]));
+    }
+  }
 #endif
 }

From a0b009e9f97d09511e9dd42dd5e047f476f65ca0 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 2 Feb 2022 11:19:14 -0800
Subject: [PATCH 11/60] fix overflow bug with 2^31 or more vertices

---
 cpp/src/generators/generate_rmat_edgelist.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/generators/generate_rmat_edgelist.cu b/cpp/src/generators/generate_rmat_edgelist.cu
index 8ee99d61747..07b01853fdd 100644
--- a/cpp/src/generators/generate_rmat_edgelist.cu
+++ b/cpp/src/generators/generate_rmat_edgelist.cu
@@ -94,8 +94,8 @@ std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> generat
           }
         }
       }
-      src += src_bit_set ? static_cast<vertex_t>(1 << bit) : 0;
-      dst += dst_bit_set ? static_cast<vertex_t>(1 << bit) : 0;
+      src += src_bit_set ? static_cast<vertex_t>(vertex_t{1} << bit) : 0;
+      dst += dst_bit_set ? static_cast<vertex_t>(vertex_t{1} << bit) : 0;
     }
     return thrust::make_tuple(src, dst);
   });

From 9a70472cc33f53389fe526571281fdb02187bdd3 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Thu, 3 Feb 2022 13:22:49 -0800
Subject: [PATCH 12/60] delete temporary performance measurement code

---
 cpp/src/structure/graph_impl.cuh | 34 --------------------------------
 1 file changed, 34 deletions(-)

diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh
index a4f6d55836f..ef64e60ac2f 100644
--- a/cpp/src/structure/graph_impl.cuh
+++ b/cpp/src/structure/graph_impl.cuh
@@ -486,13 +486,6 @@ graph_tget_handle_ptr()->get_comms();
@@ -625,10 +618,6 @@ graph_t
-  std::chrono::duration<double> elapsed_total = time5 - time0;
-  std::chrono::duration<double> elapsed0 = time1 - time0;
-  std::chrono::duration<double> elapsed1 = time2 - time1;
-  std::chrono::duration<double> elapsed2 = time3 - time2;
-  std::chrono::duration<double> elapsed3 = time4 - time3;
-  std::chrono::duration<double> elapsed4 = time5 - time4;
-  std::cout << "Graph constructor took " << elapsed_total.count() * 1e3 << " ms, breakdown=("
-            << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << ","
-            << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << ","
-            << elapsed4.count() * 1e3 << ") ms."
<< std::endl; -#endif } template Date: Thu, 3 Feb 2022 13:26:34 -0800 Subject: [PATCH 13/60] delete additional temporary performance measurement code --- cpp/src/structure/renumber_edgelist_impl.cuh | 87 -------------------- 1 file changed, 87 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index f1358ddfddb..ac4c5f7fe1f 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -86,15 +86,6 @@ compute_renumber_map(raft::handle_t const& handle, std::vector const& edgelist_minors, std::vector const& edgelist_edge_counts) { -#if 1 // FIXME: delete - handle.sync_stream(); - if constexpr (multi_gpu) { - rmm::device_uvector dummy(1, handle.get_stream()); - handle.get_comms().allreduce( - dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); - } - auto time0 = std::chrono::steady_clock::now(); -#endif rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); vertex_t num_local_unique_edge_majors{0}; vertex_t num_local_unique_edge_minors{0}; @@ -140,10 +131,6 @@ compute_renumber_map(raft::handle_t const& handle, // if local_vertices.has_value() is false, keep unique vertices from edge minors as well (to // construct local_vertices) -#if 1 // FIXME: delete - handle.sync_stream(); - auto time1 = std::chrono::steady_clock::now(); -#endif rmm::device_uvector sorted_unique_minors(num_local_edges, handle.get_stream()); size_t minor_offset{0}; for (size_t i = 0; i < edgelist_minors.size(); ++i) { @@ -179,10 +166,6 @@ compute_renumber_map(raft::handle_t const& handle, // 3. update sorted_local_vertices. // if local_vertices.has_value() is false, reconstruct local_vertices first -#if 1 // FIXME: delete - handle.sync_stream(); - auto time2 = std::chrono::steady_clock::now(); -#endif if (local_vertices) { sorted_local_vertices = std::move(*local_vertices); thrust::sort( @@ -227,10 +210,6 @@ compute_renumber_map(raft::handle_t const& handle, // 4. compute global degrees for the sorted local vertices, and count unique edge majors on the // way -#if 1 // FIXME: delete - handle.sync_stream(); - auto time3 = std::chrono::steady_clock::now(); -#endif rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); std::optional> stream_pool_indices{ std::nullopt}; // FIXME: move this inside the if statement @@ -276,7 +255,6 @@ compute_renumber_map(raft::handle_t const& handle, handle.sync_stream(); } } - stream_pool_indices = std::nullopt; // FIXME: delete for (int i = 0; i < col_comm_size; ++i) { auto loop_stream = stream_pool_indices @@ -387,10 +365,6 @@ compute_renumber_map(raft::handle_t const& handle, } // 4. sort local vertices by degree (descending) -#if 1 // FIXME: delete - handle.sync_stream(); - auto time4 = std::chrono::steady_clock::now(); -#endif thrust::sort_by_key(handle.get_thrust_policy(), sorted_local_vertex_degrees.begin(), @@ -399,10 +373,6 @@ compute_renumber_map(raft::handle_t const& handle, thrust::greater()); // 5. 
compute segment_offsets -#if 1 // FIXME: delete - handle.sync_stream(); - auto time5 = std::chrono::steady_clock::now(); -#endif static_assert(detail::num_sparse_segments_per_vertex_partition == 3); static_assert((detail::low_degree_threshold <= detail::mid_degree_threshold) && @@ -456,24 +426,6 @@ compute_renumber_map(raft::handle_t const& handle, handle.get_stream()); handle.sync_stream(); -#if 1 // FIXME: delete - handle.sync_stream(); - auto time6 = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_total = time6 - time0; - std::chrono::duration elapsed0 = time1 - time0; - std::chrono::duration elapsed1 = time2 - time1; - std::chrono::duration elapsed2 = time3 - time2; - std::chrono::duration elapsed3 = time4 - time3; - std::chrono::duration elapsed4 = time5 - time4; - std::chrono::duration elapsed5 = time6 - time5; - std::cout << "Compute renumber map (num_streams:" - << (stream_pool_indices ? (*stream_pool_indices).size() : size_t{0}) << ") took " - << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," - << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," - << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << "," - << elapsed5.count() * 1e3 << ") ms." << std::endl; -#endif - return std::make_tuple(std::move(sorted_local_vertices), h_segment_offsets, num_local_unique_edge_majors, @@ -673,15 +625,6 @@ renumber_edgelist( std::optional>> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check) { -#if 1 // FIXME: delete - handle.sync_stream(); - if constexpr (multi_gpu) { - rmm::device_uvector dummy(1, handle.get_stream()); - handle.get_comms().allreduce( - dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); - } - auto time0 = std::chrono::steady_clock::now(); -#endif auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); auto const comm_rank = comm.get_rank(); @@ -739,10 +682,6 @@ renumber_edgelist( // 1. compute renumber map -#if 1 // FIXME: delete - handle.sync_stream(); - auto time1 = std::chrono::steady_clock::now(); -#endif auto [renumber_map_labels, vertex_partition_segment_offsets, num_unique_edge_majors, @@ -756,10 +695,6 @@ renumber_edgelist( // 2. 
initialize partition_t object, number_of_vertices, and number_of_edges for the coarsened // graph -#if 1 // FIXME: delete - handle.sync_stream(); - auto time2 = std::chrono::steady_clock::now(); -#endif auto vertex_counts = host_scalar_allgather( comm, static_cast(renumber_map_labels.size()), handle.get_stream()); std::vector vertex_partition_offsets(comm_size + 1, 0); @@ -783,10 +718,6 @@ renumber_edgelist( // FIXME: compare this hash based approach with a binary search based approach in both memory // footprint and execution time -#if 1 // FIXME: delete - handle.sync_stream(); - auto time3 = std::chrono::steady_clock::now(); -#endif { vertex_t max_matrix_partition_major_size{0}; for (size_t i = 0; i < edgelist_majors.size(); ++i) { @@ -834,10 +765,6 @@ renumber_edgelist( } } -#if 1 // FIXME: delete - handle.sync_stream(); - auto time4 = std::chrono::steady_clock::now(); -#endif if ((partition.get_matrix_partition_minor_size() >= number_of_edges / comm_size) && edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part @@ -933,20 +860,6 @@ renumber_edgelist( handle.get_stream()); } } -#if 1 // FIXME: delete - handle.sync_stream(); - auto time5 = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_total = time5 - time0; - std::chrono::duration elapsed0 = time1 - time0; - std::chrono::duration elapsed1 = time2 - time1; - std::chrono::duration elapsed2 = time3 - time2; - std::chrono::duration elapsed3 = time4 - time3; - std::chrono::duration elapsed4 = time5 - time4; - std::cout << "Renumber took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" - << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," - << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," - << elapsed4.count() * 1e3 << ") ms." << std::endl; -#endif return std::make_tuple( std::move(renumber_map_labels), From 3a605b51d815ae6ad4936df7e36c99aa94bc87ba Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 3 Feb 2022 13:36:45 -0800 Subject: [PATCH 14/60] remove temporary performance measurement code --- cpp/src/structure/renumber_edgelist_impl.cuh | 2 +- cpp/tests/link_analysis/mg_pagerank_test.cpp | 22 ---------- cpp/tests/utilities/test_graphs.hpp | 44 -------------------- 3 files changed, 1 insertion(+), 67 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index ac4c5f7fe1f..959d11b783f 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -93,7 +93,7 @@ compute_renumber_map(raft::handle_t const& handle, edge_t num_local_edges = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); // 1. if local_vertices.has_value() is false, find unique vertices from edge majors (to construct - // local_vertices) unique edge majors will be counted in step 4. + // local_vertices), unique edge majors will be counted in step 4. 
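+  // (illustrative example: for a sorted major span {1, 1, 3, 5, 5}, counting the indices i in
+  // [0, 5) for which is_first_in_run_t returns true yields 3, the unique-major count that step 4
+  // computes with thrust::count_if)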
rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); if (!local_vertices) { diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp index 4a6dd08dabd..adcd0c94a8f 100644 --- a/cpp/tests/link_analysis/mg_pagerank_test.cpp +++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp @@ -63,9 +63,6 @@ class Tests_MGPageRank raft::handle_t handle{}; HighResClock hr_clock{}; -#if 1 // FIXME: delete - auto time0 = std::chrono::steady_clock::now(); -#endif raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); auto& comm = handle.get_comms(); @@ -78,25 +75,6 @@ class Tests_MGPageRank } cugraph::partition_2d::subcomm_factory_t subcomm_factory(handle, row_comm_size); -#if 1 // FIXME: delete - { - rmm::device_uvector tx_ints(comm_size, handle.get_stream()); - rmm::device_uvector rx_ints(comm_size, handle.get_stream()); - std::vector tx_sizes(comm_size, size_t{1}); - std::vector tx_offsets(comm_size); - std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0}); - std::vector tx_ranks(comm_size); - std::iota(tx_ranks.begin(), tx_ranks.end(), int32_t{0}); - auto rx_sizes = tx_sizes; - auto rx_offsets = tx_offsets; - auto rx_ranks = tx_ranks; - handle.get_comms().device_multicast_sendrecv(tx_ints.data(), tx_sizes, tx_offsets, tx_ranks, rx_ints.data(), rx_sizes, rx_offsets, rx_ranks, handle.get_stream()); - handle.sync_stream(); - } - auto time1 = std::chrono::steady_clock::now(); - std::chrono::duration elapsed = time1 - time0; - std::cout << "Handle initialization and 1st all-to-all took " << elapsed.count() * 1e3 << " ms." << std::endl; -#endif // 2. create MG graph diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 8818d9633bf..9fa4cee9f7a 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -147,13 +147,6 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { static_cast(std::numeric_limits::max()), "Invalid template parameter: (scale_, edge_factor_) too large for edge_t"); -#if 1 // FIXME: delete - handle.sync_stream(); - if constexpr (multi_gpu) { - handle.get_comms().barrier(); - } - auto time0 = std::chrono::steady_clock::now(); -#endif std::vector partition_ids(1); size_t num_partitions; @@ -198,10 +191,6 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { } } -#if 1 // FIXME: delete - handle.sync_stream(); - auto time1 = std::chrono::steady_clock::now(); -#endif rmm::device_uvector src_v(0, handle.get_stream()); rmm::device_uvector dst_v(0, handle.get_stream()); auto weights_v = test_weighted @@ -258,25 +247,13 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { } } -#if 1 // FIXME: delete - handle.sync_stream(); - auto time2 = std::chrono::steady_clock::now(); -#endif translate(handle, src_v, dst_v); -#if 1 // FIXME: delete - handle.sync_stream(); - auto time3 = std::chrono::steady_clock::now(); -#endif if (undirected_) std::tie(src_v, dst_v, weights_v) = cugraph::symmetrize_edgelist_from_triangular( handle, std::move(src_v), std::move(dst_v), std::move(weights_v)); -#if 1 // FIXME: delete - handle.sync_stream(); - auto time4 = std::chrono::steady_clock::now(); -#endif if (multi_gpu) { std::tie(store_transposed ? dst_v : src_v, store_transposed ? 
src_v : dst_v, weights_v) = cugraph::detail::shuffle_edgelist_by_gpu_id( @@ -286,10 +263,6 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { std::move(weights_v)); } -#if 1 // FIXME: delete - handle.sync_stream(); - auto time5 = std::chrono::steady_clock::now(); -#endif rmm::device_uvector vertices_v(0, handle.get_stream()); for (size_t i = 0; i < partition_ids.size(); ++i) { auto id = partition_ids[i]; @@ -303,27 +276,10 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { partition_vertex_firsts[i]); } -#if 1 // FIXME: delete - handle.sync_stream(); - auto time6 = std::chrono::steady_clock::now(); -#endif if constexpr (multi_gpu) { vertices_v = cugraph::detail::shuffle_vertices_by_gpu_id(handle, std::move(vertices_v)); } -#if 1 // FIXME: delete - handle.sync_stream(); - auto time7 = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_total = time7 - time0; - std::chrono::duration elapsed0 = time1 - time0; - std::chrono::duration elapsed1 = time2 - time1; - std::chrono::duration elapsed2 = time3 - time2; - std::chrono::duration elapsed3 = time4 - time3; - std::chrono::duration elapsed4 = time5 - time4; - std::chrono::duration elapsed5 = time6 - time5; - std::chrono::duration elapsed6 = time7 - time6; - std::cout << "Edge generation took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << "," << elapsed5.count() * 1e3 << "," << elapsed6.count() * 1e3 << ") ms." << std::endl; -#endif return std::make_tuple( std::move(src_v), std::move(dst_v), From 4852ce4536e7fef9e1364eaaf11a8c59ac01bc74 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 3 Feb 2022 14:06:21 -0800 Subject: [PATCH 15/60] clang-format & copyright year --- cpp/include/cugraph/detail/graph_utils.cuh | 5 +++-- cpp/include/cugraph/utilities/shuffle_comm.cuh | 2 +- cpp/src/generators/generate_rmat_edgelist.cu | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/include/cugraph/detail/graph_utils.cuh b/cpp/include/cugraph/detail/graph_utils.cuh index ca918c53a62..254744d11d9 100644 --- a/cpp/include/cugraph/detail/graph_utils.cuh +++ b/cpp/include/cugraph/detail/graph_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -80,7 +80,8 @@ struct compute_partition_id_from_edge_t { template struct is_first_in_run_t { vertex_t const* vertices{nullptr}; - __device__ bool operator()(size_t i) const { + __device__ bool operator()(size_t i) const + { return (i == 0) || (vertices[i - 1] != vertices[i]); } }; diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 3840de019fc..b1b60f49fde 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
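For reference, a minimal usage sketch of detail::is_first_in_run_t as reformatted above (an
illustration only, not part of these patches; it assumes a sorted rmm::device_uvector<vertex_t>
named sorted_vertices and a raft::handle_t named handle, and mirrors the thrust::count_if call
sites in renumber_edgelist_impl.cuh):

  // count distinct values in a sorted device array by flagging the first element of each run
  auto num_uniques =
    thrust::count_if(handle.get_thrust_policy(),
                     thrust::make_counting_iterator(size_t{0}),
                     thrust::make_counting_iterator(sorted_vertices.size()),
                     cugraph::detail::is_first_in_run_t<vertex_t>{sorted_vertices.data()});

Flagging run starts turns the unique count into a single count_if pass over indices, avoiding a
materialized thrust::unique output and the device-lambda workaround issue noted in PATCH 07.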
diff --git a/cpp/src/generators/generate_rmat_edgelist.cu b/cpp/src/generators/generate_rmat_edgelist.cu index 07b01853fdd..8aa33d744f7 100644 --- a/cpp/src/generators/generate_rmat_edgelist.cu +++ b/cpp/src/generators/generate_rmat_edgelist.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 96e9693d9dc7777377ba54473e1be60e6141d693 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 3 Feb 2022 17:46:23 -0800 Subject: [PATCH 16/60] add temporary performance measurement code --- .../cugraph/utilities/shuffle_comm.cuh | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 3840de019fc..04fcbf749b7 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -244,11 +244,23 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, ValueToGPUIdOp value_to_gpu_id_op, rmm::cuda_stream_view stream_view) { +#if 1 // FIXME: delete + rmm::device_uvector dummy(1, stream_view); + stream_view.synchronize(); + comm.allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, stream_view); + auto time0 = std::chrono::steady_clock::now(); +#endif auto const comm_size = comm.get_size(); auto d_tx_value_counts = groupby_and_count( tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream_view); +#if 1 // FIXME: delete + stream_view.synchronize(); + auto time1 = std::chrono::steady_clock::now(); + comm.allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, stream_view); + auto time2 = std::chrono::steady_clock::now(); +#endif std::vector tx_counts{}; std::vector tx_offsets{}; std::vector tx_dst_ranks{}; @@ -262,6 +274,12 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, allocate_dataframe_buffer::value_type>( rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); +#if 1 // FIXME: delete + stream_view.synchronize(); + auto time3 = std::chrono::steady_clock::now(); + comm.allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, stream_view); + auto time4 = std::chrono::steady_clock::now(); +#endif // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). 
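  // (each rank sends tx_counts[i] values starting at tx_offsets[i] to rank tx_dst_ranks[i] and
  // receives rx_counts[j] values at rx_offsets[j] from rank rx_src_ranks[j])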
device_multicast_sendrecv(comm, @@ -275,6 +293,10 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, rx_src_ranks, stream_view); +#if 1 // FIXME: delete + stream_view.synchronize(); + auto time5 = std::chrono::steady_clock::now(); +#endif if (rx_counts.size() < static_cast(comm_size)) { std::vector tmp_rx_counts(comm_size, size_t{0}); for (size_t i = 0; i < rx_src_ranks.size(); ++i) { @@ -282,6 +304,18 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, } rx_counts = std::move(tmp_rx_counts); } +#if 1 // FIXME: delete + stream_view.synchronize(); + auto time6 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_total = time6 - time0; + std::chrono::duration elapsed0 = time1 - time0; + std::chrono::duration elapsed1 = time2 - time1; + std::chrono::duration elapsed2 = time3 - time2; + std::chrono::duration elapsed3 = time4 - time3; + std::chrono::duration elapsed4 = time5 - time4; + std::chrono::duration elapsed5 = time6 - time5; + std::cout << "Shuffle values (" << thrust::distance(tx_value_first, tx_value_last) << ") took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << "," << elapsed5.count() * 1e3 << ") ms." << std::endl; +#endif return std::make_tuple(std::move(rx_value_buffer), rx_counts); } From e8769d34b1d8113dae19a348aef97722914051bd Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 3 Feb 2022 23:35:14 -0800 Subject: [PATCH 17/60] add temporary performance measurement code to PageRank implementation --- cpp/src/link_analysis/pagerank_impl.cuh | 35 +++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index b6023d21bf2..a4df330c617 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -192,6 +192,14 @@ void pagerank( row_properties_t adj_matrix_row_pageranks(handle, pull_graph_view); size_t iter{0}; while (true) { +#if 1 // FIXME: delete + handle.sync_stream(); + if constexpr (GraphViewType::is_multi_gpu) { + rmm::device_uvector dummy(1, handle.get_stream()); + handle.get_comms().allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); + } + auto time0 = std::chrono::steady_clock::now(); +#endif thrust::copy(handle.get_thrust_policy(), pageranks, pageranks + pull_graph_view.get_number_of_local_vertices(), @@ -223,8 +231,16 @@ void pagerank( return pagerank / divisor; }); +#if 1 // FIXME: delete + handle.sync_stream(); + auto time1 = std::chrono::steady_clock::now(); +#endif copy_to_adj_matrix_row(handle, pull_graph_view, pageranks, adj_matrix_row_pageranks); +#if 1 // FIXME: delete + handle.sync_stream(); + auto time2 = std::chrono::steady_clock::now(); +#endif auto unvarying_part = aggregate_personalization_vector_size == 0 ? 
(dangling_sum * alpha + static_cast(1.0 - alpha)) / static_cast(num_vertices) @@ -241,6 +257,10 @@ void pagerank( unvarying_part, pageranks); +#if 1 // FIXME: delete + handle.sync_stream(); + auto time3 = std::chrono::steady_clock::now(); +#endif if (aggregate_personalization_vector_size > 0) { auto vertex_partition = vertex_partition_device_view_t( pull_graph_view.get_vertex_partition_view()); @@ -260,6 +280,10 @@ void pagerank( }); } +#if 1 // FIXME: delete + handle.sync_stream(); + auto time4 = std::chrono::steady_clock::now(); +#endif auto diff_sum = transform_reduce_v( handle, pull_graph_view, @@ -267,6 +291,17 @@ void pagerank( [] __device__(auto val) { return std::abs(thrust::get<0>(val) - thrust::get<1>(val)); }, result_t{0.0}); +#if 1 // FIXME: delete + handle.sync_stream(); + auto time5 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_total = time5 - time0; + std::chrono::duration elapsed0 = time1 - time0; + std::chrono::duration elapsed1 = time2 - time1; + std::chrono::duration elapsed2 = time3 - time2; + std::chrono::duration elapsed3 = time4 - time3; + std::chrono::duration elapsed4 = time5 - time4; + std::cout << "PageRank iter " << iter << " took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << ") ms." << std::endl; +#endif iter++; if (diff_sum < epsilon) { From dc6acefb5a6386283a7dcb8d061dca1a4bbd33e5 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 4 Feb 2022 00:27:28 -0800 Subject: [PATCH 18/60] add more performance measurements to MG PageRank test --- cpp/tests/link_analysis/mg_pagerank_test.cpp | 87 ++++++++++++++++++-- 1 file changed, 79 insertions(+), 8 deletions(-) diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp index df264f2e0e1..5b60a130728 100644 --- a/cpp/tests/link_analysis/mg_pagerank_test.cpp +++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp @@ -61,26 +61,82 @@ class Tests_MGPageRank { // 1. 
initialize handle

-    raft::handle_t handle(rmm::cuda_stream_per_thread, std::make_shared<rmm::cuda_stream_pool>());
+    auto constexpr pool_size = 64;  // FIXME: tuning parameter
+    raft::handle_t handle(rmm::cuda_stream_per_thread, std::make_shared<rmm::cuda_stream_pool>(pool_size));
     HighResClock hr_clock{};

+#if 1  // FIXME: delete
+    auto time0 = std::chrono::steady_clock::now();
+#endif
     raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
     auto& comm = handle.get_comms();
     auto const comm_size = comm.get_size();
     auto const comm_rank = comm.get_rank();

-    auto row_comm_size = static_cast<int>(sqrt(static_cast<double>(comm_size)));
-    while (comm_size % row_comm_size != 0) {
-      --row_comm_size;
+    int row_comm_size{};
+    int num_gpus_per_node{};
+    RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node));
+    if (comm_size > num_gpus_per_node) {  // multi-node, inter-node communication bandwidth
+                                          // (Infiniband) is more likely to be a bottleneck than
+                                          // intra-node (NVLink) communication bandwidth
+      CUGRAPH_EXPECTS((comm_size % num_gpus_per_node) == 0,
+                      "Invalid MPI configuration: in multi-node execution, # MPI processes should "
+                      "be a multiple of the number of GPUs per node.");
+      auto num_nodes = comm_size / num_gpus_per_node;
+      row_comm_size = static_cast<int>(sqrt(static_cast<double>(num_nodes)));
+      while (num_nodes % row_comm_size != 0) {
+        --row_comm_size;
+      }
+      row_comm_size *= num_gpus_per_node;
+    } else {
+      row_comm_size = static_cast<int>(sqrt(static_cast<double>(comm_size)));
+      while (comm_size % row_comm_size != 0) {
+        --row_comm_size;
+      }
     }
+
     cugraph::partition_2d::subcomm_factory_t<cugraph::partition_2d::key_naming_t, vertex_t> subcomm_factory(handle, row_comm_size);

+#if 1  // FIXME: delete
+    {
+      rmm::device_uvector<int32_t> tx_ints(comm_size, handle.get_stream());
+      rmm::device_uvector<int32_t> rx_ints(comm_size, handle.get_stream());
+      std::vector<size_t> tx_sizes(comm_size, size_t{1});
+      std::vector<size_t> tx_offsets(comm_size);
+      std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0});
+      std::vector<int32_t> tx_ranks(comm_size);
+      std::iota(tx_ranks.begin(), tx_ranks.end(), int32_t{0});
+      auto rx_sizes = tx_sizes;
+      auto rx_offsets = tx_offsets;
+      auto rx_ranks = tx_ranks;
+      handle.get_comms().device_multicast_sendrecv(tx_ints.data(),
+                                                   tx_sizes,
+                                                   tx_offsets,
+                                                   tx_ranks,
+                                                   rx_ints.data(),
+                                                   rx_sizes,
+                                                   rx_offsets,
+                                                   rx_ranks,
+                                                   handle.get_stream());
+      handle.sync_stream();
+    }
+    auto time1 = std::chrono::steady_clock::now();
+    std::chrono::duration<double> elapsed = time1 - time0;
+    std::cout << "Handle initialization and 1st all-to-all (comm_size=" << comm_size
+              << ", row_comm_size=" << row_comm_size << ") took " << elapsed.count() * 1e3 << " ms."
+              << std::endl;
+#endif

    // 2.
create MG graph if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle.get_comms().barrier(); +#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally + // use NCCL All-Reduce instead of MPI barrier + rmm::device_uvector dummy(1, handle.get_stream()); + handle.get_comms().allreduce( + dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); +#endif hr_clock.start(); } @@ -90,7 +146,12 @@ class Tests_MGPageRank if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle.get_comms().barrier(); +#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally + // use NCCL All-Reduce instead of MPI barrier + rmm::device_uvector dummy(1, handle.get_stream()); + handle.get_comms().allreduce( + dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); +#endif double elapsed_time{0.0}; hr_clock.stop(&elapsed_time); std::cout << "MG construct_graph took " << elapsed_time * 1e-6 << " s.\n"; @@ -155,7 +216,12 @@ class Tests_MGPageRank if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle.get_comms().barrier(); +#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally + // use NCCL All-Reduce instead of MPI barrier + rmm::device_uvector dummy(1, handle.get_stream()); + handle.get_comms().allreduce( + dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); +#endif hr_clock.start(); } @@ -180,7 +246,12 @@ class Tests_MGPageRank if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle.get_comms().barrier(); +#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally + // use NCCL All-Reduce instead of MPI barrier + rmm::device_uvector dummy(1, handle.get_stream()); + handle.get_comms().allreduce( + dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); +#endif double elapsed_time{0.0}; hr_clock.stop(&elapsed_time); std::cout << "MG PageRank took " << elapsed_time * 1e-6 << " s.\n"; From 8136b77130fe2cb57bdd1f6824ea622025a47e4e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 4 Feb 2022 00:40:22 -0800 Subject: [PATCH 19/60] add more experimental code (should be cleaned-up before merge) --- .../copy_v_transform_reduce_in_out_nbr.cuh | 189 +++++++++--------- 1 file changed, 98 insertions(+), 91 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh index fbd339fdf32..63f1aae6c8a 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh @@ -517,15 +517,16 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } std::optional> stream_pool_indices{std::nullopt}; - size_t num_concurrent_loops{1}; // FIXME: this can go inside the loop after temporary testing -#if 1 // FIXME: for temporary testing - std::vector high_streams0{}; - std::vector high_streams1{}; - std::vector mid_streams{}; - std::vector low_streams{}; +#if 1 // FIXME: for temporary testing + std::vector pool_streams{}; #endif if constexpr (GraphViewType::is_multi_gpu) { - if (handle.get_stream_pool_size() >= max_segments) { + if 
((graph_view.get_local_adj_matrix_partition_segment_offsets(0)) && + (handle.get_stream_pool_size() >= max_segments)) { + for (size_t i = 1; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + assert(graph_view.get_local_adj_matrix_partition_segment_offsets(i)); + } + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); @@ -534,8 +535,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, // update_major ? V / comm_size * sizeof(T) : 0 // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) - num_concurrent_loops = - std::min(static_cast(col_comm_size), handle.get_stream_pool_size() / max_segments); + size_t num_streams = std::min(static_cast(col_comm_size) * max_segments, + (handle.get_stream_pool_size() / max_segments) * max_segments); if constexpr (update_major) { size_t value_size{0}; if constexpr (is_thrust_tuple_of_arithmetic::value) { @@ -550,60 +551,87 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, static_cast(graph_view.get_number_of_vertices())) : double{0.0}; - num_concurrent_loops = + num_streams = std::min(static_cast(avg_vertex_degree * (static_cast(sizeof(vertex_t)) / - static_cast(value_size))), - num_concurrent_loops); + static_cast(value_size))) * + max_segments, + num_streams); } + if (num_streams >= max_segments) { #if 1 // FIXME: for temporary testing - high_streams0.resize(num_concurrent_loops); - high_streams1.resize(num_concurrent_loops); - mid_streams.resize(num_concurrent_loops); - low_streams.resize(num_concurrent_loops); - for (size_t i = 0; i < num_concurrent_loops; ++i) { - CUDA_TRY(cudaStreamCreateWithPriority(&high_streams0[i], cudaStreamNonBlocking, -2)); - CUDA_TRY(cudaStreamCreateWithPriority(&high_streams1[i], cudaStreamNonBlocking, -2)); - CUDA_TRY(cudaStreamCreateWithPriority(&mid_streams[i], cudaStreamNonBlocking, -1)); - CUDA_TRY(cudaStreamCreateWithPriority(&low_streams[i], cudaStreamNonBlocking, 0)); - } + pool_streams.resize(num_streams); + for (size_t i = 0; i < pool_streams.size() / max_segments; ++i) { + static_assert(max_segments == 4); + CUDA_TRY(cudaStreamCreateWithPriority( + &pool_streams[i * max_segments], cudaStreamNonBlocking, -2)); + CUDA_TRY(cudaStreamCreateWithPriority( + &pool_streams[i * max_segments + 1], cudaStreamNonBlocking, -2)); + CUDA_TRY(cudaStreamCreateWithPriority( + &pool_streams[i * max_segments + 2], cudaStreamNonBlocking, -1)); + CUDA_TRY(cudaStreamCreateWithPriority( + &pool_streams[i * max_segments + 3], cudaStreamNonBlocking, 0)); + } #endif - stream_pool_indices = std::vector(num_concurrent_loops * max_segments); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); + stream_pool_indices = std::vector(num_streams); + std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); + handle.sync_stream(); + } } } - for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - auto loop_stream = stream_pool_indices - ? 
rmm::cuda_stream_view{high_streams0[i % num_concurrent_loops]} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((i * - max_segments) % - (*stream_pool_indices).size()) */ - : handle.get_stream(); + std::vector(0, rmm::cuda_stream_view{}))> + major_tmp_buffers{}; + if constexpr (GraphViewType::is_multi_gpu && update_major) { + std::vector major_tmp_buffer_sizes( + graph_view.get_number_of_local_adj_matrix_partitions(), size_t{0}); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + major_tmp_buffer_sizes[i] = GraphViewType::is_adj_matrix_transposed + ? graph_view.get_number_of_local_adj_matrix_partition_cols(i) + : graph_view.get_number_of_local_adj_matrix_partition_rows(i); + } + if (stream_pool_indices) { + auto num_concurrent_loops = (*stream_pool_indices).size() / max_segments; + major_tmp_buffers.reserve(num_concurrent_loops); + for (size_t i = 0; i < num_concurrent_loops; ++i) { + size_t max_size{0}; + for (size_t j = i; j < graph_view.get_number_of_local_adj_matrix_partitions(); + j += num_concurrent_loops) { + max_size = std::max(major_tmp_buffer_sizes[j], max_size); + } + major_tmp_buffers.push_back(allocate_dataframe_buffer(max_size, handle.get_stream())); + } + } else { + major_tmp_buffers.reserve(1); + major_tmp_buffers.push_back(allocate_dataframe_buffer( + *std::max_element(major_tmp_buffer_sizes.begin(), major_tmp_buffer_sizes.end()), + handle.get_stream())); + } + } else { // dummy + major_tmp_buffers.reserve(1); + major_tmp_buffers.push_back(allocate_dataframe_buffer(size_t{0}, handle.get_stream())); + } + if (stream_pool_indices) { handle.sync_stream(); } + + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { auto matrix_partition = matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)); - auto major_tmp_buffer_size = - GraphViewType::is_multi_gpu && update_major ? matrix_partition.get_major_size() : vertex_t{0}; - auto major_tmp_buffer = allocate_dataframe_buffer(major_tmp_buffer_size, loop_stream); - auto major_buffer_first = get_dataframe_buffer_begin(major_tmp_buffer); - auto major_init = T{}; if constexpr (update_major) { if constexpr (GraphViewType::is_multi_gpu) { auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - major_init = (col_comm_rank == 0) ? init : T{}; + major_init = (static_cast(i) == col_comm_rank) ? init : T{}; } else { major_init = init; } } + // FIXME: need to double check whether this leads to actual copy auto matrix_partition_row_value_input = adj_matrix_row_value_input; auto matrix_partition_col_value_input = adj_matrix_col_value_input; if constexpr (GraphViewType::is_adj_matrix_transposed) { @@ -612,6 +640,9 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i); } + auto major_buffer_first = + get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); + std::conditional_t{(i * max_segments) % - (*stream_pool_indices).size()}); */ - } - auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); if (segment_offsets) { static_assert(detail::num_sparse_segments_per_vertex_partition == 3); @@ -644,11 +668,12 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, // running segmented reduction if (matrix_partition.get_dcs_nzd_vertex_count()) { auto exec_stream = stream_pool_indices - ? 
rmm::cuda_stream_view{high_streams0[i % num_concurrent_loops]} + ? rmm::cuda_stream_view{pool_streams[(i * max_segments) % + (*stream_pool_indices).size()]} /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) */ - : loop_stream; + : handle.get_stream(); if constexpr (update_major) { // this is necessary as we don't visit every vertex in the // hypersparse segment in // for_all_major_for_all_nbr_hypersparse @@ -676,11 +701,11 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { auto exec_stream = stream_pool_indices - ? rmm::cuda_stream_view{high_streams1[i % num_concurrent_loops]} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size()) */ - : loop_stream; + ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 1) % + (*stream_pool_indices).size()]} + /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * + max_segments + 1) % (*stream_pool_indices).size()) */ + : handle.get_stream(); raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -699,11 +724,11 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { auto exec_stream = stream_pool_indices - ? rmm::cuda_stream_view{mid_streams[i % num_concurrent_loops]} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size()) */ - : loop_stream; + ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 2) % + (*stream_pool_indices).size()]} + /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * + max_segments + 2) % (*stream_pool_indices).size()) */ + : handle.get_stream(); raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -722,11 +747,11 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } if ((*segment_offsets)[1] > 0) { auto exec_stream = stream_pool_indices - ? rmm::cuda_stream_view{low_streams[i % num_concurrent_loops]} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size()) */ - : loop_stream; + ? 
rmm::cuda_stream_view{pool_streams[(i * max_segments + 3) % + (*stream_pool_indices).size()]} + /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * + max_segments + 3) % (*stream_pool_indices).size()) */ + : handle.get_stream(); raft::grid_1d_block_t update_grid((*segment_offsets)[1], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -747,7 +772,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); detail::for_all_major_for_all_nbr_low_degree - <<>>( + <<>>( matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_last(), @@ -776,19 +801,19 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, (*segment_offsets).back() - (*segment_offsets)[3], raft::comms::op_t::SUM, i, - high_streams0[i % num_concurrent_loops]/* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) */); + pool_streams[(i * max_segments) % (*stream_pool_indices).size()]/* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) */); } if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { device_reduce( - col_comm, major_buffer_first + (*segment_offsets)[2], vertex_value_output_first + (*segment_offsets)[2], (*segment_offsets)[3] - (*segment_offsets)[2], raft::comms::op_t::SUM, i, high_streams1[i % num_concurrent_loops] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()) */); + col_comm, major_buffer_first + (*segment_offsets)[2], vertex_value_output_first + (*segment_offsets)[2], (*segment_offsets)[3] - (*segment_offsets)[2], raft::comms::op_t::SUM, i, pool_streams[(i * max_segments + 1) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()) */); } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { device_reduce( - col_comm, major_buffer_first + (*segment_offsets)[1], vertex_value_output_first + (*segment_offsets)[1], (*segment_offsets)[2] - (*segment_offsets)[1], raft::comms::op_t::SUM, i, mid_streams[i % num_concurrent_loops] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()) */); + col_comm, major_buffer_first + (*segment_offsets)[1], vertex_value_output_first + (*segment_offsets)[1], (*segment_offsets)[2] - (*segment_offsets)[1], raft::comms::op_t::SUM, i, pool_streams[(i * max_segments + 2) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()) */); } if ((*segment_offsets)[1] > 0) { device_reduce( - col_comm, major_buffer_first, vertex_value_output_first, (*segment_offsets)[1], raft::comms::op_t::SUM, i, low_streams[i % num_concurrent_loops] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()) */); + col_comm, major_buffer_first, vertex_value_output_first, (*segment_offsets)[1], raft::comms::op_t::SUM, i, pool_streams[(i * max_segments + 3) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()) */); } } else { device_reduce(col_comm, @@ -797,24 +822,15 @@ 
void copy_v_transform_reduce_nbr(raft::handle_t const& handle, matrix_partition.get_major_size(), raft::comms::op_t::SUM, i, - loop_stream); + handle.get_stream()); } } } if (stream_pool_indices) { #if 1 // FIXME: for temporary testing - for (size_t i = 0; i < high_streams0.size(); ++i) { - CUDA_TRY(cudaStreamSynchronize(high_streams0[i])); - } - for (size_t i = 0; i < high_streams1.size(); ++i) { - CUDA_TRY(cudaStreamSynchronize(high_streams1[i])); - } - for (size_t i = 0; i < mid_streams.size(); ++i) { - CUDA_TRY(cudaStreamSynchronize(mid_streams[i])); - } - for (size_t i = 0; i < low_streams.size(); ++i) { - CUDA_TRY(cudaStreamSynchronize(low_streams[i])); + for (size_t i = 0; i < pool_streams.size(); ++i) { + CUDA_TRY(cudaStreamSynchronize(pool_streams[i])); } #else handle.sync_stream_pool(*stream_pool_indices); @@ -886,17 +902,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, // FIXME: for temporary testing #if 1 if (stream_pool_indices) { - for (size_t i = 0; i < low_streams.size(); ++i) { - CUDA_TRY(cudaStreamDestroy(low_streams[i])); - } - for (size_t i = 0; i < mid_streams.size(); ++i) { - CUDA_TRY(cudaStreamDestroy(mid_streams[i])); - } - for (size_t i = 0; i < high_streams1.size(); ++i) { - CUDA_TRY(cudaStreamDestroy(high_streams1[i])); - } - for (size_t i = 0; i < high_streams0.size(); ++i) { - CUDA_TRY(cudaStreamDestroy(high_streams0[i])); + for (size_t i = 0; i < pool_streams.size(); ++i) { + CUDA_TRY(cudaStreamDestroy(pool_streams[i])); } } #endif From f28ccfa186e90bd7390f945fe9e5c3c9f602268f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 9 Feb 2022 16:28:52 -0800 Subject: [PATCH 20/60] remove some temporary code --- .../cugraph/utilities/shuffle_comm.cuh | 34 ------------------- cpp/tests/link_analysis/mg_pagerank_test.cpp | 28 +++------------ 2 files changed, 4 insertions(+), 58 deletions(-) diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index cd5828f9554..f10f9db95e1 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -246,23 +246,11 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, ValueToGPUIdOp value_to_gpu_id_op, rmm::cuda_stream_view stream_view) { -#if 1 // FIXME: delete - rmm::device_uvector dummy(1, stream_view); - stream_view.synchronize(); - comm.allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, stream_view); - auto time0 = std::chrono::steady_clock::now(); -#endif auto const comm_size = comm.get_size(); auto d_tx_value_counts = groupby_and_count( tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream_view); -#if 1 // FIXME: delete - stream_view.synchronize(); - auto time1 = std::chrono::steady_clock::now(); - comm.allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, stream_view); - auto time2 = std::chrono::steady_clock::now(); -#endif std::vector tx_counts{}; std::vector tx_offsets{}; std::vector tx_dst_ranks{}; @@ -276,12 +264,6 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, allocate_dataframe_buffer::value_type>( rx_offsets.size() > 0 ? 
rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); -#if 1 // FIXME: delete - stream_view.synchronize(); - auto time3 = std::chrono::steady_clock::now(); - comm.allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, stream_view); - auto time4 = std::chrono::steady_clock::now(); -#endif // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). device_multicast_sendrecv(comm, @@ -295,10 +277,6 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, rx_src_ranks, stream_view); -#if 1 // FIXME: delete - stream_view.synchronize(); - auto time5 = std::chrono::steady_clock::now(); -#endif if (rx_counts.size() < static_cast(comm_size)) { std::vector tmp_rx_counts(comm_size, size_t{0}); for (size_t i = 0; i < rx_src_ranks.size(); ++i) { @@ -306,18 +284,6 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, } rx_counts = std::move(tmp_rx_counts); } -#if 1 // FIXME: delete - stream_view.synchronize(); - auto time6 = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_total = time6 - time0; - std::chrono::duration elapsed0 = time1 - time0; - std::chrono::duration elapsed1 = time2 - time1; - std::chrono::duration elapsed2 = time3 - time2; - std::chrono::duration elapsed3 = time4 - time3; - std::chrono::duration elapsed4 = time5 - time4; - std::chrono::duration elapsed5 = time6 - time5; - std::cout << "Shuffle values (" << thrust::distance(tx_value_first, tx_value_last) << ") took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << "," << elapsed5.count() * 1e3 << ") ms." 
<< std::endl; -#endif return std::make_tuple(std::move(rx_value_buffer), rx_counts); } diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp index 5b60a130728..c2a9bf74e2e 100644 --- a/cpp/tests/link_analysis/mg_pagerank_test.cpp +++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp @@ -131,12 +131,7 @@ class Tests_MGPageRank if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement -#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally - // use NCCL All-Reduce instead of MPI barrier - rmm::device_uvector dummy(1, handle.get_stream()); - handle.get_comms().allreduce( - dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); -#endif + handle.get_comms().barrier(); hr_clock.start(); } @@ -146,12 +141,7 @@ class Tests_MGPageRank if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement -#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally - // use NCCL All-Reduce instead of MPI barrier - rmm::device_uvector dummy(1, handle.get_stream()); - handle.get_comms().allreduce( - dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); -#endif + handle.get_comms().barrier(); double elapsed_time{0.0}; hr_clock.stop(&elapsed_time); std::cout << "MG construct_graph took " << elapsed_time * 1e-6 << " s.\n"; @@ -216,12 +206,7 @@ class Tests_MGPageRank if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement -#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally - // use NCCL All-Reduce instead of MPI barrier - rmm::device_uvector dummy(1, handle.get_stream()); - handle.get_comms().allreduce( - dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); -#endif + handle.get_comms().barrier(); hr_clock.start(); } @@ -246,12 +231,7 @@ class Tests_MGPageRank if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement -#if 1 // FIXME: should use handle.get_comms().barrier() instead once raft is updated to internally - // use NCCL All-Reduce instead of MPI barrier - rmm::device_uvector dummy(1, handle.get_stream()); - handle.get_comms().allreduce( - dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); -#endif + handle.get_comms().barrier(); double elapsed_time{0.0}; hr_clock.stop(&elapsed_time); std::cout << "MG PageRank took " << elapsed_time * 1e-6 << " s.\n"; From 1839a61d4ee37f98aeedf67fa4589f934ac5f54b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 9 Feb 2022 16:30:31 -0800 Subject: [PATCH 21/60] undo some temporary fix --- cpp/src/link_analysis/pagerank_impl.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index a4df330c617..e346a6892b9 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -195,8 +195,7 @@ void pagerank( #if 1 // FIXME: delete handle.sync_stream(); if constexpr (GraphViewType::is_multi_gpu) { - rmm::device_uvector dummy(1, handle.get_stream()); - handle.get_comms().allreduce(dummy.data(), dummy.data(), 1, raft::comms::op_t::SUM, handle.get_stream()); + handle.get_comms().barrier(); } auto time0 = std::chrono::steady_clock::now(); #endif From 
7cb1f03b2d832ee3290f00158cd3c124e2d8187b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 10 Feb 2022 00:06:42 -0800 Subject: [PATCH 22/60] remove host_barrier (no longer used) --- cpp/CMakeLists.txt | 1 - .../cugraph/utilities/host_barrier.hpp | 26 ----- cpp/src/utilities/host_barrier.cpp | 104 ------------------ 3 files changed, 131 deletions(-) delete mode 100644 cpp/include/cugraph/utilities/host_barrier.hpp delete mode 100644 cpp/src/utilities/host_barrier.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ec3141343b4..0ec6b249df0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -228,7 +228,6 @@ add_library(cugraph SHARED src/structure/create_graph_from_edgelist_mg.cu src/structure/symmetrize_edgelist_sg.cu src/structure/symmetrize_edgelist_mg.cu - src/utilities/host_barrier.cpp src/visitors/graph_envelope.cpp src/visitors/visitors_factory.cpp src/visitors/bfs_visitor.cpp diff --git a/cpp/include/cugraph/utilities/host_barrier.hpp b/cpp/include/cugraph/utilities/host_barrier.hpp deleted file mode 100644 index 6825814eb93..00000000000 --- a/cpp/include/cugraph/utilities/host_barrier.hpp +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace cugraph { - -// FIXME: a temporary hack till UCC is integrated into RAFT (so we can use UCC barrier for DASK and -// MPI barrier for MPI) -void host_barrier(raft::comms::comms_t const& comm, rmm::cuda_stream_view stream_view); - -} // namespace cugraph diff --git a/cpp/src/utilities/host_barrier.cpp b/cpp/src/utilities/host_barrier.cpp deleted file mode 100644 index 2887350ad4d..00000000000 --- a/cpp/src/utilities/host_barrier.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include - -#include - -namespace cugraph { - -// FIXME: a temporary hack till UCC is integrated into RAFT (so we can use UCC barrier for DASK and -// MPI barrier for MPI) -void host_barrier(raft::comms::comms_t const& comm, rmm::cuda_stream_view stream_view) -{ - stream_view.synchronize(); - - auto const comm_size = comm.get_size(); - auto const comm_rank = comm.get_rank(); - - // k-tree barrier - - int constexpr k = 2; - static_assert(k >= 2); - std::vector requests(k - 1); - std::vector dummies(k - 1); - - // up - - int mod = 1; - while (mod < comm_size) { - if (comm_rank % mod == 0) { - auto level_rank = comm_rank / mod; - if (level_rank % k == 0) { - auto num_irecvs = 0; - ; - for (int i = 1; i < k; ++i) { - auto src_rank = (level_rank + i) * mod; - if (src_rank < comm_size) { - comm.irecv(dummies.data() + (i - 1), - sizeof(std::byte), - src_rank, - int{0} /* tag */, - requests.data() + (i - 1)); - ++num_irecvs; - } - } - comm.waitall(num_irecvs, requests.data()); - } else { - comm.isend(dummies.data(), - sizeof(std::byte), - (level_rank - (level_rank % k)) * mod, - int{0} /* tag */, - requests.data()); - comm.waitall(1, requests.data()); - } - } - mod *= k; - } - - // down - - mod /= k; - while (mod >= 1) { - if (comm_rank % mod == 0) { - auto level_rank = comm_rank / mod; - if (level_rank % k == 0) { - auto num_isends = 0; - for (int i = 1; i < k; ++i) { - auto dst_rank = (level_rank + i) * mod; - if (dst_rank < comm_size) { - comm.isend(dummies.data() + (i - 1), - sizeof(std::byte), - dst_rank, - int{0} /* tag */, - requests.data() + (i - 1)); - ++num_isends; - } - } - comm.waitall(num_isends, requests.data()); - } else { - comm.irecv(dummies.data(), - sizeof(std::byte), - (level_rank - (level_rank % k)) * mod, - int{0} /* tag */, - requests.data()); - comm.waitall(1, requests.data()); - } - } - mod /= k; - } -} - -} // namespace cugraph From eefe72984aaa423f7668b0c5c714016145e29053 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 12 Feb 2022 12:41:52 -0800 Subject: [PATCH 23/60] reduce temporary memory requirement in R-mat edge list generation --- cpp/tests/utilities/test_graphs.hpp | 178 +++++++++++++++++----------- 1 file changed, 112 insertions(+), 66 deletions(-) diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 9fa4cee9f7a..0934beb466a 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -146,23 +146,33 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { CUGRAPH_EXPECTS(((size_t{1} << scale_) * edge_factor_) <= static_cast(std::numeric_limits::max()), "Invalid template parameter: (scale_, edge_factor_) too large for edge_t"); + // generate in multi-partitions to limit peak memory usage (thrust::sort & + // shuffle_edgelist_by_gpu_id requires a temporary buffer with the size of the original data) + // With the current implementation, the temporary memory requirement is roughly 50% of the + // original data with num_partitions_per_gpu = 2. If we use cuMemAddressReserve + // (https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management), we + // can reduce the temporary memory requirement to (1 / num_partitions) * (original data size) + size_t constexpr num_partitions_per_gpu = 2; - std::vector partition_ids(1); - size_t num_partitions; + // 1. 
calculate # partitions, # edges to generate in each partition, and partition vertex ranges + + std::vector partition_ids{}; + size_t num_partitions{}; if (multi_gpu_usecase_) { auto& comm = handle.get_comms(); - num_partitions = comm.get_size(); + num_partitions = comm.get_size() * num_partitions_per_gpu; auto const comm_rank = comm.get_rank(); - partition_ids.resize(multi_gpu ? size_t{1} : static_cast(num_partitions)); + partition_ids.resize(multi_gpu ? num_partitions_per_gpu : num_partitions); std::iota(partition_ids.begin(), partition_ids.end(), - multi_gpu ? static_cast(comm_rank) : size_t{0}); + multi_gpu ? static_cast(comm_rank) * num_partitions_per_gpu : size_t{0}); } else { - num_partitions = 1; - partition_ids[0] = size_t{0}; + num_partitions = num_partitions_per_gpu; + partition_ids.resize(num_partitions); + std::iota(partition_ids.begin(), partition_ids.end(), size_t{0}); } vertex_t number_of_vertices = static_cast(size_t{1} << scale_); @@ -191,17 +201,20 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { } } - rmm::device_uvector src_v(0, handle.get_stream()); - rmm::device_uvector dst_v(0, handle.get_stream()); - auto weights_v = test_weighted - ? std::make_optional>(0, handle.get_stream()) - : std::nullopt; + // 2. generate edges + + std::vector> src_partitions{}; + std::vector> dst_partitions{}; + auto weight_partitions = test_weighted + ? std::make_optional>>() + : std::nullopt; + src_partitions.reserve(partition_ids.size()); + dst_partitions.reserve(partition_ids.size()); + if (weight_partitions) { (*weight_partitions).reserve(partition_ids.size()); } for (size_t i = 0; i < partition_ids.size(); ++i) { auto id = partition_ids[i]; - rmm::device_uvector tmp_src_v(0, handle.get_stream()); - rmm::device_uvector tmp_dst_v(0, handle.get_stream()); - std::tie(i == 0 ? src_v : tmp_src_v, i == 0 ? dst_v : tmp_dst_v) = + auto [tmp_src_v, tmp_dst_v] = cugraph::generate_rmat_edgelist(handle, scale_, partition_edge_counts[i], @@ -212,79 +225,112 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { undirected_ ? true : false); std::optional> tmp_weights_v{std::nullopt}; - if (weights_v) { - if (i == 0) { - weights_v->resize(src_v.size(), handle.get_stream()); - } else { - tmp_weights_v = std::make_optional>(tmp_src_v.size(), - handle.get_stream()); - } + if (weight_partitions) { + tmp_weights_v = + std::make_optional>(tmp_src_v.size(), handle.get_stream()); cugraph::detail::uniform_random_fill(handle.get_stream(), - i == 0 ? weights_v->data() : tmp_weights_v->data(), - i == 0 ? 
weights_v->size() : tmp_weights_v->size(), + tmp_weights_v->data(), + tmp_weights_v->size(), weight_t{0.0}, weight_t{1.0}, seed_ + num_partitions + id); } - if (i > 0) { - auto start_offset = src_v.size(); - src_v.resize(start_offset + tmp_src_v.size(), handle.get_stream()); - dst_v.resize(start_offset + tmp_dst_v.size(), handle.get_stream()); - raft::copy( - src_v.begin() + start_offset, tmp_src_v.begin(), tmp_src_v.size(), handle.get_stream()); - raft::copy( - dst_v.begin() + start_offset, tmp_dst_v.begin(), tmp_dst_v.size(), handle.get_stream()); - - if (weights_v) { - weights_v->resize(start_offset + tmp_weights_v->size(), handle.get_stream()); - raft::copy(weights_v->begin() + start_offset, - tmp_weights_v->begin(), - tmp_weights_v->size(), - handle.get_stream()); - } + translate(handle, tmp_src_v, tmp_dst_v); + + if (undirected_) { + std::tie(tmp_src_v, tmp_dst_v, tmp_weights_v) = + cugraph::symmetrize_edgelist_from_triangular( + handle, std::move(tmp_src_v), std::move(tmp_dst_v), std::move(tmp_weights_v)); } - } - translate(handle, src_v, dst_v); + if (multi_gpu) { + std::tie(store_transposed ? tmp_dst_v : tmp_src_v, + store_transposed ? tmp_src_v : tmp_dst_v, + tmp_weights_v) = + cugraph::detail::shuffle_edgelist_by_gpu_id( + handle, + store_transposed ? std::move(tmp_dst_v) : std::move(tmp_src_v), + store_transposed ? std::move(tmp_src_v) : std::move(tmp_dst_v), + std::move(tmp_weights_v)); + } - if (undirected_) - std::tie(src_v, dst_v, weights_v) = - cugraph::symmetrize_edgelist_from_triangular( - handle, std::move(src_v), std::move(dst_v), std::move(weights_v)); + src_partitions.push_back(std::move(tmp_src_v)); + dst_partitions.push_back(std::move(tmp_dst_v)); + if (weight_partitions) { (*weight_partitions).push_back(std::move(*tmp_weights_v)); } + } - if (multi_gpu) { - std::tie(store_transposed ? dst_v : src_v, store_transposed ? src_v : dst_v, weights_v) = - cugraph::detail::shuffle_edgelist_by_gpu_id( - handle, - store_transposed ? std::move(dst_v) : std::move(src_v), - store_transposed ? 
std::move(src_v) : std::move(dst_v), - std::move(weights_v)); + size_t tot_edge_counts{0}; + for (size_t i = 0; i < src_partitions.size(); ++i) { + tot_edge_counts += src_partitions[i].size(); } - rmm::device_uvector vertices_v(0, handle.get_stream()); - for (size_t i = 0; i < partition_ids.size(); ++i) { - auto id = partition_ids[i]; + rmm::device_uvector src_v(tot_edge_counts, handle.get_stream()); + size_t src_offset{0}; + for (size_t i = 0; i < src_partitions.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + src_partitions[i].begin(), + src_partitions[i].end(), + src_v.begin() + src_offset); + src_offset += src_partitions[i].size(); + } + src_partitions.clear(); + src_partitions.shrink_to_fit(); + + rmm::device_uvector dst_v(tot_edge_counts, handle.get_stream()); + size_t dst_offset{0}; + for (size_t i = 0; i < dst_partitions.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + dst_partitions[i].begin(), + dst_partitions[i].end(), + dst_v.begin() + dst_offset); + dst_offset += dst_partitions[i].size(); + } + dst_partitions.clear(); + dst_partitions.shrink_to_fit(); + + std::optional> weight_v{std::nullopt}; + if (weight_partitions) { + weight_v = rmm::device_uvector(tot_edge_counts, handle.get_stream()); + size_t weight_offset{0}; + for (size_t i = 0; i < (*weight_partitions).size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + (*weight_partitions)[i].begin(), + (*weight_partitions)[i].end(), + (*weight_v).begin() + weight_offset); + weight_offset += (*weight_partitions)[i].size(); + } + (*weight_partitions).clear(); + (*weight_partitions).shrink_to_fit(); + } + + // 3. generate vertices - auto start_offset = vertices_v.size(); - vertices_v.resize(start_offset + (partition_vertex_lasts[i] - partition_vertex_firsts[i]), - handle.get_stream()); - cugraph::detail::sequence_fill(handle.get_stream(), - vertices_v.begin() + start_offset, - vertices_v.size() - start_offset, - partition_vertex_firsts[i]); + size_t tot_vertex_counts{0}; + for (size_t i = 0; i < partition_vertex_firsts.size(); ++i) { + tot_vertex_counts += partition_vertex_lasts[i] - partition_vertex_firsts[i]; + } + rmm::device_uvector vertex_v(tot_vertex_counts, handle.get_stream()); + size_t v_offset{0}; + for (size_t i = 0; i < partition_vertex_firsts.size(); ++i) { + cugraph::detail::sequence_fill( + handle.get_stream(), + vertex_v.begin() + v_offset, + partition_vertex_lasts[i] - partition_vertex_firsts[i], + partition_vertex_firsts[i]); + v_offset += partition_vertex_lasts[i] - partition_vertex_firsts[i]; } if constexpr (multi_gpu) { - vertices_v = cugraph::detail::shuffle_vertices_by_gpu_id(handle, std::move(vertices_v)); + vertex_v = cugraph::detail::shuffle_vertices_by_gpu_id(handle, std::move(vertex_v)); } return std::make_tuple( std::move(src_v), std::move(dst_v), - std::move(weights_v), - std::move(vertices_v), + std::move(weight_v), + std::move(vertex_v), static_cast(detail::TranslateGraph_Usecase::base_vertex_id_) + number_of_vertices, undirected_); } From 167b5abd046117ffbef90d4c9a5b7ca8cee87142 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 12 Feb 2022 12:43:23 -0800 Subject: [PATCH 24/60] input parameter renaming for clarity --- cpp/include/cugraph/detail/shuffle_wrappers.hpp | 6 +++--- cpp/src/detail/shuffle_wrappers.cu | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/cugraph/detail/shuffle_wrappers.hpp b/cpp/include/cugraph/detail/shuffle_wrappers.hpp index e205110d4f4..db02ab94a5d 100644 --- a/cpp/include/cugraph/detail/shuffle_wrappers.hpp 
+++ b/cpp/include/cugraph/detail/shuffle_wrappers.hpp @@ -76,8 +76,8 @@ rmm::device_uvector shuffle_vertices_by_gpu_id( * @param[in/out] d_edgelist_minors Vertex IDs for columns (if the graph adjacency matrix is stored * as is) or rows (if the graph adjacency matrix is stored transposed) * @param[in/out] d_edgelist_weights Optional edge weights - * @param[in] groupby_and_count_local_partition If set to true, groupby and count edges based on - * (local partition ID, GPU ID) pairs (where GPU IDs are computed by applying the + * @param[in] groupby_and_count_local_partition_by_minor If set to true, groupby and count edges + * based on (local partition ID, GPU ID) pairs (where GPU IDs are computed by applying the * compute_gpu_id_from_vertex_t function to the minor vertex ID). If set to false, groupby and count * edges by just local partition ID. * @@ -91,7 +91,7 @@ rmm::device_uvector groupby_and_count_edgelist_by_local_partition_id( rmm::device_uvector& d_edgelist_majors, rmm::device_uvector& d_edgelist_minors, std::optional>& d_edgelist_weights, - bool groupby_and_count_local_partition = false); + bool groupby_and_count_local_partition_by_minor = false); } // namespace detail } // namespace cugraph diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu index a9fa67c769f..4f25dcf30b7 100644 --- a/cpp/src/detail/shuffle_wrappers.cu +++ b/cpp/src/detail/shuffle_wrappers.cu @@ -147,7 +147,7 @@ rmm::device_uvector groupby_and_count_edgelist_by_local_partition_id( rmm::device_uvector& d_edgelist_majors, rmm::device_uvector& d_edgelist_minors, std::optional>& d_edgelist_weights, - bool groupby_and_count_local_partition) + bool groupby_and_count_local_partition_by_minor) { auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); @@ -162,7 +162,7 @@ rmm::device_uvector groupby_and_count_edgelist_by_local_partition_id( auto pair_first = thrust::make_zip_iterator( thrust::make_tuple(d_edgelist_majors.begin(), d_edgelist_minors.begin())); - if (groupby_and_count_local_partition) { + if (groupby_and_count_local_partition_by_minor) { auto local_partition_id_gpu_id_pair_op = [comm_size, row_comm_size, From f06f32b796b601cf6b37b0686e49c13000551719 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 12 Feb 2022 23:24:44 -0800 Subject: [PATCH 25/60] add a heuristic to cut peak memory footprint --- .../cugraph/utilities/shuffle_comm.cuh | 228 ++++++++++++++++-- cpp/src/detail/shuffle_wrappers.cu | 13 + cpp/src/utilities/cython.cu | 2 + 3 files changed, 227 insertions(+), 16 deletions(-) diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index f10f9db95e1..00936a8c373 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -125,6 +125,157 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, return std::make_tuple(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks); } +template +struct value_group_id_less_t { + ValueToGroupIdOp value_to_group_id_op{}; + int pivot{}; + __device__ bool operator()(value_type v) const { return value_to_group_id_op(v) < pivot; } +}; + +template +struct kv_pair_group_id_less_t { + KeyToGroupIdOp key_to_group_id_op{}; + int pivot{}; + __device__ bool operator()(thrust::tuple t) const + { + return key_to_group_id_op(thrust::get<0>(t)) < pivot; + } +}; + +template +struct value_group_id_greater_equal_t { + ValueToGroupIdOp value_to_group_id_op{}; + int pivot{}; + __device__ bool 
operator()(value_type v) const { return value_to_group_id_op(v) >= pivot; } +}; + +template +struct kv_pair_group_id_greater_equal_t { + KeyToGroupIdOp key_to_group_id_op{}; + int pivot{}; + __device__ bool operator()(thrust::tuple t) const + { + return key_to_group_id_op(thrust::get<0>(t)) >= pivot; + } +}; + +// use roughly half temporary buffer than thrust::partition (if first & second partition sizes are +// comparable) +template +ValueIterator mem_frugal_partition( + ValueIterator value_first, + ValueIterator value_last, + ValueToGroupIdOp value_to_group_id_op, + int pivot, // group Id less than pivot goes to the first partition + rmm::cuda_stream_view stream_view) +{ + auto num_elements = static_cast(thrust::distance(value_first, value_last)); + auto first_size = static_cast(thrust::count_if( + rmm::exec_policy(stream_view), + value_first, + value_last, + value_group_id_less_t::value_type, + ValueToGroupIdOp>{value_to_group_id_op, pivot})); + auto second_size = num_elements - first_size; + + auto tmp_buffer = + allocate_dataframe_buffer::value_type>( + second_size, stream_view); + + // to limit memory footprint (16 * 1024 * 1024 is a tuning parameter) + // thrust::copy_if (1.15.0) also uses temporary buffer + auto constexpr max_elements_per_iteration = size_t{16} * 1024 * 1024; + auto num_chunks = (num_elements + max_elements_per_iteration - 1) / max_elements_per_iteration; + auto output_chunk_first = get_dataframe_buffer_begin(tmp_buffer); + for (size_t i = 0; i < num_chunks; ++i) { + output_chunk_first = thrust::copy_if( + rmm::exec_policy(stream_view), + value_first + max_elements_per_iteration * i, + value_first + std::min(max_elements_per_iteration * (i + 1), num_elements), + output_chunk_first, + value_group_id_greater_equal_t::value_type, + ValueToGroupIdOp>{value_to_group_id_op, pivot}); + } + + thrust::remove_if( + rmm::exec_policy(stream_view), + value_first, + value_last, + value_group_id_greater_equal_t::value_type, + ValueToGroupIdOp>{value_to_group_id_op, pivot}); + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_cbegin(tmp_buffer), + get_dataframe_buffer_cend(tmp_buffer), + value_first + first_size); + + return value_first + first_size; +} + +// use roughly half temporary buffer than thrust::partition (if first & second partition sizes are +// comparable) +template +std::tuple mem_frugal_partition( + KeyIterator key_first, + KeyIterator key_last, + ValueIterator value_first, + KeyToGroupIdOp key_to_group_id_op, + int pivot, // group Id less than pivot goes to the first partition + rmm::cuda_stream_view stream_view) +{ + auto num_elements = static_cast(thrust::distance(key_first, key_last)); + auto first_size = static_cast(thrust::count_if( + rmm::exec_policy(stream_view), + key_first, + key_last, + kv_pair_group_id_less_t::value_type, + typename thrust::iterator_traits::value_type, + KeyToGroupIdOp>{key_to_group_id_op, pivot})); + auto second_size = num_elements - first_size; + + auto tmp_key_buffer = + allocate_dataframe_buffer::value_type>( + second_size, stream_view); + auto tmp_value_buffer = + allocate_dataframe_buffer::value_type>( + second_size, stream_view); + + // to limit memory footprint (16 * 1024 * 1024 is a tuning parameter) + // thrust::copy_if (1.15.0) also uses temporary buffer + auto max_elements_per_iteration = size_t{16} * 1024 * 1024; + auto num_chunks = (num_elements + max_elements_per_iteration - 1) / max_elements_per_iteration; + auto kv_pair_first = thrust::make_zip_iterator(thrust::make_tuple(key_first, value_first)); + auto 
output_chunk_first = thrust::make_zip_iterator(thrust::make_tuple( + get_dataframe_buffer_begin(tmp_key_buffer), get_dataframe_buffer_begin(tmp_value_buffer))); + for (size_t i = 0; i < num_chunks; ++i) { + output_chunk_first = thrust::copy_if( + rmm::exec_policy(stream_view), + kv_pair_first + max_elements_per_iteration * i, + kv_pair_first + std::min(max_elements_per_iteration * (i + 1), num_elements), + output_chunk_first, + kv_pair_group_id_greater_equal_t::value_type, + typename thrust::iterator_traits::value_type, + KeyToGroupIdOp>{key_to_group_id_op, pivot}); + } + + thrust::remove_if( + rmm::exec_policy(stream_view), + kv_pair_first, + kv_pair_first + num_elements, + kv_pair_group_id_greater_equal_t::value_type, + typename thrust::iterator_traits::value_type, + KeyToGroupIdOp>{key_to_group_id_op, pivot}); + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_cbegin(tmp_key_buffer), + get_dataframe_buffer_cend(tmp_key_buffer), + key_first + first_size); + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_cbegin(tmp_value_buffer), + get_dataframe_buffer_cend(tmp_value_buffer), + value_first + first_size); + + return std::make_tuple(key_first + first_size, value_first + first_size); +} + } // namespace detail template @@ -132,14 +283,33 @@ rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [I ValueIterator tx_value_last /* [INOUT */, ValueToGPUIdOp value_to_group_id_op, int num_groups, + bool mem_frugal, rmm::cuda_stream_view stream_view) { - thrust::sort(rmm::exec_policy(stream_view), - tx_value_first, - tx_value_last, - [value_to_group_id_op] __device__(auto lhs, auto rhs) { - return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); - }); + if (mem_frugal) { + auto pivot = num_groups / 2; + auto second_first = detail::mem_frugal_partition( + tx_value_first, tx_value_last, value_to_group_id_op, pivot, stream_view); + thrust::sort(rmm::exec_policy(stream_view), + tx_value_first, + second_first, + [value_to_group_id_op] __device__(auto lhs, auto rhs) { + return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); + }); + thrust::sort(rmm::exec_policy(stream_view), + second_first, + tx_value_last, + [value_to_group_id_op] __device__(auto lhs, auto rhs) { + return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); + }); + } else { + thrust::sort(rmm::exec_policy(stream_view), + tx_value_first, + tx_value_last, + [value_to_group_id_op] __device__(auto lhs, auto rhs) { + return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); + }); + } auto group_id_first = thrust::make_transform_iterator( tx_value_first, @@ -164,15 +334,36 @@ rmm::device_uvector groupby_and_count(VertexIterator tx_key_first /* [IN ValueIterator tx_value_first /* [INOUT */, KeyToGPUIdOp key_to_group_id_op, int num_groups, + bool mem_frugal, rmm::cuda_stream_view stream_view) { - thrust::sort_by_key(rmm::exec_policy(stream_view), - tx_key_first, - tx_key_last, - tx_value_first, - [key_to_group_id_op] __device__(auto lhs, auto rhs) { - return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); - }); + if (mem_frugal) { + auto pivot = num_groups / 2; + auto second_first = detail::mem_frugal_partition( + tx_key_first, tx_key_last, tx_value_first, key_to_group_id_op, pivot, stream_view); + thrust::sort_by_key(rmm::exec_policy(stream_view), + tx_key_first, + std::get<0>(second_first), + tx_value_first, + [key_to_group_id_op] __device__(auto lhs, auto rhs) { + return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); + }); + 
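// NOTE: sorting the two pivot-partitioned halves separately (rather than
+    // sorting the full input range at once) keeps the sort's temporary buffer
+    // proportional to the larger half instead of the whole input; this is
+    // where the roughly-50% peak-memory saving of the mem_frugal path comes from.
+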
thrust::sort_by_key(rmm::exec_policy(stream_view), + std::get<0>(second_first), + tx_key_last, + std::get<1>(second_first), + [key_to_group_id_op] __device__(auto lhs, auto rhs) { + return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); + }); + } else { + thrust::sort_by_key(rmm::exec_policy(stream_view), + tx_key_first, + tx_key_last, + tx_value_first, + [key_to_group_id_op] __device__(auto lhs, auto rhs) { + return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); + }); + } auto group_id_first = thrust::make_transform_iterator( tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); }); @@ -249,7 +440,7 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, auto const comm_size = comm.get_size(); auto d_tx_value_counts = groupby_and_count( - tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream_view); + tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), false, stream_view); std::vector tx_counts{}; std::vector tx_offsets{}; @@ -298,8 +489,13 @@ auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, { auto const comm_size = comm.get_size(); - auto d_tx_value_counts = groupby_and_count( - tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, comm.get_size(), stream_view); + auto d_tx_value_counts = groupby_and_count(tx_key_first, + tx_key_last, + tx_value_first, + key_to_gpu_id_op, + comm.get_size(), + false, + stream_view); std::vector tx_counts{}; std::vector tx_offsets{}; diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu index 4f25dcf30b7..7a52fd07822 100644 --- a/cpp/src/detail/shuffle_wrappers.cu +++ b/cpp/src/detail/shuffle_wrappers.cu @@ -159,6 +159,15 @@ rmm::device_uvector groupby_and_count_edgelist_by_local_partition_id( auto const col_comm_size = col_comm.get_size(); auto const col_comm_rank = col_comm.get_rank(); + auto total_global_mem = handle.get_device_properties().totalGlobalMem; + auto element_size = sizeof(vertex_t) * 2 + (d_edgelist_weights ? 
sizeof(weight_t) : size_t{0}); + auto mem_frugal = + d_edgelist_majors.size() * element_size >= + total_global_mem / + 4; // if the data size exceeds 1/4 of the device memory (1/4 is a tuning parameter), + // groupby_and_count requires temporary buffer comparable to the input data size, if + // mem_frugal is set to true, temporary buffer size can be reduced up to 50% + auto pair_first = thrust::make_zip_iterator( thrust::make_tuple(d_edgelist_majors.begin(), d_edgelist_minors.begin())); @@ -183,11 +192,13 @@ rmm::device_uvector groupby_and_count_edgelist_by_local_partition_id( d_edgelist_weights->begin(), local_partition_id_gpu_id_pair_op, comm_size, + mem_frugal, handle.get_stream()) : cugraph::groupby_and_count(pair_first, pair_first + d_edgelist_majors.size(), local_partition_id_gpu_id_pair_op, comm_size, + mem_frugal, handle.get_stream()); } else { auto local_partition_id_op = @@ -203,11 +214,13 @@ rmm::device_uvector groupby_and_count_edgelist_by_local_partition_id( d_edgelist_weights->begin(), local_partition_id_op, col_comm_size, + mem_frugal, handle.get_stream()) : cugraph::groupby_and_count(pair_first, pair_first + d_edgelist_majors.size(), local_partition_id_op, col_comm_size, + mem_frugal, handle.get_stream()); } } diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 35a6be4edc3..afbabb64431 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -1248,11 +1248,13 @@ std::unique_ptr> call_shuffle( ptr_ret->get_weights().data(), local_partition_id_op, col_comm_size, + false, handle.get_stream()) : cugraph::groupby_and_count(pair_first, pair_first + ptr_ret->get_major().size(), local_partition_id_op, col_comm_size, + false, handle.get_stream()); std::vector h_edge_counts(edge_counts.size()); From 6dceeabdefe95cb98319fe7c95004402748bbe4c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 12 Feb 2022 23:41:44 -0800 Subject: [PATCH 26/60] improve inconsistencies in naming --- ...ransform_reduce_key_aggregated_out_nbr.cuh | 2 +- ...orm_reduce_by_adj_matrix_row_col_key_e.cuh | 2 +- .../cugraph/utilities/collect_comm.cuh | 4 +- cpp/include/cugraph/utilities/cython.hpp | 2 +- .../cugraph/utilities/shuffle_comm.cuh | 38 +++++++++---------- cpp/src/community/louvain.cuh | 2 +- .../weakly_connected_components_impl.cuh | 30 +++++++-------- cpp/src/detail/shuffle_wrappers.cu | 6 +-- cpp/src/structure/coarsen_graph_impl.cuh | 2 +- cpp/src/structure/relabel_impl.cuh | 4 +- cpp/src/utilities/cython.cu | 6 +-- 11 files changed, 49 insertions(+), 49 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 1dee131a000..1ff109c7766 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -482,7 +482,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( rmm::device_uvector rx_key_aggregated_edge_weights(0, handle.get_stream()); std::forward_as_tuple( std::tie(rx_major_vertices, rx_minor_keys, rx_key_aggregated_edge_weights), std::ignore) = - groupby_gpuid_and_shuffle_values( + groupby_gpu_id_and_shuffle_values( col_comm, triplet_first, triplet_first + tmp_major_vertices.size(), diff --git a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh index 7f4cad5eded..968d99b7d25 100644 --- 
a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -505,7 +505,7 @@ transform_reduce_by_adj_matrix_row_col_key_e( rmm::device_uvector rx_unique_keys(0, handle.get_stream()); auto rx_value_for_unique_key_buffer = allocate_dataframe_buffer(0, handle.get_stream()); std::tie(rx_unique_keys, rx_value_for_unique_key_buffer, std::ignore) = - groupby_gpuid_and_shuffle_kv_pairs( + groupby_gpu_id_and_shuffle_kv_pairs( comm, tmp_keys.begin(), tmp_keys.end(), diff --git a/cpp/include/cugraph/utilities/collect_comm.cuh b/cpp/include/cugraph/utilities/collect_comm.cuh index 8b89d941885..5b414f1f1eb 100644 --- a/cpp/include/cugraph/utilities/collect_comm.cuh +++ b/cpp/include/cugraph/utilities/collect_comm.cuh @@ -103,7 +103,7 @@ collect_values_for_keys(raft::comms::comms_t const& comm, { rmm::device_uvector rx_unique_keys(0, stream_view); std::vector rx_value_counts{}; - std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values( + std::tie(rx_unique_keys, rx_value_counts) = groupby_gpu_id_and_shuffle_values( comm, unique_keys.begin(), unique_keys.end(), @@ -228,7 +228,7 @@ collect_values_for_unique_keys(raft::comms::comms_t const& comm, { rmm::device_uvector rx_unique_keys(0, stream_view); std::vector rx_value_counts{}; - std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values( + std::tie(rx_unique_keys, rx_value_counts) = groupby_gpu_id_and_shuffle_values( comm, unique_keys.begin(), unique_keys.end(), diff --git a/cpp/include/cugraph/utilities/cython.hpp b/cpp/include/cugraph/utilities/cython.hpp index 100a9d7db5e..260393009bd 100644 --- a/cpp/include/cugraph/utilities/cython.hpp +++ b/cpp/include/cugraph/utilities/cython.hpp @@ -588,7 +588,7 @@ template std::unique_ptr> call_shuffle( raft::handle_t const& handle, vertex_t* - edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place + edgelist_major_vertices, // [IN / OUT]: groupby_gpu_id_and_shuffle_values() sorts in-place vertex_t* edgelist_minor_vertices, // [IN / OUT] weight_t* edgelist_weights, // [IN / OUT] edge_t num_edgelist_edges); diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 00936a8c373..d951930a2b3 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -278,10 +278,10 @@ std::tuple mem_frugal_partition( } // namespace detail -template +template rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [INOUT */, ValueIterator tx_value_last /* [INOUT */, - ValueToGPUIdOp value_to_group_id_op, + ValueToGroupIdOp value_to_group_id_op, int num_groups, bool mem_frugal, rmm::cuda_stream_view stream_view) @@ -328,11 +328,11 @@ rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [I return d_tx_value_counts; } -template +template rmm::device_uvector groupby_and_count(VertexIterator tx_key_first /* [INOUT */, VertexIterator tx_key_last /* [INOUT */, ValueIterator tx_value_first /* [INOUT */, - KeyToGPUIdOp key_to_group_id_op, + KeyToGroupIdOp key_to_group_id_op, int num_groups, bool mem_frugal, rmm::cuda_stream_view stream_view) @@ -402,7 +402,7 @@ auto shuffle_values(raft::comms::comms_t const& comm, detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); auto rx_value_buffer = - allocate_dataframe_buffer::value_type>( + allocate_dataframe_buffer::value_type>( rx_offsets.size() > 0 ? 
rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released @@ -431,11 +431,11 @@ auto shuffle_values(raft::comms::comms_t const& comm, } template -auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, - ValueIterator tx_value_first /* [INOUT */, - ValueIterator tx_value_last /* [INOUT */, - ValueToGPUIdOp value_to_gpu_id_op, - rmm::cuda_stream_view stream_view) +auto groupby_gpu_id_and_shuffle_values(raft::comms::comms_t const& comm, + ValueIterator tx_value_first /* [INOUT */, + ValueIterator tx_value_last /* [INOUT */, + ValueToGPUIdOp value_to_gpu_id_op, + rmm::cuda_stream_view stream_view) { auto const comm_size = comm.get_size(); @@ -452,7 +452,7 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); auto rx_value_buffer = - allocate_dataframe_buffer::value_type>( + allocate_dataframe_buffer::value_type>( rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released @@ -480,12 +480,12 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const& comm, } template -auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, - VertexIterator tx_key_first /* [INOUT */, - VertexIterator tx_key_last /* [INOUT */, - ValueIterator tx_value_first /* [INOUT */, - KeyToGPUIdOp key_to_gpu_id_op, - rmm::cuda_stream_view stream_view) +auto groupby_gpu_id_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, + VertexIterator tx_key_first /* [INOUT */, + VertexIterator tx_key_last /* [INOUT */, + ValueIterator tx_value_first /* [INOUT */, + KeyToGPUIdOp key_to_gpu_id_op, + rmm::cuda_stream_view stream_view) { auto const comm_size = comm.get_size(); @@ -506,10 +506,10 @@ auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); - rmm::device_uvector::value_type> rx_keys( + rmm::device_uvector::value_type> rx_keys( rx_offsets.size() > 0 ? 
rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); auto rx_value_buffer = - allocate_dataframe_buffer::value_type>( + allocate_dataframe_buffer::value_type>( rx_keys.size(), stream_view); // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 025c520abf5..094f3bc6546 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -308,7 +308,7 @@ class Louvain { thrust::make_tuple(cluster_keys_v_.begin(), cluster_weights_v_.begin())); std::forward_as_tuple(std::tie(rx_keys_v, rx_weights_v), std::ignore) = - groupby_gpuid_and_shuffle_values( + groupby_gpu_id_and_shuffle_values( handle_.get_comms(), pair_first, pair_first + current_graph_view_.get_number_of_local_vertices(), diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 21e9571fbb2..757fc9e3d23 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -371,17 +371,17 @@ void weakly_connected_components_impl(raft::handle_t const& handle, // with fewer than one root per GPU if (std::reduce(first_candidate_degrees.begin(), first_candidate_degrees.end()) > degree_sum_threshold * comm_size) { - std::vector> degree_gpuid_pairs(comm_size); + std::vector> degree_gpu_id_pairs(comm_size); for (int i = 0; i < comm_size; ++i) { - degree_gpuid_pairs[i] = std::make_tuple(first_candidate_degrees[i], i); + degree_gpu_id_pairs[i] = std::make_tuple(first_candidate_degrees[i], i); } - std::sort(degree_gpuid_pairs.begin(), degree_gpuid_pairs.end(), [](auto lhs, auto rhs) { + std::sort(degree_gpu_id_pairs.begin(), degree_gpu_id_pairs.end(), [](auto lhs, auto rhs) { return std::get<0>(lhs) > std::get<0>(rhs); }); edge_t sum{0}; - for (size_t i = 0; i < degree_gpuid_pairs.size(); ++i) { - sum += std::get<0>(degree_gpuid_pairs[i]); - init_max_new_root_counts[std::get<1>(degree_gpuid_pairs[i])] = 1; + for (size_t i = 0; i < degree_gpu_id_pairs.size(); ++i) { + sum += std::get<0>(degree_gpu_id_pairs[i]); + init_max_new_root_counts[std::get<1>(degree_gpu_id_pairs[i])] = 1; if (sum > degree_sum_threshold * comm_size) { break; } } } @@ -390,18 +390,18 @@ void weakly_connected_components_impl(raft::handle_t const& handle, else if (level_graph_view.get_number_of_vertices() <= static_cast(handle.get_comms().get_size() * ceil(1.0 / max_new_roots_ratio))) { - std::vector gpuids{}; - gpuids.reserve( + std::vector gpu_ids{}; + gpu_ids.reserve( std::reduce(new_root_candidate_counts.begin(), new_root_candidate_counts.end())); for (size_t i = 0; i < new_root_candidate_counts.size(); ++i) { - gpuids.insert(gpuids.end(), new_root_candidate_counts[i], static_cast(i)); + gpu_ids.insert(gpu_ids.end(), new_root_candidate_counts[i], static_cast(i)); } std::random_device rd{}; - std::shuffle(gpuids.begin(), gpuids.end(), std::mt19937(rd())); - gpuids.resize( - std::max(static_cast(gpuids.size() * max_new_roots_ratio), vertex_t{1})); - for (size_t i = 0; i < gpuids.size(); ++i) { - ++init_max_new_root_counts[gpuids[i]]; + std::shuffle(gpu_ids.begin(), gpu_ids.end(), std::mt19937(rd())); + gpu_ids.resize( + std::max(static_cast(gpu_ids.size() * max_new_roots_ratio), vertex_t{1})); + for (size_t i = 0; i < gpu_ids.size(); ++i) { + ++init_max_new_root_counts[gpu_ids[i]]; } } else { std::fill(init_max_new_root_counts.begin(), @@ -678,7 +678,7 @@ void weakly_connected_components_impl(raft::handle_t 
const& handle, auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); - std::tie(edge_buffer, std::ignore) = cugraph::groupby_gpuid_and_shuffle_values( + std::tie(edge_buffer, std::ignore) = cugraph::groupby_gpu_id_and_shuffle_values( comm, get_dataframe_buffer_begin(edge_buffer), get_dataframe_buffer_end(edge_buffer), diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu index 7a52fd07822..fd5bffbb950 100644 --- a/cpp/src/detail/shuffle_wrappers.cu +++ b/cpp/src/detail/shuffle_wrappers.cu @@ -51,7 +51,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, std::forward_as_tuple( std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors, d_rx_edgelist_weights), std::ignore) = - cugraph::groupby_gpuid_and_shuffle_values( + cugraph::groupby_gpu_id_and_shuffle_values( comm, // handle.get_comms(), edge_first, edge_first + d_edgelist_majors.size(), @@ -67,7 +67,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, std::forward_as_tuple(std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors), std::ignore) = - cugraph::groupby_gpuid_and_shuffle_values( + cugraph::groupby_gpu_id_and_shuffle_values( comm, // handle.get_comms(), edge_first, edge_first + d_edgelist_majors.size(), @@ -124,7 +124,7 @@ rmm::device_uvector shuffle_vertices_by_gpu_id(raft::handle_t const& h auto const comm_size = comm.get_size(); rmm::device_uvector d_rx_vertices(0, handle.get_stream()); - std::tie(d_rx_vertices, std::ignore) = cugraph::groupby_gpuid_and_shuffle_values( + std::tie(d_rx_vertices, std::ignore) = cugraph::groupby_gpu_id_and_shuffle_values( comm, // handle.get_comms(), d_vertices.begin(), d_vertices.end(), diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index b0f6c7eca05..aa2a87ccc72 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -267,7 +267,7 @@ coarsen_graph( // 1-3. append data to local adjacency matrix partitions - // FIXME: we can skip this if groupby_gpuid_and_shuffle_values is updated to return sorted edge + // FIXME: we can skip this if groupby_gpu_id_and_shuffle_values is updated to return sorted edge // list based on the final matrix partition (maybe add // groupby_adj_matrix_partition_and_shuffle_values). 
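For context, the primitives renamed above all share one pattern: group each local element by the rank that should own it (using a caller-supplied value-to-gpu-id functor), count the per-rank block sizes, and exchange the blocks with an all-to-all. A minimal host-side sketch of the grouping step, with assumed names (the hash functor and std containers below are illustrative stand-ins; the real code sorts on device and feeds the counts to shuffle_values):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

// hypothetical stand-in for cugraph::detail::compute_gpu_id_from_vertex_t:
// map a vertex id to its owning rank via hash modulo comm_size
struct vertex_to_gpu_id_t {
  int comm_size{};
  int operator()(int64_t v) const
  {
    return static_cast<int>(std::hash<int64_t>{}(v) % static_cast<size_t>(comm_size));
  }
};

// group values by destination rank (each rank's block becomes contiguous) and
// return the per-rank counts; these counts drive the subsequent all-to-all
std::vector<size_t> groupby_gpu_id_sketch(std::vector<int64_t>& values, int comm_size)
{
  vertex_to_gpu_id_t to_gpu_id{comm_size};
  std::stable_sort(values.begin(), values.end(), [to_gpu_id](int64_t lhs, int64_t rhs) {
    return to_gpu_id(lhs) < to_gpu_id(rhs);
  });
  std::vector<size_t> tx_counts(comm_size, 0);
  for (auto v : values) {
    ++tx_counts[to_gpu_id(v)];
  }
  return tx_counts;
}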
diff --git a/cpp/src/structure/relabel_impl.cuh b/cpp/src/structure/relabel_impl.cuh index d709152f71c..4ace52e351a 100644 --- a/cpp/src/structure/relabel_impl.cuh +++ b/cpp/src/structure/relabel_impl.cuh @@ -95,7 +95,7 @@ void relabel(raft::handle_t const& handle, thrust::make_tuple(label_pair_old_labels.begin(), label_pair_new_labels.begin())); std::forward_as_tuple(std::tie(rx_label_pair_old_labels, rx_label_pair_new_labels), std::ignore) = - groupby_gpuid_and_shuffle_values( + groupby_gpu_id_and_shuffle_values( handle.get_comms(), pair_first, pair_first + num_label_pairs, @@ -136,7 +136,7 @@ void relabel(raft::handle_t const& handle, { rmm::device_uvector rx_unique_old_labels(0, handle.get_stream()); std::vector rx_value_counts{}; - std::tie(rx_unique_old_labels, rx_value_counts) = groupby_gpuid_and_shuffle_values( + std::tie(rx_unique_old_labels, rx_value_counts) = groupby_gpu_id_and_shuffle_values( handle.get_comms(), unique_old_labels.begin(), unique_old_labels.end(), diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index afbabb64431..59241a3e913 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -1182,7 +1182,7 @@ template std::unique_ptr> call_shuffle( raft::handle_t const& handle, vertex_t* - edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place + edgelist_major_vertices, // [IN / OUT]: groupby_gpu_id_and_shuffle_values() sorts in-place vertex_t* edgelist_minor_vertices, // [IN / OUT] weight_t* edgelist_weights, // [IN / OUT] edge_t num_edgelist_edges) @@ -1204,7 +1204,7 @@ std::unique_ptr> call_shuffle( std::forward_as_tuple( std::tie(ptr_ret->get_major(), ptr_ret->get_minor(), ptr_ret->get_weights()), std::ignore) = - cugraph::groupby_gpuid_and_shuffle_values( + cugraph::groupby_gpu_id_and_shuffle_values( comm, // handle.get_comms(), zip_edge, zip_edge + num_edgelist_edges, @@ -1220,7 +1220,7 @@ std::unique_ptr> call_shuffle( std::forward_as_tuple(std::tie(ptr_ret->get_major(), ptr_ret->get_minor()), std::ignore) = - cugraph::groupby_gpuid_and_shuffle_values( + cugraph::groupby_gpu_id_and_shuffle_values( comm, // handle.get_comms(), zip_edge, zip_edge + num_edgelist_edges, From febd72d6e2e7fa9d561238b3ddc5fc3b0e10abae Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 14 Feb 2022 18:02:22 -0800 Subject: [PATCH 27/60] split groupby_and_shuffle_edgelist to groupby_and_count and shuffle_edgelist to pass mem_frugal=true to limit the maximum allocation chunk size to avoid malloc failure due to fragmentation with the pool allocator --- cpp/src/detail/shuffle_wrappers.cu | 85 +++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu index fd5bffbb950..b22c839e346 100644 --- a/cpp/src/detail/shuffle_wrappers.cu +++ b/cpp/src/detail/shuffle_wrappers.cu @@ -41,6 +41,22 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); + auto total_global_mem = handle.get_device_properties().totalGlobalMem; + auto element_size = sizeof(vertex_t) * 2 + (d_edgelist_weights ? 
sizeof(weight_t) : size_t{0});
+  auto mem_frugal =
+    d_edgelist_majors.size() * element_size >=
+    total_global_mem /
+      5;  // if the data size exceeds 1/5 of the device memory (1/5 is a tuning parameter),
+          // groupby_and_count requires a temporary buffer comparable to the input data size; if
+          // mem_frugal is set to true, the temporary buffer size can be reduced by up to 50%
+
+  // invoke groupby_and_count and shuffle_values instead of directly calling
+  // groupby_gpu_id_and_shuffle_values to pass mem_frugal (there is no benefit in reducing peak
+  // memory as we need to allocate a receive buffer anyway), but this reduces the maximum memory
+  // allocation size by half (thrust::sort used inside the groupby_and_count allocates the entire
+  // temporary buffer in a single chunk, and the pool allocator often cannot handle a large single
+  // allocation (due to fragmentation) even when the remaining free memory in aggregate is
+  // significantly larger than the requested size).
   rmm::device_uvector<vertex_t> d_rx_edgelist_majors(0, handle.get_stream());
   rmm::device_uvector<vertex_t> d_rx_edgelist_minors(0, handle.get_stream());
   std::optional<rmm::device_uvector<weight_t>> d_rx_edgelist_weights{std::nullopt};
@@ -48,35 +64,54 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle,
     auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(
       d_edgelist_majors.begin(), d_edgelist_minors.begin(), (*d_edgelist_weights).begin()));
+    auto d_tx_value_counts = cugraph::groupby_and_count(
+      edge_first,
+      edge_first + d_edgelist_majors.size(),
+      [key_func =
+         cugraph::detail::compute_gpu_id_from_edge_t<vertex_t>{
+           comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
+        return key_func(thrust::get<0>(val), thrust::get<1>(val));
+      },
+      comm_size,
+      mem_frugal,
+      handle.get_stream());
+
+    std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+    raft::update_host(h_tx_value_counts.data(),
+                      d_tx_value_counts.data(),
+                      d_tx_value_counts.size(),
+                      handle.get_stream());
+    handle.sync_stream();
+
     std::forward_as_tuple(
-      std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors, d_rx_edgelist_weights),
-      std::ignore) =
-      cugraph::groupby_gpu_id_and_shuffle_values(
-        comm,  // handle.get_comms(),
-        edge_first,
-        edge_first + d_edgelist_majors.size(),
-        [key_func =
-           cugraph::detail::compute_gpu_id_from_edge_t<vertex_t>{
-             comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
-          return key_func(thrust::get<0>(val), thrust::get<1>(val));
-        },
-        handle.get_stream());
+      std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors, d_rx_edgelist_weights), std::ignore) =
+      shuffle_values(comm, edge_first, h_tx_value_counts, handle.get_stream());
   } else {
     auto edge_first = thrust::make_zip_iterator(
       thrust::make_tuple(d_edgelist_majors.begin(), d_edgelist_minors.begin()));
-    std::forward_as_tuple(std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors),
-                          std::ignore) =
-      cugraph::groupby_gpu_id_and_shuffle_values(
-        comm,  // handle.get_comms(),
-        edge_first,
-        edge_first + d_edgelist_majors.size(),
-        [key_func =
-           cugraph::detail::compute_gpu_id_from_edge_t<vertex_t>{
-             comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
-          return key_func(thrust::get<0>(val), thrust::get<1>(val));
-        },
-        handle.get_stream());
+    auto d_tx_value_counts = cugraph::groupby_and_count(
+      edge_first,
+      edge_first + d_edgelist_majors.size(),
+      [key_func =
+         cugraph::detail::compute_gpu_id_from_edge_t<vertex_t>{
+           comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
+        return key_func(thrust::get<0>(val), thrust::get<1>(val));
+      },
+      comm_size,
+      mem_frugal,
+      handle.get_stream());
+
+    std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+    raft::update_host(h_tx_value_counts.data(),
+                      d_tx_value_counts.data(),
+                      d_tx_value_counts.size(),
+                      handle.get_stream());
+    handle.sync_stream();
+
+    std::forward_as_tuple(
+      std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors), std::ignore) =
+      shuffle_values(comm, edge_first, h_tx_value_counts, handle.get_stream());
   }

   return std::make_tuple(std::move(d_rx_edgelist_majors),
@@ -164,7 +199,7 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
   auto mem_frugal =
     d_edgelist_majors.size() * element_size >=
     total_global_mem /
-      4;  // if the data size exceeds 1/4 of the device memory (1/4 is a tuning parameter),
+      5;  // if the data size exceeds 1/5 of the device memory (1/5 is a tuning parameter),
           // groupby_and_count requires temporary buffer comparable to the input data size, if
           // mem_frugal is set to true, temporary buffer size can be reduced up to 50%

From 48fa2559725add366b7dd87a6221071a53f88bdf Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 16 Feb 2022 09:45:03 -0800
Subject: [PATCH 28/60] use temporary host buffer to concatenate edge list in edge generation

---
 cpp/tests/utilities/test_graphs.hpp | 98 +++++++++++++++++------------
 1 file changed, 59 insertions(+), 39 deletions(-)

diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp
index 0934beb466a..dc0f13fc9f0 100644
--- a/cpp/tests/utilities/test_graphs.hpp
+++ b/cpp/tests/utilities/test_graphs.hpp
@@ -29,6 +29,53 @@ namespace test {
 namespace detail {

+template <typename T>
+std::optional<rmm::device_uvector<T>> try_allocate(raft::handle_t const& handle, size_t size)
+{
+  try {
+    return std::make_optional<rmm::device_uvector<T>>(size, handle.get_stream());
+  } catch (std::exception const& e) {
+    return std::nullopt;
+  }
+}
+
+// use host memory as temporary buffer if memory allocation on device fails
+template <typename T>
+rmm::device_uvector<T> concatenate(raft::handle_t const& handle,
+                                   std::vector<rmm::device_uvector<T>>&& inputs)
+{
+  size_t tot_count{0};
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    tot_count += inputs[i].size();
+  }
+
+  auto output = try_allocate<T>(handle, tot_count);
+  if (output) {
+    size_t offset{0};
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      raft::copy(
+        (*output).data() + offset, inputs[i].data(), inputs[i].size(), handle.get_stream());
+      offset += inputs[i].size();
+    }
+    inputs.clear();
+    inputs.shrink_to_fit();
+  } else {
+    std::vector<T> h_buffer(tot_count);
+    size_t offset{0};
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      raft::update_host(
+        h_buffer.data() + offset, inputs[i].data(), inputs[i].size(), handle.get_stream());
+      offset += inputs[i].size();
+    }
+    inputs.clear();
+    inputs.shrink_to_fit();
+    output = rmm::device_uvector<T>(tot_count, handle.get_stream());
+    raft::update_device((*output).data(), h_buffer.data(), h_buffer.size(), handle.get_stream());
+  }
+
+  return std::move(*output);
+}
+
 class TranslateGraph_Usecase {
  public:
   TranslateGraph_Usecase() = delete;
@@ -266,43 +313,17 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase {
       tot_edge_counts += src_partitions[i].size();
     }

-    rmm::device_uvector<vertex_t> src_v(tot_edge_counts, handle.get_stream());
-    size_t src_offset{0};
-    for (size_t i = 0; i < src_partitions.size(); ++i) {
-      thrust::copy(handle.get_thrust_policy(),
-                   src_partitions[i].begin(),
-                   src_partitions[i].end(),
-                   src_v.begin() + src_offset);
-      src_offset += src_partitions[i].size();
-    }
-    src_partitions.clear();
-    src_partitions.shrink_to_fit();
-
-    rmm::device_uvector<vertex_t> dst_v(tot_edge_counts, handle.get_stream());
-    size_t dst_offset{0};
-    for (size_t i = 0; i < dst_partitions.size(); ++i) {
-      thrust::copy(handle.get_thrust_policy(),
-                   dst_partitions[i].begin(),
-                   dst_partitions[i].end(),
-                   dst_v.begin() + dst_offset);
-      dst_offset += dst_partitions[i].size();
-    }
-    dst_partitions.clear();
-    dst_partitions.shrink_to_fit();
+    // detail::concatenate uses a host buffer to store input vectors if the initial device memory
+    // allocation for the return vector fails. This does not improve peak memory usage and is not
+    // helpful with rmm_mode = cuda. However, if rmm_mode = pool, memory allocation can fail
+    // even when the aggregate free memory size far exceeds the requested size. This heuristic is
+    // helpful in this case.
+    auto src_v = detail::concatenate(handle, std::move(src_partitions));
+    auto dst_v = detail::concatenate(handle, std::move(dst_partitions));

     std::optional<rmm::device_uvector<weight_t>> weight_v{std::nullopt};
     if (weight_partitions) {
-      weight_v = rmm::device_uvector<weight_t>(tot_edge_counts, handle.get_stream());
-      size_t weight_offset{0};
-      for (size_t i = 0; i < (*weight_partitions).size(); ++i) {
-        thrust::copy(handle.get_thrust_policy(),
-                     (*weight_partitions)[i].begin(),
-                     (*weight_partitions)[i].end(),
-                     (*weight_v).begin() + weight_offset);
-        weight_offset += (*weight_partitions)[i].size();
-      }
-      (*weight_partitions).clear();
-      (*weight_partitions).shrink_to_fit();
+      weight_v = detail::concatenate(handle, std::move(*weight_partitions));
     }

     // 3. generate vertices
@@ -314,11 +335,10 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase {
     rmm::device_uvector<vertex_t> vertex_v(tot_vertex_counts, handle.get_stream());
     size_t v_offset{0};
     for (size_t i = 0; i < partition_vertex_firsts.size(); ++i) {
-      cugraph::detail::sequence_fill(
-        handle.get_stream(),
-        vertex_v.begin() + v_offset,
-        partition_vertex_lasts[i] - partition_vertex_firsts[i],
-        partition_vertex_firsts[i]);
+      cugraph::detail::sequence_fill(handle.get_stream(),
+                                     vertex_v.begin() + v_offset,
+                                     partition_vertex_lasts[i] - partition_vertex_firsts[i],
+                                     partition_vertex_firsts[i]);
       v_offset += partition_vertex_lasts[i] - partition_vertex_firsts[i];
     }

From 234a11971d70a007a607cad8b92350f15f8b6fea Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 16 Feb 2022 09:51:40 -0800
Subject: [PATCH 29/60] update groupby to take mem_frugal_threshold instead of bool mem_frugal

---
 .../cugraph/utilities/shuffle_comm.cuh | 231 +++++++++++++-----
 cpp/src/detail/shuffle_wrappers.cu     |  59 ++---
 2 files changed, 200 insertions(+), 90 deletions(-)

diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh
index d951930a2b3..309a30c78e2 100644
--- a/cpp/include/cugraph/utilities/shuffle_comm.cuh
+++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh
@@ -159,14 +159,16 @@ struct kv_pair_group_id_greater_equal_t {
   }
 };

-// use roughly half temporary buffer than thrust::partition (if first & second partition sizes are
-// comparable)
+// Uses roughly half the temporary buffer of thrust::partition (if the first & second partition
+// sizes are comparable). It also makes multiple smaller allocations instead of one single
+// allocation of the same aggregate size (as thrust::sort does) when the input iterators are zip
+// iterators (this is more favorable to the pool allocator).
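// An aside, not part of this patch: a self-contained host-side sketch of the
// recursive pivot-split groupby that this commit introduces below
// (mem_frugal_groupby). The signature and std::partition are illustrative
// stand-ins; the real code operates on device iterators and falls back to a
// full sort once a range holds fewer elements than mem_frugal_threshold.
#include <algorithm>
#include <cstddef>
#include <iterator>

template <typename Iterator, typename ToGroupIdOp>
void mem_frugal_groupby_sketch(Iterator first,
                               Iterator last,
                               ToGroupIdOp to_group_id,
                               int group_first,
                               int group_last,
                               size_t mem_frugal_threshold)
{
  if (group_last - group_first <= 1) { return; }  // single group: nothing to order
  if (static_cast<size_t>(std::distance(first, last)) < mem_frugal_threshold) {
    // small enough: order the whole range by group id in one step
    std::sort(first, last, [to_group_id](auto const& lhs, auto const& rhs) {
      return to_group_id(lhs) < to_group_id(rhs);
    });
    return;
  }
  // split around the middle group id; each recursive step then needs scratch
  // space proportional to only its own half, halving the largest allocation
  auto pivot  = (group_first + group_last) / 2;
  auto middle = std::partition(first, last, [to_group_id, pivot](auto const& v) {
    return to_group_id(v) < pivot;
  });
  mem_frugal_groupby_sketch(first, middle, to_group_id, group_first, pivot, mem_frugal_threshold);
  mem_frugal_groupby_sketch(middle, last, to_group_id, pivot, group_last, mem_frugal_threshold);
}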
template ValueIterator mem_frugal_partition( ValueIterator value_first, ValueIterator value_last, ValueToGroupIdOp value_to_group_id_op, - int pivot, // group Id less than pivot goes to the first partition + int pivot, // group id less than pivot goes to the first partition rmm::cuda_stream_view stream_view) { auto num_elements = static_cast(thrust::distance(value_first, value_last)); @@ -211,8 +213,10 @@ ValueIterator mem_frugal_partition( return value_first + first_size; } -// use roughly half temporary buffer than thrust::partition (if first & second partition sizes are -// comparable) +// Use roughly half temporary buffer than thrust::partition (if first & second partition sizes are +// comparable). This also uses multiple smaller allocations than one single allocation (thrust::sort +// does this) of the same aggregate size if the input iterators are the zip iterators (this is more +// favorable to the pool allocator). template std::tuple mem_frugal_partition( KeyIterator key_first, @@ -276,6 +280,145 @@ std::tuple mem_frugal_partition( return std::make_tuple(key_first + first_size, value_first + first_size); } +template +void mem_frugal_groupby( + ValueIterator value_first, + ValueIterator value_last, + ValueToGroupIdOp value_to_group_id_op, + int num_groups, + size_t mem_frugal_threshold, // take the memory frugal approach (instead of thrust::sort) if # + // elements to groupby is no smaller than this value + rmm::cuda_stream_view stream_view) +{ + std::vector group_firsts{}; + std::vector group_lasts{}; + std::vector value_firsts{}; + std::vector value_lasts{}; + if (num_groups > 1) { + group_firsts.push_back(int{0}); + group_lasts.push_back(num_groups); + value_firsts.push_back(value_first); + value_lasts.push_back(value_last); + } + + auto offset_first = size_t{0}; + auto offset_last = group_firsts.size(); + while (offset_first < offset_last) { + for (size_t i = offset_first; i < offset_last; ++i) { + auto pivot = (group_firsts[i] + group_lasts[i]) / 2; + if (static_cast(thrust::distance(value_firsts[i], value_lasts[i])) < + mem_frugal_threshold) { + if (group_lasts[i] - group_firsts[i] == 2) { + thrust::partition( + rmm::exec_policy(stream_view), + value_firsts[i], + value_lasts[i], + value_group_id_less_t::value_type, + ValueToGroupIdOp>{value_to_group_id_op, pivot}); + } else { + thrust::sort(rmm::exec_policy(stream_view), + value_firsts[i], + value_lasts[i], + [value_to_group_id_op] __device__(auto lhs, auto rhs) { + return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); + }); + } + } else { + auto second_first = mem_frugal_partition( + value_firsts[i], value_lasts[i], value_to_group_id_op, pivot, stream_view); + if (pivot - group_firsts[i] > 1) { + group_firsts.push_back(group_firsts[i]); + group_lasts.push_back(pivot); + value_firsts.push_back(value_firsts[i]); + value_lasts.push_back(second_first); + } + if (group_lasts[i] - pivot > 1) { + group_firsts.push_back(pivot); + group_lasts.push_back(group_lasts[i]); + value_firsts.push_back(second_first); + value_lasts.push_back(value_lasts[i]); + } + } + } + offset_first = offset_last; + offset_last = group_firsts.size(); + } +} + +template +void mem_frugal_groupby( + KeyIterator key_first, + KeyIterator key_last, + ValueIterator value_first, + KeyToGroupIdOp key_to_group_id_op, + int num_groups, + size_t mem_frugal_threshold, // take the memory frugal approach (instead of thrust::sort) if # + // elements to groupby is no smaller than this value + rmm::cuda_stream_view stream_view) +{ + std::vector group_firsts{}; + 
std::vector group_lasts{}; + std::vector key_firsts{}; + std::vector key_lasts{}; + std::vector value_firsts{}; + if (num_groups > 1) { + group_firsts.push_back(int{0}); + group_lasts.push_back(num_groups); + key_firsts.push_back(key_first); + key_lasts.push_back(key_last); + value_firsts.push_back(value_first); + } + + auto offset_first = size_t{0}; + auto offset_last = group_firsts.size(); + while (offset_first < offset_last) { + for (size_t i = offset_first; i < offset_last; ++i) { + auto pivot = (group_firsts[i] + group_lasts[i]) / 2; + if (static_cast(thrust::distance(key_firsts[i], key_lasts[i])) < + mem_frugal_threshold) { + if (group_lasts[i] - group_firsts[i] == 2) { + auto kv_pair_first = + thrust::make_zip_iterator(thrust::make_tuple(key_firsts[i], value_firsts[i])); + thrust::partition( + rmm::exec_policy(stream_view), + kv_pair_first, + kv_pair_first + thrust::distance(key_firsts[i], key_lasts[i]), + kv_pair_group_id_less_t::value_type, + typename thrust::iterator_traits::value_type, + KeyToGroupIdOp>{key_to_group_id_op, pivot}); + } else { + thrust::sort_by_key(rmm::exec_policy(stream_view), + key_firsts[i], + key_lasts[i], + value_firsts[i], + [key_to_group_id_op] __device__(auto lhs, auto rhs) { + return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); + }); + } + } else { + auto second_first = mem_frugal_partition( + key_firsts[i], key_lasts[i], value_firsts[i], key_to_group_id_op, pivot, stream_view); + if (pivot - group_firsts[i] > 1) { + group_firsts.push_back(group_firsts[i]); + group_lasts.push_back(pivot); + key_firsts.push_back(key_firsts[i]); + key_lasts.push_back(std::get<0>(second_first)); + value_firsts.push_back(value_firsts[i]); + } + if (group_lasts[i] - pivot > 1) { + group_firsts.push_back(pivot); + group_lasts.push_back(group_lasts[i]); + key_firsts.push_back(std::get<0>(second_first)); + key_lasts.push_back(key_lasts[i]); + value_firsts.push_back(std::get<1>(second_first)); + } + } + } + offset_first = offset_last; + offset_last = group_firsts.size(); + } +} + } // namespace detail template @@ -283,33 +426,15 @@ rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [I ValueIterator tx_value_last /* [INOUT */, ValueToGroupIdOp value_to_group_id_op, int num_groups, - bool mem_frugal, + size_t mem_frugal_threshold, rmm::cuda_stream_view stream_view) { - if (mem_frugal) { - auto pivot = num_groups / 2; - auto second_first = detail::mem_frugal_partition( - tx_value_first, tx_value_last, value_to_group_id_op, pivot, stream_view); - thrust::sort(rmm::exec_policy(stream_view), - tx_value_first, - second_first, - [value_to_group_id_op] __device__(auto lhs, auto rhs) { - return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); - }); - thrust::sort(rmm::exec_policy(stream_view), - second_first, - tx_value_last, - [value_to_group_id_op] __device__(auto lhs, auto rhs) { - return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); - }); - } else { - thrust::sort(rmm::exec_policy(stream_view), - tx_value_first, - tx_value_last, - [value_to_group_id_op] __device__(auto lhs, auto rhs) { - return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); - }); - } + detail::mem_frugal_groupby(tx_value_first, + tx_value_last, + value_to_group_id_op, + num_groups, + mem_frugal_threshold, + stream_view); auto group_id_first = thrust::make_transform_iterator( tx_value_first, @@ -334,36 +459,16 @@ rmm::device_uvector groupby_and_count(VertexIterator tx_key_first /* [IN ValueIterator tx_value_first /* [INOUT */, KeyToGroupIdOp key_to_group_id_op, int 
num_groups, - bool mem_frugal, + size_t mem_frugal_threshold, rmm::cuda_stream_view stream_view) { - if (mem_frugal) { - auto pivot = num_groups / 2; - auto second_first = detail::mem_frugal_partition( - tx_key_first, tx_key_last, tx_value_first, key_to_group_id_op, pivot, stream_view); - thrust::sort_by_key(rmm::exec_policy(stream_view), - tx_key_first, - std::get<0>(second_first), - tx_value_first, - [key_to_group_id_op] __device__(auto lhs, auto rhs) { - return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); - }); - thrust::sort_by_key(rmm::exec_policy(stream_view), - std::get<0>(second_first), - tx_key_last, - std::get<1>(second_first), - [key_to_group_id_op] __device__(auto lhs, auto rhs) { - return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); - }); - } else { - thrust::sort_by_key(rmm::exec_policy(stream_view), - tx_key_first, - tx_key_last, - tx_value_first, - [key_to_group_id_op] __device__(auto lhs, auto rhs) { - return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); - }); - } + detail::mem_frugal_groupby(tx_key_first, + tx_key_last, + tx_value_first, + key_to_group_id_op, + num_groups, + mem_frugal_threshold, + stream_view); auto group_id_first = thrust::make_transform_iterator( tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); }); @@ -439,8 +544,12 @@ auto groupby_gpu_id_and_shuffle_values(raft::comms::comms_t const& comm, { auto const comm_size = comm.get_size(); - auto d_tx_value_counts = groupby_and_count( - tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), false, stream_view); + auto d_tx_value_counts = groupby_and_count(tx_value_first, + tx_value_last, + value_to_gpu_id_op, + comm.get_size(), + std::numeric_limits::max(), + stream_view); std::vector tx_counts{}; std::vector tx_offsets{}; @@ -494,7 +603,7 @@ auto groupby_gpu_id_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, tx_value_first, key_to_gpu_id_op, comm.get_size(), - false, + std::numeric_limits::max(), stream_view); std::vector tx_counts{}; diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu index b22c839e346..6e9434882ba 100644 --- a/cpp/src/detail/shuffle_wrappers.cu +++ b/cpp/src/detail/shuffle_wrappers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,20 +43,21 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, auto total_global_mem = handle.get_device_properties().totalGlobalMem; auto element_size = sizeof(vertex_t) * 2 + (d_edgelist_weights ? 
sizeof(weight_t) : size_t{0});
-  auto mem_frugal =
-    d_edgelist_majors.size() * element_size >=
-    total_global_mem /
-      5;  // if the data size exceeds 1/5 of the device memory (1/5 is a tuning parameter),
-          // groupby_and_count requires a temporary buffer comparable to the input data size; if
-          // mem_frugal is set to true, the temporary buffer size can be reduced by up to 50%
-
-  // invoke groupby_and_count and shuffle_values instead of directly calling
-  // groupby_gpu_id_and_shuffle_values to pass mem_frugal (there is no benefit in reducing peak
-  // memory as we need to allocate a receive buffer anyway), but this reduces the maximum memory
-  // allocation size by half (thrust::sort used inside the groupby_and_count allocates the entire
-  // temporary buffer in a single chunk, and the pool allocator often cannot handle a large single
-  // allocation (due to fragmentation) even when the remaining free memory in aggregate is
-  // significantly larger than the requested size).
+  auto constexpr mem_frugal_ratio =
+    0.1;  // if the expected temporary buffer size exceeds the mem_frugal_ratio of the
+          // total_global_mem, switch to the memory frugal approach (thrust::sort is used to
+          // group-by by default, and thrust::sort requires a temporary buffer comparable to the
+          // input data size)
+  auto mem_frugal_threshold =
+    static_cast<size_t>(static_cast<double>(total_global_mem / element_size) * mem_frugal_ratio);
+
+  // invoke groupby_and_count and shuffle_values instead of directly calling
+  // groupby_gpu_id_and_shuffle_values to pass mem_frugal_threshold (there is no benefit in
+  // reducing peak memory as we need to allocate a receive buffer anyway), but this reduces the
+  // maximum memory allocation size by half or more (thrust::sort used inside the
+  // groupby_and_count allocates the entire temporary buffer in a single chunk, and the pool
+  // allocator often cannot handle a large single allocation (due to fragmentation) even when the
+  // remaining free memory in aggregate is significantly larger than the requested size).
   rmm::device_uvector<vertex_t> d_rx_edgelist_majors(0, handle.get_stream());
   rmm::device_uvector<vertex_t> d_rx_edgelist_minors(0, handle.get_stream());
   std::optional<rmm::device_uvector<weight_t>> d_rx_edgelist_weights{std::nullopt};
@@ -73,7 +74,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle,
         return key_func(thrust::get<0>(val), thrust::get<1>(val));
       },
       comm_size,
-      mem_frugal,
+      mem_frugal_threshold,
       handle.get_stream());

     std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
@@ -99,7 +100,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle,
         return key_func(thrust::get<0>(val), thrust::get<1>(val));
       },
       comm_size,
-      mem_frugal,
+      mem_frugal_threshold,
       handle.get_stream());

     std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
@@ -109,8 +110,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle,
                       handle.get_stream());
     handle.sync_stream();

-    std::forward_as_tuple(
-      std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors), std::ignore) =
+    std::forward_as_tuple(std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors), std::ignore) =
       shuffle_values(comm, edge_first, h_tx_value_counts, handle.get_stream());
   }

@@ -196,12 +196,13 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
   auto total_global_mem = handle.get_device_properties().totalGlobalMem;
   auto element_size = sizeof(vertex_t) * 2 + (d_edgelist_weights ? sizeof(weight_t) : size_t{0});
-  auto mem_frugal =
-    d_edgelist_majors.size() * element_size >=
-    total_global_mem /
-      5;  // if the data size exceeds 1/5 of the device memory (1/5 is a tuning parameter),
-          // groupby_and_count requires temporary buffer comparable to the input data size, if
-          // mem_frugal is set to true, temporary buffer size can be reduced up to 50%
+  auto constexpr mem_frugal_ratio =
+    0.1;  // if the expected temporary buffer size exceeds the mem_frugal_ratio of the
+          // total_global_mem, switch to the memory frugal approach (thrust::sort is used to
+          // group-by by default, and thrust::sort requires a temporary buffer comparable to the
+          // input data size)
+  auto mem_frugal_threshold =
+    static_cast<size_t>(static_cast<double>(total_global_mem / element_size) * mem_frugal_ratio);

   auto pair_first = thrust::make_zip_iterator(
     thrust::make_tuple(d_edgelist_majors.begin(), d_edgelist_minors.begin()));
@@ -227,13 +228,13 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
                                  d_edgelist_weights->begin(),
                                  local_partition_id_gpu_id_pair_op,
                                  comm_size,
-                                 mem_frugal,
+                                 mem_frugal_threshold,
                                  handle.get_stream())
              : cugraph::groupby_and_count(pair_first,
                                           pair_first + d_edgelist_majors.size(),
                                           local_partition_id_gpu_id_pair_op,
                                           comm_size,
-                                          mem_frugal,
+                                          mem_frugal_threshold,
                                           handle.get_stream());
   } else {
     auto local_partition_id_op =
@@ -249,13 +250,13 @@ rmm::device_uvector<size_t> groupby_and_count_edgelist_by_local_partition_id(
                                  d_edgelist_weights->begin(),
                                  local_partition_id_op,
                                  col_comm_size,
-                                 mem_frugal,
+                                 mem_frugal_threshold,
                                  handle.get_stream())
              : cugraph::groupby_and_count(pair_first,
                                           pair_first + d_edgelist_majors.size(),
                                           local_partition_id_op,
                                           col_comm_size,
-                                          mem_frugal,
+                                          mem_frugal_threshold,
                                           handle.get_stream());
   }
 }

From 95f2295714756f4e72647f00b833dadb234b113b Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 16 Feb 2022 13:14:13 -0800
Subject: [PATCH 30/60] move counting unique local edge majors/minors inside the graph constructor to limit peak memory usage

---
 cpp/include/cugraph/graph_functions.hpp      |   3 -
 cpp/src/structure/coarsen_graph_impl.cuh     |   4 +-
 .../create_graph_from_edgelist_impl.cuh      |   4 +-
 cpp/src/structure/graph_impl.cuh             | 100 +++++++-------
 cpp/src/structure/renumber_edgelist_impl.cuh | 112 ++++++++----------
 5 files changed, 103 insertions(+), 120 deletions(-)

diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp
index 5ddc244b183..d3017ac7aaa 100644
--- a/cpp/include/cugraph/graph_functions.hpp
+++ b/cpp/include/cugraph/graph_functions.hpp
@@ -37,9 +37,6 @@ struct renumber_meta_t>
   edge_t number_of_edges{};
   partition_t<vertex_t> partition{};
   std::vector<vertex_t> segment_offsets{};
-
-  vertex_t num_local_unique_edge_majors{};
-  vertex_t num_local_unique_edge_minors{};
 };

 template
diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh
index aa2a87ccc72..2559f9ef408 100644
--- a/cpp/src/structure/coarsen_graph_impl.cuh
+++ b/cpp/src/structure/coarsen_graph_impl.cuh
@@ -421,9 +421,7 @@ coarsen_graph(
       meta.number_of_edges,
       graph_properties_t{graph_view.is_symmetric(), false},
       meta.partition,
-      meta.segment_offsets,
-      store_transposed ? meta.num_local_unique_edge_minors : meta.num_local_unique_edge_majors,
-      store_transposed ?
meta.num_local_unique_edge_majors : meta.num_local_unique_edge_minors}), + meta.segment_offsets}), std::move(renumber_map_labels)); } diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index f05f5f957c6..985c99eda36 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -288,9 +288,7 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, meta.number_of_edges, graph_properties, meta.partition, - meta.segment_offsets, - store_transposed ? meta.num_local_unique_edge_minors : meta.num_local_unique_edge_majors, - store_transposed ? meta.num_local_unique_edge_majors : meta.num_local_unique_edge_minors}), + meta.segment_offsets}), std::optional>{std::move(renumber_map_labels)}); } diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index ef64e60ac2f..c674430ac20 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -80,6 +80,29 @@ struct has_nzd_t { } }; +// can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an +// extended __device__ lambda must allow its address to be taken) +template +struct atomic_or_bitmap_t { + uint32_t* bitmaps{nullptr}; + vertex_t minor_first{}; + + __device__ void operator()(vertex_t minor) const { + auto minor_offset = minor - minor_first; + auto mask = uint32_t{1} << (minor_offset % (sizeof(uint32_t) * 8)); + atomicOr(bitmaps + (minor_offset / (sizeof(uint32_t) * 8)), mask); + } +}; + +// can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an +// extended __device__ lambda must allow its address to be taken) +template +struct popc_t { + __device__ vertex_t operator()(uint32_t bitmap) const { + return static_cast(__popc(bitmap)); + } +}; + // can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an // extended __device__ lambda must allow its address to be taken) template @@ -573,48 +596,6 @@ graph_t majors(number_of_local_edges, handle.get_stream()); - rmm::device_uvector minors(number_of_local_edges, handle.get_stream()); - size_t cur_size{0}; - for (size_t i = 0; i < edgelists.size(); ++i) { - auto p_majors = store_transposed ? edgelists[i].p_dst_vertices : edgelists[i].p_src_vertices; - auto p_minors = store_transposed ? edgelists[i].p_src_vertices : edgelists[i].p_dst_vertices; - thrust::copy(handle.get_thrust_policy(), - p_majors, - p_majors + edgelists[i].number_of_edges, - majors.begin() + cur_size); - thrust::copy(handle.get_thrust_policy(), - p_minors, - p_minors + edgelists[i].number_of_edges, - minors.begin() + cur_size); - cur_size += edgelists[i].number_of_edges; - } - thrust::sort(handle.get_thrust_policy(), majors.begin(), majors.end()); - thrust::sort(handle.get_thrust_policy(), minors.begin(), minors.end()); - auto num_local_unique_edge_majors = static_cast(thrust::distance( - majors.begin(), thrust::unique(handle.get_thrust_policy(), majors.begin(), majors.end()))); - auto num_local_unique_edge_minors = static_cast(thrust::distance( - minors.begin(), thrust::unique(handle.get_thrust_policy(), minors.begin(), minors.end()))); - // FIXME: temporarily disable this check as these are currently not used - // (row_col_properties_kv_pair_fill_ratio_threshold is set to 0.0, so (key, value) pairs for - // row/column properties will be never enabled) and we're not currently exposing this to the - // python layer. 
Should be re-enabled later once we enable the (key, value) pair feature and - // hopefully simplify the python graph creation pipeline as well (so no need to pass this - // information to the python layer). -#if 0 - if constexpr (store_transposed) { - CUGRAPH_EXPECTS(num_local_unique_edge_majors == meta.num_local_unique_edge_cols, - "Invalid input argument: num_local_unique_edge_cols is erroneous."); - CUGRAPH_EXPECTS(num_local_unique_edge_minors == meta.num_local_unique_edge_rows, - "Invalid input argument: num_local_unique_edge_rows is erroneous."); - } else { - CUGRAPH_EXPECTS(num_local_unique_edge_majors == meta.num_local_unique_edge_rows, - "Invalid input argument: num_local_unique_edge_rows is erroneous."); - CUGRAPH_EXPECTS(num_local_unique_edge_minors == meta.num_local_unique_edge_cols, - "Invalid input argument: num_local_unique_edge_cols is erroneous."); - } -#endif } // aggregate segment_offsets @@ -701,13 +682,40 @@ graph_t(adj_matrix_partition_indices_[i].size())); } + // if # unique edge rows/cols << V / row_comm_size|col_comm_size, store unique edge rows/cols to // support storing edge row/column properties in (key, value) pairs. - auto num_local_unique_edge_majors = - store_transposed ? meta.num_local_unique_edge_cols : meta.num_local_unique_edge_rows; - auto num_local_unique_edge_minors = - store_transposed ? meta.num_local_unique_edge_rows : meta.num_local_unique_edge_cols; + vertex_t num_local_unique_edge_majors{0}; + for (size_t i = 0; i < adj_matrix_partition_offsets_.size(); ++i) { + num_local_unique_edge_majors += thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(static_cast(adj_matrix_partition_offsets_[i].size() - 1)), + has_nzd_t{adj_matrix_partition_offsets_[i].data(), vertex_t{0}}); + } + + auto [minor_first, minor_last] = partition_.get_matrix_partition_minor_range(); + rmm::device_uvector minor_bitmaps( + ((minor_last - minor_first) + sizeof(uint32_t) * 8 - 1) / (sizeof(uint32_t) * 8), + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), minor_bitmaps.begin(), minor_bitmaps.end(), uint32_t{0}); + for (size_t i = 0; i < adj_matrix_partition_indices_.size(); ++i) { + thrust::for_each(handle.get_thrust_policy(), + adj_matrix_partition_indices_[i].begin(), + adj_matrix_partition_indices_[i].end(), + atomic_or_bitmap_t{minor_bitmaps.data(), minor_first}); + } + + auto count_first = thrust::make_transform_iterator(minor_bitmaps.begin(), popc_t{}); + auto num_local_unique_edge_minors = thrust::reduce( + handle.get_thrust_policy(), + count_first, + count_first + minor_bitmaps.size(), + vertex_t{0}); + + minor_bitmaps.resize(0, handle.get_stream()); + minor_bitmaps.shrink_to_fit(handle.get_stream()); vertex_t aggregate_major_size{0}; for (size_t i = 0; i < partition_.get_number_of_matrix_partitions(); ++i) { diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 959d11b783f..aeb7682f440 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -77,23 +77,21 @@ struct search_and_set_degree_t { } }; -// returns renumber map, segment_offsets, and # unique edge majors & minors +// returns renumber map and segment_offsets template -std::tuple, std::vector, vertex_t, vertex_t> -compute_renumber_map(raft::handle_t const& handle, - std::optional>&& local_vertices, - std::vector const& edgelist_majors, - std::vector const& edgelist_minors, - std::vector const& 
edgelist_edge_counts) +std::tuple, std::vector> compute_renumber_map( + raft::handle_t const& handle, + std::optional>&& local_vertices, + std::vector const& edgelist_majors, + std::vector const& edgelist_minors, + std::vector const& edgelist_edge_counts) { rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); - vertex_t num_local_unique_edge_majors{0}; - vertex_t num_local_unique_edge_minors{0}; edge_t num_local_edges = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); // 1. if local_vertices.has_value() is false, find unique vertices from edge majors (to construct - // local_vertices), unique edge majors will be counted in step 4. + // local_vertices) rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); if (!local_vertices) { @@ -127,42 +125,40 @@ compute_renumber_map(raft::handle_t const& handle, sorted_unique_majors.shrink_to_fit(handle.get_stream()); } - // 2. count unique edge minors. - // if local_vertices.has_value() is false, keep unique vertices from edge minors as well (to - // construct local_vertices) + // 2. if local_vertices.has_value() is false, find unique vertices from edge minors (to construct + // local_vertices) - rmm::device_uvector sorted_unique_minors(num_local_edges, handle.get_stream()); - size_t minor_offset{0}; - for (size_t i = 0; i < edgelist_minors.size(); ++i) { - thrust::copy(handle.get_thrust_policy(), - edgelist_minors[i], - edgelist_minors[i] + edgelist_edge_counts[i], - sorted_unique_minors.begin() + minor_offset); - thrust::sort(handle.get_thrust_policy(), - sorted_unique_minors.begin() + minor_offset, - sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]); - minor_offset += static_cast(thrust::distance( - sorted_unique_minors.begin() + minor_offset, - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin() + minor_offset, - sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]))); - } - sorted_unique_minors.resize(minor_offset, handle.get_stream()); - if (edgelist_minors.size() > 1) { - thrust::sort( - handle.get_thrust_policy(), sorted_unique_minors.begin(), sorted_unique_minors.end()); - sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin(), - sorted_unique_minors.end())), - handle.get_stream()); + rmm::device_uvector sorted_unique_minors(0, handle.get_stream()); + if (!local_vertices) { + sorted_unique_minors.resize(num_local_edges, handle.get_stream()); + size_t minor_offset{0}; + for (size_t i = 0; i < edgelist_minors.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + edgelist_minors[i], + edgelist_minors[i] + edgelist_edge_counts[i], + sorted_unique_minors.begin() + minor_offset); + thrust::sort(handle.get_thrust_policy(), + sorted_unique_minors.begin() + minor_offset, + sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]); + minor_offset += static_cast(thrust::distance( + sorted_unique_minors.begin() + minor_offset, + thrust::unique(handle.get_thrust_policy(), + sorted_unique_minors.begin() + minor_offset, + sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]))); + } + sorted_unique_minors.resize(minor_offset, handle.get_stream()); + if (edgelist_minors.size() > 1) { + thrust::sort( + handle.get_thrust_policy(), sorted_unique_minors.begin(), sorted_unique_minors.end()); + sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + 
sorted_unique_minors.begin(), + sorted_unique_minors.end())), + handle.get_stream()); + } + sorted_unique_minors.shrink_to_fit(handle.get_stream()); } - num_local_unique_edge_minors = static_cast(sorted_unique_minors.size()); - - if (local_vertices) { sorted_unique_minors.resize(0, handle.get_stream()); } - sorted_unique_minors.shrink_to_fit(handle.get_stream()); - // 3. update sorted_local_vertices. // if local_vertices.has_value() is false, reconstruct local_vertices first @@ -207,8 +203,7 @@ compute_renumber_map(raft::handle_t const& handle, } } - // 4. compute global degrees for the sorted local vertices, and count unique edge majors on the - // way + // 4. compute global degrees for the sorted local vertices rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); std::optional> stream_pool_indices{ @@ -280,8 +275,6 @@ compute_renumber_map(raft::handle_t const& handle, tmp_keys.begin(), tmp_values.begin()); - num_local_unique_edge_majors += num_unique_majors; - tmp_majors.resize(0, loop_stream); tmp_majors.shrink_to_fit(loop_stream); @@ -342,8 +335,6 @@ compute_renumber_map(raft::handle_t const& handle, tmp_keys.begin(), tmp_values.begin()); - num_local_unique_edge_majors += num_unique_majors; - tmp_majors.resize(0, handle.get_stream()); tmp_majors.shrink_to_fit(handle.get_stream()); @@ -426,10 +417,7 @@ compute_renumber_map(raft::handle_t const& handle, handle.get_stream()); handle.sync_stream(); - return std::make_tuple(std::move(sorted_local_vertices), - h_segment_offsets, - num_local_unique_edge_majors, - num_local_unique_edge_minors); + return std::make_tuple(std::move(sorted_local_vertices), h_segment_offsets); } template @@ -682,10 +670,7 @@ renumber_edgelist( // 1. compute renumber map - auto [renumber_map_labels, - vertex_partition_segment_offsets, - num_unique_edge_majors, - num_unique_edge_minors] = + auto [renumber_map_labels, vertex_partition_segment_offsets] = detail::compute_renumber_map(handle, std::move(local_vertices), edgelist_const_majors, @@ -765,7 +750,8 @@ renumber_edgelist( } } - if ((partition.get_matrix_partition_minor_size() >= number_of_edges / comm_size) && + if ((static_cast(partition.get_matrix_partition_minor_size() / load_factor) >= + static_cast(number_of_edges / comm_size)) && edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part vertex_t max_segment_size{0}; @@ -863,12 +849,8 @@ renumber_edgelist( return std::make_tuple( std::move(renumber_map_labels), - renumber_meta_t{number_of_vertices, - number_of_edges, - partition, - vertex_partition_segment_offsets, - num_unique_edge_majors, - num_unique_edge_minors}); + renumber_meta_t{ + number_of_vertices, number_of_edges, partition, vertex_partition_segment_offsets}); } template @@ -894,7 +876,7 @@ renumber_edgelist(raft::handle_t const& handle, rmm::device_uvector renumber_map_labels(0, handle.get_stream()); std::vector segment_offsets{}; - std::tie(renumber_map_labels, segment_offsets, std::ignore, std::ignore) = + std::tie(renumber_map_labels, segment_offsets) = detail::compute_renumber_map( handle, std::move(vertices), From 15aeb4ca9e9db56a2fb2c7434eb5483327e6ea78 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 16 Feb 2022 13:32:15 -0800 Subject: [PATCH 31/60] clang-format --- cpp/src/structure/create_graph_from_edgelist_impl.cuh | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh 
b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 985c99eda36..bad875d554f 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -283,12 +283,11 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, cugraph::graph_t( handle, edgelists, - cugraph::graph_meta_t{ - meta.number_of_vertices, - meta.number_of_edges, - graph_properties, - meta.partition, - meta.segment_offsets}), + cugraph::graph_meta_t{meta.number_of_vertices, + meta.number_of_edges, + graph_properties, + meta.partition, + meta.segment_offsets}), std::optional>{std::move(renumber_map_labels)}); } From e7102e09d81edbd233744bb313f6dee004fcef1f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 16 Feb 2022 13:35:42 -0800 Subject: [PATCH 32/60] copyright year --- cpp/include/cugraph/graph_functions.hpp | 2 +- .../prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh | 2 +- cpp/include/cugraph/utilities/cython.hpp | 2 +- cpp/src/components/weakly_connected_components_impl.cuh | 2 +- cpp/src/structure/coarsen_graph_impl.cuh | 2 +- cpp/src/structure/create_graph_from_edgelist_impl.cuh | 2 +- cpp/src/utilities/cython.cu | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index d3017ac7aaa..c170ce65253 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh index 968d99b7d25..c81cf2d133e 100644 --- a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cugraph/utilities/cython.hpp b/cpp/include/cugraph/utilities/cython.hpp index 260393009bd..7cc6afb8aee 100644 --- a/cpp/include/cugraph/utilities/cython.hpp +++ b/cpp/include/cugraph/utilities/cython.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 757fc9e3d23..a1f663a301c 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 2559f9ef408..6234acf5559 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index bad875d554f..ea12a3562ba 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 59241a3e913..1527ae90afd 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 7b544a6b9044bc23bb8cebf0cd61b5e8748120bd Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 16 Feb 2022 13:40:01 -0800 Subject: [PATCH 33/60] clang-format --- cpp/src/structure/graph_impl.cuh | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index e136906aef4..eff76df8a79 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -87,7 +87,8 @@ struct atomic_or_bitmap_t { uint32_t* bitmaps{nullptr}; vertex_t minor_first{}; - __device__ void operator()(vertex_t minor) const { + __device__ void operator()(vertex_t minor) const + { auto minor_offset = minor - minor_first; auto mask = uint32_t{1} << (minor_offset % (sizeof(uint32_t) * 8)); atomicOr(bitmaps + (minor_offset / (sizeof(uint32_t) * 8)), mask); @@ -98,7 +99,8 @@ struct atomic_or_bitmap_t { // extended __device__ lambda must allow its address to be taken) template struct popc_t { - __device__ vertex_t operator()(uint32_t bitmap) const { + __device__ vertex_t operator()(uint32_t bitmap) const + { return static_cast(__popc(bitmap)); } }; @@ -682,7 +684,6 @@ graph_t(adj_matrix_partition_indices_[i].size())); } - // if # unique edge rows/cols << V / row_comm_size|col_comm_size, store unique edge rows/cols to // support storing edge row/column properties in (key, value) pairs. 
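The atomic_or_bitmap_t and popc_t functors reformatted above carry the unique-edge-minor count introduced in patch 30: set one bit per observed vertex in the minor range with atomicOr, then sum per-word popcounts. A self-contained sketch of the same technique (hypothetical names; int vertices chosen for brevity):

#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>

#include <cstdint>

struct set_bit_t {
  uint32_t* bitmap{nullptr};
  int minor_first{};

  __device__ void operator()(int minor) const
  {
    auto offset = minor - minor_first;
    atomicOr(bitmap + (offset / 32), uint32_t{1} << (offset % 32));
  }
};

struct popcount_t {
  __device__ int operator()(uint32_t word) const { return __popc(word); }
};

// count distinct vertices in minors, all falling in [minor_first, minor_last):
// the bitmap needs only (minor_last - minor_first) bits of extra memory,
// instead of the O(E) scratch a sort-and-unique approach would require
int count_unique_minors_sketch(thrust::device_vector<int> const& minors,
                               int minor_first,
                               int minor_last)
{
  thrust::device_vector<uint32_t> bitmap(((minor_last - minor_first) + 31) / 32, uint32_t{0});
  thrust::for_each(minors.begin(),
                   minors.end(),
                   set_bit_t{thrust::raw_pointer_cast(bitmap.data()), minor_first});
  auto count_first = thrust::make_transform_iterator(bitmap.begin(), popcount_t{});
  return thrust::reduce(count_first, count_first + bitmap.size(), 0);
}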
@@ -691,7 +692,8 @@ graph_t(adj_matrix_partition_offsets_[i].size() - 1)), + thrust::make_counting_iterator( + static_cast(adj_matrix_partition_offsets_[i].size() - 1)), has_nzd_t{adj_matrix_partition_offsets_[i].data(), vertex_t{0}}); } @@ -709,10 +711,7 @@ graph_t{}); auto num_local_unique_edge_minors = thrust::reduce( - handle.get_thrust_policy(), - count_first, - count_first + minor_bitmaps.size(), - vertex_t{0}); + handle.get_thrust_policy(), count_first, count_first + minor_bitmaps.size(), vertex_t{0}); minor_bitmaps.resize(0, handle.get_stream()); minor_bitmaps.shrink_to_fit(handle.get_stream()); From 6b4018735c3ac58a578183830834f19794ec5731 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 16 Feb 2022 17:29:12 -0800 Subject: [PATCH 34/60] cosmetic updates --- cpp/src/structure/coarsen_graph_impl.cuh | 211 +++++++++++------------ 1 file changed, 100 insertions(+), 111 deletions(-) diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index b0f6c7eca05..ec01135f7ae 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -46,43 +46,38 @@ namespace cugraph { namespace { template -edge_t groupby_e_and_coarsen_edgelist(vertex_t* edgelist_major_vertices /* [INOUT] */, - vertex_t* edgelist_minor_vertices /* [INOUT] */, +edge_t groupby_e_and_coarsen_edgelist(vertex_t* edgelist_majors /* [INOUT] */, + vertex_t* edgelist_minors /* [INOUT] */, std::optional edgelist_weights /* [INOUT] */, edge_t number_of_edges, cudaStream_t stream) { - auto pair_first = - thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices)); + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors, edgelist_minors)); if (edgelist_weights) { thrust::sort_by_key( rmm::exec_policy(stream), pair_first, pair_first + number_of_edges, *edgelist_weights); - rmm::device_uvector tmp_edgelist_major_vertices(number_of_edges, stream); - rmm::device_uvector tmp_edgelist_minor_vertices(tmp_edgelist_major_vertices.size(), - stream); - rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream); - auto it = thrust::reduce_by_key( - rmm::exec_policy(stream), - pair_first, - pair_first + number_of_edges, - (*edgelist_weights), - thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), - tmp_edgelist_minor_vertices.begin())), - tmp_edgelist_weights.begin()); + rmm::device_uvector tmp_edgelist_majors(number_of_edges, stream); + rmm::device_uvector tmp_edgelist_minors(tmp_edgelist_majors.size(), stream); + rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_majors.size(), stream); + auto it = thrust::reduce_by_key(rmm::exec_policy(stream), + pair_first, + pair_first + number_of_edges, + (*edgelist_weights), + thrust::make_zip_iterator(thrust::make_tuple( + tmp_edgelist_majors.begin(), tmp_edgelist_minors.begin())), + tmp_edgelist_weights.begin()); auto ret = static_cast(thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it))); - auto edge_first = - thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), - tmp_edgelist_minor_vertices.begin(), - tmp_edgelist_weights.begin())); + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + tmp_edgelist_majors.begin(), tmp_edgelist_minors.begin(), tmp_edgelist_weights.begin())); thrust::copy(rmm::exec_policy(stream), edge_first, edge_first + ret, - thrust::make_zip_iterator(thrust::make_tuple( - edgelist_major_vertices, edgelist_minor_vertices, 
*edgelist_weights))); + thrust::make_zip_iterator( + thrust::make_tuple(edgelist_majors, edgelist_minors, *edgelist_weights))); return ret; } else { @@ -113,27 +108,26 @@ decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( // FIXME: it might be possible to directly create relabled & coarsened edgelist from the // compressed sparse format to save memory - rmm::device_uvector edgelist_major_vertices(matrix_partition.get_number_of_edges(), - handle.get_stream()); - rmm::device_uvector edgelist_minor_vertices(edgelist_major_vertices.size(), - handle.get_stream()); + rmm::device_uvector edgelist_majors(matrix_partition.get_number_of_edges(), + handle.get_stream()); + rmm::device_uvector edgelist_minors(edgelist_majors.size(), handle.get_stream()); auto edgelist_weights = matrix_partition.get_weights() ? std::make_optional>( - edgelist_major_vertices.size(), handle.get_stream()) + edgelist_majors.size(), handle.get_stream()) : std::nullopt; detail::decompress_matrix_partition_to_edgelist( handle, matrix_partition, - edgelist_major_vertices.data(), - edgelist_minor_vertices.data(), + edgelist_majors.data(), + edgelist_minors.data(), edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, segment_offsets); - auto pair_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors.begin(), edgelist_minors.begin())); thrust::transform(handle.get_thrust_policy(), pair_first, - pair_first + edgelist_major_vertices.size(), + pair_first + edgelist_majors.size(), pair_first, [major_label_first, minor_label_input, @@ -145,23 +139,22 @@ decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( }); auto number_of_edges = groupby_e_and_coarsen_edgelist( - edgelist_major_vertices.data(), - edgelist_minor_vertices.data(), + edgelist_majors.data(), + edgelist_minors.data(), edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, - static_cast(edgelist_major_vertices.size()), + static_cast(edgelist_majors.size()), handle.get_stream()); - edgelist_major_vertices.resize(number_of_edges, handle.get_stream()); - edgelist_major_vertices.shrink_to_fit(handle.get_stream()); - edgelist_minor_vertices.resize(number_of_edges, handle.get_stream()); - edgelist_minor_vertices.shrink_to_fit(handle.get_stream()); + edgelist_majors.resize(number_of_edges, handle.get_stream()); + edgelist_majors.shrink_to_fit(handle.get_stream()); + edgelist_minors.resize(number_of_edges, handle.get_stream()); + edgelist_minors.shrink_to_fit(handle.get_stream()); if (edgelist_weights) { (*edgelist_weights).resize(number_of_edges, handle.get_stream()); (*edgelist_weights).shrink_to_fit(handle.get_stream()); } - return std::make_tuple(std::move(edgelist_major_vertices), - std::move(edgelist_minor_vertices), - std::move(edgelist_weights)); + return std::make_tuple( + std::move(edgelist_majors), std::move(edgelist_minors), std::move(edgelist_weights)); } } // namespace @@ -213,19 +206,19 @@ coarsen_graph( copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels); } - std::vector> coarsened_edgelist_major_vertices{}; - std::vector> coarsened_edgelist_minor_vertices{}; + std::vector> coarsened_edgelist_majors{}; + std::vector> coarsened_edgelist_minors{}; auto coarsened_edgelist_weights = graph_view.is_weighted() ? 
std::make_optional>>({}) : std::nullopt; - coarsened_edgelist_major_vertices.reserve(graph_view.get_number_of_local_adj_matrix_partitions()); - coarsened_edgelist_minor_vertices.reserve(coarsened_edgelist_major_vertices.size()); + coarsened_edgelist_majors.reserve(graph_view.get_number_of_local_adj_matrix_partitions()); + coarsened_edgelist_minors.reserve(coarsened_edgelist_majors.size()); if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights).reserve(coarsened_edgelist_major_vertices.size()); + (*coarsened_edgelist_weights).reserve(coarsened_edgelist_majors.size()); } for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - coarsened_edgelist_major_vertices.emplace_back(0, handle.get_stream()); - coarsened_edgelist_minor_vertices.emplace_back(0, handle.get_stream()); + coarsened_edgelist_majors.emplace_back(0, handle.get_stream()); + coarsened_edgelist_minors.emplace_back(0, handle.get_stream()); if (coarsened_edgelist_weights) { (*coarsened_edgelist_weights).emplace_back(0, handle.get_stream()); } @@ -248,7 +241,7 @@ coarsen_graph( static_cast(i), handle.get_stream()); - auto [edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights] = + auto [edgelist_majors, edgelist_minors, edgelist_weights] = decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( handle, matrix_partition_device_view_t( @@ -259,10 +252,10 @@ coarsen_graph( // 1-2. globally shuffle - std::tie(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights) = + std::tie(edgelist_majors, edgelist_minors, edgelist_weights) = cugraph::detail::shuffle_edgelist_by_gpu_id(handle, - std::move(edgelist_major_vertices), - std::move(edgelist_minor_vertices), + std::move(edgelist_majors), + std::move(edgelist_minors), std::move(edgelist_weights)); // 1-3. append data to local adjacency matrix partitions @@ -272,7 +265,7 @@ coarsen_graph( // groupby_adj_matrix_partition_and_shuffle_values). auto counts = cugraph::detail::groupby_and_count_edgelist_by_local_partition_id( - handle, edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights); + handle, edgelist_majors, edgelist_minors, edgelist_weights); std::vector h_counts(counts.size()); raft::update_host(h_counts.data(), counts.data(), counts.size(), handle.get_stream()); @@ -283,68 +276,66 @@ coarsen_graph( for (int j = 0; j < col_comm_size; ++j) { auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( - edgelist_major_vertices.begin() + h_displacements[j], - edgelist_minor_vertices.begin() + h_displacements[j], + edgelist_majors.begin() + h_displacements[j], + edgelist_minors.begin() + h_displacements[j], edgelist_weights ? std::optional{(*edgelist_weights).data() + h_displacements[j]} : std::nullopt, h_counts[j], handle.get_stream()); - auto cur_size = coarsened_edgelist_major_vertices[j].size(); + auto cur_size = coarsened_edgelist_majors[j].size(); // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we // can reserve address space to avoid expensive reallocation. 
// https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management - coarsened_edgelist_major_vertices[j].resize(cur_size + number_of_partition_edges, - handle.get_stream()); - coarsened_edgelist_minor_vertices[j].resize(coarsened_edgelist_major_vertices[j].size(), - handle.get_stream()); + coarsened_edgelist_majors[j].resize(cur_size + number_of_partition_edges, + handle.get_stream()); + coarsened_edgelist_minors[j].resize(coarsened_edgelist_majors[j].size(), handle.get_stream()); if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights)[j].resize(coarsened_edgelist_major_vertices[j].size(), + (*coarsened_edgelist_weights)[j].resize(coarsened_edgelist_majors[j].size(), handle.get_stream()); - auto src_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(), - edgelist_minor_vertices.begin(), - (*edgelist_weights).begin())) + + auto input_edge_first = + thrust::make_zip_iterator(thrust::make_tuple( + edgelist_majors.begin(), edgelist_minors.begin(), (*edgelist_weights).begin())) + h_displacements[j]; - auto dst_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(), - coarsened_edgelist_minor_vertices[j].begin(), + auto output_edge_first = + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors[j].begin(), + coarsened_edgelist_minors[j].begin(), (*coarsened_edgelist_weights)[j].begin())) + cur_size; thrust::copy(handle.get_thrust_policy(), - src_edge_first, - src_edge_first + number_of_partition_edges, - dst_edge_first); + input_edge_first, + input_edge_first + number_of_partition_edges, + output_edge_first); } else { - auto src_edge_first = thrust::make_zip_iterator(thrust::make_tuple( - edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())) + - h_displacements[j]; - auto dst_edge_first = thrust::make_zip_iterator( - thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(), - coarsened_edgelist_minor_vertices[j].begin())) + - cur_size; + auto input_edge_first = thrust::make_zip_iterator(thrust::make_tuple( + edgelist_majors.begin(), edgelist_minors.begin())) + + h_displacements[j]; + auto output_edge_first = + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors[j].begin(), + coarsened_edgelist_minors[j].begin())) + + cur_size; thrust::copy(handle.get_thrust_policy(), - src_edge_first, - src_edge_first + number_of_partition_edges, - dst_edge_first); + input_edge_first, + input_edge_first + number_of_partition_edges, + output_edge_first); } } } - for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { + for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( - coarsened_edgelist_major_vertices[i].data(), - coarsened_edgelist_minor_vertices[i].data(), + coarsened_edgelist_majors[i].data(), + coarsened_edgelist_minors[i].data(), coarsened_edgelist_weights ? 
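// [Editor's note] A side observation on the resize()/shrink_to_fit() pairs used just above and
// throughout this series: rmm::device_uvector::resize() to a smaller size does not by itself
// release device memory, so the follow-up shrink_to_fit() is what actually returns the excess
// allocation to the pool -- in line with the patch series' focus on peak memory footprint.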
std::optional{(*coarsened_edgelist_weights)[i].data()} : std::nullopt, - static_cast(coarsened_edgelist_major_vertices[i].size()), + static_cast(coarsened_edgelist_majors[i].size()), handle.get_stream()); - coarsened_edgelist_major_vertices[i].resize(number_of_partition_edges, handle.get_stream()); - coarsened_edgelist_major_vertices[i].shrink_to_fit(handle.get_stream()); - coarsened_edgelist_minor_vertices[i].resize(number_of_partition_edges, handle.get_stream()); - coarsened_edgelist_minor_vertices[i].shrink_to_fit(handle.get_stream()); + coarsened_edgelist_majors[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_majors[i].shrink_to_fit(handle.get_stream()); + coarsened_edgelist_minors[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_minors[i].shrink_to_fit(handle.get_stream()); if (coarsened_edgelist_weights) { (*coarsened_edgelist_weights)[i].resize(number_of_partition_edges, handle.get_stream()); (*coarsened_edgelist_weights)[i].shrink_to_fit(handle.get_stream()); @@ -378,13 +369,13 @@ coarsen_graph( rmm::device_uvector renumber_map_labels(0, handle.get_stream()); renumber_meta_t meta{}; { - std::vector major_ptrs(coarsened_edgelist_major_vertices.size()); + std::vector major_ptrs(coarsened_edgelist_majors.size()); std::vector minor_ptrs(major_ptrs.size()); std::vector counts(major_ptrs.size()); - for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { - major_ptrs[i] = coarsened_edgelist_major_vertices[i].data(); - minor_ptrs[i] = coarsened_edgelist_minor_vertices[i].data(); - counts[i] = static_cast(coarsened_edgelist_major_vertices[i].size()); + for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { + major_ptrs[i] = coarsened_edgelist_majors[i].data(); + minor_ptrs[i] = coarsened_edgelist_minors[i].data(); + counts[i] = static_cast(coarsened_edgelist_majors[i].size()); } std::tie(renumber_map_labels, meta) = renumber_edgelist( handle, @@ -401,15 +392,15 @@ coarsen_graph( std::vector> edgelists{}; edgelists.resize(graph_view.get_number_of_local_adj_matrix_partitions()); for (size_t i = 0; i < edgelists.size(); ++i) { - edgelists[i].p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices[i].data() - : coarsened_edgelist_major_vertices[i].data(); - edgelists[i].p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices[i].data() - : coarsened_edgelist_minor_vertices[i].data(); + edgelists[i].p_src_vertices = + store_transposed ? coarsened_edgelist_minors[i].data() : coarsened_edgelist_majors[i].data(); + edgelists[i].p_dst_vertices = + store_transposed ? coarsened_edgelist_majors[i].data() : coarsened_edgelist_minors[i].data(); edgelists[i].p_edge_weights = coarsened_edgelist_weights ? 
std::optional{(*coarsened_edgelist_weights)[i].data()} : std::nullopt, - edgelists[i].number_of_edges = static_cast(coarsened_edgelist_major_vertices[i].size()); + edgelists[i].number_of_edges = static_cast(coarsened_edgelist_majors[i].size()); } return std::make_tuple( @@ -447,9 +438,7 @@ coarsen_graph( // currently, nothing to do } - auto [coarsened_edgelist_major_vertices, - coarsened_edgelist_minor_vertices, - coarsened_edgelist_weights] = + auto [coarsened_edgelist_majors, coarsened_edgelist_minors, coarsened_edgelist_weights] = decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( handle, matrix_partition_device_view_t( @@ -472,20 +461,20 @@ coarsen_graph( auto [renumber_map_labels, meta] = renumber_edgelist( handle, std::optional>{std::move(unique_labels)}, - coarsened_edgelist_major_vertices.data(), - coarsened_edgelist_minor_vertices.data(), - static_cast(coarsened_edgelist_major_vertices.size()), + coarsened_edgelist_majors.data(), + coarsened_edgelist_minors.data(), + static_cast(coarsened_edgelist_majors.size()), do_expensive_check); edgelist_t edgelist{}; - edgelist.p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices.data() - : coarsened_edgelist_major_vertices.data(); - edgelist.p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices.data() - : coarsened_edgelist_minor_vertices.data(); + edgelist.p_src_vertices = + store_transposed ? coarsened_edgelist_minors.data() : coarsened_edgelist_majors.data(); + edgelist.p_dst_vertices = + store_transposed ? coarsened_edgelist_majors.data() : coarsened_edgelist_minors.data(); edgelist.p_edge_weights = coarsened_edgelist_weights ? std::optional{(*coarsened_edgelist_weights).data()} : std::nullopt; - edgelist.number_of_edges = static_cast(coarsened_edgelist_major_vertices.size()); + edgelist.number_of_edges = static_cast(coarsened_edgelist_majors.size()); return std::make_tuple( std::make_unique>( From 85f39fb9840a92719849bc1b064c1a871758a393 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 16 Feb 2022 17:31:17 -0800 Subject: [PATCH 35/60] update louvain tests to take --perf option --- cpp/tests/community/louvain_test.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/cpp/tests/community/louvain_test.cpp b/cpp/tests/community/louvain_test.cpp index b86cfdee5c6..364a0b8a68e 100644 --- a/cpp/tests/community/louvain_test.cpp +++ b/cpp/tests/community/louvain_test.cpp @@ -9,6 +9,7 @@ * */ #include +#include #include #include @@ -90,15 +91,28 @@ class Tests_Louvain auto [louvain_usecase, input_usecase] = param; raft::handle_t handle{}; + HighResClock hr_clock{}; // Can't currently check correctness if we renumber bool renumber = true; if (louvain_usecase.check_correctness_) renumber = false; + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } + auto [graph, d_renumber_map_labels] = cugraph::test::construct_graph( handle, input_usecase, true, renumber); + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "construct_graph took " << elapsed_time * 1e-6 << " s.\n"; + } + auto graph_view = graph.view(); // "FIXME": remove this check once we drop support for Pascal @@ -109,6 +123,11 @@ class Tests_Louvain cudaDeviceProp device_prop; RAFT_CUDA_TRY(cudaGetDeviceProperties(&device_prop, 0)); + if (cugraph::test::g_perf) { + 
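// [Editor's note] The --perf timing pattern added in this patch brackets each measured region
// with cudaDeviceSynchronize(): kernel launches are asynchronous, so synchronizing before
// hr_clock.start() and again before hr_clock.stop() makes the host-side clock cover exactly the
// device work issued in between. The `* 1e-6` in the printouts suggests HighResClock reports
// elapsed time in microseconds.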
RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } + if (device_prop.major < 7) { EXPECT_THROW(louvain(graph_view, graph_view.get_number_of_local_vertices(), @@ -123,6 +142,13 @@ class Tests_Louvain louvain_usecase.expected_level_, louvain_usecase.expected_modularity_); } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "Louvain took " << elapsed_time * 1e-6 << " s.\n"; + } } template From 48009a6a8fb430e40e23d841ab6a2e79157ee7cf Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 01:58:00 -0800 Subject: [PATCH 36/60] fix error in comments --- cpp/include/cugraph/detail/decompress_matrix_partition.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cugraph/detail/decompress_matrix_partition.cuh b/cpp/include/cugraph/detail/decompress_matrix_partition.cuh index aa9b2897075..b419d4bdbb3 100644 --- a/cpp/include/cugraph/detail/decompress_matrix_partition.cuh +++ b/cpp/include/cugraph/detail/decompress_matrix_partition.cuh @@ -184,9 +184,9 @@ template void decompress_matrix_partition_to_edgelist( raft::handle_t const& handle, matrix_partition_device_view_t const matrix_partition, - vertex_t* edgelist_majors /* [INOUT] */, - vertex_t* edgelist_minors /* [INOUT] */, - std::optional edgelist_weights /* [INOUT] */, + vertex_t* edgelist_majors /* [OUT] */, + vertex_t* edgelist_minors /* [OUT] */, + std::optional edgelist_weights /* [OUT] */, std::optional> const& segment_offsets) { auto number_of_edges = matrix_partition.get_number_of_edges(); From 59d16a3ebdae73c118f812db9ea96a4757f35e8f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 01:58:36 -0800 Subject: [PATCH 37/60] add is_first_in_run_pair_t to graph_utils.cuh --- cpp/include/cugraph/detail/graph_utils.cuh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/include/cugraph/detail/graph_utils.cuh b/cpp/include/cugraph/detail/graph_utils.cuh index 254744d11d9..a5006c711fe 100644 --- a/cpp/include/cugraph/detail/graph_utils.cuh +++ b/cpp/include/cugraph/detail/graph_utils.cuh @@ -86,5 +86,14 @@ struct is_first_in_run_t { } }; +template +struct is_first_in_run_pair_t { + PairIterator pair_first{}; + __device__ bool operator()(size_t i) const + { + return (i == 0) || (*(pair_first + (i - 1)) != *(pair_first + i)); + } +}; + } // namespace detail } // namespace cugraph From b7d9570fd88f8284cecc8c5b7defe13cd35cc6b4 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 02:00:34 -0800 Subject: [PATCH 38/60] add R-mat symmetric cases to coarsen_graph tests --- cpp/tests/structure/coarsen_graph_test.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/tests/structure/coarsen_graph_test.cpp b/cpp/tests/structure/coarsen_graph_test.cpp index dedcb2a718d..dc9298813be 100644 --- a/cpp/tests/structure/coarsen_graph_test.cpp +++ b/cpp/tests/structure/coarsen_graph_test.cpp @@ -433,7 +433,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Combine( // enable correctness checks ::testing::Values(CoarsenGraph_Usecase{0.2, false}, CoarsenGraph_Usecase{0.2, true}), - ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false), + cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true, false)))); INSTANTIATE_TEST_SUITE_P( 
file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with @@ -457,6 +458,7 @@ INSTANTIATE_TEST_SUITE_P( // disable correctness checks for large graphs ::testing::Values(CoarsenGraph_Usecase{0.2, false, false}, CoarsenGraph_Usecase{0.2, true, false}), - ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false), + cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false)))); CUGRAPH_TEST_PROGRAM_MAIN() From 0a0c6c1fbc03b2da6cbf4056069419d99903ddae Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 02:05:57 -0800 Subject: [PATCH 39/60] if the input graph is symmetric, update coarsen_graph to work on the lower triangular part only and symmetrize to avoid (slightly) asymmetric edge weights due to the limited floating point precision --- cpp/src/structure/coarsen_graph_impl.cuh | 486 ++++++++++++++++++----- 1 file changed, 376 insertions(+), 110 deletions(-) diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index ec01135f7ae..0611d642cb3 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -45,46 +45,66 @@ namespace cugraph { namespace { +template +struct is_not_lower_triangular_t { + __device__ bool operator()(EdgeTupleType e) const + { + return thrust::get<0>(e) < thrust::get<1>(e); + } +}; + +template +struct is_not_self_loop_t { + __device__ bool operator()(EdgeTupleType e) const + { + return thrust::get<0>(e) != thrust::get<1>(e); + } +}; + template edge_t groupby_e_and_coarsen_edgelist(vertex_t* edgelist_majors /* [INOUT] */, vertex_t* edgelist_minors /* [INOUT] */, std::optional edgelist_weights /* [INOUT] */, edge_t number_of_edges, - cudaStream_t stream) + rmm::cuda_stream_view stream_view) { auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(edgelist_majors, edgelist_minors)); if (edgelist_weights) { thrust::sort_by_key( - rmm::exec_policy(stream), pair_first, pair_first + number_of_edges, *edgelist_weights); - - rmm::device_uvector tmp_edgelist_majors(number_of_edges, stream); - rmm::device_uvector tmp_edgelist_minors(tmp_edgelist_majors.size(), stream); - rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_majors.size(), stream); - auto it = thrust::reduce_by_key(rmm::exec_policy(stream), - pair_first, - pair_first + number_of_edges, - (*edgelist_weights), - thrust::make_zip_iterator(thrust::make_tuple( - tmp_edgelist_majors.begin(), tmp_edgelist_minors.begin())), - tmp_edgelist_weights.begin()); - auto ret = - static_cast(thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it))); + rmm::exec_policy(stream_view), pair_first, pair_first + number_of_edges, *edgelist_weights); + + auto num_uniques = + thrust::count_if(rmm::exec_policy(stream_view), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(static_cast(number_of_edges)), + detail::is_first_in_run_pair_t{pair_first}); + + rmm::device_uvector tmp_edgelist_majors(num_uniques, stream_view); + rmm::device_uvector tmp_edgelist_minors(tmp_edgelist_majors.size(), stream_view); + rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_majors.size(), stream_view); + thrust::reduce_by_key(rmm::exec_policy(stream_view), + pair_first, + pair_first + number_of_edges, + (*edgelist_weights), + thrust::make_zip_iterator(thrust::make_tuple( + tmp_edgelist_majors.begin(), tmp_edgelist_minors.begin())),
tmp_edgelist_weights.begin()); auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( tmp_edgelist_majors.begin(), tmp_edgelist_minors.begin(), tmp_edgelist_weights.begin())); - thrust::copy(rmm::exec_policy(stream), + thrust::copy(rmm::exec_policy(stream_view), edge_first, - edge_first + ret, + edge_first + num_uniques, thrust::make_zip_iterator( thrust::make_tuple(edgelist_majors, edgelist_minors, *edgelist_weights))); - return ret; + return num_uniques; } else { - thrust::sort(rmm::exec_policy(stream), pair_first, pair_first + number_of_edges); + thrust::sort(rmm::exec_policy(stream_view), pair_first, pair_first + number_of_edges); return static_cast(thrust::distance( pair_first, - thrust::unique(rmm::exec_policy(stream), pair_first, pair_first + number_of_edges))); + thrust::unique(rmm::exec_policy(stream_view), pair_first, pair_first + number_of_edges))); } } @@ -96,12 +116,24 @@ template , rmm::device_uvector, std::optional>> -decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( - raft::handle_t const& handle, - matrix_partition_device_view_t const matrix_partition, - vertex_t const* major_label_first, - AdjMatrixMinorLabelInputWrapper const minor_label_input, - std::optional> const& segment_offsets) +decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist(raft::handle_t const& + handle, + matrix_partition_device_view_t< + vertex_t, + edge_t, + weight_t, + multi_gpu> const + matrix_partition, + vertex_t const* + major_label_first, + AdjMatrixMinorLabelInputWrapper const + minor_label_input, + std::optional< + std::vector< + vertex_t>> const& + segment_offsets, + bool + lower_triangular_only) { static_assert(std::is_same_v); @@ -138,6 +170,41 @@ decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( minor_label_input.get(thrust::get<1>(val) - minor_first)); }); + if (lower_triangular_only) { + if (edgelist_weights) { + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + edgelist_majors.begin(), edgelist_minors.begin(), (*edgelist_weights).begin())); + edgelist_majors.resize( + thrust::distance( + edge_first, + thrust::remove_if( + handle.get_thrust_policy(), + edge_first, + edge_first + edgelist_majors.size(), + is_not_lower_triangular_t>{})), + handle.get_stream()); + edgelist_majors.shrink_to_fit(handle.get_stream()); + edgelist_minors.resize(edgelist_majors.size(), handle.get_stream()); + edgelist_minors.shrink_to_fit(handle.get_stream()); + (*edgelist_weights).resize(edgelist_majors.size(), handle.get_stream()); + (*edgelist_weights).shrink_to_fit(handle.get_stream()); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_majors.begin(), edgelist_minors.begin())); + edgelist_majors.resize( + thrust::distance( + edge_first, + thrust::remove_if(handle.get_thrust_policy(), + edge_first, + edge_first + edgelist_majors.size(), + is_not_lower_triangular_t>{})), + handle.get_stream()); + edgelist_majors.shrink_to_fit(handle.get_stream()); + edgelist_minors.resize(edgelist_majors.size(), handle.get_stream()); + edgelist_minors.shrink_to_fit(handle.get_stream()); + } + } + auto number_of_edges = groupby_e_and_coarsen_edgelist( edgelist_majors.data(), edgelist_minors.data(), @@ -191,7 +258,11 @@ coarsen_graph( // currently, nothing to do } - // 1. construct coarsened edge list + // 1. 
construct coarsened edge lists from each local partition (if the input graph is symmetric, + // start with only the lower triangular edges in the original graph, this is to prevent edge + // weights in the coarsened graph becoming asymmetric due to limited floating point resolution) + + bool lower_triangular_only = graph_view.is_symmetric(); std::conditional_t< store_transposed, @@ -216,17 +287,6 @@ coarsen_graph( if (coarsened_edgelist_weights) { (*coarsened_edgelist_weights).reserve(coarsened_edgelist_majors.size()); } - for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - coarsened_edgelist_majors.emplace_back(0, handle.get_stream()); - coarsened_edgelist_minors.emplace_back(0, handle.get_stream()); - if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights).emplace_back(0, handle.get_stream()); - } - } - // FIXME: we may compare performance/memory footprint with the hash_based approach especially when - // cuco::dynamic_map becomes available (so we don't need to preallocate memory assuming the worst - // case). We may be able to limit the memory requirement close to the final coarsened edgelist - // with the hash based approach. for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { // 1-1. locally construct coarsened edge list @@ -248,7 +308,8 @@ coarsen_graph( graph_view.get_matrix_partition_view(i)), major_labels.data(), adj_matrix_minor_labels.device_view(), - graph_view.get_local_adj_matrix_partition_segment_offsets(i)); + graph_view.get_local_adj_matrix_partition_segment_offsets(i), + lower_triangular_only); // 1-2. globally shuffle @@ -258,91 +319,236 @@ coarsen_graph( std::move(edgelist_minors), std::move(edgelist_weights)); - // 1-3. append data to local adjacency matrix partitions + // 1-3. groupby and coarsen again - // FIXME: we can skip this if groupby_gpuid_and_shuffle_values is updated to return sorted edge - // list based on the final matrix partition (maybe add - // groupby_adj_matrix_partition_and_shuffle_values). + auto coarsened_size = groupby_e_and_coarsen_edgelist( + edgelist_majors.data(), + edgelist_minors.data(), + edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, + edgelist_majors.size(), + handle.get_stream()); + edgelist_majors.resize(coarsened_size, handle.get_stream()); + edgelist_majors.shrink_to_fit(handle.get_stream()); + edgelist_minors.resize(edgelist_majors.size(), handle.get_stream()); + edgelist_minors.shrink_to_fit(handle.get_stream()); + if (edgelist_weights) { + (*edgelist_weights).resize(edgelist_majors.size(), handle.get_stream()); + (*edgelist_weights).shrink_to_fit(handle.get_stream()); + } - auto counts = cugraph::detail::groupby_and_count_edgelist_by_local_partition_id( - handle, edgelist_majors, edgelist_minors, edgelist_weights); + coarsened_edgelist_majors.push_back(std::move(edgelist_majors)); + coarsened_edgelist_minors.push_back(std::move(edgelist_minors)); + if (edgelist_weights) { (*coarsened_edgelist_weights).push_back(std::move(*edgelist_weights)); } - std::vector h_counts(counts.size()); - raft::update_host(h_counts.data(), counts.data(), counts.size(), handle.get_stream()); - handle.sync_stream(); + // 2.
concatenate and groupby and coarsen again (and if the input graph is symmetric, create a + // copy excluding self loops and globally shuffle) - std::vector h_displacements(h_counts.size(), size_t{0}); - std::partial_sum(h_counts.begin(), h_counts.end() - 1, h_displacements.begin() + 1); + edge_t tot_count{0}; + for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { + tot_count += coarsened_edgelist_majors[i].size(); + } - for (int j = 0; j < col_comm_size; ++j) { - auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( - edgelist_majors.begin() + h_displacements[j], - edgelist_minors.begin() + h_displacements[j], - edgelist_weights ? std::optional{(*edgelist_weights).data() + h_displacements[j]} - : std::nullopt, - h_counts[j], - handle.get_stream()); + rmm::device_uvector concatenated_edgelist_majors(tot_count, handle.get_stream()); + size_t major_offset{0}; + for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + coarsened_edgelist_majors[i].begin(), + coarsened_edgelist_majors[i].end(), + concatenated_edgelist_majors.begin() + major_offset); + major_offset += coarsened_edgelist_majors[i].size(); + coarsened_edgelist_majors[i].resize(0, handle.get_stream()); + coarsened_edgelist_majors[i].shrink_to_fit(handle.get_stream()); + } - auto cur_size = coarsened_edgelist_majors[j].size(); - // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we - // can reserve address space to avoid expensive reallocation. - // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management - coarsened_edgelist_majors[j].resize(cur_size + number_of_partition_edges, - handle.get_stream()); - coarsened_edgelist_minors[j].resize(coarsened_edgelist_majors[j].size(), handle.get_stream()); + rmm::device_uvector concatenated_edgelist_minors(tot_count, handle.get_stream()); + size_t minor_offset{0}; + for (size_t i = 0; i < coarsened_edgelist_minors.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + coarsened_edgelist_minors[i].begin(), + coarsened_edgelist_minors[i].end(), + concatenated_edgelist_minors.begin() + minor_offset); + minor_offset += coarsened_edgelist_minors[i].size(); + coarsened_edgelist_minors[i].resize(0, handle.get_stream()); + coarsened_edgelist_minors[i].shrink_to_fit(handle.get_stream()); + } - if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights)[j].resize(coarsened_edgelist_majors[j].size(), - handle.get_stream()); + std::optional> concatenated_edgelist_weights{std::nullopt}; + if (coarsened_edgelist_weights) { + concatenated_edgelist_weights = rmm::device_uvector(tot_count, handle.get_stream()); + size_t weight_offset{0}; + for (size_t i = 0; i < (*coarsened_edgelist_weights).size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + (*coarsened_edgelist_weights)[i].begin(), + (*coarsened_edgelist_weights)[i].end(), + (*concatenated_edgelist_weights).begin() + weight_offset); + weight_offset += (*coarsened_edgelist_weights)[i].size(); + (*coarsened_edgelist_weights)[i].resize(0, handle.get_stream()); + (*coarsened_edgelist_weights)[i].shrink_to_fit(handle.get_stream()); + } + } - auto input_edge_first = - thrust::make_zip_iterator(thrust::make_tuple( - edgelist_majors.begin(), edgelist_minors.begin(), (*edgelist_weights).begin())) + - h_displacements[j]; - auto output_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors[j].begin(), - coarsened_edgelist_minors[j].begin(), - (*coarsened_edgelist_weights)[j].begin())) 
+ - cur_size; - thrust::copy(handle.get_thrust_policy(), - input_edge_first, - input_edge_first + number_of_partition_edges, - output_edge_first); - } else { - auto input_edge_first = thrust::make_zip_iterator(thrust::make_tuple( - edgelist_majors.begin(), edgelist_minors.begin())) + - h_displacements[j]; - auto output_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors[j].begin(), - coarsened_edgelist_minors[j].begin())) + - cur_size; - thrust::copy(handle.get_thrust_policy(), - input_edge_first, - input_edge_first + number_of_partition_edges, - output_edge_first); - } + auto concatenated_and_coarsened_size = groupby_e_and_coarsen_edgelist( + concatenated_edgelist_majors.data(), + concatenated_edgelist_minors.data(), + concatenated_edgelist_weights + ? std::optional{(*concatenated_edgelist_weights).data()} + : std::nullopt, + concatenated_edgelist_majors.size(), + handle.get_stream()); + concatenated_edgelist_majors.resize(concatenated_and_coarsened_size, handle.get_stream()); + concatenated_edgelist_majors.shrink_to_fit(handle.get_stream()); + concatenated_edgelist_minors.resize(concatenated_edgelist_majors.size(), handle.get_stream()); + concatenated_edgelist_minors.shrink_to_fit(handle.get_stream()); + if (concatenated_edgelist_weights) { + (*concatenated_edgelist_weights) + .resize(concatenated_edgelist_majors.size(), handle.get_stream()); + (*concatenated_edgelist_weights).shrink_to_fit(handle.get_stream()); + } + + std::optional> reversed_edgelist_majors{std::nullopt}; + std::optional> reversed_edgelist_minors{std::nullopt}; + std::optional> reversed_edgelist_weights{std::nullopt}; + if (lower_triangular_only) { + if (concatenated_edgelist_weights) { + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(concatenated_edgelist_majors.begin(), + concatenated_edgelist_minors.begin(), + (*concatenated_edgelist_weights).begin())); + auto last = + thrust::partition(handle.get_thrust_policy(), + edge_first, + edge_first + concatenated_edgelist_majors.size(), + is_not_self_loop_t>{}); + reversed_edgelist_majors = + rmm::device_uvector(thrust::distance(edge_first, last), handle.get_stream()); + reversed_edgelist_minors = + rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + reversed_edgelist_weights = + rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), + edge_first, + edge_first + (*reversed_edgelist_majors).size(), + thrust::make_zip_iterator(thrust::make_tuple((*reversed_edgelist_minors).begin(), + (*reversed_edgelist_majors).begin(), + (*reversed_edgelist_weights).begin()))); + } else { + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + concatenated_edgelist_majors.begin(), concatenated_edgelist_minors.begin())); + auto last = thrust::partition(handle.get_thrust_policy(), + edge_first, + edge_first + concatenated_edgelist_majors.size(), + is_not_self_loop_t>{}); + reversed_edgelist_majors = + rmm::device_uvector(thrust::distance(edge_first, last), handle.get_stream()); + reversed_edgelist_minors = + rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edge_first, + edge_first + (*reversed_edgelist_majors).size(), + thrust::make_zip_iterator(thrust::make_tuple( + (*reversed_edgelist_minors).begin(), (*reversed_edgelist_majors).begin()))); } + + std::tie(*reversed_edgelist_majors, *reversed_edgelist_minors, reversed_edgelist_weights) = + 
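// [Editor's note] The shuffle_edgelist_by_gpu_id call that follows is what makes the
// lower-triangular trick work in multi-GPU: an edge's owning GPU is a function of its
// (major, minor) endpoint pair, so the reversed (minor, major) copies created above generally
// land on a different GPU than the originals and must be redistributed before being split back
// into the local partitions.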
cugraph::detail::shuffle_edgelist_by_gpu_id(handle, + std::move(*reversed_edgelist_majors), + std::move(*reversed_edgelist_minors), + std::move(reversed_edgelist_weights)); + } + + // 3. split concatenated edge list to local partitions + + auto concatenated_counts = + groupby_and_count_edgelist_by_local_partition_id(handle, + concatenated_edgelist_majors, + concatenated_edgelist_minors, + concatenated_edgelist_weights); + + std::vector h_concatenated_counts(concatenated_counts.size()); + raft::update_host(h_concatenated_counts.data(), + concatenated_counts.data(), + concatenated_counts.size(), + handle.get_stream()); + + std::optional> h_reversed_counts{std::nullopt}; + if (reversed_edgelist_majors) { + auto reversed_counts = groupby_and_count_edgelist_by_local_partition_id( + handle, *reversed_edgelist_majors, *reversed_edgelist_minors, reversed_edgelist_weights); + + h_reversed_counts = std::vector(reversed_counts.size()); + raft::update_host((*h_reversed_counts).data(), + reversed_counts.data(), + reversed_counts.size(), + handle.get_stream()); + } + + handle.sync_stream(); + + std::vector h_concatenated_displacements(h_concatenated_counts.size(), size_t{0}); + std::partial_sum(h_concatenated_counts.begin(), + h_concatenated_counts.end() - 1, + h_concatenated_displacements.begin() + 1); + + std::optional> h_reversed_displacements{std::nullopt}; + if (h_reversed_counts) { + h_reversed_displacements = std::vector((*h_reversed_counts).size(), size_t{0}); + std::partial_sum((*h_reversed_counts).begin(), + (*h_reversed_counts).end() - 1, + (*h_reversed_displacements).begin() + 1); } for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { - auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( - coarsened_edgelist_majors[i].data(), - coarsened_edgelist_minors[i].data(), - coarsened_edgelist_weights ? std::optional{(*coarsened_edgelist_weights)[i].data()} - : std::nullopt, - static_cast(coarsened_edgelist_majors[i].size()), + coarsened_edgelist_majors[i].resize( + h_concatenated_counts[i] + (h_reversed_counts ? 
(*h_reversed_counts)[i] : size_t{0}), handle.get_stream()); - coarsened_edgelist_majors[i].resize(number_of_partition_edges, handle.get_stream()); - coarsened_edgelist_majors[i].shrink_to_fit(handle.get_stream()); - coarsened_edgelist_minors[i].resize(number_of_partition_edges, handle.get_stream()); - coarsened_edgelist_minors[i].shrink_to_fit(handle.get_stream()); + coarsened_edgelist_minors[i].resize(coarsened_edgelist_majors[i].size(), handle.get_stream()); if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights)[i].resize(number_of_partition_edges, handle.get_stream()); - (*coarsened_edgelist_weights)[i].shrink_to_fit(handle.get_stream()); + (*coarsened_edgelist_weights)[i].resize(coarsened_edgelist_majors[i].size(), + handle.get_stream()); + } + + thrust::copy(handle.get_thrust_policy(), + concatenated_edgelist_majors.begin() + h_concatenated_displacements[i], + concatenated_edgelist_majors.begin() + + (h_concatenated_displacements[i] + h_concatenated_counts[i]), + coarsened_edgelist_majors[i].begin()); + thrust::copy(handle.get_thrust_policy(), + concatenated_edgelist_minors.begin() + h_concatenated_displacements[i], + concatenated_edgelist_minors.begin() + + (h_concatenated_displacements[i] + h_concatenated_counts[i]), + coarsened_edgelist_minors[i].begin()); + if (coarsened_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*concatenated_edgelist_weights).begin() + h_concatenated_displacements[i], + (*concatenated_edgelist_weights).begin() + + (h_concatenated_displacements[i] + h_concatenated_counts[i]), + (*coarsened_edgelist_weights)[i].begin()); + } + + if (reversed_edgelist_majors) { + thrust::copy(handle.get_thrust_policy(), + (*reversed_edgelist_majors).begin() + (*h_reversed_displacements)[i], + (*reversed_edgelist_majors).begin() + + ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), + coarsened_edgelist_majors[i].begin() + h_concatenated_counts[i]); + thrust::copy(handle.get_thrust_policy(), + (*reversed_edgelist_minors).begin() + (*h_reversed_displacements)[i], + (*reversed_edgelist_minors).begin() + + ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), + coarsened_edgelist_minors[i].begin() + h_concatenated_counts[i]); + if (coarsened_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*reversed_edgelist_weights).begin() + (*h_reversed_displacements)[i], + (*reversed_edgelist_weights).begin() + + ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), + (*coarsened_edgelist_weights)[i].begin() + h_concatenated_counts[i]); + } } } - // 3. find unique labels for this GPU + // 4. find unique labels for this GPU rmm::device_uvector unique_labels(graph_view.get_number_of_local_vertices(), handle.get_stream()); @@ -364,7 +570,7 @@ coarsen_graph( thrust::unique(handle.get_thrust_policy(), unique_labels.begin(), unique_labels.end())), handle.get_stream()); - // 4. renumber + // 5. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); renumber_meta_t meta{}; @@ -387,7 +593,7 @@ coarsen_graph( do_expensive_check); } - // 5. build a graph + // 6. 
build a graph std::vector> edgelists{}; edgelists.resize(graph_view.get_number_of_local_adj_matrix_partitions()); @@ -438,6 +644,8 @@ coarsen_graph( // currently, nothing to do } + bool lower_triangular_only = graph_view.is_symmetric(); + auto [coarsened_edgelist_majors, coarsened_edgelist_minors, coarsened_edgelist_weights] = decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( handle, @@ -445,7 +653,65 @@ coarsen_graph( graph_view.get_matrix_partition_view()), labels, detail::minor_properties_device_view_t(labels), - graph_view.get_local_adj_matrix_partition_segment_offsets(0)); + graph_view.get_local_adj_matrix_partition_segment_offsets(0), + lower_triangular_only); + + if (lower_triangular_only) { + if (coarsened_edgelist_weights) { + std::cout << "lower_triangular weighted" << std::endl; + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors.begin(), + coarsened_edgelist_minors.begin(), + (*coarsened_edgelist_weights).begin())); + auto last = + thrust::partition(handle.get_thrust_policy(), + edge_first, + edge_first + coarsened_edgelist_majors.size(), + is_not_self_loop_t>{}); + + auto cur_size = coarsened_edgelist_majors.size(); + auto reversed_size = static_cast(thrust::distance(edge_first, last)); + + coarsened_edgelist_majors.resize(cur_size + reversed_size, handle.get_stream()); + coarsened_edgelist_minors.resize(coarsened_edgelist_majors.size(), handle.get_stream()); + (*coarsened_edgelist_weights).resize(coarsened_edgelist_majors.size(), handle.get_stream()); + + edge_first = + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors.begin(), + coarsened_edgelist_minors.begin(), + (*coarsened_edgelist_weights).begin())); + thrust::copy( + handle.get_thrust_policy(), + edge_first, + edge_first + reversed_size, + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_minors.begin(), + coarsened_edgelist_majors.begin(), + (*coarsened_edgelist_weights).begin())) + + cur_size); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(coarsened_edgelist_majors.begin(), coarsened_edgelist_minors.begin())); + auto last = thrust::partition(handle.get_thrust_policy(), + edge_first, + edge_first + coarsened_edgelist_majors.size(), + is_not_self_loop_t>{}); + + auto cur_size = coarsened_edgelist_majors.size(); + auto reversed_size = static_cast(thrust::distance(edge_first, last)); + + coarsened_edgelist_majors.resize(cur_size + reversed_size, handle.get_stream()); + coarsened_edgelist_minors.resize(coarsened_edgelist_majors.size(), handle.get_stream()); + + edge_first = thrust::make_zip_iterator( + thrust::make_tuple(coarsened_edgelist_majors.begin(), coarsened_edgelist_minors.begin())); + thrust::copy(handle.get_thrust_policy(), + edge_first, + edge_first + reversed_size, + thrust::make_zip_iterator(thrust::make_tuple( + coarsened_edgelist_minors.begin(), coarsened_edgelist_majors.begin())) + + cur_size); + } + } rmm::device_uvector unique_labels(graph_view.get_number_of_vertices(), handle.get_stream()); From 86f1d110d92ddbf12d0b6319ab72f76b87b13ba1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 02:19:02 -0800 Subject: [PATCH 40/60] clang-format --- cpp/src/structure/coarsen_graph_impl.cuh | 25 +++++++----------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 0611d642cb3..02e8529d108 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ 
b/cpp/src/structure/coarsen_graph_impl.cuh @@ -116,24 +116,13 @@ template , rmm::device_uvector, std::optional>> -decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist(raft::handle_t const& - handle, - matrix_partition_device_view_t< - vertex_t, - edge_t, - weight_t, - multi_gpu> const - matrix_partition, - vertex_t const* - major_label_first, - AdjMatrixMinorLabelInputWrapper const - minor_label_input, - std::optional< - std::vector< - vertex_t>> const& - segment_offsets, - bool - lower_triangular_only) +decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( + raft::handle_t const& handle, + matrix_partition_device_view_t const matrix_partition, + vertex_t const* major_label_first, + AdjMatrixMinorLabelInputWrapper const minor_label_input, + std::optional> const& segment_offsets, + bool lower_triangular_only) { static_assert(std::is_same_v); From 75d6f81f6acd83e6a0b195f4a9c48d54fe45cad9 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 10:05:36 -0800 Subject: [PATCH 41/60] fix comments --- cpp/src/structure/coarsen_graph_impl.cuh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 02e8529d108..39ebaab8d31 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -248,8 +248,8 @@ coarsen_graph( } // 1. construct coarsened edge lists from each local partition (if the input graph is symmetric, - // start with only the lower triangular edges in the original graph, this is to prevent edge - // weights in the coarsened graph becoming asymmetric due to limited floating point resolution) + // start with only the lower triangular edges after relabeling, this is to prevent edge weights in + // the coarsened graph becoming asymmetric due to limited floating point resolution) bool lower_triangular_only = graph_view.is_symmetric(); @@ -647,7 +647,6 @@ coarsen_graph( if (coarsened_edgelist_weights) { - std::cout << "lower_triangular weighted" << std::endl; auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_majors.begin(), coarsened_edgelist_minors.begin(), From 29e7b84484fd48e029ff3e400ef5d965aba7f834 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 10:29:24 -0800 Subject: [PATCH 42/60] fix to make merge with enh_mg_louvain PR easier --- cpp/include/cugraph/detail/graph_utils.cuh | 7 ++++--- cpp/src/structure/coarsen_graph_impl.cuh | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/include/cugraph/detail/graph_utils.cuh b/cpp/include/cugraph/detail/graph_utils.cuh index a5006c711fe..8cd1eced921 100644 --- a/cpp/include/cugraph/detail/graph_utils.cuh +++ b/cpp/include/cugraph/detail/graph_utils.cuh @@ -86,12 +86,13 @@ struct is_first_in_run_t { } }; -template +template struct is_first_in_run_pair_t { - PairIterator pair_first{}; + vertex_t const* vertices0{nullptr}; + vertex_t const* vertices1{nullptr}; __device__ bool operator()(size_t i) const { - return (i == 0) || (*(pair_first + (i - 1)) != *(pair_first + i)); + return (i == 0) || ((vertices0[i - 1] != vertices0[i]) || (vertices1[i - 1] != vertices1[i])); } }; } // namespace detail } // namespace cugraph diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 39ebaab8d31..93eb35b437c 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -78,7 +78,7 @@ edge_t
groupby_e_and_coarsen_edgelist(vertex_t* edgelist_majors /* [INOUT] */, thrust::count_if(rmm::exec_policy(stream_view), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(static_cast(number_of_edges)), - detail::is_first_in_run_pair_t{pair_first}); + detail::is_first_in_run_pair_t{edgelist_majors, edgelist_minors}); rmm::device_uvector tmp_edgelist_majors(num_uniques, stream_view); rmm::device_uvector tmp_edgelist_minors(tmp_edgelist_majors.size(), stream_view); From 57a084332cd58e57cf6c6550d58a02667989163f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 10:31:05 -0800 Subject: [PATCH 43/60] copyright --- cpp/include/cugraph/detail/decompress_matrix_partition.cuh | 2 +- cpp/src/structure/coarsen_graph_impl.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cugraph/detail/decompress_matrix_partition.cuh b/cpp/include/cugraph/detail/decompress_matrix_partition.cuh index b419d4bdbb3..ac8864d7e8f 100644 --- a/cpp/include/cugraph/detail/decompress_matrix_partition.cuh +++ b/cpp/include/cugraph/detail/decompress_matrix_partition.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 93eb35b437c..53b2193ac46 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
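[Editor's note] A short sketch, not part of the patch series, of the exact-sizing idiom that PATCH 39 introduced and PATCH 42 just reshaped: with an edge list already sorted by (major, minor), counting the positions where a new run of keys begins gives the exact number of distinct edges, so the reduce_by_key outputs in groupby_e_and_coarsen_edgelist can be allocated without overshoot. Assuming rmm::device_uvector<vertex_t> edgelist_majors and edgelist_minors of equal length (names mirror the hunks above; nothing beyond what they show is real API):

  auto num_uniques = thrust::count_if(
    handle.get_thrust_policy(),
    thrust::make_counting_iterator(size_t{0}),
    thrust::make_counting_iterator(edgelist_majors.size()),
    cugraph::detail::is_first_in_run_pair_t<vertex_t>{edgelist_majors.data(),
                                                      edgelist_minors.data()});
  // allocate the groupby output buffers at exactly num_uniques elements
  rmm::device_uvector<vertex_t> tmp_majors(num_uniques, handle.get_stream());

The switch from a single PairIterator to two raw vertex_t const* members keeps the functor's shape the same across branches, which is presumably what the commit message means by making the merge with the enh_mg_louvain PR easier.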
From 14c47d4265c87644d28ae37b1a638e838a5be3c2 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 17:39:22 -0800 Subject: [PATCH 44/60] remove one more memory footprint bottleneck in graph object creation --- cpp/src/structure/renumber_edgelist_impl.cuh | 100 ++++++++++--------- 1 file changed, 55 insertions(+), 45 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index aeb7682f440..2a3ed5df5df 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -214,6 +214,12 @@ std::tuple, std::vector> compute_renumbe auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); + auto constexpr num_chunks = size_t{ + 2}; // tuning parameter, this trades off # binary searches (up to num_chunks times more + // binary searches can be necessary if num_unique_majors << edgelist_edge_counts[i]) and + // temporary buffer requirement (cut by num_chunks times), currently set to 2 to avoid + // peak memory usage happening in this part (especially when col_comm_size is small) + assert(edgelist_majors.size() == col_comm_size); auto edge_partition_major_sizes = @@ -226,24 +232,22 @@ std::tuple, std::vector> compute_renumbe raft::comms::op_t::SUM, handle.get_stream()); // memory footprint vs parallelism trade-off - // peak memory requirement per loop is - // min( - // (E / (comm_size * col_comm_size)) * sizeof(vertex_t) * 2, - // (E / (comm_size * col_comm_size)) * sizeof(vertex_t) + - // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)), - // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)) * 2 - // ) - // and limit temporary memory requirement to (E / comm_size) * sizeof(vertex_t) * 2 + // peak memory requirement per loop is approximately + // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)) + + // (E / (comm_size * col_comm_size)) / num_chunks * sizeof(vertex_t) * 2 + + // std::min(V/P, (E / (comm_size * col_comm_size)) / num_chunks) * (sizeof(vertex_t) + + // sizeof(edge_t)) + // and limit temporary memory requirement to (E / comm_size) * sizeof(vertex_t) auto avg_vertex_degree = thrust::get<0>(vertex_edge_counts) > 0 ? static_cast(thrust::get<1>(vertex_edge_counts)) / static_cast(thrust::get<0>(vertex_edge_counts)) : double{0.0}; - auto num_streams = - std::min(static_cast(avg_vertex_degree * - (static_cast(sizeof(vertex_t)) / - static_cast(sizeof(vertex_t) + sizeof(edge_t)))), - static_cast( - std::min(static_cast(col_comm_size), handle.get_stream_pool_size()))); + auto num_streams = static_cast( + (avg_vertex_degree * sizeof(vertex_t)) / + (static_cast(sizeof(vertex_t) + sizeof(edge_t)) + + (((avg_vertex_degree / col_comm_size) / num_chunks) * sizeof(vertex_t) * 2) + + (std::min(1.0, ((avg_vertex_degree / col_comm_size) / num_chunks)) * + (sizeof(vertex_t) + sizeof(edge_t))))); if (num_streams >= 2) { stream_pool_indices = std::vector(num_streams); std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); @@ -256,28 +260,6 @@ std::tuple, std::vector> compute_renumbe ?
handle.get_stream_from_stream_pool(i % (*stream_pool_indices).size()) : handle.get_stream(); - rmm::device_uvector tmp_majors(edgelist_edge_counts[i], loop_stream); - thrust::copy(rmm::exec_policy(loop_stream), - edgelist_majors[i], - edgelist_majors[i] + edgelist_edge_counts[i], - tmp_majors.begin()); - thrust::sort(rmm::exec_policy(loop_stream), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = thrust::count_if(rmm::exec_policy(loop_stream), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - is_first_in_run_t{tmp_majors.data()}); - rmm::device_uvector tmp_keys(num_unique_majors, loop_stream); - rmm::device_uvector tmp_values(num_unique_majors, loop_stream); - thrust::reduce_by_key(rmm::exec_policy(loop_stream), - tmp_majors.begin(), - tmp_majors.end(), - thrust::make_constant_iterator(edge_t{1}), - tmp_keys.begin(), - tmp_values.begin()); - - tmp_majors.resize(0, loop_stream); - tmp_majors.shrink_to_fit(loop_stream); - rmm::device_uvector sorted_majors(edge_partition_major_sizes[i], loop_stream); device_bcast(col_comm, sorted_local_vertices.data(), @@ -292,15 +274,43 @@ std::tuple, std::vector> compute_renumbe sorted_major_degrees.end(), edge_t{0}); - auto kv_pair_first = - thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each( - rmm::exec_policy(loop_stream), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_set_degree_t{sorted_majors.data(), - static_cast(sorted_majors.size()), - sorted_major_degrees.data()}); + rmm::device_uvector tmp_majors( + (static_cast(edgelist_edge_counts[i]) + (num_chunks - 1)) / num_chunks, + handle.get_stream()); + size_t offset{0}; + for (size_t j = 0; j < num_chunks; ++j) { + size_t this_chunk_size = + std::min(tmp_majors.size(), static_cast(edgelist_edge_counts[i]) - offset); + thrust::copy(rmm::exec_policy(loop_stream), + edgelist_majors[i] + offset, + edgelist_majors[i] + offset + this_chunk_size, + tmp_majors.begin()); + thrust::sort( + rmm::exec_policy(loop_stream), tmp_majors.begin(), tmp_majors.begin() + this_chunk_size); + auto num_unique_majors = thrust::count_if(rmm::exec_policy(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(this_chunk_size), + is_first_in_run_t{tmp_majors.data()}); + rmm::device_uvector tmp_keys(num_unique_majors, loop_stream); + rmm::device_uvector tmp_values(num_unique_majors, loop_stream); + thrust::reduce_by_key(rmm::exec_policy(loop_stream), + tmp_majors.begin(), + tmp_majors.begin() + this_chunk_size, + thrust::make_constant_iterator(edge_t{1}), + tmp_keys.begin(), + tmp_values.begin()); + + auto kv_pair_first = + thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); + thrust::for_each( + rmm::exec_policy(loop_stream), + kv_pair_first, + kv_pair_first + tmp_keys.size(), + search_and_set_degree_t{sorted_majors.data(), + static_cast(sorted_majors.size()), + sorted_major_degrees.data()}); + offset += this_chunk_size; + } device_reduce(col_comm, sorted_major_degrees.begin(), From 550576bda3d90b4c1f9dab01bd9943506d899c51 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 17 Feb 2022 17:45:09 -0800 Subject: [PATCH 45/60] add graph constructors taking edge list as R-value (so they can be destroyed as soon as they are no longer necessary to reduce peak memory usage) --- cpp/include/cugraph/graph.hpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp 
index cc711af663c..216552ec770 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -97,6 +97,13 @@ class graph_t meta, bool do_expensive_check = false); + graph_t(raft::handle_t const& handle, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + graph_meta_t meta, + bool do_expensive_check = false); + /** * @brief Symmetrize this graph. * @@ -264,6 +271,13 @@ class graph_t meta, bool do_expensive_check = false); + graph_t(raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + graph_meta_t meta, + bool do_expensive_check = false); + /** * @brief Symmetrize this graph. * From 09f30721041017ea0aa0b92ce5b4a4b905a0b337 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 18 Feb 2022 01:06:25 -0800 Subject: [PATCH 46/60] cosmetic fix --- cpp/include/cugraph/graph.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 216552ec770..f50a64665af 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -98,9 +98,9 @@ class graph_t>&& edgelist_srcs, - std::vector>&& edgelist_dsts, - std::optional>>&& edgelist_weights, + std::vector>&& edgelist_src_partitions, + std::vector>&& edgelist_dst_partitions, + std::optional>>&& edge_weight_partitions, graph_meta_t meta, bool do_expensive_check = false); From 2b9d37d07c175d730ef433bbd25b36976e80d101 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 18 Feb 2022 01:08:06 -0800 Subject: [PATCH 47/60] update coarsen_graph return type --- cpp/include/cugraph/graph_functions.hpp | 8 ++--- cpp/src/community/louvain.cuh | 5 +-- cpp/src/structure/coarsen_graph_mg.cu | 36 ++++++++-------------- cpp/src/structure/coarsen_graph_sg.cu | 36 ++++++++-------------- cpp/tests/structure/coarsen_graph_test.cpp | 2 +- 5 files changed, 32 insertions(+), 55 deletions(-) diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index c170ce65253..988dd152bd9 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -351,16 +351,16 @@ symmetrize_edgelist(raft::handle_t const& handle, * @param graph_view Graph view object of the input graph to be coarsened. * @param labels Vertex labels (assigned to this process in multi-GPU) to be used in coarsening. * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). - * @return std::tuple>, rmm::device_uvector> Tuple of the coarsened graph and labels mapped to the - * vertices (assigned to this process in multi-GPU) in the coarsened graph. + * @return std::tuple, + * rmm::device_uvector> Tuple of the coarsened graph and labels mapped to the vertices + * (assigned to this process in multi-GPU) in the coarsened graph. 
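 *
 * [editorial note, not part of this patch] With the return type change below, the coarsened
 * graph is returned by value rather than behind a std::unique_ptr; a caller can write
 * auto [coarse_graph, coarse_labels] = coarsen_graph(handle, graph_view, labels); and call
 * coarse_graph.view() directly instead of dereferencing a pointer first.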
*/ template -std::tuple>, +std::tuple, rmm::device_uvector> coarsen_graph( raft::handle_t const& handle, diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 094f3bc6546..16ccf872d0c 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -137,6 +137,7 @@ class Louvain { #endif handle_(handle), dendrogram_(std::make_unique>()), + current_graph_(handle), current_graph_view_(graph_view), cluster_keys_v_(0, handle.get_stream()), cluster_weights_v_(0, handle.get_stream()), @@ -559,7 +560,7 @@ class Louvain { std::tie(current_graph_, numbering_map) = coarsen_graph(handle_, current_graph_view_, dendrogram_->current_level_begin()); - current_graph_view_ = current_graph_->view(); + current_graph_view_ = current_graph_.view(); rmm::device_uvector numbering_indices(numbering_map.size(), handle_.get_stream()); thrust::sequence(handle_.get_thrust_policy(), @@ -589,7 +590,7 @@ class Louvain { // but as we shrink the graph we'll keep the // current graph here // - std::unique_ptr current_graph_{}; + graph_t current_graph_; graph_view_t current_graph_view_; rmm::device_uvector cluster_keys_v_; diff --git a/cpp/src/structure/coarsen_graph_mg.cu b/cpp/src/structure/coarsen_graph_mg.cu index 73ca9a82b06..95506878ef1 100644 --- a/cpp/src/structure/coarsen_graph_mg.cu +++ b/cpp/src/structure/coarsen_graph_mg.cu @@ -19,85 +19,73 @@ namespace cugraph { // MG instantiation -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool 
do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, diff --git a/cpp/src/structure/coarsen_graph_sg.cu b/cpp/src/structure/coarsen_graph_sg.cu index d32dd1c744d..6cc07420957 100644 --- a/cpp/src/structure/coarsen_graph_sg.cu +++ b/cpp/src/structure/coarsen_graph_sg.cu @@ -19,85 +19,73 @@ namespace cugraph { // SG instantiation -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, bool do_expensive_check); -template std::tuple>, - rmm::device_uvector> +template std::tuple, rmm::device_uvector> coarsen_graph(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t const* labels, diff --git a/cpp/tests/structure/coarsen_graph_test.cpp b/cpp/tests/structure/coarsen_graph_test.cpp index dc9298813be..dec7c994a69 100644 --- a/cpp/tests/structure/coarsen_graph_test.cpp +++ b/cpp/tests/structure/coarsen_graph_test.cpp @@ -324,7 +324,7 @@ class Tests_CoarsenGraph handle.get_stream()); } - auto 
coarse_graph_view = coarse_graph->view(); + auto coarse_graph_view = coarse_graph.view(); std::vector h_coarse_offsets(coarse_graph_view.get_number_of_vertices() + 1); std::vector h_coarse_indices(coarse_graph_view.get_number_of_edges()); From 35fc8d10b41375f17e8cc3fcf1641f47b8016f4f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 18 Feb 2022 01:11:04 -0800 Subject: [PATCH 48/60] refactor graph constructors and implement graph constructors taking edge list in R-values --- cpp/src/structure/graph_impl.cuh | 940 ++++++++++++++++++++----------- 1 file changed, 609 insertions(+), 331 deletions(-) diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index eff76df8a79..924679cc48a 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -268,6 +268,399 @@ bool check_no_parallel_edge(raft::handle_t const& handle, } } +template +std::enable_if_t check_graph_constructor_input_arguments( + raft::handle_t const& handle, + std::vector> const& edgelists, + graph_meta_t meta, + bool do_expensive_check) +{ + // cheap error checks + + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + CUGRAPH_EXPECTS(edgelists.size() == static_cast(col_comm_size), + "Invalid input argument: erroneous edgelists.size()."); + CUGRAPH_EXPECTS( + !(meta.segment_offsets).has_value() || + ((*(meta.segment_offsets)).size() == + (detail::num_sparse_segments_per_vertex_partition + 1)) || + ((*(meta.segment_offsets)).size() == (detail::num_sparse_segments_per_vertex_partition + 2)), + "Invalid input argument: (*(meta.segment_offsets)).size() returns an invalid value."); + + auto is_weighted = edgelists[0].p_edge_weights.has_value(); + + CUGRAPH_EXPECTS( + std::any_of(edgelists.begin(), + edgelists.end(), + [is_weighted](auto edgelist) { + return ((edgelist.number_of_edges > 0) && (edgelist.p_src_vertices == nullptr)) || + ((edgelist.number_of_edges > 0) && (edgelist.p_dst_vertices == nullptr)) || + (is_weighted && (edgelist.number_of_edges > 0) && + ((edgelist.p_edge_weights.has_value() == false) || + (*(edgelist.p_edge_weights) == nullptr))); + }) == false, + "Invalid input argument: edgelists[].p_src_vertices and edgelists[].p_dst_vertices should not " + "be nullptr if edgelists[].number_of_edges > 0 and edgelists[].p_edge_weights should be " + "neither std::nullopt nor nullptr if weighted and edgelists[].number_of_edges > 0."); + + // optional expensive checks + + if (do_expensive_check) { + edge_t number_of_local_edges{0}; + for (size_t i = 0; i < edgelists.size(); ++i) { + auto [major_first, major_last] = meta.partition.get_matrix_partition_major_range(i); + auto [minor_first, minor_last] = meta.partition.get_matrix_partition_minor_range(); + + number_of_local_edges += edgelists[i].number_of_edges; + + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + store_transposed ? edgelists[i].p_dst_vertices : edgelists[i].p_src_vertices, + store_transposed ? 
edgelists[i].p_src_vertices : edgelists[i].p_dst_vertices)); + // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved + CUGRAPH_EXPECTS(thrust::count_if(handle.get_thrust_policy(), + edge_first, + edge_first + edgelists[i].number_of_edges, + out_of_range_t{ + major_first, major_last, minor_first, minor_last}) == 0, + "Invalid input argument: edgelists[] have out-of-range values."); + } + auto number_of_local_edges_sum = host_scalar_allreduce( + comm, number_of_local_edges, raft::comms::op_t::SUM, handle.get_stream()); + CUGRAPH_EXPECTS(number_of_local_edges_sum == meta.number_of_edges, + "Invalid input argument: the sum of local edge counts does not match with " + "meta.number_of_edges."); + + CUGRAPH_EXPECTS( + meta.partition.get_vertex_partition_last(comm_size - 1) == meta.number_of_vertices, + "Invalid input argument: vertex partition should cover [0, meta.number_of_vertices)."); + + if (meta.properties.is_symmetric) { + CUGRAPH_EXPECTS( + (check_symmetric(handle, + edgelists)), + "Invalid input argument: meta.property.is_symmetric is true but the input edge list is not " + "symmetric."); + } + if (!meta.properties.is_multigraph) { + CUGRAPH_EXPECTS( + check_no_parallel_edge(handle, edgelists), + "Invalid input argument: meta.property.is_multigraph is false but the input edge list has " + "parallel edges."); + } + } +} + +template +std::enable_if_t check_graph_constructor_input_arguments( + raft::handle_t const& handle, + edgelist_t const& edgelist, + graph_meta_t meta, + bool do_expensive_check) +{ + // cheap error checks + + auto is_weighted = edgelist.p_edge_weights.has_value(); + + CUGRAPH_EXPECTS( + ((edgelist.number_of_edges == 0) || (edgelist.p_src_vertices != nullptr)) && + ((edgelist.number_of_edges == 0) || (edgelist.p_dst_vertices != nullptr)) && + (!is_weighted || (is_weighted && ((edgelist.number_of_edges == 0) || + (*(edgelist.p_edge_weights) != nullptr)))), + "Invalid input argument: edgelist.p_src_vertices and edgelist.p_dst_vertices should not be " + "nullptr if edgelist.number_of_edges > 0 and edgelist.p_edge_weights should be neither " + "std::nullopt nor nullptr if weighted and edgelist.number_of_edges > 0."); + + CUGRAPH_EXPECTS( + !meta.segment_offsets.has_value() || + ((*(meta.segment_offsets)).size() == (detail::num_sparse_segments_per_vertex_partition + 1)), + "Invalid input argument: (*(meta.segment_offsets)).size() returns an invalid value."); + + // optional expensive checks + + if (do_expensive_check) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(store_transposed ? edgelist.p_dst_vertices : edgelist.p_src_vertices, + store_transposed ? 
edgelist.p_src_vertices : edgelist.p_dst_vertices)); + // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved + CUGRAPH_EXPECTS( + thrust::count_if( + handle.get_thrust_policy(), + edge_first, + edge_first + edgelist.number_of_edges, + out_of_range_t{0, meta.number_of_vertices, 0, meta.number_of_vertices}) == 0, + "Invalid input argument: edgelist have out-of-range values."); + + if (meta.properties.is_symmetric) { + CUGRAPH_EXPECTS( + (check_symmetric( + handle, std::vector>{edgelist})), + "Invalid input argument: meta.property.is_symmetric is true but the input edge list is not " + "symmetric."); + } + if (!meta.properties.is_multigraph) { + CUGRAPH_EXPECTS( + check_no_parallel_edge(handle, + std::vector>{edgelist}), + "Invalid input argument: meta.property.is_multigraph is false but the input edge list has " + "parallel edges."); + } + } +} + +template +std::vector aggregate_segment_offsets(raft::handle_t const& handle, + std::vector const& segment_offsets) +{ + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + rmm::device_uvector d_segment_offsets(segment_offsets.size(), handle.get_stream()); + raft::update_device( + d_segment_offsets.data(), segment_offsets.data(), segment_offsets.size(), handle.get_stream()); + rmm::device_uvector d_aggregate_segment_offsets( + col_comm_size * d_segment_offsets.size(), handle.get_stream()); + col_comm.allgather(d_segment_offsets.data(), + d_aggregate_segment_offsets.data(), + d_segment_offsets.size(), + handle.get_stream()); + + std::vector h_aggregate_segment_offsets(d_aggregate_segment_offsets.size(), + vertex_t{0}); + raft::update_host(h_aggregate_segment_offsets.data(), + d_aggregate_segment_offsets.data(), + d_aggregate_segment_offsets.size(), + handle.get_stream()); + + handle.sync_stream(); // this is necessary as h_aggregate_offsets can be used right after return. + + return h_aggregate_segment_offsets; +} + +template +std::enable_if_t>, + std::optional>, + std::optional>, + std::optional>>> +update_local_sorted_unique_edge_majors_minors( + raft::handle_t const& handle, + graph_meta_t const& meta, + std::optional> const& adj_matrix_partition_segment_offsets, + std::vector> const& adj_matrix_partition_offsets, + std::vector> const& adj_matrix_partition_indices, + std::optional>> const& + adj_matrix_partition_dcs_nzd_vertices, + std::optional> const& adj_matrix_partition_dcs_nzd_vertex_counts) +{ + auto& comm = handle.get_comms(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + auto use_dcs = + meta.segment_offsets + ? ((*(meta.segment_offsets)).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) + : false; + + std::optional> local_sorted_unique_edge_majors{std::nullopt}; + std::optional> local_sorted_unique_edge_major_offsets{std::nullopt}; + + std::optional> local_sorted_unique_edge_minors{std::nullopt}; + std::optional> local_sorted_unique_edge_minor_offsets{std::nullopt}; + + // if # unique edge majors/minors << V / row_comm_size|col_comm_size, store unique edge + // majors/minors to support storing edge major/minor properties in (key, value) pairs. 
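// [editorial note, not in the original patch] Illustrative arithmetic for this heuristic,
// with assumed numbers: if a GPU's aggregate major range spans 2^27 vertices and only
// 2^25 of them actually appear as edge majors, the fill ratio is 2^25 / 2^27 = 0.25. The
// code below takes the maximum of this ratio over all GPUs; only when that maximum falls
// below detail::row_col_properties_kv_pair_fill_ratio_threshold is the sparse
// (key, value) representation chosen over a dense value array sized to the full local
// vertex range.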
+ + vertex_t num_local_unique_edge_majors{0}; + for (size_t i = 0; i < adj_matrix_partition_offsets.size(); ++i) { + num_local_unique_edge_majors += thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator( + static_cast(adj_matrix_partition_offsets[i].size() - 1)), + has_nzd_t{adj_matrix_partition_offsets[i].data(), vertex_t{0}}); + } + + auto [minor_first, minor_last] = meta.partition.get_matrix_partition_minor_range(); + rmm::device_uvector minor_bitmaps( + ((minor_last - minor_first) + sizeof(uint32_t) * 8 - 1) / (sizeof(uint32_t) * 8), + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), minor_bitmaps.begin(), minor_bitmaps.end(), uint32_t{0}); + for (size_t i = 0; i < adj_matrix_partition_indices.size(); ++i) { + thrust::for_each(handle.get_thrust_policy(), + adj_matrix_partition_indices[i].begin(), + adj_matrix_partition_indices[i].end(), + atomic_or_bitmap_t{minor_bitmaps.data(), minor_first}); + } + + auto count_first = thrust::make_transform_iterator(minor_bitmaps.begin(), popc_t{}); + auto num_local_unique_edge_minors = thrust::reduce( + handle.get_thrust_policy(), count_first, count_first + minor_bitmaps.size(), vertex_t{0}); + + minor_bitmaps.resize(0, handle.get_stream()); + minor_bitmaps.shrink_to_fit(handle.get_stream()); + + vertex_t aggregate_major_size{0}; + for (size_t i = 0; i < meta.partition.get_number_of_matrix_partitions(); ++i) { + aggregate_major_size += meta.partition.get_matrix_partition_major_size(i); + } + auto minor_size = meta.partition.get_matrix_partition_minor_size(); + auto max_major_properties_fill_ratio = host_scalar_allreduce( + comm, + static_cast(num_local_unique_edge_majors) / static_cast(aggregate_major_size), + raft::comms::op_t::MAX, + handle.get_stream()); + auto max_minor_properties_fill_ratio = host_scalar_allreduce( + comm, + static_cast(num_local_unique_edge_minors) / static_cast(minor_size), + raft::comms::op_t::MAX, + handle.get_stream()); + + if (max_major_properties_fill_ratio < detail::row_col_properties_kv_pair_fill_ratio_threshold) { + local_sorted_unique_edge_majors = + rmm::device_uvector(num_local_unique_edge_majors, handle.get_stream()); + size_t cur_size{0}; + for (size_t i = 0; i < adj_matrix_partition_offsets.size(); ++i) { + auto [major_first, major_last] = meta.partition.get_matrix_partition_major_range(i); + auto major_hypersparse_first = + use_dcs ? std::optional{major_first + + (*adj_matrix_partition_segment_offsets) + [(*(meta.segment_offsets)).size() * i + + detail::num_sparse_segments_per_vertex_partition]} + : std::nullopt; + cur_size += thrust::distance( + (*local_sorted_unique_edge_majors).data() + cur_size, + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(major_first), + thrust::make_counting_iterator(use_dcs ? 
*major_hypersparse_first : major_last), + (*local_sorted_unique_edge_majors).data() + cur_size, + has_nzd_t{adj_matrix_partition_offsets[i].data(), major_first})); + if (use_dcs) { + thrust::copy(handle.get_thrust_policy(), + (*adj_matrix_partition_dcs_nzd_vertices)[i].begin(), + (*adj_matrix_partition_dcs_nzd_vertices)[i].begin() + + (*adj_matrix_partition_dcs_nzd_vertex_counts)[i], + (*local_sorted_unique_edge_majors).data() + cur_size); + cur_size += (*adj_matrix_partition_dcs_nzd_vertex_counts)[i]; + } + } + assert(cur_size == num_local_unique_edge_majors); + + std::vector h_vertex_partition_firsts(col_comm_size - 1); + for (int i = 1; i < col_comm_size; ++i) { + h_vertex_partition_firsts[i - 1] = + meta.partition.get_vertex_partition_first(i * row_comm_size + row_comm_rank); + } + rmm::device_uvector d_vertex_partition_firsts(h_vertex_partition_firsts.size(), + handle.get_stream()); + raft::update_device(d_vertex_partition_firsts.data(), + h_vertex_partition_firsts.data(), + h_vertex_partition_firsts.size(), + handle.get_stream()); + rmm::device_uvector d_key_offsets(d_vertex_partition_firsts.size(), + handle.get_stream()); + thrust::lower_bound(handle.get_thrust_policy(), + (*local_sorted_unique_edge_majors).begin(), + (*local_sorted_unique_edge_majors).end(), + d_vertex_partition_firsts.begin(), + d_vertex_partition_firsts.end(), + d_key_offsets.begin()); + std::vector h_key_offsets(col_comm_size + 1, vertex_t{0}); + h_key_offsets.back() = static_cast((*local_sorted_unique_edge_majors).size()); + raft::update_host( + h_key_offsets.data() + 1, d_key_offsets.data(), d_key_offsets.size(), handle.get_stream()); + + local_sorted_unique_edge_major_offsets = std::move(h_key_offsets); + } + + if (max_minor_properties_fill_ratio < detail::row_col_properties_kv_pair_fill_ratio_threshold) { + local_sorted_unique_edge_minors = rmm::device_uvector(0, handle.get_stream()); + for (size_t i = 0; i < adj_matrix_partition_indices.size(); ++i) { + rmm::device_uvector tmp_minors(adj_matrix_partition_indices[i].size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + adj_matrix_partition_indices[i].begin(), + adj_matrix_partition_indices[i].end(), + tmp_minors.begin()); + thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); + tmp_minors.resize( + thrust::distance( + tmp_minors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), + handle.get_stream()); + auto cur_size = (*local_sorted_unique_edge_minors).size(); + if (cur_size == 0) { + (*local_sorted_unique_edge_minors) = std::move(tmp_minors); + } else { + (*local_sorted_unique_edge_minors) + .resize((*local_sorted_unique_edge_minors).size() + tmp_minors.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + tmp_minors.begin(), + tmp_minors.end(), + (*local_sorted_unique_edge_minors).begin() + cur_size); + } + } + thrust::sort(handle.get_thrust_policy(), + (*local_sorted_unique_edge_minors).begin(), + (*local_sorted_unique_edge_minors).end()); + (*local_sorted_unique_edge_minors) + .resize(thrust::distance((*local_sorted_unique_edge_minors).begin(), + thrust::unique(handle.get_thrust_policy(), + (*local_sorted_unique_edge_minors).begin(), + (*local_sorted_unique_edge_minors).end())), + handle.get_stream()); + (*local_sorted_unique_edge_minors).shrink_to_fit(handle.get_stream()); + + std::vector h_vertex_partition_firsts(row_comm_size - 1); + for (int i = 1; i < row_comm_size; ++i) { + h_vertex_partition_firsts[i - 1] = + 
meta.partition.get_vertex_partition_first(col_comm_rank * row_comm_size + i); + } + rmm::device_uvector d_vertex_partition_firsts(h_vertex_partition_firsts.size(), + handle.get_stream()); + raft::update_device(d_vertex_partition_firsts.data(), + h_vertex_partition_firsts.data(), + h_vertex_partition_firsts.size(), + handle.get_stream()); + rmm::device_uvector d_key_offsets(d_vertex_partition_firsts.size(), + handle.get_stream()); + thrust::lower_bound(handle.get_thrust_policy(), + (*local_sorted_unique_edge_minors).begin(), + (*local_sorted_unique_edge_minors).end(), + d_vertex_partition_firsts.begin(), + d_vertex_partition_firsts.end(), + d_key_offsets.begin()); + std::vector h_key_offsets(row_comm_size + 1, vertex_t{0}); + h_key_offsets.back() = static_cast((*local_sorted_unique_edge_minors).size()); + raft::update_host( + h_key_offsets.data() + 1, d_key_offsets.data(), d_key_offsets.size(), handle.get_stream()); + + local_sorted_unique_edge_minor_offsets = std::move(h_key_offsets); + } + + return std::make_tuple(std::move(local_sorted_unique_edge_majors), + std::move(local_sorted_unique_edge_major_offsets), + std::move(local_sorted_unique_edge_minors), + std::move(local_sorted_unique_edge_minor_offsets)); +} + // compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid template std::tuple, @@ -504,129 +897,157 @@ template graph_t>:: graph_t(raft::handle_t const& handle, - std::vector> const& edgelists, + std::vector> const& edgelists, + graph_meta_t meta, + bool do_expensive_check) + : detail::graph_base_t( + handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), + partition_(meta.partition) +{ + auto is_weighted = edgelists[0].p_edge_weights.has_value(); + auto use_dcs = + meta.segment_offsets + ? ((*(meta.segment_offsets)).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) + : false; + + check_graph_constructor_input_arguments( + handle, edgelists, meta, do_expensive_check); + + if (meta.segment_offsets) { + adj_matrix_partition_segment_offsets_ = + aggregate_segment_offsets(handle, (*meta.segment_offsets)); + } + + // compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid + + adj_matrix_partition_offsets_.reserve(edgelists.size()); + adj_matrix_partition_indices_.reserve(edgelists.size()); + if (is_weighted) { + adj_matrix_partition_weights_ = std::vector>{}; + (*adj_matrix_partition_weights_).reserve(edgelists.size()); + } + if (use_dcs) { + adj_matrix_partition_dcs_nzd_vertices_ = std::vector>{}; + adj_matrix_partition_dcs_nzd_vertex_counts_ = std::vector{}; + (*adj_matrix_partition_dcs_nzd_vertices_).reserve(edgelists.size()); + (*adj_matrix_partition_dcs_nzd_vertex_counts_).reserve(edgelists.size()); + } + for (size_t i = 0; i < edgelists.size(); ++i) { + auto [major_first, major_last] = partition_.get_matrix_partition_major_range(i); + auto [minor_first, minor_last] = partition_.get_matrix_partition_minor_range(); + auto major_hypersparse_first = + use_dcs ? 
std::optional{major_first + + (*adj_matrix_partition_segment_offsets_) + [(*(meta.segment_offsets)).size() * i + + detail::num_sparse_segments_per_vertex_partition]} + : std::nullopt; + auto [offsets, indices, weights, dcs_nzd_vertices] = + compress_edgelist(edgelists[i], + major_first, + major_hypersparse_first, + major_last, + minor_first, + minor_last, + handle.get_stream()); + + adj_matrix_partition_offsets_.push_back(std::move(offsets)); + adj_matrix_partition_indices_.push_back(std::move(indices)); + if (is_weighted) { (*adj_matrix_partition_weights_).push_back(std::move(*weights)); } + if (use_dcs) { + auto dcs_nzd_vertex_count = static_cast((*dcs_nzd_vertices).size()); + (*adj_matrix_partition_dcs_nzd_vertices_).push_back(std::move(*dcs_nzd_vertices)); + (*adj_matrix_partition_dcs_nzd_vertex_counts_).push_back(dcs_nzd_vertex_count); + } + } + + // segmented sort neighbors + + for (size_t i = 0; i < adj_matrix_partition_offsets_.size(); ++i) { + sort_adjacency_list(handle, + adj_matrix_partition_offsets_[i].data(), + adj_matrix_partition_indices_[i].data(), + adj_matrix_partition_weights_ + ? std::optional{(*adj_matrix_partition_weights_)[i].data()} + : std::nullopt, + static_cast(adj_matrix_partition_offsets_[i].size() - 1), + static_cast(adj_matrix_partition_indices_[i].size())); + } + + // update local sorted unique edge sources/destinations (only if key, value pair will be used) + + std::tie(store_transposed ? local_sorted_unique_edge_cols_ : local_sorted_unique_edge_rows_, + store_transposed ? local_sorted_unique_edge_col_offsets_ + : local_sorted_unique_edge_row_offsets_, + store_transposed ? local_sorted_unique_edge_rows_ : local_sorted_unique_edge_cols_, + store_transposed ? local_sorted_unique_edge_row_offsets_ + : local_sorted_unique_edge_col_offsets_) = + update_local_sorted_unique_edge_majors_minors( + handle, + meta, + adj_matrix_partition_segment_offsets_, + adj_matrix_partition_offsets_, + adj_matrix_partition_indices_, + adj_matrix_partition_dcs_nzd_vertices_, + adj_matrix_partition_dcs_nzd_vertex_counts_); +} + +template +graph_t>:: + graph_t(raft::handle_t const& handle, + std::vector>&& edgelist_src_partitions, + std::vector>&& edgelist_dst_partitions, + std::optional>>&& edgelist_weight_partitions, graph_meta_t meta, bool do_expensive_check) : detail::graph_base_t( handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), partition_(meta.partition) { - // cheap error checks - - auto& comm = this->get_handle_ptr()->get_comms(); - auto const comm_size = comm.get_size(); - auto& row_comm = - this->get_handle_ptr()->get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = - this->get_handle_ptr()->get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - auto default_stream_view = this->get_handle_ptr()->get_stream(); - - CUGRAPH_EXPECTS(edgelists.size() == static_cast(col_comm_size), - "Invalid input argument: erroneous edgelists.size()."); CUGRAPH_EXPECTS( - !(meta.segment_offsets).has_value() || - ((*(meta.segment_offsets)).size() == - (detail::num_sparse_segments_per_vertex_partition + 1)) || - ((*(meta.segment_offsets)).size() == (detail::num_sparse_segments_per_vertex_partition + 2)), - "Invalid input argument: (*(meta.segment_offsets)).size() returns an invalid value."); + edgelist_src_partitions.size() == 
edgelist_dst_partitions.size(), + "Invalid input argument: edgelist_src_partitions.size() != edgelist_dst_partitions.size()."); + CUGRAPH_EXPECTS(!edgelist_weight_partitions.has_value() || + (edgelist_src_partitions.size() == (*edgelist_weight_partitions).size()), + "Invalid input argument: edgelist_weight_partitions.has_value() && " + "edgelist_src_partitions.size() != (*edgelist_weight_partitions).size()."); + for (size_t i = 0; i < edgelist_src_partitions.size(); ++i) { + CUGRAPH_EXPECTS(edgelist_src_partitions[i].size() == edgelist_dst_partitions[i].size(), + "Invalid input argument: edgelist_src_partitions[].size() != " + "edgelist_dst_partitions[].size()."); + CUGRAPH_EXPECTS( + !edgelist_weight_partitions.has_value() || + (edgelist_src_partitions[i].size() == (*edgelist_weight_partitions)[i].size()), + "Invalid input argument: edgelist_weight_partitions.has_value() && " + "edgelist_src_partitions[].size() != (*edgelist_weight_partitions)[].size()."); + } - auto is_weighted = edgelists[0].p_edge_weights.has_value(); + auto is_weighted = edgelist_weight_partitions.has_value(); auto use_dcs = meta.segment_offsets ? ((*(meta.segment_offsets)).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) : false; - CUGRAPH_EXPECTS( - std::any_of(edgelists.begin(), - edgelists.end(), - [is_weighted](auto edgelist) { - return ((edgelist.number_of_edges > 0) && (edgelist.p_src_vertices == nullptr)) || - ((edgelist.number_of_edges > 0) && (edgelist.p_dst_vertices == nullptr)) || - (is_weighted && (edgelist.number_of_edges > 0) && - ((edgelist.p_edge_weights.has_value() == false) || - (*(edgelist.p_edge_weights) == nullptr))); - }) == false, - "Invalid input argument: edgelists[].p_src_vertices and edgelists[].p_dst_vertices should not " - "be nullptr if edgelists[].number_of_edges > 0 and edgelists[].p_edge_weights should be " - "neither std::nullopt nor nullptr if weighted and edgelists[].number_of_edges > 0."); - - // optional expensive checks - - if (do_expensive_check) { - edge_t number_of_local_edges{0}; - for (size_t i = 0; i < edgelists.size(); ++i) { - auto [major_first, major_last] = partition_.get_matrix_partition_major_range(i); - auto [minor_first, minor_last] = partition_.get_matrix_partition_minor_range(); - - number_of_local_edges += edgelists[i].number_of_edges; - - auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( - store_transposed ? edgelists[i].p_dst_vertices : edgelists[i].p_src_vertices, - store_transposed ? 
edgelists[i].p_src_vertices : edgelists[i].p_dst_vertices)); - // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved - CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(default_stream_view), - edge_first, - edge_first + edgelists[i].number_of_edges, - out_of_range_t{ - major_first, major_last, minor_first, minor_last}) == 0, - "Invalid input argument: edgelists[] have out-of-range values."); - } - auto number_of_local_edges_sum = host_scalar_allreduce( - comm, number_of_local_edges, raft::comms::op_t::SUM, default_stream_view.value()); - CUGRAPH_EXPECTS(number_of_local_edges_sum == this->get_number_of_edges(), - "Invalid input argument: the sum of local edge counts does not match with " - "meta.number_of_edges."); - - CUGRAPH_EXPECTS( - partition_.get_vertex_partition_last(comm_size - 1) == meta.number_of_vertices, - "Invalid input argument: vertex partition should cover [0, meta.number_of_vertices)."); - - if (this->is_symmetric()) { - CUGRAPH_EXPECTS( - (check_symmetric(handle, - edgelists)), - "Invalid input argument: meta.property.is_symmetric is true but the input edge list is not " - "symmetric."); - } - if (!this->is_multigraph()) { - CUGRAPH_EXPECTS( - check_no_parallel_edge(handle, edgelists), - "Invalid input argument: meta.property.is_multigraph is false but the input edge list has " - "parallel edges."); - } + std::vector> edgelists(edgelist_src_partitions.size()); + for (size_t i = 0; i < edgelists.size(); ++i) { + edgelists[i] = edgelist_t{ + edgelist_src_partitions[i].data(), + edgelist_dst_partitions[i].data(), + edgelist_weight_partitions + ? std::optional{(*edgelist_weight_partitions)[i].data()} + : std::nullopt, + static_cast(edgelist_src_partitions[i].size())}; } - // aggregate segment_offsets + check_graph_constructor_input_arguments( + handle, edgelists, meta, do_expensive_check); if (meta.segment_offsets) { - // FIXME: we need to add host_allgather - rmm::device_uvector d_segment_offsets((*(meta.segment_offsets)).size(), - default_stream_view); - raft::update_device(d_segment_offsets.data(), - (*(meta.segment_offsets)).data(), - (*(meta.segment_offsets)).size(), - default_stream_view.value()); - rmm::device_uvector d_aggregate_segment_offsets( - col_comm_size * d_segment_offsets.size(), default_stream_view); - col_comm.allgather(d_segment_offsets.data(), - d_aggregate_segment_offsets.data(), - d_segment_offsets.size(), - default_stream_view.value()); - adj_matrix_partition_segment_offsets_ = - std::vector(d_aggregate_segment_offsets.size(), vertex_t{0}); - raft::update_host((*adj_matrix_partition_segment_offsets_).data(), - d_aggregate_segment_offsets.data(), - d_aggregate_segment_offsets.size(), - default_stream_view.value()); - - default_stream_view - .synchronize(); // this is necessary as adj_matrix_partition_segment_offsets_ can be used - // right after return. + aggregate_segment_offsets(handle, (*meta.segment_offsets)); } // compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid @@ -659,7 +1080,15 @@ graph_t(adj_matrix_partition_indices_[i].size())); } - // if # unique edge rows/cols << V / row_comm_size|col_comm_size, store unique edge rows/cols to - // support storing edge row/column properties in (key, value) pairs. 
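[editorial note, not part of the patch] The deleted block that follows is the same unique-edge-major/minor bookkeeping shown earlier in this patch; it has been hoisted out of the constructor into update_local_sorted_unique_edge_majors_minors() so that the const-reference and the new R-value constructors share a single implementation.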
- - vertex_t num_local_unique_edge_majors{0}; - for (size_t i = 0; i < adj_matrix_partition_offsets_.size(); ++i) { - num_local_unique_edge_majors += thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator( - static_cast(adj_matrix_partition_offsets_[i].size() - 1)), - has_nzd_t{adj_matrix_partition_offsets_[i].data(), vertex_t{0}}); - } - - auto [minor_first, minor_last] = partition_.get_matrix_partition_minor_range(); - rmm::device_uvector minor_bitmaps( - ((minor_last - minor_first) + sizeof(uint32_t) * 8 - 1) / (sizeof(uint32_t) * 8), - handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), minor_bitmaps.begin(), minor_bitmaps.end(), uint32_t{0}); - for (size_t i = 0; i < adj_matrix_partition_indices_.size(); ++i) { - thrust::for_each(handle.get_thrust_policy(), - adj_matrix_partition_indices_[i].begin(), - adj_matrix_partition_indices_[i].end(), - atomic_or_bitmap_t{minor_bitmaps.data(), minor_first}); - } - - auto count_first = thrust::make_transform_iterator(minor_bitmaps.begin(), popc_t{}); - auto num_local_unique_edge_minors = thrust::reduce( - handle.get_thrust_policy(), count_first, count_first + minor_bitmaps.size(), vertex_t{0}); - - minor_bitmaps.resize(0, handle.get_stream()); - minor_bitmaps.shrink_to_fit(handle.get_stream()); - - vertex_t aggregate_major_size{0}; - for (size_t i = 0; i < partition_.get_number_of_matrix_partitions(); ++i) { - aggregate_major_size += partition_.get_matrix_partition_major_size(i); - } - auto minor_size = partition_.get_matrix_partition_minor_size(); - auto max_major_properties_fill_ratio = host_scalar_allreduce( - comm, - static_cast(num_local_unique_edge_majors) / static_cast(aggregate_major_size), - raft::comms::op_t::MAX, - handle.get_stream()); - auto max_minor_properties_fill_ratio = host_scalar_allreduce( - comm, - static_cast(num_local_unique_edge_minors) / static_cast(minor_size), - raft::comms::op_t::MAX, - handle.get_stream()); - - if (max_major_properties_fill_ratio < detail::row_col_properties_kv_pair_fill_ratio_threshold) { - rmm::device_uvector local_sorted_unique_edge_majors(num_local_unique_edge_majors, - handle.get_stream()); - size_t cur_size{0}; - for (size_t i = 0; i < adj_matrix_partition_offsets_.size(); ++i) { - auto [major_first, major_last] = partition_.get_matrix_partition_major_range(i); - auto major_hypersparse_first = - use_dcs ? std::optional{major_first + - (*adj_matrix_partition_segment_offsets_) - [(*(meta.segment_offsets)).size() * i + - detail::num_sparse_segments_per_vertex_partition]} - : std::nullopt; - cur_size += thrust::distance( - local_sorted_unique_edge_majors.data() + cur_size, - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(major_first), - thrust::make_counting_iterator(use_dcs ? 
*major_hypersparse_first : major_last), - local_sorted_unique_edge_majors.data() + cur_size, - has_nzd_t{adj_matrix_partition_offsets_[i].data(), major_first})); - if (use_dcs) { - thrust::copy(handle.get_thrust_policy(), - (*adj_matrix_partition_dcs_nzd_vertices_)[i].begin(), - (*adj_matrix_partition_dcs_nzd_vertices_)[i].begin() + - (*adj_matrix_partition_dcs_nzd_vertex_counts_)[i], - local_sorted_unique_edge_majors.data() + cur_size); - cur_size += (*adj_matrix_partition_dcs_nzd_vertex_counts_)[i]; - } - } - assert(cur_size == num_local_unique_edge_majors); - - std::vector h_vertex_partition_firsts(col_comm_size - 1); - for (int i = 1; i < col_comm_size; ++i) { - h_vertex_partition_firsts[i - 1] = - partition_.get_vertex_partition_first(i * row_comm_size + row_comm_rank); - } - rmm::device_uvector d_vertex_partition_firsts(h_vertex_partition_firsts.size(), - handle.get_stream()); - raft::update_device(d_vertex_partition_firsts.data(), - h_vertex_partition_firsts.data(), - h_vertex_partition_firsts.size(), - handle.get_stream()); - rmm::device_uvector d_key_offsets(d_vertex_partition_firsts.size(), - handle.get_stream()); - thrust::lower_bound(handle.get_thrust_policy(), - local_sorted_unique_edge_majors.begin(), - local_sorted_unique_edge_majors.end(), - d_vertex_partition_firsts.begin(), - d_vertex_partition_firsts.end(), - d_key_offsets.begin()); - std::vector h_key_offsets(col_comm_size + 1, vertex_t{0}); - h_key_offsets.back() = static_cast(local_sorted_unique_edge_majors.size()); - raft::update_host( - h_key_offsets.data() + 1, d_key_offsets.data(), d_key_offsets.size(), handle.get_stream()); - - if constexpr (store_transposed) { - local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_majors); - local_sorted_unique_edge_col_offsets_ = std::move(h_key_offsets); - } else { - local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_majors); - local_sorted_unique_edge_row_offsets_ = std::move(h_key_offsets); - } - } - - if (max_minor_properties_fill_ratio < detail::row_col_properties_kv_pair_fill_ratio_threshold) { - rmm::device_uvector local_sorted_unique_edge_minors(0, handle.get_stream()); - for (size_t i = 0; i < adj_matrix_partition_indices_.size(); ++i) { - rmm::device_uvector tmp_minors(adj_matrix_partition_indices_[i].size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - adj_matrix_partition_indices_[i].begin(), - adj_matrix_partition_indices_[i].end(), - tmp_minors.begin()); - thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); - tmp_minors.resize( - thrust::distance( - tmp_minors.begin(), - thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), - handle.get_stream()); - auto cur_size = local_sorted_unique_edge_minors.size(); - if (cur_size == 0) { - local_sorted_unique_edge_minors = std::move(tmp_minors); - } else { - local_sorted_unique_edge_minors.resize( - local_sorted_unique_edge_minors.size() + tmp_minors.size(), handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - tmp_minors.begin(), - tmp_minors.end(), - local_sorted_unique_edge_minors.begin() + cur_size); - } - } - thrust::sort(handle.get_thrust_policy(), - local_sorted_unique_edge_minors.begin(), - local_sorted_unique_edge_minors.end()); - local_sorted_unique_edge_minors.resize( - thrust::distance(local_sorted_unique_edge_minors.begin(), - thrust::unique(handle.get_thrust_policy(), - local_sorted_unique_edge_minors.begin(), - local_sorted_unique_edge_minors.end())), - handle.get_stream()); - 
local_sorted_unique_edge_minors.shrink_to_fit(handle.get_stream()); - - std::vector h_vertex_partition_firsts(row_comm_size - 1); - for (int i = 1; i < row_comm_size; ++i) { - h_vertex_partition_firsts[i - 1] = - partition_.get_vertex_partition_first(col_comm_rank * row_comm_size + i); - } - rmm::device_uvector d_vertex_partition_firsts(h_vertex_partition_firsts.size(), - handle.get_stream()); - raft::update_device(d_vertex_partition_firsts.data(), - h_vertex_partition_firsts.data(), - h_vertex_partition_firsts.size(), - handle.get_stream()); - rmm::device_uvector d_key_offsets(d_vertex_partition_firsts.size(), - handle.get_stream()); - thrust::lower_bound(handle.get_thrust_policy(), - local_sorted_unique_edge_minors.begin(), - local_sorted_unique_edge_minors.end(), - d_vertex_partition_firsts.begin(), - d_vertex_partition_firsts.end(), - d_key_offsets.begin()); - std::vector h_key_offsets(row_comm_size + 1, vertex_t{0}); - h_key_offsets.back() = static_cast(local_sorted_unique_edge_minors.size()); - raft::update_host( - h_key_offsets.data() + 1, d_key_offsets.data(), d_key_offsets.size(), handle.get_stream()); + // update local sorted unique edge sources/destinations (only if key, value pair will be used) - if constexpr (store_transposed) { - local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_minors); - local_sorted_unique_edge_row_offsets_ = std::move(h_key_offsets); - } else { - local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_minors); - local_sorted_unique_edge_col_offsets_ = std::move(h_key_offsets); - } - } + std::tie(store_transposed ? local_sorted_unique_edge_cols_ : local_sorted_unique_edge_rows_, + store_transposed ? local_sorted_unique_edge_col_offsets_ + : local_sorted_unique_edge_row_offsets_, + store_transposed ? local_sorted_unique_edge_rows_ : local_sorted_unique_edge_cols_, + store_transposed ? local_sorted_unique_edge_row_offsets_ + : local_sorted_unique_edge_col_offsets_) = + update_local_sorted_unique_edge_majors_minors( + handle, + meta, + adj_matrix_partition_segment_offsets_, + adj_matrix_partition_offsets_, + adj_matrix_partition_indices_, + adj_matrix_partition_dcs_nzd_vertices_, + adj_matrix_partition_dcs_nzd_vertex_counts_); } template (0, handle.get_stream())), segment_offsets_(meta.segment_offsets) { - // cheap error checks + check_graph_constructor_input_arguments( + handle, edgelist, meta, do_expensive_check); - auto default_stream_view = this->get_handle_ptr()->get_stream(); + // convert edge list (COO) to compressed sparse format (CSR or CSC) - auto is_weighted = edgelist.p_edge_weights.has_value(); + std::tie(offsets_, indices_, weights_, std::ignore) = + compress_edgelist(edgelist, + vertex_t{0}, + std::optional{std::nullopt}, + this->get_number_of_vertices(), + vertex_t{0}, + this->get_number_of_vertices(), + handle.get_stream()); - CUGRAPH_EXPECTS( - ((edgelist.number_of_edges == 0) || (edgelist.p_src_vertices != nullptr)) && - ((edgelist.number_of_edges == 0) || (edgelist.p_dst_vertices != nullptr)) && - (!is_weighted || (is_weighted && ((edgelist.number_of_edges == 0) || - (*(edgelist.p_edge_weights) != nullptr)))), - "Invalid input argument: edgelist.p_src_vertices and edgelist.p_dst_vertices should not be " - "nullptr if edgelist.number_of_edges > 0 and edgelist.p_edge_weights should be neither " - "std::nullopt nor nullptr if weighted and edgelist.number_of_edges > 0."); + // segmented sort neighbors + + sort_adjacency_list(handle, + offsets_.data(), + indices_.data(), + weights_ ? 
std::optional{(*weights_).data()} : std::nullopt, + static_cast(offsets_.size() - 1), + static_cast(indices_.size())); } +template +graph_t>:: + graph_t(raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + graph_meta_t meta, + bool do_expensive_check) + : detail::graph_base_t( + handle, meta.number_of_vertices, static_cast(edgelist_srcs.size()), meta.properties), + offsets_(rmm::device_uvector(0, handle.get_stream())), + indices_(rmm::device_uvector(0, handle.get_stream())), + segment_offsets_(meta.segment_offsets) +{ + CUGRAPH_EXPECTS(edgelist_srcs.size() == edgelist_dsts.size(), + "Invalid input argument: edgelist_srcs.size() != edgelist_dsts.size()."); CUGRAPH_EXPECTS( - !segment_offsets_.has_value() || - ((*segment_offsets_).size() == (detail::num_sparse_segments_per_vertex_partition + 1)), - "Invalid input argument: (*(meta.segment_offsets)).size() returns an invalid value."); + !edgelist_weights.has_value() || (edgelist_srcs.size() == (*edgelist_weights).size()), + "Invalid input argument: edgelist_weights.has_value() && edgelist_srcs.size() != " + "(*edgelist_weights).size()."); - // optional expensive checks + edgelist_t edgelist{ + edgelist_srcs.data(), + edgelist_dsts.data(), + edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, + static_cast(edgelist_srcs.size())}; - if (do_expensive_check) { - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(store_transposed ? edgelist.p_dst_vertices : edgelist.p_src_vertices, - store_transposed ? edgelist.p_src_vertices : edgelist.p_dst_vertices)); - // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved - CUGRAPH_EXPECTS(thrust::count_if( - rmm::exec_policy(default_stream_view), - edge_first, - edge_first + edgelist.number_of_edges, - out_of_range_t{ - 0, this->get_number_of_vertices(), 0, this->get_number_of_vertices()}) == 0, - "Invalid input argument: edgelist have out-of-range values."); - - if (this->is_symmetric()) { - CUGRAPH_EXPECTS( - (check_symmetric( - handle, std::vector>{edgelist})), - "Invalid input argument: meta.property.is_symmetric is true but the input edge list is not " - "symmetric."); - } - if (!this->is_multigraph()) { - CUGRAPH_EXPECTS( - check_no_parallel_edge(handle, - std::vector>{edgelist}), - "Invalid input argument: meta.property.is_multigraph is false but the input edge list has " - "parallel edges."); - } - } + check_graph_constructor_input_arguments( + handle, edgelist, meta, do_expensive_check); // convert edge list (COO) to compressed sparse format (CSR or CSC) @@ -944,7 +1214,15 @@ graph_tget_number_of_vertices(), vertex_t{0}, this->get_number_of_vertices(), - default_stream_view); + handle.get_stream()); + edgelist_srcs.resize(0, handle.get_stream()); + edgelist_srcs.shrink_to_fit(handle.get_stream()); + edgelist_dsts.resize(0, handle.get_stream()); + edgelist_dsts.shrink_to_fit(handle.get_stream()); + if (edgelist_weights) { + (*edgelist_weights).resize(0, handle.get_stream()); + (*edgelist_weights).shrink_to_fit(handle.get_stream()); + } // segmented sort neighbors From dc4a2137f8d0e2ce4c5cf17651df9245d06ac967 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 18 Feb 2022 01:11:57 -0800 Subject: [PATCH 49/60] update create_graph_from_edgelist to call graph_t taking R-value edgelists --- .../create_graph_from_edgelist_impl.cuh | 77 +++++++++++++------ 1 file changed, 55 insertions(+), 22 deletions(-)
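[editorial note, not part of the patch] The hunks below replace pointer-plus-offset views into one large COO with per-partition rmm::device_uvector copies that are then moved into graph_t. The pattern trades one transient copy for the ability to free each source buffer as soon as it has been split; a compact sketch of the idiom, where all, counts, displs, and num_parts are illustrative assumptions:

  std::vector<rmm::device_uvector<int32_t>> parts{};
  parts.reserve(num_parts);  // num_parts: assumed number of local partitions
  for (int i = 0; i < num_parts; ++i) {
    rmm::device_uvector<int32_t> tmp(counts[i], handle.get_stream());
    thrust::copy(handle.get_thrust_policy(),
                 all.begin() + displs[i],
                 all.begin() + displs[i] + counts[i],
                 tmp.begin());
    parts.push_back(std::move(tmp));
  }
  all.resize(0, handle.get_stream());  // release the original buffer early
  all.shrink_to_fit(handle.get_stream());

diff --git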
a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index ea12a3562ba..0b711936ee0 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -248,15 +248,60 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, edgelist_edge_counts.end() - 1, edgelist_displacements.begin() + 1); + // 2. split the input edges to local partitions + + std::vector> edgelist_src_partitions{}; + edgelist_src_partitions.reserve(col_comm_size); + for (int i = 0; i < col_comm_size; ++i) { + rmm::device_uvector tmp_srcs(edgelist_edge_counts[i], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_rows.begin() + edgelist_displacements[i], + edgelist_rows.begin() + edgelist_displacements[i] + edgelist_edge_counts[i], + tmp_srcs.begin()); + edgelist_src_partitions.push_back(std::move(tmp_srcs)); + } + edgelist_rows.resize(0, handle.get_stream()); + edgelist_rows.shrink_to_fit(handle.get_stream()); + + std::vector> edgelist_dst_partitions{}; + edgelist_dst_partitions.reserve(col_comm_size); + for (int i = 0; i < col_comm_size; ++i) { + rmm::device_uvector tmp_dsts(edgelist_edge_counts[i], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_cols.begin() + edgelist_displacements[i], + edgelist_cols.begin() + edgelist_displacements[i] + edgelist_edge_counts[i], + tmp_dsts.begin()); + edgelist_dst_partitions.push_back(std::move(tmp_dsts)); + } + edgelist_cols.resize(0, handle.get_stream()); + edgelist_cols.shrink_to_fit(handle.get_stream()); + + std::optional>> edgelist_weight_partitions{}; + if (edgelist_weights) { + edgelist_weight_partitions = std::vector>{}; + (*edgelist_weight_partitions).reserve(col_comm_size); + for (int i = 0; i < col_comm_size; ++i) { + rmm::device_uvector tmp_weights(edgelist_edge_counts[i], handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), + (*edgelist_weights).begin() + edgelist_displacements[i], + (*edgelist_weights).begin() + edgelist_displacements[i] + edgelist_edge_counts[i], + tmp_weights.begin()); + (*edgelist_weight_partitions).push_back(std::move(tmp_weights)); + } + (*edgelist_weights).resize(0, handle.get_stream()); + (*edgelist_weights).shrink_to_fit(handle.get_stream()); + } + // 2. renumber std::vector major_ptrs(col_comm_size); std::vector minor_ptrs(major_ptrs.size()); for (int i = 0; i < col_comm_size; ++i) { - major_ptrs[i] = (store_transposed ? edgelist_cols.begin() : edgelist_rows.begin()) + - edgelist_displacements[i]; - minor_ptrs[i] = (store_transposed ? edgelist_rows.begin() : edgelist_cols.begin()) + - edgelist_displacements[i]; + major_ptrs[i] = + store_transposed ? edgelist_dst_partitions[i].begin() : edgelist_src_partitions[i].begin(); + minor_ptrs[i] = + store_transposed ? edgelist_src_partitions[i].begin() : edgelist_dst_partitions[i].begin(); } auto [renumber_map_labels, meta] = cugraph::renumber_edgelist( handle, @@ -268,21 +313,12 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, // 3. create a graph - std::vector> edgelists(col_comm_size); - for (int i = 0; i < col_comm_size; ++i) { - edgelists[i] = cugraph::edgelist_t{ - edgelist_rows.data() + edgelist_displacements[i], - edgelist_cols.data() + edgelist_displacements[i], - edgelist_weights - ? 
std::optional{(*edgelist_weights).data() + edgelist_displacements[i]} - : std::nullopt, - static_cast(edgelist_edge_counts[i])}; - } - return std::make_tuple( cugraph::graph_t( handle, - edgelists, + std::move(edgelist_src_partitions), + std::move(edgelist_dst_partitions), + std::move(edgelist_weight_partitions), cugraph::graph_meta_t{meta.number_of_vertices, meta.number_of_edges, graph_properties, @@ -351,12 +387,9 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, return std::make_tuple( cugraph::graph_t( handle, - cugraph::edgelist_t{ - edgelist_rows.data(), - edgelist_cols.data(), - edgelist_weights ? std::optional{(*edgelist_weights).data()} - : std::nullopt, - static_cast(edgelist_rows.size())}, + std::move(edgelist_rows), + std::move(edgelist_cols), + std::move(edgelist_weights), cugraph::graph_meta_t{ num_vertices, graph_properties, From 5177e86a4845f83052eb708a3d0b01ff289b4fdb Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 18 Feb 2022 01:12:52 -0800 Subject: [PATCH 50/60] update coarsen_graph_impl to call create_graph_from_edgelist instead of directly calling graph_t constructor --- cpp/src/structure/coarsen_graph_impl.cuh | 264 +++++++---------------- 1 file changed, 76 insertions(+), 188 deletions(-) diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index e4cc48dfd99..3d314144fe3 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -223,10 +223,9 @@ template -std::enable_if_t< - multi_gpu, - std::tuple>, - rmm::device_uvector>> +std::enable_if_t, + rmm::device_uvector>> coarsen_graph( raft::handle_t const& handle, graph_view_t const& graph_view, @@ -330,8 +329,8 @@ coarsen_graph( if (edgelist_weights) { (*coarsened_edgelist_weights).push_back(std::move(*edgelist_weights)); } } - // 2. concatenate and groupby and coarsen again (and if the input graph is symmetric, create a - // copy excluding self loops and globally shuffle) + // 2. 
concatenate and groupby and coarsen again (and if the input graph is symmetric, 1) create a + // copy excluding self loops, 2) globally shuffle, and 3) concatenate again) edge_t tot_count{0}; for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { @@ -395,10 +394,11 @@ coarsen_graph( (*concatenated_edgelist_weights).shrink_to_fit(handle.get_stream()); } - std::optional> reversed_edgelist_majors{std::nullopt}; - std::optional> reversed_edgelist_minors{std::nullopt}; - std::optional> reversed_edgelist_weights{std::nullopt}; if (lower_triangular_only) { + rmm::device_uvector reversed_edgelist_majors(0, handle.get_stream()); + rmm::device_uvector reversed_edgelist_minors(0, handle.get_stream()); + std::optional> reversed_edgelist_weights{std::nullopt}; + if (concatenated_edgelist_weights) { auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(concatenated_edgelist_majors.begin(), @@ -409,18 +409,16 @@ coarsen_graph( edge_first, edge_first + concatenated_edgelist_majors.size(), is_not_self_loop_t>{}); - reversed_edgelist_majors = - rmm::device_uvector(thrust::distance(edge_first, last), handle.get_stream()); - reversed_edgelist_minors = - rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + reversed_edgelist_majors.resize(thrust::distance(edge_first, last), handle.get_stream()); + reversed_edgelist_minors.resize(reversed_edgelist_majors.size(), handle.get_stream()); reversed_edgelist_weights = - rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + rmm::device_uvector(reversed_edgelist_majors.size(), handle.get_stream()); thrust::copy( handle.get_thrust_policy(), edge_first, - edge_first + (*reversed_edgelist_majors).size(), - thrust::make_zip_iterator(thrust::make_tuple((*reversed_edgelist_minors).begin(), - (*reversed_edgelist_majors).begin(), + edge_first + reversed_edgelist_majors.size(), + thrust::make_zip_iterator(thrust::make_tuple(reversed_edgelist_minors.begin(), + reversed_edgelist_majors.begin(), (*reversed_edgelist_weights).begin()))); } else { auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( @@ -429,115 +427,53 @@ coarsen_graph( edge_first, edge_first + concatenated_edgelist_majors.size(), is_not_self_loop_t>{}); - reversed_edgelist_majors = - rmm::device_uvector(thrust::distance(edge_first, last), handle.get_stream()); - reversed_edgelist_minors = - rmm::device_uvector((*reversed_edgelist_majors).size(), handle.get_stream()); + reversed_edgelist_majors.resize(thrust::distance(edge_first, last), handle.get_stream()); + reversed_edgelist_minors.resize(reversed_edgelist_majors.size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), edge_first, - edge_first + (*reversed_edgelist_majors).size(), - thrust::make_zip_iterator(thrust::make_tuple( - (*reversed_edgelist_minors).begin(), (*reversed_edgelist_majors).begin()))); + edge_first + reversed_edgelist_majors.size(), + thrust::make_zip_iterator(thrust::make_tuple(reversed_edgelist_minors.begin(), + reversed_edgelist_majors.begin()))); } - std::tie(*reversed_edgelist_majors, *reversed_edgelist_minors, reversed_edgelist_weights) = + std::tie(reversed_edgelist_majors, reversed_edgelist_minors, reversed_edgelist_weights) = cugraph::detail::shuffle_edgelist_by_gpu_id(handle, - std::move(*reversed_edgelist_majors), - std::move(*reversed_edgelist_minors), + std::move(reversed_edgelist_majors), + std::move(reversed_edgelist_minors), std::move(reversed_edgelist_weights)); - } - - // 3. 
split concatenated edge list to local partitions - - auto concatenated_counts = - groupby_and_count_edgelist_by_local_partition_id(handle, - concatenated_edgelist_majors, - concatenated_edgelist_minors, - concatenated_edgelist_weights); - - std::vector h_concatenated_counts(concatenated_counts.size()); - raft::update_host(h_concatenated_counts.data(), - concatenated_counts.data(), - concatenated_counts.size(), - handle.get_stream()); - - std::optional> h_reversed_counts{std::nullopt}; - if (reversed_edgelist_majors) { - auto reversed_counts = groupby_and_count_edgelist_by_local_partition_id( - handle, *reversed_edgelist_majors, *reversed_edgelist_minors, reversed_edgelist_weights); - - h_reversed_counts = std::vector(reversed_counts.size()); - raft::update_host((*h_reversed_counts).data(), - reversed_counts.data(), - reversed_counts.size(), - handle.get_stream()); - } - handle.sync_stream(); - - std::vector h_concatenated_displacements(h_concatenated_counts.size(), size_t{0}); - std::partial_sum(h_concatenated_counts.begin(), - h_concatenated_counts.end() - 1, - h_concatenated_displacements.begin() + 1); - - std::optional> h_reversed_displacements{std::nullopt}; - if (h_reversed_counts) { - h_reversed_displacements = std::vector((*h_reversed_counts).size(), size_t{0}); - std::partial_sum((*h_reversed_counts).begin(), - (*h_reversed_counts).end() - 1, - (*h_reversed_displacements).begin() + 1); - } - - for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { - coarsened_edgelist_majors[i].resize( - h_concatenated_counts[i] + (h_reversed_counts ? (*h_reversed_counts)[i] : size_t{0}), - handle.get_stream()); - coarsened_edgelist_minors[i].resize(coarsened_edgelist_majors[i].size(), handle.get_stream()); - if (coarsened_edgelist_weights) { - (*coarsened_edgelist_weights)[i].resize(coarsened_edgelist_majors[i].size(), - handle.get_stream()); - } + auto output_offset = concatenated_edgelist_majors.size(); + concatenated_edgelist_majors.resize( + concatenated_edgelist_majors.size() + reversed_edgelist_majors.size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), - concatenated_edgelist_majors.begin() + h_concatenated_displacements[i], - concatenated_edgelist_majors.begin() + - (h_concatenated_displacements[i] + h_concatenated_counts[i]), - coarsened_edgelist_majors[i].begin()); + reversed_edgelist_majors.begin(), + reversed_edgelist_majors.end(), + concatenated_edgelist_majors.begin() + output_offset); + reversed_edgelist_majors.resize(0, handle.get_stream()); + reversed_edgelist_majors.shrink_to_fit(handle.get_stream()); + + concatenated_edgelist_minors.resize(concatenated_edgelist_majors.size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), - concatenated_edgelist_minors.begin() + h_concatenated_displacements[i], - concatenated_edgelist_minors.begin() + - (h_concatenated_displacements[i] + h_concatenated_counts[i]), - coarsened_edgelist_minors[i].begin()); - if (coarsened_edgelist_weights) { - thrust::copy(handle.get_thrust_policy(), - (*concatenated_edgelist_weights).begin() + h_concatenated_displacements[i], - (*concatenated_edgelist_weights).begin() + - (h_concatenated_displacements[i] + h_concatenated_counts[i]), - (*coarsened_edgelist_weights)[i].begin()); - } + reversed_edgelist_minors.begin(), + reversed_edgelist_minors.end(), + concatenated_edgelist_minors.begin() + output_offset); + reversed_edgelist_minors.resize(0, handle.get_stream()); + reversed_edgelist_minors.shrink_to_fit(handle.get_stream()); - if (reversed_edgelist_majors) { - 
thrust::copy(handle.get_thrust_policy(), - (*reversed_edgelist_majors).begin() + (*h_reversed_displacements)[i], - (*reversed_edgelist_majors).begin() + - ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), - coarsened_edgelist_majors[i].begin() + h_concatenated_counts[i]); + if (concatenated_edgelist_weights) { + (*concatenated_edgelist_weights) + .resize(concatenated_edgelist_majors.size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), - (*reversed_edgelist_minors).begin() + (*h_reversed_displacements)[i], - (*reversed_edgelist_minors).begin() + - ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), - coarsened_edgelist_minors[i].begin() + h_concatenated_counts[i]); - if (coarsened_edgelist_weights) { - thrust::copy(handle.get_thrust_policy(), - (*reversed_edgelist_weights).begin() + (*h_reversed_displacements)[i], - (*reversed_edgelist_weights).begin() + - ((*h_reversed_displacements)[i] + (*h_reversed_counts)[i]), - (*coarsened_edgelist_weights)[i].begin() + h_concatenated_counts[i]); - } + (*reversed_edgelist_weights).begin(), + (*reversed_edgelist_weights).end(), + (*concatenated_edgelist_weights).begin() + output_offset); + (*reversed_edgelist_weights).resize(0, handle.get_stream()); + (*reversed_edgelist_weights).shrink_to_fit(handle.get_stream()); } } - // 4. find unique labels for this GPU + // 3. find unique labels for this GPU rmm::device_uvector unique_labels(graph_view.get_number_of_local_vertices(), handle.get_stream()); @@ -559,56 +495,22 @@ coarsen_graph( thrust::unique(handle.get_thrust_policy(), unique_labels.begin(), unique_labels.end())), handle.get_stream()); - // 5. renumber + // 4. create a graph - rmm::device_uvector renumber_map_labels(0, handle.get_stream()); - renumber_meta_t meta{}; - { - std::vector major_ptrs(coarsened_edgelist_majors.size()); - std::vector minor_ptrs(major_ptrs.size()); - std::vector counts(major_ptrs.size()); - for (size_t i = 0; i < coarsened_edgelist_majors.size(); ++i) { - major_ptrs[i] = coarsened_edgelist_majors[i].data(); - minor_ptrs[i] = coarsened_edgelist_minors[i].data(); - counts[i] = static_cast(coarsened_edgelist_majors[i].size()); - } - std::tie(renumber_map_labels, meta) = renumber_edgelist( + auto [coarsened_graph, renumber_map] = + create_graph_from_edgelist( handle, - std::optional>{std::move(unique_labels)}, - major_ptrs, - minor_ptrs, - counts, - std::nullopt, + std::move(unique_labels), + store_transposed ? std::move(concatenated_edgelist_minors) + : std::move(concatenated_edgelist_majors), + store_transposed ? std::move(concatenated_edgelist_majors) + : std::move(concatenated_edgelist_minors), + std::move(concatenated_edgelist_weights), + graph_properties_t{graph_view.is_symmetric(), false}, + true, do_expensive_check); - } - - // 6. build a graph - - std::vector> edgelists{}; - edgelists.resize(graph_view.get_number_of_local_adj_matrix_partitions()); - for (size_t i = 0; i < edgelists.size(); ++i) { - edgelists[i].p_src_vertices = - store_transposed ? coarsened_edgelist_minors[i].data() : coarsened_edgelist_majors[i].data(); - edgelists[i].p_dst_vertices = - store_transposed ? coarsened_edgelist_majors[i].data() : coarsened_edgelist_minors[i].data(); - edgelists[i].p_edge_weights = - coarsened_edgelist_weights - ? 
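#if 0  // Illustrative sketch (not part of this patch): the append-and-free idiom the new code
// above uses in place of the old per-partition split buffers and their host-side
// displacement bookkeeping. Growing the destination once and releasing each source buffer
// right after its copy keeps at most one extra column alive at a time. Names are
// hypothetical; the rmm/thrust calls mirror the ones in the patch.
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/copy.h>

template <typename T>
void append_and_free(rmm::device_uvector<T>& dst,
                     rmm::device_uvector<T>& src,
                     rmm::cuda_stream_view stream)
{
  auto output_offset = dst.size();
  dst.resize(dst.size() + src.size(), stream);  // grow the destination once
  thrust::copy(rmm::exec_policy(stream), src.begin(), src.end(), dst.begin() + output_offset);
  src.resize(0, stream);      // eagerly release the source buffer
  src.shrink_to_fit(stream);  // so peak memory stays low
}
#endif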
std::optional{(*coarsened_edgelist_weights)[i].data()} - : std::nullopt, - edgelists[i].number_of_edges = static_cast(coarsened_edgelist_majors[i].size()); - } - return std::make_tuple( - std::make_unique>( - handle, - edgelists, - graph_meta_t{ - meta.number_of_vertices, - meta.number_of_edges, - graph_properties_t{graph_view.is_symmetric(), false}, - meta.partition, - meta.segment_offsets}), - std::move(renumber_map_labels)); + return std::make_tuple(std::move(coarsened_graph), std::move(*renumber_map)); } // single-GPU version @@ -617,10 +519,9 @@ template -std::enable_if_t< - !multi_gpu, - std::tuple>, - rmm::device_uvector>> +std::enable_if_t, + rmm::device_uvector>> coarsen_graph( raft::handle_t const& handle, graph_view_t const& graph_view, @@ -710,33 +611,20 @@ coarsen_graph( thrust::unique(handle.get_thrust_policy(), unique_labels.begin(), unique_labels.end())), handle.get_stream()); - auto [renumber_map_labels, meta] = renumber_edgelist( - handle, - std::optional>{std::move(unique_labels)}, - coarsened_edgelist_majors.data(), - coarsened_edgelist_minors.data(), - static_cast(coarsened_edgelist_majors.size()), - do_expensive_check); - - edgelist_t edgelist{}; - edgelist.p_src_vertices = - store_transposed ? coarsened_edgelist_minors.data() : coarsened_edgelist_majors.data(); - edgelist.p_dst_vertices = - store_transposed ? coarsened_edgelist_majors.data() : coarsened_edgelist_minors.data(); - edgelist.p_edge_weights = coarsened_edgelist_weights - ? std::optional{(*coarsened_edgelist_weights).data()} - : std::nullopt; - edgelist.number_of_edges = static_cast(coarsened_edgelist_majors.size()); - - return std::make_tuple( - std::make_unique>( + auto [coarsened_graph, renumber_map] = + create_graph_from_edgelist( handle, - edgelist, - graph_meta_t{ - static_cast(renumber_map_labels.size()), - graph_properties_t{graph_view.is_symmetric(), false}, - meta.segment_offsets}), - std::move(renumber_map_labels)); + std::optional>{std::move(unique_labels)}, + store_transposed ? std::move(coarsened_edgelist_minors) + : std::move(coarsened_edgelist_majors), + store_transposed ? 
std::move(coarsened_edgelist_majors) + : std::move(coarsened_edgelist_minors), + std::move(coarsened_edgelist_weights), + graph_properties_t{graph_view.is_symmetric(), false}, + true, + do_expensive_check); + + return std::make_tuple(std::move(coarsened_graph), std::move(*renumber_map)); } } // namespace detail @@ -746,7 +634,7 @@ template -std::tuple>, +std::tuple, rmm::device_uvector> coarsen_graph( raft::handle_t const& handle, From 3bdb95eb5b8a1c53255afcc25c1902a57534776e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 18 Feb 2022 17:56:09 -0800 Subject: [PATCH 51/60] additional (potential) parallelism vs peak memory trade-off in edge shuffling --- cpp/src/detail/shuffle_wrappers.cu | 51 +++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu index 6e9434882ba..7d2f2453d08 100644 --- a/cpp/src/detail/shuffle_wrappers.cu +++ b/cpp/src/detail/shuffle_wrappers.cu @@ -84,9 +84,33 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, handle.get_stream()); handle.sync_stream(); - std::forward_as_tuple( - std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors, d_rx_edgelist_weights), std::ignore) = - shuffle_values(comm, edge_first, h_tx_value_counts, handle.get_stream()); + if (d_edgelist_majors.size() > + mem_frugal_threshold) { // trade-off potential parallelism to lower peak memory + std::tie(d_rx_edgelist_majors, std::ignore) = + shuffle_values(comm, d_edgelist_majors.begin(), h_tx_value_counts, handle.get_stream()); + d_edgelist_majors.resize(0, handle.get_stream()); + d_edgelist_majors.shrink_to_fit(handle.get_stream()); + + std::tie(d_rx_edgelist_minors, std::ignore) = + shuffle_values(comm, d_edgelist_minors.begin(), h_tx_value_counts, handle.get_stream()); + d_edgelist_minors.resize(0, handle.get_stream()); + d_edgelist_minors.shrink_to_fit(handle.get_stream()); + + std::tie(d_rx_edgelist_weights, std::ignore) = + shuffle_values(comm, (*d_edgelist_weights).begin(), h_tx_value_counts, handle.get_stream()); + (*d_edgelist_weights).resize(0, handle.get_stream()); + (*d_edgelist_weights).shrink_to_fit(handle.get_stream()); + } else { + std::forward_as_tuple( + std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors, d_rx_edgelist_weights), std::ignore) = + shuffle_values(comm, edge_first, h_tx_value_counts, handle.get_stream()); + d_edgelist_majors.resize(0, handle.get_stream()); + d_edgelist_majors.shrink_to_fit(handle.get_stream()); + d_edgelist_minors.resize(0, handle.get_stream()); + d_edgelist_minors.shrink_to_fit(handle.get_stream()); + (*d_edgelist_weights).resize(0, handle.get_stream()); + (*d_edgelist_weights).shrink_to_fit(handle.get_stream()); + } } else { auto edge_first = thrust::make_zip_iterator( thrust::make_tuple(d_edgelist_majors.begin(), d_edgelist_minors.begin())); @@ -110,8 +134,25 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, handle.get_stream()); handle.sync_stream(); - std::forward_as_tuple(std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors), std::ignore) = - shuffle_values(comm, edge_first, h_tx_value_counts, handle.get_stream()); + if (d_edgelist_majors.size() > + mem_frugal_threshold) { // trade-off potential parallelism to lower peak memory + std::tie(d_rx_edgelist_majors, std::ignore) = + shuffle_values(comm, d_edgelist_majors.begin(), h_tx_value_counts, handle.get_stream()); + d_edgelist_majors.resize(0, handle.get_stream()); + d_edgelist_majors.shrink_to_fit(handle.get_stream()); + + std::tie(d_rx_edgelist_minors, std::ignore) 
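#if 0  // Illustrative sketch (not part of this patch): rough peak-memory accounting behind
// the mem_frugal branch above. One zipped shuffle keeps all three send columns and all
// three receive columns alive at once (~6 columns of E entries each); shuffling one column
// at a time and freeing each send buffer immediately caps the worst step at ~4 columns, at
// the cost of three serialized collectives instead of one (the "potential parallelism"
// being traded away). The sizes below are hypothetical.
#include <cstddef>
#include <cstdio>

int main()
{
  std::size_t num_edges = std::size_t{1} << 30;       // assumed per-GPU edge count
  std::size_t col_bytes = num_edges * 8;              // 8-byte majors/minors/weights
  std::size_t zipped_peak     = (3 + 3) * col_bytes;  // send + recv columns, all at once
  std::size_t per_column_peak = (3 + 1) * col_bytes;  // worst single per-column step
  std::printf("zipped peak ~%zu GiB, per-column peak ~%zu GiB\n",
              zipped_peak >> 30,
              per_column_peak >> 30);
  return 0;
}
#endif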
= + shuffle_values(comm, d_edgelist_minors.begin(), h_tx_value_counts, handle.get_stream()); + d_edgelist_minors.resize(0, handle.get_stream()); + d_edgelist_minors.shrink_to_fit(handle.get_stream()); + } else { + std::forward_as_tuple(std::tie(d_rx_edgelist_majors, d_rx_edgelist_minors), std::ignore) = + shuffle_values(comm, edge_first, h_tx_value_counts, handle.get_stream()); + d_edgelist_majors.resize(0, handle.get_stream()); + d_edgelist_majors.shrink_to_fit(handle.get_stream()); + d_edgelist_minors.resize(0, handle.get_stream()); + d_edgelist_minors.shrink_to_fit(handle.get_stream()); + } } return std::make_tuple(std::move(d_rx_edgelist_majors), From edd7cddd23c53d15167904dc4030d15fbfe21d89 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 19 Feb 2022 11:11:05 -0800 Subject: [PATCH 52/60] bug fix --- cpp/include/cugraph/utilities/shuffle_comm.cuh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 309a30c78e2..cfed6c33dd3 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -125,6 +125,13 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, return std::make_tuple(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks); } +template +struct key_group_id_less_t { + KeyToGroupIdOp key_to_group_id_op{}; + int pivot{}; + __device__ bool operator()(key_type k) const { return key_to_group_id_op(k) < pivot; } +}; + template struct value_group_id_less_t { ValueToGroupIdOp value_to_group_id_op{}; @@ -231,9 +238,8 @@ std::tuple mem_frugal_partition( rmm::exec_policy(stream_view), key_first, key_last, - kv_pair_group_id_less_t::value_type, - typename thrust::iterator_traits::value_type, - KeyToGroupIdOp>{key_to_group_id_op, pivot})); + key_group_id_less_t::value_type, KeyToGroupIdOp>{ + key_to_group_id_op, pivot})); auto second_size = num_elements - first_size; auto tmp_key_buffer = From ff12636896ef34af7b8cd76935d2ef356211ebf2 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 20 Feb 2022 12:58:54 -0800 Subject: [PATCH 53/60] fix for a possible hang --- cpp/src/detail/shuffle_wrappers.cu | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cpp/src/detail/shuffle_wrappers.cu b/cpp/src/detail/shuffle_wrappers.cu index 7d2f2453d08..26bdd21a1f9 100644 --- a/cpp/src/detail/shuffle_wrappers.cu +++ b/cpp/src/detail/shuffle_wrappers.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -51,6 +52,12 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, auto mem_frugal_threshold = static_cast(static_cast(total_global_mem / element_size) * mem_frugal_ratio); + auto mem_frugal_flag = + host_scalar_allreduce(comm, + d_edgelist_majors.size() > mem_frugal_threshold ? 
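#if 0  // Illustrative sketch (not part of this patch): the predicate fix from the "bug fix"
// commit above, reduced to a toy. thrust::count_if over a bare key range needs a *unary*
// functor on the key type; the kv_pair_group_id_less_t it replaces dereferences a
// (key, value) tuple and is only correct for zipped key/value iterators. Types are
// simplified to int here for brevity.
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

#include <vector>

struct mod_op_t {  // hypothetical key -> group id mapping
  int num_groups{};
  __host__ __device__ int operator()(int key) const { return key % num_groups; }
};

struct key_group_id_less_t {  // unary: works on keys alone
  mod_op_t key_to_group_id_op{};
  int pivot{};
  __host__ __device__ bool operator()(int k) const { return key_to_group_id_op(k) < pivot; }
};

int main()
{
  std::vector<int> h_keys{0, 1, 2, 3, 4, 5, 6, 7};
  thrust::device_vector<int> keys(h_keys.begin(), h_keys.end());
  auto first_size = thrust::count_if(
    thrust::device, keys.begin(), keys.end(), key_group_id_less_t{mod_op_t{4}, 2});
  return static_cast<int>(first_size);  // keys with group id in {0, 1}: 0, 1, 4, 5 -> 4
}
#endif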
int{1} : int{0}, + raft::comms::op_t::MAX, + handle.get_stream()); + // invoke groupby_and_count and shuffle values to pass mem_frugal_threshold instead of directly // calling groupby_gpu_id_and_shuffle_values there is no benefit in reducing peak memory as we // need to allocate a receive buffer anyways) but this reduces the maximum memory allocation size @@ -84,8 +91,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, handle.get_stream()); handle.sync_stream(); - if (d_edgelist_majors.size() > - mem_frugal_threshold) { // trade-off potential parallelism to lower peak memory + if (mem_frugal_flag) { // trade-off potential parallelism to lower peak memory std::tie(d_rx_edgelist_majors, std::ignore) = shuffle_values(comm, d_edgelist_majors.begin(), h_tx_value_counts, handle.get_stream()); d_edgelist_majors.resize(0, handle.get_stream()); @@ -134,8 +140,7 @@ shuffle_edgelist_by_gpu_id(raft::handle_t const& handle, handle.get_stream()); handle.sync_stream(); - if (d_edgelist_majors.size() > - mem_frugal_threshold) { // trade-off potential parallelism to lower peak memory + if (mem_frugal_flag) { // trade-off potential parallelism to lower peak memory std::tie(d_rx_edgelist_majors, std::ignore) = shuffle_values(comm, d_edgelist_majors.begin(), h_tx_value_counts, handle.get_stream()); d_edgelist_majors.resize(0, handle.get_stream()); From a34ad17c3450efa46b440dc9a6f539cc3656b0d1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 23 Feb 2022 16:25:20 -0800 Subject: [PATCH 54/60] added temporary code to experiment performance --- .../copy_v_transform_reduce_in_out_nbr.cuh | 338 ++++++++++++++++++ cpp/src/link_analysis/pagerank_impl.cuh | 32 +- .../create_graph_from_edgelist_impl.cuh | 15 + cpp/tests/link_analysis/mg_pagerank_test.cpp | 8 + 4 files changed, 391 insertions(+), 2 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh index 63f1aae6c8a..7a47803726d 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh @@ -35,6 +35,8 @@ #include #include +#include // FIXME: delete + #include #include #include @@ -490,11 +492,19 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, col_properties_t> minor_tmp_buffer{}; // relevant only when (GraphViewType::is_multi_gpu && !update_major if constexpr (GraphViewType::is_multi_gpu && !update_major) { +#if 1 // FIXME: delete + handle.sync_stream(); + std::cout << "copy_v allocate minor_tmp_buffer" << std::endl; +#endif if constexpr (GraphViewType::is_adj_matrix_transposed) { minor_tmp_buffer = row_properties_t(handle, graph_view); } else { minor_tmp_buffer = col_properties_t(handle, graph_view); } +#if 1 // FIXME: delete + handle.sync_stream(); + std::cout << "copy_v allocate minor_tmp_buffer: SUCCESS" << std::endl; +#endif } if constexpr (!update_major) { @@ -577,6 +587,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, stream_pool_indices = std::vector(num_streams); std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); handle.sync_stream(); + std::cout << "copy_v num_streams=" << num_streams << std::endl; // FIXME: delete } } } @@ -615,6 +626,328 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, if (stream_pool_indices) { handle.sync_stream(); } +#if 1 // FIXME: delete, just experimenting (to better separate compute time vs reduction time + if (stream_pool_indices) { + auto 
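#if 0  // Illustrative sketch (not part of this patch): why mem_frugal_flag is allreduced in
// the "fix for a possible hang" commit above. shuffle_values() is collective, so every rank
// must issue the same sequence of calls; if each rank branched on its *local* edge count,
// one rank could issue three per-column shuffles while a peer issues a single zipped
// shuffle, and the mismatched collectives would deadlock. Reducing the local predicate with
// MAX sends every rank down the memory-frugal path whenever any single rank needs it.
// This restates the call above with the intermediate flag named explicitly.
auto local_flag      = d_edgelist_majors.size() > mem_frugal_threshold ? int{1} : int{0};
auto mem_frugal_flag = host_scalar_allreduce(
  comm, local_flag, raft::comms::op_t::MAX, handle.get_stream());  // same value on all ranks
#endif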
num_concurrent_loops = (*stream_pool_indices).size() / max_segments; + auto num_rounds = + (graph_view.get_number_of_local_adj_matrix_partitions() + (num_concurrent_loops - 1)) / + num_concurrent_loops; + for (size_t round = 0; round < num_rounds; ++round) { + /* computing */ + +#if 1 // FIXME: delete + if constexpr (GraphViewType::is_multi_gpu) { handle.get_comms().barrier(); } + auto core_time0 = std::chrono::steady_clock::now(); +#endif + for (size_t i = num_concurrent_loops * round; + i < std::min(static_cast(graph_view.get_number_of_local_adj_matrix_partitions()), + num_concurrent_loops * (round + 1)); + ++i) { + auto matrix_partition = + matrix_partition_device_view_t( + graph_view.get_matrix_partition_view(i)); + + auto major_init = T{}; + if constexpr (update_major) { + if constexpr (GraphViewType::is_multi_gpu) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + major_init = (static_cast(i) == col_comm_rank) ? init : T{}; + } else { + major_init = init; + } + } + + auto matrix_partition_row_value_input = adj_matrix_row_value_input; + auto matrix_partition_col_value_input = adj_matrix_col_value_input; + if constexpr (GraphViewType::is_adj_matrix_transposed) { + matrix_partition_col_value_input.set_local_adj_matrix_partition_idx(i); + } else { + matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i); + } + + auto major_buffer_first = + get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); + + std::conditional_t, + VertexValueOutputIterator> + output_buffer{}; + if constexpr (GraphViewType::is_multi_gpu) { + if constexpr (update_major) { + output_buffer = major_buffer_first; + } else { + output_buffer = minor_tmp_buffer.mutable_device_view(); + } + } else { + output_buffer = vertex_value_output_first; + } + + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + if (segment_offsets) { + static_assert(detail::num_sparse_segments_per_vertex_partition == 3); + + // FIXME: we may further improve performance by 1) individually tuning block sizes for + // different segments; and 2) adding one more segment for very high degree vertices and + // running segmented reduction + if (matrix_partition.get_dcs_nzd_vertex_count()) { + auto exec_stream = + stream_pool_indices + ? 
rmm::cuda_stream_view{pool_streams[(i * max_segments) % + (*stream_pool_indices).size()]} + /* FIXME for temporary testing, + handle.get_stream_from_stream_pool((i * max_segments) % + (*stream_pool_indices).size()) */ + : handle.get_stream(); + if constexpr (update_major) { // this is necessary as we don't visit every vertex in + // the hypersparse segment in + // for_all_major_for_all_nbr_hypersparse + thrust::fill(rmm::exec_policy(exec_stream), + output_buffer + (*segment_offsets)[3], + output_buffer + (*segment_offsets)[4], + major_init); + } + if (*(matrix_partition.get_dcs_nzd_vertex_count()) > 0) { + raft::grid_1d_thread_t update_grid( + *(matrix_partition.get_dcs_nzd_vertex_count()), + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[3]; } + detail::for_all_major_for_all_nbr_hypersparse + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[3], + matrix_partition_row_value_input, + matrix_partition_col_value_input, + segment_output_buffer, + e_op, + major_init); + } + } + if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + auto exec_stream = + stream_pool_indices + ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 1) % + (*stream_pool_indices).size()]} + /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * + max_segments + 1) % (*stream_pool_indices).size()) */ + : handle.get_stream(); + raft::grid_1d_thread_t update_grid( + (*segment_offsets)[3] - (*segment_offsets)[2], + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[2]; } + detail::for_all_major_for_all_nbr_low_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[2], + matrix_partition.get_major_first() + (*segment_offsets)[3], + matrix_partition_row_value_input, + matrix_partition_col_value_input, + segment_output_buffer, + e_op, + major_init); + } + if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + auto exec_stream = + stream_pool_indices + ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 2) % + (*stream_pool_indices).size()]} + /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * + max_segments + 2) % (*stream_pool_indices).size()) */ + : handle.get_stream(); + raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[1]; } + detail::for_all_major_for_all_nbr_mid_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first() + (*segment_offsets)[1], + matrix_partition.get_major_first() + (*segment_offsets)[2], + matrix_partition_row_value_input, + matrix_partition_col_value_input, + segment_output_buffer, + e_op, + major_init); + } + if ((*segment_offsets)[1] > 0) { + auto exec_stream = + stream_pool_indices + ? 
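#if 0  // Illustrative sketch (not part of this patch): the per-segment launch granularity
// used by the kernels around here. Majors are bucketed by degree, and each bucket gets a
// matching work unit: hypersparse/low-degree majors get one *thread* each
// (grid_1d_thread_t), mid-degree majors one *warp* each (grid_1d_warp_t), and high-degree
// majors a whole *block* each (grid_1d_block_t), so a few huge adjacency lists cannot
// stall a thread-per-vertex kernel. The kernel name below is hypothetical; the grid types
// and launch form mirror the patch.
raft::grid_1d_warp_t mid_grid(num_mid_degree_majors, block_size, max_grid_size);
my_warp_per_major_kernel<<<mid_grid.num_blocks, mid_grid.block_size, 0, stream>>>(/*...*/);
#endif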
rmm::cuda_stream_view{pool_streams[(i * max_segments + 3) % + (*stream_pool_indices).size()]} + /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * + max_segments + 3) % (*stream_pool_indices).size()) */ + : handle.get_stream(); + raft::grid_1d_block_t update_grid( + (*segment_offsets)[1], + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_high_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first(), + matrix_partition.get_major_first() + (*segment_offsets)[1], + matrix_partition_row_value_input, + matrix_partition_col_value_input, + output_buffer, + e_op, + major_init); + } + } else { + if (matrix_partition.get_major_size() > 0) { + raft::grid_1d_thread_t update_grid( + matrix_partition.get_major_size(), + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + detail::for_all_major_for_all_nbr_low_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first(), + matrix_partition.get_major_last(), + matrix_partition_row_value_input, + matrix_partition_col_value_input, + output_buffer, + e_op, + major_init); + } + } + } + +#if 1 // FIXME: for temporary testing + for (size_t i = 0; i < pool_streams.size(); ++i) { + CUDA_TRY(cudaStreamSynchronize(pool_streams[i])); + } +#else + handle.sync_stream_pool(*stream_pool_indices); +#endif + + /* communication */ + +#if 1 // FIXME: delete + auto core_time1 = std::chrono::steady_clock::now(); + if constexpr (GraphViewType::is_multi_gpu) { handle.get_comms().barrier(); } + auto core_time2 = std::chrono::steady_clock::now(); +#endif + if constexpr (GraphViewType::is_multi_gpu && update_major) { + ncclGroupStart(); // SIMPLE + for (size_t i = num_concurrent_loops * round; + i < + std::min(static_cast(graph_view.get_number_of_local_adj_matrix_partitions()), + num_concurrent_loops * (round + 1)); + ++i) { +#if 1 // SIMPLE + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); +#else + auto& comm = handle.get_comms(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); +#endif + + auto matrix_partition = + matrix_partition_device_view_t( + graph_view.get_matrix_partition_view(i)); + + auto major_buffer_first = + get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); + +#if 1 // SIMPLE + device_reduce(col_comm, + major_buffer_first, + vertex_value_output_first, + matrix_partition.get_major_size(), + raft::comms::op_t::SUM, + i, + handle.get_stream()); +#else + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + if (segment_offsets && stream_pool_indices) { + if ((*segment_offsets).back() - (*segment_offsets)[3] > 0) { + device_reduce(col_comm, + major_buffer_first + (*segment_offsets)[3], + vertex_value_output_first + (*segment_offsets)[3], + (*segment_offsets).back() - (*segment_offsets)[3], + raft::comms::op_t::SUM, + i, + pool_streams[(i * max_segments) % (*stream_pool_indices).size()]/* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) */); + } + if ((*segment_offsets)[3] - 
(*segment_offsets)[2] > 0) { + device_reduce(col_comm, + major_buffer_first + (*segment_offsets)[2], + vertex_value_output_first + (*segment_offsets)[2], + (*segment_offsets)[3] - (*segment_offsets)[2], + raft::comms::op_t::SUM, + i, + pool_streams[(i * max_segments + 1) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()) */); + } + if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + device_reduce(col_comm, + major_buffer_first + (*segment_offsets)[1], + vertex_value_output_first + (*segment_offsets)[1], + (*segment_offsets)[2] - (*segment_offsets)[1], + raft::comms::op_t::SUM, + i, + pool_streams[(i * max_segments + 2) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()) */); + } + if ((*segment_offsets)[1] > 0) { + device_reduce(col_comm, + major_buffer_first, + vertex_value_output_first, + (*segment_offsets)[1], + raft::comms::op_t::SUM, + i, + pool_streams[(i * max_segments + 3) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()) */); + } + } else { + device_reduce(col_comm, + major_buffer_first, + vertex_value_output_first, + matrix_partition.get_major_size(), + raft::comms::op_t::SUM, + i, + handle.get_stream()); + } +#endif + } + ncclGroupEnd(); // SIMPLE + } + +#if 1 // SIMPLE + handle.sync_stream(); // SIMPLE +#else +#if 1 // FIXME: for temporary testing + for (size_t i = 0; i < pool_streams.size(); ++i) { + CUDA_TRY(cudaStreamSynchronize(pool_streams[i])); + } +#else + handle.sync_stream_pool(*stream_pool_indices); +#endif +#endif +#if 1 // FIXME: delete + auto core_time3 = std::chrono::steady_clock::now(); + if constexpr (GraphViewType::is_multi_gpu) { handle.get_comms().barrier(); } + auto core_time4 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_total = core_time4 - core_time0; + std::chrono::duration elapsed0 = core_time1 - core_time0; + std::chrono::duration elapsed1 = core_time2 - core_time1; + std::chrono::duration elapsed2 = core_time3 - core_time2; + std::chrono::duration elapsed3 = core_time4 - core_time3; + std::cout << "copy_v core took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << ") ms." 
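#if 0  // Illustrative sketch (not part of this patch): the ncclGroupStart()/ncclGroupEnd()
// bracketing used in the experimental reduction loop above. Each device_reduce() maps to
// an NCCL reduction; issued naked, they run as one blocking collective per matrix
// partition, while grouping them lets NCCL aggregate and overlap the reductions. The
// `send_firsts`/`recv_firsts`/`counts` arrays are hypothetical; the device_reduce
// signature mirrors the patch.
ncclGroupStart();
for (size_t i = first_partition; i < last_partition; ++i) {
  device_reduce(col_comm,
                send_firsts[i],
                recv_firsts[i],
                counts[i],
                raft::comms::op_t::SUM,
                static_cast<int>(i),  // reduction root: the rank that owns partition i
                handle.get_stream());
}
ncclGroupEnd();
#endif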
<< std::endl; +#endif + } + } else { + CUGRAPH_FAIL("should not be reached."); + } +#else for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { auto matrix_partition = matrix_partition_device_view_t( @@ -826,6 +1159,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } } } +#endif if (stream_pool_indices) { #if 1 // FIXME: for temporary testing @@ -838,6 +1172,10 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } if constexpr (GraphViewType::is_multi_gpu && !update_major) { +#if 1 // FIXME: delete + handle.sync_stream(); + std::cout << "minor reduction" << std::endl; +#endif auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index e346a6892b9..eec2126aea7 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -55,6 +55,12 @@ void pagerank( bool has_initial_guess, bool do_expensive_check) { +#if 1 // FIXME: delete +size_t free_size{}; +size_t total_size{}; +CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); +std::cout << "PageRank start free_size=" << static_cast(free_size) / (1024.0 * 1024.0 * 1024.0) << " GB total_size=" << static_cast(total_size) / (1024.0 * 1024.0 * 1024.0) << " GB." << std::endl; +#endif using vertex_t = typename GraphViewType::vertex_type; using weight_t = typename GraphViewType::weight_type; @@ -78,6 +84,9 @@ void pagerank( : vertex_t{0}; // 1. check input arguments +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "PageRank check inputs" << std::endl; +#endif CUGRAPH_EXPECTS((personalization_vertices.has_value() == false) || (personalization_values.has_value() && personalization_vector_size.has_value()), @@ -142,6 +151,9 @@ void pagerank( } // 2. compute the sums of the out-going edge weights (if not provided) +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "PageRank compute out_weight_sums" << std::endl; +#endif auto tmp_vertex_out_weight_sums = precomputed_vertex_out_weight_sums ? std::nullopt @@ -152,6 +164,9 @@ void pagerank( : (*tmp_vertex_out_weight_sums).data(); // 3. initialize pagerank values +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "PageRank initialize PageRank values" << std::endl; +#endif if (has_initial_guess) { auto sum = reduce_v(handle, pull_graph_view, pageranks, result_t{0.0}); @@ -171,6 +186,9 @@ void pagerank( } // 4. sum the personalization values +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "PageRank sum personalization values" << std::endl; +#endif result_t personalization_sum{0.0}; if (aggregate_personalization_vector_size > 0) { @@ -185,6 +203,9 @@ void pagerank( } // 5. 
pagerank iteration +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "PageRank iteration" << std::endl; +#endif // old PageRank values rmm::device_uvector old_pageranks(pull_graph_view.get_number_of_local_vertices(), @@ -197,6 +218,7 @@ void pagerank( if constexpr (GraphViewType::is_multi_gpu) { handle.get_comms().barrier(); } + std::cout << "PageRank iteration " << iter << " start" << std::endl; auto time0 = std::chrono::steady_clock::now(); #endif thrust::copy(handle.get_thrust_policy(), @@ -282,6 +304,11 @@ void pagerank( #if 1 // FIXME: delete handle.sync_stream(); auto time4 = std::chrono::steady_clock::now(); + if constexpr (GraphViewType::is_multi_gpu) { + handle.get_comms().barrier(); + } + handle.sync_stream(); + auto time5 = std::chrono::steady_clock::now(); #endif auto diff_sum = transform_reduce_v( handle, @@ -292,13 +319,14 @@ void pagerank( #if 1 // FIXME: delete handle.sync_stream(); - auto time5 = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_total = time5 - time0; + auto time6 = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_total = time6 - time0; std::chrono::duration elapsed0 = time1 - time0; std::chrono::duration elapsed1 = time2 - time1; std::chrono::duration elapsed2 = time3 - time2; std::chrono::duration elapsed3 = time4 - time3; std::chrono::duration elapsed4 = time5 - time4; + std::chrono::duration elapsed5 = time6 - time5; std::cout << "PageRank iter " << iter << " took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << ") ms." << std::endl; #endif iter++; diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 0b711936ee0..88cba4cef43 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -218,6 +218,9 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, // 1. groupby edges to their target local adjacency matrix partition (and further groupby within // the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex IDs). +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "create_graph 0" << std::endl; +#endif auto edge_counts = cugraph::detail::groupby_and_count_edgelist_by_local_partition_id( handle, @@ -225,6 +228,9 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, store_transposed ? edgelist_rows : edgelist_cols, edgelist_weights, true); +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "create_graph 0a" << std::endl; +#endif std::vector h_edge_counts(edge_counts.size()); raft::update_host( @@ -249,6 +255,9 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, edgelist_displacements.begin() + 1); // 2. split the input edges to local partitions +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "create_graph 1" << std::endl; +#endif std::vector> edgelist_src_partitions{}; edgelist_src_partitions.reserve(col_comm_size); @@ -294,6 +303,9 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, } // 2. 
renumber +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "create_graph 2" << std::endl; +#endif std::vector major_ptrs(col_comm_size); std::vector minor_ptrs(major_ptrs.size()); @@ -310,6 +322,9 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, minor_ptrs, edgelist_edge_counts, edgelist_intra_partition_segment_offsets); +#if 1 // FIXME: delete +handle.sync_stream(); std::cout << "create_graph 3" << std::endl; +#endif // 3. create a graph diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp index c2a9bf74e2e..9b3876b4e5d 100644 --- a/cpp/tests/link_analysis/mg_pagerank_test.cpp +++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp @@ -82,12 +82,20 @@ class Tests_MGPageRank CUGRAPH_EXPECTS((comm_size % num_gpus_per_node) == 0, "Invalid MPI configuration: in multi-node execution, # MPI processes should " "be a multiple of the number of GPUs per node."); +#if 1 + auto col_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % col_comm_size != 0) { + --col_comm_size; + } + row_comm_size = comm_size / col_comm_size; +#else auto num_nodes = comm_size / num_gpus_per_node; row_comm_size = static_cast(sqrt(static_cast(num_nodes))); while (num_nodes % row_comm_size != 0) { --row_comm_size; } row_comm_size *= num_gpus_per_node; +#endif } else { row_comm_size = static_cast(sqrt(static_cast(comm_size))); while (comm_size % row_comm_size != 0) { From b1e56e7cea8a537a070702b0c67b43015190da87 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 23 Feb 2022 16:30:12 -0800 Subject: [PATCH 55/60] additional cut in peak memory and maximum single allocation size (to avoid malloc failure due to fragmentation with the pool allocator) --- .../cugraph/utilities/shuffle_comm.cuh | 182 +++++++++++++++++- cpp/src/structure/renumber_edgelist_impl.cuh | 2 +- 2 files changed, 179 insertions(+), 5 deletions(-) diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index cfed6c33dd3..f8d2280628a 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -166,6 +166,135 @@ struct kv_pair_group_id_greater_equal_t { } }; +template +void swap_partitions(ValueIterator value_first, + ValueIterator value_last, + size_t first_partition_size, + rmm::cuda_stream_view stream_view) +{ + auto num_elements = static_cast(thrust::distance(value_first, value_last)); + auto second_partition_size = num_elements - first_partition_size; + if (first_partition_size >= second_partition_size) { + auto tmp_value_buffer = + allocate_dataframe_buffer::value_type>( + first_partition_size, stream_view); + + thrust::copy(rmm::exec_policy(stream_view), + value_first, + value_first + first_partition_size, + get_dataframe_buffer_begin(tmp_value_buffer)); + + thrust::copy(rmm::exec_policy(stream_view), + value_first + first_partition_size, + value_first + num_elements, + value_first); + + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_begin(tmp_value_buffer), + get_dataframe_buffer_end(tmp_value_buffer), + value_first + second_partition_size); + } else { + auto tmp_value_buffer = + allocate_dataframe_buffer::value_type>( + second_partition_size, stream_view); + + thrust::copy(rmm::exec_policy(stream_view), + value_first + first_partition_size, + value_first + num_elements, + get_dataframe_buffer_begin(tmp_value_buffer)); + + thrust::copy(rmm::exec_policy(stream_view), + value_first, + value_first + first_partition_size, + 
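#if 0  // Illustrative sketch (not part of this patch): the half-and-half trick that
// swap_partitions() above enables in mem_frugal_groupby below, shown with host STL.
// Partition each half of the range independently (each pass then needs only a half-sized
// scratch buffer), and rotate the middle so the two "< pivot" runs become contiguous; the
// whole range ends up partitioned while the largest temporary allocation is halved.
#include <algorithm>
#include <vector>

void two_half_partition(std::vector<int>& v, int pivot)
{
  auto half = v.begin() + v.size() / 2;
  auto less = [pivot](int x) { return x < pivot; };
  auto first_split  = std::partition(v.begin(), half, less);  // [begin, first_split) < pivot
  auto second_split = std::partition(half, v.end(), less);    // [half, second_split) < pivot
  // swap the first half's ">= pivot" tail with the second half's "< pivot" head
  std::rotate(first_split, half, second_split);
}
#endif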
value_first + (num_elements - first_partition_size)); + + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_begin(tmp_value_buffer), + get_dataframe_buffer_end(tmp_value_buffer), + value_first); + } +} + +template +void swap_partitions(KeyIterator key_first, + KeyIterator key_last, + ValueIterator value_first, + size_t first_partition_size, + rmm::cuda_stream_view stream_view) +{ + auto num_elements = static_cast(thrust::distance(key_first, key_last)); + auto second_partition_size = num_elements - first_partition_size; + if (first_partition_size >= second_partition_size) { + auto tmp_key_buffer = + allocate_dataframe_buffer::value_type>( + first_partition_size, stream_view); + auto tmp_value_buffer = + allocate_dataframe_buffer::value_type>( + first_partition_size, stream_view); + + thrust::copy(rmm::exec_policy(stream_view), + key_first, + key_first + first_partition_size, + get_dataframe_buffer_begin(tmp_key_buffer)); + thrust::copy(rmm::exec_policy(stream_view), + value_first, + value_first + first_partition_size, + get_dataframe_buffer_begin(tmp_value_buffer)); + + thrust::copy(rmm::exec_policy(stream_view), + key_first + first_partition_size, + key_first + num_elements, + key_first); + thrust::copy(rmm::exec_policy(stream_view), + value_first + first_partition_size, + value_first + num_elements, + value_first); + + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_begin(tmp_key_buffer), + get_dataframe_buffer_end(tmp_key_buffer), + key_first + second_partition_size); + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_begin(tmp_value_buffer), + get_dataframe_buffer_end(tmp_value_buffer), + value_first + second_partition_size); + } else { + auto tmp_key_buffer = + allocate_dataframe_buffer::value_type>( + second_partition_size, stream_view); + auto tmp_value_buffer = + allocate_dataframe_buffer::value_type>( + second_partition_size, stream_view); + + thrust::copy(rmm::exec_policy(stream_view), + key_first + first_partition_size, + key_first + num_elements, + get_dataframe_buffer_begin(tmp_key_buffer)); + thrust::copy(rmm::exec_policy(stream_view), + value_first + first_partition_size, + value_first + num_elements, + get_dataframe_buffer_begin(tmp_value_buffer)); + + thrust::copy(rmm::exec_policy(stream_view), + key_first, + key_first + first_partition_size, + key_first + (num_elements - first_partition_size)); + thrust::copy(rmm::exec_policy(stream_view), + value_first, + value_first + first_partition_size, + value_first + (num_elements - first_partition_size)); + + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_begin(tmp_key_buffer), + get_dataframe_buffer_end(tmp_key_buffer), + key_first); + thrust::copy(rmm::exec_policy(stream_view), + get_dataframe_buffer_begin(tmp_value_buffer), + get_dataframe_buffer_end(tmp_value_buffer), + value_first); + } +} + // Use roughly half temporary buffer than thrust::partition (if first & second partition sizes are // comparable). 
This also uses multiple smaller allocations than one single allocation (thrust::sort // does this) of the same aggregate size if the input iterators are the zip iterators (this is more @@ -330,8 +459,28 @@ void mem_frugal_groupby( }); } } else { - auto second_first = mem_frugal_partition( - value_firsts[i], value_lasts[i], value_to_group_id_op, pivot, stream_view); + ValueIterator second_first{}; + auto num_elements = static_cast(thrust::distance(value_firsts[i], value_lasts[i])); + auto first_chunk_partition_first = mem_frugal_partition(value_firsts[i], + value_firsts[i] + num_elements / 2, + value_to_group_id_op, + pivot, + stream_view); + auto second_chunk_partition_first = mem_frugal_partition(value_firsts[i] + num_elements / 2, + value_lasts[i], + value_to_group_id_op, + pivot, + stream_view); + auto no_less_size = static_cast( + thrust::distance(first_chunk_partition_first, value_firsts[i] + num_elements / 2)); + auto less_size = static_cast( + thrust::distance(value_firsts[i] + num_elements / 2, second_chunk_partition_first)); + swap_partitions(value_firsts[i] + (num_elements / 2 - no_less_size), + value_firsts[i] + (num_elements / 2 + less_size), + no_less_size, + stream_view); + + second_first = value_firsts[i] + ((num_elements / 2 - no_less_size) + less_size); if (pivot - group_firsts[i] > 1) { group_firsts.push_back(group_firsts[i]); group_lasts.push_back(pivot); @@ -402,8 +551,33 @@ void mem_frugal_groupby( }); } } else { - auto second_first = mem_frugal_partition( - key_firsts[i], key_lasts[i], value_firsts[i], key_to_group_id_op, pivot, stream_view); + std::tuple second_first{}; + auto num_elements = static_cast(thrust::distance(key_firsts[i], key_lasts[i])); + auto first_chunk_partition_first = mem_frugal_partition(key_firsts[i], + key_firsts[i] + num_elements / 2, + value_firsts[i], + key_to_group_id_op, + pivot, + stream_view); + auto second_chunk_partition_first = mem_frugal_partition(key_firsts[i] + num_elements / 2, + key_lasts[i], + value_firsts[i] + num_elements / 2, + key_to_group_id_op, + pivot, + stream_view); + auto no_less_size = static_cast(thrust::distance( + std::get<0>(first_chunk_partition_first), key_firsts[i] + num_elements / 2)); + auto less_size = static_cast(thrust::distance( + key_firsts[i] + num_elements / 2, std::get<0>(second_chunk_partition_first))); + swap_partitions(key_firsts[i] + (num_elements / 2 - no_less_size), + key_firsts[i] + (num_elements / 2 + less_size), + value_firsts[i] + (num_elements / 2 - no_less_size), + no_less_size, + stream_view); + + second_first = + std::make_tuple(key_firsts[i] + ((num_elements / 2 - no_less_size) + less_size), + value_firsts[i] + ((num_elements / 2 - no_less_size) + less_size)); if (pivot - group_firsts[i] > 1) { group_firsts.push_back(group_firsts[i]); group_lasts.push_back(pivot); diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 2a3ed5df5df..1aca1120544 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -760,7 +760,7 @@ renumber_edgelist( } } - if ((static_cast(partition.get_matrix_partition_minor_size() / load_factor) >= + if ((static_cast(partition.get_matrix_partition_minor_size() * (1.0 + 1.0 / load_factor)) >= static_cast(number_of_edges / comm_size)) && edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part From d716f6c7c030535660a3bdfeb58f2c7b53c01e81 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 8 
Mar 2022 17:02:16 -0800 Subject: [PATCH 56/60] remove temporary experimental code --- .../copy_v_transform_reduce_in_out_nbr.cuh | 447 ++---------------- 1 file changed, 42 insertions(+), 405 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh index 7a47803726d..ad76ee1fd67 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh @@ -35,8 +35,6 @@ #include #include -#include // FIXME: delete - #include #include #include @@ -492,19 +490,11 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, col_properties_t> minor_tmp_buffer{}; // relevant only when (GraphViewType::is_multi_gpu && !update_major if constexpr (GraphViewType::is_multi_gpu && !update_major) { -#if 1 // FIXME: delete - handle.sync_stream(); - std::cout << "copy_v allocate minor_tmp_buffer" << std::endl; -#endif if constexpr (GraphViewType::is_adj_matrix_transposed) { minor_tmp_buffer = row_properties_t(handle, graph_view); } else { minor_tmp_buffer = col_properties_t(handle, graph_view); } -#if 1 // FIXME: delete - handle.sync_stream(); - std::cout << "copy_v allocate minor_tmp_buffer: SUCCESS" << std::endl; -#endif } if constexpr (!update_major) { @@ -527,9 +517,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } std::optional> stream_pool_indices{std::nullopt}; -#if 1 // FIXME: for temporary testing - std::vector pool_streams{}; -#endif if constexpr (GraphViewType::is_multi_gpu) { if ((graph_view.get_local_adj_matrix_partition_segment_offsets(0)) && (handle.get_stream_pool_size() >= max_segments)) { @@ -569,25 +556,9 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } if (num_streams >= max_segments) { -#if 1 // FIXME: for temporary testing - pool_streams.resize(num_streams); - for (size_t i = 0; i < pool_streams.size() / max_segments; ++i) { - static_assert(max_segments == 4); - CUDA_TRY(cudaStreamCreateWithPriority( - &pool_streams[i * max_segments], cudaStreamNonBlocking, -2)); - CUDA_TRY(cudaStreamCreateWithPriority( - &pool_streams[i * max_segments + 1], cudaStreamNonBlocking, -2)); - CUDA_TRY(cudaStreamCreateWithPriority( - &pool_streams[i * max_segments + 2], cudaStreamNonBlocking, -1)); - CUDA_TRY(cudaStreamCreateWithPriority( - &pool_streams[i * max_segments + 3], cudaStreamNonBlocking, 0)); - } -#endif - stream_pool_indices = std::vector(num_streams); std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); handle.sync_stream(); - std::cout << "copy_v num_streams=" << num_streams << std::endl; // FIXME: delete } } } @@ -626,328 +597,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, if (stream_pool_indices) { handle.sync_stream(); } -#if 1 // FIXME: delete, just experimenting (to better separate compute time vs reduction time - if (stream_pool_indices) { - auto num_concurrent_loops = (*stream_pool_indices).size() / max_segments; - auto num_rounds = - (graph_view.get_number_of_local_adj_matrix_partitions() + (num_concurrent_loops - 1)) / - num_concurrent_loops; - for (size_t round = 0; round < num_rounds; ++round) { - /* computing */ - -#if 1 // FIXME: delete - if constexpr (GraphViewType::is_multi_gpu) { handle.get_comms().barrier(); } - auto core_time0 = std::chrono::steady_clock::now(); -#endif - for (size_t i = num_concurrent_loops * round; - i < std::min(static_cast(graph_view.get_number_of_local_adj_matrix_partitions()), - 
num_concurrent_loops * (round + 1)); - ++i) { - auto matrix_partition = - matrix_partition_device_view_t( - graph_view.get_matrix_partition_view(i)); - - auto major_init = T{}; - if constexpr (update_major) { - if constexpr (GraphViewType::is_multi_gpu) { - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - major_init = (static_cast(i) == col_comm_rank) ? init : T{}; - } else { - major_init = init; - } - } - - auto matrix_partition_row_value_input = adj_matrix_row_value_input; - auto matrix_partition_col_value_input = adj_matrix_col_value_input; - if constexpr (GraphViewType::is_adj_matrix_transposed) { - matrix_partition_col_value_input.set_local_adj_matrix_partition_idx(i); - } else { - matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i); - } - - auto major_buffer_first = - get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); - - std::conditional_t, - VertexValueOutputIterator> - output_buffer{}; - if constexpr (GraphViewType::is_multi_gpu) { - if constexpr (update_major) { - output_buffer = major_buffer_first; - } else { - output_buffer = minor_tmp_buffer.mutable_device_view(); - } - } else { - output_buffer = vertex_value_output_first; - } - - auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); - if (segment_offsets) { - static_assert(detail::num_sparse_segments_per_vertex_partition == 3); - - // FIXME: we may further improve performance by 1) individually tuning block sizes for - // different segments; and 2) adding one more segment for very high degree vertices and - // running segmented reduction - if (matrix_partition.get_dcs_nzd_vertex_count()) { - auto exec_stream = - stream_pool_indices - ? rmm::cuda_stream_view{pool_streams[(i * max_segments) % - (*stream_pool_indices).size()]} - /* FIXME for temporary testing, - handle.get_stream_from_stream_pool((i * max_segments) % - (*stream_pool_indices).size()) */ - : handle.get_stream(); - if constexpr (update_major) { // this is necessary as we don't visit every vertex in - // the hypersparse segment in - // for_all_major_for_all_nbr_hypersparse - thrust::fill(rmm::exec_policy(exec_stream), - output_buffer + (*segment_offsets)[3], - output_buffer + (*segment_offsets)[4], - major_init); - } - if (*(matrix_partition.get_dcs_nzd_vertex_count()) > 0) { - raft::grid_1d_thread_t update_grid( - *(matrix_partition.get_dcs_nzd_vertex_count()), - detail::copy_v_transform_reduce_nbr_for_all_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[3]; } - detail::for_all_major_for_all_nbr_hypersparse - <<>>( - matrix_partition, - matrix_partition.get_major_first() + (*segment_offsets)[3], - matrix_partition_row_value_input, - matrix_partition_col_value_input, - segment_output_buffer, - e_op, - major_init); - } - } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - auto exec_stream = - stream_pool_indices - ? 
rmm::cuda_stream_view{pool_streams[(i * max_segments + 1) %
-                                                 (*stream_pool_indices).size()]}
-          /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i *
-             max_segments + 1) % (*stream_pool_indices).size()) */
-          : handle.get_stream();
-        raft::grid_1d_thread_t update_grid(
-          (*segment_offsets)[3] - (*segment_offsets)[2],
-          detail::copy_v_transform_reduce_nbr_for_all_block_size,
-          handle.get_device_properties().maxGridSize[0]);
-        auto segment_output_buffer = output_buffer;
-        if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[2]; }
-        detail::for_all_major_for_all_nbr_low_degree
-          <<<update_grid.num_blocks, update_grid.block_size, 0, exec_stream>>>(
-            matrix_partition,
-            matrix_partition.get_major_first() + (*segment_offsets)[2],
-            matrix_partition.get_major_first() + (*segment_offsets)[3],
-            matrix_partition_row_value_input,
-            matrix_partition_col_value_input,
-            segment_output_buffer,
-            e_op,
-            major_init);
-      }
-      if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) {
-        auto exec_stream =
-          stream_pool_indices
-            ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 2) %
-                                                 (*stream_pool_indices).size()]}
-          /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i *
-             max_segments + 2) % (*stream_pool_indices).size()) */
-          : handle.get_stream();
-        raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1],
-                                         detail::copy_v_transform_reduce_nbr_for_all_block_size,
-                                         handle.get_device_properties().maxGridSize[0]);
-        auto segment_output_buffer = output_buffer;
-        if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[1]; }
-        detail::for_all_major_for_all_nbr_mid_degree
-          <<<update_grid.num_blocks, update_grid.block_size, 0, exec_stream>>>(
-            matrix_partition,
-            matrix_partition.get_major_first() + (*segment_offsets)[1],
-            matrix_partition.get_major_first() + (*segment_offsets)[2],
-            matrix_partition_row_value_input,
-            matrix_partition_col_value_input,
-            segment_output_buffer,
-            e_op,
-            major_init);
-      }
-      if ((*segment_offsets)[1] > 0) {
-        auto exec_stream =
-          stream_pool_indices
-            ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 3) %
-                                                 (*stream_pool_indices).size()]}
-          /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i *
-             max_segments + 3) % (*stream_pool_indices).size()) */
-          : handle.get_stream();
-        raft::grid_1d_block_t update_grid(
-          (*segment_offsets)[1],
-          detail::copy_v_transform_reduce_nbr_for_all_block_size,
-          handle.get_device_properties().maxGridSize[0]);
-        detail::for_all_major_for_all_nbr_high_degree
-          <<<update_grid.num_blocks, update_grid.block_size, 0, exec_stream>>>(
-            matrix_partition,
-            matrix_partition.get_major_first(),
-            matrix_partition.get_major_first() + (*segment_offsets)[1],
-            matrix_partition_row_value_input,
-            matrix_partition_col_value_input,
-            output_buffer,
-            e_op,
-            major_init);
-      }
-    } else {
-      if (matrix_partition.get_major_size() > 0) {
-        raft::grid_1d_thread_t update_grid(
-          matrix_partition.get_major_size(),
-          detail::copy_v_transform_reduce_nbr_for_all_block_size,
-          handle.get_device_properties().maxGridSize[0]);
-        detail::for_all_major_for_all_nbr_low_degree
-          <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
-            matrix_partition,
-            matrix_partition.get_major_first(),
-            matrix_partition.get_major_last(),
-            matrix_partition_row_value_input,
-            matrix_partition_col_value_input,
-            output_buffer,
-            e_op,
-            major_init);
-      }
-    }
-  }
-
-#if 1  // FIXME: for temporary testing
-  for (size_t i = 0; i < pool_streams.size(); ++i) {
-    CUDA_TRY(cudaStreamSynchronize(pool_streams[i]));
-  }
-#else
-  handle.sync_stream_pool(*stream_pool_indices);
-#endif
-
-  /* communication */
-
-#if 1  // FIXME: delete
-  auto core_time1 = std::chrono::steady_clock::now();
-  if constexpr (GraphViewType::is_multi_gpu) { handle.get_comms().barrier(); }
-  auto core_time2 = std::chrono::steady_clock::now();
-#endif
-  if constexpr (GraphViewType::is_multi_gpu && update_major) {
-    ncclGroupStart();  // SIMPLE
-    for (size_t i = num_concurrent_loops * round;
-         i <
-         std::min(static_cast<size_t>(graph_view.get_number_of_local_adj_matrix_partitions()),
-                  num_concurrent_loops * (round + 1));
-         ++i) {
-#if 1  // SIMPLE
-      auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
-#else
-      auto& comm     = handle.get_comms();
-      auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
-      auto const row_comm_rank = row_comm.get_rank();
-      auto const row_comm_size = row_comm.get_size();
-      auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
-      auto const col_comm_rank = col_comm.get_rank();
-      auto const col_comm_size = col_comm.get_size();
-#endif
-
-      auto matrix_partition =
-        matrix_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>(
-          graph_view.get_matrix_partition_view(i));
-
-      auto major_buffer_first =
-        get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]);
-
-#if 1  // SIMPLE
-      device_reduce(col_comm,
-                    major_buffer_first,
-                    vertex_value_output_first,
-                    matrix_partition.get_major_size(),
-                    raft::comms::op_t::SUM,
-                    i,
-                    handle.get_stream());
-#else
-      auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i);
-      if (segment_offsets && stream_pool_indices) {
-        if ((*segment_offsets).back() - (*segment_offsets)[3] > 0) {
-          device_reduce(col_comm,
-                        major_buffer_first + (*segment_offsets)[3],
-                        vertex_value_output_first + (*segment_offsets)[3],
-                        (*segment_offsets).back() - (*segment_offsets)[3],
-                        raft::comms::op_t::SUM,
-                        i,
-                        pool_streams[(i * max_segments) % (*stream_pool_indices).size()]/* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) */);
-        }
-        if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) {
-          device_reduce(col_comm,
-                        major_buffer_first + (*segment_offsets)[2],
-                        vertex_value_output_first + (*segment_offsets)[2],
-                        (*segment_offsets)[3] - (*segment_offsets)[2],
-                        raft::comms::op_t::SUM,
-                        i,
-                        pool_streams[(i * max_segments + 1) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()) */);
-        }
-        if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) {
-          device_reduce(col_comm,
-                        major_buffer_first + (*segment_offsets)[1],
-                        vertex_value_output_first + (*segment_offsets)[1],
-                        (*segment_offsets)[2] - (*segment_offsets)[1],
-                        raft::comms::op_t::SUM,
-                        i,
-                        pool_streams[(i * max_segments + 2) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()) */);
-        }
-        if ((*segment_offsets)[1] > 0) {
-          device_reduce(col_comm,
-                        major_buffer_first,
-                        vertex_value_output_first,
-                        (*segment_offsets)[1],
-                        raft::comms::op_t::SUM,
-                        i,
-                        pool_streams[(i * max_segments + 3) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()) */);
-        }
-      } else {
-        device_reduce(col_comm,
-                      major_buffer_first,
-                      vertex_value_output_first,
-                      matrix_partition.get_major_size(),
-                      raft::comms::op_t::SUM,
-                      i,
-                      handle.get_stream());
-      }
-#endif
-    }
-    ncclGroupEnd();  // SIMPLE
-  }
-
-#if 1  // SIMPLE
-  handle.sync_stream();  // SIMPLE
-#else
-#if 1  // FIXME: for temporary testing
-  for (size_t i = 0; i < pool_streams.size(); ++i) {
-    CUDA_TRY(cudaStreamSynchronize(pool_streams[i]));
-  }
-#else
-  handle.sync_stream_pool(*stream_pool_indices);
-#endif
-#endif
-#if 1  // FIXME: delete
-  auto core_time3 = std::chrono::steady_clock::now();
-  if constexpr (GraphViewType::is_multi_gpu) { handle.get_comms().barrier(); }
-  auto core_time4 = std::chrono::steady_clock::now();
-  std::chrono::duration<double> elapsed_total = core_time4 - core_time0;
-  std::chrono::duration<double> elapsed0 = core_time1 - core_time0;
-  std::chrono::duration<double> elapsed1 = core_time2 - core_time1;
-  std::chrono::duration<double> elapsed2 = core_time3 - core_time2;
-  std::chrono::duration<double> elapsed3 = core_time4 - core_time3;
-  std::cout << "copy_v core took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << ") ms." << std::endl;
-#endif
-    }
-  } else {
-    CUGRAPH_FAIL("should not be reached.");
-  }
-#else
   for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) {
     auto matrix_partition =
       matrix_partition_device_view_t<vertex_t, edge_t, weight_t, GraphViewType::is_multi_gpu>(
         graph_view.get_matrix_partition_view(i));
@@ -964,7 +613,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
       }
     }
 
-    // FIXME: need to double check whether this leads to actual copy
    auto matrix_partition_row_value_input = adj_matrix_row_value_input;
    auto matrix_partition_col_value_input = adj_matrix_col_value_input;
    if constexpr (GraphViewType::is_adj_matrix_transposed) {
@@ -1000,13 +648,10 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
     // different segments; and 2) adding one more segment for very high degree vertices and
     // running segmented reduction
     if (matrix_partition.get_dcs_nzd_vertex_count()) {
-      auto exec_stream = stream_pool_indices
-                           ? rmm::cuda_stream_view{pool_streams[(i * max_segments) %
-                                                                (*stream_pool_indices).size()]}
-                           /* FIXME for temporary testing,
-                              handle.get_stream_from_stream_pool((i * max_segments) %
-                              (*stream_pool_indices).size()) */
-                           : handle.get_stream();
+      auto exec_stream =
+        stream_pool_indices
+          ? handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size())
+          : handle.get_stream();
       if constexpr (update_major) {  // this is necessary as we don't visit every vertex in the
                                      // hypersparse segment in
                                      // for_all_major_for_all_nbr_hypersparse
@@ -1034,10 +679,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
     }
     if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) {
       auto exec_stream = stream_pool_indices
-                           ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 1) %
-                                                                (*stream_pool_indices).size()]}
-                         /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i *
-                            max_segments + 1) % (*stream_pool_indices).size()) */
+                           ? handle.get_stream_from_stream_pool((i * max_segments + 1) %
+                                                                (*stream_pool_indices).size())
                            : handle.get_stream();
       raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2],
                                          detail::copy_v_transform_reduce_nbr_for_all_block_size,
                                          handle.get_device_properties().maxGridSize[0]);
@@ -1057,10 +700,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
     }
     if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) {
       auto exec_stream = stream_pool_indices
-                           ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 2) %
-                                                                (*stream_pool_indices).size()]}
-                         /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i *
-                            max_segments + 2) % (*stream_pool_indices).size()) */
+                           ? handle.get_stream_from_stream_pool((i * max_segments + 2) %
+                                                                (*stream_pool_indices).size())
                            : handle.get_stream();
       raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1],
                                        detail::copy_v_transform_reduce_nbr_for_all_block_size,
                                        handle.get_device_properties().maxGridSize[0]);
@@ -1080,10 +721,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
     }
     if ((*segment_offsets)[1] > 0) {
       auto exec_stream = stream_pool_indices
-                           ? rmm::cuda_stream_view{pool_streams[(i * max_segments + 3) %
-                                                                (*stream_pool_indices).size()]}
-                         /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i *
-                            max_segments + 3) % (*stream_pool_indices).size()) */
+                           ? handle.get_stream_from_stream_pool((i * max_segments + 3) %
+                                                                (*stream_pool_indices).size())
                            : handle.get_stream();
       raft::grid_1d_block_t update_grid((*segment_offsets)[1],
                                         detail::copy_v_transform_reduce_nbr_for_all_block_size,
                                         handle.get_device_properties().maxGridSize[0]);
@@ -1128,25 +767,44 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
 
       if (segment_offsets && stream_pool_indices) {
         if ((*segment_offsets).back() - (*segment_offsets)[3] > 0) {
+          device_reduce(
+            col_comm,
+            major_buffer_first + (*segment_offsets)[3],
+            vertex_value_output_first + (*segment_offsets)[3],
+            (*segment_offsets).back() - (*segment_offsets)[3],
+            raft::comms::op_t::SUM,
+            i,
+            handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()));
+        }
+        if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) {
           device_reduce(col_comm,
-                        major_buffer_first + (*segment_offsets)[3],
-                        vertex_value_output_first + (*segment_offsets)[3],
-                        (*segment_offsets).back() - (*segment_offsets)[3],
+                        major_buffer_first + (*segment_offsets)[2],
+                        vertex_value_output_first + (*segment_offsets)[2],
+                        (*segment_offsets)[3] - (*segment_offsets)[2],
                         raft::comms::op_t::SUM,
                         i,
-                        pool_streams[(i * max_segments) % (*stream_pool_indices).size()]/* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) */);
-        }
-        if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) {
-          device_reduce(
-            col_comm, major_buffer_first + (*segment_offsets)[2], vertex_value_output_first + (*segment_offsets)[2], (*segment_offsets)[3] - (*segment_offsets)[2], raft::comms::op_t::SUM, i, pool_streams[(i * max_segments + 1) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()) */);
+                        handle.get_stream_from_stream_pool((i * max_segments + 1) %
+                                                           (*stream_pool_indices).size()));
         }
         if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) {
-          device_reduce(
-            col_comm, major_buffer_first + (*segment_offsets)[1], vertex_value_output_first + (*segment_offsets)[1], (*segment_offsets)[2] - (*segment_offsets)[1], raft::comms::op_t::SUM, i, pool_streams[(i * max_segments + 2) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()) */);
+          device_reduce(col_comm,
+                        major_buffer_first + (*segment_offsets)[1],
+                        vertex_value_output_first + (*segment_offsets)[1],
+                        (*segment_offsets)[2] - (*segment_offsets)[1],
+                        raft::comms::op_t::SUM,
+                        i,
+                        handle.get_stream_from_stream_pool((i * max_segments + 2) %
+                                                           (*stream_pool_indices).size()));
         }
         if ((*segment_offsets)[1] > 0) {
-          device_reduce(
-            col_comm, major_buffer_first, vertex_value_output_first, (*segment_offsets)[1], raft::comms::op_t::SUM, i, pool_streams[(i * max_segments + 3) % (*stream_pool_indices).size()] /* FIXME for temporary testing, handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()) */);
+          device_reduce(col_comm,
+                        major_buffer_first,
+                        vertex_value_output_first,
+                        (*segment_offsets)[1],
+                        raft::comms::op_t::SUM,
+                        i,
+                        handle.get_stream_from_stream_pool((i * max_segments + 3) %
+                                                           (*stream_pool_indices).size()));
        }
      } else {
        device_reduce(col_comm,
                      major_buffer_first,
                      vertex_value_output_first,
                      matrix_partition.get_major_size(),
                      raft::comms::op_t::SUM,
                      i,
                      handle.get_stream());
      }
    }
  }
-#endif
-  if (stream_pool_indices) {
-#if 1  // FIXME: for temporary testing
-    for (size_t i = 0; i < pool_streams.size(); ++i) {
-      CUDA_TRY(cudaStreamSynchronize(pool_streams[i]));
-    }
-#else
-    handle.sync_stream_pool(*stream_pool_indices);
-#endif
-  }
+  if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); }
 
   if constexpr (GraphViewType::is_multi_gpu && !update_major) {
-#if 1  // FIXME: delete
-    handle.sync_stream();
-    std::cout << "minor reduction" << std::endl;
-#endif
     auto& comm           = handle.get_comms();
     auto const comm_rank = comm.get_rank();
     auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
@@ -1237,14 +882,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
       }
     }
   }
-// FIXME: for temporary testing
-#if 1
-  if (stream_pool_indices) {
-    for (size_t i = 0; i < pool_streams.size(); ++i) {
-      CUDA_TRY(cudaStreamDestroy(pool_streams[i]));
-    }
-  }
-#endif
 }
 
 }  // namespace detail
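[Note] The refactor above replaces the hand-rolled pool_streams array with raft's stream-pool API. A minimal sketch of the resulting pattern, assuming a handle constructed with an rmm::cuda_stream_pool; the helper name launch_per_segment_work and the loop bounds are illustrative, not taken from the patch:

#include <raft/handle.hpp>
#include <rmm/cuda_stream_pool.hpp>
#include <rmm/cuda_stream_view.hpp>

// Round-robin per-segment work over the handle's stream pool, then join.
// Only get_stream_from_stream_pool/get_stream_pool_size/sync_stream_pool
// are from the diff; everything else here is a hypothetical scaffold.
void launch_per_segment_work(raft::handle_t const& handle,
                             size_t num_partitions,
                             size_t max_segments)
{
  for (size_t i = 0; i < num_partitions; ++i) {
    for (size_t s = 0; s < max_segments; ++s) {
      // same indexing scheme as the diff: (i * max_segments + s) % pool size
      rmm::cuda_stream_view stream =
        handle.get_stream_pool_size() > 0
          ? handle.get_stream_from_stream_pool((i * max_segments + s) %
                                               handle.get_stream_pool_size())
          : handle.get_stream();
      (void)stream;  // enqueue this segment's kernel and/or device_reduce here
    }
  }
  handle.sync_stream_pool();  // wait on every pool stream before consuming results
}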
From 99401f3103fea9d310e6d394b90f76ea489ce18c Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Tue, 8 Mar 2022 17:09:06 -0800
Subject: [PATCH 57/60] remove temporary experimental code

---
 cpp/src/link_analysis/pagerank_impl.cuh      | 64 ++-----------------
 cpp/tests/link_analysis/mg_pagerank_test.cpp | 63 ++-----------------
 2 files changed, 4 insertions(+), 123 deletions(-)

diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh
index eec2126aea7..d33a7e97f82 100644
--- a/cpp/src/link_analysis/pagerank_impl.cuh
+++ b/cpp/src/link_analysis/pagerank_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -55,12 +55,6 @@ void pagerank(
   bool has_initial_guess,
   bool do_expensive_check)
 {
-#if 1  // FIXME: delete
-size_t free_size{};
-size_t total_size{};
-CUDA_TRY(cudaMemGetInfo(&free_size, &total_size));
-std::cout << "PageRank start free_size=" << static_cast<double>(free_size) / (1024.0 * 1024.0 * 1024.0) << " GB total_size=" << static_cast<double>(total_size) / (1024.0 * 1024.0 * 1024.0) << " GB." << std::endl;
-#endif
   using vertex_t = typename GraphViewType::vertex_type;
   using weight_t = typename GraphViewType::weight_type;
@@ -84,9 +78,6 @@ std::cout << "PageRank start free_size=" << static_cast<double>(free_size) / (10
                                                  : vertex_t{0};
 
   // 1. check input arguments
-#if 1  // FIXME: delete
-handle.sync_stream(); std::cout << "PageRank check inputs" << std::endl;
-#endif
 
   CUGRAPH_EXPECTS((personalization_vertices.has_value() == false) ||
                     (personalization_values.has_value() && personalization_vector_size.has_value()),
@@ -151,9 +142,6 @@ handle.sync_stream(); std::cout << "PageRank check inputs" << std::endl;
   }
 
   // 2. compute the sums of the out-going edge weights (if not provided)
-#if 1  // FIXME: delete
-handle.sync_stream(); std::cout << "PageRank compute out_weight_sums" << std::endl;
-#endif
 
   auto tmp_vertex_out_weight_sums =
     precomputed_vertex_out_weight_sums
      ? std::nullopt
@@ -164,9 +152,6 @@ handle.sync_stream(); std::cout << "PageRank compute out_weight_sums" << std::en
       : (*tmp_vertex_out_weight_sums).data();
 
   // 3. initialize pagerank values
-#if 1  // FIXME: delete
-handle.sync_stream(); std::cout << "PageRank initialize PageRank values" << std::endl;
-#endif
 
   if (has_initial_guess) {
     auto sum = reduce_v(handle, pull_graph_view, pageranks, result_t{0.0});
@@ -186,9 +171,6 @@ handle.sync_stream(); std::cout << "PageRank initialize PageRank values" << std:
   }
 
   // 4. sum the personalization values
-#if 1  // FIXME: delete
-handle.sync_stream(); std::cout << "PageRank sum personalization values" << std::endl;
-#endif
 
   result_t personalization_sum{0.0};
   if (aggregate_personalization_vector_size > 0) {
@@ -203,9 +185,6 @@ handle.sync_stream(); std::cout << "PageRank sum personalization values" << std:
   }
 
   // 5. pagerank iteration
-#if 1  // FIXME: delete
-handle.sync_stream(); std::cout << "PageRank iteration" << std::endl;
-#endif
 
   // old PageRank values
   rmm::device_uvector<result_t> old_pageranks(pull_graph_view.get_number_of_local_vertices(),
                                               handle.get_stream());
   row_properties_t<GraphViewType, result_t> adj_matrix_row_pageranks(handle, pull_graph_view);
   size_t iter{0};
   while (true) {
-#if 1  // FIXME: delete
-    handle.sync_stream();
-    if constexpr (GraphViewType::is_multi_gpu) {
-      handle.get_comms().barrier();
-    }
-    std::cout << "PageRank iteration " << iter << " start" << std::endl;
-    auto time0 = std::chrono::steady_clock::now();
-#endif
     thrust::copy(handle.get_thrust_policy(),
                  pageranks,
                  pageranks + pull_graph_view.get_number_of_local_vertices(),
@@ -252,16 +223,8 @@ handle.sync_stream(); std::cout << "PageRank iteration" << std::endl;
                             return pagerank / divisor;
                           });
-#if 1  // FIXME: delete
-    handle.sync_stream();
-    auto time1 = std::chrono::steady_clock::now();
-#endif
 
     copy_to_adj_matrix_row(handle, pull_graph_view, pageranks, adj_matrix_row_pageranks);
-#if 1  // FIXME: delete
-    handle.sync_stream();
-    auto time2 = std::chrono::steady_clock::now();
-#endif
 
     auto unvarying_part = aggregate_personalization_vector_size == 0
                             ? (dangling_sum * alpha + static_cast<result_t>(1.0 - alpha)) /
                                 static_cast<result_t>(num_vertices)
@@ -278,10 +241,6 @@ handle.sync_stream(); std::cout << "PageRank iteration" << std::endl;
       unvarying_part,
       pageranks);
-#if 1  // FIXME: delete
-    handle.sync_stream();
-    auto time3 = std::chrono::steady_clock::now();
-#endif
 
     if (aggregate_personalization_vector_size > 0) {
       auto vertex_partition = vertex_partition_device_view_t<vertex_t, GraphViewType::is_multi_gpu>(
        pull_graph_view.get_vertex_partition_view());
@@ -301,15 +260,6 @@ handle.sync_stream(); std::cout << "PageRank iteration" << std::endl;
        });
    }
 
-#if 1  // FIXME: delete
-    handle.sync_stream();
-    auto time4 = std::chrono::steady_clock::now();
-    if constexpr (GraphViewType::is_multi_gpu) {
-      handle.get_comms().barrier();
-    }
-    handle.sync_stream();
-    auto time5 = std::chrono::steady_clock::now();
-#endif
    auto diff_sum = transform_reduce_v(
      handle,
      pull_graph_view,
@@ -317,18 +267,6 @@ handle.sync_stream(); std::cout << "PageRank iteration" << std::endl;
      [] __device__(auto val) { return std::abs(thrust::get<0>(val) - thrust::get<1>(val)); },
      result_t{0.0});
 
-#if 1  // FIXME: delete
-    handle.sync_stream();
-    auto time6 = std::chrono::steady_clock::now();
-    std::chrono::duration<double> elapsed_total = time6 - time0;
-    std::chrono::duration<double> elapsed0 = time1 - time0;
-    std::chrono::duration<double> elapsed1 = time2 - time1;
-    std::chrono::duration<double> elapsed2 = time3 - time2;
-    std::chrono::duration<double> elapsed3 = time4 - time3;
-    std::chrono::duration<double> elapsed4 = time5 - time4;
-    std::chrono::duration<double> elapsed5 = time6 - time5;
-    std::cout << "PageRank iter " << iter << " took " << elapsed_total.count() * 1e3 << " ms, breakdown=(" << elapsed0.count() * 1e3 << "," << elapsed1.count() * 1e3 << "," << elapsed2.count() * 1e3 << "," << elapsed3.count() * 1e3 << "," << elapsed4.count() * 1e3 << ") ms."
-              << std::endl;
-#endif
     iter++;
 
     if (diff_sum < epsilon) {
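[Note] For reference, the iteration body whose instrumentation is deleted above implements the usual damped PageRank sweep. With damping factor alpha, vertex count |V|, per-vertex out-weight sums W(u), and dangling mass d_t, each sweep computes (a reconstruction from the surrounding context lines, e.g. unvarying_part = (dangling_sum * alpha + (1 - alpha)) / num_vertices; personalization terms omitted):

\[
  \mathrm{PR}_{t+1}(v) = \alpha \sum_{(u,v) \in E} \frac{\mathrm{PR}_t(u)}{W(u)}
  + \frac{\alpha\, d_t + (1-\alpha)}{|V|},
  \qquad d_t = \sum_{u :\, W(u) = 0} \mathrm{PR}_t(u),
\]

and the loop exits once \(\sum_v |\mathrm{PR}_{t+1}(v) - \mathrm{PR}_t(v)| < \epsilon\), matching the diff_sum check in the diff.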
diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp
index 9b3876b4e5d..1b1965582fe 100644
--- a/cpp/tests/link_analysis/mg_pagerank_test.cpp
+++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp
@@ -64,76 +64,19 @@ class Tests_MGPageRank
     auto constexpr pool_size = 64;  // FIXME: tuning parameter
     raft::handle_t handle(rmm::cuda_stream_per_thread, std::make_shared<rmm::cuda_stream_pool>(pool_size));
     HighResClock hr_clock{};
-#if 1  // FIXME: delete
-    auto time0 = std::chrono::steady_clock::now();
-#endif
 
     raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
     auto& comm           = handle.get_comms();
     auto const comm_size = comm.get_size();
     auto const comm_rank = comm.get_rank();
 
-    int row_comm_size{};
-    int num_gpus_per_node{};
-    RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node));
-    if (comm_size > num_gpus_per_node) {  // multi-node, inter-node communication bandwidth
-                                          // (Infinniband) is more likely to be a bottleneck than
-                                          // intra-node (NVLink) communication bandwidth
-      CUGRAPH_EXPECTS((comm_size % num_gpus_per_node) == 0,
-                      "Invalid MPI configuration: in multi-node execution, # MPI processes should "
-                      "be a multiple of the number of GPUs per node.");
-#if 1
-      auto col_comm_size = static_cast<int>(sqrt(static_cast<double>(comm_size)));
-      while (comm_size % col_comm_size != 0) {
-        --col_comm_size;
-      }
-      row_comm_size = comm_size / col_comm_size;
-#else
-      auto num_nodes = comm_size / num_gpus_per_node;
-      row_comm_size  = static_cast<int>(sqrt(static_cast<double>(num_nodes)));
-      while (num_nodes % row_comm_size != 0) {
-        --row_comm_size;
-      }
-      row_comm_size *= num_gpus_per_node;
-#endif
-    } else {
-      row_comm_size = static_cast<int>(sqrt(static_cast<double>(comm_size)));
-      while (comm_size % row_comm_size != 0) {
-        --row_comm_size;
-      }
+    auto row_comm_size = static_cast<int>(sqrt(static_cast<double>(comm_size)));
+    while (comm_size % row_comm_size != 0) {
+      --row_comm_size;
     }
     cugraph::partition_2d::subcomm_factory_t<cugraph::partition_2d::key_naming_t, vertex_t> subcomm_factory(handle, row_comm_size);
-#if 1  // FIXME: delete
-    {
-      rmm::device_uvector<int32_t> tx_ints(comm_size, handle.get_stream());
-      rmm::device_uvector<int32_t> rx_ints(comm_size, handle.get_stream());
-      std::vector<size_t> tx_sizes(comm_size, size_t{1});
-      std::vector<size_t> tx_offsets(comm_size);
-      std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0});
-      std::vector<int32_t> tx_ranks(comm_size);
-      std::iota(tx_ranks.begin(), tx_ranks.end(), int32_t{0});
-      auto rx_sizes   = tx_sizes;
-      auto rx_offsets = tx_offsets;
-      auto rx_ranks   = tx_ranks;
-      handle.get_comms().device_multicast_sendrecv(tx_ints.data(),
-                                                   tx_sizes,
-                                                   tx_offsets,
-                                                   tx_ranks,
-                                                   rx_ints.data(),
-                                                   rx_sizes,
-                                                   rx_offsets,
-                                                   rx_ranks,
-                                                   handle.get_stream());
-      handle.sync_stream();
-    }
-    auto time1 = std::chrono::steady_clock::now();
-    std::chrono::duration<double> elapsed = time1 - time0;
-    std::cout << "Handle initialization and 1st all-to-all (comm_size=" << comm_size
-              << ", row_comm_size=" << row_comm_size << ") took " << elapsed.count() * 1e3 << " ms."
-              << std::endl;
-#endif
 
     // 2. create MG graph
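[Note] The retained code path above sizes the 2D process grid by taking the largest divisor of comm_size that is no greater than its square root. A self-contained sketch of that search; compute_row_comm_size is a hypothetical helper, not part of the test:

#include <cassert>
#include <cmath>

// Largest divisor of comm_size that is <= sqrt(comm_size), as in the kept code path.
// The column dimension is then comm_size / row_comm_size.
int compute_row_comm_size(int comm_size)
{
  assert(comm_size > 0);
  auto row_comm_size = static_cast<int>(std::sqrt(static_cast<double>(comm_size)));
  while (comm_size % row_comm_size != 0) {
    --row_comm_size;
  }
  return row_comm_size;  // e.g., 16 GPUs -> 4 (4x4 grid), 8 GPUs -> 2 (2x4 grid)
}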
From 0664216fbfdad51dcd6d06ed77e391a72fbb4fdd Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Tue, 8 Mar 2022 17:09:50 -0800
Subject: [PATCH 58/60] fix formatting error

---
 cpp/tests/link_analysis/mg_pagerank_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp
index 1b1965582fe..bb03adf6553 100644
--- a/cpp/tests/link_analysis/mg_pagerank_test.cpp
+++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp
@@ -70,7 +70,7 @@ class Tests_MGPageRank
     auto const comm_size = comm.get_size();
     auto const comm_rank = comm.get_rank();
 
-    auto row_comm_size = static_cast(sqrt(static_cast(comm_size)));
+    auto row_comm_size = static_cast(sqrt(static_cast(comm_size)));
     while (comm_size % row_comm_size != 0) {
       --row_comm_size;
     }

From c136d3a627422264a46209b68da36233adeffd41 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Tue, 8 Mar 2022 17:11:04 -0800
Subject: [PATCH 59/60] undo copyright update for the file with no code changes

---
 cpp/src/link_analysis/pagerank_impl.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh
index d33a7e97f82..b6023d21bf2 100644
--- a/cpp/src/link_analysis/pagerank_impl.cuh
+++ b/cpp/src/link_analysis/pagerank_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From a4f6528b27e0c710d1c137e9bbcb3a7ac3e69421 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Tue, 8 Mar 2022 17:14:48 -0800
Subject: [PATCH 60/60] clang-format & copyright year

---
 cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh | 2 +-
 .../cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh | 2 +-
 cpp/tests/link_analysis/mg_pagerank_test.cpp             | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh
index f93ee1a6ae5..1d1b3810a53 100644
--- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh
+++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
index ad76ee1fd67..d6f2e9f7a34 100644
--- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
+++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp
index bb03adf6553..1f199668d6f 100644
--- a/cpp/tests/link_analysis/mg_pagerank_test.cpp
+++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp
@@ -62,7 +62,8 @@ class Tests_MGPageRank
     // 1. initialize handle
 
     auto constexpr pool_size = 64;  // FIXME: tuning parameter
-    raft::handle_t handle(rmm::cuda_stream_per_thread, std::make_shared<rmm::cuda_stream_pool>(pool_size));
+    raft::handle_t handle(rmm::cuda_stream_per_thread,
+                          std::make_shared<rmm::cuda_stream_pool>(pool_size));
     HighResClock hr_clock{};
 
     raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
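[Note] For background on the temporary mechanism this series started with and later removed (the prioritized pool_streams created via cudaStreamCreateWithPriority), a standalone sketch that queries the device's valid priority range instead of hard-coding values like -5/-3/0; the helper names are illustrative, not the patch's code:

#include <cuda_runtime.h>
#include <vector>

// Create a small pool of non-blocking streams at graded priorities.
// Numerically smaller values mean higher priority; the range is device-dependent.
std::vector<cudaStream_t> create_prioritized_streams(size_t n)
{
  int least{}, greatest{};
  cudaDeviceGetStreamPriorityRange(&least, &greatest);
  std::vector<cudaStream_t> streams(n);
  for (size_t i = 0; i < n; ++i) {
    int priority = (i < n / 2) ? greatest : least;  // first half at highest priority
    cudaStreamCreateWithPriority(&streams[i], cudaStreamNonBlocking, priority);
  }
  return streams;
}

void destroy_streams(std::vector<cudaStream_t>& streams)
{
  for (auto s : streams) {
    cudaStreamSynchronize(s);  // drain before destroy, as the patch's sync loop did
    cudaStreamDestroy(s);
  }
}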