diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 1997fd75dab..5a3cb65caa5 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -432,6 +432,7 @@ add_library(cugraph SHARED
   src/experimental/graph_view.cu
   src/experimental/coarsen_graph.cu
   src/experimental/renumber_edgelist.cu
+  src/experimental/renumber_utils.cu
   src/experimental/relabel.cu
   src/experimental/induced_subgraph.cu
   src/experimental/bfs.cu
diff --git a/cpp/include/dendrogram.hpp b/cpp/include/dendrogram.hpp
index bb9ba470a52..aa0802e80b3 100644
--- a/cpp/include/dendrogram.hpp
+++ b/cpp/include/dendrogram.hpp
@@ -27,7 +27,7 @@ class Dendrogram {
  public:
   void add_level(vertex_t first_index,
                  vertex_t num_verts,
-                 cudaStream_t stream = 0,
+                 cudaStream_t stream,
                  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
   {
     level_ptr_.push_back(std::make_unique<rmm::device_uvector<vertex_t>>(num_verts, stream, mr));
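With the default argument gone, call sites must now pass a stream explicitly. A minimal caller-side sketch; the wrapper name, sizes, and include paths are illustrative only:

#include <dendrogram.hpp>
#include <raft/handle.hpp>

// Hypothetical helper: builds a two-level dendrogram using the handle's stream.
void build_levels(raft::handle_t const& handle)
{
  cugraph::Dendrogram<int32_t> dendrogram;
  dendrogram.add_level(0, 1000, handle.get_stream());  // stream argument is now mandatory
  dendrogram.add_level(0, 100, handle.get_stream());
}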
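The two functors above are meant to agree: the GPU that compute_gpu_id_from_edge_t picks for an edge must own the matrix partition that compute_partition_id_from_edge_t assigns to it. A small host-side consistency check, assuming a stand-in hash h(v) = v in place of MurmurHash3_32 and a 2 x 3 process grid; the decode of a partition ID back to a GPU below is derived from the two formulas, not taken from any cuGraph API:

#include <cassert>
#include <cstdint>

// Host re-implementation of the functors' arithmetic with a stand-in hash (h(v) = v).
int gpu_id(int64_t major, int64_t minor, int comm_size, int row_comm_size)
{
  auto major_comm_rank = static_cast<int>(major % comm_size);
  auto minor_comm_rank = static_cast<int>(minor % comm_size);
  return (minor_comm_rank / row_comm_size) * row_comm_size + (major_comm_rank % row_comm_size);
}

int partition_id(int64_t major, int64_t minor, int comm_size, int row_comm_size, int col_comm_size)
{
  auto major_comm_rank = static_cast<int>(major % comm_size);
  auto minor_comm_rank = static_cast<int>(minor % comm_size);
  return major_comm_rank * col_comm_size + minor_comm_rank / row_comm_size;
}

int main()
{
  int const row_comm_size = 2;
  int const col_comm_size = 3;
  int const comm_size     = row_comm_size * col_comm_size;  // P = 6
  for (int64_t major = 0; major < 100; ++major) {
    for (int64_t minor = 0; minor < 100; ++minor) {
      auto g = gpu_id(major, minor, comm_size, row_comm_size);
      auto p = partition_id(major, minor, comm_size, row_comm_size, col_comm_size);
      // Partition IDs lie in [0, P * col_comm_size); partition p belongs to the GPU with
      // row_comm_rank = (p % P) / col_comm_size and col_comm_rank = p % col_comm_size.
      auto row_rank = (p % comm_size) / col_comm_size;
      auto col_rank = p % col_comm_size;
      assert(g == col_rank * row_comm_size + row_rank);
    }
  }
  return 0;
}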
diff --git a/cpp/include/experimental/graph.hpp b/cpp/include/experimental/graph.hpp
index 6a10256e6f4..a380200ea1f 100644
--- a/cpp/include/experimental/graph.hpp
+++ b/cpp/include/experimental/graph.hpp
@@ -188,6 +188,20 @@ template <typename edge_t>
 struct invalid_edge_id : invalid_idx<edge_t> {
 };

+template <typename vertex_t>
+__host__ __device__ std::enable_if_t<std::is_signed<vertex_t>::value, bool> is_valid_vertex(
+  vertex_t num_vertices, vertex_t v)
+{
+  return (v >= 0) && (v < num_vertices);
+}
+
+template <typename vertex_t>
+__host__ __device__ std::enable_if_t<std::is_unsigned<vertex_t>::value, bool> is_valid_vertex(
+  vertex_t num_vertices, vertex_t v)
+{
+  return v < num_vertices;
+}
+
 }  // namespace experimental
 }  // namespace cugraph
diff --git a/cpp/include/experimental/graph_functions.hpp b/cpp/include/experimental/graph_functions.hpp
index 7b4bb466b97..100742adccd 100644
--- a/cpp/include/experimental/graph_functions.hpp
+++ b/cpp/include/experimental/graph_functions.hpp
@@ -17,13 +17,13 @@
 #include
 #include
-#include
 #include
 #include
 #include
 #include
+#include <vector>

 namespace cugraph {
 namespace experimental {
@@ -40,19 +40,24 @@ namespace experimental {
  * or multi-GPU (true).
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
- * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as
+ * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as
  * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
- * IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t functor to
- * every (major, minor) pair should return the local GPU ID for this function to work (edges should
- * be pre-shuffled).
- * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
- * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
- * Vertex IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t
- * functor to every (major, minor) pair should return the local GPU ID for this function to work
- * (edges should be pre-shuffled).
- * @param num_edgelist_edges Number of edges in the edgelist.
- * @param is_hypergraph_partitioned Flag indicating whether we are assuming hypergraph partitioning
- * (this flag will be removed in the future).
+ * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix
+ * partition should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored
+ * as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs
+ * are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix
+ * partition should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition
+ * assigned to this process).
  * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
  * @return std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>
  * Quadruplet of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to
@@ -63,10 +68,9 @@ template <typename vertex_t, typename edge_t, bool multi_gpu>
 std::enable_if_t<multi_gpu,
                  std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>>
 renumber_edgelist(raft::handle_t const& handle,
-                  vertex_t* edgelist_major_vertices /* [INOUT] */,
-                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
-                  edge_t num_edgelist_edges,
-                  bool is_hypergraph_partitioned,
+                  std::vector<vertex_t*> const& edgelist_major_vertices /* [INOUT] */,
+                  std::vector<vertex_t*> const& edgelist_minor_vertices /* [INOUT] */,
+                  std::vector<edge_t> const& edgelist_edge_counts,
                   bool do_expensive_check = false);

 /**
@@ -115,19 +119,24 @@ std::enable_if_t<multi_gpu, ...> renumber_edgelist(
  * the compute_gpu_id_from_vertex_t to every vertex should return the local GPU ID for this function
  * to work (vertices should be pre-shuffled).
  * @param num_local_vertices Number of local vertices.
- * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as
+ * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as
  * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
- * IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t functor to
- * every (major, minor) pair should return the local GPU ID for this function to work (edges should
- * be pre-shuffled).
- * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
- * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
- * Vertex IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t
- * functor to every (major, minor) pair should return the local GPU ID for this function to work
- * (edges should be pre-shuffled).
- * @param num_edgelist_edges Number of edges in the edgelist.
- * @param is_hypergraph_partitioned Flag indicating whether we are assuming hypergraph partitioning
- * (this flag will be removed in the future).
+ * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix
+ * partition should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored
+ * as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs
+ * are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix
+ * partition should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition
+ * assigned to this process).
  * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
  * @return std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>
  * Quadruplet of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to
@@ -140,10 +149,9 @@ std::enable_if_t<multi_gpu, ...>
 renumber_edgelist(raft::handle_t const& handle,
                   vertex_t const* local_vertices,
                   vertex_t num_local_vertices,
-                  vertex_t* edgelist_major_vertices /* [INOUT] */,
-                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
-                  edge_t num_edgelist_edges,
-                  bool is_hypergraph_partitioned,
+                  std::vector<vertex_t*> const& edgelist_major_vertices /* [INOUT] */,
+                  std::vector<vertex_t*> const& edgelist_minor_vertices /* [INOUT] */,
+                  std::vector<edge_t> const& edgelist_edge_counts,
                   bool do_expensive_check = false);

 /**
@@ -181,6 +189,102 @@ std::enable_if_t<!multi_gpu, ...> renumber_edgelist(
   edge_t num_edgelist_edges,
   bool do_expensive_check = false);
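For contrast with the multi-GPU overloads, a sketch of driving the unchanged single-GPU overload whose declaration forms the context above. This assumes the `!multi_gpu` specialization returns the renumber map labels as an rmm::device_uvector (as suggested by the garbled hunk header); the wrapper name is made up:

#include <experimental/graph_functions.hpp>
#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>

// Hypothetical wrapper: srcs/dsts hold external vertex IDs on device and are
// overwritten in-place with internal IDs; the result maps internal -> external.
rmm::device_uvector<int32_t> renumber(raft::handle_t const& handle,
                                      rmm::device_uvector<int32_t>& srcs,
                                      rmm::device_uvector<int32_t>& dsts)
{
  return cugraph::experimental::renumber_edgelist<int32_t, int32_t, false>(
    handle, srcs.data(), dsts.data(), static_cast<int32_t>(srcs.size()));
}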
+/**
+ * @brief Renumber external vertices to internal vertices based on the provided @p
+ * renumber_map_labels.
+ *
+ * Note cugraph::experimental::invalid_id<vertex_t>::value remains unchanged.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the vertices to be renumbered. The input external vertices are
+ * renumbered to internal vertices in-place.
+ * @param num_vertices Number of vertices to be renumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename vertex_t, bool multi_gpu>
+void renumber_ext_vertices(raft::handle_t const& handle,
+                           vertex_t* vertices /* [INOUT] */,
+                           size_t num_vertices,
+                           vertex_t const* renumber_map_labels,
+                           vertex_t local_int_vertex_first,
+                           vertex_t local_int_vertex_last,
+                           bool do_expensive_check = false);
+
+/**
+ * @brief Unrenumber local internal vertices to external vertices based on the provided @p
+ * renumber_map_labels.
+ *
+ * Note cugraph::experimental::invalid_id<vertex_t>::value remains unchanged.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the local internal vertices to be unrenumbered. Each input element
+ * should be in [@p local_int_vertex_first, @p local_int_vertex_last). The input internal vertices
+ * are renumbered to external vertices in-place.
+ * @param num_vertices Number of vertices to be unrenumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename vertex_t>
+void unrenumber_local_int_vertices(
+  raft::handle_t const& handle,
+  vertex_t* vertices /* [INOUT] */,
+  size_t num_vertices,
+  vertex_t const* renumber_map_labels /* size = local_int_vertex_last - local_int_vertex_first */,
+  vertex_t local_int_vertex_first,
+  vertex_t local_int_vertex_last,
+  bool do_expensive_check = false);
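A sketch of the intended round trip on a single GPU, where the local internal range [local_int_vertex_first, local_int_vertex_last) spans all of [0, V); the function names are the ones declared above, everything else is illustrative:

#include <experimental/graph_functions.hpp>
#include <raft/handle.hpp>

// Hypothetical single-GPU round trip: external -> internal -> external should be
// the identity for valid vertices (invalid IDs pass through unchanged).
void round_trip(raft::handle_t const& handle,
                int32_t* verts /* device pointer holding external vertex IDs */,
                size_t num_verts,
                int32_t const* renumber_map_labels,
                int32_t num_vertices /* V */)
{
  cugraph::experimental::renumber_ext_vertices<int32_t, false>(
    handle, verts, num_verts, renumber_map_labels, 0, num_vertices);
  cugraph::experimental::unrenumber_local_int_vertices(
    handle, verts, num_verts, renumber_map_labels, 0, num_vertices);
}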
+
+/**
+ * @brief Unrenumber (possibly non-local) internal vertices to external vertices based on the
+ * provided @p renumber_map_labels.
+ *
+ * Note cugraph::experimental::invalid_id<vertex_t>::value remains unchanged.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the internal vertices to be unrenumbered. The input internal vertices
+ * are renumbered to external vertices in-place.
+ * @param num_vertices Number of vertices to be unrenumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param vertex_partition_lasts Last local internal vertices (exclusive, assigned to each process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename vertex_t, bool multi_gpu>
+void unrenumber_int_vertices(raft::handle_t const& handle,
+                             vertex_t* vertices /* [INOUT] */,
+                             size_t num_vertices,
+                             vertex_t const* renumber_map_labels,
+                             vertex_t local_int_vertex_first,
+                             vertex_t local_int_vertex_last,
+                             std::vector<vertex_t>& vertex_partition_lasts,
+                             bool do_expensive_check = false);
+
 /**
  * @brief Compute the coarsened graph.
  *
diff --git a/cpp/include/experimental/graph_view.hpp b/cpp/include/experimental/graph_view.hpp
index 5d3d09bb087..47c93b42ca9 100644
--- a/cpp/include/experimental/graph_view.hpp
+++ b/cpp/include/experimental/graph_view.hpp
@@ -40,32 +40,11 @@ namespace experimental {
  *
  * We need to partition 1D vertex arrays (storing per vertex values) and the 2D graph adjacency
  * matrix (or transposed 2D graph adjacency matrix) of G. An 1D vertex array of size V is divided to
- * P linear partitions; each partition has the size close to V / P. We consider two different
- * strategies to partition the 2D matrix: the default strategy and the hypergraph partitioning based
- * strategy (the latter is for future extension).
- * FIXME: in the future we may use the latter for both as this leads to simpler communication
- * patterns and better control over parallelism vs memory footprint trade-off.
+ * P linear partitions; each partition has the size close to V / P.
  *
- * In the default case, one GPU will be responsible for 1 rectangular partition. The matrix will be
- * horizontally partitioned first to P_row slabs. Each slab will be further vertically partitioned
- * to P_col rectangles. Each rectangular partition will have the size close to V / P_row by V /
- * P_col.
- *
- * To be more specific, a GPU with (col_comm_rank, row_comm_rank) will be responsible for one
- * rectangular partition [a,b) by [c,d) where a = vertex_partition_offsets[row_comm_size *
- * col_comm_rank], b = vertex_partition_offsets[row_comm_size * (col_comm_rank + 1)], c =
- * vertex_partition_offsets[col_comm_size * row_comm_rank], and d =
- * vertex_partition_offsets[col_comm_size * (row_comm_rank + 1)].
- *
- * In the future, we may apply hyper-graph partitioning to divide V vertices to P groups minimizing
- * edge cuts across groups while balancing the number of vertices in each group. We will also
- * renumber vertices so the vertices in each group are mapped to consecutive integers. Then, there
- * will be more non-zeros in the diagonal partitions of the 2D graph adjacency matrix (or the
- * transposed 2D graph adjacency matrix) than the off-diagonal partitions. The default strategy does
- * not balance the number of nonzeros if hyper-graph partitioning is applied. To solve this problem,
- * the matrix is first horizontally partitioned to P slabs, then each slab will be further
- * vertically partitioned to P_row (instead of P_col in the default case) rectangles. One GPU will
- * be responsible col_comm_size rectangular partitions in this case.
+ * The 2D graph adjacency matrix is first horizontally partitioned to P slabs, then each slab will
+ * be further vertically partitioned to P_row rectangles. One GPU will be responsible for
+ * col_comm_size rectangular partitions.
  *
  * To be more specific, a GPU with (col_comm_rank, row_comm_rank) will be responsible for
  * col_comm_size rectangular partitions [a_i,b_i) by [c,d) where a_i =
@@ -85,13 +64,11 @@ class partition_t {
   partition_t() = default;

   partition_t(std::vector<vertex_t> const& vertex_partition_offsets,
-              bool hypergraph_partitioned,
               int row_comm_size,
               int col_comm_size,
               int row_comm_rank,
               int col_comm_rank)
     : vertex_partition_offsets_(vertex_partition_offsets),
-      hypergraph_partitioned_(hypergraph_partitioned),
       comm_rank_(col_comm_rank * row_comm_size + row_comm_rank),
       row_comm_size_(row_comm_size),
       col_comm_size_(col_comm_size),
@@ -159,10 +136,7 @@ class partition_t {
            get_vertex_partition_first(vertex_partition_idx);
   }

-  size_t get_number_of_matrix_partitions() const
-  {
-    return hypergraph_partitioned_ ? col_comm_size_ : 1;
-  }
+  size_t get_number_of_matrix_partitions() const { return col_comm_size_; }

   // major: row of the graph adjacency matrix (if the graph adjacency matrix is stored as is) or
   // column of the graph adjacency matrix (if the transposed graph adjacency matrix is stored).
@@ -175,16 +149,18 @@ class partition_t {

   vertex_t get_matrix_partition_major_first(size_t partition_idx) const
   {
-    return hypergraph_partitioned_
-             ? vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_]
-             : vertex_partition_offsets_[col_comm_rank_ * row_comm_size_];
+    return vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_];
   }

   vertex_t get_matrix_partition_major_last(size_t partition_idx) const
   {
-    return hypergraph_partitioned_
-             ? vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_ + 1]
-             : vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_];
+    return vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_ + 1];
+  }
+
+  vertex_t get_matrix_partition_major_size(size_t partition_idx) const
+  {
+    return get_matrix_partition_major_last(partition_idx) -
+           get_matrix_partition_major_first(partition_idx);
   }

   vertex_t get_matrix_partition_major_value_start_offset(size_t partition_idx) const
@@ -204,24 +180,21 @@ class partition_t {

   vertex_t get_matrix_partition_minor_first() const
   {
-    return hypergraph_partitioned_ ? vertex_partition_offsets_[col_comm_rank_ * row_comm_size_]
-                                   : vertex_partition_offsets_[row_comm_rank_ * col_comm_size_];
+    return vertex_partition_offsets_[col_comm_rank_ * row_comm_size_];
   }

   vertex_t get_matrix_partition_minor_last() const
   {
-    return hypergraph_partitioned_
-             ? vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_]
-             : vertex_partition_offsets_[(row_comm_rank_ + 1) * col_comm_size_];
+    return vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_];
   }

-  // FIXME: this function may be removed if we use the same partitioning strategy whether hypergraph
-  // partitioning is applied or not
-  bool is_hypergraph_partitioned() const { return hypergraph_partitioned_; }
+  vertex_t get_matrix_partition_minor_size() const
+  {
+    return get_matrix_partition_minor_last() - get_matrix_partition_minor_first();
+  }

  private:
   std::vector<vertex_t> vertex_partition_offsets_{};  // size = P + 1
-  bool hypergraph_partitioned_{false};

   int comm_rank_{0};
   int row_comm_size_{0};
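As a sanity check of the arithmetic above, a host-only sketch with toy numbers (no cuGraph types): it prints the col_comm_size rectangles one GPU owns under this scheme, mirroring the get_matrix_partition_major_first/last and minor_first/last formulas.

#include <cstdio>
#include <vector>

int main()
{
  int const row_comm_size = 2;
  int const col_comm_size = 3;                       // P = 6 GPUs, 2 x 3 grid
  std::vector<int> offsets{0, 2, 4, 6, 8, 10, 12};   // vertex_partition_offsets, size P + 1

  int const row_comm_rank = 1;  // this GPU's coordinates in the grid
  int const col_comm_rank = 2;

  // Partition i spans majors [offsets[row_comm_size * i + row_comm_rank],
  //                           offsets[row_comm_size * i + row_comm_rank + 1]).
  for (int i = 0; i < col_comm_size; ++i) {
    std::printf("partition %d: majors [%d, %d)\n",
                i,
                offsets[row_comm_size * i + row_comm_rank],
                offsets[row_comm_size * i + row_comm_rank + 1]);
  }
  // All partitions share the minor range
  // [offsets[col_comm_rank * row_comm_size], offsets[(col_comm_rank + 1) * row_comm_size]).
  std::printf("minors [%d, %d)\n",
              offsets[col_comm_rank * row_comm_size],
              offsets[(col_comm_rank + 1) * row_comm_size]);
  return 0;
}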
@@ -236,6 +209,7 @@ class partition_t {
 struct graph_properties_t {
   bool is_symmetric{false};
   bool is_multigraph{false};
+  bool is_weighted{false};
 };

 namespace detail {
@@ -277,6 +251,7 @@ class graph_base_t {

   bool is_symmetric() const { return properties_.is_symmetric; }
   bool is_multigraph() const { return properties_.is_multigraph; }
+  bool is_weighted() const { return properties_.is_weighted; }

  protected:
   raft::handle_t const* get_handle_ptr() const { return handle_ptr_; };
@@ -334,11 +309,6 @@ class graph_view_t<vertex_t,
-  bool is_weighted() const { return adj_matrix_partition_weights_.size() > 0; }
-
-  // FIXME: this should be removed once MNMG Louvain is updated to use graph primitives
-  partition_t<vertex_t> get_partition() const { return partition_; }
-
   vertex_t get_number_of_local_vertices() const
   {
     return partition_.get_local_vertex_last() - partition_.get_local_vertex_first();
@@ -421,6 +391,12 @@ class graph_view_t<vertex_t,
   rmm::device_uvector<weight_t> compute_in_weight_sums(raft::handle_t const& handle) const;
   rmm::device_uvector<weight_t> compute_out_weight_sums(raft::handle_t const& handle) const;

+  edge_t compute_max_in_degree(raft::handle_t const& handle) const;
+  edge_t compute_max_out_degree(raft::handle_t const& handle) const;
+
+  weight_t compute_max_in_weight_sum(raft::handle_t const& handle) const;
+  weight_t compute_max_out_weight_sum(raft::handle_t const& handle) const;
+
  private:
   std::vector<edge_t const*> adj_matrix_partition_offsets_{};
   std::vector<vertex_t const*> adj_matrix_partition_indices_{};
@@ -549,8 +535,6 @@ class graph_view_t<vertex_t,
   vertex_t get_number_of_local_vertices() const { return this->get_number_of_vertices(); }

   constexpr vertex_t get_local_vertex_first() const { return vertex_t{0}; }
@@ -628,8 +612,6 @@ class graph_view_t<vertex_t,
   rmm::device_uvector<weight_t> compute_in_weight_sums(raft::handle_t const& handle) const;
   rmm::device_uvector<weight_t> compute_out_weight_sums(raft::handle_t const& handle) const;

+  edge_t compute_max_in_degree(raft::handle_t const& handle) const;
+  edge_t compute_max_out_degree(raft::handle_t const& handle) const;
+
+  weight_t compute_max_in_weight_sum(raft::handle_t const& handle) const;
+  weight_t compute_max_out_weight_sum(raft::handle_t const& handle) const;
+
  private:
   edge_t const* offsets_{nullptr};
   vertex_t const* indices_{nullptr};
diff --git a/cpp/include/matrix_partition_device.cuh b/cpp/include/matrix_partition_device.cuh
index b41119e7be6..30d6540bcfe 100644
--- a/cpp/include/matrix_partition_device.cuh
+++ b/cpp/include/matrix_partition_device.cuh
@@ -192,7 +192,7 @@ class matrix_partition_device_t
[...]
diff --git a/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh b/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh
[...]
-      std::vector<size_t> rx_counts(row_comm_size, size_t{0});
-      std::vector<size_t> displacements(row_comm_size, size_t{0});
-      for (int i = 0; i < row_comm_size; ++i) {
-        rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i);
-        displacements[i] = (i == 0) ?
0 : displacements[i - 1] + rx_counts[i - 1]; - } - device_allgatherv(row_comm, - vertex_value_input_first, - matrix_major_value_output_first, - rx_counts, - displacements, - handle.get_stream()); + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + std::vector rx_counts(col_comm_size, size_t{0}); + std::vector displacements(col_comm_size, size_t{0}); + for (int i = 0; i < col_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(i * row_comm_size + row_comm_rank); + displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; } + device_allgatherv(col_comm, + vertex_value_input_first, + matrix_major_value_output_first, + rx_counts, + displacements, + handle.get_stream()); } else { assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols() @@ -101,80 +97,78 @@ void copy_to_matrix_major(raft::handle_t const& handle, using vertex_t = typename GraphViewType::vertex_type; if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - auto rx_counts = - host_scalar_allgather(row_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - - matrix_partition_device_t matrix_partition(graph_view, 0); - for (int i = 0; i < row_comm_size; ++i) { - rmm::device_uvector rx_vertices(row_comm_rank == i ? size_t{0} : rx_counts[i], - handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_counts[i], - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(rx_tmp_buffer); + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + auto rx_counts = + host_scalar_allgather(col_comm, + static_cast(thrust::distance(vertex_first, vertex_last)), + handle.get_stream()); + + for (int i = 0; i < col_comm_size; ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); + + rmm::device_uvector rx_vertices(col_comm_rank == i ? 
size_t{0} : rx_counts[i], + handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(rx_counts[i], + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); - if (row_comm_rank == i) { - vertex_partition_device_t vertex_partition(graph_view); - auto map_first = - thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { - return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); - }); - // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a - // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_value_input_first, - rx_value_first); - } + if (col_comm_rank == i) { + vertex_partition_device_t vertex_partition(graph_view); + auto map_first = + thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { + return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); + }); + // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a + // permutation iterator (and directly gathers to the internal buffer) + thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + map_first, + map_first + thrust::distance(vertex_first, vertex_last), + vertex_value_input_first, + rx_value_first); + } - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - device_bcast( - row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); - device_bcast( - row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + device_bcast( + col_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(col_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - if (row_comm_rank == i) { - auto map_first = - thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { - return matrix_partition.get_major_offset_from_major_nocheck(v); - }); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_major_value_output_first); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_major_offset_from_major_nocheck(v); - }); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_major_value_output_first); - } + if (col_comm_rank == i) { + auto map_first = + thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { + return matrix_partition.get_major_offset_from_major_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the 
internal buffer) + thrust::scatter( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_major_value_output_first + matrix_partition.get_major_value_start_offset()); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), [matrix_partition] __device__(auto v) { + return matrix_partition.get_major_offset_from_major_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_major_value_output_first + matrix_partition.get_major_value_start_offset()); } } } else { @@ -199,59 +193,27 @@ void copy_to_matrix_minor(raft::handle_t const& handle, MatrixMinorValueOutputIterator matrix_minor_value_output_first) { if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph - // partitioning - auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; - auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - // FIXME: this branch may be no longer necessary with NCCL backend - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_value_input_first, - vertex_value_input_first + graph_view.get_number_of_local_vertices(), - matrix_minor_value_output_first + - (graph_view.get_vertex_partition_first(comm_src_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size))); - } else { - device_sendrecv( - comm, - vertex_value_input_first, - static_cast(graph_view.get_number_of_local_vertices()), - comm_dst_rank, - matrix_minor_value_output_first + - (graph_view.get_vertex_partition_first(comm_src_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)), - static_cast(graph_view.get_vertex_partition_size(comm_src_rank)), - comm_src_rank, - handle.get_stream()); - } - - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - for (int i = 0; i < col_comm_size; ++i) { - auto offset = graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size); - auto count = graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + i); - device_bcast(col_comm, - matrix_minor_value_output_first + offset, - matrix_minor_value_output_first + offset, - count, - i, - handle.get_stream()); - } + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = 
row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + std::vector rx_counts(row_comm_size, size_t{0}); + std::vector displacements(row_comm_size, size_t{0}); + for (int i = 0; i < row_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i); + displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; } + device_allgatherv(row_comm, + vertex_value_input_first, + matrix_minor_value_output_first, + rx_counts, + displacements, + handle.get_stream()); } else { assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows() @@ -277,143 +239,75 @@ void copy_to_matrix_minor(raft::handle_t const& handle, using vertex_t = typename GraphViewType::vertex_type; if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - // FIXME: this P2P is unnecessary if apply the same partitioning scheme regardless of - // hypergraph partitioning is applied or not - auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; - auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - size_t tx_count = thrust::distance(vertex_first, vertex_last); - size_t rx_count{}; - // FIXME: it seems like raft::isend and raft::irecv do not properly handle the destination (or - // source) == self case. Need to double check and fix this if this is indeed the case (or RAFT - // may use ncclSend/ncclRecv instead of UCX for device data). 
- if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - rx_count = tx_count; - } else { - std::vector count_requests(2); - comm.isend(&tx_count, 1, comm_dst_rank, 0 /* tag */, count_requests.data()); - comm.irecv(&rx_count, 1, comm_src_rank, 0 /* tag */, count_requests.data() + 1); - comm.waitall(count_requests.size(), count_requests.data()); - } - - vertex_partition_device_t vertex_partition(graph_view); - rmm::device_uvector dst_vertices(rx_count, handle.get_stream()); - auto dst_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_count, + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + auto rx_counts = + host_scalar_allgather(row_comm, + static_cast(thrust::distance(vertex_first, vertex_last)), + handle.get_stream()); + + matrix_partition_device_t matrix_partition(graph_view, 0); + for (int i = 0; i < row_comm_size; ++i) { + rmm::device_uvector rx_vertices(row_comm_rank == i ? size_t{0} : rx_counts[i], + handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(rx_counts[i], handle.get_stream()); - auto dst_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(dst_tmp_buffer); - if (comm_src_rank == comm_rank) { - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_first, - vertex_last, - dst_vertices.begin()); - auto map_first = - thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { - return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); - }); - thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_value_input_first, - dst_value_first); - } else { - auto src_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(tx_count, - handle.get_stream()); - auto src_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(src_tmp_buffer); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); + if (row_comm_rank == i) { + vertex_partition_device_t vertex_partition(graph_view); auto map_first = thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); }); + // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a + // permutation iterator (and directly gathers to the internal buffer) thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), map_first, map_first + thrust::distance(vertex_first, vertex_last), vertex_value_input_first, - src_value_first); - - device_sendrecv( - comm, - vertex_first, - tx_count, - comm_dst_rank, - dst_vertices.begin(), - rx_count, - comm_src_rank, - handle.get_stream()); - - device_sendrecv(comm, - src_value_first, - tx_count, - comm_dst_rank, - dst_value_first, - rx_count, - comm_src_rank, - handle.get_stream()); + rx_value_first); } - // FIXME: now we can clear 
tx_tmp_buffer - - auto rx_counts = host_scalar_allgather(col_comm, rx_count, handle.get_stream()); - - matrix_partition_device_t matrix_partition(graph_view, 0); - for (int i = 0; i < col_comm_size; ++i) { - rmm::device_uvector rx_vertices(col_comm_rank == i ? size_t{0} : rx_counts[i], - handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_counts[i], - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(rx_tmp_buffer); - - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - device_bcast(col_comm, - dst_vertices.begin(), - rx_vertices.begin(), - rx_counts[i], - i, - handle.get_stream()); - device_bcast( - col_comm, dst_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - - if (col_comm_rank == i) { - auto map_first = thrust::make_transform_iterator( - dst_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_minor_offset_from_minor_nocheck(v); - }); - - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - dst_value_first, - dst_value_first + rx_counts[i], - map_first, - matrix_minor_value_output_first); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_minor_offset_from_minor_nocheck(v); - }); + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + device_bcast( + row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_minor_value_output_first); - } + if (row_comm_rank == i) { + auto map_first = + thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { + return matrix_partition.get_minor_offset_from_minor_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_minor_value_output_first); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), [matrix_partition] __device__(auto v) { + return matrix_partition.get_minor_offset_from_minor_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_minor_value_output_first); } } } else { diff --git a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh index 3059cf95852..e6a73a874ae 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh @@ -362,16 +362,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - auto loop_count = size_t{1}; - if (GraphViewType::is_multi_gpu) { - 
auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } - auto comm_rank = handle.comms_initialized() ? handle.get_comms().get_rank() : int{0}; - auto minor_tmp_buffer_size = (GraphViewType::is_multi_gpu && (in != GraphViewType::is_adj_matrix_transposed)) ? GraphViewType::is_adj_matrix_transposed @@ -386,10 +376,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, if (GraphViewType::is_multi_gpu) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto const row_comm_rank = row_comm.get_rank(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - minor_init = graph_view.is_hypergraph_partitioned() ? (row_comm_rank == 0) ? init : T{} - : (col_comm_rank == 0) ? init : T{}; + minor_init = (row_comm_rank == 0) ? init : T{}; } if (GraphViewType::is_multi_gpu) { @@ -407,24 +394,13 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, assert(minor_tmp_buffer_size == 0); } - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); - auto major_tmp_buffer_size = vertex_t{0}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - - major_tmp_buffer_size = - (in == GraphViewType::is_adj_matrix_transposed) - ? graph_view.is_hypergraph_partitioned() - ? matrix_partition.get_major_size() - : graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i) - : vertex_t{0}; - } + auto major_tmp_buffer_size = + GraphViewType::is_multi_gpu && (in == GraphViewType::is_adj_matrix_transposed) + ? matrix_partition.get_major_size() + : vertex_t{0}; auto major_tmp_buffer = allocate_dataframe_buffer(major_tmp_buffer_size, handle.get_stream()); auto major_buffer_first = get_dataframe_buffer_begin(major_tmp_buffer); @@ -432,12 +408,9 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto major_init = T{}; if (in == GraphViewType::is_adj_matrix_transposed) { if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - major_init = graph_view.is_hypergraph_partitioned() ? (col_comm_rank == 0) ? init : T{} - : (row_comm_rank == 0) ? init : T{}; + major_init = (col_comm_rank == 0) ? init : T{}; } else { major_init = init; } @@ -450,8 +423,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const row_comm_size = row_comm.get_size(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - comm_root_rank = graph_view.is_hypergraph_partitioned() ? 
i * row_comm_size + row_comm_rank - : col_comm_rank * row_comm_size + i; + comm_root_rank = i * row_comm_size + row_comm_rank; } if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) { @@ -505,25 +477,13 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - if (graph_view.is_hypergraph_partitioned()) { - device_reduce( - col_comm, - major_buffer_first, - vertex_value_output_first, - static_cast(graph_view.get_vertex_partition_size(i * row_comm_size + i)), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } else { - device_reduce(row_comm, - major_buffer_first, - vertex_value_output_first, - static_cast( - graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } + device_reduce(col_comm, + major_buffer_first, + vertex_value_output_first, + matrix_partition.get_major_size(), + raft::comms::op_t::SUM, + i, + handle.get_stream()); } } @@ -537,53 +497,17 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - for (int i = 0; i < col_comm_size; ++i) { - auto offset = (graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)); - auto size = static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + i)); - device_reduce(col_comm, - minor_buffer_first + offset, - minor_buffer_first + offset, - size, - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } - - // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph - // partitioning - auto comm_src_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - auto comm_dst_rank = row_comm_rank * col_comm_size + col_comm_rank; - // FIXME: this branch may no longer necessary with NCCL backend - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - auto offset = - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + col_comm_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size); - auto size = static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + col_comm_rank)); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - minor_buffer_first + offset, - minor_buffer_first + offset + size, - vertex_value_output_first); - } else { - device_sendrecv( - comm, - minor_buffer_first + - (graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + col_comm_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)), - static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + col_comm_rank)), - comm_dst_rank, - vertex_value_output_first, - static_cast(graph_view.get_vertex_partition_size(comm_rank)), - comm_src_rank, - handle.get_stream()); - } + for (int i = 0; i < row_comm_size; ++i) { + auto offset = (graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size + i) - + graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size)); + device_reduce(row_comm, + minor_buffer_first + offset, + vertex_value_output_first, + static_cast( + graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)), + 
raft::comms::op_t::SUM, + i, + handle.get_stream()); } } } diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 19a5f67c9de..22dc2041793 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -170,8 +171,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( */ template ::value_type, + static_assert(std::is_same::value_type, typename GraphViewType::vertex_type>::value); + static_assert(std::is_same::value_type, + typename std::iterator_traits::value_type>::value); static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); using vertex_t = typename GraphViewType::vertex_type; @@ -206,64 +209,113 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( // 1. build a cuco::static_map object for the k, v pairs. auto kv_map_ptr = std::make_unique>( - static_cast(static_cast(thrust::distance(map_key_first, map_key_last)) / - load_factor), - invalid_vertex_id::value, - invalid_vertex_id::value); - auto pair_first = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)), - [] __device__(auto val) { - return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); - }); - kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); - - // 2. aggregate each vertex out-going edges based on keys and transform-reduce. - - auto loop_count = size_t{1}; + size_t{0}, invalid_vertex_id::value, invalid_vertex_id::value); if (GraphViewType::is_multi_gpu) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? 
graph_view.get_number_of_local_adj_matrix_partitions() : static_cast<size_t>(row_comm_size);
+
+    auto map_counts =
+      host_scalar_allgather(row_comm,
+                            static_cast<size_t>(thrust::distance(map_key_first, map_key_last)),
+                            handle.get_stream());
+    std::vector<size_t> map_displacements(row_comm_size, size_t{0});
+    std::partial_sum(map_counts.begin(), map_counts.end() - 1, map_displacements.begin() + 1);
+    rmm::device_uvector<vertex_t> map_keys(map_displacements.back() + map_counts.back(),
+                                           handle.get_stream());
+    auto map_value_buffer =
+      allocate_dataframe_buffer<value_t>(map_keys.size(), handle.get_stream());
+    for (int i = 0; i < row_comm_size; ++i) {
+      device_bcast(row_comm,
+                   map_key_first,
+                   map_keys.begin() + map_displacements[i],
+                   map_counts[i],
+                   i,
+                   handle.get_stream());
+      device_bcast(row_comm,
+                   map_value_first,
+                   get_dataframe_buffer_begin<value_t>(map_value_buffer) + map_displacements[i],
+                   map_counts[i],
+                   i,
+                   handle.get_stream());
+    }
+    // FIXME: these copies are unnecessary, better fix RAFT comm's bcast to take separate input &
+    // output pointers
+    thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                 map_key_first,
+                 map_key_last,
+                 map_keys.begin() + map_displacements[row_comm_rank]);
+    thrust::copy(
+      rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+      map_value_first,
+      map_value_first + thrust::distance(map_key_first, map_key_last),
+      get_dataframe_buffer_begin<value_t>(map_value_buffer) + map_displacements[row_comm_rank]);
+
+    handle.get_stream_view().synchronize();  // cuco::static_map currently does not take stream
+
+    kv_map_ptr.reset();
+
+    kv_map_ptr = std::make_unique<cuco::static_map<vertex_t, value_t>>(
+      // FIXME: std::max(..., ...) as a temporary workaround for
+      // https://github.com/NVIDIA/cuCollections/issues/72 and
+      // https://github.com/NVIDIA/cuCollections/issues/73
+      std::max(static_cast<size_t>(static_cast<double>(map_keys.size()) / load_factor),
+               static_cast<size_t>(thrust::distance(map_key_first, map_key_last)) + 1),
+      invalid_vertex_id<vertex_t>::value,
+      invalid_vertex_id<vertex_t>::value);
+
+    auto pair_first = thrust::make_transform_iterator(
+      thrust::make_zip_iterator(thrust::make_tuple(
+        map_keys.begin(), get_dataframe_buffer_begin<value_t>(map_value_buffer))),
+      [] __device__(auto val) {
+        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+      });
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (map_keys.size()) { kv_map_ptr->insert(pair_first, pair_first + map_keys.size()); }
+  } else {
+    handle.get_stream_view().synchronize();  // cuco::static_map currently does not take stream
+
+    kv_map_ptr.reset();
+
+    kv_map_ptr = std::make_unique<cuco::static_map<vertex_t, value_t>>(
+      // FIXME: std::max(..., ...) as a temporary workaround for
+      // https://github.com/NVIDIA/cuCollections/issues/72 and
+      // https://github.com/NVIDIA/cuCollections/issues/73
+      std::max(static_cast<size_t>(
+                 static_cast<double>(thrust::distance(map_key_first, map_key_last)) / load_factor),
+               static_cast<size_t>(thrust::distance(map_key_first, map_key_last)) + 1),
+      invalid_vertex_id<vertex_t>::value,
+      invalid_vertex_id<vertex_t>::value);
+
+    auto pair_first = thrust::make_transform_iterator(
+      thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)),
+      [] __device__(auto val) {
+        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+      });
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+ if (thrust::distance(map_key_first, map_key_last) > 0) { + kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + } } + // 2. aggregate each vertex out-going edges based on keys and transform-reduce. + rmm::device_uvector major_vertices(0, handle.get_stream()); auto e_op_result_buffer = allocate_dataframe_buffer(0, handle.get_stream()); - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); - - int comm_root_rank = 0; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - comm_root_rank = graph_view.is_hypergraph_partitioned() ? i * row_comm_size + row_comm_rank - : col_comm_rank * row_comm_size + i; - } + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); - auto num_edges = thrust::transform_reduce( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - thrust::make_counting_iterator(graph_view.get_vertex_partition_first(comm_root_rank)), - thrust::make_counting_iterator(graph_view.get_vertex_partition_last(comm_root_rank)), - [matrix_partition] __device__(auto row) { - auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); - return matrix_partition.get_local_degree(row_offset); - }, - edge_t{0}, - thrust::plus()); - - rmm::device_uvector tmp_major_vertices(num_edges, handle.get_stream()); + rmm::device_uvector tmp_major_vertices(matrix_partition.get_number_of_edges(), + handle.get_stream()); rmm::device_uvector tmp_minor_keys(tmp_major_vertices.size(), handle.get_stream()); rmm::device_uvector tmp_key_aggregated_edge_weights(tmp_major_vertices.size(), handle.get_stream()); - if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) { + if (matrix_partition.get_major_size() > 0) { raft::grid_1d_thread_t update_grid( - graph_view.get_vertex_partition_size(comm_root_rank), + matrix_partition.get_major_size(), detail::copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -277,8 +329,8 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( 0, handle.get_stream()>>>( matrix_partition, - graph_view.get_vertex_partition_first(comm_root_rank), - graph_view.get_vertex_partition_last(comm_root_rank), + matrix_partition.get_major_first(), + matrix_partition.get_major_last(), adj_matrix_col_key_first, tmp_major_vertices.data(), tmp_minor_keys.data(), @@ -300,10 +352,14 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( tmp_key_aggregated_edge_weights.resize(tmp_major_vertices.size(), handle.get_stream()); if (GraphViewType::is_multi_gpu) { - auto& sub_comm = handle.get_subcomm(graph_view.is_hypergraph_partitioned() - ? 
cugraph::partition_2d::key_naming_t().col_name() - : cugraph::partition_2d::key_naming_t().row_name()); - auto const sub_comm_size = sub_comm.get_size(); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); triplet_first = thrust::make_zip_iterator(thrust::make_tuple(tmp_major_vertices.begin(), @@ -315,11 +371,13 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( std::forward_as_tuple( std::tie(rx_major_vertices, rx_minor_keys, rx_key_aggregated_edge_weights), std::ignore) = groupby_gpuid_and_shuffle_values( - sub_comm, + col_comm, triplet_first, triplet_first + tmp_major_vertices.size(), - [key_func = detail::compute_gpu_id_from_vertex_t{sub_comm_size}] __device__( - auto val) { return key_func(thrust::get<1>(val)); }, + [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}, + row_comm_size] __device__(auto val) { + return key_func(thrust::get<1>(val)) / row_comm_size; + }, handle.get_stream()); auto pair_first = thrust::make_zip_iterator( @@ -355,56 +413,52 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( triplet_first = thrust::make_zip_iterator(thrust::make_tuple( tmp_major_vertices.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_weights.begin())); - thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - triplet_first, - triplet_first + tmp_major_vertices.size(), - tmp_e_op_result_buffer_first, - [adj_matrix_row_value_input_first, - key_aggregated_e_op, - matrix_partition, - kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { - auto major = thrust::get<0>(val); - auto key = thrust::get<1>(val); - auto w = thrust::get<2>(val); - return key_aggregated_e_op( - major, - key, - w, - *(adj_matrix_row_value_input_first + - matrix_partition.get_major_offset_from_major_nocheck(major)), - kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); - }); + thrust::transform( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + triplet_first, + triplet_first + tmp_major_vertices.size(), + tmp_e_op_result_buffer_first, + [adj_matrix_row_value_input_first = + adj_matrix_row_value_input_first + matrix_partition.get_major_value_start_offset(), + key_aggregated_e_op, + matrix_partition, + kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { + auto major = thrust::get<0>(val); + auto key = thrust::get<1>(val); + auto w = thrust::get<2>(val); + return key_aggregated_e_op(major, + key, + w, + *(adj_matrix_row_value_input_first + + matrix_partition.get_major_offset_from_major_nocheck(major)), + kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); + }); tmp_minor_keys.resize(0, handle.get_stream()); tmp_key_aggregated_edge_weights.resize(0, handle.get_stream()); tmp_minor_keys.shrink_to_fit(handle.get_stream()); tmp_key_aggregated_edge_weights.shrink_to_fit(handle.get_stream()); if (GraphViewType::is_multi_gpu) { - auto& sub_comm = handle.get_subcomm(graph_view.is_hypergraph_partitioned() - ? 
cugraph::partition_2d::key_naming_t().col_name() - : cugraph::partition_2d::key_naming_t().row_name()); - auto const sub_comm_rank = sub_comm.get_rank(); - auto const sub_comm_size = sub_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); // FIXME: additional optimization is possible if reduce_op is a pure function (and reduce_op // can be mapped to ncclRedOp_t). auto rx_sizes = - host_scalar_gather(sub_comm, tmp_major_vertices.size(), i, handle.get_stream()); - std::vector rx_displs( - static_cast(sub_comm_rank) == i ? sub_comm_size : int{0}, size_t{0}); - if (static_cast(sub_comm_rank) == i) { + host_scalar_gather(col_comm, tmp_major_vertices.size(), i, handle.get_stream()); + std::vector rx_displs{}; + rmm::device_uvector rx_major_vertices(0, handle.get_stream()); + if (static_cast(col_comm_rank) == i) { + rx_displs.assign(col_comm_size, size_t{0}); std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); + rx_major_vertices.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); } - rmm::device_uvector rx_major_vertices( - static_cast(sub_comm_rank) == i - ? std::accumulate(rx_sizes.begin(), rx_sizes.end(), size_t{0}) - : size_t{0}, - handle.get_stream()); auto rx_tmp_e_op_result_buffer = allocate_dataframe_buffer(rx_major_vertices.size(), handle.get_stream()); - device_gatherv(sub_comm, + device_gatherv(col_comm, tmp_major_vertices.data(), rx_major_vertices.data(), tmp_major_vertices.size(), @@ -412,7 +466,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( rx_displs, i, handle.get_stream()); - device_gatherv(sub_comm, + device_gatherv(col_comm, tmp_e_op_result_buffer_first, get_dataframe_buffer_begin(rx_tmp_e_op_result_buffer), tmp_major_vertices.size(), @@ -421,7 +475,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( i, handle.get_stream()); - if (static_cast(sub_comm_rank) == i) { + if (static_cast(col_comm_rank) == i) { major_vertices = std::move(rx_major_vertices); e_op_result_buffer = std::move(rx_tmp_e_op_result_buffer); } diff --git a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh index e621ed91ddb..34721c75e31 100644 --- a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -179,20 +179,10 @@ transform_reduce_by_adj_matrix_row_col_key_e( using edge_t = typename GraphViewType::edge_type; using weight_t = typename GraphViewType::weight_type; - auto loop_count = size_t{1}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } - rmm::device_uvector keys(0, handle.get_stream()); auto value_buffer = allocate_dataframe_buffer(0, handle.get_stream()); - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 
0 : i); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); int comm_root_rank = 0; if (GraphViewType::is_multi_gpu) { @@ -201,8 +191,7 @@ transform_reduce_by_adj_matrix_row_col_key_e( auto const row_comm_size = row_comm.get_size(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - comm_root_rank = graph_view.is_hypergraph_partitioned() ? i * row_comm_size + row_comm_rank - : col_comm_rank * row_comm_size + i; + comm_root_rank = i * row_comm_size + row_comm_rank; } auto num_edges = thrust::transform_reduce( @@ -224,6 +213,13 @@ transform_reduce_by_adj_matrix_row_col_key_e( detail::transform_reduce_by_key_e_for_all_block_size, handle.get_device_properties().maxGridSize[0]); + auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? vertex_t{0} + : matrix_partition.get_major_value_start_offset(); + auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_value_start_offset() + : vertex_t{0}; + // FIXME: This is highly inefficient for graphs with high-degree vertices. If we renumber // vertices to insure that rows within a partition are sorted by their out-degree in // decreasing order, we will apply this kernel only to low out-degree vertices. @@ -232,9 +228,10 @@ transform_reduce_by_adj_matrix_row_col_key_e( matrix_partition, graph_view.get_vertex_partition_first(comm_root_rank), graph_view.get_vertex_partition_last(comm_root_rank), - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_row_col_key_first, + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + adj_matrix_row_col_key_first + + (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); diff --git a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh index 4efd32bcac7..4d557b97a30 100644 --- a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh +++ b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh @@ -25,12 +25,14 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -115,12 +117,10 @@ __global__ void for_all_frontier_row_for_all_nbr_low_degree( static_assert(sizeof(unsigned long long int) == sizeof(size_t)); auto buffer_idx = atomicAdd(reinterpret_cast(buffer_idx_ptr), static_cast(1)); - *(buffer_key_output_first + buffer_idx) = col; - *(buffer_payload_output_first + buffer_idx) = - remove_first_thrust_tuple_element()(e_op_result); + *(buffer_key_output_first + buffer_idx) = col; + *(buffer_payload_output_first + buffer_idx) = thrust::get<1>(e_op_result); } } - idx += gridDim.x * blockDim.x; } } @@ -155,8 +155,8 @@ size_t reduce_buffer_elements(raft::handle_t const& handle, // temporary buffer size exceeds the maximum buffer size (may be definied as percentage of the // system HBM size or a function of the maximum number of threads in the system)) // FIXME: actually, we can find how many unique keys are here by now. - // FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding the - // vertex unless reduce_op is a pure function. 
+ // FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding + // the vertex unless reduce_op is a pure function. rmm::device_uvector keys(num_buffer_elements, handle.get_stream()); auto value_buffer = allocate_dataframe_buffer(num_buffer_elements, handle.get_stream()); @@ -234,8 +234,7 @@ __global__ void update_frontier_and_vertex_output_values( auto v_op_result = v_op(v_val, payload); selected_bucket_idx = thrust::get<0>(v_op_result); if (selected_bucket_idx != invalid_bucket_idx) { - *(vertex_value_output_first + key_offset) = - remove_first_thrust_tuple_element()(v_op_result); + *(vertex_value_output_first + key_offset) = thrust::get<1>(v_op_result); bucket_block_local_offsets[selected_bucket_idx] = 1; } } @@ -349,25 +348,18 @@ void update_frontier_v_push_if_out_nbr( static_assert(!GraphViewType::is_adj_matrix_transposed, "GraphViewType should support the push model."); - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using payload_t = typename ReduceOp::type; // 1. fill the buffer - vertex_frontier.set_buffer_idx_value(0); - - auto loop_count = size_t{1}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } - - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); + rmm::device_uvector keys(size_t{0}, handle.get_stream()); + auto payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + rmm::device_scalar buffer_idx(size_t{0}, handle.get_stream()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); rmm::device_uvector frontier_rows( 0, handle.get_stream()); // relevant only if GraphViewType::is_multi_gpu is true @@ -380,22 +372,18 @@ void update_frontier_v_push_if_out_nbr( auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - auto sub_comm_rank = graph_view.is_hypergraph_partitioned() ? col_comm_rank : row_comm_rank; - frontier_size = host_scalar_bcast( - graph_view.is_hypergraph_partitioned() ? col_comm : row_comm, - (static_cast(sub_comm_rank) == i) ? thrust::distance(vertex_first, vertex_last) - : size_t{0}, - i, - handle.get_stream()); + auto sub_comm_rank = col_comm_rank; + frontier_size = host_scalar_bcast(col_comm, + (static_cast(sub_comm_rank) == i) + ? thrust::distance(vertex_first, vertex_last) + : size_t{0}, + i, + handle.get_stream()); if (static_cast(sub_comm_rank) != i) { frontier_rows.resize(frontier_size, handle.get_stream()); } - device_bcast(graph_view.is_hypergraph_partitioned() ? 
col_comm : row_comm, - vertex_first, - frontier_rows.begin(), - frontier_size, - i, - handle.get_stream()); + device_bcast( + col_comm, vertex_first, frontier_rows.begin(), frontier_size, i, handle.get_stream()); } else { frontier_size = thrust::distance(vertex_first, vertex_last); } @@ -439,10 +427,8 @@ void update_frontier_v_push_if_out_nbr( // locking. // FIXME: if i != 0, this will require costly reallocation if we don't use the new CUDA feature // to reserve address space. - vertex_frontier.resize_buffer(vertex_frontier.get_buffer_idx_value() + max_pushes); - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first); - auto buffer_payload_first = std::get<1>(buffer_first); + keys.resize(buffer_idx.value(handle.get_stream()) + max_pushes, handle.get_stream()); + resize_dataframe_buffer(payload_buffer, keys.size(), handle.get_stream()); auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed ? vertex_t{0} @@ -467,9 +453,9 @@ void update_frontier_v_push_if_out_nbr( frontier_rows.end(), adj_matrix_row_value_input_first + row_value_input_offset, adj_matrix_col_value_input_first, - buffer_key_first, - buffer_payload_first, - vertex_frontier.get_buffer_idx_ptr(), + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + buffer_idx.data(), e_op); } else { detail::for_all_frontier_row_for_all_nbr_low_degree<<(payload_buffer), + buffer_idx.data(), e_op); } } @@ -491,18 +477,12 @@ void update_frontier_v_push_if_out_nbr( // 2. reduce the buffer - auto num_buffer_offset = edge_t{0}; - - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - - auto num_buffer_elements = detail::reduce_buffer_elements(handle, - buffer_key_first, - buffer_payload_first, - vertex_frontier.get_buffer_idx_value(), - reduce_op); - + auto num_buffer_elements = + detail::reduce_buffer_elements(handle, + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + buffer_idx.value(handle.get_stream()), + reduce_op); if (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); @@ -513,12 +493,9 @@ void update_frontier_v_push_if_out_nbr( auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - std::vector h_vertex_lasts(graph_view.is_hypergraph_partitioned() ? row_comm_size - : col_comm_size); + std::vector h_vertex_lasts(row_comm_size); for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { - h_vertex_lasts[i] = graph_view.get_vertex_partition_last( - graph_view.is_hypergraph_partitioned() ? 
col_comm_rank * row_comm_size + i - : row_comm_rank * col_comm_size + i); + h_vertex_lasts[i] = graph_view.get_vertex_partition_last(col_comm_rank * row_comm_size + i); } rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); @@ -527,8 +504,8 @@ void update_frontier_v_push_if_out_nbr( rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), handle.get_stream()); thrust::lower_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - buffer_key_first, - buffer_key_first + num_buffer_elements, + keys.begin(), + keys.begin() + num_buffer_elements, d_vertex_lasts.begin(), d_vertex_lasts.end(), d_tx_buffer_last_boundaries.begin()); @@ -537,122 +514,35 @@ void update_frontier_v_push_if_out_nbr( d_tx_buffer_last_boundaries.data(), d_tx_buffer_last_boundaries.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + handle.get_stream_view().synchronize(); std::vector tx_counts(h_tx_buffer_last_boundaries.size()); std::adjacent_difference( h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); - std::vector rx_counts(graph_view.is_hypergraph_partitioned() ? row_comm_size - : col_comm_size); - std::vector count_requests(tx_counts.size() + rx_counts.size()); - size_t tx_self_i = std::numeric_limits::max(); - for (size_t i = 0; i < tx_counts.size(); ++i) { - auto comm_dst_rank = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : row_comm_rank * col_comm_size + static_cast(i); - if (comm_dst_rank == comm_rank) { - tx_self_i = i; - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms - count_requests[i] = std::numeric_limits::max(); - } else { - comm.isend(&tx_counts[i], 1, comm_dst_rank, 0 /* tag */, count_requests.data() + i); - } - } - for (size_t i = 0; i < rx_counts.size(); ++i) { - auto comm_src_rank = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : static_cast(i) * row_comm_size + comm_rank / col_comm_size; - if (comm_src_rank == comm_rank) { - assert(tx_self_i != std::numeric_limits::max()); - rx_counts[i] = tx_counts[tx_self_i]; - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms - count_requests[tx_counts.size() + i] = std::numeric_limits::max(); - } else { - comm.irecv(&rx_counts[i], - 1, - comm_src_rank, - 0 /* tag */, - count_requests.data() + tx_counts.size() + i); - } - } - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms, if - // raft::comms::wait immediately returns on seeing request_null, this remove is unnecessary - count_requests.erase(std::remove(count_requests.begin(), - count_requests.end(), - std::numeric_limits::max()), - count_requests.end()); - comm.waitall(count_requests.size(), count_requests.data()); - - std::vector tx_offsets(tx_counts.size() + 1, edge_t{0}); - std::partial_sum(tx_counts.begin(), tx_counts.end(), tx_offsets.begin() + 1); - std::vector rx_offsets(rx_counts.size() + 1, edge_t{0}); - std::partial_sum(rx_counts.begin(), rx_counts.end(), rx_offsets.begin() + 1); - - // FIXME: this will require costly reallocation if we don't use the new CUDA feature to reserve - // address space. 
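// NOTE: the hand-rolled count exchange removed here (isend/irecv of per-rank counts, manual
// tx/rx offset bookkeeping, then device_multicast_sendrecv) is superseded by shuffle_values(),
// which exchanges the per-rank counts internally and returns the received buffer together with
// the rx counts. A sketch of the calling pattern, mirroring the replacement code further down
// in this hunk:
//
//   // tx_counts: one send count per rank of row_comm, derived from the lower_bound boundaries
//   rmm::device_uvector<vertex_t> rx_keys(size_t{0}, handle.get_stream());
//   std::tie(rx_keys, std::ignore) =
//     shuffle_values(row_comm, keys.begin(), tx_counts, handle.get_stream());
//   keys = std::move(rx_keys);
//
// leaving only the tx_counts computation on the host side.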
- // FIXME: std::max(actual size, 1) as ncclRecv currently hangs if recvuff is nullptr even if - // count is 0 - vertex_frontier.resize_buffer(std::max(num_buffer_elements + rx_offsets.back(), size_t(1))); - - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - - std::vector tx_dst_ranks(tx_counts.size()); - std::vector rx_src_ranks(rx_counts.size()); - for (size_t i = 0; i < tx_dst_ranks.size(); ++i) { - tx_dst_ranks[i] = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : row_comm_rank * col_comm_size + static_cast(i); - } - for (size_t i = 0; i < rx_src_ranks.size(); ++i) { - rx_src_ranks[i] = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : static_cast(i) * row_comm_size + comm_rank / col_comm_size; - } - - device_multicast_sendrecv( - comm, - buffer_key_first, - tx_counts, - tx_offsets, - tx_dst_ranks, - buffer_key_first + num_buffer_elements, - rx_counts, - rx_offsets, - rx_src_ranks, - handle.get_stream()); - device_multicast_sendrecv( - comm, - buffer_payload_first, - tx_counts, - tx_offsets, - tx_dst_ranks, - buffer_payload_first + num_buffer_elements, - rx_counts, - rx_offsets, - rx_src_ranks, - handle.get_stream()); - - // FIXME: this does not exploit the fact that each segment is sorted. Lost performance - // optimization opportunities. - // FIXME: we can use [vertex_frontier.buffer_begin(), vertex_frontier.buffer_begin() + - // num_buffer_elements) as temporary buffer inside reduce_buffer_elements(). - num_buffer_offset = num_buffer_elements; - num_buffer_elements = detail::reduce_buffer_elements(handle, - buffer_key_first + num_buffer_elements, - buffer_payload_first + num_buffer_elements, - rx_offsets.back(), - reduce_op); + rmm::device_uvector rx_keys(size_t{0}, handle.get_stream()); + std::tie(rx_keys, std::ignore) = + shuffle_values(row_comm, keys.begin(), tx_counts, handle.get_stream()); + keys = std::move(rx_keys); + + auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, std::ignore) = + shuffle_values(row_comm, + get_dataframe_buffer_begin(payload_buffer), + tx_counts, + handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + + num_buffer_elements = + detail::reduce_buffer_elements(handle, + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + keys.size(), + reduce_op); } // 3. 
update vertex properties if (num_buffer_elements > 0) { - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - raft::grid_1d_thread_t update_grid(num_buffer_elements, detail::update_frontier_v_push_if_out_nbr_update_block_size, handle.get_device_properties().maxGridSize[0]); @@ -666,8 +556,8 @@ void update_frontier_v_push_if_out_nbr( detail::update_frontier_and_vertex_output_values <<>>( vertex_partition, - buffer_key_first, - buffer_payload_first, + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), num_buffer_elements, vertex_value_input_first, vertex_value_output_first, @@ -690,21 +580,5 @@ void update_frontier_v_push_if_out_nbr( } } -/* - -FIXME: - -iterating over lower triangular (or upper triangular) : triangle counting -LRB might be necessary if the cost of processing an edge (i, j) is a function of degree(i) and -degree(j) : triangle counting -push-pull switching support (e.g. DOBFS), in this case, we need both -CSR & CSC (trade-off execution time vs memory requirement, unless graph is symmetric) -if graph is symmetric, there will be additional optimization opportunities (e.g. in-degree == -out-degree) For BFS, sending a bit vector (for the entire set of dest vertices per partitoin may -work better we can use thrust::set_intersection for triangle counting think about adding thrust -wrappers for reduction functions. Can I pass nullptr for dummy -instead of thrust::make_counting_iterator(0)? -*/ - } // namespace experimental } // namespace cugraph diff --git a/cpp/include/patterns/vertex_frontier.cuh b/cpp/include/patterns/vertex_frontier.cuh index c11142d3cf7..375ec097850 100644 --- a/cpp/include/patterns/vertex_frontier.cuh +++ b/cpp/include/patterns/vertex_frontier.cuh @@ -48,26 +48,6 @@ inline size_t round_up(size_t number_to_round, size_t modulus) return ((number_to_round + (modulus - 1)) / modulus) * modulus; } -template -auto make_buffer_zip_iterator_impl(std::vector& buffer_ptrs, - size_t offset, - std::index_sequence) -{ - auto key_ptr = reinterpret_cast(buffer_ptrs[0]) + offset; - auto payload_it = thrust::make_zip_iterator( - thrust::make_tuple(reinterpret_cast::type*>( - buffer_ptrs[1 + Is])...)); - return std::make_tuple(key_ptr, payload_it); -} - -template -auto make_buffer_zip_iterator(std::vector& buffer_ptrs, size_t offset) -{ - size_t constexpr tuple_size = thrust::tuple_size::value; - return make_buffer_zip_iterator_impl( - buffer_ptrs, offset, std::make_index_sequence()); -} - template __global__ void move_and_invalidate_if(RowIterator row_first, RowIterator row_last, @@ -199,10 +179,7 @@ class Bucket { size_t size_{0}; }; -template +template class VertexFrontier { public: static size_t constexpr kNumBuckets = num_buckets; @@ -211,9 +188,7 @@ class VertexFrontier { VertexFrontier(raft::handle_t const& handle, std::vector bucket_capacities) : handle_ptr_(&handle), tmp_bucket_ptrs_(num_buckets, handle.get_stream()), - tmp_bucket_sizes_(num_buckets, handle.get_stream()), - buffer_ptrs_(kReduceInputTupleSize + 1 /* to store destination column number */, nullptr), - buffer_idx_(0, handle_ptr_->get_stream()) + tmp_bucket_sizes_(num_buckets, handle.get_stream()) { CUGRAPH_EXPECTS(bucket_capacities.size() == num_buckets, "invalid input argument bucket_capacities (size mismatch)"); @@ -228,7 +203,6 @@ class VertexFrontier { for (size_t i = 0; i < num_buckets; ++i) { buckets_.emplace_back(handle, 
bucket_capacities[i]); } - buffer_.set_stream(handle_ptr_->get_stream()); } Bucket& get_bucket(size_t bucket_idx) { return buckets_[bucket_idx]; } @@ -311,90 +285,11 @@ class VertexFrontier { return std::make_tuple(tmp_bucket_ptrs_.data(), tmp_bucket_sizes_.data()); } - void resize_buffer(size_t size) - { - // FIXME: rmm::device_buffer resize incurs copy if memory is reallocated, which is unnecessary - // in this case. - buffer_.resize(compute_aggregate_buffer_size_in_bytes(size), handle_ptr_->get_stream()); - if (size > buffer_capacity_) { - buffer_capacity_ = size; - update_buffer_ptrs(); - } - buffer_size_ = size; - } - - void clear_buffer() { resize_buffer(0); } - - void shrink_to_fit_buffer() - { - if (buffer_size_ != buffer_capacity_) { - // FIXME: rmm::device_buffer shrink_to_fit incurs copy if memory is reallocated, which is - // unnecessary in this case. - buffer_.shrink_to_fit(handle_ptr_->get_stream()); - update_buffer_ptrs(); - buffer_capacity_ = buffer_size_; - } - } - - auto buffer_begin() - { - return detail::make_buffer_zip_iterator(buffer_ptrs_, 0); - } - - auto buffer_end() - { - return detail::make_buffer_zip_iterator(buffer_ptrs_, - buffer_size_); - } - - auto get_buffer_idx_ptr() { return buffer_idx_.data(); } - - size_t get_buffer_idx_value() { return buffer_idx_.value(handle_ptr_->get_stream()); } - - void set_buffer_idx_value(size_t value) - { - buffer_idx_.set_value(value, handle_ptr_->get_stream()); - } - private: - static size_t constexpr kReduceInputTupleSize = thrust::tuple_size::value; - static size_t constexpr kBufferAlignment = 128; - raft::handle_t const* handle_ptr_{nullptr}; std::vector> buckets_{}; rmm::device_uvector tmp_bucket_ptrs_; rmm::device_uvector tmp_bucket_sizes_; - - std::array tuple_element_sizes_ = - compute_thrust_tuple_element_sizes()(); - std::vector buffer_ptrs_{}; - rmm::device_buffer buffer_{}; - size_t buffer_size_{0}; - size_t buffer_capacity_{0}; - rmm::device_scalar buffer_idx_{}; - - // FIXME: better pick between this apporach or the approach used in allocate_comm_buffer - size_t compute_aggregate_buffer_size_in_bytes(size_t size) - { - size_t aggregate_buffer_size_in_bytes = - detail::round_up(sizeof(vertex_t) * size, kBufferAlignment); - for (size_t i = 0; i < kReduceInputTupleSize; ++i) { - aggregate_buffer_size_in_bytes += - detail::round_up(tuple_element_sizes_[i] * size, kBufferAlignment); - } - return aggregate_buffer_size_in_bytes; - } - - void update_buffer_ptrs() - { - uintptr_t ptr = reinterpret_cast(buffer_.data()); - buffer_ptrs_[0] = reinterpret_cast(ptr); - ptr += detail::round_up(sizeof(vertex_t) * buffer_capacity_, kBufferAlignment); - for (size_t i = 0; i < kReduceInputTupleSize; ++i) { - buffer_ptrs_[1 + i] = reinterpret_cast(ptr); - ptr += detail::round_up(tuple_element_sizes_[i] * buffer_capacity_, kBufferAlignment); - } - } }; } // namespace experimental diff --git a/cpp/include/utilities/collect_comm.cuh b/cpp/include/utilities/collect_comm.cuh index 8d2227c0f60..481717d7c38 100644 --- a/cpp/include/utilities/collect_comm.cuh +++ b/cpp/include/utilities/collect_comm.cuh @@ -58,13 +58,18 @@ collect_values_for_keys(raft::comms::comms_t const &comm, double constexpr load_factor = 0.7; // FIXME: we may compare the performance & memory footprint of this hash based approach vs binary - // search based approach + // search based approach (especially when thrust::distance(collect_key_first, collect_key_last) << + // thrust::distance(map_key_first, map_key_last) // 1. 
build a cuco::static_map object for the map k, v pairs. auto kv_map_ptr = std::make_unique>( - static_cast(static_cast(thrust::distance(map_key_first, map_key_last)) / - load_factor), + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast( + static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), + static_cast(thrust::distance(map_key_first, map_key_last)) + 1), invalid_vertex_id::value, invalid_vertex_id::value); { @@ -73,7 +78,11 @@ collect_values_for_keys(raft::comms::comms_t const &comm, [] __device__(auto val) { return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); - kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (thrust::distance(map_key_first, map_key_last) > 0) { + kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + } } // 2. collect values for the unique keys in [collect_key_first, collect_key_last) @@ -82,9 +91,6 @@ collect_values_for_keys(raft::comms::comms_t const &comm, stream); thrust::copy( rmm::exec_policy(stream)->on(stream), collect_key_first, collect_key_last, unique_keys.begin()); - // FIXME: sort and unique are unnecessary if the keys in [collect_key_first, collect_key_last) are - // already unique, if this cost becomes a performance bottlenec, we may add - // collect_values_for_unique_keys in the future thrust::sort(rmm::exec_policy(stream)->on(stream), unique_keys.begin(), unique_keys.end()); unique_keys.resize( thrust::distance( @@ -107,8 +113,12 @@ collect_values_for_keys(raft::comms::comms_t const &comm, CUDA_TRY(cudaStreamSynchronize(stream)); // cuco::static_map currently does not take stream - kv_map_ptr->find( - rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin()); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (rx_unique_keys.size() > 0) { + kv_map_ptr->find( + rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin()); + } rmm::device_uvector rx_values_for_unique_keys(0, stream); std::tie(rx_values_for_unique_keys, std::ignore) = @@ -125,7 +135,11 @@ collect_values_for_keys(raft::comms::comms_t const &comm, kv_map_ptr.reset(); kv_map_ptr = std::make_unique>( - static_cast(static_cast(unique_keys.size()) / load_factor), + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(unique_keys.size()) / load_factor), + unique_keys.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value); { @@ -136,15 +150,154 @@ collect_values_for_keys(raft::comms::comms_t const &comm, return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); - kv_map_ptr->insert(pair_first, pair_first + unique_keys.size()); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (unique_keys.size() > 0) { kv_map_ptr->insert(pair_first, pair_first + unique_keys.size()); } } // 4. 
find values for [collect_key_first, collect_key_last) auto value_buffer = allocate_dataframe_buffer( thrust::distance(collect_key_first, collect_key_last), stream); - kv_map_ptr->find( - collect_key_first, collect_key_last, get_dataframe_buffer_begin(value_buffer)); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (thrust::distance(collect_key_first, collect_key_last) > 0) { + kv_map_ptr->find( + collect_key_first, collect_key_last, get_dataframe_buffer_begin(value_buffer)); + } + + return value_buffer; +} + +// for key = [map_key_first, map_key_last), key_to_gpu_id_op(key) should coincide with +// comm.get_rank() +template +decltype(allocate_dataframe_buffer::value_type>( + 0, cudaStream_t{nullptr})) +collect_values_for_unique_keys(raft::comms::comms_t const &comm, + VertexIterator0 map_key_first, + VertexIterator0 map_key_last, + ValueIterator map_value_first, + VertexIterator1 collect_unique_key_first, + VertexIterator1 collect_unique_key_last, + KeyToGPUIdOp key_to_gpu_id_op, + cudaStream_t stream) +{ + using vertex_t = typename std::iterator_traits::value_type; + static_assert( + std::is_same::value_type, vertex_t>::value); + using value_t = typename std::iterator_traits::value_type; + + double constexpr load_factor = 0.7; + + // FIXME: we may compare the performance & memory footprint of this hash based approach vs binary + // search based approach (especially when thrust::distance(collect_unique_key_first, + // collect_unique_key_last) << thrust::distance(map_key_first, map_key_last)) + + // 1. build a cuco::static_map object for the map k, v pairs. + + auto kv_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast( + static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), + static_cast(thrust::distance(map_key_first, map_key_last)) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + { + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (thrust::distance(map_key_first, map_key_last)) { + kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + } + } + + // 2. 
collect values for the unique keys in [collect_unique_key_first, collect_unique_key_last) + + rmm::device_uvector unique_keys( + thrust::distance(collect_unique_key_first, collect_unique_key_last), stream); + thrust::copy(rmm::exec_policy(stream)->on(stream), + collect_unique_key_first, + collect_unique_key_last, + unique_keys.begin()); + + rmm::device_uvector values_for_unique_keys(0, stream); + { + rmm::device_uvector rx_unique_keys(0, stream); + std::vector rx_value_counts{}; + std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values( + comm, + unique_keys.begin(), + unique_keys.end(), + [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, + stream); + + rmm::device_uvector values_for_rx_unique_keys(rx_unique_keys.size(), stream); + + CUDA_TRY(cudaStreamSynchronize(stream)); // cuco::static_map currently does not take stream + + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (rx_unique_keys.size() > 0) { + kv_map_ptr->find( + rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin()); + } + + rmm::device_uvector rx_values_for_unique_keys(0, stream); + std::tie(rx_values_for_unique_keys, std::ignore) = + shuffle_values(comm, values_for_rx_unique_keys.begin(), rx_value_counts, stream); + + values_for_unique_keys = std::move(rx_values_for_unique_keys); + } + + // 3. re-build a cuco::static_map object for the k, v pairs in unique_keys, + // values_for_unique_keys. + + CUDA_TRY(cudaStreamSynchronize(stream)); // cuco::static_map currently does not take stream + + kv_map_ptr.reset(); + + kv_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(unique_keys.size()) / load_factor), + unique_keys.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + { + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(unique_keys.begin(), values_for_unique_keys.begin())), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (unique_keys.size() > 0) { kv_map_ptr->insert(pair_first, pair_first + unique_keys.size()); } + } + + // 4. find values for [collect_unique_key_first, collect_unique_key_last) + + auto value_buffer = allocate_dataframe_buffer( + thrust::distance(collect_unique_key_first, collect_unique_key_last), stream); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. 
+ if (thrust::distance(collect_unique_key_first, collect_unique_key_last)) { + kv_map_ptr->find(collect_unique_key_first, + collect_unique_key_last, + get_dataframe_buffer_begin(value_buffer)); + } return value_buffer; } diff --git a/cpp/include/utilities/cython.hpp b/cpp/include/utilities/cython.hpp index a58331d465a..d8c476760f0 100644 --- a/cpp/include/utilities/cython.hpp +++ b/cpp/include/utilities/cython.hpp @@ -93,7 +93,7 @@ struct graph_container_t { void* weights; void* vertex_partition_offsets; - size_t num_partition_edges; + size_t num_local_edges; size_t num_global_vertices; size_t num_global_edges; numberTypeEnum vertexType; @@ -103,7 +103,6 @@ struct graph_container_t { bool is_multi_gpu; bool sorted_by_degree; bool do_expensive_check; - bool hypergraph_partitioned; int row_comm_size; int col_comm_size; int row_comm_rank; @@ -147,7 +146,7 @@ struct cy_multi_edgelists_t { // replacement for std::tuple<,,>, since std::tuple is not // supported in cython // -template +template struct major_minor_weights_t { explicit major_minor_weights_t(raft::handle_t const& handle) : shuffled_major_vertices_(0, handle.get_stream()), @@ -155,12 +154,15 @@ struct major_minor_weights_t { shuffled_weights_(0, handle.get_stream()) { } + rmm::device_uvector& get_major(void) { return shuffled_major_vertices_; } rmm::device_uvector& get_minor(void) { return shuffled_minor_vertices_; } rmm::device_uvector& get_weights(void) { return shuffled_weights_; } + std::vector& get_edge_counts(void) { return edge_counts_; } + std::pair, size_t> get_major_wrap( void) // const: triggers errors in Cython autogen-ed C++ { @@ -180,10 +182,16 @@ struct major_minor_weights_t { sizeof(weight_t)); } + std::unique_ptr> get_edge_counts_wrap(void) // const + { + return std::make_unique>(edge_counts_); + } + private: rmm::device_uvector shuffled_major_vertices_; rmm::device_uvector shuffled_minor_vertices_; rmm::device_uvector shuffled_weights_; + std::vector edge_counts_{}; }; // aggregate for random_walks() return type @@ -353,6 +361,9 @@ struct renum_quad_t { // The number of vertices and edges respectively in the graph represented by // the above arrays. 
// +// bool is_weighted +// true if the resulting graph object should store edge weights +// // bool transposed // true if the resulting graph object should store a transposed adjacency // matrix @@ -369,10 +380,11 @@ void populate_graph_container(graph_container_t& graph_container, numberTypeEnum vertexType, numberTypeEnum edgeType, numberTypeEnum weightType, - size_t num_partition_edges, + size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, bool sorted_by_degree, + bool is_weighted, bool transposed, bool multi_gpu); @@ -470,14 +482,13 @@ call_random_walks(raft::handle_t const& handle, // wrapper for shuffling: // template -std::unique_ptr> call_shuffle( +std::unique_ptr> call_shuffle( raft::handle_t const& handle, vertex_t* edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place vertex_t* edgelist_minor_vertices, // [IN / OUT] weight_t* edgelist_weights, // [IN / OUT] - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned); // = false + edge_t num_edgelist_edges); // Wrapper for calling renumber_edeglist() inplace: // @@ -486,8 +497,7 @@ std::unique_ptr> call_renumber( raft::handle_t const& handle, vertex_t* shuffled_edgelist_major_vertices /* [INOUT] */, vertex_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu); diff --git a/cpp/include/utilities/dataframe_buffer.cuh b/cpp/include/utilities/dataframe_buffer.cuh index 06352b8e217..e59b12f2a80 100644 --- a/cpp/include/utilities/dataframe_buffer.cuh +++ b/cpp/include/utilities/dataframe_buffer.cuh @@ -47,21 +47,19 @@ auto allocate_dataframe_buffer_tuple_impl(std::index_sequence, } template -void resize_dataframe_buffer_tuple_element_impl(BufferType& buffer, - size_t new_buffer_size, - cudaStream_t stream) -{ - std::get(buffer).resize(new_buffer_size, stream); - resize_dataframe_buffer_tuple_element_impl( - buffer, new_buffer_size, stream); -} +struct resize_dataframe_buffer_tuple_iterator_element_impl { + void run(BufferType& buffer, size_t new_buffer_size, cudaStream_t stream) + { + std::get(buffer).resize(new_buffer_size, stream); + resize_dataframe_buffer_tuple_iterator_element_impl().run( + buffer, new_buffer_size, stream); + } +}; template -void resize_dataframe_buffer_tuple_impl(BufferType& buffer, - size_t new_buffer_size, - cudaStream_t stream) -{ -} +struct resize_dataframe_buffer_tuple_iterator_element_impl { + void run(BufferType& buffer, size_t new_buffer_size, cudaStream_t stream) {} +}; template auto get_dataframe_buffer_begin_tuple_element_impl(BufferType& buffer) @@ -108,8 +106,9 @@ template ::value; - detail::resize_dataframe_buffer_tuple_impl( - buffer, new_buffer_size, stream); + detail:: + resize_dataframe_buffer_tuple_iterator_element_impl() + .run(buffer, new_buffer_size, stream); } template +std::enable_if_t::value, void> +device_allreduce_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + // no-op +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allreduce_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + static_assert(std::is_same::value_type, + typename std::iterator_traits::value_type>::value); + comm.allreduce(iter_to_raw_ptr(input_first), 
iter_to_raw_ptr(output_first), count, op, stream); +} + +template +struct device_allreduce_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) const + { + device_allreduce_impl(comm, + thrust::get(input_first.get_iterator_tuple()), + thrust::get(output_first.get_iterator_tuple()), + count, + op, + stream); + device_allreduce_tuple_iterator_element_impl( + comm, input_first, output_first, count, op, stream); + } +}; + +template +struct device_allreduce_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) const + { + } +}; + template std::enable_if_t::value, void> device_reduce_impl(raft::comms::comms_t const& comm, @@ -856,6 +916,46 @@ device_bcast(raft::comms::comms_t const& comm, comm, input_first, output_first, count, root, stream); } +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allreduce(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + detail::device_allreduce_impl(comm, input_first, output_first, count, op, stream); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_allreduce(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + detail::device_allreduce_tuple_iterator_element_impl( + comm, input_first, output_first, count, op, stream); +} + template std::enable_if_t< std::is_arithmetic::value_type>::value, diff --git a/cpp/include/utilities/shuffle_comm.cuh b/cpp/include/utilities/shuffle_comm.cuh index 8c363c9a346..b318009d9bf 100644 --- a/cpp/include/utilities/shuffle_comm.cuh +++ b/cpp/include/utilities/shuffle_comm.cuh @@ -22,6 +22,12 @@ #include #include +#include +#include +#include +#include +#include + #include #include #include @@ -31,89 +37,6 @@ namespace experimental { namespace detail { -template -rmm::device_uvector sort_and_count(raft::comms::comms_t const &comm, - ValueIterator tx_value_first /* [INOUT */, - ValueIterator tx_value_last /* [INOUT */, - ValueToGPUIdOp value_to_gpu_id_op, - cudaStream_t stream) -{ - auto const comm_size = comm.get_size(); - - thrust::sort(rmm::exec_policy(stream)->on(stream), - tx_value_first, - tx_value_last, - [value_to_gpu_id_op] __device__(auto lhs, auto rhs) { - return value_to_gpu_id_op(lhs) < value_to_gpu_id_op(rhs); - }); - - auto gpu_id_first = thrust::make_transform_iterator( - tx_value_first, - [value_to_gpu_id_op] __device__(auto value) { return value_to_gpu_id_op(value); }); - rmm::device_uvector d_tx_dst_ranks(comm_size, stream); - rmm::device_uvector d_tx_value_counts(comm_size, stream); - auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), - gpu_id_first, - gpu_id_first + thrust::distance(tx_value_first, tx_value_last), - thrust::make_constant_iterator(size_t{1}), - d_tx_dst_ranks.begin(), - d_tx_value_counts.begin()); - if (thrust::distance(d_tx_value_counts.begin(), thrust::get<1>(last)) < 
comm_size) { - rmm::device_uvector d_counts(comm_size, stream); - thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); - thrust::scatter(rmm::exec_policy(stream)->on(stream), - d_tx_value_counts.begin(), - thrust::get<1>(last), - d_tx_dst_ranks.begin(), - d_counts.begin()); - d_tx_value_counts = std::move(d_counts); - } - - return d_tx_value_counts; -} - -template -rmm::device_uvector sort_and_count(raft::comms::comms_t const &comm, - VertexIterator tx_key_first /* [INOUT */, - VertexIterator tx_key_last /* [INOUT */, - ValueIterator tx_value_first /* [INOUT */, - KeyToGPUIdOp key_to_gpu_id_op, - cudaStream_t stream) -{ - auto const comm_size = comm.get_size(); - - thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), - tx_key_first, - tx_key_last, - tx_value_first, - [key_to_gpu_id_op] __device__(auto lhs, auto rhs) { - return key_to_gpu_id_op(lhs) < key_to_gpu_id_op(rhs); - }); - - auto gpu_id_first = thrust::make_transform_iterator( - tx_key_first, [key_to_gpu_id_op] __device__(auto key) { return key_to_gpu_id_op(key); }); - rmm::device_uvector d_tx_dst_ranks(comm_size, stream); - rmm::device_uvector d_tx_value_counts(comm_size, stream); - auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), - gpu_id_first, - gpu_id_first + thrust::distance(tx_key_first, tx_key_last), - thrust::make_constant_iterator(size_t{1}), - d_tx_dst_ranks.begin(), - d_tx_value_counts.begin()); - if (thrust::distance(d_tx_value_counts.begin(), thrust::get<1>(last)) < comm_size) { - rmm::device_uvector d_counts(comm_size, stream); - thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); - thrust::scatter(rmm::exec_policy(stream)->on(stream), - d_tx_value_counts.begin(), - thrust::get<1>(last), - d_tx_dst_ranks.begin(), - d_counts.begin()); - d_tx_value_counts = std::move(d_counts); - } - - return d_tx_value_counts; -} - // inline to suppress a complaint about ODR violation inline std::tuple, std::vector, @@ -187,6 +110,86 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const &comm, } // namespace detail +template +rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [INOUT */, + ValueIterator tx_value_last /* [INOUT */, + ValueToGPUIdOp value_to_group_id_op, + int num_groups, + cudaStream_t stream) +{ + thrust::sort(rmm::exec_policy(stream)->on(stream), + tx_value_first, + tx_value_last, + [value_to_group_id_op] __device__(auto lhs, auto rhs) { + return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); + }); + + auto group_id_first = thrust::make_transform_iterator( + tx_value_first, + [value_to_group_id_op] __device__(auto value) { return value_to_group_id_op(value); }); + rmm::device_uvector d_tx_dst_ranks(num_groups, stream); + rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream); + auto last = + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + group_id_first, + group_id_first + thrust::distance(tx_value_first, tx_value_last), + thrust::make_constant_iterator(size_t{1}), + d_tx_dst_ranks.begin(), + d_tx_value_counts.begin()); + if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) { + rmm::device_uvector d_counts(num_groups, stream); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); + thrust::scatter(rmm::exec_policy(stream)->on(stream), + d_tx_value_counts.begin(), + thrust::get<1>(last), + d_tx_dst_ranks.begin(), + d_counts.begin()); + d_tx_value_counts = 
std::move(d_counts); + } + + return d_tx_value_counts; +} + +template +rmm::device_uvector groupby_and_count(VertexIterator tx_key_first /* [INOUT */, + VertexIterator tx_key_last /* [INOUT */, + ValueIterator tx_value_first /* [INOUT */, + KeyToGPUIdOp key_to_group_id_op, + int num_groups, + cudaStream_t stream) +{ + thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), + tx_key_first, + tx_key_last, + tx_value_first, + [key_to_group_id_op] __device__(auto lhs, auto rhs) { + return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); + }); + + auto group_id_first = thrust::make_transform_iterator( + tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); }); + rmm::device_uvector d_tx_dst_ranks(num_groups, stream); + rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream); + auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + group_id_first, + group_id_first + thrust::distance(tx_key_first, tx_key_last), + thrust::make_constant_iterator(size_t{1}), + d_tx_dst_ranks.begin(), + d_tx_value_counts.begin()); + if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) { + rmm::device_uvector d_counts(num_groups, stream); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); + thrust::scatter(rmm::exec_policy(stream)->on(stream), + d_tx_value_counts.begin(), + thrust::get<1>(last), + d_tx_dst_ranks.begin(), + d_counts.begin()); + d_tx_value_counts = std::move(d_counts); + } + + return d_tx_value_counts; +} + template auto shuffle_values(raft::comms::comms_t const &comm, TxValueIterator tx_value_first, @@ -250,7 +253,7 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const &comm, auto const comm_size = comm.get_size(); auto d_tx_value_counts = - detail::sort_and_count(comm, tx_value_first, tx_value_last, value_to_gpu_id_op, stream); + groupby_and_count(tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream); std::vector tx_counts{}; std::vector tx_offsets{}; @@ -301,8 +304,8 @@ auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const &comm, { auto const comm_size = comm.get_size(); - auto d_tx_value_counts = detail::sort_and_count( - comm, tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, stream); + auto d_tx_value_counts = groupby_and_count( + tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, comm.get_size(), stream); std::vector tx_counts{}; std::vector tx_offsets{}; diff --git a/cpp/include/utilities/thrust_tuple_utils.cuh b/cpp/include/utilities/thrust_tuple_utils.cuh index 01843a583eb..d5ce6ff1a29 100644 --- a/cpp/include/utilities/thrust_tuple_utils.cuh +++ b/cpp/include/utilities/thrust_tuple_utils.cuh @@ -61,13 +61,6 @@ struct compute_thrust_tuple_element_sizes_impl { void compute(std::array::value>& arr) const {} }; -template -__device__ constexpr auto remove_first_thrust_tuple_element_impl(TupleType const& tuple, - std::index_sequence) -{ - return thrust::make_tuple(thrust::get<1 + Is>(tuple)...); -} - template struct plus_thrust_tuple_impl { __host__ __device__ constexpr void compute(TupleType& lhs, TupleType const& rhs) const @@ -200,16 +193,6 @@ struct compute_thrust_tuple_element_sizes { } }; -template -struct remove_first_thrust_tuple_element { - __device__ constexpr auto operator()(TupleType const& tuple) const - { - size_t constexpr tuple_size = thrust::tuple_size::value; - return detail::remove_first_thrust_tuple_element_impl( - tuple, std::make_index_sequence()); - } -}; - 
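// NOTE: remove_first_thrust_tuple_element is dropped because the (key, payload) push buffer is no
// longer one zipped tuple buffer: keys now live in an rmm::device_uvector and payloads in a
// separate dataframe buffer, so the payload is read with thrust::get<1>(...) directly. A minimal
// sketch of the simplification (variable names are illustrative only):
//
//   auto e_op_result = thrust::make_tuple(bucket_idx, payload);  // (key part, payload part)
//   auto payload_out = thrust::get<1>(e_op_result);
//   // previously: remove_first_thrust_tuple_element<decltype(e_op_result)>{}(e_op_result)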
template struct plus_thrust_tuple { __host__ __device__ constexpr TupleType operator()(TupleType const& lhs, diff --git a/cpp/src/community/ecg.cu b/cpp/src/community/ecg.cu index 45f7d723191..a176dfbd1c8 100644 --- a/cpp/src/community/ecg.cu +++ b/cpp/src/community/ecg.cu @@ -117,7 +117,7 @@ class EcgLouvain : public cugraph::Louvain { void initialize_dendrogram_level(vertex_t num_vertices) override { - this->dendrogram_->add_level(0, num_vertices); + this->dendrogram_->add_level(0, num_vertices, this->stream_); get_permutation_vector( num_vertices, seed_, this->dendrogram_->current_level_begin(), this->stream_); diff --git a/cpp/src/community/leiden.cuh b/cpp/src/community/leiden.cuh index aae2d3712b5..4ffb7c20eb2 100644 --- a/cpp/src/community/leiden.cuh +++ b/cpp/src/community/leiden.cuh @@ -132,7 +132,7 @@ class Leiden : public Louvain { // // Initialize every cluster to reference each vertex to itself // - this->dendrogram_->add_level(0, current_graph.number_of_vertices); + this->dendrogram_->add_level(0, current_graph.number_of_vertices, this->stream_); thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_), this->dendrogram_->current_level_begin(), diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 0862bbc62a9..e3569d4c850 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -210,7 +210,7 @@ class Louvain { virtual void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(0, num_vertices); + dendrogram_->add_level(0, num_vertices, stream_); thrust::sequence(rmm::exec_policy(stream_)->on(stream_), dendrogram_->current_level_begin(), diff --git a/cpp/src/experimental/bfs.cu b/cpp/src/experimental/bfs.cu index 7adfbd7fbd7..9145e3737b6 100644 --- a/cpp/src/experimental/bfs.cu +++ b/cpp/src/experimental/bfs.cu @@ -93,10 +93,7 @@ void bfs(raft::handle_t const &handle, enum class Bucket { cur, num_buckets }; std::vector bucket_sizes(static_cast(Bucket::num_buckets), push_graph_view.get_number_of_local_vertices()); - VertexFrontier, - vertex_t, - GraphViewType::is_multi_gpu, - static_cast(Bucket::num_buckets)> + VertexFrontier(Bucket::num_buckets)> vertex_frontier(handle, bucket_sizes); if (push_graph_view.is_local_vertex_nocheck(source_vertex)) { @@ -133,19 +130,16 @@ void bfs(raft::handle_t const &handle, *(distances + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(dst)); if (distance != invalid_distance) { push = false; } } - // FIXME: need to test this works properly if payload size is 0 (returns a tuple of size - // 1) return thrust::make_tuple(push, src); }, - reduce_op::any>(), + reduce_op::any(), distances, thrust::make_zip_iterator(thrust::make_tuple(distances, predecessor_first)), vertex_frontier, [depth] __device__(auto v_val, auto pushed_val) { - auto idx = (v_val == invalid_distance) - ? static_cast(Bucket::cur) - : VertexFrontier, vertex_t>::kInvalidBucketIdx; - return thrust::make_tuple(idx, depth + 1, thrust::get<0>(pushed_val)); + auto idx = (v_val == invalid_distance) ? 
static_cast(Bucket::cur) + : VertexFrontier::kInvalidBucketIdx; + return thrust::make_tuple(idx, thrust::make_tuple(depth + 1, pushed_val)); }); auto new_vertex_frontier_aggregate_size = diff --git a/cpp/src/experimental/coarsen_graph.cu b/cpp/src/experimental/coarsen_graph.cu index 0cd551b0d73..1eccbd23584 100644 --- a/cpp/src/experimental/coarsen_graph.cu +++ b/cpp/src/experimental/coarsen_graph.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -49,6 +50,7 @@ std:: weight_t const *compressed_sparse_weights, vertex_t major_first, vertex_t major_last, + bool is_weighted, cudaStream_t stream) { edge_t number_of_edges{0}; @@ -57,8 +59,7 @@ std:: CUDA_TRY(cudaStreamSynchronize(stream)); rmm::device_uvector edgelist_major_vertices(number_of_edges, stream); rmm::device_uvector edgelist_minor_vertices(number_of_edges, stream); - rmm::device_uvector edgelist_weights( - compressed_sparse_weights != nullptr ? number_of_edges : 0, stream); + rmm::device_uvector edgelist_weights(is_weighted ? number_of_edges : 0, stream); // FIXME: this is highly inefficient for very high-degree vertices, for better performance, we can // fill high-degree vertices using one CUDA block per vertex, mid-degree vertices using one CUDA @@ -77,7 +78,7 @@ std:: compressed_sparse_indices, compressed_sparse_indices + number_of_edges, edgelist_minor_vertices.begin()); - if (compressed_sparse_weights != nullptr) { + if (is_weighted) { thrust::copy(rmm::exec_policy(stream)->on(stream), compressed_sparse_weights, compressed_sparse_weights + number_of_edges, @@ -89,62 +90,62 @@ std:: std::move(edgelist_weights)); } -template -void sort_and_coarsen_edgelist(rmm::device_uvector &edgelist_major_vertices /* [INOUT] */, - rmm::device_uvector &edgelist_minor_vertices /* [INOUT] */, - rmm::device_uvector &edgelist_weights /* [INOUT] */, - cudaStream_t stream) +template +edge_t groupby_e_and_coarsen_edgelist(vertex_t *edgelist_major_vertices /* [INOUT] */, + vertex_t *edgelist_minor_vertices /* [INOUT] */, + weight_t *edgelist_weights /* [INOUT] */, + edge_t number_of_edges, + bool is_weighted, + cudaStream_t stream) { - auto pair_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices)); - size_t number_of_edges{0}; - if (edgelist_weights.size() > 0) { + if (is_weighted) { thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), pair_first, - pair_first + edgelist_major_vertices.size(), - edgelist_weights.begin()); + pair_first + number_of_edges, + edgelist_weights); - rmm::device_uvector tmp_edgelist_major_vertices(edgelist_major_vertices.size(), - stream); + rmm::device_uvector tmp_edgelist_major_vertices(number_of_edges, stream); rmm::device_uvector tmp_edgelist_minor_vertices(tmp_edgelist_major_vertices.size(), stream); rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream); auto it = thrust::reduce_by_key( rmm::exec_policy(stream)->on(stream), pair_first, - pair_first + edgelist_major_vertices.size(), - edgelist_weights.begin(), + pair_first + number_of_edges, + edgelist_weights, thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), tmp_edgelist_minor_vertices.begin())), tmp_edgelist_weights.begin()); - number_of_edges = thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it)); + auto ret = + static_cast(thrust::distance(tmp_edgelist_weights.begin(), 
thrust::get<1>(it))); - edgelist_major_vertices = std::move(tmp_edgelist_major_vertices); - edgelist_minor_vertices = std::move(tmp_edgelist_minor_vertices); - edgelist_weights = std::move(tmp_edgelist_weights); + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), + tmp_edgelist_minor_vertices.begin(), + tmp_edgelist_weights.begin())); + thrust::copy(rmm::exec_policy(stream)->on(stream), + edge_first, + edge_first + ret, + thrust::make_zip_iterator(thrust::make_tuple( + edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights))); + + return ret; } else { - thrust::sort(rmm::exec_policy(stream)->on(stream), - pair_first, - pair_first + edgelist_major_vertices.size()); - auto it = thrust::unique(rmm::exec_policy(stream)->on(stream), - pair_first, - pair_first + edgelist_major_vertices.size()); - number_of_edges = thrust::distance(pair_first, it); + thrust::sort(rmm::exec_policy(stream)->on(stream), pair_first, pair_first + number_of_edges); + return static_cast(thrust::distance( + pair_first, + thrust::unique( + rmm::exec_policy(stream)->on(stream), pair_first, pair_first + number_of_edges))); } - - edgelist_major_vertices.resize(number_of_edges, stream); - edgelist_minor_vertices.resize(number_of_edges, stream); - edgelist_weights.resize(number_of_edges, stream); - edgelist_major_vertices.shrink_to_fit(stream); - edgelist_minor_vertices.shrink_to_fit(stream); - edgelist_weights.shrink_to_fit(stream); } template std:: tuple, rmm::device_uvector, rmm::device_uvector> - compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist( + compressed_sparse_to_relabeled_and_grouped_and_coarsened_edgelist( edge_t const *compressed_sparse_offsets, vertex_t const *compressed_sparse_indices, weight_t const *compressed_sparse_weights, @@ -154,6 +155,7 @@ std:: vertex_t major_last, vertex_t minor_first, vertex_t minor_last, + bool is_weighted, cudaStream_t stream) { // FIXME: it might be possible to directly create relabled & coarsened edgelist from the @@ -168,6 +170,7 @@ std:: compressed_sparse_weights, major_first, major_last, + is_weighted, stream); auto pair_first = thrust::make_zip_iterator( @@ -182,8 +185,21 @@ std:: p_minor_labels[thrust::get<1>(val) - minor_first]); }); - sort_and_coarsen_edgelist( - edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights, stream); + auto number_of_edges = + groupby_e_and_coarsen_edgelist(edgelist_major_vertices.data(), + edgelist_minor_vertices.data(), + edgelist_weights.data(), + static_cast(edgelist_major_vertices.size()), + is_weighted, + stream); + edgelist_major_vertices.resize(number_of_edges, stream); + edgelist_major_vertices.shrink_to_fit(stream); + edgelist_minor_vertices.resize(number_of_edges, stream); + edgelist_minor_vertices.shrink_to_fit(stream); + if (is_weighted) { + edgelist_weights.resize(number_of_edges, stream); + edgelist_weights.shrink_to_fit(stream); + } return std::make_tuple(std::move(edgelist_major_vertices), std::move(edgelist_minor_vertices), @@ -220,48 +236,66 @@ coarsen_graph( // currently, nothing to do } - // 1. locally construct coarsened edge list + // 1. construct coarsened edge list - // FIXME: we don't need adj_matrix_major_labels if we apply the same partitioning scheme - // regardless of hypergraph partitioning is applied or not - rmm::device_uvector adj_matrix_major_labels( - store_transposed ? 
graph_view.get_number_of_local_adj_matrix_partition_cols()
-                     : graph_view.get_number_of_local_adj_matrix_partition_rows(),
-    handle.get_stream());
   rmm::device_uvector adj_matrix_minor_labels(
     store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows()
                      : graph_view.get_number_of_local_adj_matrix_partition_cols(),
     handle.get_stream());
   if (store_transposed) {
-    copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_major_labels.data());
     copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_minor_labels.data());
   } else {
-    copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_major_labels.data());
     copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels.data());
   }
 
-  rmm::device_uvector coarsened_edgelist_major_vertices(0, handle.get_stream());
-  rmm::device_uvector coarsened_edgelist_minor_vertices(0, handle.get_stream());
-  rmm::device_uvector coarsened_edgelist_weights(0, handle.get_stream());
+  std::vector> coarsened_edgelist_major_vertices{};
+  std::vector> coarsened_edgelist_minor_vertices{};
+  std::vector> coarsened_edgelist_weights{};
+  coarsened_edgelist_major_vertices.reserve(graph_view.get_number_of_local_adj_matrix_partitions());
+  coarsened_edgelist_minor_vertices.reserve(coarsened_edgelist_major_vertices.size());
+  coarsened_edgelist_weights.reserve(
+    graph_view.is_weighted() ? coarsened_edgelist_major_vertices.size() : size_t{0});
+  for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) {
+    coarsened_edgelist_major_vertices.emplace_back(0, handle.get_stream());
+    coarsened_edgelist_minor_vertices.emplace_back(0, handle.get_stream());
+    if (graph_view.is_weighted()) {
+      coarsened_edgelist_weights.emplace_back(0, handle.get_stream());
+    }
+  }
 
   // FIXME: we may compare performance/memory footprint with the hash_based approach especially when
   // cuco::dynamic_map becomes available (so we don't need to preallocate memory assuming the worst
   // case). We may be able to limit the memory requirement close to the final coarsened edgelist
   // with the hash based approach.
   for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) {
-    // get edge list
+    // 1-1. locally construct coarsened edge list
+
+    rmm::device_uvector major_labels(
+      store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols(i)
+                       : graph_view.get_number_of_local_adj_matrix_partition_rows(i),
+      handle.get_stream());
+    // FIXME: this copy is unnecessary, better fix RAFT comm's bcast to take const iterators for
+    // input
+    thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                 labels,
+                 labels + major_labels.size(),
+                 major_labels.begin());
+    device_bcast(col_comm,
+                 major_labels.data(),
+                 major_labels.data(),
+                 major_labels.size(),
+                 static_cast(i),
+                 handle.get_stream());
 
     rmm::device_uvector edgelist_major_vertices(0, handle.get_stream());
     rmm::device_uvector edgelist_minor_vertices(0, handle.get_stream());
     rmm::device_uvector edgelist_weights(0, handle.get_stream());
     std::tie(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights) =
-      compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist(
+      compressed_sparse_to_relabeled_and_grouped_and_coarsened_edgelist(
         graph_view.offsets(i),
         graph_view.indices(i),
         graph_view.weights(i),
-        adj_matrix_major_labels.begin() +
-          (store_transposed ?
graph_view.get_local_adj_matrix_partition_col_value_start_offset(i) - : graph_view.get_local_adj_matrix_partition_row_value_start_offset(i)), - adj_matrix_minor_labels.begin(), + major_labels.data(), + adj_matrix_minor_labels.data(), store_transposed ? graph_view.get_local_adj_matrix_partition_col_first(i) : graph_view.get_local_adj_matrix_partition_row_first(i), store_transposed ? graph_view.get_local_adj_matrix_partition_col_last(i) @@ -270,86 +304,159 @@ coarsen_graph( : graph_view.get_local_adj_matrix_partition_col_first(i), store_transposed ? graph_view.get_local_adj_matrix_partition_row_last(i) : graph_view.get_local_adj_matrix_partition_col_last(i), + graph_view.is_weighted(), handle.get_stream()); - auto cur_size = coarsened_edgelist_major_vertices.size(); - // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we can - // reserve address space to avoid expensive reallocation. - // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management - coarsened_edgelist_major_vertices.resize(cur_size + edgelist_major_vertices.size(), - handle.get_stream()); - coarsened_edgelist_minor_vertices.resize(coarsened_edgelist_major_vertices.size(), - handle.get_stream()); - coarsened_edgelist_weights.resize( - graph_view.is_weighted() ? coarsened_edgelist_major_vertices.size() : 0, handle.get_stream()); - - if (graph_view.is_weighted()) { - auto src_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(), - edgelist_minor_vertices.begin(), - edgelist_weights.begin())); - auto dst_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(), - coarsened_edgelist_minor_vertices.begin(), - coarsened_edgelist_weights.begin())) + - cur_size; - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - src_edge_first, - src_edge_first + edgelist_major_vertices.size(), - dst_edge_first); - } else { - auto src_edge_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); - auto dst_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(), - coarsened_edgelist_minor_vertices.begin())) + - cur_size; - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - src_edge_first, - src_edge_first + edgelist_major_vertices.size(), - dst_edge_first); + // 1-2. 
globally shuffle
+
+    {
+      rmm::device_uvector rx_edgelist_major_vertices(0, handle.get_stream());
+      rmm::device_uvector rx_edgelist_minor_vertices(0, handle.get_stream());
+      rmm::device_uvector rx_edgelist_weights(0, handle.get_stream());
+      if (graph_view.is_weighted()) {
+        auto edge_first =
+          thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(),
+                                                       edgelist_minor_vertices.begin(),
+                                                       edgelist_weights.begin()));
+        std::forward_as_tuple(
+          std::tie(rx_edgelist_major_vertices, rx_edgelist_minor_vertices, rx_edgelist_weights),
+          std::ignore) =
+          groupby_gpuid_and_shuffle_values(
+            handle.get_comms(),
+            edge_first,
+            edge_first + edgelist_major_vertices.size(),
+            [key_func =
+               detail::compute_gpu_id_from_edge_t{
+                 comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
+              return key_func(thrust::get<0>(val), thrust::get<1>(val));
+            },
+            handle.get_stream());
+      } else {
+        auto edge_first = thrust::make_zip_iterator(
+          thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin()));
+        std::forward_as_tuple(std::tie(rx_edgelist_major_vertices, rx_edgelist_minor_vertices),
+                              std::ignore) =
+          groupby_gpuid_and_shuffle_values(
+            handle.get_comms(),
+            edge_first,
+            edge_first + edgelist_major_vertices.size(),
+            [key_func =
+               detail::compute_gpu_id_from_edge_t{
+                 comm_size, row_comm_size, col_comm_size}] __device__(auto val) {
+              return key_func(thrust::get<0>(val), thrust::get<1>(val));
+            },
+            handle.get_stream());
+      }
+
+      edgelist_major_vertices = std::move(rx_edgelist_major_vertices);
+      edgelist_minor_vertices = std::move(rx_edgelist_minor_vertices);
+      edgelist_weights = std::move(rx_edgelist_weights);
+    }
-  }
 
-  sort_and_coarsen_edgelist(coarsened_edgelist_major_vertices,
-                            coarsened_edgelist_minor_vertices,
-                            coarsened_edgelist_weights,
-                            handle.get_stream());
-
-  // 2. globally shuffle edge list and re-coarsen
-
-  {
-    auto edge_first =
-      thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(),
-                                                   coarsened_edgelist_minor_vertices.begin(),
-                                                   coarsened_edgelist_weights.begin()));
-    rmm::device_uvector rx_edgelist_major_vertices(0, handle.get_stream());
-    rmm::device_uvector rx_edgelist_minor_vertices(0, handle.get_stream());
-    rmm::device_uvector rx_edgelist_weights(0, handle.get_stream());
-    std::forward_as_tuple(
-      std::tie(rx_edgelist_major_vertices, rx_edgelist_minor_vertices, rx_edgelist_weights),
-      std::ignore) =
-      groupby_gpuid_and_shuffle_values(
-        handle.get_comms(),
-        edge_first,
-        edge_first + coarsened_edgelist_major_vertices.size(),
-        [key_func =
-           detail::compute_gpu_id_from_edge_t{graph_view.is_hypergraph_partitioned(),
-                                              comm.get_size(),
-                                              row_comm.get_size(),
-                                              col_comm.get_size()}] __device__(auto val) {
-          return key_func(thrust::get<0>(val), thrust::get<1>(val));
-        },
+    // 1-3. append data to local adjacency matrix partitions
+
+    // FIXME: we can skip this if groupby_gpuid_and_shuffle_values is updated to return sorted edge
+    // list based on the final matrix partition (maybe add
+    // groupby_adj_matrix_partition_and_shuffle_values).
+
+    auto local_partition_id_op =
+      [comm_size,
+       key_func = detail::compute_partition_id_from_edge_t{
+         comm_size, row_comm_size, col_comm_size}] __device__(auto pair) {
+        return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) /
+               comm_size;  // global partition id to local partition id
+      };
+    auto pair_first = thrust::make_zip_iterator(
+      thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin()));
+    auto counts = graph_view.is_weighted()
+                    ?
groupby_and_count(pair_first,
+                                     pair_first + edgelist_major_vertices.size(),
+                                     edgelist_weights.begin(),
+                                     local_partition_id_op,
+                                     graph_view.get_number_of_local_adj_matrix_partitions(),
+                                     handle.get_stream())
+                  : groupby_and_count(pair_first,
+                                      pair_first + edgelist_major_vertices.size(),
+                                      local_partition_id_op,
+                                      graph_view.get_number_of_local_adj_matrix_partitions(),
+                                      handle.get_stream());
+
+    std::vector h_counts(counts.size());
+    raft::update_host(h_counts.data(), counts.data(), counts.size(), handle.get_stream());
+    handle.get_stream_view().synchronize();
+
+    std::vector h_displacements(h_counts.size(), size_t{0});
+    std::partial_sum(h_counts.begin(), h_counts.end() - 1, h_displacements.begin() + 1);
+
+    for (int j = 0; j < col_comm_size; ++j) {
+      auto number_of_partition_edges = groupby_e_and_coarsen_edgelist(
+        edgelist_major_vertices.begin() + h_displacements[j],
+        edgelist_minor_vertices.begin() + h_displacements[j],
+        graph_view.is_weighted() ? edgelist_weights.begin() + h_displacements[j]
+                                 : static_cast(nullptr),
+        h_counts[j],
+        graph_view.is_weighted(),
+        handle.get_stream());
-    sort_and_coarsen_edgelist(rx_edgelist_major_vertices,
-                              rx_edgelist_minor_vertices,
-                              rx_edgelist_weights,
-                              handle.get_stream());
+      auto cur_size = coarsened_edgelist_major_vertices[j].size();
+      // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we
+      // can reserve address space to avoid expensive reallocation.
+      // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management
+      coarsened_edgelist_major_vertices[j].resize(cur_size + number_of_partition_edges,
+                                                  handle.get_stream());
+      coarsened_edgelist_minor_vertices[j].resize(coarsened_edgelist_major_vertices[j].size(),
+                                                  handle.get_stream());
+      if (graph_view.is_weighted()) {
+        coarsened_edgelist_weights[j].resize(coarsened_edgelist_major_vertices[j].size(),
+                                             handle.get_stream());
+
+        auto src_edge_first =
+          thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(),
+                                                       edgelist_minor_vertices.begin(),
+                                                       edgelist_weights.begin())) +
+          h_displacements[j];
+        auto dst_edge_first =
+          thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(),
+                                                       coarsened_edgelist_minor_vertices[j].begin(),
+                                                       coarsened_edgelist_weights[j].begin())) +
+          cur_size;
+        thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                     src_edge_first,
+                     src_edge_first + number_of_partition_edges,
+                     dst_edge_first);
+      } else {
+        auto src_edge_first = thrust::make_zip_iterator(thrust::make_tuple(
+                                edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())) +
+                              h_displacements[j];
+        auto dst_edge_first = thrust::make_zip_iterator(
+                                thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(),
+                                                   coarsened_edgelist_minor_vertices[j].begin())) +
+                              cur_size;
+        thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                     src_edge_first,
+                     src_edge_first + number_of_partition_edges,
+                     dst_edge_first);
+      }
+    }
+  }
 
-    coarsened_edgelist_major_vertices = std::move(rx_edgelist_major_vertices);
-    coarsened_edgelist_minor_vertices = std::move(rx_edgelist_minor_vertices);
-    coarsened_edgelist_weights = std::move(rx_edgelist_weights);
+  for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) {
+    auto number_of_partition_edges = groupby_e_and_coarsen_edgelist(
+      coarsened_edgelist_major_vertices[i].data(),
+      coarsened_edgelist_minor_vertices[i].data(),
+      graph_view.is_weighted() ?
coarsened_edgelist_weights[i].data() + : static_cast(nullptr), + static_cast(coarsened_edgelist_major_vertices[i].size()), + graph_view.is_weighted(), + handle.get_stream()); + coarsened_edgelist_major_vertices[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_major_vertices[i].shrink_to_fit(handle.get_stream()); + coarsened_edgelist_minor_vertices[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_minor_vertices[i].shrink_to_fit(handle.get_stream()); + if (coarsened_edgelist_weights.size() > 0) { + coarsened_edgelist_weights[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_weights[i].shrink_to_fit(handle.get_stream()); + } } // 3. find unique labels for this GPU @@ -395,37 +502,43 @@ coarsen_graph( rmm::device_uvector renumber_map_labels(0, handle.get_stream()); partition_t partition(std::vector(comm_size + 1, 0), - graph_view.is_hypergraph_partitioned(), row_comm_size, col_comm_size, row_comm_rank, col_comm_rank); vertex_t number_of_vertices{}; edge_t number_of_edges{}; - std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) = - renumber_edgelist( - handle, - unique_labels.data(), - static_cast(unique_labels.size()), - coarsened_edgelist_major_vertices.data(), - coarsened_edgelist_minor_vertices.data(), - static_cast(coarsened_edgelist_major_vertices.size()), - graph_view.is_hypergraph_partitioned(), - do_expensive_check); + { + std::vector major_ptrs(coarsened_edgelist_major_vertices.size()); + std::vector minor_ptrs(major_ptrs.size()); + std::vector counts(major_ptrs.size()); + for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { + major_ptrs[i] = coarsened_edgelist_major_vertices[i].data(); + minor_ptrs[i] = coarsened_edgelist_minor_vertices[i].data(); + counts[i] = static_cast(coarsened_edgelist_major_vertices[i].size()); + } + std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) = + renumber_edgelist(handle, + unique_labels.data(), + static_cast(unique_labels.size()), + major_ptrs, + minor_ptrs, + counts, + do_expensive_check); + } // 5. build a graph std::vector> edgelists{}; - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - edgelists.resize(1); - edgelists[0].p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices.data() - : coarsened_edgelist_major_vertices.data(); - edgelists[0].p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices.data() - : coarsened_edgelist_minor_vertices.data(); - edgelists[0].p_edge_weights = coarsened_edgelist_weights.data(); - edgelists[0].number_of_edges = static_cast(coarsened_edgelist_major_vertices.size()); + edgelists.resize(graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < edgelists.size(); ++i) { + edgelists[i].p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices[i].data() + : coarsened_edgelist_major_vertices[i].data(); + edgelists[i].p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices[i].data() + : coarsened_edgelist_minor_vertices[i].data(); + edgelists[i].p_edge_weights = graph_view.is_weighted() ? 
coarsened_edgelist_weights[i].data() + : static_cast(nullptr); + edgelists[i].number_of_edges = static_cast(coarsened_edgelist_major_vertices[i].size()); } return std::make_tuple( @@ -435,7 +548,7 @@ coarsen_graph( partition, number_of_vertices, number_of_edges, - graph_properties_t{graph_view.is_symmetric(), false}, + graph_properties_t{graph_view.is_symmetric(), false, graph_view.is_weighted()}, true), std::move(renumber_map_labels)); } @@ -466,7 +579,7 @@ coarsen_graph( std::tie(coarsened_edgelist_major_vertices, coarsened_edgelist_minor_vertices, coarsened_edgelist_weights) = - compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist( + compressed_sparse_to_relabeled_and_grouped_and_coarsened_edgelist( graph_view.offsets(), graph_view.indices(), graph_view.weights(), @@ -476,6 +589,7 @@ coarsen_graph( graph_view.get_number_of_vertices(), vertex_t{0}, graph_view.get_number_of_vertices(), + graph_view.is_weighted(), handle.get_stream()); rmm::device_uvector unique_labels(graph_view.get_number_of_vertices(), @@ -516,7 +630,7 @@ coarsen_graph( handle, edgelist, static_cast(renumber_map_labels.size()), - graph_properties_t{graph_view.is_symmetric(), false}, + graph_properties_t{graph_view.is_symmetric(), false, graph_view.is_weighted()}, true), std::move(renumber_map_labels)); } diff --git a/cpp/src/experimental/generate_rmat_edgelist.cu b/cpp/src/experimental/generate_rmat_edgelist.cu index 185fa837a70..d75a4654a15 100644 --- a/cpp/src/experimental/generate_rmat_edgelist.cu +++ b/cpp/src/experimental/generate_rmat_edgelist.cu @@ -46,13 +46,13 @@ std::tuple, rmm::device_uvector> generat bool clip_and_flip, bool scramble_vertex_ids) { - CUGRAPH_EXPECTS(size_t{1} << scale <= std::numeric_limits::max(), + CUGRAPH_EXPECTS((size_t{1} << scale) <= static_cast(std::numeric_limits::max()), "Invalid input argument: scale too large for vertex_t."); CUGRAPH_EXPECTS((a >= 0.0) && (b >= 0.0) && (c >= 0.0) && (a + b + c <= 1.0), "Invalid input argument: a, b, c should be non-negative and a + b + c should not " "be larger than 1.0."); - raft::random::Rng rng(seed + 10); + raft::random::Rng rng(seed); // to limit memory footprint (1024 is a tuning parameter) auto max_edges_to_generate_per_iteration = static_cast(handle.get_device_properties().multiProcessorCount) * 1024; diff --git a/cpp/src/experimental/graph.cu b/cpp/src/experimental/graph.cu index 5abe141dafd..47c41cb3426 100644 --- a/cpp/src/experimental/graph.cu +++ b/cpp/src/experimental/graph.cu @@ -67,12 +67,12 @@ std:: vertex_t major_last, vertex_t minor_first, vertex_t minor_last, + bool is_weighted, cudaStream_t stream) { rmm::device_uvector offsets((major_last - major_first) + 1, stream); rmm::device_uvector indices(edgelist.number_of_edges, stream); - rmm::device_uvector weights( - edgelist.p_edge_weights != nullptr ? edgelist.number_of_edges : 0, stream); + rmm::device_uvector weights(is_weighted ? edgelist.number_of_edges : 0, stream); thrust::fill(rmm::exec_policy(stream)->on(stream), offsets.begin(), offsets.end(), edge_t{0}); thrust::fill(rmm::exec_policy(stream)->on(stream), indices.begin(), indices.end(), vertex_t{0}); @@ -89,8 +89,7 @@ std:: auto p_offsets = offsets.data(); auto p_indices = indices.data(); - auto p_weights = - edgelist.p_edge_weights != nullptr ? weights.data() : static_cast(nullptr); + auto p_weights = is_weighted ? weights.data() : static_cast(nullptr); thrust::for_each(rmm::exec_policy(stream)->on(stream), store_transposed ? 
edgelist.p_dst_vertices : edgelist.p_src_vertices,
@@ -103,7 +102,7 @@ std::
   thrust::exclusive_scan(
     rmm::exec_policy(stream)->on(stream), offsets.begin(), offsets.end(), offsets.begin());
 
-  if (edgelist.p_edge_weights != nullptr) {
+  if (is_weighted) {
     auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(
       edgelist.p_src_vertices, edgelist.p_dst_vertices, edgelist.p_edge_weights));
     thrust::for_each(rmm::exec_policy(stream)->on(stream),
@@ -191,24 +190,22 @@ graph_t 0, "Invalid input argument: edgelists.size() should be non-zero.");
 
-  bool is_weighted = edgelists[0].p_edge_weights != nullptr;
-
   CUGRAPH_EXPECTS(
     std::any_of(edgelists.begin() + 1,
                 edgelists.end(),
-                [is_weighted](auto edgelist) {
-                  return (edgelist.p_src_vertices == nullptr) ||
-                         (edgelist.p_dst_vertices == nullptr) ||
-                         (is_weighted && (edgelist.p_edge_weights == nullptr)) ||
+                [is_weighted = properties.is_weighted](auto edgelist) {
+                  return ((edgelist.number_of_edges > 0) && (edgelist.p_src_vertices == nullptr)) ||
+                         ((edgelist.number_of_edges > 0) && (edgelist.p_dst_vertices == nullptr)) ||
+                         (is_weighted && (edgelist.number_of_edges > 0) &&
+                          (edgelist.p_edge_weights == nullptr)) ||
                          (!is_weighted && (edgelist.p_edge_weights != nullptr));
                 }) == false,
     "Invalid input argument: edgelists[].p_src_vertices and edgelists[].p_dst_vertices should not "
-    "be nullptr and edgelists[].p_edge_weights should be nullptr (if edgelists[0].p_edge_weights "
-    "is nullptr) or should not be nullptr (otherwise).");
+    "be nullptr if edgelists[].number_of_edges > 0 and edgelists[].p_edge_weights should be "
+    "nullptr if unweighted or should not be nullptr if weighted and edgelists[].number_of_edges > "
+    "0.");
 
-  CUGRAPH_EXPECTS((partition.is_hypergraph_partitioned() &&
-                   (edgelists.size() == static_cast(col_comm_size))) ||
-                  (!(partition.is_hypergraph_partitioned()) && (edgelists.size() == 1)),
+  CUGRAPH_EXPECTS(edgelists.size() == static_cast(col_comm_size),
                   "Invalid input argument: erroneous edgelists.size().");
 
   // optional expensive checks (part 1/3)
@@ -251,7 +248,7 @@ graph_tget_handle_ptr()->get_stream());
     adj_matrix_partition_offsets_.push_back(std::move(offsets));
     adj_matrix_partition_indices_.push_back(std::move(indices));
-    if (is_weighted) { adj_matrix_partition_weights_.push_back(std::move(weights)); }
+    if (properties.is_weighted) { adj_matrix_partition_weights_.push_back(std::move(weights)); }
   }
 
   // update degree-based segment offsets (to be used for graph analytics kernel optimization)
@@ -321,22 +319,12 @@ graph_t aggregate_segment_offsets(0, default_stream);
-    if (partition.is_hypergraph_partitioned()) {
-      rmm::device_uvector aggregate_segment_offsets(
-        col_comm_size * segment_offsets.size(), default_stream);
-      col_comm.allgather(segment_offsets.data(),
-                         aggregate_segment_offsets.data(),
-                         segment_offsets.size(),
-                         default_stream);
-    } else {
-      rmm::device_uvector aggregate_segment_offsets(
-        row_comm_size * segment_offsets.size(), default_stream);
-      row_comm.allgather(segment_offsets.data(),
-                         aggregate_segment_offsets.data(),
-                         segment_offsets.size(),
-                         default_stream);
-    }
+    rmm::device_uvector aggregate_segment_offsets(col_comm_size * segment_offsets.size(),
+                                                  default_stream);
+    col_comm.allgather(segment_offsets.data(),
+                       aggregate_segment_offsets.data(),
+                       segment_offsets.size(),
+                       default_stream);
 
     vertex_partition_segment_offsets_.resize(aggregate_segment_offsets.size());
     raft::update_host(vertex_partition_segment_offsets_.data(),
@@ -344,18 +332,10 @@ graph_tget_handle_ptr()->get_stream();
 
   CUGRAPH_EXPECTS(
-
(edgelist.p_src_vertices != nullptr) && (edgelist.p_dst_vertices != nullptr), + ((edgelist.number_of_edges == 0) || (edgelist.p_src_vertices != nullptr)) && + ((edgelist.number_of_edges == 0) || (edgelist.p_dst_vertices != nullptr)) && + ((properties.is_weighted && + ((edgelist.number_of_edges == 0) || (edgelist.p_edge_weights != nullptr))) || + (!properties.is_weighted && (edgelist.p_edge_weights == nullptr))), "Invalid input argument: edgelist.p_src_vertices and edgelist.p_dst_vertices should " - "not be nullptr."); + "not be nullptr if edgelist.number_of_edges > 0 and edgelist.p_edge_weights should be nullptr " + "if unweighted or should not be nullptr if weighted and edgelist.number_of_edges > 0."); // optional expensive checks (part 1/2) @@ -427,6 +412,7 @@ graph_tget_number_of_vertices(), vertex_t{0}, this->get_number_of_vertices(), + properties.is_weighted, this->get_handle_ptr()->get_stream()); // update degree-based segment offsets (to be used for graph analytics kernel optimization) diff --git a/cpp/src/experimental/graph_view.cu b/cpp/src/experimental/graph_view.cu index f443608e424..c6f39a44333 100644 --- a/cpp/src/experimental/graph_view.cu +++ b/cpp/src/experimental/graph_view.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -195,16 +196,12 @@ graph_view_t(row_comm_size))) || - (!(partition.is_hypergraph_partitioned()) && (adj_matrix_partition_offsets.size() == 1)), - "Internal Error: erroneous adj_matrix_partition_offsets.size()."); + CUGRAPH_EXPECTS(adj_matrix_partition_offsets.size() == static_cast(col_comm_size), + "Internal Error: erroneous adj_matrix_partition_offsets.size()."); CUGRAPH_EXPECTS((sorted_by_global_degree_within_vertex_partition && (vertex_partition_segment_offsets.size() == - (partition.is_hypergraph_partitioned() ? 
col_comm_size : row_comm_size) * - (detail::num_segments_per_vertex_partition + 1))) || + col_comm_size * (detail::num_segments_per_vertex_partition + 1))) || (!sorted_by_global_degree_within_vertex_partition && (vertex_partition_segment_offsets.size() == 0)), "Internal Error: vertex_partition_segment_offsets.size() does not match " @@ -267,8 +264,7 @@ graph_view_t graph_view_t< } } +template +edge_t +graph_view_t>:: + compute_max_in_degree(raft::handle_t const& handle) const +{ + auto in_degrees = compute_in_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_degrees.begin(), + in_degrees.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +edge_t graph_view_t>::compute_max_in_degree(raft::handle_t const& + handle) const +{ + auto in_degrees = compute_in_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_degrees.begin(), + in_degrees.end()); + edge_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + +template +edge_t +graph_view_t>:: + compute_max_out_degree(raft::handle_t const& handle) const +{ + auto out_degrees = compute_out_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_degrees.begin(), + out_degrees.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +edge_t graph_view_t>::compute_max_out_degree(raft::handle_t const& + handle) const +{ + auto out_degrees = compute_out_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_degrees.begin(), + out_degrees.end()); + edge_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + +template +weight_t +graph_view_t>:: + compute_max_in_weight_sum(raft::handle_t const& handle) const +{ + auto in_weight_sums = compute_in_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_weight_sums.begin(), + in_weight_sums.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +weight_t graph_view_t>::compute_max_in_weight_sum(raft::handle_t const& + handle) const +{ + auto in_weight_sums = compute_in_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_weight_sums.begin(), + in_weight_sums.end()); + weight_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + +template +weight_t +graph_view_t>:: + compute_max_out_weight_sum(raft::handle_t const& handle) const +{ + auto out_weight_sums = compute_out_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_weight_sums.begin(), + out_weight_sums.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, 
ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +weight_t graph_view_t< + vertex_t, + edge_t, + weight_t, + store_transposed, + multi_gpu, + std::enable_if_t>::compute_max_out_weight_sum(raft::handle_t const& handle) const +{ + auto out_weight_sums = compute_out_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_weight_sums.begin(), + out_weight_sums.end()); + weight_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + // explicit instantiation template class graph_view_t; diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 3136515faa6..24914fb028b 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -151,7 +151,8 @@ class Louvain { protected: void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(current_graph_view_.get_local_vertex_first(), num_vertices); + dendrogram_->add_level( + current_graph_view_.get_local_vertex_first(), num_vertices, handle_.get_stream()); thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), dendrogram_->current_level_begin(), @@ -369,8 +370,6 @@ class Louvain { current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); rmm::device_uvector src_cluster_weights_v(next_cluster_v.size(), handle_.get_stream()); - rmm::device_uvector dst_cluster_weights_v(next_cluster_v.size(), - handle_.get_stream()); compute_cluster_sum_and_subtract(old_cluster_sum_v, cluster_subtract_v); @@ -396,19 +395,9 @@ class Louvain { vertex_to_gpu_id_op, handle_.get_stream()); - dst_cluster_weights_v = cugraph::experimental::collect_values_for_keys( - handle_.get_comms(), - cluster_keys_v_.begin(), - cluster_keys_v_.end(), - cluster_weights_v_.data(), - d_dst_cluster_cache_, - d_dst_cluster_cache_ + dst_cluster_cache_v_.size(), - vertex_to_gpu_id_op, - handle_.get_stream()); - - map_key_first = d_dst_cluster_cache_; - map_key_last = d_dst_cluster_cache_ + dst_cluster_cache_v_.size(); - map_value_first = dst_cluster_weights_v.begin(); + map_key_first = cluster_keys_v_.begin(); + map_key_last = cluster_keys_v_.end(); + map_value_first = cluster_weights_v_.begin(); } else { thrust::sort_by_key(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), cluster_keys_v_.begin(), @@ -432,12 +421,21 @@ class Louvain { map_value_first = src_cluster_weights_v.begin(); } + rmm::device_uvector src_old_cluster_sum_v( + current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), handle_.get_stream()); + rmm::device_uvector src_cluster_subtract_v( + current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), handle_.get_stream()); + copy_to_adj_matrix_row( + handle_, current_graph_view_, old_cluster_sum_v.begin(), src_old_cluster_sum_v.begin()); + copy_to_adj_matrix_row( + handle_, current_graph_view_, cluster_subtract_v.begin(), src_cluster_subtract_v.begin()); + copy_v_transform_reduce_key_aggregated_out_nbr( handle_, current_graph_view_, - thrust::make_zip_iterator(thrust::make_tuple(old_cluster_sum_v.begin(), + thrust::make_zip_iterator(thrust::make_tuple(src_old_cluster_sum_v.begin(), d_src_vertex_weights_cache_, - cluster_subtract_v.begin(), + src_cluster_subtract_v.begin(), d_src_cluster_cache_, src_cluster_weights_v.begin())), diff --git a/cpp/src/experimental/relabel.cu b/cpp/src/experimental/relabel.cu index 
62bd6951f71..8d8fb0322a8 100644
--- a/cpp/src/experimental/relabel.cu
+++ b/cpp/src/experimental/relabel.cu
@@ -42,6 +42,7 @@
 namespace cugraph {
 namespace experimental {
 
+// FIXME: think about requiring old_new_label_pairs to be pre-shuffled
 template
 void relabel(raft::handle_t const& handle,
              std::tuple old_new_label_pairs,
@@ -120,7 +121,12 @@ void relabel(raft::handle_t const& handle,
         handle.get_stream()));  // cuco::static_map currently does not take stream
 
       cuco::static_map relabel_map{
-        static_cast(static_cast(rx_label_pair_old_labels.size()) / load_factor),
+        // FIXME: std::max(..., ...) as a temporary workaround for
+        // https://github.com/NVIDIA/cuCollections/issues/72 and
+        // https://github.com/NVIDIA/cuCollections/issues/73
+        std::max(
+          static_cast(static_cast(rx_label_pair_old_labels.size()) / load_factor),
+          rx_label_pair_old_labels.size() + 1),
         invalid_vertex_id::value,
         invalid_vertex_id::value};
 
@@ -130,7 +136,11 @@ void relabel(raft::handle_t const& handle,
         [] __device__(auto val) {
           return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
         });
-      relabel_map.insert(pair_first, pair_first + rx_label_pair_old_labels.size());
+      // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the
+      // grid size is 0; this leads to cudaErrorInvalidConfiguration.
+      if (rx_label_pair_old_labels.size() > 0) {
+        relabel_map.insert(pair_first, pair_first + rx_label_pair_old_labels.size());
+      }
 
       rx_label_pair_old_labels.resize(0, handle.get_stream());
       rx_label_pair_new_labels.resize(0, handle.get_stream());
@@ -152,19 +162,29 @@ void relabel(raft::handle_t const& handle,
       CUDA_TRY(cudaStreamSynchronize(
         handle.get_stream()));  // cuco::static_map currently does not take stream
 
-      relabel_map.find(
-        rx_unique_old_labels.begin(),
-        rx_unique_old_labels.end(),
-        rx_unique_old_labels
-          .begin());  // now rx_unique_old_labels hold new labels for the corresponding old labels
+      // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the
+      // grid size is 0; this leads to cudaErrorInvalidConfiguration.
+      if (rx_unique_old_labels.size() > 0) {
+        relabel_map.find(
+          rx_unique_old_labels.begin(),
+          rx_unique_old_labels.end(),
+          rx_unique_old_labels.begin());  // now rx_unique_old_labels hold new labels for the
+                                          // corresponding old labels
+      }
 
       std::tie(new_labels_for_unique_old_labels, std::ignore) = shuffle_values(
         handle.get_comms(), rx_unique_old_labels.begin(), rx_value_counts, handle.get_stream());
     }
   }
 
+  handle.get_stream_view().synchronize();  // cuco::static_map currently does not take stream
+
   cuco::static_map relabel_map(
-    static_cast(static_cast(unique_old_labels.size()) / load_factor),
+    // FIXME: std::max(..., ...) as a temporary workaround for
+    // https://github.com/NVIDIA/cuCollections/issues/72 and
+    // https://github.com/NVIDIA/cuCollections/issues/73
+    std::max(static_cast(static_cast(unique_old_labels.size()) / load_factor),
+             unique_old_labels.size() + 1),
     invalid_vertex_id::value,
     invalid_vertex_id::value);
 
@@ -175,11 +195,21 @@ void relabel(raft::handle_t const& handle,
       return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
     });
 
-    relabel_map.insert(pair_first, pair_first + unique_old_labels.size());
-    relabel_map.find(labels, labels + num_labels, labels);
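The capacity bump and the zero-size guards recur at every cuco::static_map call site in this file, so until the upstream cuCollections issues are fixed they could be factored into small helpers. A hedged sketch under that assumption follows: checked_capacity, checked_insert, and checked_find are hypothetical names (not existing cugraph or cuco API), and the only map members called are the insert/find overloads already used in this file.

#include <thrust/distance.h>
#include <algorithm>
#include <cstddef>

// Workaround for https://github.com/NVIDIA/cuCollections/issues/72 and
// https://github.com/NVIDIA/cuCollections/issues/73: keep the capacity strictly
// greater than the number of keys even when the load-factor division truncates
// to a too-small (or zero) value.
inline std::size_t checked_capacity(std::size_t num_keys, double load_factor)
{
  return std::max(static_cast<std::size_t>(static_cast<double>(num_keys) / load_factor),
                  num_keys + 1);
}

// cuco::static_map currently launches a kernel even for an empty input range,
// which fails with cudaErrorInvalidConfiguration; skip the call instead.
template <typename map_t, typename PairIterator>
void checked_insert(map_t& map, PairIterator first, PairIterator last)
{
  if (thrust::distance(first, last) > 0) { map.insert(first, last); }
}

template <typename map_t, typename KeyIterator, typename ResultIterator>
void checked_find(map_t& map, KeyIterator first, KeyIterator last, ResultIterator result)
{
  if (thrust::distance(first, last) > 0) { map.find(first, last, result); }
}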
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (unique_old_labels.size() > 0) {
+      relabel_map.insert(pair_first, pair_first + unique_old_labels.size());
+    }
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (num_labels > 0) { relabel_map.find(labels, labels + num_labels, labels); }
   } else {
     cuco::static_map relabel_map(
-      static_cast(static_cast(num_label_pairs) / load_factor),
+      // FIXME: std::max(..., ...) as a temporary workaround for
+      // https://github.com/NVIDIA/cuCollections/issues/72 and
+      // https://github.com/NVIDIA/cuCollections/issues/73
+      std::max(static_cast(static_cast(num_label_pairs) / load_factor),
+               static_cast(num_label_pairs) + 1),
       invalid_vertex_id::value,
       invalid_vertex_id::value);
 
@@ -190,8 +220,12 @@ void relabel(raft::handle_t const& handle,
       return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
     });
 
-    relabel_map.insert(pair_first, pair_first + num_label_pairs);
-    relabel_map.find(labels, labels + num_labels, labels);
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (num_label_pairs > 0) { relabel_map.insert(pair_first, pair_first + num_label_pairs); }
+    // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid
+    // size is 0; this leads to cudaErrorInvalidConfiguration.
+    if (num_labels > 0) { relabel_map.find(labels, labels + num_labels, labels); }
   }
 
   if (do_expensive_check) {
diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu
index a8847167b87..127bd507271 100644
--- a/cpp/src/experimental/renumber_edgelist.cu
+++ b/cpp/src/experimental/renumber_edgelist.cu
@@ -50,62 +50,153 @@ rmm::device_uvector compute_renumber_map(
   raft::handle_t const& handle,
   vertex_t const* vertices,
   vertex_t num_local_vertices /* relevant only if vertices != nullptr */,
-  vertex_t const* edgelist_major_vertices,
-  vertex_t const* edgelist_minor_vertices,
-  edge_t num_edgelist_edges)
+  std::vector const& edgelist_major_vertices,
+  std::vector const& edgelist_minor_vertices,
+  std::vector const& edgelist_edge_counts)
 {
   // FIXME: compare this sort based approach with hash based approach in both speed and memory
   // footprint
 
   // 1.
acquire (unique major label, count) pairs - rmm::device_uvector tmp_labels(num_edgelist_edges, handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - tmp_labels.begin()); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - tmp_labels.begin(), - tmp_labels.end()); - rmm::device_uvector major_labels(tmp_labels.size(), handle.get_stream()); - rmm::device_uvector major_counts(major_labels.size(), handle.get_stream()); - auto major_pair_it = - thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - tmp_labels.begin(), - tmp_labels.end(), - thrust::make_constant_iterator(edge_t{1}), - major_labels.begin(), - major_counts.begin()); - tmp_labels.resize(0, handle.get_stream()); - tmp_labels.shrink_to_fit(handle.get_stream()); - major_labels.resize(thrust::distance(major_labels.begin(), thrust::get<0>(major_pair_it)), - handle.get_stream()); - major_counts.resize(major_labels.size(), handle.get_stream()); - major_labels.shrink_to_fit(handle.get_stream()); - major_counts.shrink_to_fit(handle.get_stream()); + rmm::device_uvector major_labels(0, handle.get_stream()); + rmm::device_uvector major_counts(0, handle.get_stream()); + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + rmm::device_uvector tmp_major_labels(0, handle.get_stream()); + rmm::device_uvector tmp_major_counts(0, handle.get_stream()); + { + rmm::device_uvector sorted_major_labels(edgelist_edge_counts[i], + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_major_vertices[i], + edgelist_major_vertices[i] + edgelist_edge_counts[i], + sorted_major_labels.begin()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_major_labels.begin(), + sorted_major_labels.end()); + auto num_unique_labels = + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(sorted_major_labels.size()), + [labels = sorted_major_labels.data()] __device__(auto i) { + return (i == 0) || (labels[i - 1] != labels[i]); + }); + tmp_major_labels.resize(num_unique_labels, handle.get_stream()); + tmp_major_counts.resize(tmp_major_labels.size(), handle.get_stream()); + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_major_labels.begin(), + sorted_major_labels.end(), + thrust::make_constant_iterator(edge_t{1}), + tmp_major_labels.begin(), + tmp_major_counts.begin()); + } + + if (multi_gpu) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + rmm::device_uvector rx_major_labels(0, handle.get_stream()); + rmm::device_uvector rx_major_counts(0, handle.get_stream()); + auto rx_sizes = host_scalar_gather( + col_comm, tmp_major_labels.size(), static_cast(i), handle.get_stream()); + std::vector rx_displs{}; + if (static_cast(i) == col_comm_rank) { + rx_displs.assign(col_comm_size, size_t{0}); + std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); + rx_major_labels.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); + rx_major_counts.resize(rx_major_labels.size(), handle.get_stream()); + } + device_gatherv(col_comm, + thrust::make_zip_iterator( + 
thrust::make_tuple(tmp_major_labels.begin(), tmp_major_counts.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(rx_major_labels.begin(), rx_major_counts.begin())), + tmp_major_labels.size(), + rx_sizes, + rx_displs, + static_cast(i), + handle.get_stream()); + if (static_cast(i) == col_comm_rank) { + thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_major_labels.begin(), + rx_major_labels.end(), + rx_major_counts.begin()); + major_labels.resize(rx_major_labels.size(), handle.get_stream()); + major_counts.resize(major_labels.size(), handle.get_stream()); + auto pair_it = + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_major_labels.begin(), + rx_major_labels.end(), + rx_major_counts.begin(), + major_labels.begin(), + major_counts.begin()); + major_labels.resize(thrust::distance(major_labels.begin(), thrust::get<0>(pair_it)), + handle.get_stream()); + major_counts.resize(major_labels.size(), handle.get_stream()); + major_labels.shrink_to_fit(handle.get_stream()); + major_counts.shrink_to_fit(handle.get_stream()); + } + } else { + tmp_major_labels.shrink_to_fit(handle.get_stream()); + tmp_major_counts.shrink_to_fit(handle.get_stream()); + major_labels = std::move(tmp_major_labels); + major_counts = std::move(tmp_major_counts); + } + } // 2. acquire unique minor labels - rmm::device_uvector minor_labels(num_edgelist_edges, handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_minor_vertices, - edgelist_minor_vertices + num_edgelist_edges, - minor_labels.begin()); + std::vector minor_displs(edgelist_minor_vertices.size(), edge_t{0}); + std::partial_sum( + edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1); + rmm::device_uvector minor_labels(minor_displs.back() + edgelist_edge_counts.back(), + handle.get_stream()); + for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) { + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_minor_vertices[i], + edgelist_minor_vertices[i] + edgelist_edge_counts[i], + minor_labels.begin() + minor_displs[i]); + } thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), minor_labels.begin(), minor_labels.end()); - auto minor_label_it = - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - minor_labels.begin(), - minor_labels.end()); - minor_labels.resize(thrust::distance(minor_labels.begin(), minor_label_it), handle.get_stream()); + minor_labels.resize( + thrust::distance(minor_labels.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + minor_labels.begin(), + minor_labels.end())), + handle.get_stream()); + if (multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + + rmm::device_uvector rx_minor_labels(0, handle.get_stream()); + std::tie(rx_minor_labels, std::ignore) = groupby_gpuid_and_shuffle_values( + row_comm, + minor_labels.begin(), + minor_labels.end(), + [key_func = detail::compute_gpu_id_from_vertex_t{row_comm_size}] __device__( + auto val) { return key_func(val); }, + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_minor_labels.begin(), + rx_minor_labels.end()); + rx_minor_labels.resize( + thrust::distance( + rx_minor_labels.begin(), + 
thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_minor_labels.begin(), + rx_minor_labels.end())), + handle.get_stream()); + minor_labels = std::move(rx_minor_labels); + } minor_labels.shrink_to_fit(handle.get_stream()); // 3. merge major and minor labels and vertex labels rmm::device_uvector merged_labels(major_labels.size() + minor_labels.size(), handle.get_stream()); - rmm::device_uvector merged_counts(merged_labels.size(), handle.get_stream()); thrust::merge_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), major_labels.begin(), @@ -142,47 +233,7 @@ rmm::device_uvector compute_renumber_map( labels.shrink_to_fit(handle.get_stream()); counts.shrink_to_fit(handle.get_stream()); - // 4. if multi-GPU, shuffle and reduce (label, count) pairs - - if (multi_gpu) { - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - - auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(labels.begin(), counts.begin())); - rmm::device_uvector rx_labels(0, handle.get_stream()); - rmm::device_uvector rx_counts(0, handle.get_stream()); - std::forward_as_tuple(std::tie(rx_labels, rx_counts), std::ignore) = - groupby_gpuid_and_shuffle_values( - comm, - pair_first, - pair_first + labels.size(), - [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}] __device__( - auto val) { return key_func(thrust::get<0>(val)); }, - handle.get_stream()); - - labels.resize(rx_labels.size(), handle.get_stream()); - counts.resize(labels.size(), handle.get_stream()); - thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_labels.begin(), - rx_labels.end(), - rx_counts.begin()); - pair_it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_labels.begin(), - rx_labels.end(), - rx_counts.begin(), - labels.begin(), - counts.begin()); - rx_labels.resize(0, handle.get_stream()); - rx_counts.resize(0, handle.get_stream()); - rx_labels.shrink_to_fit(handle.get_stream()); - rx_counts.shrink_to_fit(handle.get_stream()); - labels.resize(thrust::distance(labels.begin(), thrust::get<0>(pair_it)), handle.get_stream()); - counts.resize(labels.size(), handle.get_stream()); - labels.shrink_to_fit(handle.get_stream()); - labels.shrink_to_fit(handle.get_stream()); - } - - // 5. if vertices != nullptr, add isolated vertices + // 4. if vertices != nullptr, add isolated vertices rmm::device_uvector isolated_vertices(0, handle.get_stream()); if (vertices != nullptr) { @@ -232,10 +283,9 @@ void expensive_check_edgelist( raft::handle_t const& handle, vertex_t const* local_vertices, vertex_t num_local_vertices /* relevant only if local_vertices != nullptr */, - vertex_t const* edgelist_major_vertices, - vertex_t const* edgelist_minor_vertices, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned /* relevant only if multi_gpu == true */) + std::vector const& edgelist_major_vertices, + std::vector const& edgelist_minor_vertices, + std::vector const& edgelist_edge_counts) { rmm::device_uvector sorted_local_vertices( local_vertices != nullptr ? 
num_local_vertices : vertex_t{0}, handle.get_stream()); @@ -246,6 +296,12 @@ void expensive_check_edgelist( thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), sorted_local_vertices.begin(), sorted_local_vertices.end()); + CUGRAPH_EXPECTS(static_cast(thrust::distance( + sorted_local_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_local_vertices.begin(), + sorted_local_vertices.end()))) == sorted_local_vertices.size(), + "Invalid input argument: local_vertices should not have duplicates."); if (multi_gpu) { auto& comm = handle.get_comms(); @@ -253,8 +309,15 @@ void expensive_check_edgelist( auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto const row_comm_size = row_comm.get_size(); + auto const row_comm_rank = row_comm.get_rank(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); + auto const col_comm_rank = col_comm.get_rank(); + + CUGRAPH_EXPECTS((edgelist_major_vertices.size() == edgelist_minor_vertices.size()) && + (edgelist_major_vertices.size() == static_cast(col_comm_size)), + "Invalid input argument: both edgelist_major_vertices.size() & " + "edgelist_minor_vertices.size() should coincide with col_comm_size."); CUGRAPH_EXPECTS( thrust::count_if( @@ -268,95 +331,127 @@ void expensive_check_edgelist( }) == 0, "Invalid input argument: local_vertices should be pre-shuffled."); - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices)); - CUGRAPH_EXPECTS( - thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edge_first, - edge_first + num_edgelist_edges, - [comm_rank, - key_func = - detail::compute_gpu_id_from_edge_t{is_hypergraph_partitioned, - comm_size, - row_comm_size, - col_comm_size}] __device__(auto edge) { - return key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != comm_rank; - }) == 0, - "Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be " - "pre-shuffled."); - - if (local_vertices != nullptr) { - rmm::device_uvector unique_edge_vertices(num_edgelist_edges * 2, - handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - unique_edge_vertices.begin()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_minor_vertices, - edgelist_minor_vertices + num_edgelist_edges, - unique_edge_vertices.begin() + num_edgelist_edges); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_edge_vertices.begin(), - unique_edge_vertices.end()); - unique_edge_vertices.resize( - thrust::distance( - unique_edge_vertices.begin(), - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_edge_vertices.begin(), - unique_edge_vertices.end())), - handle.get_stream()); - - rmm::device_uvector rx_unique_edge_vertices(0, handle.get_stream()); - std::tie(rx_unique_edge_vertices, std::ignore) = groupby_gpuid_and_shuffle_values( - handle.get_comms(), - unique_edge_vertices.begin(), - unique_edge_vertices.end(), - [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}] __device__( - auto val) { return key_func(val); }, - handle.get_stream()); - - unique_edge_vertices = std::move(rx_unique_edge_vertices); - + for 
(size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i])); CUGRAPH_EXPECTS( thrust::count_if( rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_edge_vertices.begin(), - unique_edge_vertices.end(), - [num_local_vertices, - sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) { - return !thrust::binary_search( - thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v); + edge_first, + edge_first + edgelist_edge_counts[i], + [comm_size, + comm_rank, + row_comm_rank, + col_comm_size, + col_comm_rank, + i, + gpu_id_key_func = + detail::compute_gpu_id_from_edge_t{comm_size, row_comm_size, col_comm_size}, + partition_id_key_func = + detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto edge) { + return (gpu_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != comm_rank) || + (partition_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != + row_comm_rank * col_comm_size + col_comm_rank + i * comm_size); }) == 0, - "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " - "invalid vertex ID(s)."); + "Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be " + "pre-shuffled."); + + auto aggregate_vertexlist_size = host_scalar_allreduce( + comm, + local_vertices != nullptr ? num_local_vertices : vertex_t{0}, + handle.get_stream()); // local_vertices != nullptr is insufficient in multi-GPU as only a + // subset of GPUs may have a non-zero number of vertices + if (aggregate_vertexlist_size > 0) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + + rmm::device_uvector sorted_major_vertices(0, handle.get_stream()); + { + auto recvcounts = + host_scalar_allgather(col_comm, sorted_local_vertices.size(), handle.get_stream()); + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + sorted_major_vertices.resize(displacements.back() + recvcounts.back(), + handle.get_stream()); + device_allgatherv(col_comm, + sorted_local_vertices.data(), + sorted_major_vertices.data(), + recvcounts, + displacements, + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_major_vertices.begin(), + sorted_major_vertices.end()); + } + + rmm::device_uvector sorted_minor_vertices(0, handle.get_stream()); + { + auto recvcounts = + host_scalar_allgather(row_comm, sorted_local_vertices.size(), handle.get_stream()); + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + sorted_minor_vertices.resize(displacements.back() + recvcounts.back(), + handle.get_stream()); + device_allgatherv(row_comm, + sorted_local_vertices.data(), + sorted_minor_vertices.data(), + recvcounts, + displacements, + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_minor_vertices.begin(), + sorted_minor_vertices.end()); + } + + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i])); + CUGRAPH_EXPECTS( + thrust::count_if( + 
rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + edgelist_edge_counts[i], + [num_major_vertices = static_cast(sorted_major_vertices.size()), + sorted_major_vertices = sorted_major_vertices.data(), + num_minor_vertices = static_cast(sorted_minor_vertices.size()), + sorted_minor_vertices = sorted_minor_vertices.data()] __device__(auto e) { + return !thrust::binary_search(thrust::seq, + sorted_major_vertices, + sorted_major_vertices + num_major_vertices, + thrust::get<0>(e)) || + !thrust::binary_search(thrust::seq, + sorted_minor_vertices, + sorted_minor_vertices + num_minor_vertices, + thrust::get<1>(e)); + }) == 0, + "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " + "invalid vertex ID(s)."); + } } } else { - if (local_vertices != nullptr) { - CUGRAPH_EXPECTS( - thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - [num_local_vertices, - sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) { - return !thrust::binary_search( - thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v); - }) == 0, - "Invalid input argument: edgelist_major_vertices has invalid vertex ID(s)."); + assert(edgelist_major_vertices.size() == 1); + assert(edgelist_minor_vertices.size() == 1); + if (local_vertices != nullptr) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[0], edgelist_minor_vertices[0])); CUGRAPH_EXPECTS( - thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - [num_local_vertices, - sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) { - return !thrust::binary_search( - thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v); - }) == 0, - "Invalid input argument: edgelist_major_vertices has invalid vertex ID(s)."); + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + edgelist_edge_counts[0], + [num_local_vertices, + sorted_local_vertices = sorted_local_vertices.data()] __device__(auto e) { + return !thrust::binary_search(thrust::seq, + sorted_local_vertices, + sorted_local_vertices + num_local_vertices, + thrust::get<0>(e)) || + !thrust::binary_search(thrust::seq, + sorted_local_vertices, + sorted_local_vertices + num_local_vertices, + thrust::get<1>(e)); + }) == 0, + "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " + "invalid vertex ID(s)."); } } } @@ -368,15 +463,15 @@ std::enable_if_t const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); #ifdef CUCO_STATIC_MAP_DEFINED auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); @@ -389,14 +484,20 @@ renumber_edgelist(raft::handle_t const& handle, auto const col_comm_size = col_comm.get_size(); auto const col_comm_rank = col_comm.get_rank(); + std::vector 
edgelist_const_major_vertices(edgelist_major_vertices.size()); + std::vector edgelist_const_minor_vertices(edgelist_const_major_vertices.size()); + for (size_t i = 0; i < edgelist_const_major_vertices.size(); ++i) { + edgelist_const_major_vertices[i] = edgelist_major_vertices[i]; + edgelist_const_minor_vertices[i] = edgelist_minor_vertices[i]; + } + if (do_expensive_check) { expensive_check_edgelist(handle, local_vertices, num_local_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned); + edgelist_const_major_vertices, + edgelist_const_minor_vertices, + edgelist_edge_counts); } // 1. compute renumber map @@ -405,142 +506,129 @@ renumber_edgelist(raft::handle_t const& handle, detail::compute_renumber_map(handle, local_vertices, num_local_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, - num_edgelist_edges); + edgelist_const_major_vertices, + edgelist_const_minor_vertices, + edgelist_edge_counts); // 2. initialize partition_t object, number_of_vertices, and number_of_edges for the renumbered // graph - auto vertex_partition_counts = host_scalar_allgather( + auto vertex_counts = host_scalar_allgather( comm, static_cast(renumber_map_labels.size()), handle.get_stream()); std::vector vertex_partition_offsets(comm_size + 1, 0); - std::partial_sum(vertex_partition_counts.begin(), - vertex_partition_counts.end(), - vertex_partition_offsets.begin() + 1); + std::partial_sum( + vertex_counts.begin(), vertex_counts.end(), vertex_partition_offsets.begin() + 1); - partition_t partition(vertex_partition_offsets, - is_hypergraph_partitioned, - row_comm_size, - col_comm_size, - row_comm_rank, - col_comm_rank); + partition_t partition( + vertex_partition_offsets, row_comm_size, col_comm_size, row_comm_rank, col_comm_rank); auto number_of_vertices = vertex_partition_offsets.back(); - auto number_of_edges = host_scalar_allreduce(comm, num_edgelist_edges, handle.get_stream()); + auto number_of_edges = host_scalar_allreduce( + comm, + std::accumulate(edgelist_edge_counts.begin(), edgelist_edge_counts.end(), edge_t{0}), + handle.get_stream()); // 3. 
renumber edges - if (is_hypergraph_partitioned) { - CUGRAPH_FAIL("unimplemented."); - } else { - double constexpr load_factor = 0.7; + double constexpr load_factor = 0.7; - // FIXME: compare this hash based approach with a binary search based approach in both memory - // footprint and execution time + // FIXME: compare this hash based approach with a binary search based approach in both memory + // footprint and execution time - { - vertex_t major_first{}; - vertex_t major_last{}; - std::tie(major_first, major_last) = partition.get_matrix_partition_major_range(0); - rmm::device_uvector renumber_map_major_labels(major_last - major_first, - handle.get_stream()); - std::vector recvcounts(row_comm_size); - for (int i = 0; i < row_comm_size; ++i) { - recvcounts[i] = partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i); - } - std::vector displacements(row_comm_size, 0); - std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); - device_allgatherv(row_comm, - renumber_map_labels.begin(), - renumber_map_major_labels.begin(), - recvcounts, - displacements, - handle.get_stream()); - - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // cuco::static_map currently does not take stream - - cuco::static_map renumber_map{ - static_cast(static_cast(renumber_map_major_labels.size()) / load_factor), - invalid_vertex_id::value, - invalid_vertex_id::value}; - auto pair_first = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(renumber_map_major_labels.begin(), - thrust::make_counting_iterator(major_first))), - [] __device__(auto val) { - return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); - }); - renumber_map.insert(pair_first, pair_first + renumber_map_major_labels.size()); - renumber_map.find(edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - edgelist_major_vertices); + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + rmm::device_uvector renumber_map_major_labels( + col_comm_rank == static_cast(i) ? vertex_t{0} + : partition.get_matrix_partition_major_size(i), + handle.get_stream()); + device_bcast(col_comm, + renumber_map_labels.data(), + renumber_map_major_labels.data(), + partition.get_matrix_partition_major_size(i), + i, + handle.get_stream()); + + CUDA_TRY(cudaStreamSynchronize( + handle.get_stream())); // cuco::static_map currently does not take stream + + cuco::static_map renumber_map{ + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast( + static_cast(partition.get_matrix_partition_major_size(i)) / load_factor), + static_cast(partition.get_matrix_partition_major_size(i)) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value}; + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple( + col_comm_rank == static_cast(i) ? renumber_map_labels.begin() + : renumber_map_major_labels.begin(), + thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i)))), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. 
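The FIXME near the top of this hunk asks for a comparison with a binary-search-based approach. For reference, a minimal sketch of what that alternative might look like — hypothetical code, not part of this change, reusing the same RMM/Thrust idioms as the surrounding file (the `rmm::exec_policy(stream)->on(stream)` form and include paths assume the RMM version this PR builds against) — replaces the cuco::static_map insert/find pair with a sort plus `thrust::lower_bound`/`thrust::gather`:

```cpp
#include <rmm/device_uvector.hpp>
#include <rmm/thrust_rmm_allocator.h>  // rmm::exec_policy (assumed include path)

#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/gather.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

// Hypothetical binary-search-based renumbering (sketch only): labels[j] holds the
// external label of internal vertex (major_first + j); majors is renumbered in place.
// Every element of majors is assumed to appear in labels.
template <typename vertex_t>
void renumber_majors_by_binary_search(cudaStream_t stream,
                                      rmm::device_uvector<vertex_t> const& labels,
                                      vertex_t major_first,
                                      vertex_t* majors,
                                      size_t num_edges)
{
  // sort (label, internal ID) pairs by label once
  rmm::device_uvector<vertex_t> sorted_labels(labels.size(), stream);
  rmm::device_uvector<vertex_t> sorted_ids(labels.size(), stream);
  thrust::copy(
    rmm::exec_policy(stream)->on(stream), labels.begin(), labels.end(), sorted_labels.begin());
  thrust::sequence(
    rmm::exec_policy(stream)->on(stream), sorted_ids.begin(), sorted_ids.end(), major_first);
  thrust::sort_by_key(rmm::exec_policy(stream)->on(stream),
                      sorted_labels.begin(),
                      sorted_labels.end(),
                      sorted_ids.begin());

  // for each edge endpoint, binary-search its position among the sorted labels, then
  // gather the matching internal ID; O(|E| log |V|) lookups, but no hash table storage
  rmm::device_uvector<vertex_t> positions(num_edges, stream);
  thrust::lower_bound(rmm::exec_policy(stream)->on(stream),
                      sorted_labels.begin(),
                      sorted_labels.end(),
                      majors,
                      majors + num_edges,
                      positions.begin());
  thrust::gather(rmm::exec_policy(stream)->on(stream),
                 positions.begin(),
                 positions.end(),
                 sorted_ids.begin(),
                 majors);
}
```

The trade-off the FIXME alludes to: the hash map costs extra memory (capacity / load factor) for O(1) expected lookups, while the sketch above costs only the sorted copies but pays a log factor per lookup.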
+ if (partition.get_matrix_partition_major_size(i) > 0) { + renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i)); } + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (edgelist_edge_counts[i]) { + renumber_map.find(edgelist_major_vertices[i], + edgelist_major_vertices[i] + edgelist_edge_counts[i], + edgelist_major_vertices[i]); + } + } - { - vertex_t minor_first{}; - vertex_t minor_last{}; - std::tie(minor_first, minor_last) = partition.get_matrix_partition_minor_range(); - rmm::device_uvector renumber_map_minor_labels(minor_last - minor_first, - handle.get_stream()); - - // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph - // partitioning - auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; - auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - // FIXME: this branch may be no longer necessary with NCCL backend - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - renumber_map_labels.begin(), - renumber_map_labels.end(), - renumber_map_minor_labels.begin() + - (partition.get_vertex_partition_first(comm_src_rank) - - partition.get_vertex_partition_first(row_comm_rank * col_comm_size))); - } else { - device_sendrecv(comm, - renumber_map_labels.begin(), - renumber_map_labels.size(), - comm_dst_rank, - renumber_map_minor_labels.begin() + - (partition.get_vertex_partition_first(comm_src_rank) - - partition.get_vertex_partition_first(row_comm_rank * col_comm_size)), - static_cast(partition.get_vertex_partition_size(comm_src_rank)), - comm_src_rank, - handle.get_stream()); - } - - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - for (int i = 0; i < col_comm_size; ++i) { - auto offset = partition.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - - partition.get_vertex_partition_first(row_comm_rank * col_comm_size); - auto count = partition.get_vertex_partition_size(row_comm_rank * col_comm_size + i); - device_bcast(col_comm, - renumber_map_minor_labels.begin() + offset, - renumber_map_minor_labels.begin() + offset, - count, - i, - handle.get_stream()); - } + { + rmm::device_uvector renumber_map_minor_labels( + partition.get_matrix_partition_minor_size(), handle.get_stream()); + std::vector recvcounts(row_comm_size); + for (int i = 0; i < row_comm_size; ++i) { + recvcounts[i] = partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i); + } + std::vector displacements(recvcounts.size(), 0); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + device_allgatherv(row_comm, + renumber_map_labels.begin(), + renumber_map_minor_labels.begin(), + recvcounts, + displacements, + handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // cuco::static_map currently does not take stream + CUDA_TRY(cudaStreamSynchronize( + handle.get_stream())); // cuco::static_map currently does not take stream - cuco::static_map renumber_map{ + cuco::static_map renumber_map{ + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max( static_cast(static_cast(renumber_map_minor_labels.size()) / load_factor), - invalid_vertex_id::value, - invalid_vertex_id::value}; - auto pair_first = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(renumber_map_minor_labels.begin(), - thrust::make_counting_iterator(minor_first))), - [] __device__(auto val) { - return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); - }); + renumber_map_minor_labels.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value}; + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple( + renumber_map_minor_labels.begin(), + thrust::make_counting_iterator(partition.get_matrix_partition_minor_first()))), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (renumber_map_minor_labels.size()) { renumber_map.insert(pair_first, pair_first + renumber_map_minor_labels.size()); - renumber_map.find(edgelist_minor_vertices, - edgelist_minor_vertices + num_edgelist_edges, - edgelist_minor_vertices); + } + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the + // grid size is 0; this leads to cudaErrorInvalidConfiguration. + if (edgelist_edge_counts[i]) { + renumber_map.find(edgelist_minor_vertices[i], + edgelist_minor_vertices[i] + edgelist_edge_counts[i], + edgelist_minor_vertices[i]); + } } } @@ -565,27 +653,28 @@ std::enable_if_t> renumber_edgelist( bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); #ifdef CUCO_STATIC_MAP_DEFINED if (do_expensive_check) { - expensive_check_edgelist(handle, - vertices, - num_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, - num_edgelist_edges, - false); + expensive_check_edgelist( + handle, + vertices, + num_vertices, + std::vector{edgelist_major_vertices}, + std::vector{edgelist_minor_vertices}, + std::vector{num_edgelist_edges}); } - auto renumber_map_labels = - detail::compute_renumber_map(handle, - vertices, - num_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, - num_edgelist_edges); + auto renumber_map_labels = detail::compute_renumber_map( + handle, + vertices, + num_vertices, + std::vector{edgelist_major_vertices}, + std::vector{edgelist_minor_vertices}, + std::vector{num_edgelist_edges}); double constexpr load_factor = 0.7; @@ -593,7 +682,11 @@ std::enable_if_t> renumber_edgelist( // footprint and execution time cuco::static_map renumber_map{ - static_cast(static_cast(renumber_map_labels.size()) / load_factor), + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(renumber_map_labels.size()) / load_factor), + renumber_map_labels.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value}; auto pair_first = thrust::make_transform_iterator( @@ -602,11 +695,21 @@ std::enable_if_t> renumber_edgelist( [] __device__(auto val) { return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); - renumber_map.insert(pair_first, pair_first + renumber_map_labels.size()); - renumber_map.find( - edgelist_major_vertices, edgelist_major_vertices + num_edgelist_edges, edgelist_major_vertices); - renumber_map.find( - edgelist_minor_vertices, edgelist_minor_vertices + num_edgelist_edges, edgelist_minor_vertices); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (renumber_map_labels.size()) { + renumber_map.insert(pair_first, pair_first + renumber_map_labels.size()); + } + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (num_edgelist_edges > 0) { + renumber_map.find(edgelist_major_vertices, + edgelist_major_vertices + num_edgelist_edges, + edgelist_major_vertices); + renumber_map.find(edgelist_minor_vertices, + edgelist_minor_vertices + num_edgelist_edges, + edgelist_minor_vertices); + } return renumber_map_labels; #else @@ -620,22 +723,21 @@ template std::enable_if_t, partition_t, vertex_t, edge_t>> renumber_edgelist(raft::handle_t const& handle, - vertex_t* edgelist_major_vertices /* [INOUT] */, - vertex_t* edgelist_minor_vertices /* [INOUT] */, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, static_cast(nullptr), vertex_t{0}, edgelist_major_vertices, edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned, + edgelist_edge_counts, do_expensive_check); } @@ -648,8 +750,9 @@ std::enable_if_t> renumber_edgelist( bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, static_cast(nullptr), vertex_t{0} /* dummy */, @@ -665,22 +768,21 @@ std::enable_if_t const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + 
handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, local_vertices, num_local_vertices, edgelist_major_vertices, edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned, + edgelist_edge_counts, do_expensive_check); } @@ -695,8 +797,9 @@ std::enable_if_t> renumber_edgelist( bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, vertices, num_vertices, @@ -711,12 +814,12 @@ std::enable_if_t> renumber_edgelist( // instantiations for // template std::tuple, partition_t, int32_t, int32_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -726,14 +829,14 @@ template rmm::device_uvector renumber_edgelist bool do_expensive_check); template std::tuple, partition_t, int32_t, int32_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t const* local_vertices, - int32_t num_local_vertices, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + int32_t const* local_vertices, + int32_t num_local_vertices, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -747,12 +850,12 @@ template rmm::device_uvector renumber_edgelist // instantiations for // template std::tuple, partition_t, int32_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -762,14 +865,14 @@ template rmm::device_uvector renumber_edgelist bool do_expensive_check); template std::tuple, partition_t, int32_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t const* local_vertices, - int32_t num_local_vertices, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + 
raft::handle_t const& handle, + int32_t const* local_vertices, + int32_t num_local_vertices, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -783,12 +886,12 @@ template rmm::device_uvector renumber_edgelist // instantiations for // template std::tuple, partition_t, int64_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int64_t* edgelist_major_vertices /* [INOUT] */, - int64_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -798,14 +901,14 @@ template rmm::device_uvector renumber_edgelist bool do_expensive_check); template std::tuple, partition_t, int64_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int64_t const* local_vertices, - int64_t num_local_vertices, - int64_t* edgelist_major_vertices /* [INOUT] */, - int64_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + int64_t const* local_vertices, + int64_t num_local_vertices, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, diff --git a/cpp/src/experimental/renumber_utils.cu b/cpp/src/experimental/renumber_utils.cu new file mode 100644 index 00000000000..8f59683d9d6 --- /dev/null +++ b/cpp/src/experimental/renumber_utils.cu @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { + +template +void renumber_ext_vertices(raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + bool do_expensive_check) +{ + double constexpr load_factor = 0.7; + + // FIXME: remove this check once we drop Pascal support + CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, + "renumber_vertices() not supported on Pascal and older architectures."); + +#ifdef CUCO_STATIC_MAP_DEFINED + if (do_expensive_check) { + rmm::device_uvector labels(local_int_vertex_last - local_int_vertex_first, + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + renumber_map_labels, + renumber_map_labels + labels.size(), + labels.begin()); + thrust::sort( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), labels.begin(), labels.end()); + CUGRAPH_EXPECTS(thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + labels.begin(), + labels.end()) == labels.end(), + "Invalid input arguments: renumber_map_labels have duplicate elements."); + } + + auto renumber_map_ptr = std::make_unique>( + size_t{0}, invalid_vertex_id::value, invalid_vertex_id::value); + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector sorted_unique_ext_vertices(num_vertices, handle.get_stream()); + sorted_unique_ext_vertices.resize( + thrust::distance( + sorted_unique_ext_vertices.begin(), + thrust::copy_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + sorted_unique_ext_vertices.begin(), + [] __device__(auto v) { return v != invalid_vertex_id::value; })), + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_ext_vertices.begin(), + sorted_unique_ext_vertices.end()); + sorted_unique_ext_vertices.resize( + thrust::distance( + sorted_unique_ext_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_ext_vertices.begin(), + sorted_unique_ext_vertices.end())), + handle.get_stream()); + + auto int_vertices_for_sorted_unique_ext_vertices = collect_values_for_unique_keys( + comm, + renumber_map_labels, + renumber_map_labels + (local_int_vertex_last - local_int_vertex_first), + thrust::make_counting_iterator(local_int_vertex_first), + sorted_unique_ext_vertices.begin(), + sorted_unique_ext_vertices.end(), + detail::compute_gpu_id_from_vertex_t{comm_size}, + handle.get_stream()); + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + renumber_map_ptr.reset(); + + renumber_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max( + static_cast(static_cast(sorted_unique_ext_vertices.size()) / load_factor), + sorted_unique_ext_vertices.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto kv_pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple( + sorted_unique_ext_vertices.begin(), int_vertices_for_sorted_unique_ext_vertices.begin())), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (sorted_unique_ext_vertices.size()) { + renumber_map_ptr->insert(kv_pair_first, kv_pair_first + sorted_unique_ext_vertices.size()); + } + } else { + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + renumber_map_ptr.reset(); + + renumber_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast( + static_cast(local_int_vertex_last - local_int_vertex_first) / load_factor), + static_cast(local_int_vertex_last - local_int_vertex_first) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(renumber_map_labels, thrust::make_counting_iterator(vertex_t{0}))), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if ((local_int_vertex_last - local_int_vertex_first) > 0) { + renumber_map_ptr->insert(pair_first, + pair_first + (local_int_vertex_last - local_int_vertex_first)); + } + } + + if (do_expensive_check) { + rmm::device_uvector contains(num_vertices, handle.get_stream()); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (num_vertices > 0) { + renumber_map_ptr->contains(vertices, vertices + num_vertices, contains.begin()); + } + auto vc_pair_first = thrust::make_zip_iterator(thrust::make_tuple(vertices, contains.begin())); + CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vc_pair_first, + vc_pair_first + num_vertices, + [] __device__(auto pair) { + auto v = thrust::get<0>(pair); + auto c = thrust::get<1>(pair); + return v == invalid_vertex_id::value + ? (c == true) + : (c == false); + }) == 0, + "Invalid input arguments: vertices have elements that are missing in " + "(aggregate) renumber_map_labels."); + } + + // FIXME: a temporary workaround for https://github.com/NVIDIA/cuCollections/issues/74 +#if 1 + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + vertices, + [view = renumber_map_ptr->get_device_view()] __device__(auto v) { + return v != invalid_vertex_id::value + ? view.find(v)->second.load(cuda::std::memory_order_relaxed) + : invalid_vertex_id::value; + }); +#else + // FIXME: a temporary workaround. 
cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (num_vertices > 0) { renumber_map_ptr->find(vertices, vertices + num_vertices, vertices); } +#endif +#endif +} + +template +void unrenumber_local_int_vertices( + raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels /* size = local_int_vertex_last - local_int_vertex_first */, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + bool do_expensive_check) +{ + // FIXME: remove this check once we drop Pascal support + CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, + "unrenumber_local_vertices() not supported on Pascal and older architectures."); + +#ifdef CUCO_STATIC_MAP_DEFINED + if (do_expensive_check) { + CUGRAPH_EXPECTS( + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + [local_int_vertex_first, local_int_vertex_last] __device__(auto v) { + return v != invalid_vertex_id::value && + (v < local_int_vertex_first || v >= local_int_vertex_last); + }) == 0, + "Invalid input arguments: there are non-local vertices in [vertices, vertices " + "+ num_vertices)."); + } + + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + vertices, + [renumber_map_labels, local_int_vertex_first] __device__(auto v) { + return v == invalid_vertex_id::value + ? v + : renumber_map_labels[v - local_int_vertex_first]; + }); +#endif +} + +template +void unrenumber_int_vertices(raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check) +{ + double constexpr load_factor = 0.7; + + // FIXME: remove this check once we drop Pascal support + CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, + "unrenumber_vertices() not supported on Pascal and older architectures."); + +#ifdef CUCO_STATIC_MAP_DEFINED + if (do_expensive_check) { + CUGRAPH_EXPECTS( + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + [int_vertex_last = vertex_partition_lasts.back()] __device__(auto v) { + return v != invalid_vertex_id::value && + !is_valid_vertex(int_vertex_last, v); + }) == 0, + "Invalid input arguments: there are out-of-range vertices in [vertices, vertices " + "+ num_vertices)."); + } + + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector sorted_unique_int_vertices(num_vertices, handle.get_stream()); + sorted_unique_int_vertices.resize( + thrust::distance( + sorted_unique_int_vertices.begin(), + thrust::copy_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + sorted_unique_int_vertices.begin(), + [] __device__(auto v) { return v != invalid_vertex_id::value; })), + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end()); + sorted_unique_int_vertices.resize( + thrust::distance( + sorted_unique_int_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_int_vertices.begin(), + 
sorted_unique_int_vertices.end())), + handle.get_stream()); + + rmm::device_uvector d_vertex_partition_lasts(vertex_partition_lasts.size(), + handle.get_stream()); + raft::update_device(d_vertex_partition_lasts.data(), + vertex_partition_lasts.data(), + vertex_partition_lasts.size(), + handle.get_stream()); + rmm::device_uvector d_tx_int_vertex_offsets(d_vertex_partition_lasts.size(), + handle.get_stream()); + thrust::lower_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end(), + d_vertex_partition_lasts.begin(), + d_vertex_partition_lasts.end(), + d_tx_int_vertex_offsets.begin()); + std::vector h_tx_int_vertex_counts(d_tx_int_vertex_offsets.size()); + raft::update_host(h_tx_int_vertex_counts.data(), + d_tx_int_vertex_offsets.data(), + d_tx_int_vertex_offsets.size(), + handle.get_stream()); + handle.get_stream_view().synchronize(); + std::adjacent_difference( + h_tx_int_vertex_counts.begin(), h_tx_int_vertex_counts.end(), h_tx_int_vertex_counts.begin()); + + rmm::device_uvector rx_int_vertices(0, handle.get_stream()); + std::vector rx_int_vertex_counts{}; + std::tie(rx_int_vertices, rx_int_vertex_counts) = shuffle_values( + comm, sorted_unique_int_vertices.begin(), h_tx_int_vertex_counts, handle.get_stream()); + + auto tx_ext_vertices = std::move(rx_int_vertices); + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + tx_ext_vertices.begin(), + tx_ext_vertices.end(), + tx_ext_vertices.begin(), + [renumber_map_labels, local_int_vertex_first] __device__(auto v) { + return renumber_map_labels[v - local_int_vertex_first]; + }); + + rmm::device_uvector rx_ext_vertices_for_sorted_unique_int_vertices( + 0, handle.get_stream()); + std::tie(rx_ext_vertices_for_sorted_unique_int_vertices, std::ignore) = + shuffle_values(comm, tx_ext_vertices.begin(), rx_int_vertex_counts, handle.get_stream()); + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + cuco::static_map unrenumber_map( + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max( + static_cast(static_cast(sorted_unique_int_vertices.size()) / load_factor), + sorted_unique_int_vertices.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(sorted_unique_int_vertices.begin(), + rx_ext_vertices_for_sorted_unique_int_vertices.begin())), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (sorted_unique_int_vertices.size()) { + unrenumber_map.insert(pair_first, pair_first + sorted_unique_int_vertices.size()); + } + // FIXME: a temporary workaround for https://github.com/NVIDIA/cuCollections/issues/74 +#if 1 + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + vertices, + [view = unrenumber_map.get_device_view()] __device__(auto v) { + return v != invalid_vertex_id::value + ? view.find(v)->second.load(cuda::std::memory_order_relaxed) + : invalid_vertex_id::value; + }); +#else + // FIXME: a temporary workaround. 
cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (num_vertices > 0) { unrenumber_map.find(vertices, vertices + num_vertices, vertices); } +#endif + } else { + unrenumber_local_int_vertices(handle, + vertices, + num_vertices, + renumber_map_labels, + local_int_vertex_first, + local_int_vertex_last, + do_expensive_check); + } +#endif +} + +// explicit instantiation + +template void renumber_ext_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_local_int_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_local_int_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/experimental/sssp.cu b/cpp/src/experimental/sssp.cu index 4996b3734cb..373444cb0a2 100644 --- a/cpp/src/experimental/sssp.cu +++ b/cpp/src/experimental/sssp.cu @@ -70,6 +70,9 @@ void sssp(raft::handle_t const &handle, CUGRAPH_EXPECTS(push_graph_view.is_valid_vertex(source_vertex), "Invalid input argument: source vertex out-of-range."); + CUGRAPH_EXPECTS(push_graph_view.is_weighted(), + 
"Invalid input argument: an unweighted graph is passed to SSSP, BFS is more " + "efficient for unweighted graphs."); if (do_expensive_check) { auto num_negative_edge_weights = @@ -126,10 +129,7 @@ void sssp(raft::handle_t const &handle, // FIXME: need to double check the bucket sizes are sufficient std::vector bucket_sizes(static_cast(Bucket::num_buckets), push_graph_view.get_number_of_local_vertices()); - VertexFrontier, - vertex_t, - GraphViewType::is_multi_gpu, - static_cast(Bucket::num_buckets)> + VertexFrontier(Bucket::num_buckets)> vertex_frontier(handle, bucket_sizes); // 5. SSSP iteration @@ -188,7 +188,7 @@ void sssp(raft::handle_t const &handle, threshold = old_distance < threshold ? old_distance : threshold; } if (new_distance >= threshold) { push = false; } - return thrust::make_tuple(push, new_distance, src); + return thrust::make_tuple(push, thrust::make_tuple(new_distance, src)); }, reduce_op::min>(), distances, @@ -199,8 +199,8 @@ void sssp(raft::handle_t const &handle, auto idx = new_dist < v_val ? (new_dist < near_far_threshold ? static_cast(Bucket::new_near) : static_cast(Bucket::far)) - : VertexFrontier, vertex_t>::kInvalidBucketIdx; - return thrust::make_tuple(idx, thrust::get<0>(pushed_val), thrust::get<1>(pushed_val)); + : VertexFrontier::kInvalidBucketIdx; + return thrust::make_tuple(idx, pushed_val); }); vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).clear(); @@ -222,7 +222,7 @@ void sssp(raft::handle_t const &handle, auto dist = *(distances + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v)); if (dist < old_near_far_threshold) { - return VertexFrontier, vertex_t>::kInvalidBucketIdx; + return VertexFrontier::kInvalidBucketIdx; } else if (dist < near_far_threshold) { return static_cast(Bucket::cur_near); } else { diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index a9e3146bbcd..4a2b98ea815 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -20,22 +20,101 @@ #include #include #include -#include #include #include #include #include +#include +#include + #include +#include +#include #include +#include #include +#include +#include + +#include +#include namespace cugraph { namespace cython { namespace detail { -// FIXME: Add description of this function +// workaround for CUDA extended lambda restrictions +template +struct compute_local_partition_id_t { + vertex_t const* lasts{nullptr}; + size_t num_local_partitions{0}; + + __device__ size_t operator()(vertex_t v) + { + for (size_t i = 0; i < num_local_partitions; ++i) { + if (v < lasts[i]) { return i; } + } + return num_local_partitions; + } +}; + +// FIXME: this is unnecessary if edge_counts_ in the major_minor_weights_t object returned by +// call_shuffle() is passed back, better be fixed. this code assumes that the entire set of edges +// for each partition are consecutively stored. 
+template +std::vector compute_edge_counts(raft::handle_t const& handle, + graph_container_t const& graph_container) +{ + auto num_local_partitions = static_cast(graph_container.col_comm_size); + + std::vector partition_offsets_vector( + reinterpret_cast(graph_container.vertex_partition_offsets), + reinterpret_cast(graph_container.vertex_partition_offsets) + + (graph_container.row_comm_size * graph_container.col_comm_size) + 1); + + std::vector h_lasts(num_local_partitions); + for (size_t i = 0; i < h_lasts.size(); ++i) { + h_lasts[i] = partition_offsets_vector[graph_container.row_comm_size * (i + 1)]; + } + rmm::device_uvector d_lasts(h_lasts.size(), handle.get_stream()); + raft::update_device(d_lasts.data(), h_lasts.data(), h_lasts.size(), handle.get_stream()); + auto major_vertices = transposed + ? reinterpret_cast(graph_container.dst_vertices) + : reinterpret_cast(graph_container.src_vertices); + auto key_first = thrust::make_transform_iterator( + major_vertices, compute_local_partition_id_t{d_lasts.data(), num_local_partitions}); + rmm::device_uvector d_local_partition_ids(num_local_partitions, handle.get_stream()); + rmm::device_uvector d_edge_counts(d_local_partition_ids.size(), handle.get_stream()); + auto it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + key_first, + key_first + graph_container.num_local_edges, + thrust::make_constant_iterator(edge_t{1}), + d_local_partition_ids.begin(), + d_edge_counts.begin()); + if (static_cast(thrust::distance(d_local_partition_ids.begin(), thrust::get<0>(it))) < + num_local_partitions) { + rmm::device_uvector d_counts(num_local_partitions, handle.get_stream()); + thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_counts.begin(), + d_counts.end(), + edge_t{0}); + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_edge_counts.begin(), + thrust::get<1>(it), + d_local_partition_ids.begin(), + d_counts.begin()); + d_edge_counts = std::move(d_counts); + } + std::vector h_edge_counts(num_local_partitions, 0); + raft::update_host( + h_edge_counts.data(), d_edge_counts.data(), d_edge_counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + return h_edge_counts; +} + template > create_graph(raft::handle_t const& handle, graph_container_t const& graph_container) { - std::vector> edgelist( - {{reinterpret_cast(graph_container.src_vertices), - reinterpret_cast(graph_container.dst_vertices), - reinterpret_cast(graph_container.weights), - static_cast(graph_container.num_partition_edges)}}); + auto num_local_partitions = static_cast(graph_container.col_comm_size); std::vector partition_offsets_vector( reinterpret_cast(graph_container.vertex_partition_offsets), reinterpret_cast(graph_container.vertex_partition_offsets) + (graph_container.row_comm_size * graph_container.col_comm_size) + 1); + auto edge_counts = compute_edge_counts(handle, graph_container); + + std::vector displacements(edge_counts.size(), 0); + std::partial_sum(edge_counts.begin(), edge_counts.end() - 1, displacements.begin() + 1); + + std::vector> edgelists( + num_local_partitions); + for (size_t i = 0; i < edgelists.size(); ++i) { + edgelists[i] = cugraph::experimental::edgelist_t{ + reinterpret_cast(graph_container.src_vertices) + displacements[i], + reinterpret_cast(graph_container.dst_vertices) + displacements[i], + graph_container.graph_props.is_weighted + ? 
reinterpret_cast(graph_container.weights) + displacements[i] + : static_cast(nullptr), + edge_counts[i]}; + } + experimental::partition_t partition(partition_offsets_vector, - graph_container.hypergraph_partitioned, graph_container.row_comm_size, graph_container.col_comm_size, graph_container.row_comm_rank, @@ -65,14 +156,12 @@ create_graph(raft::handle_t const& handle, graph_container_t const& graph_contai return std::make_unique>( handle, - edgelist, + edgelists, partition, static_cast(graph_container.num_global_vertices), static_cast(graph_container.num_global_edges), graph_container.graph_props, - // FIXME: This currently fails if sorted_by_degree is true... - // graph_container.sorted_by_degree, - false, + true, graph_container.do_expensive_check); } @@ -89,7 +178,7 @@ create_graph(raft::handle_t const& handle, graph_container_t const& graph_contai reinterpret_cast(graph_container.src_vertices), reinterpret_cast(graph_container.dst_vertices), reinterpret_cast(graph_container.weights), - static_cast(graph_container.num_partition_edges)}; + static_cast(graph_container.num_local_edges)}; return std::make_unique>( handle, edgelist, @@ -113,10 +202,11 @@ void populate_graph_container(graph_container_t& graph_container, numberTypeEnum vertexType, numberTypeEnum edgeType, numberTypeEnum weightType, - size_t num_partition_edges, + size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, bool sorted_by_degree, + bool is_weighted, bool transposed, bool multi_gpu) { @@ -124,7 +214,6 @@ void populate_graph_container(graph_container_t& graph_container, "populate_graph_container() can only be called on an empty container."); bool do_expensive_check{true}; - bool hypergraph_partitioned{false}; if (multi_gpu) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -143,7 +232,7 @@ void populate_graph_container(graph_container_t& graph_container, graph_container.src_vertices = src_vertices; graph_container.dst_vertices = dst_vertices; graph_container.weights = weights; - graph_container.num_partition_edges = num_partition_edges; + graph_container.num_local_edges = num_local_edges; graph_container.num_global_vertices = num_global_vertices; graph_container.num_global_edges = num_global_edges; graph_container.vertexType = vertexType; @@ -151,11 +240,11 @@ void populate_graph_container(graph_container_t& graph_container, graph_container.weightType = weightType; graph_container.transposed = transposed; graph_container.is_multi_gpu = multi_gpu; - graph_container.hypergraph_partitioned = hypergraph_partitioned; graph_container.sorted_by_degree = sorted_by_degree; graph_container.do_expensive_check = do_expensive_check; - experimental::graph_properties_t graph_props{.is_symmetric = false, .is_multigraph = false}; + experimental::graph_properties_t graph_props{ + .is_symmetric = false, .is_multigraph = false, .is_weighted = is_weighted}; graph_container.graph_props = graph_props; graph_container.graph_type = graphTypeEnum::graph_t; @@ -177,7 +266,7 @@ void populate_graph_container_legacy(graph_container_t& graph_container, int* local_offsets) { CUGRAPH_EXPECTS(graph_container.graph_type == graphTypeEnum::null, - "populate_graph_container() can only be called on an empty container."); + "populate_graph_container_legacy() can only be called on an empty container."); // FIXME: This is soon-to-be legacy code left in place until the new graph_t // class is supported everywhere else. 
Remove everything down to the comment @@ -802,23 +891,23 @@ void call_sssp(raft::handle_t const& handle, // wrapper for shuffling: // template -std::unique_ptr> call_shuffle( +std::unique_ptr> call_shuffle( raft::handle_t const& handle, vertex_t* edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place vertex_t* edgelist_minor_vertices, // [IN / OUT] weight_t* edgelist_weights, // [IN / OUT] - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned) // = false + edge_t num_edgelist_edges) { - auto& comm = handle.get_comms(); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - - std::unique_ptr> ptr_ret = - std::make_unique>(handle); + std::unique_ptr> ptr_ret = + std::make_unique>(handle); if (edgelist_weights != nullptr) { auto zip_edge = thrust::make_zip_iterator( @@ -833,10 +922,7 @@ std::unique_ptr> call_shuffle( zip_edge + num_edgelist_edges, [key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - is_hypergraph_partitioned, - comm.get_size(), - row_comm.get_size(), - col_comm.get_size()}] __device__(auto val) { + comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) { return key_func(thrust::get<0>(val), thrust::get<1>(val)); }, handle.get_stream()); @@ -852,15 +938,46 @@ std::unique_ptr> call_shuffle( zip_edge + num_edgelist_edges, [key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - is_hypergraph_partitioned, - comm.get_size(), - row_comm.get_size(), - col_comm.get_size()}] __device__(auto val) { + comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) { return key_func(thrust::get<0>(val), thrust::get<1>(val)); }, handle.get_stream()); } + auto local_partition_id_op = + [comm_size, + key_func = cugraph::experimental::detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto pair) { + return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) / + comm_size; // global partition id to local partition id + }; + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(ptr_ret->get_major().data(), ptr_ret->get_minor().data())); + + auto edge_counts = + (edgelist_weights != nullptr) + ? 
cugraph::experimental::groupby_and_count(pair_first, + pair_first + ptr_ret->get_major().size(), + ptr_ret->get_weights().data(), + local_partition_id_op, + col_comm_size, + handle.get_stream()) + : cugraph::experimental::groupby_and_count(pair_first, + pair_first + ptr_ret->get_major().size(), + local_partition_id_op, + col_comm_size, + handle.get_stream()); + + std::vector h_edge_counts(edge_counts.size()); + raft::update_host( + h_edge_counts.data(), edge_counts.data(), edge_counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + ptr_ret->get_edge_counts().resize(h_edge_counts.size()); + for (size_t i = 0; i < h_edge_counts.size(); ++i) { + ptr_ret->get_edge_counts()[i] = static_cast(h_edge_counts[i]); + } + return ptr_ret; // RVO-ed } @@ -872,8 +989,7 @@ std::unique_ptr> call_renumber( raft::handle_t const& handle, vertex_t* shuffled_edgelist_major_vertices /* [INOUT] */, vertex_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu) // bc. cython cannot take non-type template params { @@ -883,33 +999,31 @@ std::unique_ptr> call_renumber( std::make_unique>(handle); if (multi_gpu) { + std::vector displacements(edge_counts.size(), edge_t{0}); + std::partial_sum(edge_counts.begin(), edge_counts.end() - 1, displacements.begin() + 1); + std::vector major_ptrs(edge_counts.size()); + std::vector minor_ptrs(major_ptrs.size()); + for (size_t i = 0; i < edge_counts.size(); ++i) { + major_ptrs[i] = shuffled_edgelist_major_vertices + displacements[i]; + minor_ptrs[i] = shuffled_edgelist_minor_vertices + displacements[i]; + } + std::tie( p_ret->get_dv(), p_ret->get_partition(), p_ret->get_num_vertices(), p_ret->get_num_edges()) = cugraph::experimental::renumber_edgelist( - handle, - shuffled_edgelist_major_vertices, - shuffled_edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned, - do_expensive_check); + handle, major_ptrs, minor_ptrs, edge_counts, do_expensive_check); } else { - auto ret_f = cugraph::experimental::renumber_edgelist( + p_ret->get_dv() = cugraph::experimental::renumber_edgelist( handle, shuffled_edgelist_major_vertices, shuffled_edgelist_minor_vertices, - num_edgelist_edges, + edge_counts[0], do_expensive_check); - auto tot_vertices = static_cast(ret_f.size()); - - p_ret->get_dv() = std::move(ret_f); - cugraph::experimental::partition_t part_sg( - std::vector{0, tot_vertices}, false, 1, 1, 0, 0); - - p_ret->get_partition() = std::move(part_sg); + p_ret->get_partition() = cugraph::experimental::partition_t{}; // dummy - p_ret->get_num_vertices() = tot_vertices; - p_ret->get_num_edges() = num_edgelist_edges; + p_ret->get_num_vertices() = static_cast(p_ret->get_dv().size()); + p_ret->get_num_edges() = edge_counts[0]; } return p_ret; // RVO-ed (copy elision) @@ -1142,53 +1256,47 @@ template void call_sssp(raft::handle_t const& handle, int64_t* predecessors, const int64_t source_vertex); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, float* edgelist_weights, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int32_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, float* edgelist_weights, - int64_t 
num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, double* edgelist_weights, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int32_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, double* edgelist_weights, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int64_t* edgelist_major_vertices, int64_t* edgelist_minor_vertices, float* edgelist_weights, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int64_t* edgelist_major_vertices, int64_t* edgelist_minor_vertices, double* edgelist_weights, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); // TODO: add the remaining relevant EIDIr's: // @@ -1196,8 +1304,7 @@ template std::unique_ptr> call_renumber( raft::handle_t const& handle, int32_t* shuffled_edgelist_major_vertices /* [INOUT] */, int32_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu); @@ -1205,8 +1312,7 @@ template std::unique_ptr> call_renumber( raft::handle_t const& handle, int32_t* shuffled_edgelist_major_vertices /* [INOUT] */, int32_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu); @@ -1214,8 +1320,7 @@ template std::unique_ptr> call_renumber( raft::handle_t const& handle, int64_t* shuffled_edgelist_major_vertices /* [INOUT] */, int64_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3b65b0edb29..89975f673ae 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -20,9 +20,10 @@ # - common test utils ----------------------------------------------------------------------------- add_library(cugraphtestutil STATIC - "${CMAKE_CURRENT_SOURCE_DIR}/utilities/generate_graph_from_edgelist.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/matrix_market_file_utilities.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/rmat_utilities.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/utilities/generate_graph_from_edgelist.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/utilities/thrust_wrapper.cu" "${CMAKE_CURRENT_SOURCE_DIR}/utilities/misc_utilities.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c") @@ -445,7 +446,34 @@ if(BUILD_CUGRAPH_MG_TESTS) target_link_libraries(MG_PAGERANK_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) ########################################################################################### - # - MG LOUVAIN tests --------------------------------------------------------------------- + # - MG KATZ CENTRALITY tests 
-------------------------------------------------------------- + + set(MG_KATZ_CENTRALITY_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/mg_katz_centrality_test.cpp") + + ConfigureTest(MG_KATZ_CENTRALITY_TEST "${MG_KATZ_CENTRALITY_TEST_SRCS}") + target_link_libraries(MG_KATZ_CENTRALITY_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + + ########################################################################################### + # - MG BFS tests -------------------------------------------------------------------------- + + set(MG_BFS_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/mg_bfs_test.cpp") + + ConfigureTest(MG_BFS_TEST "${MG_BFS_TEST_SRCS}") + target_link_libraries(MG_BFS_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + + ########################################################################################### + # - MG SSSP tests ------------------------------------------------------------------------- + + set(MG_SSSP_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/mg_sssp_test.cpp") + + ConfigureTest(MG_SSSP_TEST "${MG_SSSP_TEST_SRCS}") + target_link_libraries(MG_SSSP_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + + ########################################################################################### + # - MG LOUVAIN tests ---------------------------------------------------------------------- set(MG_LOUVAIN_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/community/mg_louvain_helper.cu" @@ -453,7 +481,6 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureTest(MG_LOUVAIN_TEST "${MG_LOUVAIN_TEST_SRCS}") target_link_libraries(MG_LOUVAIN_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) - target_link_libraries(MG_LOUVAIN_TEST PRIVATE cugraph) else(MPI_CXX_FOUND) message(FATAL_ERROR "OpenMPI NOT found, cannot build MG tests.") diff --git a/cpp/tests/community/egonet_test.cu b/cpp/tests/community/egonet_test.cu index e7fea43be42..d61080c685e 100644 --- a/cpp/tests/community/egonet_test.cu +++ b/cpp/tests/community/egonet_test.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -129,8 +128,10 @@ class Tests_InducedEgo : public ::testing::TestWithParam { ASSERT_TRUE(h_cugraph_ego_edge_offsets[i] <= h_cugraph_ego_edge_offsets[i + 1]); auto n_vertices = graph_view.get_number_of_vertices(); for (size_t i = 0; i < d_ego_edgelist_src.size(); i++) { - ASSERT_TRUE(cugraph::test::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_src[i])); - ASSERT_TRUE(cugraph::test::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_dst[i])); + ASSERT_TRUE( + cugraph::experimental::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_src[i])); + ASSERT_TRUE( + cugraph::experimental::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_dst[i])); } /* diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu index a7f95e6d718..661065ca65b 100644 --- a/cpp/tests/community/mg_louvain_helper.cu +++ b/cpp/tests/community/mg_louvain_helper.cu @@ -323,7 +323,8 @@ coarsen_graph( handle, edgelist, new_number_of_vertices, - cugraph::experimental::graph_properties_t{graph_view.is_symmetric(), false}, + cugraph::experimental::graph_properties_t{ + graph_view.is_symmetric(), false, graph_view.is_weighted()}, true); } diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp index f6596a6b59a..8a1a3010a6f 100644 --- a/cpp/tests/community/mg_louvain_test.cpp +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -31,10 +31,13 @@ #include -void compare(float modularity, float sg_modularity) { ASSERT_FLOAT_EQ(modularity, sg_modularity); } -void compare(double 
modularity, double sg_modularity) +void compare(float mg_modularity, float sg_modularity) { - ASSERT_DOUBLE_EQ(modularity, sg_modularity); + ASSERT_FLOAT_EQ(mg_modularity, sg_modularity); +} +void compare(double mg_modularity, double sg_modularity) +{ + ASSERT_DOUBLE_EQ(mg_modularity, sg_modularity); } //////////////////////////////////////////////////////////////////////////////// @@ -90,13 +93,13 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam cugraph::Dendrogram const& dendrogram, weight_t resolution, int rank, - weight_t modularity) + weight_t mg_modularity) { auto sg_graph = std::make_unique>( handle); rmm::device_uvector d_clustering_v(0, handle.get_stream()); - weight_t sg_modularity; + weight_t sg_modularity{-1.0}; if (rank == 0) { // Create initial SG graph, renumbered according to the MNMG renumber map @@ -160,7 +163,7 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam } }); - if (rank == 0) compare(modularity, sg_modularity); + if (rank == 0) compare(mg_modularity, sg_modularity); } // Compare the results of running louvain on multiple GPUs to that of a @@ -197,9 +200,9 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam auto mg_graph_view = mg_graph.view(); std::unique_ptr> dendrogram; - weight_t modularity; + weight_t mg_modularity; - std::tie(dendrogram, modularity) = + std::tie(dendrogram, mg_modularity) = cugraph::louvain(handle, mg_graph_view, param.max_level, param.resolution); SCOPED_TRACE("compare modularity input: " + param.graph_file_full_path); @@ -213,7 +216,7 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam *dendrogram, param.resolution, comm_rank, - modularity); + mg_modularity); } }; diff --git a/cpp/tests/experimental/bfs_test.cpp b/cpp/tests/experimental/bfs_test.cpp index ad9ece99ef9..8fce9488d8a 100644 --- a/cpp/tests/experimental/bfs_test.cpp +++ b/cpp/tests/experimental/bfs_test.cpp @@ -16,9 +16,11 @@ #include #include +#include #include #include +#include #include #include @@ -28,10 +30,16 @@ #include +#include #include #include #include +// do the perf measurements +// enabled by command line parameter '--perf' +// +static int PERF = 0; + template void bfs_reference(edge_t const* offsets, vertex_t const* indices, @@ -74,9 +82,12 @@ void bfs_reference(edge_t const* offsets, typedef struct BFS_Usecase_t { cugraph::test::input_graph_specifier_t input_graph_specifier{}; - size_t source{false}; - BFS_Usecase_t(std::string const& graph_file_path, size_t source) : source(source) + size_t source{0}; + bool check_correctness{false}; + + BFS_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true) + : source(source), check_correctness(check_correctness) { std::string graph_file_full_path{}; if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -88,13 +99,43 @@ typedef struct BFS_Usecase_t { input_graph_specifier.graph_file_full_path = graph_file_full_path; }; - BFS_Usecase_t(cugraph::test::rmat_params_t rmat_params, size_t source) : source(source) + BFS_Usecase_t(cugraph::test::rmat_params_t rmat_params, + size_t source, + bool check_correctness = true) + : source(source), check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } BFS_Usecase; +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, BFS_Usecase const& configuration, bool renumber) +{ + return configuration.input_graph_specifier.tag == 
cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, false, renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + false, + renumber, + std::vector{0}, + size_t{1}); +} + class Tests_BFS : public ::testing::TestWithParam { public: Tests_BFS() {} @@ -107,58 +148,21 @@ class Tests_BFS : public ::testing::TestWithParam { template void run_current_test(BFS_Usecase const& configuration) { + constexpr bool renumber = true; + using weight_t = float; raft::handle_t handle{}; cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? cugraph::test:: - read_graph_from_matrix_market_file( - handle, configuration.input_graph_specifier.graph_file_full_path, false, false) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - false, - false); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - ASSERT_TRUE(configuration.source >= 0 && - configuration.source <= graph_view.get_number_of_vertices()) - << "Starting sources should be >= 0 and" - << " less than the number of vertices in the graph."; - - std::vector h_reference_distances(graph_view.get_number_of_vertices()); - std::vector h_reference_predecessors(graph_view.get_number_of_vertices()); - - bfs_reference(h_offsets.data(), - h_indices.data(), - h_reference_distances.data(), - h_reference_predecessors.data(), - graph_view.get_number_of_vertices(), - static_cast(configuration.source), - std::numeric_limits::max()); + ASSERT_TRUE(static_cast(configuration.source) >= 0 && + static_cast(configuration.source) < graph_view.get_number_of_vertices()) + << "Invalid starting source."; rmm::device_uvector d_distances(graph_view.get_number_of_vertices(), handle.get_stream()); @@ -169,46 +173,120 @@ class Tests_BFS : public ::testing::TestWithParam { 
cugraph::experimental::bfs(handle, graph_view, - d_distances.begin(), - d_predecessors.begin(), + d_distances.data(), + d_predecessors.data(), static_cast(configuration.source), false, - std::numeric_limits::max(), - false); + std::numeric_limits::max()); CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); - std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); - raft::update_host(h_cugraph_predecessors.data(), - d_predecessors.data(), - d_predecessors.size(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - ASSERT_TRUE(std::equal( - h_reference_distances.begin(), h_reference_distances.end(), h_cugraph_distances.begin())) - << "distances do not match with the reference values."; - - for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { - auto i = std::distance(h_cugraph_predecessors.begin(), it); - if (*it == cugraph::invalid_vertex_id::value) { - ASSERT_TRUE(h_reference_predecessors[i] == *it) - << "vertex reachability do not match with the reference."; + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto unrenumbered_source = static_cast(configuration.source); + if (renumber) { + std::vector h_renumber_map_labels(d_renumber_map_labels.size()); + raft::update_host(h_renumber_map_labels.data(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + unrenumbered_source = h_renumber_map_labels[configuration.source]; + } + + std::vector h_reference_distances(unrenumbered_graph_view.get_number_of_vertices()); + std::vector h_reference_predecessors( + unrenumbered_graph_view.get_number_of_vertices()); + + bfs_reference(h_offsets.data(), + h_indices.data(), + h_reference_distances.data(), + h_reference_predecessors.data(), + unrenumbered_graph_view.get_number_of_vertices(), + unrenumbered_source, + std::numeric_limits::max()); + + std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); + std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); + if (renumber) { + cugraph::experimental::unrenumber_local_int_vertices(handle, + d_predecessors.data(), + d_predecessors.size(), + d_renumber_map_labels.data(), + vertex_t{0}, + graph_view.get_number_of_vertices(), + true); + + auto d_unrenumbered_distances = cugraph::test::sort_by_key( + handle, d_renumber_map_labels.data(), d_distances.data(), d_renumber_map_labels.size()); + auto d_unrenumbered_predecessors = cugraph::test::sort_by_key(handle, + d_renumber_map_labels.data(), + d_predecessors.data(), + 
d_renumber_map_labels.size()); + raft::update_host(h_cugraph_distances.data(), + d_unrenumbered_distances.data(), + d_unrenumbered_distances.size(), + handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_unrenumbered_predecessors.data(), + d_unrenumbered_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); } else { - ASSERT_TRUE(h_reference_distances[*it] + 1 == h_reference_distances[i]) - << "distance to this vertex != distance to the predecessor vertex + 1."; - bool found{false}; - for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { - if (h_indices[j] == i) { - found = true; - break; + raft::update_host( + h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_predecessors.data(), + d_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + } + + ASSERT_TRUE(std::equal( + h_reference_distances.begin(), h_reference_distances.end(), h_cugraph_distances.begin())) + << "distances do not match with the reference values."; + + for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { + auto i = std::distance(h_cugraph_predecessors.begin(), it); + if (*it == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_reference_predecessors[i] == *it) + << "vertex reachability does not match with the reference."; + } else { + ASSERT_TRUE(h_reference_distances[*it] + 1 == h_reference_distances[i]) + << "distance to this vertex != distance to the predecessor vertex + 1."; + bool found{false}; + for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { + if (h_indices[j] == i) { + found = true; + break; + } } + ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; } - ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; } } } @@ -221,12 +299,17 @@ INSTANTIATE_TEST_CASE_P( simple_test, Tests_BFS, ::testing::Values( + // enable correctness checks BFS_Usecase("test/datasets/karate.mtx", 0), BFS_Usecase("test/datasets/polbooks.mtx", 0), BFS_Usecase("test/datasets/netscience.mtx", 0), BFS_Usecase("test/datasets/netscience.mtx", 100), BFS_Usecase("test/datasets/wiki2003.mtx", 1000), BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000), - BFS_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0))); + BFS_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), + // disable correctness checks for large graphs + BFS_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + 0, + false))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/coarsen_graph_test.cpp b/cpp/tests/experimental/coarsen_graph_test.cpp index 789619f2cd9..0fc0634bbbc 100644 --- a/cpp/tests/experimental/coarsen_graph_test.cpp +++ b/cpp/tests/experimental/coarsen_graph_test.cpp @@ -54,13 +54,14 @@ void check_coarsened_graph_results(edge_t* org_offsets, ASSERT_TRUE(std::count_if(org_indices, org_indices + org_offsets[num_org_vertices], [num_org_vertices](auto nbr) { - return !cugraph::test::is_valid_vertex(num_org_vertices, nbr); + return !cugraph::experimental::is_valid_vertex(num_org_vertices, nbr); }) == 0); ASSERT_TRUE(std::is_sorted(coarse_offsets, coarse_offsets + num_coarse_vertices)); ASSERT_TRUE(std::count_if(coarse_indices, coarse_indices + coarse_offsets[num_coarse_vertices], [num_coarse_vertices](auto nbr) { - return !cugraph::test::is_valid_vertex(num_coarse_vertices, 
nbr); + return !cugraph::experimental::is_valid_vertex(num_coarse_vertices, + nbr); }) == 0); ASSERT_TRUE(num_coarse_vertices <= num_org_vertices); diff --git a/cpp/tests/experimental/generate_rmat_test.cpp b/cpp/tests/experimental/generate_rmat_test.cpp index 666106d62ca..221accea4f7 100644 --- a/cpp/tests/experimental/generate_rmat_test.cpp +++ b/cpp/tests/experimental/generate_rmat_test.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -201,17 +202,19 @@ class Tests_GenerateRmat : public ::testing::TestWithParam (h_cugraph_srcs.size() == (size_t{1} << configuration.scale) * configuration.edge_factor) && (h_cugraph_dsts.size() == (size_t{1} << configuration.scale) * configuration.edge_factor)) << "Returned an invalid number of R-mat graph edges."; - ASSERT_TRUE( - std::count_if(h_cugraph_srcs.begin(), - h_cugraph_srcs.end(), - [num_vertices = static_cast(size_t{1} << configuration.scale)]( - auto v) { return !cugraph::test::is_valid_vertex(num_vertices, v); }) == 0) + ASSERT_TRUE(std::count_if(h_cugraph_srcs.begin(), + h_cugraph_srcs.end(), + [num_vertices = static_cast( + size_t{1} << configuration.scale)](auto v) { + return !cugraph::experimental::is_valid_vertex(num_vertices, v); + }) == 0) << "Returned R-mat graph edges have invalid source vertex IDs."; - ASSERT_TRUE( - std::count_if(h_cugraph_dsts.begin(), - h_cugraph_dsts.end(), - [num_vertices = static_cast(size_t{1} << configuration.scale)]( - auto v) { return !cugraph::test::is_valid_vertex(num_vertices, v); }) == 0) + ASSERT_TRUE(std::count_if(h_cugraph_dsts.begin(), + h_cugraph_dsts.end(), + [num_vertices = static_cast( + size_t{1} << configuration.scale)](auto v) { + return !cugraph::experimental::is_valid_vertex(num_vertices, v); + }) == 0) << "Returned R-mat graph edges have invalid destination vertex IDs."; if (!scramble) { diff --git a/cpp/tests/experimental/graph_test.cpp b/cpp/tests/experimental/graph_test.cpp index 949f6d2e08e..6ce32e0c836 100644 --- a/cpp/tests/experimental/graph_test.cpp +++ b/cpp/tests/experimental/graph_test.cpp @@ -139,7 +139,7 @@ class Tests_Graph : public ::testing::TestWithParam { handle, edgelist, number_of_vertices, - cugraph::experimental::graph_properties_t{is_symmetric, false}, + cugraph::experimental::graph_properties_t{is_symmetric, false, configuration.test_weighted}, false, true); diff --git a/cpp/tests/experimental/katz_centrality_test.cpp b/cpp/tests/experimental/katz_centrality_test.cpp index 776bb60716c..71011f3d018 100644 --- a/cpp/tests/experimental/katz_centrality_test.cpp +++ b/cpp/tests/experimental/katz_centrality_test.cpp @@ -16,9 +16,11 @@ #include #include +#include #include #include +#include #include #include @@ -34,6 +36,11 @@ #include #include +// do the perf measurements +// enabled by command line parameter '--perf' +// +static int PERF = 0; + template void katz_centrality_reference(edge_t const* offsets, vertex_t const* indices, @@ -92,9 +99,12 @@ typedef struct KatzCentrality_Usecase_t { cugraph::test::input_graph_specifier_t input_graph_specifier{}; bool test_weighted{false}; + bool check_correctness{false}; - KatzCentrality_Usecase_t(std::string const& graph_file_path, bool test_weighted) - : test_weighted(test_weighted) + KatzCentrality_Usecase_t(std::string const& graph_file_path, + bool test_weighted, + bool check_correctness = true) + : test_weighted(test_weighted), check_correctness(check_correctness) { std::string graph_file_full_path{}; if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -107,15 +117,45 @@ typedef 
struct KatzCentrality_Usecase_t { }; KatzCentrality_Usecase_t(cugraph::test::rmat_params_t rmat_params, - double personalization_ratio, - bool test_weighted) - : test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : test_weighted(test_weighted), check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } KatzCentrality_Usecase; +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, KatzCentrality_Usecase const& configuration, bool renumber) +{ + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber, + std::vector{0}, + size_t{1}); +} + class Tests_KatzCentrality : public ::testing::TestWithParam { public: Tests_KatzCentrality() {} @@ -128,76 +168,26 @@ class Tests_KatzCentrality : public ::testing::TestWithParam void run_current_test(KatzCentrality_Usecase const& configuration) { + constexpr bool renumber = true; + raft::handle_t handle{}; cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? 
cugraph::test:: - read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - false) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - false); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - std::vector h_weights{}; - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - if (graph_view.is_weighted()) { - h_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0}); - raft::update_host(h_weights.data(), - graph_view.weights(), - graph_view.get_number_of_edges(), - handle.get_stream()); - } - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - std::vector h_reference_katz_centralities(graph_view.get_number_of_vertices()); - - std::vector tmps(h_offsets.size()); - std::adjacent_difference(h_offsets.begin(), h_offsets.end(), tmps.begin()); - auto max_it = std::max_element(tmps.begin(), tmps.end()); + auto degrees = graph_view.compute_in_degrees(handle); + std::vector h_degrees(degrees.size()); + raft::update_host(h_degrees.data(), degrees.data(), degrees.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + auto max_it = std::max_element(h_degrees.begin(), h_degrees.end()); result_t const alpha = result_t{1.0} / static_cast(*max_it + 1); result_t constexpr beta{1.0}; result_t constexpr epsilon{1e-6}; - katz_centrality_reference( - h_offsets.data(), - h_indices.data(), - h_weights.size() > 0 ? 
h_weights.data() : static_cast(nullptr), - static_cast(nullptr), - h_reference_katz_centralities.data(), - graph_view.get_number_of_vertices(), - alpha, - beta, - epsilon, - std::numeric_limits::max(), - false, - true); - rmm::device_uvector d_katz_centralities(graph_view.get_number_of_vertices(), handle.get_stream()); @@ -206,39 +196,98 @@ class Tests_KatzCentrality : public ::testing::TestWithParam(nullptr), - d_katz_centralities.begin(), + d_katz_centralities.data(), alpha, beta, epsilon, std::numeric_limits::max(), false, - true, - false); + true); CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - std::vector h_cugraph_katz_centralities(graph_view.get_number_of_vertices()); - - raft::update_host(h_cugraph_katz_centralities.data(), - d_katz_centralities.data(), - d_katz_centralities.size(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - auto threshold_ratio = 1e-3; - auto threshold_magnitude = - (1.0 / static_cast(graph_view.get_number_of_vertices())) * - threshold_ratio; // skip comparison for low Katz Centrality verties (lowly ranked vertices) - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - return std::abs(lhs - rhs) < - std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); - }; - - ASSERT_TRUE(std::equal(h_reference_katz_centralities.begin(), - h_reference_katz_centralities.end(), - h_cugraph_katz_centralities.begin(), - nearly_equal)) - << "Katz centrality values do not match with the reference values."; + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + std::vector h_weights{}; + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + if (unrenumbered_graph_view.is_weighted()) { + h_weights.assign(unrenumbered_graph_view.get_number_of_edges(), weight_t{0.0}); + raft::update_host(h_weights.data(), + unrenumbered_graph_view.weights(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + } + + handle.get_stream_view().synchronize(); + + std::vector h_reference_katz_centralities( + unrenumbered_graph_view.get_number_of_vertices()); + + katz_centrality_reference( + h_offsets.data(), + h_indices.data(), + h_weights.size() > 0 ? 
h_weights.data() : static_cast(nullptr), + static_cast(nullptr), + h_reference_katz_centralities.data(), + unrenumbered_graph_view.get_number_of_vertices(), + alpha, + beta, + epsilon, + std::numeric_limits::max(), + false, + true); + + std::vector h_cugraph_katz_centralities(graph_view.get_number_of_vertices()); + if (renumber) { + auto d_unrenumbered_katz_centralities = + cugraph::test::sort_by_key(handle, + d_renumber_map_labels.data(), + d_katz_centralities.data(), + d_renumber_map_labels.size()); + raft::update_host(h_cugraph_katz_centralities.data(), + d_unrenumbered_katz_centralities.data(), + d_unrenumbered_katz_centralities.size(), + handle.get_stream()); + } else { + raft::update_host(h_cugraph_katz_centralities.data(), + d_katz_centralities.data(), + d_katz_centralities.size(), + handle.get_stream()); + } + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for low Katz Centrality vertices (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + ASSERT_TRUE(std::equal(h_reference_katz_centralities.begin(), + h_reference_katz_centralities.end(), + h_cugraph_katz_centralities.begin(), + nearly_equal)) + << "Katz centrality values do not match with the reference values."; + } } }; @@ -252,6 +301,7 @@ INSTANTIATE_TEST_CASE_P( simple_test, Tests_KatzCentrality, ::testing::Values( + // enable correctness checks KatzCentrality_Usecase("test/datasets/karate.mtx", false), KatzCentrality_Usecase("test/datasets/karate.mtx", true), KatzCentrality_Usecase("test/datasets/web-Google.mtx", false), @@ -261,16 +311,15 @@ INSTANTIATE_TEST_CASE_P( KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", false), KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", true), KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, - 0.0, - false), - KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, - 0.5, false), KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, - 0.0, true), - KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, - 0.5, - true))); + // disable correctness checks for large graphs + KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + false, + false), + KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + true, + false))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/mg_bfs_test.cpp b/cpp/tests/experimental/mg_bfs_test.cpp new file mode 100644 index 00000000000..76ccb5d9de3 --- /dev/null +++ b/cpp/tests/experimental/mg_bfs_test.cpp @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +typedef struct BFS_Usecase_t { + cugraph::test::input_graph_specifier_t input_graph_specifier{}; + + size_t source{0}; + bool check_correctness{false}; + + BFS_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true) + : source(source), check_correctness(check_correctness) + { + std::string graph_file_full_path{}; + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH; + input_graph_specifier.graph_file_full_path = graph_file_full_path; + }; + + BFS_Usecase_t(cugraph::test::rmat_params_t rmat_params, + size_t source, + bool check_correctness = true) + : source(source), check_correctness(check_correctness) + { + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; + input_graph_specifier.rmat_params = rmat_params; + } +} BFS_Usecase; + +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, BFS_Usecase const& configuration, bool renumber) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size)); + std::iota(partition_ids.begin(), + partition_ids.end(), + multi_gpu ? static_cast(comm_rank) : size_t{0}); + + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, false, renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + false, + renumber, + partition_ids, + static_cast(comm_size)); +} + +class Tests_MGBFS : public ::testing::TestWithParam { + public: + Tests_MGBFS() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of running BFS on multiple GPUs to that of a single-GPU run + template + void run_current_test(BFS_Usecase const& configuration) + { + using weight_t = float; + + // 1. initialize handle + + raft::handle_t handle{}; + + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + auto row_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % row_comm_size != 0) { --row_comm_size; } + cugraph::partition_2d::subcomm_factory_t + subcomm_factory(handle, row_comm_size); + + // 2. 
create MG graph + + cugraph::experimental::graph_t mg_graph(handle); + rmm::device_uvector d_mg_renumber_map_labels(0, handle.get_stream()); + std::tie(mg_graph, d_mg_renumber_map_labels) = + read_graph(handle, configuration, true); + + auto mg_graph_view = mg_graph.view(); + + ASSERT_TRUE(static_cast(configuration.source) >= 0 && + static_cast(configuration.source) < + mg_graph_view.get_number_of_vertices()) + << "Invalid starting source."; + + // 3. run MG BFS + + rmm::device_uvector d_mg_distances(mg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + rmm::device_uvector d_mg_predecessors(mg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + cugraph::experimental::bfs(handle, + mg_graph_view, + d_mg_distances.data(), + d_mg_predecessors.data(), + static_cast(configuration.source), + false, + std::numeric_limits::max(), + true); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + // 5. compare SG & MG results + + if (configuration.check_correctness) { + // 5-1. create SG graph + + cugraph::experimental::graph_t sg_graph(handle); + std::tie(sg_graph, std::ignore) = + read_graph(handle, configuration, false); + + auto sg_graph_view = sg_graph.view(); + + std::vector vertex_partition_lasts(comm_size); + for (size_t i = 0; i < vertex_partition_lasts.size(); ++i) { + vertex_partition_lasts[i] = mg_graph_view.get_vertex_partition_last(i); + } + + rmm::device_scalar d_source(static_cast(configuration.source), + handle.get_stream()); + cugraph::experimental::unrenumber_int_vertices( + handle, + d_source.data(), + size_t{1}, + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + true); + auto unrenumbered_source = d_source.value(handle.get_stream()); + + // 5-2. run SG BFS + + rmm::device_uvector d_sg_distances(sg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + rmm::device_uvector d_sg_predecessors(sg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + + cugraph::experimental::bfs(handle, + sg_graph_view, + d_sg_distances.data(), + d_sg_predecessors.data(), + unrenumbered_source, + false, + std::numeric_limits::max(), + true); + + // 5-3. 
compare + + std::vector h_sg_offsets(sg_graph_view.get_number_of_vertices() + 1); + std::vector h_sg_indices(sg_graph_view.get_number_of_edges()); + raft::update_host(h_sg_offsets.data(), + sg_graph_view.offsets(), + sg_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_sg_indices.data(), + sg_graph_view.indices(), + sg_graph_view.get_number_of_edges(), + handle.get_stream()); + + std::vector h_sg_distances(sg_graph_view.get_number_of_vertices()); + std::vector h_sg_predecessors(sg_graph_view.get_number_of_vertices()); + raft::update_host( + h_sg_distances.data(), d_sg_distances.data(), d_sg_distances.size(), handle.get_stream()); + raft::update_host(h_sg_predecessors.data(), + d_sg_predecessors.data(), + d_sg_predecessors.size(), + handle.get_stream()); + + std::vector h_mg_distances(mg_graph_view.get_number_of_local_vertices()); + std::vector h_mg_predecessors(mg_graph_view.get_number_of_local_vertices()); + raft::update_host( + h_mg_distances.data(), d_mg_distances.data(), d_mg_distances.size(), handle.get_stream()); + cugraph::experimental::unrenumber_int_vertices( + handle, + d_mg_predecessors.data(), + d_mg_predecessors.size(), + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + true); + raft::update_host(h_mg_predecessors.data(), + d_mg_predecessors.data(), + d_mg_predecessors.size(), + handle.get_stream()); + + std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size()); + raft::update_host(h_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { + auto mapped_vertex = h_mg_renumber_map_labels[i]; + ASSERT_TRUE(h_mg_distances[i] == h_sg_distances[mapped_vertex]) + << "MG BFS distance for vertex: " << mapped_vertex << " in rank: " << comm_rank + << " has value: " << h_mg_distances[i] + << " different from the corresponding SG value: " << h_sg_distances[mapped_vertex]; + if (h_mg_predecessors[i] == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_sg_predecessors[mapped_vertex] == h_mg_predecessors[i]) + << "vertex reachability does not match with the SG result."; + } else { + ASSERT_TRUE(h_sg_distances[h_mg_predecessors[i]] + 1 == h_sg_distances[mapped_vertex]) + << "distance to this vertex != distance to the predecessor vertex + 1."; + bool found{false}; + for (auto j = h_sg_offsets[h_mg_predecessors[i]]; + j < h_sg_offsets[h_mg_predecessors[i] + 1]; + ++j) { + if (h_sg_indices[j] == mapped_vertex) { + found = true; + break; + } + } + ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; + } + } + } + } +}; + +TEST_P(Tests_MGBFS, CheckInt32Int32) { run_current_test(GetParam()); } + +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_MGBFS, + ::testing::Values( + // enable correctness checks + BFS_Usecase("test/datasets/karate.mtx", 0), + BFS_Usecase("test/datasets/web-Google.mtx", 0), + BFS_Usecase("test/datasets/ljournal-2008.mtx", 0), + BFS_Usecase("test/datasets/webbase-1M.mtx", 0), + BFS_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), + // disable correctness checks for large graphs + BFS_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + 0, + false))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/mg_katz_centrality_test.cpp 
b/cpp/tests/experimental/mg_katz_centrality_test.cpp new file mode 100644 index 00000000000..e3033af3771 --- /dev/null +++ b/cpp/tests/experimental/mg_katz_centrality_test.cpp @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +typedef struct KatzCentrality_Usecase_t { + cugraph::test::input_graph_specifier_t input_graph_specifier{}; + + bool test_weighted{false}; + bool check_correctness{false}; + + KatzCentrality_Usecase_t(std::string const& graph_file_path, + bool test_weighted, + bool check_correctness = true) + : test_weighted(test_weighted), check_correctness(check_correctness) + { + std::string graph_file_full_path{}; + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH; + input_graph_specifier.graph_file_full_path = graph_file_full_path; + }; + + KatzCentrality_Usecase_t(cugraph::test::rmat_params_t rmat_params, + bool test_weighted, + bool check_correctness = true) + : test_weighted(test_weighted), check_correctness(check_correctness) + { + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; + input_graph_specifier.rmat_params = rmat_params; + } +} KatzCentrality_Usecase; + +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, KatzCentrality_Usecase const& configuration, bool renumber) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size)); + std::iota(partition_ids.begin(), + partition_ids.end(), + multi_gpu ? static_cast(comm_rank) : size_t{0}); + + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? 
cugraph::test:: + read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber, + partition_ids, + static_cast(comm_size)); +} + +class Tests_MGKatzCentrality : public ::testing::TestWithParam { + public: + Tests_MGKatzCentrality() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of running Katz Centrality on multiple GPUs to that of a single-GPU run + template + void run_current_test(KatzCentrality_Usecase const& configuration) + { + // 1. initialize handle + + raft::handle_t handle{}; + + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + auto row_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % row_comm_size != 0) { --row_comm_size; } + cugraph::partition_2d::subcomm_factory_t + subcomm_factory(handle, row_comm_size); + + // 2. create MG graph + + cugraph::experimental::graph_t mg_graph(handle); + rmm::device_uvector d_mg_renumber_map_labels(0, handle.get_stream()); + std::tie(mg_graph, d_mg_renumber_map_labels) = + read_graph(handle, configuration, true); + + auto mg_graph_view = mg_graph.view(); + + // 3. compute max in-degree + + auto max_in_degree = mg_graph_view.compute_max_in_degree(handle); + + // 4. run MG Katz Centrality + + result_t const alpha = result_t{1.0} / static_cast(max_in_degree + 1); + result_t constexpr beta{1.0}; + result_t constexpr epsilon{1e-6}; + + rmm::device_uvector d_mg_katz_centralities( + mg_graph_view.get_number_of_local_vertices(), handle.get_stream()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + cugraph::experimental::katz_centrality(handle, + mg_graph_view, + static_cast(nullptr), + d_mg_katz_centralities.data(), + alpha, + beta, + epsilon, + std::numeric_limits::max(), + false, + true); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + // 5. compare SG & MG results + + if (configuration.check_correctness) { + // 5-1. create SG graph + + cugraph::experimental::graph_t sg_graph(handle); + std::tie(sg_graph, std::ignore) = + read_graph(handle, configuration, false); + + auto sg_graph_view = sg_graph.view(); + + // 5-2. run SG Katz Centrality + + rmm::device_uvector d_sg_katz_centralities(sg_graph_view.get_number_of_vertices(), + handle.get_stream()); + + cugraph::experimental::katz_centrality(handle, + sg_graph_view, + static_cast(nullptr), + d_sg_katz_centralities.data(), + alpha, + beta, + epsilon, + std::numeric_limits::max(), // max_iterations + false, + true); + + // 5-3. 
+ + // 5-3. compare + + std::vector h_sg_katz_centralities(sg_graph_view.get_number_of_vertices()); + raft::update_host(h_sg_katz_centralities.data(), + d_sg_katz_centralities.data(), + d_sg_katz_centralities.size(), + handle.get_stream()); + + std::vector h_mg_katz_centralities(mg_graph_view.get_number_of_local_vertices()); + raft::update_host(h_mg_katz_centralities.data(), + d_mg_katz_centralities.data(), + d_mg_katz_centralities.size(), + handle.get_stream()); + + std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size()); + raft::update_host(h_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for vertices with low Katz Centrality values (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { + auto mapped_vertex = h_mg_renumber_map_labels[i]; + ASSERT_TRUE(nearly_equal(h_mg_katz_centralities[i], h_sg_katz_centralities[mapped_vertex])) + << "MG KatzCentrality value for vertex: " << mapped_vertex << " in rank: " << comm_rank + << " has value: " << h_mg_katz_centralities[i] + << " which exceeds the error margin for comparing to SG value: " + << h_sg_katz_centralities[mapped_vertex]; + } + } + } +}; + +TEST_P(Tests_MGKatzCentrality, CheckInt32Int32FloatFloat) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_MGKatzCentrality, + ::testing::Values( + // enable correctness checks + KatzCentrality_Usecase("test/datasets/karate.mtx", false), + KatzCentrality_Usecase("test/datasets/karate.mtx", true), + KatzCentrality_Usecase("test/datasets/web-Google.mtx", false), + KatzCentrality_Usecase("test/datasets/web-Google.mtx", true), + KatzCentrality_Usecase("test/datasets/ljournal-2008.mtx", false), + KatzCentrality_Usecase("test/datasets/ljournal-2008.mtx", true), + KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", false), + KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", true), + KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, + false), + KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, + true), + // disable correctness checks for large graphs + KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + false, + false), + KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + true, + false))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/mg_sssp_test.cpp b/cpp/tests/experimental/mg_sssp_test.cpp new file mode 100644 index 00000000000..48e4dc869f4 --- /dev/null +++ b/cpp/tests/experimental/mg_sssp_test.cpp @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +typedef struct SSSP_Usecase_t { + cugraph::test::input_graph_specifier_t input_graph_specifier{}; + + size_t source{0}; + bool check_correctness{false}; + + SSSP_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true) + : source(source), check_correctness(check_correctness) + { + std::string graph_file_full_path{}; + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH; + input_graph_specifier.graph_file_full_path = graph_file_full_path; + }; + + SSSP_Usecase_t(cugraph::test::rmat_params_t rmat_params, + size_t source, + bool check_correctness = true) + : source(source), check_correctness(check_correctness) + { + input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; + input_graph_specifier.rmat_params = rmat_params; + } +} SSSP_Usecase; + +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, SSSP_Usecase const& configuration, bool renumber) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size)); + std::iota(partition_ids.begin(), + partition_ids.end(), + multi_gpu ? static_cast(comm_rank) : size_t{0}); + + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, true, renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + true, + renumber, + partition_ids, + static_cast(comm_size)); +} + +class Tests_MGSSSP : public ::testing::TestWithParam { + public: + Tests_MGSSSP() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of running SSSP on multiple GPUs to that of a single-GPU run + template + void run_current_test(SSSP_Usecase const& configuration) + { + // 1. 
initialize handle + + raft::handle_t handle{}; + + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + auto row_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % row_comm_size != 0) { --row_comm_size; } + cugraph::partition_2d::subcomm_factory_t + subcomm_factory(handle, row_comm_size); + + // 2. create MG graph + + cugraph::experimental::graph_t mg_graph(handle); + rmm::device_uvector d_mg_renumber_map_labels(0, handle.get_stream()); + std::tie(mg_graph, d_mg_renumber_map_labels) = + read_graph(handle, configuration, true); + + auto mg_graph_view = mg_graph.view(); + + ASSERT_TRUE(static_cast(configuration.source) >= 0 && + static_cast(configuration.source) < + mg_graph_view.get_number_of_vertices()) + << "Invalid starting source."; + + // 3. run MG SSSP + + rmm::device_uvector d_mg_distances(mg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + rmm::device_uvector d_mg_predecessors(mg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + // FIXME: disable do_expensive_check + cugraph::experimental::sssp(handle, + mg_graph_view, + d_mg_distances.data(), + d_mg_predecessors.data(), + static_cast(configuration.source), + std::numeric_limits::max(), + true); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + // 4. compare SG & MG results + + if (configuration.check_correctness) { + // 4-1. create SG graph + + cugraph::experimental::graph_t sg_graph(handle); + std::tie(sg_graph, std::ignore) = + read_graph(handle, configuration, false); + + auto sg_graph_view = sg_graph.view(); + + std::vector vertex_partition_lasts(comm_size); + for (size_t i = 0; i < vertex_partition_lasts.size(); ++i) { + vertex_partition_lasts[i] = mg_graph_view.get_vertex_partition_last(i); + } + + rmm::device_scalar d_source(static_cast(configuration.source), + handle.get_stream()); + cugraph::experimental::unrenumber_int_vertices( + handle, + d_source.data(), + size_t{1}, + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + true); + auto unrenumbered_source = d_source.value(handle.get_stream()); + + // 4-2. run SG SSSP + + rmm::device_uvector d_sg_distances(sg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + rmm::device_uvector d_sg_predecessors(sg_graph_view.get_number_of_local_vertices(), + handle.get_stream()); + + // FIXME: disable do_expensive_check + cugraph::experimental::sssp(handle, + sg_graph_view, + d_sg_distances.data(), + d_sg_predecessors.data(), + unrenumbered_source, + std::numeric_limits::max(), + true);
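Both the unrenumbering of the source above and the per-vertex comparison below rely on the same mapping: entry i of the renumber map holds the original vertex ID that renumbering assigned to internal vertex local_vertex_first + i. A host-side sketch of the translation that unrenumber_int_vertices performs on device (a simplified, single-partition view; names are illustrative):

#include <cassert>
#include <vector>

// Gather original IDs for a batch of renumbered vertices: internal ID v maps to
// renumber_map_labels[v - local_vertex_first].
template <typename vertex_t>
void unrenumber_sketch(std::vector<vertex_t>& vertices,  // renumbered IDs, rewritten in place
                       std::vector<vertex_t> const& renumber_map_labels,
                       vertex_t local_vertex_first)
{
  for (auto& v : vertices) {
    assert(v >= local_vertex_first &&
           v < local_vertex_first + static_cast<vertex_t>(renumber_map_labels.size()));
    v = renumber_map_labels[v - local_vertex_first];
  }
}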
+ + // 4-3. compare + + std::vector h_sg_offsets(sg_graph_view.get_number_of_vertices() + 1); + std::vector h_sg_indices(sg_graph_view.get_number_of_edges()); + std::vector h_sg_weights(sg_graph_view.get_number_of_edges()); + raft::update_host(h_sg_offsets.data(), + sg_graph_view.offsets(), + sg_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_sg_indices.data(), + sg_graph_view.indices(), + sg_graph_view.get_number_of_edges(), + handle.get_stream()); + raft::update_host(h_sg_weights.data(), + sg_graph_view.weights(), + sg_graph_view.get_number_of_edges(), + handle.get_stream()); + + std::vector h_sg_distances(sg_graph_view.get_number_of_vertices()); + std::vector h_sg_predecessors(sg_graph_view.get_number_of_vertices()); + raft::update_host( + h_sg_distances.data(), d_sg_distances.data(), d_sg_distances.size(), handle.get_stream()); + raft::update_host(h_sg_predecessors.data(), + d_sg_predecessors.data(), + d_sg_predecessors.size(), + handle.get_stream()); + + std::vector h_mg_distances(mg_graph_view.get_number_of_local_vertices()); + std::vector h_mg_predecessors(mg_graph_view.get_number_of_local_vertices()); + raft::update_host( + h_mg_distances.data(), d_mg_distances.data(), d_mg_distances.size(), handle.get_stream()); + cugraph::experimental::unrenumber_int_vertices( + handle, + d_mg_predecessors.data(), + d_mg_predecessors.size(), + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + true); + raft::update_host(h_mg_predecessors.data(), + d_mg_predecessors.data(), + d_mg_predecessors.size(), + handle.get_stream()); + + std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size()); + raft::update_host(h_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto max_weight_element = std::max_element(h_sg_weights.begin(), h_sg_weights.end()); + auto epsilon = *max_weight_element * weight_t{1e-6}; + auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; + + for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { + auto mapped_vertex = h_mg_renumber_map_labels[i]; + ASSERT_TRUE(nearly_equal(h_mg_distances[i], h_sg_distances[mapped_vertex])) + << "MG SSSP distance for vertex: " << mapped_vertex << " in rank: " << comm_rank + << " has value: " << h_mg_distances[i] + << " different from the corresponding SG value: " << h_sg_distances[mapped_vertex]; + if (h_mg_predecessors[i] == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_sg_predecessors[mapped_vertex] == h_mg_predecessors[i]) + << "vertex reachability does not match with the SG result."; + } else { + auto pred_distance = h_sg_distances[h_mg_predecessors[i]]; + bool found{false}; + for (auto j = h_sg_offsets[h_mg_predecessors[i]]; + j < h_sg_offsets[h_mg_predecessors[i] + 1]; + ++j) { + if (h_sg_indices[j] == mapped_vertex) { + if (nearly_equal(pred_distance + h_sg_weights[j], h_sg_distances[mapped_vertex])) { + found = true; + break; + } + } + } + ASSERT_TRUE(found) + << "no edge from the predecessor vertex to this vertex with the matching weight."; + } + } + } + } +}; + +TEST_P(Tests_MGSSSP, CheckInt32Int32Float) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_MGSSSP, + ::testing::Values( + // enable correctness checks + SSSP_Usecase("test/datasets/karate.mtx", 0),
SSSP_Usecase("test/datasets/dblp.mtx", 0), + SSSP_Usecase("test/datasets/wiki2003.mtx", 1000), + SSSP_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), + // disable correctness checks for large graphs + SSSP_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + 0, + false))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN()
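The pagerank_test.cpp diff below keeps a host-side pagerank_reference for correctness checks. As a rough illustration of what such a reference computes, here is a minimal power-iteration sketch with damping factor alpha and an L1 convergence threshold epsilon; it ignores personalization and dangling-vertex handling, and all names are illustrative rather than the test's actual signature:

#include <algorithm>
#include <cmath>
#include <vector>

std::vector<double> pagerank_sketch(std::vector<int> const& offsets,  // CSR, size V + 1
                                    std::vector<int> const& indices,  // out-neighbors
                                    double alpha,
                                    double epsilon,
                                    int max_iterations)
{
  auto num_vertices = static_cast<int>(offsets.size()) - 1;
  std::vector<double> ranks(num_vertices, 1.0 / num_vertices);
  std::vector<double> next(num_vertices, 0.0);
  for (int iter = 0; iter < max_iterations; ++iter) {
    std::fill(next.begin(), next.end(), (1.0 - alpha) / num_vertices);  // teleport term
    for (int u = 0; u < num_vertices; ++u) {
      auto out_degree = offsets[u + 1] - offsets[u];
      if (out_degree == 0) { continue; }  // dangling mass ignored in this sketch
      auto contribution = alpha * ranks[u] / out_degree;
      for (auto j = offsets[u]; j < offsets[u + 1]; ++j) { next[indices[j]] += contribution; }
    }
    double diff_sum{0.0};
    for (int v = 0; v < num_vertices; ++v) { diff_sum += std::fabs(next[v] - ranks[v]); }
    ranks.swap(next);
    if (diff_sum < epsilon) { break; }  // converged
  }
  return ranks;
}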
diff --git a/cpp/tests/experimental/pagerank_test.cpp b/cpp/tests/experimental/pagerank_test.cpp index ff3b073cbc7..649fe11d805 100644 --- a/cpp/tests/experimental/pagerank_test.cpp +++ b/cpp/tests/experimental/pagerank_test.cpp @@ -16,9 +16,11 @@ #include #include +#include #include #include +#include #include #include @@ -35,6 +37,11 @@ #include #include +// do the perf measurements +// enabled by command line parameter '--perf' +// +static int PERF = 0; + template void pagerank_reference(edge_t const* offsets, vertex_t const* indices, @@ -128,11 +135,15 @@ typedef struct PageRank_Usecase_t { double personalization_ratio{0.0}; bool test_weighted{false}; + bool check_correctness{false}; PageRank_Usecase_t(std::string const& graph_file_path, double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { std::string graph_file_full_path{}; if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -146,14 +157,47 @@ typedef struct PageRank_Usecase_t { PageRank_Usecase_t(cugraph::test::rmat_params_t rmat_params, double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } PageRank_Usecase; +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, PageRank_Usecase const& configuration, bool renumber) +{ + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber, + std::vector{0}, + size_t{1}); +} + class Tests_PageRank : public ::testing::TestWithParam { public: Tests_PageRank() {} @@ -166,52 +210,16 @@ class Tests_PageRank : public ::testing::TestWithParam { template void run_current_test(PageRank_Usecase const& configuration) { + constexpr bool renumber = true; + raft::handle_t handle{}; cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? cugraph::test:: - read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - false) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - false); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - std::vector h_weights{}; - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - if (graph_view.is_weighted()) { - h_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0}); - raft::update_host(h_weights.data(), - graph_view.weights(), - graph_view.get_number_of_edges(), - handle.get_stream()); - } - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - std::vector h_personalization_vertices{}; std::vector h_personalization_values{}; if (configuration.personalization_ratio > 0.0) { @@ -260,21 +268,6 @@ result_t constexpr alpha{0.85}; result_t constexpr epsilon{1e-6}; - std::vector h_reference_pageranks(graph_view.get_number_of_vertices()); - - pagerank_reference(h_offsets.data(), - h_indices.data(), - h_weights.size() > 0 ?
h_weights.data() : static_cast(nullptr), - h_personalization_vertices.data(), - h_personalization_values.data(), - h_reference_pageranks.data(), - graph_view.get_number_of_vertices(), - static_cast(h_personalization_vertices.size()), - alpha, - epsilon, - std::numeric_limits::max(), - false); - rmm::device_uvector d_pageranks(graph_view.get_number_of_vertices(), handle.get_stream()); @@ -286,7 +279,7 @@ class Tests_PageRank : public ::testing::TestWithParam { d_personalization_vertices.data(), d_personalization_values.data(), static_cast(d_personalization_vertices.size()), - d_pageranks.begin(), + d_pageranks.data(), alpha, epsilon, std::numeric_limits::max(), @@ -295,26 +288,129 @@ class Tests_PageRank : public ::testing::TestWithParam { CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - std::vector h_cugraph_pageranks(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_pageranks.data(), d_pageranks.data(), d_pageranks.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - auto threshold_ratio = 1e-3; - auto threshold_magnitude = - (1.0 / static_cast(graph_view.get_number_of_vertices())) * - threshold_ratio; // skip comparison for low PageRank verties (lowly ranked vertices) - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - return std::abs(lhs - rhs) < - std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); - }; - - ASSERT_TRUE(std::equal(h_reference_pageranks.begin(), - h_reference_pageranks.end(), - h_cugraph_pageranks.begin(), - nearly_equal)) - << "PageRank values do not match with the reference values."; + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? 
unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + std::vector h_weights{}; + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + if (unrenumbered_graph_view.is_weighted()) { + h_weights.assign(unrenumbered_graph_view.get_number_of_edges(), weight_t{0.0}); + raft::update_host(h_weights.data(), + unrenumbered_graph_view.weights(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + } + + std::vector h_unrenumbered_personalization_vertices( + d_personalization_vertices.size()); + std::vector h_unrenumbered_personalization_values( + h_unrenumbered_personalization_vertices.size()); + if (renumber) { + rmm::device_uvector d_unrenumbered_personalization_vertices( + d_personalization_vertices.size(), handle.get_stream()); + rmm::device_uvector d_unrenumbered_personalization_values( + d_unrenumbered_personalization_vertices.size(), handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_vertices.data(), + d_personalization_vertices.data(), + d_personalization_vertices.size(), + handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_values.data(), + d_personalization_values.data(), + d_personalization_values.size(), + handle.get_stream()); + cugraph::experimental::unrenumber_local_int_vertices( + handle, + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.size(), + d_renumber_map_labels.data(), + vertex_t{0}, + graph_view.get_number_of_vertices()); + cugraph::test::sort_by_key(handle, + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_vertices.size()); + + raft::update_host(h_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.size(), + handle.get_stream()); + raft::update_host(h_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_values.size(), + handle.get_stream()); + } else { + raft::update_host(h_unrenumbered_personalization_vertices.data(), + d_personalization_vertices.data(), + d_personalization_vertices.size(), + handle.get_stream()); + raft::update_host(h_unrenumbered_personalization_values.data(), + d_personalization_values.data(), + d_personalization_values.size(), + handle.get_stream()); + } + + handle.get_stream_view().synchronize(); + + std::vector h_reference_pageranks(unrenumbered_graph_view.get_number_of_vertices()); + + pagerank_reference(h_offsets.data(), + h_indices.data(), + h_weights.size() > 0 ? 
h_weights.data() : static_cast(nullptr), + h_unrenumbered_personalization_vertices.data(), + h_unrenumbered_personalization_values.data(), + h_reference_pageranks.data(), + unrenumbered_graph_view.get_number_of_vertices(), + static_cast(h_personalization_vertices.size()), + alpha, + epsilon, + std::numeric_limits::max(), + false); + + std::vector h_cugraph_pageranks(graph_view.get_number_of_vertices()); + if (renumber) { + auto d_unrenumbered_pageranks = cugraph::test::sort_by_key( + handle, d_renumber_map_labels.data(), d_pageranks.data(), d_renumber_map_labels.size()); + raft::update_host(h_cugraph_pageranks.data(), + d_unrenumbered_pageranks.data(), + d_unrenumbered_pageranks.size(), + handle.get_stream()); + } else { + raft::update_host( + h_cugraph_pageranks.data(), d_pageranks.data(), d_pageranks.size(), handle.get_stream()); + } + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for vertices with low PageRank values (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + ASSERT_TRUE(std::equal(h_reference_pageranks.begin(), + h_reference_pageranks.end(), + h_cugraph_pageranks.begin(), + nearly_equal)) + << "PageRank values do not match with the reference values."; + } } }; @@ -328,6 +424,7 @@ INSTANTIATE_TEST_CASE_P( simple_test, Tests_PageRank, ::testing::Values( + // enable correctness checks PageRank_Usecase("test/datasets/karate.mtx", 0.0, false), PageRank_Usecase("test/datasets/karate.mtx", 0.5, false), PageRank_Usecase("test/datasets/karate.mtx", 0.0, true), @@ -355,6 +452,15 @@ INSTANTIATE_TEST_CASE_P( true), PageRank_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0.5, - true))); + true), + // disable correctness checks for large graphs + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, false, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, false, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, true, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, true, false))); CUGRAPH_TEST_PROGRAM_MAIN()
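The sort_by_key test helper used above puts renumbered per-vertex results back into original vertex order: sorting the renumber-map labels carries the paired values along with them. A host-side sketch of the same reordering (illustrative names; the helper itself operates on device memory):

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Reorder `values` to follow ascending `keys`; here keys[i] is the original vertex ID of
// renumbered vertex i, so the result is indexed in original-vertex-ID order.
template <typename key_t, typename value_t>
std::vector<value_t> sort_values_by_key(std::vector<key_t> const& keys,
                                        std::vector<value_t> const& values)
{
  std::vector<std::size_t> order(keys.size());
  std::iota(order.begin(), order.end(), std::size_t{0});
  std::sort(order.begin(), order.end(), [&keys](auto l, auto r) { return keys[l] < keys[r]; });
  std::vector<value_t> sorted(values.size());
  for (std::size_t i = 0; i < order.size(); ++i) { sorted[i] = values[order[i]]; }
  return sorted;
}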
diff --git a/cpp/tests/experimental/rw_low_level_test.cu b/cpp/tests/experimental/rw_low_level_test.cu index a32e258d366..8b562bc41f6 100644 --- a/cpp/tests/experimental/rw_low_level_test.cu +++ b/cpp/tests/experimental/rw_low_level_test.cu @@ -53,7 +53,8 @@ graph_t make_graph(raft::handle_t cons std::vector const& v_dst, std::vector const& v_w, vertex_t num_vertices, - edge_t num_edges) + edge_t num_edges, + bool is_weighted) { vector_test_t d_src(num_edges, handle.get_stream()); vector_test_t d_dst(num_edges, handle.get_stream()); @@ -67,7 +68,7 @@ graph_t make_graph(raft::handle_t cons d_src.data(), d_dst.data(), d_weights.data(), num_edges}; graph_t graph( - handle, edgelist, num_vertices, graph_properties_t{}, false); + handle, edgelist, num_vertices, graph_properties_t{false, false, is_weighted}, false); return graph; } @@ -119,7 +120,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphRWStart) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -199,7 +200,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphCoalesceExperiments) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -275,7 +276,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphColExtraction) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -371,7 +372,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphRndGenColIndx) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -449,7 +450,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphUpdatePathSizes) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -521,7 +522,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphScatterUpdate) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -666,7 +667,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphCoalesceDefragment) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); @@ -741,7 +742,7 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphRandomWalk) std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; - auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges); + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); auto graph_view = graph.view(); diff --git a/cpp/tests/experimental/sssp_test.cpp b/cpp/tests/experimental/sssp_test.cpp index 611abcb0d75..9364d261dec 100644 --- a/cpp/tests/experimental/sssp_test.cpp +++ b/cpp/tests/experimental/sssp_test.cpp @@ -16,9 +16,11 @@ #include #include +#include #include #include +#include #include #include @@ -28,12 +30,18 @@ #include +#include #include #include #include #include #include +// do the perf measurements +// enabled by command line parameter '--perf' +// +static int PERF = 0; +
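sssp_reference, declared just below, is the host-side Dijkstra implementation the GPU results are checked against. A minimal sketch of that algorithm over a CSR graph, using a lazy-deletion priority queue (illustrative names and signature, not the reference's exact one):

#include <functional>
#include <limits>
#include <queue>
#include <utility>
#include <vector>

void dijkstra_sketch(std::vector<int> const& offsets,      // CSR, size V + 1
                     std::vector<int> const& indices,      // out-neighbors
                     std::vector<float> const& weights,    // per-edge weights
                     int source,
                     std::vector<float>& distances,
                     std::vector<int>& predecessors)
{
  auto constexpr invalid = -1;
  auto num_vertices = static_cast<int>(offsets.size()) - 1;
  distances.assign(num_vertices, std::numeric_limits<float>::max());
  predecessors.assign(num_vertices, invalid);
  using entry = std::pair<float, int>;  // (tentative distance, vertex)
  std::priority_queue<entry, std::vector<entry>, std::greater<entry>> frontier;
  distances[source] = 0.0f;
  frontier.emplace(0.0f, source);
  while (!frontier.empty()) {
    auto [dist, u] = frontier.top();
    frontier.pop();
    if (dist > distances[u]) { continue; }  // stale queue entry, skip
    for (auto j = offsets[u]; j < offsets[u + 1]; ++j) {
      auto v        = indices[j];
      auto new_dist = dist + weights[j];
      if (new_dist < distances[v]) {  // relax edge (u, v)
        distances[v]    = new_dist;
        predecessors[v] = u;
        frontier.emplace(new_dist, v);
      }
    }
  }
}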
// Dijkstra's algorithm template void sssp_reference(edge_t const* offsets, @@ -80,9 +88,12 @@ void sssp_reference(edge_t const* offsets, typedef struct SSSP_Usecase_t { cugraph::test::input_graph_specifier_t input_graph_specifier{}; - size_t source{false}; - SSSP_Usecase_t(std::string const& graph_file_path, size_t source) : source(source) + size_t source{0}; + bool check_correctness{false}; + + SSSP_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true) + : source(source), check_correctness(check_correctness) { std::string graph_file_full_path{}; if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -94,13 +105,43 @@ typedef struct SSSP_Usecase_t { input_graph_specifier.graph_file_full_path = graph_file_full_path; }; - SSSP_Usecase_t(cugraph::test::rmat_params_t rmat_params, size_t source) : source(source) + SSSP_Usecase_t(cugraph::test::rmat_params_t rmat_params, + size_t source, + bool check_correctness = true) + : source(source), check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } SSSP_Usecase; +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, SSSP_Usecase const& configuration, bool renumber) +{ + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, true, renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + true, + renumber, + std::vector{0}, + size_t{1}); +} + class Tests_SSSP : public ::testing::TestWithParam { public: Tests_SSSP() {} @@ -113,61 +154,18 @@ class Tests_SSSP : public ::testing::TestWithParam { template void run_current_test(SSSP_Usecase const& configuration) { + constexpr bool renumber = true; + raft::handle_t handle{}; cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ?
cugraph::test:: - read_graph_from_matrix_market_file( - handle, configuration.input_graph_specifier.graph_file_full_path, true, false) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - true, - false); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - std::vector h_weights(graph_view.get_number_of_edges()); - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - raft::update_host(h_weights.data(), - graph_view.weights(), - graph_view.get_number_of_edges(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - ASSERT_TRUE(configuration.source >= 0 && - configuration.source <= graph_view.get_number_of_vertices()) - << "Starting sources should be >= 0 and" - << " less than the number of vertices in the graph."; - - std::vector h_reference_distances(graph_view.get_number_of_vertices()); - std::vector h_reference_predecessors(graph_view.get_number_of_vertices()); - - sssp_reference(h_offsets.data(), - h_indices.data(), - h_weights.data(), - h_reference_distances.data(), - h_reference_predecessors.data(), - graph_view.get_number_of_vertices(), - static_cast(configuration.source)); + ASSERT_TRUE(static_cast(configuration.source) >= 0 && + static_cast(configuration.source) < graph_view.get_number_of_vertices()); rmm::device_uvector d_distances(graph_view.get_number_of_vertices(), handle.get_stream()); @@ -178,53 +176,135 @@ class Tests_SSSP : public ::testing::TestWithParam { cugraph::experimental::sssp(handle, graph_view, - d_distances.begin(), - d_predecessors.begin(), + d_distances.data(), + d_predecessors.data(), static_cast(configuration.source), std::numeric_limits::max(), false); CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); - std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); - raft::update_host(h_cugraph_predecessors.data(), - d_predecessors.data(), - d_predecessors.size(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - auto max_weight_element = std::max_element(h_weights.begin(), h_weights.end()); - auto epsilon = *max_weight_element * weight_t{1e-6}; - auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; - - ASSERT_TRUE(std::equal(h_reference_distances.begin(), - h_reference_distances.end(), - h_cugraph_distances.begin(), - nearly_equal)) - << "distances do not match with the reference values."; - - for (auto it = 
h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { - auto i = std::distance(h_cugraph_predecessors.begin(), it); - if (*it == cugraph::invalid_vertex_id::value) { - ASSERT_TRUE(h_reference_predecessors[i] == *it) - << "vertex reachability do not match with the reference."; + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + std::vector h_weights(unrenumbered_graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + raft::update_host(h_weights.data(), + unrenumbered_graph_view.weights(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto unrenumbered_source = static_cast(configuration.source); + if (renumber) { + std::vector h_renumber_map_labels(d_renumber_map_labels.size()); + raft::update_host(h_renumber_map_labels.data(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + unrenumbered_source = h_renumber_map_labels[configuration.source]; + } + + std::vector h_reference_distances(unrenumbered_graph_view.get_number_of_vertices()); + std::vector h_reference_predecessors( + unrenumbered_graph_view.get_number_of_vertices()); + + sssp_reference(h_offsets.data(), + h_indices.data(), + h_weights.data(), + h_reference_distances.data(), + h_reference_predecessors.data(), + unrenumbered_graph_view.get_number_of_vertices(), + unrenumbered_source, + std::numeric_limits::max()); + + std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); + std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); + if (renumber) { + cugraph::experimental::unrenumber_local_int_vertices(handle, + d_predecessors.data(), + d_predecessors.size(), + d_renumber_map_labels.data(), + vertex_t{0}, + graph_view.get_number_of_vertices(), + true); + + auto d_unrenumbered_distances = cugraph::test::sort_by_key( + handle, d_renumber_map_labels.data(), d_distances.data(), d_renumber_map_labels.size()); + auto d_unrenumbered_predecessors = cugraph::test::sort_by_key(handle, + d_renumber_map_labels.data(), + d_predecessors.data(), + d_renumber_map_labels.size()); + + raft::update_host(h_cugraph_distances.data(), + d_unrenumbered_distances.data(), + d_unrenumbered_distances.size(), + handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_unrenumbered_predecessors.data(), + d_unrenumbered_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); } else { - auto pred_distance = h_reference_distances[*it]; - bool found{false}; - for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { - if (h_indices[j] == i) { - if (nearly_equal(pred_distance + h_weights[j], h_reference_distances[i])) { - found = true; - break; + raft::update_host( + h_cugraph_distances.data(), d_distances.data(), 
d_distances.size(), handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_predecessors.data(), + d_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + } + + auto max_weight_element = std::max_element(h_weights.begin(), h_weights.end()); + auto epsilon = *max_weight_element * weight_t{1e-6}; + auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; + + ASSERT_TRUE(std::equal(h_reference_distances.begin(), + h_reference_distances.end(), + h_cugraph_distances.begin(), + nearly_equal)) + << "distances do not match with the reference values."; + + for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { + auto i = std::distance(h_cugraph_predecessors.begin(), it); + if (*it == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_reference_predecessors[i] == *it) + << "vertex reachability does not match with the reference."; + } else { + auto pred_distance = h_reference_distances[*it]; + bool found{false}; + for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { + if (h_indices[j] == i) { + if (nearly_equal(pred_distance + h_weights[j], h_reference_distances[i])) { + found = true; + break; + } + } + } + ASSERT_TRUE(found) + << "no edge from the predecessor vertex to this vertex with the matching weight."; + } + } + } } }; @@ -237,9 +317,14 @@ INSTANTIATE_TEST_CASE_P( simple_test, Tests_SSSP, ::testing::Values( + // enable correctness checks SSSP_Usecase("test/datasets/karate.mtx", 0), SSSP_Usecase("test/datasets/dblp.mtx", 0), SSSP_Usecase("test/datasets/wiki2003.mtx", 1000), - SSSP_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0))); + SSSP_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), + // disable correctness checks for large graphs + SSSP_Usecase(cugraph::test::rmat_params_t{20, 16, 0.57, 0.19, 0.19, 0, false, false}, + 0, + false))); CUGRAPH_TEST_PROGRAM_MAIN()
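Every MG test in this PR, including mg_pagerank_test.cpp below, builds its 2D process grid the same way: pick the largest divisor of the GPU count that does not exceed its square root, so the row-by-column grid stays as close to square as possible. The heuristic in isolation:

#include <cmath>

// The subcomm factory splits comm_size GPUs into a row_comm_size x
// (comm_size / row_comm_size) grid; this mirrors the loop used in the tests.
int pick_row_comm_size(int comm_size)
{
  auto row_comm_size = static_cast<int>(std::sqrt(static_cast<double>(comm_size)));
  while (comm_size % row_comm_size != 0) { --row_comm_size; }
  return row_comm_size;  // e.g. 8 GPUs -> 2, 12 -> 3, 16 -> 4
}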
diff --git a/cpp/tests/pagerank/mg_pagerank_test.cpp b/cpp/tests/pagerank/mg_pagerank_test.cpp index 85ee9a4243e..f7b1e8dfbb4 100644 --- a/cpp/tests/pagerank/mg_pagerank_test.cpp +++ b/cpp/tests/pagerank/mg_pagerank_test.cpp @@ -16,13 +16,19 @@ #include #include +#include #include +#include +#include +#include #include #include #include #include +#include +#include #include @@ -33,11 +39,15 @@ typedef struct PageRank_Usecase_t { double personalization_ratio{0.0}; bool test_weighted{false}; + bool check_correctness{false}; PageRank_Usecase_t(std::string const& graph_file_path, double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { std::string graph_file_full_path{}; if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -51,14 +61,56 @@ typedef struct PageRank_Usecase_t { PageRank_Usecase_t(cugraph::test::rmat_params_t rmat_params, double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } PageRank_Usecase; +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, PageRank_Usecase const& configuration, bool renumber) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size)); + std::iota(partition_ids.begin(), + partition_ids.end(), + multi_gpu ? static_cast(comm_rank) : size_t{0}); + + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber, + partition_ids, + static_cast(comm_size)); +} + class Tests_MGPageRank : public ::testing::TestWithParam { public: Tests_MGPageRank() {} @@ -68,7 +120,7 @@ class Tests_MGPageRank : public ::testing::TestWithParam { virtual void SetUp() {} virtual void TearDown() {} - // Compare the results of running pagerank on multiple GPUs to that of a single-GPU run + // Compare the results of running PageRank on multiple GPUs to that of a single-GPU run template void run_current_test(PageRank_Usecase const& configuration) { @@ -86,168 +138,40 @@ class Tests_MGPageRank : public ::testing::TestWithParam { cugraph::partition_2d::subcomm_factory_t subcomm_factory(handle, row_comm_size); - // 2. create SG & MG graphs - - cugraph::experimental::graph_t sg_graph(handle); - rmm::device_uvector d_sg_renumber_map_labels(0, handle.get_stream()); - std::tie(sg_graph, d_sg_renumber_map_labels) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ?
cugraph::test::read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - true) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - true); + read_graph(handle, configuration, true); auto mg_graph_view = mg_graph.view(); - std::vector h_sg_renumber_map_labels(d_sg_renumber_map_labels.size()); - raft::update_host(h_sg_renumber_map_labels.data(), - d_sg_renumber_map_labels.data(), - d_sg_renumber_map_labels.size(), - handle.get_stream()); - - std::vector h_mg_renumber_map_labels(mg_graph_view.get_number_of_local_vertices()); - raft::update_host(h_mg_renumber_map_labels.data(), - d_mg_renumber_map_labels.data(), - d_mg_renumber_map_labels.size(), - handle.get_stream()); + // 3. generate personalization vertex/value pairs - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - // 2. generate personalization vertex/value pairs - - std::vector h_personalization_vertices{}; - std::vector h_personalization_values{}; + std::vector h_mg_personalization_vertices{}; + std::vector h_mg_personalization_values{}; if (configuration.personalization_ratio > 0.0) { - std::default_random_engine generator{}; + std::default_random_engine generator{ + static_cast(comm.get_rank()) /* seed */}; std::uniform_real_distribution distribution{0.0, 1.0}; - h_personalization_vertices.resize(sg_graph_view.get_number_of_vertices()); - std::iota(h_personalization_vertices.begin(), h_personalization_vertices.end(), vertex_t{0}); - h_personalization_vertices.erase( - std::remove_if(h_personalization_vertices.begin(), - h_personalization_vertices.end(), + h_mg_personalization_vertices.resize(mg_graph_view.get_number_of_local_vertices()); + std::iota(h_mg_personalization_vertices.begin(), + h_mg_personalization_vertices.end(), + mg_graph_view.get_local_vertex_first()); + h_mg_personalization_vertices.erase( + std::remove_if(h_mg_personalization_vertices.begin(), + h_mg_personalization_vertices.end(), [&generator, &distribution, configuration](auto v) { return distribution(generator) >= configuration.personalization_ratio; }), - h_personalization_vertices.end()); - h_personalization_values.resize(h_personalization_vertices.size()); - std::for_each(h_personalization_values.begin(), - h_personalization_values.end(), + h_mg_personalization_vertices.end()); + h_mg_personalization_values.resize(h_mg_personalization_vertices.size()); + std::for_each(h_mg_personalization_values.begin(), + h_mg_personalization_values.end(), [&distribution, &generator](auto& val) { val = distribution(generator); }); } - result_t constexpr alpha{0.85}; - result_t constexpr epsilon{1e-6}; - - // 3. 
run SG pagerank - - std::vector h_sg_personalization_vertices{}; - std::vector h_sg_personalization_values{}; - if (h_personalization_vertices.size() > 0) { - for (vertex_t i = 0; i < sg_graph_view.get_number_of_vertices(); ++i) { - auto it = std::lower_bound(h_personalization_vertices.begin(), - h_personalization_vertices.end(), - h_sg_renumber_map_labels[i]); - if (*it == h_sg_renumber_map_labels[i]) { - h_sg_personalization_vertices.push_back(i); - h_sg_personalization_values.push_back( - h_personalization_values[std::distance(h_personalization_vertices.begin(), it)]); - } - } - } - - rmm::device_uvector d_sg_personalization_vertices( - h_sg_personalization_vertices.size(), handle.get_stream()); - rmm::device_uvector d_sg_personalization_values(d_sg_personalization_vertices.size(), - handle.get_stream()); - if (d_sg_personalization_vertices.size() > 0) { - raft::update_device(d_sg_personalization_vertices.data(), - h_sg_personalization_vertices.data(), - h_sg_personalization_vertices.size(), - handle.get_stream()); - raft::update_device(d_sg_personalization_values.data(), - h_sg_personalization_values.data(), - h_sg_personalization_values.size(), - handle.get_stream()); - } - - rmm::device_uvector d_sg_pageranks(sg_graph_view.get_number_of_vertices(), - handle.get_stream()); - - cugraph::experimental::pagerank(handle, - sg_graph_view, - static_cast(nullptr), - d_sg_personalization_vertices.data(), - d_sg_personalization_values.data(), - static_cast(d_sg_personalization_vertices.size()), - d_sg_pageranks.begin(), - alpha, - epsilon, - std::numeric_limits::max(), // max_iterations - false, - false); - - std::vector h_sg_pageranks(sg_graph_view.get_number_of_vertices()); - raft::update_host( - h_sg_pageranks.data(), d_sg_pageranks.data(), d_sg_pageranks.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - // 4. run MG pagerank - - std::vector h_mg_personalization_vertices{}; - std::vector h_mg_personalization_values{}; - if (h_personalization_vertices.size() > 0) { - for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { - auto it = std::lower_bound(h_personalization_vertices.begin(), - h_personalization_vertices.end(), - h_mg_renumber_map_labels[i]); - if (*it == h_mg_renumber_map_labels[i]) { - h_mg_personalization_vertices.push_back(mg_graph_view.get_local_vertex_first() + i); - h_mg_personalization_values.push_back( - h_personalization_values[std::distance(h_personalization_vertices.begin(), it)]); - } - } - } - rmm::device_uvector d_mg_personalization_vertices( h_mg_personalization_vertices.size(), handle.get_stream()); rmm::device_uvector d_mg_personalization_values(d_mg_personalization_vertices.size(), @@ -263,6 +187,11 @@ class Tests_MGPageRank : public ::testing::TestWithParam { handle.get_stream()); } + // 4. 
run MG PageRank + + result_t constexpr alpha{0.85}; + result_t constexpr epsilon{1e-6}; + rmm::device_uvector d_mg_pageranks(mg_graph_view.get_number_of_local_vertices(), handle.get_stream()); @@ -274,44 +203,145 @@ class Tests_MGPageRank : public ::testing::TestWithParam { d_mg_personalization_vertices.data(), d_mg_personalization_values.data(), static_cast(d_mg_personalization_vertices.size()), - d_mg_pageranks.begin(), + d_mg_pageranks.data(), alpha, epsilon, std::numeric_limits::max(), - false, false); CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - std::vector h_mg_pageranks(mg_graph_view.get_number_of_local_vertices()); - raft::update_host( - h_mg_pageranks.data(), d_mg_pageranks.data(), d_mg_pageranks.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - // 5. compare SG & MG results - std::vector h_sg_shuffled_pageranks(sg_graph_view.get_number_of_vertices(), - result_t{0.0}); - for (size_t i = 0; i < h_sg_pageranks.size(); ++i) { - h_sg_shuffled_pageranks[h_sg_renumber_map_labels[i]] = h_sg_pageranks[i]; - } + if (configuration.check_correctness) { + // 5-1. create SG graph + + cugraph::experimental::graph_t sg_graph(handle); + std::tie(sg_graph, std::ignore) = + read_graph(handle, configuration, false); + + auto sg_graph_view = sg_graph.view(); + + // 5-2. collect personalization vertex/value pairs + + rmm::device_uvector d_sg_personalization_vertices(0, handle.get_stream()); + rmm::device_uvector d_sg_personalization_values(0, handle.get_stream()); + if (configuration.personalization_ratio > 0.0) { + rmm::device_uvector d_unrenumbered_personalization_vertices( + d_mg_personalization_vertices.size(), handle.get_stream()); + rmm::device_uvector d_unrenumbered_personalization_values( + d_unrenumbered_personalization_vertices.size(), handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_vertices.data(), + d_mg_personalization_vertices.data(), + d_mg_personalization_vertices.size(), + handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_values.data(), + d_mg_personalization_values.data(), + d_mg_personalization_values.size(), + handle.get_stream()); + + std::vector vertex_partition_lasts(comm_size); + for (size_t i = 0; i < vertex_partition_lasts.size(); ++i) { + vertex_partition_lasts[i] = mg_graph_view.get_vertex_partition_last(i); + } + cugraph::experimental::unrenumber_int_vertices( + handle, + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.size(), + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + handle.get_stream()); + + rmm::device_scalar d_local_personalization_vector_size( + d_unrenumbered_personalization_vertices.size(), handle.get_stream()); + rmm::device_uvector d_recvcounts(comm_size, handle.get_stream()); + comm.allgather( + d_local_personalization_vector_size.data(), d_recvcounts.data(), 1, handle.get_stream()); + std::vector recvcounts(d_recvcounts.size()); + raft::update_host( + recvcounts.data(), d_recvcounts.data(), d_recvcounts.size(), handle.get_stream()); + auto status = comm.sync_stream(handle.get_stream()); + ASSERT_EQ(status, raft::comms::status_t::SUCCESS); + + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); +
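// For instance, with three ranks contributing recvcounts = {3, 1, 2} personalization
// pairs, the exclusive prefix sum above yields displacements = {0, 3, 4}: rank 0's
// pairs land at offset 0, rank 1's at offset 3, rank 2's at offset 4, and the
// gathered arrays hold displacements.back() + recvcounts.back() = 6 entries in total.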
+ d_sg_personalization_vertices.resize(displacements.back() + recvcounts.back(), + handle.get_stream()); + d_sg_personalization_values.resize(d_sg_personalization_vertices.size(), + handle.get_stream()); + + comm.allgatherv(d_unrenumbered_personalization_vertices.data(), + d_sg_personalization_vertices.data(), + recvcounts.data(), + displacements.data(), + handle.get_stream()); + comm.allgatherv(d_unrenumbered_personalization_values.data(), + d_sg_personalization_values.data(), + recvcounts.data(), + displacements.data(), + handle.get_stream()); + + cugraph::test::sort_by_key(handle, + d_sg_personalization_vertices.data(), + d_sg_personalization_values.data(), + d_sg_personalization_vertices.size()); + } + + // 5-3. run SG PageRank + + rmm::device_uvector d_sg_pageranks(sg_graph_view.get_number_of_vertices(), + handle.get_stream()); + + cugraph::experimental::pagerank(handle, + sg_graph_view, + static_cast(nullptr), + d_sg_personalization_vertices.data(), + d_sg_personalization_values.data(), + static_cast(d_sg_personalization_vertices.size()), + d_sg_pageranks.data(), + alpha, + epsilon, + std::numeric_limits::max(), // max_iterations + false); + + // 5-4. compare + + std::vector h_sg_pageranks(sg_graph_view.get_number_of_vertices()); + raft::update_host( + h_sg_pageranks.data(), d_sg_pageranks.data(), d_sg_pageranks.size(), handle.get_stream()); + + std::vector h_mg_pageranks(mg_graph_view.get_number_of_local_vertices()); + raft::update_host( + h_mg_pageranks.data(), d_mg_pageranks.data(), d_mg_pageranks.size(), handle.get_stream()); + + std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size()); + raft::update_host(h_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for vertices with low PageRank values (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; - auto threshold_ratio = 1e-3; - auto threshold_magnitude = - (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * - threshold_ratio; // skip comparison for low PageRank verties (lowly ranked vertices) - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - return std::abs(lhs - rhs) < - std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); - }; - - for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { - auto mapped_vertex = h_mg_renumber_map_labels[i]; - ASSERT_TRUE(nearly_equal(h_mg_pageranks[i], h_sg_shuffled_pageranks[mapped_vertex])) - << "MG PageRank value for vertex: " << i << " in rank: " << comm_rank - << " has value: " << h_mg_pageranks[i] - << " which exceeds the error margin for comparing to SG value: " - << h_sg_shuffled_pageranks[mapped_vertex]; + for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { + auto mapped_vertex = h_mg_renumber_map_labels[i]; + ASSERT_TRUE(nearly_equal(h_mg_pageranks[i], h_sg_pageranks[mapped_vertex])) + << "MG PageRank value for vertex: " << mapped_vertex << " in rank: " << comm_rank + << " has value: " << h_mg_pageranks[i] + << " which exceeds the error margin for comparing to SG value: " + << h_sg_pageranks[mapped_vertex]; + } } } };
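The comparison rule shared by the PageRank and Katz Centrality checks combines a relative tolerance with an absolute floor: both tests scale the floor by 1/V, the magnitude of a typical normalized score, so the lowest-ranked vertices do not trigger spurious relative-error failures. In isolation (an illustrative free-function form of the lambda used in the tests):

#include <algorithm>
#include <cmath>

// Two scores compare equal when their difference is below the larger of the
// relative tolerance (scaled by the bigger score) and the absolute floor.
bool nearly_equal(double lhs, double rhs, double threshold_ratio, double threshold_magnitude)
{
  return std::abs(lhs - rhs) <
         std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
}

With threshold_ratio = 1e-3 and a million-vertex graph, for example, any two scores closer than 1e-9 compare equal regardless of their relative error.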
enable correctness checks PageRank_Usecase("test/datasets/karate.mtx", 0.0, false), PageRank_Usecase("test/datasets/karate.mtx", 0.5, false), PageRank_Usecase("test/datasets/karate.mtx", 0.0, true), @@ -352,6 +383,15 @@ INSTANTIATE_TEST_CASE_P( true), PageRank_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0.5, - true))); + true), + // disable correctness checks for large graphs + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, false, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, false, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, true, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, true, false))); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/generate_graph_from_edgelist.cu b/cpp/tests/utilities/generate_graph_from_edgelist.cu index 1b9fe6051f7..a9df392d2fb 100644 --- a/cpp/tests/utilities/generate_graph_from_edgelist.cu +++ b/cpp/tests/utilities/generate_graph_from_edgelist.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -28,7 +29,7 @@ namespace cugraph { namespace test { -namespace detail { +namespace { template , rmm::device_uvector>> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber) +generate_graph_from_edgelist_impl(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber) { CUGRAPH_EXPECTS(renumber, "renumber should be true if multi_gpu is true."); @@ -59,95 +60,88 @@ generate_graph_from_edgelist(raft::handle_t const& handle, auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); - vertex_t number_of_vertices = static_cast(vertices.size()); - - auto vertex_key_func = - cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}; - vertices.resize(thrust::distance(vertices.begin(), - thrust::remove_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertices.begin(), - vertices.end(), - [comm_rank, key_func = vertex_key_func] __device__(auto val) { - return key_func(val) != comm_rank; - })), - handle.get_stream()); - vertices.shrink_to_fit(handle.get_stream()); - - auto edge_key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - false, comm_size, row_comm_size, col_comm_size}; - size_t number_of_local_edges{}; - if (test_weighted) { - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_rows.begin(), edgelist_cols.begin(), edgelist_weights.begin())); - number_of_local_edges = thrust::distance( - edge_first, - thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edge_first, - edge_first + edgelist_rows.size(), - [comm_rank, key_func = edge_key_func] __device__(auto e) { - auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); - auto minor = store_transposed ? 
thrust::get<0>(e) : thrust::get<1>(e); - return key_func(major, minor) != comm_rank; - })); - } else { - auto edge_first = - thrust::make_zip_iterator(thrust::make_tuple(edgelist_rows.begin(), edgelist_cols.begin())); - number_of_local_edges = thrust::distance( - edge_first, - thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edge_first, - edge_first + edgelist_rows.size(), - [comm_rank, key_func = edge_key_func] __device__(auto e) { - auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); - auto minor = store_transposed ? thrust::get<0>(e) : thrust::get<1>(e); - return key_func(major, minor) != comm_rank; - })); - } - - edgelist_rows.resize(number_of_local_edges, handle.get_stream()); - edgelist_rows.shrink_to_fit(handle.get_stream()); - edgelist_cols.resize(number_of_local_edges, handle.get_stream()); - edgelist_cols.shrink_to_fit(handle.get_stream()); - if (test_weighted) { - edgelist_weights.resize(number_of_local_edges, handle.get_stream()); - edgelist_weights.shrink_to_fit(handle.get_stream()); - } + auto local_partition_id_op = + [comm_size, + key_func = cugraph::experimental::detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto pair) { + return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) / + comm_size; // global partition id to local partition id + }; + auto pair_first = + store_transposed + ? thrust::make_zip_iterator(thrust::make_tuple(edgelist_cols.begin(), edgelist_rows.begin())) + : thrust::make_zip_iterator(thrust::make_tuple(edgelist_rows.begin(), edgelist_cols.begin())); + auto edge_counts = test_weighted + ? cugraph::experimental::groupby_and_count(pair_first, + pair_first + edgelist_rows.size(), + edgelist_weights.begin(), + local_partition_id_op, + col_comm_size, + handle.get_stream()) + : cugraph::experimental::groupby_and_count(pair_first, + pair_first + edgelist_rows.size(), + local_partition_id_op, + col_comm_size, + handle.get_stream()); + + std::vector h_edge_counts(edge_counts.size()); + raft::update_host( + h_edge_counts.data(), edge_counts.data(), edge_counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + std::vector h_displacements(h_edge_counts.size(), size_t{0}); + std::partial_sum(h_edge_counts.begin(), h_edge_counts.end() - 1, h_displacements.begin() + 1); // 3. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); cugraph::experimental::partition_t partition{}; - vertex_t aggregate_number_of_vertices{}; + vertex_t number_of_vertices{}; edge_t number_of_edges{}; - // FIXME: set do_expensive_check to false once validated - std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = - cugraph::experimental::renumber_edgelist( - handle, - vertices.data(), - static_cast(vertices.size()), - store_transposed ? edgelist_cols.data() : edgelist_rows.data(), - store_transposed ? edgelist_rows.data() : edgelist_cols.data(), - edgelist_rows.size(), - false, - true); - assert(aggregate_number_of_vertices == number_of_vertices); + { + std::vector major_ptrs(h_edge_counts.size()); + std::vector minor_ptrs(major_ptrs.size()); + std::vector counts(major_ptrs.size()); + for (size_t i = 0; i < h_edge_counts.size(); ++i) { + major_ptrs[i] = + (store_transposed ? edgelist_cols.begin() : edgelist_rows.begin()) + h_displacements[i]; + minor_ptrs[i] = + (store_transposed ? 
edgelist_rows.begin() : edgelist_cols.begin()) + h_displacements[i]; + counts[i] = static_cast(h_edge_counts[i]); + } + // FIXME: set do_expensive_check to false once validated + std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) = + cugraph::experimental::renumber_edgelist( + handle, + vertices.data(), + static_cast(vertices.size()), + major_ptrs, + minor_ptrs, + counts, + true); + } // 4. create a graph + std::vector> edgelists( + h_edge_counts.size()); + for (size_t i = 0; i < h_edge_counts.size(); ++i) { + edgelists[i] = cugraph::experimental::edgelist_t{ + edgelist_rows.data() + h_displacements[i], + edgelist_cols.data() + h_displacements[i], + test_weighted ? edgelist_weights.data() + h_displacements[i] + : static_cast(nullptr), + static_cast(h_edge_counts[i])}; + } + return std::make_tuple( cugraph::experimental::graph_t( handle, - std::vector>{ - cugraph::experimental::edgelist_t{ - edgelist_rows.data(), - edgelist_cols.data(), - test_weighted ? edgelist_weights.data() : nullptr, - static_cast(edgelist_rows.size())}}, + edgelists, partition, number_of_vertices, number_of_edges, - cugraph::experimental::graph_properties_t{is_symmetric, false}, + cugraph::experimental::graph_properties_t{is_symmetric, false, test_weighted}, true, true), std::move(renumber_map_labels)); @@ -163,14 +157,14 @@ std::enable_if_t< std::tuple< cugraph::experimental::graph_t, rmm::device_uvector>> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber) +generate_graph_from_edgelist_impl(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber) { vertex_t number_of_vertices = static_cast(vertices.size()); @@ -196,13 +190,13 @@ generate_graph_from_edgelist(raft::handle_t const& handle, test_weighted ? edgelist_weights.data() : nullptr, static_cast(edgelist_rows.size())}, number_of_vertices, - cugraph::experimental::graph_properties_t{is_symmetric, false}, + cugraph::experimental::graph_properties_t{is_symmetric, false, test_weighted}, renumber ? true : false, true), std::move(renumber_map_labels)); } -} // namespace detail +} // namespace template ( - handle, - std::move(vertices), - std::move(edgelist_rows), - std::move(edgelist_cols), - std::move(edgelist_weights), - is_symmetric, - test_weighted, - renumber); + return generate_graph_from_edgelist_impl( + handle, + std::move(vertices), + std::move(edgelist_rows), + std::move(edgelist_cols), + std::move(edgelist_weights), + is_symmetric, + test_weighted, + renumber); } // explicit instantiations diff --git a/cpp/tests/utilities/matrix_market_file_utilities.cu b/cpp/tests/utilities/matrix_market_file_utilities.cu index ddbbac603ee..bf7539864be 100644 --- a/cpp/tests/utilities/matrix_market_file_utilities.cu +++ b/cpp/tests/utilities/matrix_market_file_utilities.cu @@ -13,9 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #include +#include #include +#include #include #include @@ -339,7 +342,73 @@ read_graph_from_matrix_market_file(raft::handle_t const& handle, d_vertices.begin(), d_vertices.end(), vertex_t{0}); + handle.get_stream_view().synchronize(); + + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + auto vertex_key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}; + d_vertices.resize( + thrust::distance( + d_vertices.begin(), + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_vertices.begin(), + d_vertices.end(), + [comm_rank, key_func = vertex_key_func] __device__(auto val) { + return key_func(val) != comm_rank; + })), + handle.get_stream()); + d_vertices.shrink_to_fit(handle.get_stream()); + + auto edge_key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}; + size_t number_of_local_edges{}; + if (test_weighted) { + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + d_edgelist_rows.begin(), d_edgelist_cols.begin(), d_edgelist_weights.begin())); + number_of_local_edges = thrust::distance( + edge_first, + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + d_edgelist_rows.size(), + [comm_rank, key_func = edge_key_func] __device__(auto e) { + auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); + auto minor = store_transposed ? thrust::get<0>(e) : thrust::get<1>(e); + return key_func(major, minor) != comm_rank; + })); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(d_edgelist_rows.begin(), d_edgelist_cols.begin())); + number_of_local_edges = thrust::distance( + edge_first, + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + d_edgelist_rows.size(), + [comm_rank, key_func = edge_key_func] __device__(auto e) { + auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); + auto minor = store_transposed ? thrust::get<0>(e) : thrust::get<1>(e); + return key_func(major, minor) != comm_rank; + })); + } + + d_edgelist_rows.resize(number_of_local_edges, handle.get_stream()); + d_edgelist_rows.shrink_to_fit(handle.get_stream()); + d_edgelist_cols.resize(number_of_local_edges, handle.get_stream()); + d_edgelist_cols.shrink_to_fit(handle.get_stream()); + if (test_weighted) { + d_edgelist_weights.resize(number_of_local_edges, handle.get_stream()); + d_edgelist_weights.shrink_to_fit(handle.get_stream()); + } + } + handle.get_stream_view().synchronize(); return generate_graph_from_edgelist( handle, std::move(d_vertices), diff --git a/cpp/tests/utilities/rmat_utilities.cu b/cpp/tests/utilities/rmat_utilities.cu index 16ea7a486fc..3f0bb0b4a1f 100644 --- a/cpp/tests/utilities/rmat_utilities.cu +++ b/cpp/tests/utilities/rmat_utilities.cu @@ -13,10 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #include +#include #include +#include #include +#include #include #include @@ -41,39 +45,191 @@ generate_graph_from_rmat_params(raft::handle_t const& handle, double a, double b, double c, - uint64_t seed, + uint64_t base_seed, bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber) + bool renumber, + std::vector const& partition_ids, + size_t num_partitions) { + CUGRAPH_EXPECTS(!multi_gpu || renumber, "renumber should be true if multi_gpu is true."); + CUGRAPH_EXPECTS(size_t{1} << scale <= static_cast(std::numeric_limits::max()), + "vertex_t overflow."); + CUGRAPH_EXPECTS( + (size_t{1} << scale) * edge_factor <= static_cast(std::numeric_limits::max()), + " edge_t overflow."); + + vertex_t number_of_vertices = static_cast(size_t{1} << scale); + edge_t number_of_edges = + static_cast(static_cast(number_of_vertices) * edge_factor); + + std::vector partition_edge_counts(partition_ids.size()); + std::vector partition_vertex_firsts(partition_ids.size()); + std::vector partition_vertex_lasts(partition_ids.size()); + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + partition_edge_counts[i] = number_of_edges / num_partitions + + (id < number_of_edges % num_partitions ? edge_t{1} : edge_t{0}); + + partition_vertex_firsts[i] = (number_of_vertices / num_partitions) * id; + partition_vertex_lasts[i] = (number_of_vertices / num_partitions) * (id + 1); + if (id < number_of_vertices % num_partitions) { + partition_vertex_firsts[i] += id; + partition_vertex_lasts[i] += id + 1; + } else { + partition_vertex_firsts[i] += number_of_vertices % num_partitions; + partition_vertex_lasts[i] += number_of_vertices % num_partitions; + } + } + rmm::device_uvector d_edgelist_rows(0, handle.get_stream()); rmm::device_uvector d_edgelist_cols(0, handle.get_stream()); - std::tie(d_edgelist_rows, d_edgelist_cols) = - cugraph::experimental::generate_rmat_edgelist( - handle, scale, edge_factor, a, b, c, seed, undirected ? true : false, scramble_vertex_ids); + rmm::device_uvector d_edgelist_weights(0, handle.get_stream()); + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + rmm::device_uvector d_tmp_rows(0, handle.get_stream()); + rmm::device_uvector d_tmp_cols(0, handle.get_stream()); + std::tie(i == 0 ? d_edgelist_rows : d_tmp_rows, i == 0 ? d_edgelist_cols : d_tmp_cols) = + cugraph::experimental::generate_rmat_edgelist(handle, + scale, + partition_edge_counts[i], + a, + b, + c, + base_seed + id, + undirected ? true : false, + scramble_vertex_ids); + + rmm::device_uvector d_tmp_weights(0, handle.get_stream()); + if (test_weighted) { + if (i == 0) { + d_edgelist_weights.resize(d_edgelist_rows.size(), handle.get_stream()); + } else { + d_tmp_weights.resize(d_tmp_rows.size(), handle.get_stream()); + } + + raft::random::Rng rng(base_seed + num_partitions + id); + rng.uniform(i == 0 ? d_edgelist_weights.data() : d_tmp_weights.data(), + i == 0 ? 
d_edgelist_weights.size() : d_tmp_weights.size(), + weight_t{0.0}, + weight_t{1.0}, + handle.get_stream()); + } + + if (i > 0) { + auto start_offset = d_edgelist_rows.size(); + d_edgelist_rows.resize(start_offset + d_tmp_rows.size(), handle.get_stream()); + d_edgelist_cols.resize(d_edgelist_rows.size(), handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_tmp_rows.begin(), + d_tmp_rows.end(), + d_edgelist_rows.begin() + start_offset); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_tmp_cols.begin(), + d_tmp_cols.end(), + d_edgelist_cols.begin() + start_offset); + if (test_weighted) { + d_edgelist_weights.resize(d_edgelist_rows.size(), handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_tmp_weights.begin(), + d_tmp_weights.end(), + d_edgelist_weights.begin() + start_offset); + } + } + } + if (undirected) { // FIXME: need to symmetrize CUGRAPH_FAIL("unimplemented."); } - rmm::device_uvector d_edgelist_weights(test_weighted ? d_edgelist_rows.size() : 0, - handle.get_stream()); - if (test_weighted) { - raft::random::Rng rng(seed + 1); - rng.uniform(d_edgelist_weights.data(), - d_edgelist_weights.size(), - weight_t{0.0}, - weight_t{1.0}, - handle.get_stream()); + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + rmm::device_uvector d_rx_edgelist_rows(0, handle.get_stream()); + rmm::device_uvector d_rx_edgelist_cols(0, handle.get_stream()); + rmm::device_uvector d_rx_edgelist_weights(0, handle.get_stream()); + if (test_weighted) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(store_transposed ? d_edgelist_cols.begin() : d_edgelist_rows.begin(), + store_transposed ? d_edgelist_rows.begin() : d_edgelist_cols.begin(), + d_edgelist_weights.begin())); + + std::forward_as_tuple(std::tie(store_transposed ? d_rx_edgelist_cols : d_rx_edgelist_rows, + store_transposed ? d_rx_edgelist_rows : d_rx_edgelist_cols, + d_rx_edgelist_weights), + std::ignore) = + cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + edge_first, + edge_first + d_edgelist_rows.size(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(store_transposed ? d_edgelist_cols.begin() : d_edgelist_rows.begin(), + store_transposed ? d_edgelist_rows.begin() : d_edgelist_cols.begin())); + + std::forward_as_tuple(std::tie(store_transposed ? d_rx_edgelist_cols : d_rx_edgelist_rows, + store_transposed ? 
d_rx_edgelist_rows : d_rx_edgelist_cols), + std::ignore) = + cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + edge_first, + edge_first + d_edgelist_rows.size(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } + + d_edgelist_rows = std::move(d_rx_edgelist_rows); + d_edgelist_cols = std::move(d_rx_edgelist_cols); + d_edgelist_weights = std::move(d_rx_edgelist_weights); + } + + rmm::device_uvector d_vertices(0, handle.get_stream()); + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + auto start_offset = d_vertices.size(); + d_vertices.resize(start_offset + (partition_vertex_lasts[i] - partition_vertex_firsts[i]), + handle.get_stream()); + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_vertices.begin() + start_offset, + d_vertices.end(), + partition_vertex_firsts[i]); } - rmm::device_uvector d_vertices(static_cast(size_t{1} << scale), - handle.get_stream()); - thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - d_vertices.begin(), - d_vertices.end(), - vertex_t{0}); + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector d_rx_vertices(0, handle.get_stream()); + std::tie(d_rx_vertices, std::ignore) = cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + d_vertices.begin(), + d_vertices.end(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{ + comm_size}] __device__(auto val) { return key_func(val); }, + handle.get_stream()); + d_vertices = std::move(d_rx_vertices); + } return generate_graph_from_edgelist( handle, @@ -90,59 +246,71 @@ generate_graph_from_rmat_params(raft::handle_t const& handle, template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + 
double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> @@ -157,105 +325,128 @@ generate_graph_from_rmat_params( bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber); + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - 
double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> @@ -270,105 +461,128 @@ generate_graph_from_rmat_params( bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber); + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool 
renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> @@ -383,49 +597,60 @@ generate_graph_from_rmat_params( bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber); + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); 
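
[editor's note] The new trailing parameters (partition_ids, num_partitions) in these
instantiations feed the per-partition edge/vertex split computed near the top of this file.
A minimal, hedged sketch of that arithmetic in plain C++ (hypothetical helper name, not part
of the PR): contiguous ranges of size num_vertices / num_partitions, with the remainder
spread one vertex at a time over the lowest partition ids.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <utility>

// Closed form of the partition_vertex_firsts/partition_vertex_lasts loop above.
std::pair<int64_t, int64_t> partition_vertex_range(int64_t num_vertices,
                                                   size_t num_partitions,
                                                   size_t id)
{
  auto quotient  = num_vertices / static_cast<int64_t>(num_partitions);
  auto remainder = num_vertices % static_cast<int64_t>(num_partitions);
  // the first `remainder` partitions each take one extra vertex
  auto first = quotient * static_cast<int64_t>(id) + std::min(static_cast<int64_t>(id), remainder);
  auto last  = first + quotient + (static_cast<int64_t>(id) < remainder ? int64_t{1} : int64_t{0});
  return {first, last};
}

// e.g. num_vertices = 10, num_partitions = 3 yields [0,4), [4,7), [7,10); the edge counts are
// split the same way (number_of_edges / num_partitions, plus one when id < the remainder).
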
+generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/test_utilities.hpp b/cpp/tests/utilities/test_utilities.hpp index 37e87c62247..e81a76b4163 100644 --- a/cpp/tests/utilities/test_utilities.hpp +++ b/cpp/tests/utilities/test_utilities.hpp @@ -106,6 +106,22 @@ static const std::string& get_rapids_dataset_root_dir() return rdrd; } +template +std::tuple, + rmm::device_uvector> +generate_graph_from_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber); + // returns a tuple of (rows, columns, weights, number_of_vertices, is_symmetric) template std::tuple, @@ -130,22 +146,6 @@ read_graph_from_matrix_market_file(raft::handle_t const& handle, bool test_weighted, bool renumber); -template -std::tuple, - rmm::device_uvector> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber); - template const& partition_ids, + size_t num_partitions); struct rmat_params_t { size_t scale{}; @@ -182,19 +184,5 @@ struct input_graph_specifier_t { rmat_params_t rmat_params{}; }; -template -std::enable_if_t::value, bool> is_valid_vertex(vertex_t num_vertices, - vertex_t v) -{ - return (v >= 0) && (v < num_vertices); -} - -template -std::enable_if_t::value, bool> is_valid_vertex(vertex_t num_vertices, - vertex_t v) -{ - return v < num_vertices; -} - } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/thrust_wrapper.cu b/cpp/tests/utilities/thrust_wrapper.cu new file mode 100644 index 00000000000..5d32fb8a5d1 --- /dev/null +++ b/cpp/tests/utilities/thrust_wrapper.cu @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include +#include + +namespace cugraph { +namespace test { + +template +rmm::device_uvector sort_by_key(raft::handle_t const& handle, + vertex_t const* keys, + value_t const* values, + size_t num_pairs) +{ + rmm::device_uvector sorted_keys(num_pairs, handle.get_stream_view()); + rmm::device_uvector sorted_values(num_pairs, handle.get_stream_view()); + + thrust::copy( + rmm::exec_policy(handle.get_stream_view()), keys, keys + num_pairs, sorted_keys.begin()); + thrust::copy( + rmm::exec_policy(handle.get_stream_view()), values, values + num_pairs, sorted_values.begin()); + + thrust::sort_by_key(rmm::exec_policy(handle.get_stream_view()), + sorted_keys.begin(), + sorted_keys.end(), + sorted_values.begin()); + + return sorted_values; +} + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int32_t const* keys, + float const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int32_t const* keys, + double const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int32_t const* keys, + int32_t const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int64_t const* keys, + float const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int64_t const* keys, + double const* values, + size_t num_pairs); + +template rmm::device_uvector sort_by_key(raft::handle_t const& handle, + int64_t const* keys, + int64_t const* values, + size_t num_pairs); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/utilities/thrust_wrapper.hpp b/cpp/tests/utilities/thrust_wrapper.hpp new file mode 100644 index 00000000000..579dc3c550f --- /dev/null +++ b/cpp/tests/utilities/thrust_wrapper.hpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace cugraph { +namespace test { + +template +rmm::device_uvector sort_by_key(raft::handle_t const& handle, + vertex_t const* keys, + value_t const* values, + size_t num_pairs); + +} // namespace test +} // namespace cugraph diff --git a/python/cugraph/community/egonet_wrapper.pyx b/python/cugraph/community/egonet_wrapper.pyx index ead41705628..23aa159314f 100644 --- a/python/cugraph/community/egonet_wrapper.pyx +++ b/python/cugraph/community/egonet_wrapper.pyx @@ -42,7 +42,7 @@ def egonet(input_graph, vertices, radius=1): num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) - num_partition_edges = num_edges + num_local_edges = num_edges cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -50,8 +50,10 @@ def egonet(input_graph, vertices, radius=1): if weights is not None: c_edge_weights = weights.__cuda_array_interface__['data'][0] weight_t = weights.dtype + is_weighted = True else: weight_t = np.dtype("float32") + is_weighted = False # Pointers for egonet vertices = vertices.astype('int32') @@ -72,10 +74,11 @@ def egonet(input_graph, vertices, radius=1): ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_verts, num_edges, False, + is_weighted, False, False) if(weight_t==np.dtype("float32")): diff --git a/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx b/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx index ccae26fe7e6..5fb9de788cf 100644 --- a/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx +++ b/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx @@ -52,8 +52,12 @@ def mg_katz_centrality(input_df, if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True + raise NotImplementedError # FIXME: c_edge_weights is always set to NULL else: + weights = None weight_t = np.dtype("float32") + is_weighted = False if alpha is None: alpha = 0.1 @@ -67,11 +71,13 @@ def mg_katz_centrality(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] cdef uintptr_t c_edge_weights = NULL + if weights is not None: + c_edge_weights = weights.__cuda_array_interface__['data'][0] # FIXME: data is on device, move to host (to_pandas()), convert to np array and access pointer to pass to C vertex_partition_offsets_host = vertex_partition_offsets.values_host @@ -85,9 +91,10 @@ def mg_katz_centrality(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, + is_weighted, True, True) df = cudf.DataFrame() diff --git a/python/cugraph/dask/community/louvain_wrapper.pyx b/python/cugraph/dask/community/louvain_wrapper.pyx index f58630d07aa..a3cebeac272 100644 --- a/python/cugraph/dask/community/louvain_wrapper.pyx +++ b/python/cugraph/dask/community/louvain_wrapper.pyx @@ -56,12 +56,12 @@ def louvain(input_df, src = input_df['src'] dst = input_df['dst'] - num_partition_edges = len(src) + num_local_edges = len(src) if "value" in input_df.columns: weights = input_df['value'] else: - weights 
= cudf.Series(np.full(num_partition_edges, 1.0, dtype=np.float32)) + weights = cudf.Series(np.full(num_local_edges, 1.0, dtype=np.float32)) vertex_t = src.dtype if num_global_edges > (2**31 - 1): @@ -94,9 +94,10 @@ def louvain(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, sorted_by_degree, + True, False, True) # store_transposed, multi_gpu # Create the output dataframe, column lengths must be equal to the number of diff --git a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx index 12f2342559b..c2f92f0f33b 100644 --- a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx +++ b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx @@ -51,8 +51,12 @@ def mg_pagerank(input_df, if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True + raise NotImplementedError # FIXME: c_edge_weights is always set to NULL else: + weights = None weight_t = np.dtype("float32") + is_weighted = False # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. @@ -62,11 +66,13 @@ def mg_pagerank(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] cdef uintptr_t c_edge_weights = NULL + if weights is not None: + c_edge_weights = weights.__cuda_array_interface__['data'][0] # FIXME: data is on device, move to host (to_pandas()), convert to np array and access pointer to pass to C vertex_partition_offsets_host = vertex_partition_offsets.values_host @@ -81,9 +87,10 @@ def mg_pagerank(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, + is_weighted, True, True) df = cudf.DataFrame() diff --git a/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx b/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx index 527cb2bcf0a..44630ba5fb3 100644 --- a/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx +++ b/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx @@ -58,7 +58,7 @@ def mg_bfs(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -77,9 +77,10 @@ def mg_bfs(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, + False, # BFS runs on unweighted graphs False, True) # Generate the cudf.DataFrame result diff --git a/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx b/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx index 15d956836b4..82a4ebe04d6 100644 --- a/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx +++ b/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx @@ -46,9 +46,11 @@ def mg_sssp(input_df, if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True else: weights = None weight_t = np.dtype("float32") 
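            # [editor's note] hedged illustration: an input frame without a
            # 'value' column, e.g. cudf.DataFrame({'src': [0, 1], 'dst': [1, 2]}),
            # is treated as unweighted -- weight_t falls back to float32 and
            # is_weighted=False is forwarded to the graph container below,
            # mirroring the other MG wrappers in this diff.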
+ is_weighted = False # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. @@ -58,7 +60,7 @@ def mg_sssp(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -79,9 +81,10 @@ def mg_sssp(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, + is_weighted, False, True) # Generate the cudf.DataFrame result diff --git a/python/cugraph/link_analysis/pagerank_wrapper.pyx b/python/cugraph/link_analysis/pagerank_wrapper.pyx index 81a68d42360..2c619a052ec 100644 --- a/python/cugraph/link_analysis/pagerank_wrapper.pyx +++ b/python/cugraph/link_analysis/pagerank_wrapper.pyx @@ -42,7 +42,7 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) df = cudf.DataFrame() df['vertex'] = cudf.Series(np.arange(num_verts, dtype=np.int32)) @@ -71,8 +71,10 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. if weights is not None: c_edge_weights = weights.__cuda_array_interface__['data'][0] weight_t = weights.dtype + is_weighted = True else: weight_t = np.dtype("float32") + is_weighted = False # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. @@ -96,10 +98,10 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. 
((numberTypeEnum.int32Type)), ((numberTypeEnum.int32Type)), ((numberTypeMap[weight_t])), - #num_verts, num_edges, - num_partition_edges, + num_local_edges, num_verts, num_edges, False, + is_weighted, True, False) diff --git a/python/cugraph/structure/graph_utilities.pxd b/python/cugraph/structure/graph_utilities.pxd index 10c90f44cb8..b169e42ccf8 100644 --- a/python/cugraph/structure/graph_utilities.pxd +++ b/python/cugraph/structure/graph_utilities.pxd @@ -46,10 +46,11 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": numberTypeEnum vertexType, numberTypeEnum edgeType, numberTypeEnum weightType, - size_t num_partition_edges, + size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, bool sorted_by_degree, + bool is_weighted, bool transposed, bool multi_gpu) except + @@ -106,18 +107,21 @@ cdef extern from "experimental/graph_view.hpp" namespace "cugraph::experimental" # cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": - cdef cppclass major_minor_weights_t[vertex_t, weight_t]: + cdef cppclass major_minor_weights_t[vertex_t, edge_t, weight_t]: major_minor_weights_t(const handle_t &handle) pair[unique_ptr[device_buffer], size_t] get_major_wrap() pair[unique_ptr[device_buffer], size_t] get_minor_wrap() pair[unique_ptr[device_buffer], size_t] get_weights_wrap() + unique_ptr[vector[edge_t]] get_edge_counts_wrap() ctypedef fused shuffled_vertices_t: - major_minor_weights_t[int, float] - major_minor_weights_t[int, double] - major_minor_weights_t[long, float] - major_minor_weights_t[long, double] + major_minor_weights_t[int, int, float] + major_minor_weights_t[int, int, double] + major_minor_weights_t[int, long, float] + major_minor_weights_t[int, long, double] + major_minor_weights_t[long, long, float] + major_minor_weights_t[long, long, double] # 3. return type for renumber: # @@ -151,13 +155,12 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": # cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": - cdef unique_ptr[major_minor_weights_t[vertex_t, weight_t]] call_shuffle[vertex_t, edge_t, weight_t]( + cdef unique_ptr[major_minor_weights_t[vertex_t, edge_t, weight_t]] call_shuffle[vertex_t, edge_t, weight_t]( const handle_t &handle, vertex_t *edgelist_major_vertices, vertex_t *edgelist_minor_vertices, weight_t* edgelist_weights, - edge_t num_edges, - bool is_hyper_partitioned) except + + edge_t num_edges) except + # 5. `renumber_edgelist()` wrapper # @@ -167,7 +170,6 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": const handle_t &handle, vertex_t *edgelist_major_vertices, vertex_t *edgelist_minor_vertices, - edge_t num_edges, - bool is_hyper_partitioned, + const vector[edge_t]& edge_counts, bool do_check, bool multi_gpu) except + diff --git a/python/cugraph/structure/renumber_wrapper.pyx b/python/cugraph/structure/renumber_wrapper.pyx index 682c6b32a0f..99626cdee08 100644 --- a/python/cugraph/structure/renumber_wrapper.pyx +++ b/python/cugraph/structure/renumber_wrapper.pyx @@ -22,6 +22,7 @@ from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref import numpy as np +from libcpp.memory cimport make_unique from libcpp.utility cimport move from rmm._lib.device_buffer cimport device_buffer, DeviceBuffer @@ -103,13 +104,11 @@ def renumber(input_df, # maybe use cpdef ? 
raise Exception("Incompatible vertex_t and edge_t types.") # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(major_vertices) + cdef int num_local_edges = len(major_vertices) cdef uintptr_t c_major_vertices = major_vertices.__cuda_array_interface__['data'][0] cdef uintptr_t c_minor_vertices = minor_vertices.__cuda_array_interface__['data'][0] - cdef bool is_hyper_partitioned = False # for now - cdef uintptr_t shuffled_major = NULL cdef uintptr_t shuffled_minor = NULL @@ -119,12 +118,14 @@ def renumber(input_df, # maybe use cpdef ? cdef pair[unique_ptr[device_buffer], size_t] pair_original cdef pair[unique_ptr[device_buffer], size_t] pair_partition - # tparams: vertex_t, weight_t: + # tparams: vertex_t, edge_t, weight_t: # - cdef unique_ptr[major_minor_weights_t[int, float]] ptr_shuffled_32_32 - cdef unique_ptr[major_minor_weights_t[int, double]] ptr_shuffled_32_64 - cdef unique_ptr[major_minor_weights_t[long, float]] ptr_shuffled_64_32 - cdef unique_ptr[major_minor_weights_t[long, double]] ptr_shuffled_64_64 + cdef unique_ptr[major_minor_weights_t[int, int, float]] ptr_shuffled_32_32_32 + cdef unique_ptr[major_minor_weights_t[int, int, double]] ptr_shuffled_32_32_64 + cdef unique_ptr[major_minor_weights_t[int, long, float]] ptr_shuffled_32_64_32 + cdef unique_ptr[major_minor_weights_t[int, long, double]] ptr_shuffled_32_64_64 + cdef unique_ptr[major_minor_weights_t[long, long, float]] ptr_shuffled_64_64_32 + cdef unique_ptr[major_minor_weights_t[long, long, double]] ptr_shuffled_64_64_64 # tparams: vertex_t, edge_t: # @@ -132,6 +133,11 @@ def renumber(input_df, # maybe use cpdef ? cdef unique_ptr[renum_quad_t[int, long]] ptr_renum_quad_32_64 cdef unique_ptr[renum_quad_t[long, long]] ptr_renum_quad_64_64 + # tparam: vertex_t: + # + cdef unique_ptr[vector[int]] edge_counts_32 + cdef unique_ptr[vector[long]] edge_counts_64 + # tparam: vertex_t: # cdef unique_ptr[vector[int]] uniq_partition_vector_32 @@ -143,31 +149,32 @@ def renumber(input_df, # maybe use cpdef ? if ( edge_t == np.dtype("int32")): if( weight_t == np.dtype("float32")): if(is_multi_gpu): - ptr_shuffled_32_32.reset(call_shuffle[int, int, float](deref(handle_ptr), + ptr_shuffled_32_32_32.reset(call_shuffle[int, int, float](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) - shuffled_df = renumber_helper(ptr_shuffled_32_32.get(), vertex_t, weights) + num_local_edges).release()) + shuffled_df = renumber_helper(ptr_shuffled_32_32_32.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] - num_partition_edges = len(shuffled_df) + num_local_edges = len(shuffled_df) if not transposed: major = 'src'; minor = 'dst' else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_32 = move(ptr_shuffled_32_32_32.get().get_edge_counts_wrap()) else: shuffled_df = input_df - + edge_counts_32 = make_unique[vector[int]](1, num_local_edges) + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + ptr_renum_quad_32_32.reset(call_renumber[int, int](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_32.get()), 1, mg_flag).release()) @@ -190,8 +197,7 @@ def renumber(input_df, # maybe use cpdef ? 
uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_32.get().get_num_vertices()), dtype=vertex_t) # create new cudf df # @@ -205,24 +211,25 @@ def renumber(input_df, # maybe use cpdef ? elif( weight_t == np.dtype("float64")): if(is_multi_gpu): - ptr_shuffled_32_64.reset(call_shuffle[int, int, double](deref(handle_ptr), + ptr_shuffled_32_32_64.reset(call_shuffle[int, int, double](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_local_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_32_64.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_32_32_64.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] - num_partition_edges = len(shuffled_df) + num_local_edges = len(shuffled_df) if not transposed: major = 'src'; minor = 'dst' else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_32 = move(ptr_shuffled_32_32_64.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_32 = make_unique[vector[int]](1, num_local_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] @@ -230,8 +237,7 @@ def renumber(input_df, # maybe use cpdef ? ptr_renum_quad_32_32.reset(call_renumber[int, int](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_32.get()), do_check, mg_flag).release()) @@ -254,8 +260,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_32.get().get_num_vertices()), dtype=vertex_t) # create new cudf df @@ -271,24 +276,25 @@ def renumber(input_df, # maybe use cpdef ? 
elif ( edge_t == np.dtype("int64")): if( weight_t == np.dtype("float32")): if(is_multi_gpu): - ptr_shuffled_32_32.reset(call_shuffle[int, long, float](deref(handle_ptr), + ptr_shuffled_32_64_32.reset(call_shuffle[int, long, float](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_local_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_32_32.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_32_64_32.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] - num_partition_edges = len(shuffled_df) + num_local_edges = len(shuffled_df) if not transposed: major = 'src'; minor = 'dst' else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_32_64_32.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] @@ -296,8 +302,7 @@ def renumber(input_df, # maybe use cpdef ? ptr_renum_quad_32_64.reset(call_renumber[int, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -320,8 +325,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df @@ -335,24 +339,25 @@ def renumber(input_df, # maybe use cpdef ? return renumbered_map, shuffled_df elif( weight_t == np.dtype("float64")): if(is_multi_gpu): - ptr_shuffled_32_64.reset(call_shuffle[int, long, double](deref(handle_ptr), + ptr_shuffled_32_64_64.reset(call_shuffle[int, long, double](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_local_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_32_64.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_32_64_64.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] - num_partition_edges = len(shuffled_df) + num_local_edges = len(shuffled_df) if not transposed: major = 'src'; minor = 'dst' else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_32_64_64.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] @@ -360,8 +365,7 @@ def renumber(input_df, # maybe use cpdef ? ptr_renum_quad_32_64.reset(call_renumber[int, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -384,8 +388,7 @@ def renumber(input_df, # maybe use cpdef ? 
uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df # @@ -401,24 +404,25 @@ def renumber(input_df, # maybe use cpdef ? if ( edge_t == np.dtype("int64")): if( weight_t == np.dtype("float32")): if(is_multi_gpu): - ptr_shuffled_64_32.reset(call_shuffle[long, long, float](deref(handle_ptr), + ptr_shuffled_64_64_32.reset(call_shuffle[long, long, float](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_local_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_64_32.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_64_64_32.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] - num_partition_edges = len(shuffled_df) + num_local_edges = len(shuffled_df) if not transposed: major = 'src'; minor = 'dst' else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_64_64_32.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] @@ -426,8 +430,7 @@ def renumber(input_df, # maybe use cpdef ? ptr_renum_quad_64_64.reset(call_renumber[long, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -450,8 +453,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_64.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_64.get()[0].at(0), - uniq_partition_vector_64.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_64_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df @@ -466,24 +468,25 @@ def renumber(input_df, # maybe use cpdef ? 
elif( weight_t == np.dtype("float64")): if(is_multi_gpu): - ptr_shuffled_64_64.reset(call_shuffle[long, long, double](deref(handle_ptr), + ptr_shuffled_64_64_64.reset(call_shuffle[long, long, double](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) + num_local_edges).release()) - shuffled_df = renumber_helper(ptr_shuffled_64_64.get(), vertex_t, weights) + shuffled_df = renumber_helper(ptr_shuffled_64_64_64.get(), vertex_t, weights) major_vertices = shuffled_df['major_vertices'] minor_vertices = shuffled_df['minor_vertices'] - num_partition_edges = len(shuffled_df) + num_local_edges = len(shuffled_df) if not transposed: major = 'src'; minor = 'dst' else: major = 'dst'; minor = 'src' shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_64_64_64.get().get_edge_counts_wrap()) else: shuffled_df = input_df + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) shuffled_major = major_vertices.__cuda_array_interface__['data'][0] shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] @@ -491,8 +494,7 @@ def renumber(input_df, # maybe use cpdef ? ptr_renum_quad_64_64.reset(call_renumber[long, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -515,8 +517,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_64.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_64.get()[0].at(0), - uniq_partition_vector_64.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_64_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df
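
Reader's sketch: every dtype branch in the wrapper above repeats the same shuffle/renumber flow, so the following plain-Python pseudocode condenses it once. call_shuffle, call_renumber, and get_edge_counts_wrap() are the wrappers already used in the patch; renumber_one_branch is a hypothetical name introduced purely for illustration, and the tuple standing in for the shuffle result glosses over the Cython pointer plumbing.

    def renumber_one_branch(handle, majors, minors, weights,
                            num_local_edges, is_multi_gpu, do_check, mg_flag):
        if is_multi_gpu:
            # Shuffle edges to their owning GPUs. After this change the shuffle
            # result also carries one edge count per local adjacency-matrix
            # partition, not a single partition-wide total.
            shuffled = call_shuffle(handle, majors, minors, weights, num_local_edges)
            edge_counts = shuffled.get_edge_counts_wrap()
        else:
            # Single GPU: one partition holding every local edge.
            shuffled = (majors, minors, weights)
            edge_counts = [num_local_edges]

        # call_renumber now receives the per-partition counts in place of the
        # removed (num_partition_edges, is_hyper_partitioned) arguments, since
        # 2D partitioning is the only layout once hypergraph partitioning is
        # dropped.
        return call_renumber(handle, shuffled, edge_counts, do_check, mg_flag)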